diff --git a/.gitignore b/.gitignore
index e4e47aa4b705c9035394f5ea15c0d5e653859206..3b236ea5bdd793ed34603010c520fa1f1a43e34b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,8 +9,10 @@ buildGCC
 _skbuild/
 dist/
 *.egg-info/
-__pycache__/
+**/__pycache__/
 .venv/
+pythonbindings/pyfluids/bindings*
+pythonbindings/pymuparser/bindings*
 
 # IDE
 .vscode/
@@ -38,4 +40,7 @@ stl/
 .DS_Store
 
 # Settings
-.gitconfig
\ No newline at end of file
+.gitconfig
+
+# User Settings
+CMakeUserPresets.json
\ No newline at end of file
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index b34c5a8f66c1340670b6acd80ea6a9901b2760d1..e171e2e7fbe1984588355f5a833a21160024da32 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,7 +1,7 @@
 ###############################################################################
 ##                       VirtualFluids CI Pipeline                           ##
 ###############################################################################
-image: git.rz.tu-bs.de:4567/irmb/virtualfluids/ubuntu20_04:1.3
+image: git.rz.tu-bs.de:4567/irmb/virtualfluids/ubuntu20_04:1.4
 
 stages:
   - build
@@ -49,7 +49,7 @@ stages:
     - cd $CI_PROJECT_DIR/$BUILD_FOLDER
     - rm -r -f ./*
     - cmake .. -LAH
-      --preset=all_make
+      --preset=make_all
       -DBUILD_WARNINGS_AS_ERRORS=ON
       -DCMAKE_CUDA_ARCHITECTURES=60
     - make -j4
@@ -75,7 +75,7 @@ clang_10:
     - export CXX=clang++
 
 ###############################################################################
-msvc_16:
+msvc_17:
   stage: build
 
   tags:
@@ -92,14 +92,14 @@ msvc_16:
     - git --version
     - $env:Path += ";C:\Program Files\CMake\bin\"
     - cmake --version
-    - $env:Path += ";C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\MSBuild\Current\Bin"
+    - $env:Path += ";C:\Program Files\Microsoft Visual Studio\2022\Enterprise\MSBuild\Current\Bin"
     - MSBuild.exe -version
 
   script:
     - cd $CI_PROJECT_DIR
     - md -force $env:BUILD_FOLDER
     - cd $env:BUILD_FOLDER
-    - cmake .. --preset=all_msvc -DCMAKE_CUDA_ARCHITECTURES=61 -DBUILD_WARNINGS_AS_ERRORS=ON
+    - cmake .. --preset=msvc_all -DCMAKE_CUDA_ARCHITECTURES=61 -DBUILD_WARNINGS_AS_ERRORS=ON
     - MSBuild.exe VirtualFluids.sln /property:Configuration=$env:BUILD_CONFIGURATION /verbosity:minimal /maxcpucount:4
 
   artifacts:
@@ -126,33 +126,44 @@ gcc_9_python:
     paths:
       - build/
       - dist/
+      - _skbuild/
 
   before_script:
     - export CCACHE_BASEDIR=$CI_PROJECT_DIR
     - export CCACHE_DIR=$CI_PROJECT_DIR/cache
 
   script:
-    - python3 setup.py bdist_wheel build_ext --build-temp=build
+    - python3 setup.py bdist_wheel build_ext --build-temp=_skbuild -- -DBUILD_VF_CPU=ON -DBUILD_VF_DOUBLE_ACCURACY=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache
 
 ###############################################################################
 ##                            Container Upload                               ##
 ###############################################################################
-build_singularity_image:
+build_poiseuille_test_container:
+  image:
+    name: quay.io/singularity/singularity:v3.10.2  # provides the singularity CLI used by this job's script
+    entrypoint: [""]  # override the image entrypoint so the runner can execute the job script itself
+
   stage: container_upload
 
-  needs:
-    - gcc_9_python
+  rules:  # evaluated in order; the first matching rule decides whether the job is added
+    - if: $REMOTE_USER && $REMOTE_HOST && $PRIVATE_KEY && $CI_PIPELINE_SOURCE == "schedule"
+      when: always  # scheduled pipelines with all three remote-access variables set
+    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
+      when: never  # not offered in merge request pipelines
+    - when: manual  # any other pipeline: must be started by hand
+      allow_failure: true  # a skipped or failed manual run does not block the pipeline
 
   tags:
     - linux
     - privileged
 
-  rules:
-    - if: $CI_COMMIT_TAG
+  artifacts:  # hands the built container image to downstream jobs
+    expire_in: 1 hrs  # keep the image only briefly
+    paths:
+      - Containers/PoiseuilleTestContainer.sif  # output of the `singularity build` call in this job's script
 
   script:
-    - singularity build Containers/VirtualFluidsPython.sif Containers/VirtualFluidsPython.def
-    - singularity push --docker-username "${CI_REGISTRY_USER}" --docker-password "${CI_REGISTRY_PASSWORD}" Containers/VirtualFluidsPython.sif oras://"$CI_REGISTRY_IMAGE"/"$CI_PROJECT_NAME":"$CI_COMMIT_TAG"
+    - singularity build "Containers/PoiseuilleTestContainer.sif" "Python/SlurmTests/poiseuille/PoiseuilleTestContainer.def"
 
 ###############################################################################
 ##                                Tests                                      ##
@@ -169,14 +180,14 @@ gcc_9_unit_tests:
     - ctest
 
 ###############################################################################
-msvc_16_unit_tests:
+msvc_17_unit_tests:
   stage: test
 
   tags:
     - win
     - gpu
 
-  needs: ["msvc_16"]
+  needs: ["msvc_17"]
 
   before_script:
     - $env:Path += ";C:\Program Files\CMake\bin\"
@@ -202,6 +213,52 @@ gcc_9_python_bindings_test:
     - python3 -m unittest discover -s Python -v
 
 
+###############################################################################
+gcc_9_python_hpc_test:  # launches the Poiseuille Slurm test through hpc-rocket
+  image: python:latest  # NOTE(review): floating tag; consider pinning for reproducible runs
+  stage: test
+
+  needs: ["build_poiseuille_test_container"]  # job that publishes Containers/PoiseuilleTestContainer.sif
+
+  rules:  # evaluated in order; the first matching rule decides whether the job is added
+    - if: $REMOTE_USER && $REMOTE_HOST && $PRIVATE_KEY && $CI_PIPELINE_SOURCE == "schedule"
+      when: always  # scheduled pipelines with all three remote-access variables set
+    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
+      when: never  # not offered in merge request pipelines
+    - when: manual  # any other pipeline: must be started by hand
+      allow_failure: true  # a skipped or failed manual run does not block the pipeline
+
+  before_script:
+    - pip install hpc-rocket  # NOTE(review): unpinned; the newest release is installed on every run
+
+  script:
+    - hpc-rocket launch --watch Python/SlurmTests/poiseuille/rocket.yml  # rocket.yml names the files to copy, the sbatch script and the output to collect
+
+###############################################################################
+multigpu_hpc_test:  # runs the multi-GPU case through hpc-rocket, then diffs the results against reference data
+  image: python:latest  # NOTE(review): floating tag; consider pinning for reproducible runs
+  stage: test
+
+  rules:  # evaluated in order; the first matching rule decides whether the job is added
+    - if: $REMOTE_USER && $REMOTE_HOST && $PRIVATE_KEY && $CI_PIPELINE_SOURCE == "schedule"
+      when: always  # scheduled pipelines with all three remote-access variables set
+    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
+      when: never  # not offered in merge request pipelines
+    - when: manual  # any other pipeline: must be started by hand
+      allow_failure: true  # a skipped or failed manual run does not block the pipeline
+
+  before_script:
+    - pip install hpc-rocket
+    - pip install "fieldcompare[all]"  # provides the fieldcompare CLI used in the last script step
+
+  script:
+    - hpc-rocket launch --watch regression-tests/multigpu_test/rocket.yml
+    - git clone --depth 1 --filter=blob:none --sparse https://github.com/irmb/test_data  # shallow, blobless, sparse clone of the reference data repository
+    - cd test_data
+    - git sparse-checkout set regression_tests/gpu/DrivenCavity_4GPU_2Levels  # check out only this reference case
+    - cd ..
+    - fieldcompare dir output/results test_data/regression_tests/gpu/DrivenCavity_4GPU_2Levels --include-files "*.vtu"  # NOTE(review): assumes the rocket.yml collect step writes to output/results; confirm
+
 ###############################################################################
 ##                            Benchmark                                      ##
 ###############################################################################
@@ -248,7 +305,7 @@ gpu_numerical_tests:
     - cd $CI_PROJECT_DIR/build
     - rm -r -f ./*
     - cmake ..
-      --preset=gpu_numerical_tests_make
+      --preset=make_numerical_tests_gpu
       -DCMAKE_CUDA_ARCHITECTURES=60
       -DPATH_NUMERICAL_TESTS=/tmp/test_data/numerical_tests_gpu
     - make -j4
@@ -319,8 +376,7 @@ clang_build_analyzer_clang_10:
     - mkdir -p $CI_PROJECT_DIR/build
     - cd $CI_PROJECT_DIR/build
     - cmake ..
-      -DBUILD_VF_CPU=ON
-      -DBUILD_VF_GPU=ON
+      --preset=make_all
       -DCMAKE_CUDA_ARCHITECTURES=60
       -DCMAKE_CXX_FLAGS=-ftime-trace
     - ClangBuildAnalyzer --start .
@@ -352,8 +408,7 @@ include_what_you_use_clang_10:
     - mkdir -p $CI_PROJECT_DIR/build
     - cd $CI_PROJECT_DIR/build
     - cmake ..
-      -DBUILD_VF_CPU=ON
-      -DBUILD_VF_GPU=ON
+      --preset=make_all
       -DCMAKE_CUDA_ARCHITECTURES=60
       -DBUILD_VF_INCLUDE_WHAT_YOU_USE=ON
     - make
@@ -430,7 +485,7 @@ gcov_gcc_9:
     - mkdir -p $CI_PROJECT_DIR/build
     - cd $CI_PROJECT_DIR/build
     - cmake ..
-      --preset=all_make
+      --preset=make_all
       -DCMAKE_CUDA_ARCHITECTURES=60
       -DBUILD_VF_COVERAGE=ON
     - make -j4
@@ -473,6 +528,7 @@ clang-tidy:
     - cd $CI_PROJECT_DIR/build
     - cmake ..
       -DBUILD_VF_CPU=ON
+      -DBUILD_VF_DOUBLE_ACCURACY=ON
       -DBUILD_VF_GPU=OFF
     - python3 ../utilities/filterCompileCommands.py compile_commands.json
     - run-clang-tidy -quiet > clangtidy.txt
@@ -604,26 +660,3 @@ sonar-scanner:
   script:
     - cd $CI_PROJECT_DIR
     - sonar-scanner -X -Dsonar.verbose=true -Dsonar.login=$SONAR_SECURITY_TOKEN
-
-###############################################################################
-##                              Release                                      ##
-###############################################################################
-create_release:
-  stage: release
-
-  image: registry.gitlab.com/gitlab-org/release-cli:latest
-
-  needs: ["build_singularity_image"]
-
-  rules:
-    - if: $CI_COMMIT_TAG
-
-  script:
-    - echo "Creating release with tag $CI_COMMIT_TAG"
-    - release-cli create --name "VirtualFluids $CI_COMMIT_TAG" \
-      --description "VirtualFluids CFD Simulator" \
-      --tag-name "$CI_COMMIT_TAG" \
-      --ref "$CI_COMMIT_SHA" \
-      --job-token "$CI_JOB_TOKEN" \
-      --assets-link="{'name':'VirtualFluidsSingularityImage_OpenMPI','url':'','type':'other','filepath':'Containers/VirtualFluidsOpenMPI.sif'}"
-    - build/bin/basicsTests
diff --git a/CITATION.cff b/CITATION.cff
new file mode 100644
index 0000000000000000000000000000000000000000..50d4989d5c269521392644515d716fa93b3cf6e3
--- /dev/null
+++ b/CITATION.cff
@@ -0,0 +1,40 @@
+cff-version: 1.2.0
+message: "If you use this software, please cite it as below."
+type: software
+authors:
+  - family-names: Kutscher
+    given-names: Konstantin
+    orcid: https://orcid.org/0000-0002-1099-1608
+  - family-names: Schönherr
+    given-names: Martin
+    orcid: https://orcid.org/0000-0002-4774-1776
+  - family-names: Geier
+    given-names: Martin
+    orcid: https://orcid.org/0000-0002-8367-9412
+  - family-names: Krafczyk
+    given-names: Manfred
+    orcid: https://orcid.org/0000-0002-8509-0871
+  - family-names: Alihussein
+    given-names: Hussein
+    orcid: https://orcid.org/0000-0003-3656-7028
+  - family-names: Linxweiler
+    given-names: Jan
+    orcid: https://orcid.org/0000-0002-2755-5087
+  - family-names: Peters
+    given-names: Sören
+    orcid: https://orcid.org/0000-0001-5236-3776
+  - family-names: Wellmann
+    given-names: Anna
+    orcid: https://orcid.org/0000-0002-8825-2995
+  - family-names: Safari
+    given-names: Hesameddin
+    orcid: https://orcid.org/0000-0002-2755-5087  # NOTE(review): same iD as the Linxweiler entry; likely a copy-paste slip, confirm
+  - family-names: Marcus
+    given-names: Sven
+    orcid: https://orcid.org/0000-0003-3689-2162
+title: "VirtualFluids"
+version: 0.1.0  # NOTE(review): presumably tracks project(VERSION ...) in CMakeLists.txt; keep in sync
+license: GPL-3.0-or-later
+repository-code: "https://git.rz.tu-bs.de/irmb/VirtualFluids"
+date-released: "XXXXXXX"  # TODO: placeholder; CFF expects an ISO 8601 date (YYYY-MM-DD)
+
diff --git a/CMake/FileUtilities.cmake b/CMake/FileUtilities.cmake
index 151000a681795923d4e31ed8c5f06dfd1e7af7fd..13057ef832b5aa2d7ce303fe55e95a91284f5f56 100644
--- a/CMake/FileUtilities.cmake
+++ b/CMake/FileUtilities.cmake
@@ -5,7 +5,7 @@
 ## After function call the files are stored in: MY_SRCS
 #################################################################################
 
-macro(includeAllFiles targetName file_path)
+macro(includeAllFiles folderName targetName file_path)
 	if(NOT DEFINED collectTestFiles)
 	    set(collectTestFiles ON)
 	endif()
@@ -14,11 +14,11 @@ macro(includeAllFiles targetName file_path)
         set(collectProductionFiles ON)
     endif()
 
-	includeFiles(${targetName} "${file_path}")
+	includeFiles(${folderName} ${targetName} "${file_path}")
 endmacro(includeAllFiles)
 
 
-macro(includeProductionFiles targetName file_path)
+macro(includeProductionFiles folderName targetName file_path)
 	if(NOT DEFINED collectTestFiles)
 	    set(collectTestFiles OFF)
 	endif()
@@ -27,12 +27,12 @@ macro(includeProductionFiles targetName file_path)
         set(collectProductionFiles ON)
     endif()
 
-	includeFiles(${targetName} "${file_path}")
+	includeFiles(${folderName}  ${targetName} "${file_path}")
 endmacro(includeProductionFiles)
 
 
 
-macro(includeTestFiles targetName file_paths)
+macro(includeTestFiles folderName file_paths)
 	if(NOT DEFINED collectTestFiles)
 		set(collectTestFiles ON)
 	endif()
@@ -41,13 +41,13 @@ macro(includeTestFiles targetName file_paths)
 		set(collectProductionFiles OFF)
 	endif()
 
-	includeFiles(${targetName} "${file_paths}")
+	includeFiles(${folderName} ${folderName} "${file_paths}")
 endmacro(includeTestFiles)
 
 
 
 
-macro(includeFiles targetName file_paths)
+macro(includeFiles folderName targetName file_paths)
 
 	foreach(file ${file_paths})
 
@@ -57,7 +57,7 @@ macro(includeFiles targetName file_paths)
 
 		collectFilesFrom(${file})
 		if (package_dir)
-		   setSourceGroupForFilesIn(${file} ${package_dir} ${targetName})
+		   setSourceGroupForFilesIn(${file} ${package_dir} ${targetName} ${folderName})
 		endif()
 
 	endforeach()
@@ -90,9 +90,9 @@ endmacro()
 
 
 
-macro(setSourceGroupForFilesIn file package_dir targetName)
+macro(setSourceGroupForFilesIn file package_dir targetName folderName)
 #input: target_name PACKAGE_SRCS
-	buildSourceGroup(${targetName} ${package_dir})
+	buildSourceGroup(${folderName} ${package_dir})
 
 	if(isAllTestSuite)
 		source_group(${targetName}\\${SOURCE_GROUP} FILES ${file})
@@ -105,20 +105,20 @@ endmacro(setSourceGroupForFilesIn)
 
 
 
-macro(buildSourceGroup targetName path)
-#input: targetName (e.g. lib name, exe name)
+macro(buildSourceGroup folderName path)
+#input: folderName (e.g. name of folder after src/)
 
 	unset(SOURCE_GROUP)
 	string(REPLACE "/" ";" folderListFromPath ${path})
-	set(findTargetName 0)
+	set(findFolderName 0)
 
 	foreach(folder ${folderListFromPath})
-		if(findTargetName)
+		if(findFolderName)
 			set(SOURCE_GROUP ${SOURCE_GROUP}\\${folder})
 		endif()
 
-		if(${folder} STREQUAL ${targetName})
-			SET(findTargetName 1)
+		if(${folder} STREQUAL ${folderName})
+			SET(findFolderName 1)
 		endif()
 	endforeach()
 
diff --git a/CMake/VirtualFluidsMacros.cmake b/CMake/VirtualFluidsMacros.cmake
index 63503f5f14221bb8cec7670dbdda6aa92497d327..4fd163b2cc1b53fe461ef482d906f4cb1255a76c 100644
--- a/CMake/VirtualFluidsMacros.cmake
+++ b/CMake/VirtualFluidsMacros.cmake
@@ -105,14 +105,15 @@ function(vf_add_library)
 
     set( options )
     set( oneValueArgs NAME BUILDTYPE)
-    set( multiValueArgs PUBLIC_LINK PRIVATE_LINK FILES FOLDER EXCLUDE)
+    set( multiValueArgs PUBLIC_LINK PRIVATE_LINK FILES FOLDER EXCLUDE MODULEFOLDER)
     cmake_parse_arguments( ARG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN} )
 
-    if(DEFINED ARG_NAME)
+    if(DEFINED ARG_NAME) 
         set(library_name ${ARG_NAME})
     else()
         vf_get_library_name (library_name)
     endif()
+    vf_get_library_name (folder_name) # folder_name is not equal to library_name when ARG_NAME was set
 
     if(NOT DEFINED ARG_BUILDTYPE)
         if(BUILD_SHARED_LIBS)
@@ -122,12 +123,16 @@ function(vf_add_library)
         endif()
     endif()
 
+    if(DEFINED ARG_MODULEFOLDER)
+        set(folder_name ${ARG_MODULEFOLDER})
+    endif()
+
     status("Configuring the target: ${library_name} (type=${ARG_BUILDTYPE})...")
 
 
     collectFiles(sourceFiles "${ARG_FILES}" "${ARG_FOLDER}" "${ARG_EXCLUDE}")
 
-    includeProductionFiles (${library_name} "${sourceFiles}")
+    includeProductionFiles (${folder_name} ${library_name} "${sourceFiles}")
 
     #################################################################
     ###   ADD TARGET                                              ###
@@ -325,4 +330,4 @@ function(vf_load_user_apps)
     foreach(app IN LISTS USER_APPS)
       add_subdirectory(${app})
     endforeach()
-endfunction()
\ No newline at end of file
+endfunction()
diff --git a/CMake/cmake_config_files/MOLLOK.config.cmake b/CMake/cmake_config_files/MOLLOK.config.cmake
index f700f3cd7a4b5669ef6ffee9436a1528e50e9dc9..72470da1bc52a242cb8e3c341e0e7f87bb06ab26 100644
--- a/CMake/cmake_config_files/MOLLOK.config.cmake
+++ b/CMake/cmake_config_files/MOLLOK.config.cmake
@@ -12,4 +12,5 @@ set(PATH_NUMERICAL_TESTS "D:/out/numericalTests/")
 list(APPEND VF_COMPILER_DEFINITION "PATH_NUMERICAL_TESTS=${PATH_NUMERICAL_TESTS}")
 
 # add invidual apps here
-list(APPEND USER_APPS "apps/gpu/LBM/WTG_RUB")
\ No newline at end of file
+list(APPEND USER_APPS "apps/gpu/LBM/WTG_RUB")
+list(APPEND USER_APPS "apps/gpu/LBM/TGV_3D_GridRef")
diff --git a/CMake/cmake_config_files/MULE.config.cmake b/CMake/cmake_config_files/MULE.config.cmake
index 02f61b7988c5b3af9cd58bc52e46b1b2edfe8aae..2afbce6cc257fa0b8ff4dd7de580cb50c01369f1 100644
--- a/CMake/cmake_config_files/MULE.config.cmake
+++ b/CMake/cmake_config_files/MULE.config.cmake
@@ -1 +1,4 @@
-SET(CMAKE_CUDA_ARCHITECTURES "75")
\ No newline at end of file
+SET(CMAKE_CUDA_ARCHITECTURES "75")
+
+list(APPEND USER_APPS "apps/gpu/LBM/ActuatorLine")
+list(APPEND USER_APPS "apps/gpu/LBM/SphereScaling")
diff --git a/CMake/cmake_config_files/PHOENIX.config.cmake b/CMake/cmake_config_files/PHOENIX.config.cmake
index d31d8684a53a769e48408ad5febe7d2c6b22c623..5ca4d9821d918f66745fc27363975811dc278440 100644
--- a/CMake/cmake_config_files/PHOENIX.config.cmake
+++ b/CMake/cmake_config_files/PHOENIX.config.cmake
@@ -28,7 +28,7 @@ set(CMAKE_CUDA_ARCHITECTURES 60) # NVIDIA Tesla P100
 
 set(GPU_APP "apps/gpu/LBM/")
 list(APPEND USER_APPS 
-    # "${GPU_APP}DrivenCavityMultiGPU"
+    "${GPU_APP}DrivenCavityMultiGPU"
     # "${GPU_APP}SphereScaling"
     # "${GPU_APP}MusselOyster"
     )
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b3af407acd66ec3223f55de7753df879786ce561..c6498bf19bb021f3ae19d69c4131aa56476149be 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -9,7 +9,7 @@
 cmake_minimum_required(VERSION 3.15..3.20 FATAL_ERROR)
 
 project(VirtualFluids
-        VERSION 1.0.0
+        VERSION 0.1.0
         DESCRIPTION "CFD code based on the Lattice Boltzmann Method"
         HOMEPAGE_URL "https://www.tu-braunschweig.de/irmb/forschung/virtualfluids"
         LANGUAGES CXX)
diff --git a/CMakePresets.json b/CMakePresets.json
index 0f360fd303cdcad923b01d56df5c6d48ad62ca2c..6e2658d148bddf55950e5849adcf10709a8b8caf 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -1,34 +1,47 @@
 {
-    "version": 2,
+    "version": 3,
     "cmakeMinimumRequired": {
         "major": 3,
-        "minor": 20,
+        "minor": 21,
         "patch": 0
     },
     "configurePresets": [
         {
             "name": "default",
+            "binaryDir": "build",
+            "hidden": true
+        },
+        {
+            "name": "msvc",
             "hidden": true,
-            "binaryDir": "${sourceDir}/build/",
-            "cacheVariables": {
-                "BUILD_VF_UNIT_TESTS": "ON"
-            }
+            "generator": "Visual Studio 17 2022",
+            "architecture": "x64",
+            "condition": {
+                "type": "equals",
+                "lhs": "${hostSystemName}",
+                "rhs": "Windows"
+              }
         },
         {
-            "name": "default_make",
-            "inherits": "default",
+            "name": "make",
             "hidden": true,
-            "generator": "Unix Makefiles"
+            "generator": "Unix Makefiles",
+            "condition": {
+                "type": "notEquals",
+                "lhs": "${hostSystemName}",
+                "rhs": "Windows"
+              }
         },
         {
-            "name": "default_msvc",
-            "inherits": "default",
+            "name": "unit_tests",
             "hidden": true,
-            "generator": "Visual Studio 16 2019",
-            "architecture": "x64"
+            "cacheVariables": {
+                "BUILD_VF_UNIT_TESTS": "ON"
+            }
         },
         {
-            "name": "default_cpu",
+            "name": "cpu",
+            "inherits": "default",
             "hidden": true,
             "description": "CPU build of VirtualFluids",
             "cacheVariables": {
@@ -37,7 +50,8 @@
             }
         },
         {
-            "name": "default_gpu",
+            "name": "gpu",
+            "inherits": "default",
             "hidden": true,
             "description": "GPU build of VirtualFluids",
             "cacheVariables": {
@@ -46,9 +60,10 @@
             }
         },
         {
-            "name": "default_gpu_numerical_tests",
+            "name": "gpu_numerical_tests",
             "inherits": [
-                "default_gpu"
+                "gpu",
+                "unit_tests"
             ],
             "hidden": true,
             "description": "GPU numerical tests of VirtualFluids",
@@ -58,78 +73,74 @@
             }
         },
         {
-            "name": "default_all",
-            "hidden": true,
-            "description": "All build of VirtualFluids",
+            "name": "make_all",
             "inherits": [
-                "default_cpu",
-                "default_gpu"
+                "cpu",
+                "gpu",
+                "unit_tests",
+                "make"
             ],
-            "cacheVariables": {
-                "BUILD_VF_DOUBLE_ACCURACY": "ON"
-            }
+            "displayName": "all make configuration"
         },
         {
-            "name": "cpu_make",
+            "name": "make_cpu",
             "inherits": [
-                "default_make",
-                "default_cpu"
+                "cpu",
+                "unit_tests",
+                "make"
             ],
             "displayName": "cpu make configuration"
         },
         {
-            "name": "cpu_msvc",
-            "inherits": [
-                "default_msvc",
-                "default_cpu"
-            ],
-            "displayName": "cpu msvc configuration"
-        },
-        {
-            "name": "gpu_make",
+            "name": "make_gpu",
             "inherits": [
-                "default_make",
-                "default_gpu"
+                "gpu",
+                "unit_tests",
+                "make"
             ],
             "displayName": "gpu make configuration"
         },
         {
-            "name": "gpu_msvc",
+            "name": "msvc_all",
             "inherits": [
-                "default_msvc",
-                "default_gpu"
+                "cpu",
+                "gpu",
+                "unit_tests",
+                "msvc"
             ],
-            "displayName": "gpu msvc configuration"
+            "displayName": "all msvc configuration"
         },
         {
-            "name": "all_make",
+            "name": "msvc_cpu",
             "inherits": [
-                "default_make",
-                "default_all"
+                "cpu",
+                "unit_tests",
+                "msvc"
             ],
-            "displayName": "all make configuration"
+            "displayName": "cpu msvc configuration"
         },
         {
-            "name": "all_msvc",
+            "name": "msvc_gpu",
             "inherits": [
-                "default_msvc",
-                "default_all"
+                "gpu",
+                "unit_tests",
+                "msvc"
             ],
-            "displayName": "all msvc configuration"
+            "displayName": "gpu msvc configuration"
         },
         {
-            "name": "gpu_numerical_tests_make",
+            "name": "make_numerical_tests_gpu",
             "inherits": [
-                "default_make",
-                "default_gpu_numerical_tests"
+                "gpu_numerical_tests",
+                "make"
             ],
             "displayName": "gpu numerical tests make configuration"
         },
         {
-            "name": "gpu_numerical_tests_msvc",
+            "name": "msvc_numerical_tests_gpu",
             "inherits": [
-                "default_msvc",
-                "default_gpu_numerical_tests"
+                "msvc",
+                "gpu_numerical_tests"
             ],
             "displayName": "gpu numerical tests msvc configuration"
         }
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000000000000000000000000000000000000..adafcf99560acd9da79aa060194df8263b6e77e0
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1 @@
+include pythonbindings/*/bindings*
\ No newline at end of file
diff --git a/Python/SlurmTests/poiseuille/PoiseuilleTestContainer.def b/Python/SlurmTests/poiseuille/PoiseuilleTestContainer.def
index a3836e7906b9be66ec79f68bf53ccc079db9d9ef..d31a7b82a4e9e988f815139fb46318d231d450f8 100644
--- a/Python/SlurmTests/poiseuille/PoiseuilleTestContainer.def
+++ b/Python/SlurmTests/poiseuille/PoiseuilleTestContainer.def
@@ -1,11 +1,13 @@
 BootStrap: docker
 From: ubuntu:20.04
+Stage: build
 
 %files
     3rdParty 3rdParty
     apps apps
     CMake CMake
     Python Python
+    pythonbindings pythonbindings
     src src
     CMakeLists.txt CMakeLists.txt
     cpu.cmake cpu.cmake
@@ -19,7 +21,8 @@ From: ubuntu:20.04
     apt-get update &&          \
     apt-get install -y         \
     build-essential            \
-    cmake=3.16.3-1ubuntu1      \
+    ccache                     \
+    git                        \
     python3                    \
     python3-dev                \
     python3-pip                \
@@ -27,10 +30,32 @@ From: ubuntu:20.04
     libomp-dev                 \
     libgl1
 
-    pip3 install setuptools wheel numpy scipy pyvista
+    pip3 install setuptools wheel cmake numpy scipy pyvista scikit-build
 
     export PYTHONPATH=Python
-    python3 /setup.py install
+    python3 /setup.py bdist_wheel build_ext --build-temp=_skbuild -- -DBUILD_VF_CPU=ON -DBUILD_VF_DOUBLE_ACCURACY=ON
+
+    pip3 install $(find dist/*.whl)
+
+
+BootStrap: docker
+From: ubuntu:20.04
+Stage: runtime
+
+%files from build
+    Python Python
+    dist dist
+
+%post
+    export DEBIAN_FRONTEND=noninteractive
+    apt-get update &&          \
+    apt-get install -y         \
+    python3                    \
+    python3-pip                \
+    mpich                      \
+    libgl1
+
+    pip3 install $(find dist/*.whl)
 
 %environment
     export PYTHONPATH=/Python
diff --git a/Python/SlurmTests/poiseuille/rocket.yml b/Python/SlurmTests/poiseuille/rocket.yml
new file mode 100644
index 0000000000000000000000000000000000000000..b186469a4d3fd4b8edfafa4fc3f6dcd64e311d70
--- /dev/null
+++ b/Python/SlurmTests/poiseuille/rocket.yml
@@ -0,0 +1,23 @@
+host: $REMOTE_HOST  # presumably expanded by hpc-rocket from the environment; confirm
+user: $REMOTE_USER
+private_keyfile: $PRIVATE_KEY  # NOTE(review): key name implies a file path, not key contents; confirm what the variable holds
+
+copy:  # files placed under poiseuille_test/ before the job is submitted
+  - from: Python/SlurmTests/poiseuille/slurm.job
+    to: poiseuille_test/slurm.job
+    overwrite: true
+
+  - from: Containers/PoiseuilleTestContainer.sif
+    to: poiseuille_test/PoiseuilleTestContainer.sif
+    overwrite: true
+
+collect:  # files fetched back after the job
+  - from: poiseuille_test/POISEUILLE_TEST.out  # stdout file set via `#SBATCH -o` in slurm.job
+    to: POISEUILLE_TEST.out
+    overwrite: true
+
+#clean:
+#  - poiseuille_test/PoiseuilleTestContainer.sif
+
+sbatch: poiseuille_test/slurm.job  # the copy of slurm.job made by the first copy entry
+continue_if_job_fails: true  # presumably lets the collect step run even if the Slurm job fails; confirm in hpc-rocket docs
diff --git a/Python/SlurmTests/poiseuille/settings.py b/Python/SlurmTests/poiseuille/settings.py
index 4b4a1e4e9cc7f6118a60c22a40c70b027e3ac4e2..a3cdc5dc8b627612c2d57a58db36c9fbaa72efac 100644
--- a/Python/SlurmTests/poiseuille/settings.py
+++ b/Python/SlurmTests/poiseuille/settings.py
@@ -1,25 +1,58 @@
+r"""
+=======================================================================================
+ ____          ____    __    ______     __________   __      __       __        __
+ \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+      \    \  |    |   ________________________________________________________________
+       \    \ |    |  |  ______________________________________________________________|
+        \    \|    |  |  |         __          __     __     __     ______      _______
+         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+
+  This file is part of VirtualFluids. VirtualFluids is free software: you can
+  redistribute it and/or modify it under the terms of the GNU General Public
+  License as published by the Free Software Foundation, either version 3 of
+  the License, or (at your option) any later version.
+
+  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+  for more details.
+
+  You should have received a copy of the GNU General Public License along
+  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+
+! \file settings.py
+! \ingroup Poiseuille
+! \author Sven Marcus, Henry Korb
+=======================================================================================
+"""
 import os
 from acousticscaling import OneDirectionalAcousticScaling
-from pyfluids.cpu.kernel import LBMKernel, KernelType
-from pyfluids.cpu.parameters import RuntimeParameters, GridParameters, PhysicalParameters
+from pyfluids import cpu
 
 
-grid_params = GridParameters()
+grid_params = cpu.parameters.GridParameters()
 grid_params.node_distance = 1
 grid_params.number_of_nodes_per_direction = [1, 1, 16]
 grid_params.blocks_per_direction = [1, 1, 4]
 grid_params.periodic_boundary_in_x1 = True
 grid_params.periodic_boundary_in_x2 = True
 
-physical_params = PhysicalParameters()
+physical_params = cpu.parameters.PhysicalParameters()
 physical_params.lattice_viscosity = 1e-4
 
-runtime_params = RuntimeParameters()
+runtime_params = cpu.parameters.RuntimeParameters()
 runtime_params.number_of_threads = int(os.environ["PYFLUIDS_NUM_THREADS"])
 runtime_params.number_of_timesteps = 4_000_000
 runtime_params.timestep_log_interval = 1_000_000
 
-kernel = LBMKernel(KernelType.CompressibleCumulantFourthOrderViscosity)
+kernel = cpu.kernel.LBMKernel(cpu.kernel.KernelType.CompressibleCumulantFourthOrderViscosity)
 kernel.use_forcing = True
 kernel.forcing_in_x1 = 5e-10
 
diff --git a/Python/SlurmTests/poiseuille/simulation_runner.py b/Python/SlurmTests/poiseuille/simulation_runner.py
index 03fb24be7ea1a6468ae25ec3aa40ab59962ef91e..d54a35e72b298562f8ccec82677089f3898eec9b 100644
--- a/Python/SlurmTests/poiseuille/simulation_runner.py
+++ b/Python/SlurmTests/poiseuille/simulation_runner.py
@@ -5,7 +5,7 @@ from poiseuille.simulation import run_simulation
 from pyfluids.cpu.writer import Writer, OutputFormat
 
 
-scale_level = int(os.environ["PYFLUIDS_SCALE_LEVEL"])
+scale_level = int(os.environ.get("PYFLUIDS_SCALE_LEVEL", 1))
 grid_params, physical_params, runtime_params, kernel = Scaling.configuration_for_scale_level(scale_level)
 
 writer = Writer()
diff --git a/Python/SlurmTests/poiseuille/slurm.job b/Python/SlurmTests/poiseuille/slurm.job
index 488fc9a42f261d69a8212cff389721fdfb9cbf6e..b4e4da271920479ade008b28d4d2e6ce6343c3d3 100644
--- a/Python/SlurmTests/poiseuille/slurm.job
+++ b/Python/SlurmTests/poiseuille/slurm.job
@@ -1,5 +1,6 @@
 #!/bin/bash
 #SBATCH -J PyFluidsTest
+#SBATCH -o poiseuille_test/POISEUILLE_TEST.out
 #SBATCH --nodes=1
 #SBATCH --ntasks-per-node=1
 #SBATCH --cpus-per-task=20
@@ -9,6 +10,9 @@
 #SBATCH --partition=standard
 
 source $HOME/.bashrc
+module load singularity/3.9.9
+
+cd poiseuille_test
 
 echo "PyFluids Poiseuille Test Case"
 echo "Number of tasks: ${SLURM_NTASKS}"
diff --git a/Python/acousticscaling.py b/Python/acousticscaling.py
index a664b8e924d648b680562b9aef11bee87b3562b1..7e71fed9fdd9f86415261ef4e22797021581f60c 100644
--- a/Python/acousticscaling.py
+++ b/Python/acousticscaling.py
@@ -1,22 +1,55 @@
-from pyfluids.cpu.kernel import LBMKernel
-from pyfluids.cpu.parameters import GridParameters, PhysicalParameters, RuntimeParameters
+r"""
+=======================================================================================
+ ____          ____    __    ______     __________   __      __       __        __
+ \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+      \    \  |    |   ________________________________________________________________
+       \    \ |    |  |  ______________________________________________________________|
+        \    \|    |  |  |         __          __     __     __     ______      _______
+         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+
+  This file is part of VirtualFluids. VirtualFluids is free software: you can
+  redistribute it and/or modify it under the terms of the GNU General Public
+  License as published by the Free Software Foundation, either version 3 of
+  the License, or (at your option) any later version.
+
+  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+  for more details.
+
+  You should have received a copy of the GNU General Public License along
+  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+
+! \file acousticscaling.py
+! \ingroup tests
+! \author Sven Marcus, Henry Korb
+=======================================================================================
+"""
+from pyfluids import cpu
 
 
 class OneDirectionalAcousticScaling:
 
-    def __init__(self, grid_parameters: GridParameters,
-                 physical_parameters: PhysicalParameters,
-                 runtime_parameters: RuntimeParameters,
-                 kernel: LBMKernel):
+    def __init__(self, grid_parameters: cpu.parameters.GridParameters,
+                 physical_parameters: cpu.parameters.PhysicalParameters,
+                 runtime_parameters: cpu.parameters.RuntimeParameters,
+                 kernel: cpu.kernel.LBMKernel):
         self._grid_params = grid_parameters
         self._physical_params = physical_parameters
         self._runtime_params = runtime_parameters
         self._kernel = kernel
 
-    def configuration_for_scale_level(self, level: int = 1) -> tuple[GridParameters,
-                                                                PhysicalParameters,
-                                                                RuntimeParameters,
-                                                                LBMKernel]:
+    def configuration_for_scale_level(self, level: int = 1) -> tuple[cpu.parameters.GridParameters,
+                                                                cpu.parameters.PhysicalParameters,
+                                                                cpu.parameters.RuntimeParameters,
+                                                                cpu.kernel.LBMKernel]:
         if level < 0:
             raise ValueError("level must be >= 0")
 
@@ -27,8 +60,8 @@ class OneDirectionalAcousticScaling:
 
         return grid_params, physical_params, runtime_params, kernel
 
-    def clone_grid_params_for_level(self, level) -> GridParameters:
-        grid_params = GridParameters()
+    def clone_grid_params_for_level(self, level) -> cpu.parameters.GridParameters:
+        grid_params = cpu.parameters.GridParameters()
         grid_params.reference_direction_index = self._grid_params.reference_direction_index
         grid_params.periodic_boundary_in_x1 = self._grid_params.periodic_boundary_in_x1
         grid_params.periodic_boundary_in_x2 = self._grid_params.periodic_boundary_in_x2
@@ -51,7 +84,7 @@ class OneDirectionalAcousticScaling:
         return grid_params
 
     def clone_physical_parameters(self, level):
-        physical_params = PhysicalParameters()
+        physical_params = cpu.parameters.PhysicalParameters()
         physical_params.lattice_viscosity = self._physical_params.lattice_viscosity
 
         if level > 0:
@@ -60,7 +93,7 @@ class OneDirectionalAcousticScaling:
         return physical_params
 
     def clone_runtime_params_for_level(self, level):
-        runtime_params = RuntimeParameters()
+        runtime_params = cpu.parameters.RuntimeParameters()
         runtime_params.number_of_timesteps = self._runtime_params.number_of_timesteps
         runtime_params.number_of_threads = self._runtime_params.number_of_threads
         runtime_params.timestep_log_interval = self._runtime_params.timestep_log_interval
@@ -71,7 +104,7 @@ class OneDirectionalAcousticScaling:
         return runtime_params
 
     def clone_kernel_for_level(self, level):
-        kernel = LBMKernel(self._kernel.type)
+        kernel = cpu.kernel.LBMKernel(self._kernel.type)
         kernel.use_forcing = self._kernel.use_forcing
         kernel.forcing_in_x1 = self._kernel.forcing_in_x1
         kernel.forcing_in_x2 = self._kernel.forcing_in_x2
diff --git a/Python/actuator_line/actuator_line.py b/Python/actuator_line/actuator_line.py
index 6e3c8608617df1267535984d53307dea9184c6ab..721af737ff6ef3340c3c2f6204aa6a7824cd1d2f 100644
--- a/Python/actuator_line/actuator_line.py
+++ b/Python/actuator_line/actuator_line.py
@@ -1,23 +1,48 @@
+r"""
+=======================================================================================
+ ____          ____    __    ______     __________   __      __       __        __
+ \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+      \    \  |    |   ________________________________________________________________
+       \    \ |    |  |  ______________________________________________________________|
+        \    \|    |  |  |         __          __     __     __     ______      _______
+         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+
+  This file is part of VirtualFluids. VirtualFluids is free software: you can
+  redistribute it and/or modify it under the terms of the GNU General Public
+  License as published by the Free Software Foundation, either version 3 of
+  the License, or (at your option) any later version.
+
+  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+  for more details.
+
+  You should have received a copy of the GNU General Public License along
+  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+
+! \file actuator_line.py
+! \ingroup actuator_line
+! \author Henry Korb, Henrik Asmuth
+=======================================================================================
+"""
 #%%
 import numpy as np
 from pathlib import Path
 from mpi4py import MPI
-from pyfluids import basics, gpu, logger
+from pyfluids.bindings import basics, gpu, logger
 #%%
-reference_diameter = 126
-
-length = np.array([29,6,6])*reference_diameter
-viscosity = 1.56e-5
-velocity = 9
-mach = 0.1
-nodes_per_diameter = 32
-
-sim_name = "ActuatorLine"
-config_file = Path(__file__).parent/Path("config.txt")
+sim_name = "ABL"
+config_file = Path(__file__).parent/"configActuatorLine.txt"
 output_path = Path(__file__).parent/Path("output")
 output_path.mkdir(exist_ok=True)
-t_out = 100.
-t_end = 500.
+
 
 #%%
 logger.Logger.initialize_logger()
@@ -25,87 +50,175 @@ basics.logger.Logger.add_stdout()
 basics.logger.Logger.set_debug_level(basics.logger.Level.INFO_LOW)
 basics.logger.Logger.time_stamp(basics.logger.TimeStamp.ENABLE)
 basics.logger.Logger.enable_printed_rank_numbers(True)
-# %%
-comm = gpu.Communicator.get_instance()
 #%%
 grid_factory = gpu.grid_generator.GridFactory.make()
 grid_builder = gpu.grid_generator.MultipleGridBuilder.make_shared(grid_factory)
+communicator = gpu.Communicator.get_instance()
 
-#%%
-dx = reference_diameter/nodes_per_diameter
-
-grid_builder.add_coarse_grid(0.0, 0.0, 0.0, *length, dx)
-grid_builder.set_periodic_boundary_condition(False, False, False)
-grid_builder.build_grids(basics.LbmOrGks.LBM, False)
-#%%
 config = basics.ConfigurationFile()
 config.load(str(config_file))
+
+para = gpu.Parameter(communicator.get_number_of_process(), communicator.get_pid(), config)
+bc_factory = gpu.BoundaryConditionFactory()
+
 #%%
-para = gpu.Parameter(config, comm.get_number_of_process(), comm.get_pid())
+turbine_diameter = config.get_float_value("turbineDiameter", 126)
+boundary_layer_height = config.get_float_value("boundaryLayerHeight", 1000)
+z0 = config.get_float_value("z0", 0.1)
+u_star = config.get_float_value("u_star", 0.4)
+
+kappa = config.get_float_value("vonKarmanConstant", 0.4) # von Karman constant
+
+viscosity = config.get_float_value("viscosity", 1.56e-5)
+
+velocity  = 0.5*u_star/kappa*np.log(boundary_layer_height/z0+1) #0.5 times max mean velocity at the top in m/s
+
+mach = config.get_float_value("Ma", 0.1)
+nodes_per_height = config.get_uint_value("nz", 64)
+
+
+turb_pos = np.array([3,3,3])*turbine_diameter
+epsilon = config.get_float_value("SmearingWidth", 5)
+density = config.get_float_value("Density", 1.225)
+level = 0
+n_blades = 3
+n_blade_nodes = config.get_int_value("NumberOfNodesPerAL", 32)
+
+read_precursor = config.get_bool_value("readPrecursor", False)
+
+if read_precursor:
+    nTReadPrecursor = config.get_int_value("nTimestepsReadPrecursor")
+    use_distributions = config.get_bool_value("useDistributions", False)
+    precursor_directory = config.get_string_value("precursorDirectory")
+
+# all in s
+t_start_out   = config.get_float_value("tStartOut")
+t_out        = config.get_float_value("tOut")
+t_end        = config.get_float_value("tEnd") # total time of simulation
 
+t_start_averaging     =  config.get_float_value("tStartAveraging")
+t_start_tmp_averaging  =  config.get_float_value("tStartTmpAveraging")
+t_averaging          =  config.get_float_value("tAveraging")
+t_start_out_probe      =  config.get_float_value("tStartOutProbe")
+t_out_probe           =  config.get_float_value("tOutProbe")
+
+#%%
+length = np.array([6,4,1])*boundary_layer_height
+dx = boundary_layer_height/nodes_per_height
 dt = dx * mach / (np.sqrt(3) * velocity)
-velocity_lb = velocity * dt / dx # LB units
-viscosity_lb = viscosity * dt / (dx * dx) # LB units
+velocity_ratio = dx/dt
+velocity_LB = velocity / velocity_ratio # LB units
+viscosity_LB = viscosity / (velocity_ratio * dx) # LB units
+pressure_gradient = u_star * u_star / boundary_layer_height
+pressure_gradient_LB = pressure_gradient * (dt*dt)/dx
+
+logger.vf_log_info(f"velocity  [dx/dt] = {velocity_LB}")
+logger.vf_log_info(f"dt   = {dt}")
+logger.vf_log_info(f"dx   = {dx}")
+logger.vf_log_info(f"viscosity [10^8 dx^2/dt] = {viscosity_LB*1e8}")
+logger.vf_log_info(f"u* /(dx/dt) = {u_star*dt/dx}")
+logger.vf_log_info(f"dpdx  = {pressure_gradient}")
+logger.vf_log_info(f"dpdx /(dx/dt^2) = {pressure_gradient_LB}")
+
 
 #%%
-para.set_devices([0])
 para.set_output_prefix(sim_name)
-para.set_output_path(str(output_path))
-para.set_f_name(para.get_output_path() + "/" + para.get_output_prefix())
 para.set_print_files(True)
-para.set_max_level(1)
-#%%
-para.set_velocity(velocity_lb)
-para.set_viscosity(viscosity_lb)    
+
+para.set_forcing(pressure_gradient_LB, 0, 0)
+para.set_velocity_LB(velocity_LB)
+para.set_viscosity_LB(viscosity_LB)    
 para.set_velocity_ratio(dx/dt)
 para.set_viscosity_ratio(dx*dx/dt)
-para.set_main_kernel("TurbulentViscosityCumulantK17CompChim")
-para.set_use_AMD(True)
-para.set_SGS_constant(0.083)
+para.set_density_ratio(1.0)
 
-def init_func(coord_x, coord_y, coord_z):
-    return [0.0, velocity_lb, 0.0, 0.0]
+para.set_main_kernel("CumulantK17")
 
-para.set_initial_condition(init_func)
-para.set_t_out(int(t_out/dt))
-para.set_t_end(int(t_end/dt))
+para.set_timestep_start_out(int(t_start_out/dt))
+para.set_timestep_out(int(t_out/dt))
+para.set_timestep_end(int(t_end/dt))
 para.set_is_body_force(True)
-
 #%%
-grid_builder.set_velocity_boundary_condition(gpu.SideType.MX, velocity_lb, 0.0, 0.0)
-
-grid_builder.set_velocity_boundary_condition(gpu.SideType.MY, velocity_lb, 0.0, 0.0)
-grid_builder.set_velocity_boundary_condition(gpu.SideType.PY, velocity_lb, 0.0, 0.0)
-
-grid_builder.set_velocity_boundary_condition(gpu.SideType.MZ, velocity_lb, 0.0, 0.0)
-grid_builder.set_velocity_boundary_condition(gpu.SideType.PZ, velocity_lb, 0.0, 0.0)
+tm_factory = gpu.TurbulenceModelFactory(para)
+tm_factory.read_config_file(config)
+#%%
+grid_scaling_factory = gpu.GridScalingFactory()
+grid_scaling_factory.set_scaling_factory(gpu.GridScaling.ScaleCompressible)
 
-grid_builder.set_pressure_boundary_condition(gpu.SideType.PX, 0.0)
+grid_builder.add_coarse_grid(0.0, 0.0, 0.0, *length, dx)
+grid_builder.set_periodic_boundary_condition(not read_precursor, True, False)
+grid_builder.build_grids(basics.LbmOrGks.LBM, False)
 
+sampling_offset = 2
+if read_precursor:
+    precursor = gpu.create_file_collection(precursor_directory + "/precursor", gpu.FileType.VTK)
+    grid_builder.set_precursor_boundary_condition(gpu.SideType.MX, precursor, nTReadPrecursor, 0, 0, 0)
+
+grid_builder.set_stress_boundary_condition(gpu.SideType.MZ, 0, 0, 1, sampling_offset, z0, dx)
+para.set_has_wall_model_monitor(True)
+grid_builder.set_slip_boundary_condition(gpu.SideType.PZ, 0, 0, -1)
+
+if read_precursor:
+    grid_builder.set_pressure_boundary_condition(gpu.SideType.PX, 0)
+bc_factory.set_stress_boundary_condition(gpu.StressBC.StressPressureBounceBack)
+bc_factory.set_slip_boundary_condition(gpu.SlipBC.SlipBounceBack) 
+bc_factory.set_pressure_boundary_condition(gpu.PressureBC.OutflowNonReflective)
+if read_precursor:
+    bc_factory.set_precursor_boundary_condition(gpu.PrecursorBC.DistributionsPrecursor if use_distributions else gpu.PrecursorBC.VelocityPrecursor)
+para.set_outflow_pressure_correction_factor(0.0); 
 #%%
-cuda_memory_manager = gpu.CudaMemoryManager(para)
-grid_generator = gpu.GridProvider.make_grid_generator(grid_builder, para, cuda_memory_manager, comm)
+# don't use python init functions, they are very slow! Just kept as an example.
+# Define lambda in bindings and set it here.
+# def init_func(coord_x, coord_y, coord_z):
+#     return [
+#         0.0, 
+#         (u_star/0.4 * np.log(np.maximum(coord_z,z0)/z0) + 2.0*np.sin(np.pi*16*coord_x/length[0])*np.sin(np.pi*8*coord_z/boundary_layer_height)/(np.square(coord_z/boundary_layer_height)+1))  * dt / dx, 
+#         2.0*np.sin(np.pi*16.*coord_x/length[0])*np.sin(np.pi*8.*coord_z/boundary_layer_height)/(np.square(coord_z/boundary_layer_height)+1.)  * dt / dx, 
+#         8.0*u_star/0.4*(np.sin(np.pi*8.0*coord_y/boundary_layer_height)*np.sin(np.pi*8.0*coord_z/boundary_layer_height)+np.sin(np.pi*8.0*coord_x/length[0]))/(np.square(length[2]/2.0-coord_z)+1.) * dt / dx]
+# para.set_initial_condition(init_func)
+para.set_initial_condition_perturbed_log_law(u_star, z0, length[0], length[2], boundary_layer_height, velocity_ratio)
+
 #%%
-turb_pos = np.array([3,3,3])*reference_diameter
-epsilon = 5
+turb_pos = np.array([3,3,3])*turbine_diameter
+epsilon = 1.5*dx
 density = 1.225
 level = 0
 n_blades = 3
 n_blade_nodes = 32
-alm = gpu.ActuatorLine(n_blades, density, n_blade_nodes, epsilon, *turb_pos, reference_diameter, level, dt, dx)
+omega = 1
+blade_radii = np.arange(n_blade_nodes, dtype=np.float32)/(0.5*turbine_diameter)
+alm = gpu.ActuatorFarm(n_blades, density, n_blade_nodes, epsilon, level, dt, dx, True)
+alm.add_turbine(turb_pos[0],turb_pos[1],turb_pos[2], turbine_diameter, omega, 0, 0, blade_radii)
 para.add_actuator(alm)
 #%%
-point_probe = gpu.probes.PointProbe("pointProbe", str(output_path), 100, 1, 500, 100)
-point_probe.add_probe_points_from_list(np.array([1,2,5])*reference_diameter, np.array([3,3,3])*reference_diameter, np.array([3,3,3])*reference_diameter)
-point_probe.add_statistic(gpu.probes.Statistic.Means)
-
-para.add_probe(point_probe)
-
-plane_probe = gpu.probes.PlaneProbe("planeProbe", str(output_path), 100, 1, 500, 100)
-plane_probe.set_probe_plane(5*reference_diameter, 0, 0, dx, length[1], length[2])
-para.add_probe(plane_probe)
+planar_average_probe = gpu.probes.PlanarAverageProbe("horizontalPlanes", para.get_output_path(), 0, int(t_start_tmp_averaging/dt), int(t_averaging/dt) , int(t_start_out_probe/dt), int(t_out_probe/dt), 'z')
+planar_average_probe.add_all_available_statistics()
+planar_average_probe.set_file_name_to_n_out()
+para.add_probe(planar_average_probe)
 #%%
-sim = gpu.Simulation(para, cuda_memory_manager, comm, grid_generator)
+wall_model_probe = gpu.probes.WallModelProbe("wallModelProbe", para.get_output_path(), 0, int(t_start_tmp_averaging/dt), int(t_averaging/dt/4), int(t_start_out_probe/dt), int(t_out_probe/dt))
+wall_model_probe.add_all_available_statistics()
+wall_model_probe.set_file_name_to_n_out()
+wall_model_probe.set_force_output_to_stress(True)
+if para.get_is_body_force():
+    wall_model_probe.set_evaluate_pressure_gradient(True)
+para.add_probe(wall_model_probe)
+
+plane_locs = [100,]
+if read_precursor: plane_locs.extend([1000, 1500, 2000, 2500, 0])
+
+for n_probe, probe_pos in enumerate(plane_locs):
+    plane_probe = gpu.probes.PlaneProbe(f"planeProbe_{n_probe+1}", para.get_output_path(), int(t_start_averaging/dt), 10, int(t_start_out_probe/dt), int(t_out_probe/dt))
+    plane_probe.set_probe_plane(probe_pos, 0, 0, dx, length[1], length[2])
+    plane_probe.add_all_available_statistics()
+    para.add_probe(plane_probe)
+#%%
+cuda_memory_manager = gpu.CudaMemoryManager(para)
+grid_generator = gpu.GridProvider.make_grid_generator(grid_builder, para, cuda_memory_manager, communicator)
+#%%
+#%%
+sim = gpu.Simulation(para, cuda_memory_manager, communicator, grid_generator, bc_factory, tm_factory, grid_scaling_factory)
 #%%
 sim.run()
-MPI.Finalize()
\ No newline at end of file
+MPI.Finalize()
+
diff --git a/Python/actuator_line/config.txt b/Python/actuator_line/config.txt
deleted file mode 100644
index e4c778c4cc048f54c0a32310e6bf4a7343a263fa..0000000000000000000000000000000000000000
--- a/Python/actuator_line/config.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-Path = .
-GridPath = .
diff --git a/Python/actuator_line/configActuatorLine.txt b/Python/actuator_line/configActuatorLine.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c45d170f039274ab355f3fe1dc044536f1f29e6f
--- /dev/null
+++ b/Python/actuator_line/configActuatorLine.txt
@@ -0,0 +1,39 @@
+##################################################
+#information for writing
+##################################################
+Path = .
+##################################################
+#information for reading
+##################################################
+GridPath = .
+##################################################
+Devices = 0 
+##################################################
+tStartOut           = 0
+tOut                = 100000
+tEnd                = 300000
+##################################################
+tStartAveraging     = 0
+tStartTmpAveraging  = 100000
+tAveraging          = 200
+tStartOutProbe      = 0
+tOutProbe           = 1000 
+##################################################
+Ma = 0.1
+nz = 96 
+
+bodyForce = true
+SGSconstant = 0.333
+TurbulenceModel = QR
+
+QuadricLimiterP = 100000.0
+QuadricLimiterM = 100000.0
+QuadricLimiterD = 100000.0
+
+##################################################
+readPrecursor = false
+nTimestepsReadPrecursor = 1
+precursorFile = precursor/Precursor
+
+##################################################
+turbineDiameter = 126.0
diff --git a/Python/boundary_layer/boundary_layer.py b/Python/boundary_layer/boundary_layer.py
index 1c01f50946b49bc0ddab7e50065a24aab4ae869f..6f6c64bc072d3afbb8aa5febbec209c26af2deee 100644
--- a/Python/boundary_layer/boundary_layer.py
+++ b/Python/boundary_layer/boundary_layer.py
@@ -1,37 +1,48 @@
+r"""
+=======================================================================================
+ ____          ____    __    ______     __________   __      __       __        __
+ \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+      \    \  |    |   ________________________________________________________________
+       \    \ |    |  |  ______________________________________________________________|
+        \    \|    |  |  |         __          __     __     __     ______      _______
+         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+
+  This file is part of VirtualFluids. VirtualFluids is free software: you can
+  redistribute it and/or modify it under the terms of the GNU General Public
+  License as published by the Free Software Foundation, either version 3 of
+  the License, or (at your option) any later version.
+
+  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+  for more details.
+
+  You should have received a copy of the GNU General Public License along
+  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+
+! \file boundary_layer.py
+! \ingroup boundary_layer
+! \author Henry Korb, Henrik Asmuth
+=======================================================================================
+"""
 #%%
 import numpy as np
 from pathlib import Path
 from mpi4py import MPI
 from pyfluids import basics, gpu, logger
 #%%
-reference_height = 1000 # boundary layer height in m
-
-length = np.array([6,4,1])*reference_height
-viscosity = 1.56e-5
-mach = 0.1
-nodes_per_height = 32
-
-z_0 = 0.1
-u_star = 0.4
-kappa = 0.4
-
-velocity = 0.5*u_star/kappa*np.log(length[2]/z_0+1)
-flow_through_time = length[0]/velocity
-use_AMD = True
-
-
-sim_name = "BoundaryLayer"
-config_file = Path(__file__).parent/Path("config.txt")
+sim_name = "ABL"
+config_file = Path(__file__).parent/"configBoundaryLayer.txt"
 output_path = Path(__file__).parent/Path("output")
 output_path.mkdir(exist_ok=True)
-t_out = 1000.
-t_end = 5000.
 
-t_start_averaging = 0
-t_start_tmp_averaging =  100_000
-t_averaging = 200
-t_start_out_probe = 0
-t_out_probe = 1000
 
 #%%
 logger.Logger.initialize_logger()
@@ -39,95 +50,161 @@ basics.logger.Logger.add_stdout()
 basics.logger.Logger.set_debug_level(basics.logger.Level.INFO_LOW)
 basics.logger.Logger.time_stamp(basics.logger.TimeStamp.ENABLE)
 basics.logger.Logger.enable_printed_rank_numbers(True)
-# %%
-comm = gpu.Communicator.get_instance()
 #%%
 grid_factory = gpu.grid_generator.GridFactory.make()
 grid_builder = gpu.grid_generator.MultipleGridBuilder.make_shared(grid_factory)
+communicator = gpu.Communicator.get_instance()
+
+config = basics.ConfigurationFile()
+config.load(str(config_file))
+
+para = gpu.Parameter(communicator.get_number_of_process(), communicator.get_pid(), config)
+bc_factory = gpu.BoundaryConditionFactory()
 
 #%%
-dx = reference_height/nodes_per_height
-dt = dx * mach / (np.sqrt(3) * velocity)
-velocity_lb = velocity * dt / dx # LB units
-viscosity_lb = viscosity * dt / (dx * dx) # LB units
+boundary_layer_height = config.get_float_value("boundaryLayerHeight", 1000)
+z0 = config.get_float_value("z0", 0.1)
+u_star = config.get_float_value("u_star", 0.4)
 
-pressure_gradient = u_star**2 / reference_height
-pressure_gradient_lb = pressure_gradient * dt**2 / dx
+kappa = config.get_float_value("vonKarmanConstant", 0.4) # von Karman constant
 
-logger.vf_log_info(f"velocity    = {velocity_lb:1.6} dx/dt")
-logger.vf_log_info(f"dt          = {dt:1.6}")
-logger.vf_log_info(f"dx          = {dx:1.6}")
-logger.vf_log_info(f"u*          = {u_star:1.6}")
-logger.vf_log_info(f"dpdx        = {pressure_gradient:1.6}")
-logger.vf_log_info(f"dpdx        = {pressure_gradient_lb:1.6} dx/dt^2")
-logger.vf_log_info(f"viscosity   = {viscosity_lb:1.6} dx^2/dt")
+viscosity = config.get_float_value("viscosity", 1.56e-5)
 
+velocity  = 0.5*u_star/kappa*np.log(boundary_layer_height/z0+1) #0.5 times max mean velocity at the top in m/s
 
-#%%
-config = basics.ConfigurationFile()
-config.load(str(config_file))
-#%%
-para = gpu.Parameter(config, comm.get_number_of_process(), comm.get_pid())
+mach = config.get_float_value("Ma", 0.1)
+nodes_per_height = config.get_uint_value("nz", 64)
+
+
+
+write_precursor = config.get_bool_value("writePrecursor", False)  # key name as defined in configBoundaryLayer.txt
+read_precursor = config.get_bool_value("readPrecursor", False)
+
+if write_precursor:
+    nTWritePrecursor      = config.get_int_value("nTimestepsWritePrecursor")
+    t_start_precursor      = config.get_float_value("tStartPrecursor")
+    pos_x_precursor        = config.get_float_value("posXPrecursor")
 
+if read_precursor:
+    nTReadPrecursor = config.get_int_value("nTimestepsReadPrecursor")
 
+if write_precursor or read_precursor:
+    use_distributions = config.get_bool_value("useDistributions", False)
+    precursor_directory = config.get_string_value("precursorDirectory")
+
+# all in s
+t_start_out   = config.get_float_value("tStartOut")
+t_out        = config.get_float_value("tOut")
+t_end        = config.get_float_value("tEnd") # total time of simulation
+
+t_start_averaging     =  config.get_float_value("tStartAveraging")
+t_start_tmp_averaging  =  config.get_float_value("tStartTmpAveraging")
+t_averaging          =  config.get_float_value("tAveraging")
+t_start_out_probe      =  config.get_float_value("tStartOutProbe")
+t_out_probe           =  config.get_float_value("tOutProbe")
+
+#%%
+length = np.array([6,4,1])*boundary_layer_height
+dx = boundary_layer_height/nodes_per_height
+dt = dx * mach / (np.sqrt(3) * velocity)
+velocity_LB = velocity * dt / dx # LB units
+viscosity_LB = viscosity * dt / (dx * dx) # LB units
+pressure_gradient = u_star * u_star / boundary_layer_height
+pressure_gradient_LB = pressure_gradient * (dt*dt)/dx
+
+logger.vf_log_info(f"velocity  [dx/dt] = {velocity_LB}")
+logger.vf_log_info(f"dt   = {dt}")
+logger.vf_log_info(f"dx   = {dx}")
+logger.vf_log_info(f"viscosity [10^8 dx^2/dt] = {viscosity_LB*1e8}")
+logger.vf_log_info(f"u* /(dx/dt) = {u_star*dt/dx}")
+logger.vf_log_info(f"dpdx  = {pressure_gradient}")
+logger.vf_log_info(f"dpdx /(dx/dt^2) = {pressure_gradient_LB}")
+    
+#%%
 
 #%%
-para.set_devices([0])
 para.set_output_prefix(sim_name)
-para.set_output_path(str(output_path))
-para.set_f_name(para.get_output_path() + "/" + para.get_output_prefix())
 para.set_print_files(True)
-para.set_max_level(1)
-#%%
-para.set_velocity(velocity_lb)
-para.set_viscosity(viscosity_lb)    
+
+para.set_forcing(pressure_gradient_LB, 0, 0)
+para.set_velocity_LB(velocity_LB)
+para.set_viscosity_LB(viscosity_LB)    
 para.set_velocity_ratio(dx/dt)
 para.set_viscosity_ratio(dx*dx/dt)
-para.set_use_AMD(use_AMD)
+para.set_density_ratio(1.0)
+
+para.set_main_kernel("CumulantK17")
 
-para.set_main_kernel("TurbulentViscosityCumulantK17CompChim" if para.get_use_AMD() else "CummulantK17CompChim")
+para.set_timestep_start_out(int(t_start_out/dt))
+para.set_timestep_out(int(t_out/dt))
+para.set_timestep_end(int(t_end/dt))
+para.set_is_body_force(config.get_bool_value("bodyForce"))
+#%%
+tm_factory = gpu.TurbulenceModelFactory(para)
+tm_factory.read_config_file(config)
+#%%
+grid_builder.add_coarse_grid(0.0, 0.0, 0.0, *length, dx)
+grid_builder.set_periodic_boundary_condition(not read_precursor, True, False)
+grid_builder.build_grids(basics.LbmOrGks.LBM, False)
 
-para.set_SGS_constant(0.083)
+sampling_offset = 2
+if read_precursor:
+    precursor = gpu.create_file_collection(precursor_directory + "/precursor", gpu.FileType.VTK)
+    grid_builder.set_precursor_boundary_condition(gpu.SideType.MX, precursor, nTReadPrecursor, 0, 0, 0)
 
+grid_builder.set_stress_boundary_condition(gpu.SideType.MZ, 0, 0, 1, sampling_offset, z0/dx)  # NOTE(review): actuator_line.py passes z0 and dx as two separate arguments to this call - confirm which signature the bindings expect
+para.set_has_wall_model_monitor(True)
+grid_builder.set_slip_boundary_condition(gpu.SideType.PZ, 0, 0, -1)
+
+if read_precursor:
+    grid_builder.set_pressure_boundary_condition(gpu.SideType.PX, 0)
+bc_factory.set_stress_boundary_condition(gpu.StressBC.StressPressureBounceBack)
+bc_factory.set_slip_boundary_condition(gpu.SlipBC.SlipBounceBack) 
+bc_factory.set_pressure_boundary_condition(gpu.PressureBC.OutflowNonReflective)
+if write_precursor or read_precursor: bc_factory.set_precursor_boundary_condition(gpu.PrecursorBC.DistributionsPrecursor if use_distributions else gpu.PrecursorBC.VelocityPrecursor)  # use_distributions is only defined when a precursor is read or written
+para.set_outflow_pressure_correction_factor(0.0); 
+#%%
 def init_func(coord_x, coord_y, coord_z):
     return [
         0.0, 
-        (u_star/kappa*np.log(max(coord_z/z_0,0)+1) + 2*np.sin(np.pi*16*coord_x/length[0])*np.sin(np.pi*8*coord_z/length[2]))/((coord_z/reference_height)**2+0.1)*dt/dx, 
-        2*np.sin(np.pi*16*coord_x/length[0])*np.sin(np.pi*8*coord_z/length[2])/((coord_z/reference_height)**2+0.1)*dt/dx, 
-        8*u_star/kappa*(np.sin(np.pi*8*coord_y/reference_height)*np.sin(np.pi*8*coord_z/reference_height)+np.sin(np.pi*8*coord_x/length[0]))/((length[2]/2-coord_z)**2+0.1)*dt/dx
-        ]
-
+        (u_star/0.4 * np.log(np.maximum(coord_z,z0)/z0) + 2.0*np.sin(np.pi*16*coord_x/length[0])*np.sin(np.pi*8*coord_z/boundary_layer_height)/(np.square(coord_z/boundary_layer_height)+1))  * dt / dx, 
+        2.0*np.sin(np.pi*16.*coord_x/length[0])*np.sin(np.pi*8.*coord_z/boundary_layer_height)/(np.square(coord_z/boundary_layer_height)+1.)  * dt / dx, 
+        8.0*u_star/0.4*(np.sin(np.pi*8.0*coord_y/boundary_layer_height)*np.sin(np.pi*8.0*coord_z/boundary_layer_height)+np.sin(np.pi*8.0*coord_x/length[0]))/(np.square(length[2]/2.0-coord_z)+1.) * dt / dx]
 para.set_initial_condition(init_func)
-para.set_t_out(int(t_out/dt))
-para.set_t_end(int(t_end/dt))
-para.set_is_body_force(True)
-para.set_has_wall_model_monitor(True)
 
-
-grid_builder.add_coarse_grid(0.0, 0.0, 0.0, *length, dx)
-grid_builder.set_periodic_boundary_condition(True, True, False)
-grid_builder.build_grids(basics.LbmOrGks.LBM, False)
 #%%
-sampling_offset = 2
-grid_builder.set_stress_boundary_condition(gpu.SideType.MZ, 0.0, 0.0, 1.0, sampling_offset, z_0/dx)
-grid_builder.set_slip_boundary_condition(gpu.SideType.PZ, 0.0, 0.0, 0.0)
+planar_average_probe = gpu.probes.PlanarAverageProbe("horizontalPlanes", para.get_output_path(), 0, int(t_start_tmp_averaging/dt), int(t_averaging/dt) , int(t_start_out_probe/dt), int(t_out_probe/dt), 'z')
+planar_average_probe.add_all_available_statistics()
+planar_average_probe.set_file_name_to_n_out()
+para.add_probe(planar_average_probe)
+#%%
+wall_model_probe = gpu.probes.WallModelProbe("wallModelProbe", para.get_output_path(), 0, int(t_start_tmp_averaging/dt), int(t_averaging/dt/4), int(t_start_out_probe/dt), int(t_out_probe/dt))
+wall_model_probe.add_all_available_statistics()
+wall_model_probe.set_file_name_to_n_out()
+wall_model_probe.set_force_output_to_stress(True)
+if para.get_is_body_force():
+    wall_model_probe.set_evaluate_pressure_gradient(True)
+para.add_probe(wall_model_probe)
+
+plane_locs = [100,]
+if read_precursor: plane_locs.extend([1000, 1500, 2000, 2500, 0])
+
+for n_probe, probe_pos in enumerate(plane_locs):
+    plane_probe = gpu.probes.PlaneProbe(f"planeProbe_{n_probe+1}", para.get_output_path(), int(t_start_averaging/dt), 10, int(t_start_out_probe/dt), int(t_out_probe/dt))
+    plane_probe.set_probe_plane(probe_pos, 0, 0, dx, length[1], length[2])
+    plane_probe.add_all_available_statistics()
+    para.add_probe(plane_probe)
+
+if write_precursor:
+    precursor_writer = gpu.PrecursorWriter("precursor", para.get_output_path() + precursor_directory, pos_x_precursor, 0,length[1], 0, length[2], t_start_precursor/dt, nTWritePrecursor, gpu.OutputVariable.Distributions if use_distributions else gpu.OutputVariable.Velocities)
+    para.add_probe(precursor_writer)
 
 #%%
 cuda_memory_manager = gpu.CudaMemoryManager(para)
-grid_generator = gpu.GridProvider.make_grid_generator(grid_builder, para, cuda_memory_manager, comm)
-
+grid_generator = gpu.GridProvider.make_grid_generator(grid_builder, para, cuda_memory_manager, communicator)
 #%%
-wall_probe = gpu.probes.WallModelProbe("wallModelProbe", str(output_path), int(t_start_averaging/dt), int(t_start_tmp_averaging/dt), int(t_averaging/dt/4), int(t_start_out_probe/dt), int(t_out_probe/dt))
-wall_probe.add_all_available_statistics()
-wall_probe.set_file_name_to_n_out()
-wall_probe.set_force_output_to_stress(True)
-if para.get_is_body_force():
-    wall_probe.set_evaluate_pressure_gradient(True)
-planar_probe = gpu.probes.PlanarAverageProbe("planarAverageProbe", str(output_path), int(t_start_averaging/dt), int(t_start_tmp_averaging/dt), int(t_averaging/dt), int(t_start_out_probe/dt), int(t_out_probe/dt), "z")
-para.add_probe(wall_probe)
-
 #%%
-sim = gpu.Simulation(para, cuda_memory_manager, comm, grid_generator)
+sim = gpu.Simulation(para, cuda_memory_manager, communicator, grid_generator, bc_factory, tm_factory)
 #%%
 sim.run()
 MPI.Finalize()
\ No newline at end of file
diff --git a/Python/boundary_layer/config.txt b/Python/boundary_layer/config.txt
deleted file mode 100644
index e4c778c4cc048f54c0a32310e6bf4a7343a263fa..0000000000000000000000000000000000000000
--- a/Python/boundary_layer/config.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-Path = .
-GridPath = .
diff --git a/Python/boundary_layer/configBoundaryLayer.txt b/Python/boundary_layer/configBoundaryLayer.txt
new file mode 100644
index 0000000000000000000000000000000000000000..83e7861a5fb85ea800d187699f1c6c1409422f0a
--- /dev/null
+++ b/Python/boundary_layer/configBoundaryLayer.txt
@@ -0,0 +1,42 @@
+##################################################
+#information for writing
+##################################################
+Path = .
+##################################################
+#information for reading
+##################################################
+GridPath = .
+##################################################
+Devices = 0 
+##################################################
+tStartOut           = 0
+tOut                = 100000
+tEnd                = 300000
+##################################################
+tStartAveraging     = 0
+tStartTmpAveraging  = 100000
+tAveraging          = 200
+tStartOutProbe      = 0
+tOutProbe           = 1000 
+##################################################
+Ma = 0.1
+nz = 96 
+
+bodyForce = true
+UseAMD = true
+SGSconstant = 0.2
+QuadricLimiterP = 100000.0
+QuadricLimiterM = 100000.0
+QuadricLimiterD = 100000.0
+
+##################################################
+readPrecursor = false
+nTimestepsReadPrecursor = 10
+precursorFile = precursor/Precursor
+
+##################################################
+writePrecursor = false
+nTimestepsWritePrecursor = 10
+
+tStartPrecursor = 100
+posXPrecursor = 3000
\ No newline at end of file
diff --git a/Python/cubeflow/simulation.py b/Python/cubeflow/simulation.py
index 9e77e8d747c072188d8d81150afa8e2ccb76a792..deb0411963aec65522af45cc48d7367f103232c6 100644
--- a/Python/cubeflow/simulation.py
+++ b/Python/cubeflow/simulation.py
@@ -1,13 +1,42 @@
-from pyfluids.cpu import Simulation
-from pyfluids.cpu.boundaryconditions import NoSlipBoundaryCondition, VelocityBoundaryCondition, DensityBoundaryCondition
-from pyfluids.cpu.geometry import GbCuboid3D
-from pyfluids.cpu.kernel import LBMKernel, KernelType
-from pyfluids.cpu.parameters import PhysicalParameters, RuntimeParameters, GridParameters
-from pyfluids.cpu.writer import Writer, OutputFormat
-from pymuparser import Parser
-
+r"""
+=======================================================================================
+ ____          ____    __    ______     __________   __      __       __        __
+ \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+      \    \  |    |   ________________________________________________________________
+       \    \ |    |  |  ______________________________________________________________|
+        \    \|    |  |  |         __          __     __     __     ______      _______
+         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+
+  This file is part of VirtualFluids. VirtualFluids is free software: you can
+  redistribute it and/or modify it under the terms of the GNU General Public
+  License as published by the Free Software Foundation, either version 3 of
+  the License, or (at your option) any later version.
+
+  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+  for more details.
+
+  You should have received a copy of the GNU General Public License along
+  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+
+! \file simulation.py
+! \ingroup cubeflow
+! \author Sven Marcus, Henry Korb
+=======================================================================================
+"""
 import os
 
+from pyfluids import cpu
+from pymuparser import Parser
+
 
 def get_max_length(number_of_nodes_per_direction, delta_x):
     return (number_of_nodes_per_direction[0] * delta_x,
@@ -15,10 +44,10 @@ def get_max_length(number_of_nodes_per_direction, delta_x):
             number_of_nodes_per_direction[2] * delta_x)
 
 
-physical_params = PhysicalParameters()
+physical_params = cpu.parameters.PhysicalParameters()
 physical_params.lattice_viscosity = 0.005
 
-grid_params = GridParameters()
+grid_params = cpu.parameters.GridParameters()
 grid_params.number_of_nodes_per_direction = [200, 120, 120]
 grid_params.blocks_per_direction = [2, 2, 2]
 grid_params.node_distance = 0.125
@@ -26,7 +55,7 @@ grid_params.periodic_boundary_in_x1 = False
 grid_params.periodic_boundary_in_x2 = True
 grid_params.periodic_boundary_in_x3 = True
 
-runtime_params = RuntimeParameters()
+runtime_params = cpu.parameters.RuntimeParameters()
 runtime_params.timestep_log_interval = 1000
 runtime_params.number_of_timesteps = 50000
 runtime_params.number_of_threads = int(os.environ.get("OMP_NUM_THREADS", 4))
@@ -39,46 +68,46 @@ def run_simulation(physical_parameters=physical_params, grid_parameters=grid_par
     min_x, min_y, min_z = 0, 0, 0
     max_x, max_y, max_z = get_max_length(grid_parameters.number_of_nodes_per_direction, grid_parameters.node_distance)
 
-    bottom_wall = GbCuboid3D(min_x - wall_thickness, min_y - wall_thickness, min_z, max_x + wall_thickness,
+    bottom_wall = cpu.geometry.GbCuboid3D(min_x - wall_thickness, min_y - wall_thickness, min_z, max_x + wall_thickness,
                              max_y + wall_thickness, min_z - wall_thickness)
 
-    top_wall = GbCuboid3D(min_x - wall_thickness, min_y - wall_thickness, max_z, max_x + wall_thickness,
+    top_wall = cpu.geometry.GbCuboid3D(min_x - wall_thickness, min_y - wall_thickness, max_z, max_x + wall_thickness,
                           max_y + wall_thickness,
                           max_z + wall_thickness)
 
-    left_wall = GbCuboid3D(min_x - wall_thickness, min_y, min_z - wall_thickness, max_x + wall_thickness,
+    left_wall = cpu.geometry.GbCuboid3D(min_x - wall_thickness, min_y, min_z - wall_thickness, max_x + wall_thickness,
                            min_y - wall_thickness,
                            max_z + wall_thickness)
 
-    right_wall = GbCuboid3D(min_x - wall_thickness, max_y, min_z - wall_thickness, max_x + wall_thickness,
+    right_wall = cpu.geometry.GbCuboid3D(min_x - wall_thickness, max_y, min_z - wall_thickness, max_x + wall_thickness,
                             max_y + wall_thickness, max_z + wall_thickness)
 
-    obstacle = GbCuboid3D(7, 7, 7, 8, 8, 8)
+    obstacle = cpu.geometry.GbCuboid3D(7, 7, 7, 8, 8, 8)
 
-    velocity_boundary = GbCuboid3D(min_x - wall_thickness, min_y - wall_thickness, min_z - wall_thickness, min_x,
+    velocity_boundary = cpu.geometry.GbCuboid3D(min_x - wall_thickness, min_y - wall_thickness, min_z - wall_thickness, min_x,
                                    max_y + wall_thickness, max_z + wall_thickness)
 
-    outflow_boundary = GbCuboid3D(max_x, min_y - wall_thickness, min_z - wall_thickness, max_x + wall_thickness,
+    outflow_boundary = cpu.geometry.GbCuboid3D(max_x, min_y - wall_thickness, min_z - wall_thickness, max_x + wall_thickness,
                                   max_y + wall_thickness, max_z + wall_thickness)
 
-    no_slip_bc = NoSlipBoundaryCondition()
+    no_slip_bc = cpu.boundaryconditions.NoSlipBoundaryCondition()
 
-    outflow_bc = DensityBoundaryCondition()
+    outflow_bc = cpu.boundaryconditions.DensityBoundaryCondition()
 
     velocity_function = Parser()
     velocity_function.define_constant("u", 0.07)
     velocity_function.expression = "u"
-    velocity_bc = VelocityBoundaryCondition(True, False, False, velocity_function, 0, -10)
+    velocity_bc = cpu.boundaryconditions.VelocityBoundaryCondition(True, False, False, velocity_function, 0, -10)
 
-    kernel = LBMKernel(KernelType.CompressibleCumulantFourthOrderViscosity)
+    kernel = cpu.kernel.LBMKernel(cpu.kernel.KernelType.CompressibleCumulantFourthOrderViscosity)
     # kernel.use_forcing = True
     # kernel.forcing_in_x1 = 3e-6
 
-    writer = Writer()
+    writer = cpu.writer.Writer()
     writer.output_path = "./output"
-    writer.output_format = OutputFormat.BINARY
+    writer.output_format = cpu.writer.OutputFormat.BINARY
 
-    simulation = Simulation()
+    simulation = cpu.Simulation()
     simulation.set_writer(writer)
 
     simulation.set_physical_parameters(physical_parameters)
diff --git a/Python/liddrivencavity/simulation.py b/Python/liddrivencavity/simulation.py
index 155fad2f6f8aade0368c8a7006b88f7985f8822c..3c247b87a102e3c5a720f20748acc9f9f50bb178 100644
--- a/Python/liddrivencavity/simulation.py
+++ b/Python/liddrivencavity/simulation.py
@@ -1,32 +1,61 @@
-from pyfluids.cpu import Simulation
-from pyfluids.cpu.boundaryconditions import NoSlipBoundaryCondition, VelocityBoundaryCondition
-from pyfluids.cpu.geometry import GbCuboid3D
-from pyfluids.cpu.kernel import LBMKernel, KernelType
-from pyfluids.cpu.parameters import GridParameters, PhysicalParameters, RuntimeParameters
-from pyfluids.cpu.writer import Writer, OutputFormat
+r"""
+=======================================================================================
+ ____          ____    __    ______     __________   __      __       __        __
+ \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+      \    \  |    |   ________________________________________________________________
+       \    \ |    |  |  ______________________________________________________________|
+        \    \|    |  |  |         __          __     __     __     ______      _______
+         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+
+  This file is part of VirtualFluids. VirtualFluids is free software: you can
+  redistribute it and/or modify it under the terms of the GNU General Public
+  License as published by the Free Software Foundation, either version 3 of
+  the License, or (at your option) any later version.
+
+  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+  for more details.
+
+  You should have received a copy of the GNU General Public License along
+  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+
+! \file simulation.py
+! \ingroup liddrivencavity
+! \author Sven Marcus, Henry Korb
+=======================================================================================
+"""
+from pyfluids import cpu
 from pymuparser import Parser
 
-runtime_params = RuntimeParameters()
+runtime_params = cpu.parameters.RuntimeParameters()
 runtime_params.number_of_threads = 4
 runtime_params.number_of_timesteps = 10000
 runtime_params.timestep_log_interval = 1000
 
-physical_params = PhysicalParameters()
+physical_params = cpu.parameters.PhysicalParameters()
 physical_params.lattice_viscosity = 0.005
 
-grid_params = GridParameters()
+grid_params = cpu.parameters.GridParameters()
 grid_params.number_of_nodes_per_direction = [64, 64, 64]
 grid_params.blocks_per_direction = [2, 2, 2]
 grid_params.node_distance = 1 / 10
 
 
 def run_simulation(physical_params=physical_params, grid_params=grid_params, runtime_params=runtime_params):
-    simulation = Simulation()
-    kernel = LBMKernel(KernelType.CompressibleCumulantFourthOrderViscosity)
+    simulation = cpu.Simulation()
+    kernel = cpu.kernel.LBMKernel(cpu.kernel.KernelType.CompressibleCumulantFourthOrderViscosity)
 
-    writer = Writer()
+    writer = cpu.writer.Writer()
     writer.output_path = "./output"
-    writer.output_format = OutputFormat.BINARY
+    writer.output_format = cpu.writer.OutputFormat.BINARY
 
     simulation.set_grid_parameters(grid_params)
     simulation.set_physical_parameters(physical_params)
@@ -34,12 +63,12 @@ def run_simulation(physical_params=physical_params, grid_params=grid_params, run
     simulation.set_kernel_config(kernel)
     simulation.set_writer(writer)
 
-    no_slip_bc_adapter = NoSlipBoundaryCondition()
+    no_slip_bc_adapter = cpu.boundaryconditions.NoSlipBoundaryCondition()
 
     fct = Parser()
     fct.expression = "u"
     fct.define_constant("u", 0.005)
-    velocity_bc_adapter = VelocityBoundaryCondition(True, True, False, fct, 0, -10.0)
+    velocity_bc_adapter = cpu.boundaryconditions.VelocityBoundaryCondition(True, True, False, fct, 0, -10.0)
 
     g_min_x1, g_min_x2, g_min_x3 = 0, 0, 0
     g_max_x1 = grid_params.number_of_nodes_per_direction[0] * grid_params.node_distance
@@ -48,12 +77,12 @@ def run_simulation(physical_params=physical_params, grid_params=grid_params, run
 
     dx = grid_params.node_distance
 
-    wall_x_min = GbCuboid3D(g_min_x1 - dx, g_min_x2 - dx, g_min_x3 - dx, g_min_x1, g_max_x2 + dx, g_max_x3)
-    wall_x_max = GbCuboid3D(g_max_x1, g_min_x2 - dx, g_min_x3 - dx, g_max_x1 + dx, g_max_x2 + dx, g_max_x3)
-    wall_y_min = GbCuboid3D(g_min_x1 - dx, g_min_x2 - dx, g_min_x3 - dx, g_max_x1 + dx, g_min_x2, g_max_x3)
-    wall_y_max = GbCuboid3D(g_min_x1 - dx, g_max_x2, g_min_x3 - dx, g_max_x1 + dx, g_max_x2 + dx, g_max_x3)
-    wall_z_min = GbCuboid3D(g_min_x1 - dx, g_min_x2 - dx, g_min_x3 - dx, g_max_x1 + dx, g_max_x2 + dx, g_min_x3)
-    wall_z_max = GbCuboid3D(g_min_x1 - dx, g_min_x2 - dx, g_max_x3, g_max_x1 + dx, g_max_x2 + dx, g_max_x3 + dx)
+    wall_x_min = cpu.geometry.GbCuboid3D(g_min_x1 - dx, g_min_x2 - dx, g_min_x3 - dx, g_min_x1, g_max_x2 + dx, g_max_x3)
+    wall_x_max = cpu.geometry.GbCuboid3D(g_max_x1, g_min_x2 - dx, g_min_x3 - dx, g_max_x1 + dx, g_max_x2 + dx, g_max_x3)
+    wall_y_min = cpu.geometry.GbCuboid3D(g_min_x1 - dx, g_min_x2 - dx, g_min_x3 - dx, g_max_x1 + dx, g_min_x2, g_max_x3)
+    wall_y_max = cpu.geometry.GbCuboid3D(g_min_x1 - dx, g_max_x2, g_min_x3 - dx, g_max_x1 + dx, g_max_x2 + dx, g_max_x3)
+    wall_z_min = cpu.geometry.GbCuboid3D(g_min_x1 - dx, g_min_x2 - dx, g_min_x3 - dx, g_max_x1 + dx, g_max_x2 + dx, g_min_x3)
+    wall_z_max = cpu.geometry.GbCuboid3D(g_min_x1 - dx, g_min_x2 - dx, g_max_x3, g_max_x1 + dx, g_max_x2 + dx, g_max_x3 + dx)
 
     simulation.add_object(wall_x_min, no_slip_bc_adapter, 1, "/geo/wallXmin")
     simulation.add_object(wall_x_max, no_slip_bc_adapter, 1, "/geo/wallXmax")
diff --git a/Python/poiseuille/poiseuille_hpc.py b/Python/poiseuille/poiseuille_hpc.py
index f5f5a1387c9fe234abae0c6f979cc7d5b283d1a4..b108f34445a71a686c4e22f685e26e10204113b3 100644
--- a/Python/poiseuille/poiseuille_hpc.py
+++ b/Python/poiseuille/poiseuille_hpc.py
@@ -1,15 +1,49 @@
+r"""
+=======================================================================================
+ ____          ____    __    ______     __________   __      __       __        __
+ \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+      \    \  |    |   ________________________________________________________________
+       \    \ |    |  |  ______________________________________________________________|
+        \    \|    |  |  |         __          __     __     __     ______      _______
+         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+
+  This file is part of VirtualFluids. VirtualFluids is free software: you can
+  redistribute it and/or modify it under the terms of the GNU General Public
+  License as published by the Free Software Foundation, either version 3 of
+  the License, or (at your option) any later version.
+
+  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+  for more details.
+
+  You should have received a copy of the GNU General Public License along
+  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+
+! \file poiseuille_hpc.py
+! \ingroup poiseuille
+! \author Sven Marcus, Henry Korb
+=======================================================================================
+"""
 from poiseuille.simulation import run_simulation
-from pyfluids.cpu.parameters import *
+from pyfluids import cpu
 
-grid_parameters = GridParameters()
+grid_parameters = cpu.parameters.GridParameters()
 grid_parameters.number_of_nodes_per_direction = [64, 64, 512]
 grid_parameters.node_distance = 1
 grid_parameters.blocks_per_direction = [1, 2, 2]
 
-physical_parameters = PhysicalParameters()
+physical_parameters = cpu.parameters.PhysicalParameters()
 physical_parameters.lattice_viscosity = 0.0005
 
-runtime_parameters = RuntimeParameters()
+runtime_parameters = cpu.parameters.RuntimeParameters()
 runtime_parameters.number_of_threads = 4
 runtime_parameters.number_of_timesteps = 1000
 runtime_parameters.timestep_log_interval = 100
diff --git a/Python/poiseuille/simulation.py b/Python/poiseuille/simulation.py
index d107801fa84cfe16d1d7e91d31dc3ff4b8671f02..a6f12e59fbd0a0ccad9a4db9ccde69b828cf90bf 100644
--- a/Python/poiseuille/simulation.py
+++ b/Python/poiseuille/simulation.py
@@ -1,35 +1,65 @@
-from pyfluids.cpu import Simulation
-from pyfluids.cpu.boundaryconditions import NoSlipBoundaryCondition
-from pyfluids.cpu.geometry import GbCuboid3D, State
-from pyfluids.cpu.kernel import LBMKernel, KernelType
-from pyfluids.cpu.parameters import RuntimeParameters, GridParameters, PhysicalParameters
-from pyfluids.cpu.writer import Writer, OutputFormat
-
-default_grid_params = GridParameters()
+r"""
+=======================================================================================
+ ____          ____    __    ______     __________   __      __       __        __
+ \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+      \    \  |    |   ________________________________________________________________
+       \    \ |    |  |  ______________________________________________________________|
+        \    \|    |  |  |         __          __     __     __     ______      _______
+         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+
+  This file is part of VirtualFluids. VirtualFluids is free software: you can
+  redistribute it and/or modify it under the terms of the GNU General Public
+  License as published by the Free Software Foundation, either version 3 of
+  the License, or (at your option) any later version.
+
+  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+  for more details.
+
+  You should have received a copy of the GNU General Public License along
+  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+
+! \file simulation.py
+! \ingroup poiseuille
+! \author Sven Marcus, Henry Korb
+=======================================================================================
+"""
+from pyfluids import cpu
+
+
+default_grid_params = cpu.parameters.GridParameters()
 default_grid_params.node_distance = 10 / 32
 default_grid_params.number_of_nodes_per_direction = [8, 8, 32]
 default_grid_params.blocks_per_direction = [1, 1, 4]
 default_grid_params.periodic_boundary_in_x1 = True
 default_grid_params.periodic_boundary_in_x2 = True
 
-default_physical_params = PhysicalParameters()
+default_physical_params = cpu.parameters.PhysicalParameters()
 default_physical_params.lattice_viscosity = 0.005
 
-default_runtime_params = RuntimeParameters()
+default_runtime_params = cpu.parameters.RuntimeParameters()
 default_runtime_params.number_of_threads = 4
 default_runtime_params.number_of_timesteps = 10000
 default_runtime_params.timestep_log_interval = 1000
 
-default_kernel = LBMKernel(KernelType.CompressibleCumulantFourthOrderViscosity)
+default_kernel = cpu.kernel.LBMKernel(cpu.kernel.KernelType.CompressibleCumulantFourthOrderViscosity)
 default_kernel.use_forcing = True
 default_kernel.forcing_in_x1 = 1e-8
 
-default_writer = Writer()
+default_writer = cpu.writer.Writer()
 default_writer.output_path = "./output"
-default_writer.output_format = OutputFormat.BINARY
+default_writer.output_format = cpu.writer.OutputFormat.BINARY
 
 
-default_kernel = LBMKernel(KernelType.CompressibleCumulantFourthOrderViscosity)
+default_kernel = cpu.kernel.LBMKernel(cpu.kernel.KernelType.CompressibleCumulantFourthOrderViscosity)
 default_kernel.use_forcing = True
 default_kernel.forcing_in_x1 = 1e-8
 
@@ -39,7 +69,7 @@ def run_simulation(physical_params=default_physical_params,
                    runtime_params=default_runtime_params,
                    kernel=default_kernel,
                    writer=default_writer):
-    simulation = Simulation()
+    simulation = cpu.Simulation()
 
     simulation.set_kernel_config(kernel)
     simulation.set_physical_parameters(physical_params)
@@ -47,11 +77,11 @@ def run_simulation(physical_params=default_physical_params,
     simulation.set_runtime_parameters(runtime_params)
     simulation.set_writer(writer)
 
-    no_slip_bc = NoSlipBoundaryCondition()
+    no_slip_bc = cpu.boundaryconditions.NoSlipBoundaryCondition()
 
     block_thickness = 3 * grid_params.node_distance
     simulation.add_object(
-        GbCuboid3D(
+        cpu.geometry.GbCuboid3D(
             grid_params.bounding_box.min_x1 - block_thickness,
             grid_params.bounding_box.min_x2 - block_thickness,
             grid_params.bounding_box.min_x3 - block_thickness,
@@ -59,10 +89,10 @@ def run_simulation(physical_params=default_physical_params,
             grid_params.bounding_box.max_x2 + block_thickness,
             grid_params.bounding_box.min_x3),
         no_slip_bc,
-        State.SOLID, "/geo/addWallZMin")
+        cpu.geometry.State.SOLID, "/geo/addWallZMin")
 
     simulation.add_object(
-        GbCuboid3D(
+        cpu.geometry.GbCuboid3D(
             grid_params.bounding_box.min_x1 - block_thickness,
             grid_params.bounding_box.min_x2 - block_thickness,
             grid_params.bounding_box.max_x3,
@@ -70,7 +100,7 @@ def run_simulation(physical_params=default_physical_params,
             grid_params.bounding_box.max_x2 + block_thickness,
             grid_params.bounding_box.max_x3 + block_thickness),
         no_slip_bc,
-        State.SOLID, "/geo/addWallZMax")
+        cpu.geometry.State.SOLID, "/geo/addWallZMax")
 
     simulation.run_simulation()
 
diff --git a/Python/poiseuille/test_poiseuille_l2.py b/Python/poiseuille/test_poiseuille_l2.py
index 93aa2600d5260dea7e72f3aa98db7334fe5285c6..818cba40e115945c60e4fa2ac96b3b6b5ab0bba8 100644
--- a/Python/poiseuille/test_poiseuille_l2.py
+++ b/Python/poiseuille/test_poiseuille_l2.py
@@ -1,3 +1,37 @@
+r"""
+=======================================================================================
+ ____          ____    __    ______     __________   __      __       __        __
+ \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+      \    \  |    |   ________________________________________________________________
+       \    \ |    |  |  ______________________________________________________________|
+        \    \|    |  |  |         __          __     __     __     ______      _______
+         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+
+  This file is part of VirtualFluids. VirtualFluids is free software: you can
+  redistribute it and/or modify it under the terms of the GNU General Public
+  License as published by the Free Software Foundation, either version 3 of
+  the License, or (at your option) any later version.
+
+  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+  for more details.
+
+  You should have received a copy of the GNU General Public License along
+  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+
+! \file test_poiseuille_l2.py
+! \ingroup poiseuille
+! \author Sven Marcus, Henry Korb
+=======================================================================================
+"""
 import os
 import shutil
 import unittest
@@ -5,8 +39,7 @@ import unittest
 import matplotlib.pyplot as plt
 import numpy as np
 import pyvista as pv
-from pyfluids.cpu.kernel import LBMKernel, KernelType
-from pyfluids.cpu.parameters import GridParameters, PhysicalParameters, RuntimeParameters
+from pyfluids import cpu
 from scipy import stats
 
 from errors import normalized_l2_error
@@ -33,13 +66,13 @@ class TestPoiseuilleFlow(unittest.TestCase):
         self.skipTest("This test is not implemented correctly yet")
         plt.ion()
 
-        physical_params = PhysicalParameters()
+        physical_params = cpu.parameters.PhysicalParameters()
 
-        runtime_params = RuntimeParameters()
+        runtime_params = cpu.parameters.RuntimeParameters()
         runtime_params.number_of_threads = os.cpu_count()
         runtime_params.timestep_log_interval = 10000
 
-        kernel = LBMKernel(KernelType.CompressibleCumulantFourthOrderViscosity)
+        kernel = cpu.kernel.LBMKernel(cpu.kernel.KernelType.CompressibleCumulantFourthOrderViscosity)
         kernel.use_forcing = True
 
         normalized_l2_errors = []
@@ -140,7 +173,7 @@ def get_heights_from_indices(mesh, indices):
 
 
 def create_grid_params_with_nodes_in_column(nodes_in_column, delta_x):
-    grid_params = GridParameters()
+    grid_params = cpu.parameters.GridParameters()
     grid_params.node_distance = delta_x
     grid_params.number_of_nodes_per_direction = [1, 1, nodes_in_column]
     grid_params.blocks_per_direction = [1, 1, 8]
diff --git a/Python/tests/test_acousticscaling.py b/Python/tests/test_acousticscaling.py
index 6413123a80db8c5882fcf1dbe6f72a1f5438736c..02454b935e3a147e045f45c273392646aeca6b8c 100644
--- a/Python/tests/test_acousticscaling.py
+++ b/Python/tests/test_acousticscaling.py
@@ -1,9 +1,41 @@
+r"""
+=======================================================================================
+ ____          ____    __    ______     __________   __      __       __        __
+ \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+      \    \  |    |   ________________________________________________________________
+       \    \ |    |  |  ______________________________________________________________|
+        \    \|    |  |  |         __          __     __     __     ______      _______
+         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+
+  This file is part of VirtualFluids. VirtualFluids is free software: you can
+  redistribute it and/or modify it under the terms of the GNU General Public
+  License as published by the Free Software Foundation, either version 3 of
+  the License, or (at your option) any later version.
+
+  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+  for more details.
+
+  You should have received a copy of the GNU General Public License along
+  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+
+! \file test_acousticscaling.py
+! \ingroup tests
+! \author Sven Marcus, Henry Korb
+=======================================================================================
+"""
 import unittest
 from typing import List
 
-from pyfluids.cpu.kernel import LBMKernel, KernelType
-from pyfluids.cpu.parameters import GridParameters, PhysicalParameters, RuntimeParameters
-
+from pyfluids import cpu
 from acousticscaling import OneDirectionalAcousticScaling
 
 
@@ -58,18 +90,18 @@ class OneDirectionalAcousticScalingTest(unittest.TestCase):
         self.assertEqual(self.grid_params.periodic_boundary_in_x2, actual_grid_params.periodic_boundary_in_x2)
         self.assertEqual(self.grid_params.periodic_boundary_in_x3, actual_grid_params.periodic_boundary_in_x3)
 
-    def assert_physical_params_scaled_by_factor(self, actual_params: PhysicalParameters, factor: int):
+    def assert_physical_params_scaled_by_factor(self, actual_params: cpu.parameters.PhysicalParameters, factor: int):
         self.assertEqual(self.physical_params.lattice_viscosity * factor, actual_params.lattice_viscosity)
         self.assertEqual(self.physical_params.bulk_viscosity_factor, actual_params.bulk_viscosity_factor)
 
-    def assert_runtime_params_scaled_by_factor(self, actual_params: RuntimeParameters, factor: int):
+    def assert_runtime_params_scaled_by_factor(self, actual_params: cpu.parameters.RuntimeParameters, factor: int):
         self.assertEqual(self.runtime_params.number_of_timesteps * factor, actual_params.number_of_timesteps)
         self.assertEqual(self.runtime_params.number_of_threads, actual_params.number_of_threads)
         self.assertEqual(self.runtime_params.timestep_log_interval, actual_params.timestep_log_interval)
 
-    def assert_kernel_forcing_scaled_by_factor(self, actual_kernel: LBMKernel, factor: int):
+    def assert_kernel_forcing_scaled_by_factor(self, actual_kernel: cpu.kernel.LBMKernel, factor: int):
         self.assertEqual(self.kernel.type, actual_kernel.type)
         self.assertEqual(self.kernel.use_forcing, actual_kernel.use_forcing)
         self.assertAlmostEqual(self.kernel.forcing_in_x1 / factor, actual_kernel.forcing_in_x1)
         self.assertAlmostEqual(self.kernel.forcing_in_x2, actual_kernel.forcing_in_x2)
         self.assertAlmostEqual(self.kernel.forcing_in_x3, actual_kernel.forcing_in_x3)
@@ -80,14 +112,14 @@ class OneDirectionalAcousticScalingTest(unittest.TestCase):
 
     @staticmethod
     def make_kernel():
-        kernel = LBMKernel(KernelType.CompressibleCumulantFourthOrderViscosity)
+        kernel = cpu.kernel.LBMKernel(cpu.kernel.KernelType.CompressibleCumulantFourthOrderViscosity)
         kernel.use_forcing = True
         kernel.forcing_in_x1 = 5e-10
         return kernel
 
     @staticmethod
     def make_runtime_params():
-        runtime_params = RuntimeParameters()
+        runtime_params = cpu.parameters.RuntimeParameters()
         runtime_params.number_of_threads = 4
         runtime_params.number_of_timesteps = 4_000_000
         runtime_params.timestep_log_interval = 1_000_000
@@ -95,13 +127,13 @@ class OneDirectionalAcousticScalingTest(unittest.TestCase):
 
     @staticmethod
     def make_physical_params():
-        physical_params = PhysicalParameters()
+        physical_params = cpu.parameters.PhysicalParameters()
         physical_params.lattice_viscosity = 1e-4
         return physical_params
 
     @staticmethod
     def make_grid_params():
-        grid_params = GridParameters()
+        grid_params = cpu.parameters.GridParameters()
         grid_params.node_distance = 1
         grid_params.number_of_nodes_per_direction = [1, 1, 16]
         grid_params.blocks_per_direction = [1, 1, 16]
diff --git a/Python/tests/test_boundaryconditions.py b/Python/tests/test_boundaryconditions.py
index e004ddfa21c78ea3d63a89f5dbc3bd7438a18ff1..d914c50cad2051188331b2efe604907091fa731e 100644
--- a/Python/tests/test_boundaryconditions.py
+++ b/Python/tests/test_boundaryconditions.py
@@ -1,5 +1,39 @@
+r"""
+=======================================================================================
+ ____          ____    __    ______     __________   __      __       __        __
+ \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+      \    \  |    |   ________________________________________________________________
+       \    \ |    |  |  ______________________________________________________________|
+        \    \|    |  |  |         __          __     __     __     ______      _______
+         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+
+  This file is part of VirtualFluids. VirtualFluids is free software: you can
+  redistribute it and/or modify it under the terms of the GNU General Public
+  License as published by the Free Software Foundation, either version 3 of
+  the License, or (at your option) any later version.
+
+  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+  for more details.
+
+  You should have received a copy of the GNU General Public License along
+  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+
+! \file test_boundaryconditions.py
+! \ingroup tests
+! \author Sven Marcus, Henry Korb
+=======================================================================================
+"""
 import unittest
-from pyfluids.cpu.boundaryconditions import *
+from pyfluids import cpu
 
 
 class BoundaryConditionsTest(unittest.TestCase):
@@ -8,13 +42,13 @@ class BoundaryConditionsTest(unittest.TestCase):
         """
         Should be able to create NoSlipBoundaryCondition
         """
-        sut = NoSlipBoundaryCondition()
+        sut = cpu.boundaryconditions.NoSlipBoundaryCondition()
 
     def test__can_create_velocity_bc(self):
         """
         Should be able to create VelocityBoundaryCondition
         """
-        sut = VelocityBoundaryCondition()
+        sut = cpu.boundaryconditions.VelocityBoundaryCondition()
 
     def test__can_create_velocity_bc_with_directions_function_and_time(self):
         """
@@ -24,7 +58,7 @@ class BoundaryConditionsTest(unittest.TestCase):
 
         parser = Parser()
         parser.expression = "1"
-        sut = VelocityBoundaryCondition(True, True, True, parser, 0, 1)
+        sut = cpu.boundaryconditions.VelocityBoundaryCondition(True, True, True, parser, 0, 1)
 
     def test__can_create_velocity_bc_with_directions__function_per_direction__and__time(self):
         """
@@ -40,7 +74,7 @@ class BoundaryConditionsTest(unittest.TestCase):
 
         f3 = Parser()
         f3.expression = "1"
-        sut = VelocityBoundaryCondition(True, True, True, f1, f2, f3, 0, 1)
+        sut = cpu.boundaryconditions.VelocityBoundaryCondition(True, True, True, f1, f2, f3, 0, 1)
 
     def test__can_create_velocity_bc_with_speeds_and_times_per_direction(self):
         """
@@ -51,11 +85,11 @@ class BoundaryConditionsTest(unittest.TestCase):
         start2, end2 = 1, 2
         start3, end3 = 2, 3
 
-        sut = VelocityBoundaryCondition(vx1, start1, end1, vx2, start2, end2, vx3, start3, end3)
+        sut = cpu.boundaryconditions.VelocityBoundaryCondition(vx1, start1, end1, vx2, start2, end2, vx3, start3, end3)
 
     def test__can_create_non_reflecting_outflow(self):
         """
         Should be able to create NonReflectingOutflow
         """
 
-        sut = NonReflectingOutflow()
+        sut = cpu.boundaryconditions.NonReflectingOutflow()
diff --git a/Python/tests/test_geometry.py b/Python/tests/test_geometry.py
index 5bb89eb245b6055653b78fde381da050d402b0cc..3d297f5c176cd99f7969adf37333588d86b77627 100644
--- a/Python/tests/test_geometry.py
+++ b/Python/tests/test_geometry.py
@@ -1,6 +1,40 @@
+r"""
+=======================================================================================
+ ____          ____    __    ______     __________   __      __       __        __
+ \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+      \    \  |    |   ________________________________________________________________
+       \    \ |    |  |  ______________________________________________________________|
+        \    \|    |  |  |         __          __     __     __     ______      _______
+         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+
+  This file is part of VirtualFluids. VirtualFluids is free software: you can
+  redistribute it and/or modify it under the terms of the GNU General Public
+  License as published by the Free Software Foundation, either version 3 of
+  the License, or (at your option) any later version.
+
+  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+  for more details.
+
+  You should have received a copy of the GNU General Public License along
+  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+
+! \file test_geometry.py
+! \ingroup tests
+! \author Sven Marcus, Henry Korb
+=======================================================================================
+"""
 import unittest
 
-from pyfluids.cpu.geometry import *
+from pyfluids import cpu
 
 
 class TestGeometry(unittest.TestCase):
@@ -9,7 +43,7 @@ class TestGeometry(unittest.TestCase):
         """
         WHEN setting point coordinates in constructor THEN point should have coordinates
         """
-        sut = GbPoint3D(4, 8, 3)
+        sut = cpu.geometry.GbPoint3D(4, 8, 3)
 
         self.assertEqual(sut.x1, 4)
         self.assertEqual(sut.x2, 8)
@@ -19,7 +53,7 @@ class TestGeometry(unittest.TestCase):
         """
         WHEN setting point coordinates THEN point should have coordinates
         """
-        sut = GbPoint3D()
+        sut = cpu.geometry.GbPoint3D()
 
         sut.x1 = 4
         sut.x2 = 8
@@ -33,10 +67,10 @@ class TestGeometry(unittest.TestCase):
         """
         WHEN setting line points THEN line should have points
         """
-        sut = GbLine3D()
+        sut = cpu.geometry.GbLine3D()
 
-        point1 = GbPoint3D()
-        point2 = GbPoint3D()
+        point1 = cpu.geometry.GbPoint3D()
+        point2 = cpu.geometry.GbPoint3D()
         sut.point1 = point1
         sut.point2 = point2
 
diff --git a/Python/tests/test_kernel.py b/Python/tests/test_kernel.py
index 8f58a1c869f9e292856268d43245a75f1dcfe213..e0159bec6802cb08d73214038b177091879fee46 100644
--- a/Python/tests/test_kernel.py
+++ b/Python/tests/test_kernel.py
@@ -1,12 +1,46 @@
+r"""
+=======================================================================================
+ ____          ____    __    ______     __________   __      __       __        __
+ \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+      \    \  |    |   ________________________________________________________________
+       \    \ |    |  |  ______________________________________________________________|
+        \    \|    |  |  |         __          __     __     __     ______      _______
+         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+
+  This file is part of VirtualFluids. VirtualFluids is free software: you can
+  redistribute it and/or modify it under the terms of the GNU General Public
+  License as published by the Free Software Foundation, either version 3 of
+  the License, or (at your option) any later version.
+
+  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+  for more details.
+
+  You should have received a copy of the GNU General Public License along
+  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+
+! \file test_kernel.py
+! \ingroup tests
+! \author Sven Marcus, Henry Korb
+=======================================================================================
+"""
 import unittest
 
-from pyfluids.cpu.kernel import LBMKernel, KernelType
+from pyfluids import cpu
 
 
 class TestLBMKernel(unittest.TestCase):
 
     def setUp(self) -> None:
-        self.sut = LBMKernel(KernelType.BGK)
+        self.sut = cpu.kernel.LBMKernel(cpu.kernel.KernelType.BGK)
 
     def test_lbm_kernel__when_use_forcing_set_to_true__use_forcing_should_be_true(self) -> None:
         """
@@ -57,4 +91,4 @@ class TestLBMKernel(unittest.TestCase):
         """
 
         actual = self.sut.type
-        self.assertEqual(KernelType.BGK, actual)
+        self.assertEqual(cpu.kernel.KernelType.BGK, actual)
diff --git a/apps/cpu/ConvectionOfVortex/CMakeLists.txt b/apps/cpu/ConvectionOfVortex/CMakeLists.txt
index de3034c04bb2f2f16edd9b4bf48db81c83d15b3e..33d60676c7e0dfdde411c3c5b92a2534ea54fbfe 100644
--- a/apps/cpu/ConvectionOfVortex/CMakeLists.txt
+++ b/apps/cpu/ConvectionOfVortex/CMakeLists.txt
@@ -1,3 +1,6 @@
+########################################################
+## C++ PROJECT                                       ###
+########################################################
 PROJECT(ConvectionOfVortex)
 
 vf_add_library(BUILDTYPE binary PRIVATE_LINK VirtualFluidsCore basics ${MPI_CXX_LIBRARIES} FILES cov.cpp )
diff --git a/apps/gpu/LBM/ActuatorLine/ActuatorLine.cpp b/apps/gpu/LBM/ActuatorLine/ActuatorLine.cpp
index 58e5aede18b9c4197b4d21b129c6347023b9390e..9d982ebac0059b4512041194100f6e1fdfa61924 100644
--- a/apps/gpu/LBM/ActuatorLine/ActuatorLine.cpp
+++ b/apps/gpu/LBM/ActuatorLine/ActuatorLine.cpp
@@ -1,4 +1,35 @@
-
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file ActuatorLine.cpp
+//! \ingroup ActuatorLine
+//! \author Henry Korb, Henrik Asmuth
+//=======================================================================================
 #define _USE_MATH_DEFINES
 #include <math.h>
 #include <string>
@@ -28,12 +59,14 @@
 #include "GridGenerator/grid/GridBuilder/LevelGridBuilder.h"
 #include "GridGenerator/grid/GridBuilder/MultipleGridBuilder.h"
 #include "GridGenerator/grid/BoundaryConditions/Side.h"
+#include "GridGenerator/grid/BoundaryConditions/BoundaryCondition.h"
+
 #include "GridGenerator/grid/GridFactory.h"
 
 #include "GridGenerator/io/SimulationFileWriter/SimulationFileWriter.h"
 #include "GridGenerator/io/GridVTKWriter/GridVTKWriter.h"
-#include "GridGenerator/io/STLReaderWriter/STLReader.h"
-#include "GridGenerator/io/STLReaderWriter/STLWriter.h"
+#include "GridGenerator/TransientBCSetter/TransientBCSetter.h"
+
 
 //////////////////////////////////////////////////////////////////////////
 
@@ -44,10 +77,12 @@
 #include "VirtualFluids_GPU/DataStructureInitializer/GridReaderFiles/GridReader.h"
 #include "VirtualFluids_GPU/Parameter/Parameter.h"
 #include "VirtualFluids_GPU/Output/FileWriter.h"
-#include "VirtualFluids_GPU/PreCollisionInteractor/ActuatorLine.h"
+#include "VirtualFluids_GPU/PreCollisionInteractor/ActuatorFarm.h"
 #include "VirtualFluids_GPU/PreCollisionInteractor/Probes/PointProbe.h"
 #include "VirtualFluids_GPU/PreCollisionInteractor/Probes/PlaneProbe.h"
 #include "VirtualFluids_GPU/Factories/BoundaryConditionFactory.h"
+#include "VirtualFluids_GPU/TurbulenceModels/TurbulenceModelFactory.h"
+#include "VirtualFluids_GPU/Factories/GridScalingFactory.h"
 
 #include "VirtualFluids_GPU/GPU/CudaMemoryManager.h"
 
@@ -63,26 +98,10 @@
 
 LbmOrGks lbmOrGks = LBM;
 
-const real reference_diameter = 126.0; // diameter in m
-
-const real L_x = 10*reference_diameter;
-const real L_y = 6*reference_diameter;
-const real L_z = 6*reference_diameter;
-
-const real viscosity = 1.56e-5;
-
-const real velocity  = 9.0;
-
-const real mach = 0.1;
-
-const uint nodes_per_diameter = 16;
-
 std::string path(".");
 
 std::string simulationName("ActuatorLine");
 
-const float tOut = 100;
-const float tEnd = 280; // total time of simulation in s
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -98,30 +117,59 @@ void multipleLevel(const std::string& configPath)
     vf::gpu::Communicator& communicator = vf::gpu::Communicator::getInstance();
 
     auto gridFactory = GridFactory::make();
-    gridFactory->setTriangularMeshDiscretizationMethod(TriangularMeshDiscretizationMethod::POINT_IN_OBJECT);
     auto gridBuilder = MultipleGridBuilder::makeShared(gridFactory);
 
-    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    vf::basics::ConfigurationFile config;
+    config.load(configPath);
+
+    const real reference_diameter = config.getValue<real>("ReferenceDiameter");
+    const uint nodes_per_diameter = config.getValue<uint>("NodesPerDiameter");
+    const real velocity = config.getValue<real>("Velocity");
+
+
+    const real L_x = 24*reference_diameter;
+    const real L_y = 6*reference_diameter;
+    const real L_z = 6*reference_diameter;
+
+    const real viscosity = 1.56e-5;
+
+    const real mach = 0.1;
+
+
+    const float tStartOut   = config.getValue<real>("tStartOut");
+    const float tOut        = config.getValue<real>("tOut");
+    const float tEnd        = config.getValue<real>("tEnd"); // total time of simulation
+
+    const float tStartAveraging     =  config.getValue<real>("tStartAveraging");
+    const float tStartTmpAveraging  =  config.getValue<real>("tStartTmpAveraging");
+    const float tAveraging          =  config.getValue<real>("tAveraging");
+    const float tStartOutProbe      =  config.getValue<real>("tStartOutProbe");
+    const float tOutProbe           =  config.getValue<real>("tOutProbe");
+        
+    SPtr<Parameter> para = std::make_shared<Parameter>(communicator.getNummberOfProcess(), communicator.getPID(), &config);
+    BoundaryConditionFactory bcFactory = BoundaryConditionFactory();
+    GridScalingFactory scalingFactory  = GridScalingFactory();
+
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
 	const real dx = reference_diameter/real(nodes_per_diameter);
 
+    real turbPos[3] = {3*reference_diameter, 3*reference_diameter, 3*reference_diameter};
+
 	gridBuilder->addCoarseGrid(0.0, 0.0, 0.0,
 							   L_x,  L_y,  L_z, dx);
 
+    gridBuilder->setNumberOfLayers(4,0);
+    gridBuilder->addGrid( new Cuboid(   turbPos[0]-1.5*reference_diameter,  turbPos[1]-1.5*reference_diameter,  turbPos[2]-1.5*reference_diameter, 
+                                        turbPos[0]+10.0*reference_diameter, turbPos[1]+1.5*reference_diameter,  turbPos[2]+1.5*reference_diameter) , 1 );
+    para->setMaxLevel(2);
+    scalingFactory.setScalingFactory(GridScalingFactory::GridScaling::ScaleCompressible);
+
 	gridBuilder->setPeriodicBoundaryCondition(false, false, false);
 
 	gridBuilder->buildGrids(lbmOrGks, false); // buildGrids() has to be called before setting the BCs!!!!
 
-	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-    vf::basics::ConfigurationFile config;
-    config.load(configPath);
-    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////^
-    SPtr<Parameter> para = std::make_shared<Parameter>(communicator.getNummberOfProcess(), communicator.getPID(), &config);
-    BoundaryConditionFactory bcFactory = BoundaryConditionFactory();
-    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
     const real dt = dx * mach / (sqrt(3) * velocity);
 
@@ -140,14 +188,11 @@ void multipleLevel(const std::string& configPath)
 
     para->setPrintFiles(true);
 
-    para->setMaxLevel(1);
-
-
     para->setVelocityLB(velocityLB);
     para->setViscosityLB(viscosityLB);
     para->setVelocityRatio( dx / dt );
     para->setViscosityRatio( dx*dx/dt );
-    para->setMainKernel("CumulantK17CompChim");
+    para->setMainKernel("CumulantK17");
 
     para->setInitialCondition([&](real coordX, real coordY, real coordZ, real &rho, real &vx, real &vy, real &vz) {
         rho = (real)0.0;
@@ -156,13 +201,15 @@ void multipleLevel(const std::string& configPath)
         vz  = (real)0.0;
     });
 
+    para->setTimestepStartOut( uint(tStartOut/dt) );
     para->setTimestepOut( uint(tOut/dt) );
     para->setTimestepEnd( uint(tEnd/dt) );
 
     para->setIsBodyForce( true );
-
+    para->setUseStreams( true );
 
     /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
     gridBuilder->setVelocityBoundaryCondition(SideType::MX,  velocityLB,  0.0, 0.0);
 
     gridBuilder->setVelocityBoundaryCondition(SideType::MY,  velocityLB,  0.0, 0.0);
@@ -172,42 +219,52 @@ void multipleLevel(const std::string& configPath)
     gridBuilder->setPressureBoundaryCondition(SideType::PX, 0.0);
 
     bcFactory.setVelocityBoundaryCondition(BoundaryConditionFactory::VelocityBC::VelocityAndPressureCompressible);
-    bcFactory.setPressureBoundaryCondition(BoundaryConditionFactory::PressureBC::PressureNonEquilibriumCompressible);
+    bcFactory.setPressureBoundaryCondition(BoundaryConditionFactory::PressureBC::OutflowNonReflective);
+
+    SPtr<TurbulenceModelFactory> tmFactory = std::make_shared<TurbulenceModelFactory>(para);
+    tmFactory->readConfigFile(config);
 
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-    real turbPos[3] = {3*reference_diameter, 3*reference_diameter, 3*reference_diameter};
-    real epsilon = 5.f; // width of gaussian smearing
-    real density = 1.225f;
-    int level = 0;
-    uint nBlades = 3;
-    uint nBladeNodes = 32;
+    int level = 1; // grid level at which the turbine samples velocities and distributes forces
+    const real epsilon = dx*exp2(-level)*1.5; // width of gaussian smearing
+    const real density = 1.225f;
+    const uint nBlades = 3;
+    const uint nBladeNodes = 32;
+    const real tipspeed_ratio = 7.5f; // tipspeed ratio = angular vel * radius / inflow vel
+    const real omega = 2*tipspeed_ratio*velocity/reference_diameter;
+    
+
+    SPtr<ActuatorFarm> actuator_farm = std::make_shared<ActuatorFarm>(nBlades, density, nBladeNodes, epsilon, level, dt, dx, true);
+    std::vector<real> bladeRadii;
+    real dr = reference_diameter/(nBladeNodes*2);
+    for(uint node=0; node<nBladeNodes; node++){ bladeRadii.emplace_back(dr*(node+1)); }
+    actuator_farm->addTurbine(turbPos[0], turbPos[1], turbPos[2], reference_diameter, omega, 0, 0, bladeRadii);
+    para->addActuator( actuator_farm );
 
-    SPtr<ActuatorLine> actuator_line =SPtr<ActuatorLine>( new ActuatorLine(nBlades, density, nBladeNodes, epsilon, turbPos[0], turbPos[1], turbPos[2], reference_diameter, level, dt, dx) );
-    para->addActuator( actuator_line );
 
-    SPtr<PointProbe> pointProbe = SPtr<PointProbe>( new PointProbe("pointProbe", para->getOutputPath(), 100, 1, 500, 100) );
-    std::vector<real> probeCoordsX = {reference_diameter,2*reference_diameter,5*reference_diameter};
-    std::vector<real> probeCoordsY = {3*reference_diameter,3*reference_diameter,3*reference_diameter};
-    std::vector<real> probeCoordsZ = {3*reference_diameter,3*reference_diameter,3*reference_diameter};
-    pointProbe->addProbePointsFromList(probeCoordsX, probeCoordsY, probeCoordsZ);
-    // pointProbe->addProbePointsFromXNormalPlane(2*D, 0.0, 0.0, L_y, L_z, (uint)L_y/dx, (uint)L_z/dx);
+    // SPtr<PointProbe> pointProbe = std::make_shared<PointProbe>("pointProbe", para->getOutputPath(), 100, 1, 500, 100);
+    // std::vector<real> probeCoordsX = {reference_diameter,2*reference_diameter,5*reference_diameter};
+    // std::vector<real> probeCoordsY = {3*reference_diameter,3*reference_diameter,3*reference_diameter};
+    // std::vector<real> probeCoordsZ = {3*reference_diameter,3*reference_diameter,3*reference_diameter};
+    // pointProbe->addProbePointsFromList(probeCoordsX, probeCoordsY, probeCoordsZ);
+    // // pointProbe->addProbePointsFromXNormalPlane(2*D, 0.0, 0.0, L_y, L_z, (uint)L_y/dx, (uint)L_z/dx);
 
-    pointProbe->addStatistic(Statistic::Means);
-    pointProbe->addStatistic(Statistic::Variances);
-    para->addProbe( pointProbe );
+    // pointProbe->addStatistic(Statistic::Means);
+    // pointProbe->addStatistic(Statistic::Variances);
+    // para->addProbe( pointProbe );
 
-    SPtr<PlaneProbe> planeProbe = SPtr<PlaneProbe>( new PlaneProbe("planeProbe", para->getOutputPath(), 100, 500, 100, 100) );
-    planeProbe->setProbePlane(5*reference_diameter, 0, 0, dx, L_y, L_z);
-    planeProbe->addStatistic(Statistic::Means);
-    para->addProbe( planeProbe );
+    // SPtr<PlaneProbe> planeProbe = std::make_shared<PlaneProbe>("planeProbe", para->getOutputPath(), 100, 500, 100, 100);
+    // planeProbe->setProbePlane(5*reference_diameter, 0, 0, dx, L_y, L_z);
+    // planeProbe->addStatistic(Statistic::Means);
+    // para->addProbe( planeProbe );
 
 
     auto cudaMemoryManager = std::make_shared<CudaMemoryManager>(para);
 
     auto gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemoryManager, communicator);
 
-    Simulation sim(para, cudaMemoryManager, communicator, *gridGenerator, &bcFactory);
+    Simulation sim(para, cudaMemoryManager, communicator, *gridGenerator, &bcFactory, tmFactory, &scalingFactory);
     sim.run();
 }
 
diff --git a/apps/gpu/LBM/ActuatorLine/configActuatorLine.txt b/apps/gpu/LBM/ActuatorLine/configActuatorLine.txt
index 233994f0d32a48190d84f7044500e24b06b926a9..5799f24716777295b2f835ab00561ff767ba87b9 100644
--- a/apps/gpu/LBM/ActuatorLine/configActuatorLine.txt
+++ b/apps/gpu/LBM/ActuatorLine/configActuatorLine.txt
@@ -6,3 +6,29 @@ Path = .
 #informations for reading
 ##################################################
 GridPath=.
+##################################################
+ReferenceDiameter=126
+NodesPerDiameter=32
+Velocity=9
+##################################################
+tStartOut=100
+tOut=100
+tEnd=1000
+##################################################
+
+tStartTmpAveraging=100
+tStartAveraging=100
+tAveraging=100
+tTmpAveraging=100
+tStartOutProbe=100
+tOutProbe=100
+
+##################################################
+#TurbulenceModel = QR
+#SGSconstant = 0.3333333
+#
+#QuadricLimiterP = 100000.0
+#QuadricLimiterM = 100000.0
+#QuadricLimiterD = 100000.0
+##################################################
+
diff --git a/apps/gpu/LBM/BoundaryLayer/BoundaryLayer.cpp b/apps/gpu/LBM/BoundaryLayer/BoundaryLayer.cpp
index 991025b649d69305c030fe2f1dd1763a2137af9b..5fc31904433bfe2df0722ab1c63f574d3fcb9a35 100644
--- a/apps/gpu/LBM/BoundaryLayer/BoundaryLayer.cpp
+++ b/apps/gpu/LBM/BoundaryLayer/BoundaryLayer.cpp
@@ -1,4 +1,35 @@
-
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file BoundaryLayer.cpp
+//! \ingroup BoundaryLayer
+//! \author Henry Korb, Henrik Asmuth
+//=======================================================================================
 #define _USE_MATH_DEFINES
 #include <math.h>
 #include <string>
@@ -8,6 +39,7 @@
 #include <fstream>
 #include <exception>
 #include <memory>
+#include <numeric>
 
 //////////////////////////////////////////////////////////////////////////
 
@@ -19,6 +51,7 @@
 #include "Core/VectorTypes.h"
 
 #include <basics/config/ConfigurationFile.h>
+#include "lbm/constants/NumericConstants.h"
 
 #include <logger/Logger.h>
 
@@ -28,12 +61,16 @@
 #include "GridGenerator/grid/GridBuilder/LevelGridBuilder.h"
 #include "GridGenerator/grid/GridBuilder/MultipleGridBuilder.h"
 #include "GridGenerator/grid/BoundaryConditions/Side.h"
+#include "GridGenerator/grid/BoundaryConditions/BoundaryCondition.h"
+
 #include "GridGenerator/grid/GridFactory.h"
 
+#include "geometries/Cuboid/Cuboid.h"
+#include "geometries/TriangularMesh/TriangularMesh.h"
+
 #include "GridGenerator/io/SimulationFileWriter/SimulationFileWriter.h"
 #include "GridGenerator/io/GridVTKWriter/GridVTKWriter.h"
-#include "GridGenerator/io/STLReaderWriter/STLReader.h"
-#include "GridGenerator/io/STLReaderWriter/STLWriter.h"
+#include "GridGenerator/TransientBCSetter/TransientBCSetter.h"
 
 //////////////////////////////////////////////////////////////////////////
 
@@ -44,24 +81,28 @@
 #include "VirtualFluids_GPU/DataStructureInitializer/GridReaderFiles/GridReader.h"
 #include "VirtualFluids_GPU/Parameter/Parameter.h"
 #include "VirtualFluids_GPU/Output/FileWriter.h"
-#include "VirtualFluids_GPU/PreCollisionInteractor/ActuatorLine.h"
 #include "VirtualFluids_GPU/PreCollisionInteractor/Probes/PointProbe.h"
 #include "VirtualFluids_GPU/PreCollisionInteractor/Probes/PlaneProbe.h"
 #include "VirtualFluids_GPU/PreCollisionInteractor/Probes/PlanarAverageProbe.h"
 #include "VirtualFluids_GPU/PreCollisionInteractor/Probes/WallModelProbe.h"
+#include "VirtualFluids_GPU/PreCollisionInteractor/PrecursorWriter.h"
 #include "VirtualFluids_GPU/Factories/BoundaryConditionFactory.h"
+#include "VirtualFluids_GPU/Factories/GridScalingFactory.h"
 #include "VirtualFluids_GPU/TurbulenceModels/TurbulenceModelFactory.h"
 
 #include "VirtualFluids_GPU/GPU/CudaMemoryManager.h"
 
+#include "utilities/communication.h"
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
 std::string path(".");
 
-std::string simulationName("BoundayLayer");
+std::string simulationName("BoundaryLayer");
 
+using namespace vf::lbm::constant;
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -87,8 +128,16 @@ void multipleLevel(const std::string& configPath)
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////^
     SPtr<Parameter> para = std::make_shared<Parameter>(communicator.getNummberOfProcess(), communicator.getPID(), &config);
     BoundaryConditionFactory bcFactory = BoundaryConditionFactory();
-
+    GridScalingFactory scalingFactory  = GridScalingFactory();
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    
+    const int  nProcs = communicator.getNummberOfProcess(); // number of processes, used below to split the domain along x
+    const uint procID = vf::gpu::Communicator::getInstance().getPID(); // id of this process
+    std::vector<uint> devices(10); // NOTE(review): list length is hard-coded to 10, independent of nProcs - confirm this is intended
+    std::iota(devices.begin(), devices.end(), 0); // fill with consecutive device ids 0..9
+    para->setDevices(devices); // NOTE(review): presumably replaces any "Devices" entry read from the config file - verify
+    para->setMaxDev(nProcs);
+    
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //
@@ -100,23 +149,45 @@ void multipleLevel(const std::string& configPath)
 
     LbmOrGks lbmOrGks = LBM;
 
-    const real H = 1000.0; // boundary layer height in m
+    const real H = config.getValue("boundaryLayerHeight", 1000.0); // boundary layer height in m
 
     const real L_x = 6*H;
     const real L_y = 4*H;
-    const real L_z = 1*H;
+    const real L_z = H;
+
+    const real z0  = config.getValue("z0", 0.1f); // roughness length in m
+    const real u_star = config.getValue("u_star", 0.4f); //friction velocity in m/s
+    const real kappa = config.getValue("vonKarmanConstant", 0.4f); // von Karman constant
 
-    const real z0  = 0.1; // roughness length in m
-    const real u_star = 0.4; //friction velocity in m/s
-    const real kappa = 0.4; // von Karman constant
+    const real viscosity = config.getValue("viscosity", 1.56e-5f);
 
-    const real viscosity = 1.56e-5;
+    const real velocity  = 0.5f*u_star/kappa*log(H/z0+1.f); //0.5 times max mean velocity at the top in m/s
 
-    const real velocity  = 0.5*u_star/kappa*log(L_z/z0); //0.5 times max mean velocity at the top in m/s
+    const real mach = config.getValue<real>("Ma", 0.1);
 
-    const real mach = config.contains("Ma")? config.getValue<real>("Ma"): 0.1;
+    const uint nodes_per_H = config.getValue<uint>("nz", 64);
 
-    const uint nodes_per_H = config.contains("nz")? config.getValue<uint>("nz"): 64;
+    const bool writePrecursor = config.getValue("writePrecursor", false);
+    bool useDistributions = false; // read unconditionally when the precursor BC type is chosen, so it needs a defined value even without precursor I/O
+    std::string precursorDirectory;
+    int nTWritePrecursor = 0; real tStartPrecursor = 0.0, posXPrecursor = 0.0; // only meaningful when writePrecursor is set
+    if(writePrecursor)
+    {
+        nTWritePrecursor     = config.getValue<int>("nTimestepsWritePrecursor");
+        tStartPrecursor      = config.getValue<real>("tStartPrecursor");
+        posXPrecursor        = config.getValue<real>("posXPrecursor");
+        useDistributions     = config.getValue<bool>("useDistributions", false);
+        precursorDirectory   = config.getValue<std::string>("precursorDirectory");
+    }
+
+    const bool readPrecursor = config.getValue("readPrecursor", false);
+    int timestepsBetweenReadsPrecursor = 0; // only meaningful when readPrecursor is set
+    if(readPrecursor)
+    {
+        timestepsBetweenReadsPrecursor = config.getValue<int>("nTimestepsReadPrecursor");
+        precursorDirectory             = config.getValue<std::string>("precursorDirectory");
+        useDistributions               = config.getValue<bool>("useDistributions", false);
+    }
 
     // all in s
     const float tStartOut   = config.getValue<real>("tStartOut");
@@ -130,7 +201,7 @@ void multipleLevel(const std::string& configPath)
     const float tOutProbe           =  config.getValue<real>("tOutProbe");
 
 
-    const real dx = L_z/real(nodes_per_H);
+    const real dx = H/real(nodes_per_H);
 
     const real dt = dx * mach / (sqrt(3) * velocity);
 
@@ -155,15 +226,17 @@ void multipleLevel(const std::string& configPath)
 
     para->setPrintFiles(true);
 
-    para->setForcing(pressureGradientLB, 0, 0);
+    if(!readPrecursor) para->setForcing(pressureGradientLB, 0, 0);
     para->setVelocityLB(velocityLB);
     para->setViscosityLB(viscosityLB);
     para->setVelocityRatio( dx / dt );
     para->setViscosityRatio( dx*dx/dt );
     para->setDensityRatio( 1.0 );
 
-    para->setMainKernel("TurbulentViscosityCumulantK17CompChim");
-
+    bool useStreams = (nProcs > 1 ? true: false);
+    // useStreams=false;
+    para->setUseStreams(useStreams);
+    para->setMainKernel("CumulantK17");
     para->setIsBodyForce( config.getValue<bool>("bodyForce") );
 
     para->setTimestepStartOut(uint(tStartOut/dt) );
@@ -172,64 +245,206 @@ void multipleLevel(const std::string& configPath)
 
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-    SPtr<TurbulenceModelFactory> tmFactory = SPtr<TurbulenceModelFactory>( new TurbulenceModelFactory(para) );
+    SPtr<TurbulenceModelFactory> tmFactory = std::make_shared<TurbulenceModelFactory>(para);
     tmFactory->readConfigFile( config );
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    const real xSplit = L_x/nProcs;
+    const real overlap = 8.0*dx;
+
+    real xMin      =  procID    * xSplit;
+    real xMax      = (procID+1) * xSplit;
+    real xGridMin  =  procID    * xSplit;
+    real xGridMax  = (procID+1) * xSplit;
+
+    real yMin      = 0.0;
+    real yMax      = L_y;
+    real zMin      = 0.0;
+    real zMax      = L_z; 
+
+    bool isFirstSubDomain = (nProcs > 1) && (procID == 0);          // first slab along x in a multi-process run
+    bool isLastSubDomain  = (nProcs > 1) && (procID == nProcs-1);   // last slab along x in a multi-process run
+    bool isMidSubDomain   = (nProcs > 1) && !isFirstSubDomain && !isLastSubDomain; // any slab in between
     
-    // tmFactory->setTurbulenceModel(TurbulenceModel::AMD);
-    // tmFactory->setModelConstant(config.getValue<real>("SGSconstant"));
+    if(isFirstSubDomain)
+    {
+        xGridMax += overlap;
+        if(!readPrecursor) xGridMin -= overlap;
+    }
+    if(isLastSubDomain)
+    {
+        xGridMin -= overlap;
+        if(!readPrecursor) xGridMax += overlap;
+    }
+    if(isMidSubDomain)
+    {
+        xGridMax += overlap;
+        xGridMin -= overlap;
+    }
 
-    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    gridBuilder->addCoarseGrid( xGridMin,  0.0,  0.0,
+                                xGridMax,  L_y,  L_z, dx);
+    if(true)// Add refinement (always enabled; the condition is a manual on/off switch)
+    {
+        gridBuilder->setNumberOfLayers(4,0);
+        real xMaxRefinement = readPrecursor? xGridMax-H: xGridMax;   // Stop the refinement a distance H before the outlet when a precursor is read, i.e. when the domain is not periodic in x
+        gridBuilder->addGrid( new Cuboid( xGridMin, 0.f, 0.f, xMaxRefinement, L_y,  0.5*L_z) , 1 ); // refined region covers the lower half of the domain height
+        para->setMaxLevel(2);
+        scalingFactory.setScalingFactory(GridScalingFactory::GridScaling::ScaleCompressible);
+    }
+
+    if(nProcs > 1)
+    {
+            gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xMin, xMax, yMin, yMax, zMin, zMax));        
+            gridBuilder->setPeriodicBoundaryCondition(false, true, false);
+    }
+    else         
+    { 
+        gridBuilder->setPeriodicBoundaryCondition(!readPrecursor, true, false);
+    }
 
-    gridBuilder->addCoarseGrid(0.0, 0.0, 0.0,
-                                L_x,  L_y,  L_z, dx);
-    // gridBuilder->setNumberOfLayers(12, 8);
+	gridBuilder->buildGrids(lbmOrGks, true); // buildGrids() has to be called before setting the BCs!!!!
 
-    // gridBuilder->addGrid( new Cuboid( 0.0, 0.0, 0.0, L_x,  L_y,  0.3*L_z) , 1 );
-    // para->setMaxLevel(2);
+    std::cout << "nProcs: " << nProcs << " Proc: " << procID << " isFirstSubDomain: " << isFirstSubDomain << " isLastSubDomain: " << isLastSubDomain << " isMidSubDomain: " << isMidSubDomain << std::endl; // separator before "Proc:" keeps the two numbers from running together
+    
+    if(nProcs > 1){
+        if (isFirstSubDomain || isMidSubDomain) {
+            gridBuilder->findCommunicationIndices(CommunicationDirections::PX, lbmOrGks);
+            gridBuilder->setCommunicationProcess(CommunicationDirections::PX, procID+1);
+        }
 
-    gridBuilder->setPeriodicBoundaryCondition(true, true, false);
+        if (isLastSubDomain || isMidSubDomain) {
+            gridBuilder->findCommunicationIndices(CommunicationDirections::MX, lbmOrGks);
+            gridBuilder->setCommunicationProcess(CommunicationDirections::MX, procID-1);
+        }
 
-	gridBuilder->buildGrids(lbmOrGks, false); // buildGrids() has to be called before setting the BCs!!!!
+        if (isFirstSubDomain && !readPrecursor) {
+            gridBuilder->findCommunicationIndices(CommunicationDirections::MX, lbmOrGks);
+            gridBuilder->setCommunicationProcess(CommunicationDirections::MX, nProcs-1);
+        }
 
+        if (isLastSubDomain && !readPrecursor) {
+            gridBuilder->findCommunicationIndices(CommunicationDirections::PX, lbmOrGks);
+            gridBuilder->setCommunicationProcess(CommunicationDirections::PX, 0);
+        }
+    }
     uint samplingOffset = 2;
-    // gridBuilder->setVelocityBoundaryCondition(SideType::MZ, 0.0, 0.0, 0.0);
+    
+    std::cout << " precursorDirectory " << precursorDirectory << std::endl;
+    
+    if(readPrecursor)
+    {
+        if(isFirstSubDomain || nProcs == 1)
+        {   
+            auto precursor = createFileCollection(precursorDirectory + "/precursor", FileType::VTK);
+            gridBuilder->setPrecursorBoundaryCondition(SideType::MX, precursor, timestepsBetweenReadsPrecursor);
+            // gridBuilder->setVelocityBoundaryCondition(SideType::MX, velocityLB, 0.0, 0.0);
+        }
+
+        if(isLastSubDomain || nProcs == 1)
+        {
+            gridBuilder->setPressureBoundaryCondition(SideType::PX, 0.f);
+        }     
+    } 
+
     gridBuilder->setStressBoundaryCondition(SideType::MZ,
                                             0.0, 0.0, 1.0,              // wall normals
-                                            samplingOffset, z0/dx);     // wall model settinng
-    para->setHasWallModelMonitor(true);
-    bcFactory.setStressBoundaryCondition(BoundaryConditionFactory::StressBC::StressPressureBounceBack);
+                                            samplingOffset, z0, dx);     // wall model settings
+    para->setHasWallModelMonitor(true);   
+    gridBuilder->setSlipBoundaryCondition(SideType::PZ,  0.0f,  0.0f, -1.0f); 
 
-    gridBuilder->setSlipBoundaryCondition(SideType::PZ,  0.0,  0.0, 0.0);
+    bcFactory.setVelocityBoundaryCondition(BoundaryConditionFactory::VelocityBC::VelocityCompressible);
+    bcFactory.setStressBoundaryCondition(BoundaryConditionFactory::StressBC::StressPressureBounceBack);
     bcFactory.setSlipBoundaryCondition(BoundaryConditionFactory::SlipBC::SlipBounceBack); 
-    
+    bcFactory.setPressureBoundaryCondition(BoundaryConditionFactory::PressureBC::OutflowNonReflective);
+    bcFactory.setPrecursorBoundaryCondition(useDistributions ? BoundaryConditionFactory::PrecursorBC::DistributionsPrecursor : BoundaryConditionFactory::PrecursorBC::VelocityPrecursor);
+    para->setOutflowPressureCorrectionFactor(0.0); 
 
-    real cPi = 3.1415926535897932384626433832795;
-    para->setInitialCondition([&](real coordX, real coordY, real coordZ, real &rho, real &vx, real &vy, real &vz) {
+    if(readPrecursor)
+    {
+        para->setInitialCondition([&](real coordX, real coordY, real coordZ, real &rho, real &vx, real &vy, real &vz) {
         rho = (real)0.0;
-        vx  = (u_star/0.4 * log(coordZ/z0) + 2.0*sin(cPi*16.0f*coordX/L_x)*sin(cPi*8.0f*coordZ/H)/(pow(coordZ/H,c2o1)+c1o1))  * dt / dx; 
-        vy  = 2.0*sin(cPi*16.0f*coordX/L_x)*sin(cPi*8.0f*coordZ/H)/(pow(coordZ/H,c2o1)+c1o1)  * dt / dx; 
-        vz  = 8.0*u_star/0.4*(sin(cPi*8.0*coordY/H)*sin(cPi*8.0*coordZ/H)+sin(cPi*8.0*coordX/L_x))/(pow(L_z/2.0-coordZ, c2o1)+c1o1) * dt / dx;
-    });
-    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+        // Log-law mean profile u_star/kappa * ln(z/z0 + 1), scaled by dt/dx; the prefactor is applied exactly once and uses the configured kappa.
+        vx  = u_star/kappa * log(coordZ/z0+c1o1) * dt/dx;
+        vy  = c0o1;
+        vz  = c0o1;
+        });
+    }
+    else
+    {
+        para->setInitialCondition([&](real coordX, real coordY, real coordZ, real &rho, real &vx, real &vy, real &vz) {
+        // Log-law mean profile (using the configured kappa) plus sinusoidal perturbations; every component is scaled by dt/dx.
+        rho = c0o1;
+        vx  = (u_star/kappa * log(coordZ/z0+c1o1) + c2o1*sin(cPi*c16o1*coordX/L_x)*sin(cPi*c8o1*coordZ/H)/(pow(coordZ/H,c2o1)+c1o1)) * dt/dx;
+        vy  = c2o1*sin(cPi*c16o1*coordX/L_x)*sin(cPi*c8o1*coordZ/H)/(pow(coordZ/H,c2o1)+c1o1) * dt/dx;
+        vz  = c8o1*u_star/c4o10*(sin(cPi*c8o1*coordY/H)*sin(cPi*c8o1*coordZ/H)+sin(cPi*c8o1*coordX/L_x))/(pow(c1o2*L_z-coordZ, c2o1)+c1o1) * dt/dx;
+        });
+    }
+
+
 
-    SPtr<PlanarAverageProbe> planarAverageProbe = SPtr<PlanarAverageProbe>( new PlanarAverageProbe("planeProbe", para->getOutputPath(), tStartAveraging/dt, tStartTmpAveraging/dt, tAveraging/dt , tStartOutProbe/dt, tOutProbe/dt, 'z') );
-    planarAverageProbe->addAllAvailableStatistics();
-    planarAverageProbe->setFileNameToNOut();
-    para->addProbe( planarAverageProbe );
+    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    if(!readPrecursor && (isFirstSubDomain || nProcs == 1))
+    {
+        SPtr<PlanarAverageProbe> planarAverageProbe = SPtr<PlanarAverageProbe>( new PlanarAverageProbe("planeProbe", para->getOutputPath(), tStartAveraging/dt, tStartTmpAveraging/dt, tAveraging/dt , tStartOutProbe/dt, tOutProbe/dt, 'z') );
+        planarAverageProbe->addAllAvailableStatistics();
+        planarAverageProbe->setFileNameToNOut();
+        para->addProbe( planarAverageProbe );
+
+        para->setHasWallModelMonitor(true);
+        SPtr<WallModelProbe> wallModelProbe = SPtr<WallModelProbe>( new WallModelProbe("wallModelProbe", para->getOutputPath(), tStartAveraging/dt, tStartTmpAveraging/dt, tAveraging/dt/4.0 , tStartOutProbe/dt, tOutProbe/dt) );
+        wallModelProbe->addAllAvailableStatistics();
+        wallModelProbe->setFileNameToNOut();
+        wallModelProbe->setForceOutputToStress(true);
+        if(para->getIsBodyForce())
+            wallModelProbe->setEvaluatePressureGradient(true);
+        para->addProbe( wallModelProbe );
+    }
+
+    SPtr<PlaneProbe> planeProbe1 = SPtr<PlaneProbe>( new PlaneProbe("planeProbe_1", para->getOutputPath(), tStartAveraging/dt, 10, tStartOutProbe/dt, tOutProbe/dt) );
+    planeProbe1->setProbePlane(100.0, 0.0, 0, dx, L_y, L_z);
+    planeProbe1->addAllAvailableStatistics();
+    para->addProbe( planeProbe1 );
+
+    if(readPrecursor)
+    {
+        SPtr<PlaneProbe> planeProbe2 = SPtr<PlaneProbe>( new PlaneProbe("planeProbe_2", para->getOutputPath(), tStartAveraging/dt, 10, tStartOutProbe/dt, tOutProbe/dt) );
+        planeProbe2->setProbePlane(1000.0, 0.0, 0, dx, L_y, L_z);
+        planeProbe2->addAllAvailableStatistics();
+        para->addProbe( planeProbe2 );
+
+        SPtr<PlaneProbe> planeProbe3 = SPtr<PlaneProbe>( new PlaneProbe("planeProbe_3", para->getOutputPath(), tStartAveraging/dt, 10, tStartOutProbe/dt, tOutProbe/dt) );
+        planeProbe3->setProbePlane(1500.0, 0.0, 0, dx, L_y, L_z);
+        planeProbe3->addAllAvailableStatistics();
+        para->addProbe( planeProbe3 );
+
+        SPtr<PlaneProbe> planeProbe4 = SPtr<PlaneProbe>( new PlaneProbe("planeProbe_4", para->getOutputPath(), tStartAveraging/dt, 10, tStartOutProbe/dt, tOutProbe/dt) );
+        planeProbe4->setProbePlane(2000.0, 0.0, 0, dx, L_y, L_z);
+        planeProbe4->addAllAvailableStatistics();
+        para->addProbe( planeProbe4 );
+
+        SPtr<PlaneProbe> planeProbe5 = SPtr<PlaneProbe>( new PlaneProbe("planeProbe_5", para->getOutputPath(), tStartAveraging/dt, 10, tStartOutProbe/dt, tOutProbe/dt) );
+        planeProbe5->setProbePlane(2500.0, 0.0, 0, dx, L_y, L_z);
+        planeProbe5->addAllAvailableStatistics();
+        para->addProbe( planeProbe5 );
+
+        SPtr<PlaneProbe> planeProbe6 = SPtr<PlaneProbe>( new PlaneProbe("planeProbe_6", para->getOutputPath(), tStartAveraging/dt, 10, tStartOutProbe/dt, tOutProbe/dt) );
+        planeProbe6->setProbePlane(0.0, L_y/2.0, 0, L_x, dx, L_z);
+        planeProbe6->addAllAvailableStatistics();
+        para->addProbe( planeProbe6 );
+    }
 
-    para->setHasWallModelMonitor(true);
-    SPtr<WallModelProbe> wallModelProbe = SPtr<WallModelProbe>( new WallModelProbe("wallModelProbe", para->getOutputPath(), tStartAveraging/dt, tStartTmpAveraging/dt, tAveraging/dt/4.0 , tStartOutProbe/dt, tOutProbe/dt) );
-    wallModelProbe->addAllAvailableStatistics();
-    wallModelProbe->setFileNameToNOut();
-    wallModelProbe->setForceOutputToStress(true);
-    if(para->getIsBodyForce())
-        wallModelProbe->setEvaluatePressureGradient(true);
-    para->addProbe( wallModelProbe );
+    if(writePrecursor)
+    {
+        SPtr<PrecursorWriter> precursorWriter = std::make_shared<PrecursorWriter>("precursor", para->getOutputPath()+precursorDirectory, posXPrecursor, 0, L_y, 0, L_z, tStartPrecursor/dt, nTWritePrecursor, useDistributions? OutputVariable::Distributions: OutputVariable::Velocities, 1000);
+        para->addProbe(precursorWriter);
+    }
 
     auto cudaMemoryManager = std::make_shared<CudaMemoryManager>(para);
     auto gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemoryManager, communicator);
 
-    Simulation sim(para, cudaMemoryManager, communicator, *gridGenerator, &bcFactory, tmFactory);
+    Simulation sim(para, cudaMemoryManager, communicator, *gridGenerator, &bcFactory, tmFactory, &scalingFactory);
     sim.run();
 }
 
diff --git a/apps/gpu/LBM/BoundaryLayer/configBoundaryLayer.txt b/apps/gpu/LBM/BoundaryLayer/configBoundaryLayer.txt
index a489f0ab89738a193b16fee41c212a5943f6525d..83e7861a5fb85ea800d187699f1c6c1409422f0a 100644
--- a/apps/gpu/LBM/BoundaryLayer/configBoundaryLayer.txt
+++ b/apps/gpu/LBM/BoundaryLayer/configBoundaryLayer.txt
@@ -7,7 +7,7 @@ Path = .
 ##################################################
 GridPath = .
 ##################################################
-Devices = 1 
+Devices = 0 
 ##################################################
 tStartOut           = 0
 tOut                = 100000
@@ -28,3 +28,15 @@ SGSconstant = 0.2
 QuadricLimiterP = 100000.0
 QuadricLimiterM = 100000.0
 QuadricLimiterD = 100000.0
+
+##################################################
+readPrecursor = false
+nTimestepsReadPrecursor = 10
+precursorDirectory = precursor
+
+##################################################
+writePrecursor = false
+nTimestepsWritePrecursor = 10
+
+tStartPrecursor = 100
+posXPrecursor = 3000
\ No newline at end of file
diff --git a/apps/gpu/LBM/DrivenCavity/DrivenCavity.cpp b/apps/gpu/LBM/DrivenCavity/DrivenCavity.cpp
index 69ecb3d8cbd45a8a7419437e934a57bd20b0bc9f..5e1cab7f48f7fb672c85f0decee4bcc2d4ac158f 100644
--- a/apps/gpu/LBM/DrivenCavity/DrivenCavity.cpp
+++ b/apps/gpu/LBM/DrivenCavity/DrivenCavity.cpp
@@ -85,7 +85,7 @@ int main()
         const real L = 1.0;
         const real Re = 1000.0;
         const real velocity = 1.0;
-        const real dt = (real)0.5e-3;
+        const real velocityLB = 0.05; // LB units
         const uint nx = 64;
 
         const uint timeStepOut = 1000;
@@ -109,10 +109,20 @@ int main()
         auto gridBuilder = MultipleGridBuilder::makeShared(gridFactory);
 
         //////////////////////////////////////////////////////////////////////////
-        // create grid
+        // compute parameters in lattice units
         //////////////////////////////////////////////////////////////////////////
 
-        real dx = L / real(nx);
+        const real dx = L / real(nx);
+        const real dt  = velocityLB / velocity * dx;
+
+        const real vxLB = velocityLB / sqrt(2.0); // LB units
+        const real vyLB = velocityLB / sqrt(2.0); // LB units
+
+        const real viscosityLB = nx * velocityLB / Re; // LB units
+
+        //////////////////////////////////////////////////////////////////////////
+        // create grid
+        //////////////////////////////////////////////////////////////////////////
 
         gridBuilder->addCoarseGrid(-0.5 * L, -0.5 * L, -0.5 * L, 0.5 * L, 0.5 * L, 0.5 * L, dx);
 
@@ -124,17 +134,6 @@ int main()
 
         gridBuilder->buildGrids(LbmOrGks::LBM, false);
 
-        //////////////////////////////////////////////////////////////////////////
-        // compute parameters in lattice units
-        //////////////////////////////////////////////////////////////////////////
-
-        const real velocityLB = velocity * dt / dx; // LB units
-
-        const real vxLB = velocityLB / sqrt(2.0); // LB units
-        const real vyLB = velocityLB / sqrt(2.0); // LB units
-
-        const real viscosityLB = nx * velocityLB / Re; // LB units
-
         //////////////////////////////////////////////////////////////////////////
         // set parameters
         //////////////////////////////////////////////////////////////////////////
@@ -154,7 +153,7 @@ int main()
         para->setTimestepOut(timeStepOut);
         para->setTimestepEnd(timeStepEnd);
 
-        para->setMainKernel("CumulantK17CompChimRedesigned");
+        para->setMainKernel("CumulantK17");
 
         //////////////////////////////////////////////////////////////////////////
         // set boundary conditions
@@ -164,8 +163,8 @@ int main()
         gridBuilder->setNoSlipBoundaryCondition(SideType::MX);
         gridBuilder->setNoSlipBoundaryCondition(SideType::PY);
         gridBuilder->setNoSlipBoundaryCondition(SideType::MY);
-        gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, vyLB, 0.0);
         gridBuilder->setNoSlipBoundaryCondition(SideType::MZ);
+        gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, vyLB, 0.0);
 
         BoundaryConditionFactory bcFactory;
 
diff --git a/apps/gpu/LBM/DrivenCavityMultiGPU/DrivenCavityMultiGPU.cpp b/apps/gpu/LBM/DrivenCavityMultiGPU/DrivenCavityMultiGPU.cpp
index 8ca6939924fcfba22c8b96f000b9d8d05a3f7f43..ed6b4da7a3218e4d89ac90b053d9c054e4dd8205 100644
--- a/apps/gpu/LBM/DrivenCavityMultiGPU/DrivenCavityMultiGPU.cpp
+++ b/apps/gpu/LBM/DrivenCavityMultiGPU/DrivenCavityMultiGPU.cpp
@@ -50,6 +50,7 @@
 #include "VirtualFluids_GPU/Kernel/Utilities/KernelFactory/KernelFactoryImp.h"
 #include "VirtualFluids_GPU/PreProcessor/PreProcessorFactory/PreProcessorFactoryImp.h"
 #include "VirtualFluids_GPU/Factories/BoundaryConditionFactory.h"
+#include "VirtualFluids_GPU/Factories/GridScalingFactory.h"
 
 #include "VirtualFluids_GPU/GPU/CudaMemoryManager.h"
 
@@ -57,19 +58,6 @@
 
 #include "utilities/communication.h"
 
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-//
-//          U s e r    s e t t i n g s
-//
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-const std::string outPath("output/DrivenCavity_Results/");
-const std::string gridPath = "output/DrivenCavity_Results/grid/";
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -92,15 +80,12 @@ void multipleLevel(std::filesystem::path& configPath)
     config.load(configPath.string());
     SPtr<Parameter> para = std::make_shared<Parameter>(communicator.getNummberOfProcess(), communicator.getPID(), &config);
     BoundaryConditionFactory bcFactory = BoundaryConditionFactory();
-
+    GridScalingFactory scalingFactory = GridScalingFactory();
 
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
     bool useGridGenerator = true;
     bool useLevels        = true;
-    // para->setUseStreams(useStreams);                  // set in config
-    // para->useReducedCommunicationAfterFtoC = true;    // set in config
-    para->setCalcTurbulenceIntensity(false);
 
     if (para->getNumprocs() == 1) {
         para->useReducedCommunicationAfterFtoC = false;
@@ -108,47 +93,40 @@ void multipleLevel(std::filesystem::path& configPath)
 
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-    const real L        = 1.0;
-    const real Re       = 1000.0; // 1000
+    const std::string outPath("output/");
+    const std::string gridPath = "output/";
+    std::string simulationName("DrivenCavityMultiGPU");
+
+    const real L = 1.0;
+    const real Re = 1000.0;
     const real velocity = 1.0;
-    const real dt       = (real)1.0e-3; // 0.5e-3;
-    const uint nx       = 64;
-    std::string simulationName("DrivenCavityChimMultiGPU");
+    const real velocityLB = 0.05; // LB units
+    const uint nx = 64;
 
     // para->setTimestepOut(10000);   // set in config
     // para->setTimestepEnd(10000);   // set in config
 
     const real dxGrid      = L / real(nx);
-    const real velocityLB  = velocity * dt / dxGrid;       // LB units
+    const real dt  = velocityLB / velocity * dxGrid;
     const real vxLB        = velocityLB / (real)sqrt(2.0); // LB units
     const real vyLB        = velocityLB / (real)sqrt(2.0); // LB units
     const real viscosityLB = nx * velocityLB / Re;         // LB units
 
-    para->setInitialCondition([&](real coordX, real coordY, real coordZ, real &rho, real &vx, real &vy, real &vz) {
-        rho = (real)1.0;
-        vx  = (real)(coordX * velocityLB);
-        vy  = (real)(coordY * velocityLB);
-        vz  = (real)(coordZ * velocityLB);
-    });
-
     para->setVelocityLB(velocityLB);
     para->setViscosityLB(viscosityLB);
     para->setVelocityRatio(velocity / velocityLB);
-    para->setDensityRatio((real)1.0); // correct value?
+    para->setDensityRatio((real)1.0);
 
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-    para->setCalcDragLift(false);
-    para->setUseWale(false);
-
     if (para->getOutputPath() == "output/") {para->setOutputPath(outPath);}
     para->setOutputPrefix(simulationName);
 
     para->setPrintFiles(true);
     std::cout << "Write result files to " << para->getFName() << std::endl;
 
-    // para->setMainKernel("CumulantK17CompChim");
-    para->setMainKernel("CumulantK17CompChimStream");
+    para->setMainKernel("CumulantK17");
+    scalingFactory.setScalingFactory(GridScalingFactory::GridScaling::ScaleCompressible);
 
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -157,7 +135,7 @@ void multipleLevel(std::filesystem::path& configPath)
     VF_LOG_INFO("velocity LB [dx/dt]              = {}", vxLB);
     VF_LOG_INFO("viscosity LB [dx/dt]             = {}", viscosityLB);
     VF_LOG_INFO("dxGrid [-]                       = {}\n", dxGrid);
-
+    VF_LOG_INFO("dt [s]                           = {}", dt);
     VF_LOG_INFO("simulation parameters:");
     VF_LOG_INFO("mainKernel                       = {}\n", para->getMainKernel());
 
@@ -226,7 +204,7 @@ void multipleLevel(std::filesystem::path& configPath)
                 if (generatePart == 0)
                     gridBuilder->setVelocityBoundaryCondition(SideType::MZ, 0.0, 0.0, 0.0);
                 if (generatePart == 1)
-                    gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, vyLB, 0.0);
                 gridBuilder->setVelocityBoundaryCondition(SideType::MX, 0.0, 0.0, 0.0);
                 gridBuilder->setVelocityBoundaryCondition(SideType::MY, 0.0, 0.0, 0.0);
                 gridBuilder->setVelocityBoundaryCondition(SideType::PX, 0.0, 0.0, 0.0);
@@ -303,13 +281,13 @@ void multipleLevel(std::filesystem::path& configPath)
                 }
                 if (generatePart == 2) {
                     gridBuilder->setVelocityBoundaryCondition(SideType::MX, 0.0, 0.0, 0.0);
-                    gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, vyLB, 0.0);
                 }
                 gridBuilder->setVelocityBoundaryCondition(SideType::MY, 0.0, 0.0, 0.0);
                 gridBuilder->setVelocityBoundaryCondition(SideType::PY, 0.0, 0.0, 0.0);
                 if (generatePart == 3) {
                     gridBuilder->setVelocityBoundaryCondition(SideType::PX, 0.0, 0.0, 0.0);
-                    gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, vyLB, 0.0);
                 }
                 if (generatePart == 1) {
                     gridBuilder->setVelocityBoundaryCondition(SideType::MZ, 0.0, 0.0, 0.0);
@@ -472,22 +450,22 @@ void multipleLevel(std::filesystem::path& configPath)
                 if (generatePart == 4) {
                     gridBuilder->setVelocityBoundaryCondition(SideType::MX, 0.0, 0.0, 0.0);
                     gridBuilder->setVelocityBoundaryCondition(SideType::MY, 0.0, 0.0, 0.0);
-                    gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, vyLB, 0.0);
                 }
                 if (generatePart == 5) {
                     gridBuilder->setVelocityBoundaryCondition(SideType::MX, 0.0, 0.0, 0.0);
                     gridBuilder->setVelocityBoundaryCondition(SideType::PY, 0.0, 0.0, 0.0);
-                    gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, vyLB, 0.0);
                 }
                 if (generatePart == 6) {
                     gridBuilder->setVelocityBoundaryCondition(SideType::MY, 0.0, 0.0, 0.0);
                     gridBuilder->setVelocityBoundaryCondition(SideType::PX, 0.0, 0.0, 0.0);
-                    gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, vyLB, 0.0);
                 }
                 if (generatePart == 7) {
                     gridBuilder->setVelocityBoundaryCondition(SideType::PY, 0.0, 0.0, 0.0);
                     gridBuilder->setVelocityBoundaryCondition(SideType::PX, 0.0, 0.0, 0.0);
-                    gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, vyLB, 0.0);
                 }
                 //////////////////////////////////////////////////////////////////////////
             }
@@ -513,7 +491,7 @@ void multipleLevel(std::filesystem::path& configPath)
             gridBuilder->setVelocityBoundaryCondition(SideType::MY, 0.0, 0.0, 0.0);
             gridBuilder->setVelocityBoundaryCondition(SideType::PY, 0.0, 0.0, 0.0);
             gridBuilder->setVelocityBoundaryCondition(SideType::MZ, 0.0, 0.0, 0.0);
-            gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, 0.0, 0.0);
+            gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, vyLB, 0.0);
 
             //////////////////////////////////////////////////////////////////////////
             gridBuilder->writeGridsToVtk(outPath + "/grid/");
@@ -534,7 +512,7 @@ void multipleLevel(std::filesystem::path& configPath)
         gridGenerator = GridProvider::makeGridReader(FILEFORMAT::BINARY, para, cudaMemoryManager);
     }
 
-    Simulation sim(para, cudaMemoryManager, communicator, *gridGenerator, &bcFactory);
+    Simulation sim(para, cudaMemoryManager, communicator, *gridGenerator, &bcFactory, &scalingFactory);
     sim.run();
 
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/apps/gpu/LBM/DrivenCavityMultiGPU/configPhoenix4GPU.txt b/apps/gpu/LBM/DrivenCavityMultiGPU/configPhoenix4GPU.txt
index c710922b9fc82ac7680f5f7daade4faa235bc957..c5789cdf96049b7c0a31ce693c29cd2db4952a58 100644
--- a/apps/gpu/LBM/DrivenCavityMultiGPU/configPhoenix4GPU.txt
+++ b/apps/gpu/LBM/DrivenCavityMultiGPU/configPhoenix4GPU.txt
@@ -4,35 +4,11 @@
 Devices="0 1 2 3"
 NumberOfDevices=4
 
-##################################################
-#informations for Writing
-##################################################
-Path=/work/y0078217/Results/DrivenCavityMultiGPUResults/4GPU/
-#Prefix="DrivenCavityMultiGPU" 
-#WriteGrid=true
-##################################################
-#informations for reading
-##################################################
-GridPath=/work/y0078217/Grids/GridDrivenCavityMultiGPU/4GPU/
-#GridPath="C:"
-
-##################################################
-#number of grid levels
-##################################################
-#NOGL=1
-
-##################################################
-#LBM Version
-##################################################
-#D3Qxx=27
-#MainKernelName=CumulantK17CompChim
-
 ##################################################
 #simulation parameter
 ##################################################
-TimeEnd=1
-TimeOut=1
-#TimeStartOut=0
+TimeEnd=10000
+TimeOut=10000
 
 ##################################################
 # CUDA Streams and optimized communication (only used for multiple GPUs)
diff --git a/apps/gpu/LBM/DrivenCavityUniform/CMakeLists.txt b/apps/gpu/LBM/DrivenCavityUniform/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..40b4f08d7500c56efae7378df6398d065e4ecbfb
--- /dev/null
+++ b/apps/gpu/LBM/DrivenCavityUniform/CMakeLists.txt
@@ -0,0 +1,10 @@
+PROJECT(DrivenCavityUniform LANGUAGES CUDA CXX)
+
+#LIST(APPEND CS_COMPILER_FLAGS_CXX "-DOMPI_SKIP_MPICXX" )
+# vf_add_library is a project helper defined elsewhere; BUILDTYPE binary presumably produces an executable (confirm in the helper).
+vf_add_library(BUILDTYPE binary PRIVATE_LINK basics VirtualFluids_GPU GridGenerator MPI::MPI_CXX FILES DrivenCavity.cpp)
+# Compile DrivenCavity.cpp as a CUDA source even though it carries a .cpp extension.
+set_source_files_properties(DrivenCavity.cpp PROPERTIES LANGUAGE CUDA)
+# Turn on CUDA separable compilation for the DrivenCavityUniform target.
+set_target_properties(DrivenCavityUniform PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+
diff --git a/apps/gpu/LBM/DrivenCavityUniform/DrivenCavity.cpp b/apps/gpu/LBM/DrivenCavityUniform/DrivenCavity.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..958ef4714118aac34b8cfb0bec3aab97b108b01d
--- /dev/null
+++ b/apps/gpu/LBM/DrivenCavityUniform/DrivenCavity.cpp
@@ -0,0 +1,231 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file DrivenCavity.cpp
+//! \ingroup Applications
+//! \author Martin Schoenherr, Stephan Lenz
+//=======================================================================================
+#define _USE_MATH_DEFINES
+#include <exception>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+
+//////////////////////////////////////////////////////////////////////////
+
+#include "Core/DataTypes.h"
+#include "Core/LbmOrGks.h"
+#include "Core/Logger/Logger.h"
+#include "Core/VectorTypes.h"
+#include "PointerDefinitions.h"
+
+#include <logger/Logger.h>
+
+//////////////////////////////////////////////////////////////////////////
+
+#include "GridGenerator/grid/BoundaryConditions/Side.h"
+#include "GridGenerator/grid/GridBuilder/LevelGridBuilder.h"
+#include "GridGenerator/grid/GridBuilder/MultipleGridBuilder.h"
+#include "GridGenerator/grid/GridFactory.h"
+#include "GridGenerator/geometries/Cuboid/Cuboid.h"
+
+//////////////////////////////////////////////////////////////////////////
+
+#include "VirtualFluids_GPU/Factories/BoundaryConditionFactory.h"
+#include "VirtualFluids_GPU/Factories/GridScalingFactory.h"
+#include "VirtualFluids_GPU/Communication/Communicator.h"
+#include "VirtualFluids_GPU/DataStructureInitializer/GridProvider.h"
+#include "VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.h"
+#include "VirtualFluids_GPU/GPU/CudaMemoryManager.h"
+#include "VirtualFluids_GPU/LBM/Simulation.h"
+#include "VirtualFluids_GPU/Output/FileWriter.h"
+#include "VirtualFluids_GPU/Parameter/Parameter.h"
+#include "VirtualFluids_GPU/Factories/GridScalingFactory.h"
+
+//////////////////////////////////////////////////////////////////////////
+
+int main()
+{
+    // Sets up and runs a lid-driven cavity on one uniform grid; every exception is logged below and main still returns 0.
+    try {
+        vf::logging::Logger::initalizeLogger();
+        //////////////////////////////////////////////////////////////////////////
+        // Simulation parameters
+        //////////////////////////////////////////////////////////////////////////
+        std::string path("./output/DrivenCavity_uniform");
+        std::string simulationName("LidDrivenCavity");
+
+        const real L = 1.0;
+        const real Re = 1000.0;
+        const real velocity = 1.0;
+        const real dt = (real)0.5e-3; // together with dx = L / nx this fixes velocityLB below
+        const uint nx = 64;
+
+        const uint timeStepOut = 1000;  // passed to para->setTimestepOut
+        const uint timeStepEnd = 10000; // passed to para->setTimestepEnd
+
+        //////////////////////////////////////////////////////////////////////////
+        // setup logger
+        //////////////////////////////////////////////////////////////////////////
+
+        logging::Logger::addStream(&std::cout);
+        logging::Logger::setDebugLevel(logging::Logger::Level::INFO_LOW);
+        logging::Logger::timeStamp(logging::Logger::ENABLE);
+        logging::Logger::enablePrintedRankNumbers(logging::Logger::ENABLE);
+
+        //////////////////////////////////////////////////////////////////////////
+        // setup gridGenerator
+        //////////////////////////////////////////////////////////////////////////
+
+        auto gridFactory = GridFactory::make();
+        gridFactory->setTriangularMeshDiscretizationMethod(TriangularMeshDiscretizationMethod::POINT_IN_OBJECT);
+        auto gridBuilder = MultipleGridBuilder::makeShared(gridFactory);
+
+        //////////////////////////////////////////////////////////////////////////
+        // create grid
+        //////////////////////////////////////////////////////////////////////////
+
+        real dx = L / real(nx);
+
+        gridBuilder->addCoarseGrid(-0.5 * L, -0.5 * L, -0.5 * L, 0.5 * L, 0.5 * L, 0.5 * L, dx);
+
+        // gridBuilder->addGrid(new Cuboid(-0.25, -0.25, -0.25, 0.25, 0.25, 0.25), 1); // add fine grid
+        GridScalingFactory scalingFactory = GridScalingFactory();
+        scalingFactory.setScalingFactory(GridScalingFactory::GridScaling::ScaleCompressible);
+
+        gridBuilder->setPeriodicBoundaryCondition(false, false, false);
+
+        gridBuilder->buildGrids(LbmOrGks::LBM, false);
+
+        //////////////////////////////////////////////////////////////////////////
+        // compute parameters in lattice units
+        //////////////////////////////////////////////////////////////////////////
+
+        const real velocityLB = velocity * dt / dx; // LB units
+
+        const real vxLB = velocityLB / sqrt(2.0); // LB units; equal x/y components, so |(vxLB, vyLB)| == velocityLB
+        const real vyLB = velocityLB / sqrt(2.0); // LB units
+
+        const real viscosityLB = nx * velocityLB / Re; // LB units; i.e. Re = velocityLB * nx / viscosityLB
+
+        //////////////////////////////////////////////////////////////////////////
+        // set parameters
+        //////////////////////////////////////////////////////////////////////////
+        SPtr<Parameter> para = std::make_shared<Parameter>();
+
+        para->setOutputPath(path);
+        para->setOutputPrefix(simulationName);
+
+        para->setPrintFiles(true);
+
+        para->setVelocityLB(velocityLB);
+        para->setViscosityLB(viscosityLB);
+
+        para->setVelocityRatio(velocity / velocityLB);
+        para->setDensityRatio(1.0);
+
+        para->setTimestepOut(timeStepOut);
+        para->setTimestepEnd(timeStepEnd);
+
+        para->setMainKernel("CumulantK17");
+
+        //////////////////////////////////////////////////////////////////////////
+        // set boundary conditions
+        //////////////////////////////////////////////////////////////////////////
+
+        gridBuilder->setNoSlipBoundaryCondition(SideType::PX);
+        gridBuilder->setNoSlipBoundaryCondition(SideType::MX);
+        gridBuilder->setNoSlipBoundaryCondition(SideType::PY);
+        gridBuilder->setNoSlipBoundaryCondition(SideType::MY);
+        gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, vyLB, 0.0); // the only moving wall (the lid)
+        gridBuilder->setNoSlipBoundaryCondition(SideType::MZ);
+
+        BoundaryConditionFactory bcFactory;
+
+        bcFactory.setNoSlipBoundaryCondition(BoundaryConditionFactory::NoSlipBC::NoSlipBounceBack);
+        bcFactory.setVelocityBoundaryCondition(BoundaryConditionFactory::VelocityBC::VelocitySimpleBounceBackCompressible);
+
+        //////////////////////////////////////////////////////////////////////////
+        // copy mesh to simulation
+        //////////////////////////////////////////////////////////////////////////
+
+        vf::gpu::Communicator &communicator = vf::gpu::Communicator::getInstance();
+
+        auto cudaMemoryManager = std::make_shared<CudaMemoryManager>(para);
+        SPtr<GridProvider> gridGenerator =
+            GridProvider::makeGridGenerator(gridBuilder, para, cudaMemoryManager, communicator);
+
+        //////////////////////////////////////////////////////////////////////////
+        // run simulation
+        //////////////////////////////////////////////////////////////////////////
+
+        VF_LOG_INFO("Start Running DrivenCavity Showcase...");
+        printf("\n");
+        VF_LOG_INFO("world parameter:");
+        VF_LOG_INFO("--------------");
+        VF_LOG_INFO("dt [s]                 = {}", dt);
+        VF_LOG_INFO("world_length   [m]     = {}", L);
+        VF_LOG_INFO("world_velocity [m/s]   = {}", velocity);
+        VF_LOG_INFO("dx [m]                 = {}", dx);
+        printf("\n");
+        VF_LOG_INFO("LB parameter:");
+        VF_LOG_INFO("--------------");
+        VF_LOG_INFO("Re                     = {}", Re);
+        VF_LOG_INFO("lb_velocity [dx/dt]    = {}", velocityLB);
+        VF_LOG_INFO("lb_viscosity [dx^2/dt] = {}", viscosityLB);
+        VF_LOG_INFO("lb_vx [dx/dt] (lb_velocity/sqrt(2)) = {}", vxLB);
+        VF_LOG_INFO("lb_vy [dx/dt] (lb_velocity/sqrt(2)) = {}", vyLB);
+        printf("\n");
+        VF_LOG_INFO("simulation parameter:");
+        VF_LOG_INFO("--------------");
+        VF_LOG_INFO("nx                     = {}", nx);
+        VF_LOG_INFO("ny                     = {}", nx);
+        VF_LOG_INFO("nz                     = {}", nx);
+        VF_LOG_INFO("number of nodes        = {}", nx * nx * nx);
+        VF_LOG_INFO("n timesteps            = {}", timeStepEnd);
+        VF_LOG_INFO("write_nth_timestep     = {}", timeStepOut);
+        VF_LOG_INFO("output_path            = {}", path);
+
+        Simulation sim(para, cudaMemoryManager, communicator, *gridGenerator, &bcFactory, &scalingFactory);
+        sim.run();
+
+    } catch (const spdlog::spdlog_ex &ex) {
+        std::cout << "Log initialization failed: " << ex.what() << std::endl;
+    } catch (const std::bad_alloc &e) {
+        VF_LOG_CRITICAL("Bad Alloc: {}", e.what());
+    } catch (const std::exception &e) {
+        VF_LOG_CRITICAL("exception: {}", e.what());
+    } catch (...) {
+        VF_LOG_CRITICAL("Unknown exception!");
+    }
+
+    return 0;
+}
diff --git a/apps/gpu/LBM/DrivenCavityUniform/configDrivenCavity.txt b/apps/gpu/LBM/DrivenCavityUniform/configDrivenCavity.txt
new file mode 100644
index 0000000000000000000000000000000000000000..458346a67c7f001580494af1dc9262034613be68
--- /dev/null
+++ b/apps/gpu/LBM/DrivenCavityUniform/configDrivenCavity.txt
@@ -0,0 +1,34 @@
+##################################################
+#GPU Mapping
+##################################################
+#Devices="0 1 2 3"
+#NumberOfDevices=4
+
+##################################################
+#information for writing
+##################################################
+#Path = "output/"
+#Prefix="DrivenCavity" 
+#WriteGrid=true
+##################################################
+#information for reading
+##################################################
+#GridPath="grid/"
+
+##################################################
+#number of grid levels
+##################################################
+#NOGL=1
+
+##################################################
+#LBM Version
+##################################################
+#D3Qxx=27
+#MainKernelName=CumulantAA2016CompSP27
+
+##################################################
+#simulation parameter
+##################################################
+#TimeEnd=100000
+#TimeOut=1000 
+#TimeStartOut=0
\ No newline at end of file
diff --git a/apps/gpu/LBM/MusselOyster/MusselOyster.cpp b/apps/gpu/LBM/MusselOyster/MusselOyster.cpp
index efac863fc9efd446e5f266648ad4fa74c954634f..dc5eaf58aff9b4a1b87d70c187b81461330ee3da 100644
--- a/apps/gpu/LBM/MusselOyster/MusselOyster.cpp
+++ b/apps/gpu/LBM/MusselOyster/MusselOyster.cpp
@@ -40,7 +40,6 @@
 
 //////////////////////////////////////////////////////////////////////////
 
-#include "VirtualFluids_GPU/BoundaryConditions/BoundaryConditionFactory.h"
 #include "VirtualFluids_GPU/Communication/Communicator.h"
 #include "VirtualFluids_GPU/DataStructureInitializer/GridProvider.h"
 #include "VirtualFluids_GPU/DataStructureInitializer/GridReaderFiles/GridReader.h"
@@ -49,7 +48,7 @@
 #include "VirtualFluids_GPU/LBM/Simulation.h"
 #include "VirtualFluids_GPU/Output/FileWriter.h"
 #include "VirtualFluids_GPU/Parameter/Parameter.h"
-#include "VirtualFluids_GPU/BoundaryConditions/BoundaryConditionFactory.h"
+#include "VirtualFluids_GPU/Factories/BoundaryConditionFactory.h"
 
 //////////////////////////////////////////////////////////////////////////
 
diff --git a/apps/gpu/LBM/MusselOyster/configPhoenix8GPU.txt b/apps/gpu/LBM/MusselOyster/configPhoenix8GPU.txt
index 4e2b0c91482b6a650ff28a210673cac097cb8c2d..2bf6955062da5c98f6a7b931c19821c52eaf15ea 100644
--- a/apps/gpu/LBM/MusselOyster/configPhoenix8GPU.txt
+++ b/apps/gpu/LBM/MusselOyster/configPhoenix8GPU.txt
@@ -7,14 +7,14 @@ NumberOfDevices=4
 ##################################################
 #informations for Writing
 ##################################################
-Path=/work/y0078217/Results/MusselOysterResults/8GPUOyster05/
+#Path=/work/y0078217/Results/MusselOysterResults/8GPUOyster05/
 #Path="F:/Work/Computations/out/MusselOyster/"
 #Prefix="MusselOyster" 
 #WriteGrid=true
 ##################################################
 #informations for reading
 ##################################################
-GridPath=/work/y0078217/Grids/GridMusselOyster/Oyster8GPU/
+#GridPath=/work/y0078217/Grids/GridMusselOyster/Oyster8GPU/
 #GridPath="C:"
 
 ##################################################
@@ -31,8 +31,8 @@ GridPath=/work/y0078217/Grids/GridMusselOyster/Oyster8GPU/
 ##################################################
 #simulation parameter
 ##################################################
-TimeEnd=400000 # 800000
-TimeOut=100000 # 400000
+TimeEnd=100000 # 800000
+TimeOut=10000 # 400000
 #TimeStartOut=0
 
 ##################################################
diff --git a/apps/gpu/LBM/TGV_3D/TGV_3D.cpp b/apps/gpu/LBM/TGV_3D/TGV_3D.cpp
index d8642c7b267bcad6c58ab2a9c178c2d9394ecf2a..7514c2b273bf60d6e2523f132911dde8839d296a 100644
--- a/apps/gpu/LBM/TGV_3D/TGV_3D.cpp
+++ b/apps/gpu/LBM/TGV_3D/TGV_3D.cpp
@@ -1,63 +1,95 @@
-//#define MPI_LOGGING
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file TGV_3D.cpp
+//! \ingroup Applications
+//! \author Martin Schoenherr
+//=======================================================================================
+#define _USE_MATH_DEFINES
+#include <exception>
+#include <filesystem>
+#include <fstream>
+#include <iostream>
+#include <math.h>
+#include <memory>
+#include <sstream>
+#include <stdexcept>
+#include <string>
 
-//Martin Branch
+#include "mpi.h"
 
-#include <mpi.h>
-#if defined( MPI_LOGGING )
-	#include <mpe.h>
-#endif
+//////////////////////////////////////////////////////////////////////////
 
-#include <string>
-#include <sstream>
-#include <iostream>
-#include <stdexcept>
-#include <fstream>
-#define _USE_MATH_DEFINES
-#include <math.h>
+#include "Core/DataTypes.h"
+#include "Core/LbmOrGks.h"
+#include "Core/Logger/Logger.h"
+#include "Core/VectorTypes.h"
+#include "PointerDefinitions.h"
 
-//#include "metis.h"
+//////////////////////////////////////////////////////////////////////////
 
-#include "basics/Core/LbmOrGks.h"
-#include "basics/Core/StringUtilities/StringUtil.h"
-#include <basics/config/ConfigurationFile.h>
+#include "GridGenerator/geometries/Conglomerate/Conglomerate.h"
+#include "GridGenerator/geometries/TriangularMesh/TriangularMesh.h"
+#include "GridGenerator/grid/BoundaryConditions/BoundaryCondition.h"
+#include "GridGenerator/grid/BoundaryConditions/Side.h"
+#include "GridGenerator/grid/GridBuilder/LevelGridBuilder.h"
+#include "GridGenerator/grid/GridBuilder/MultipleGridBuilder.h"
+#include "GridGenerator/grid/GridFactory.h"
+
+#include "GridGenerator/io/GridVTKWriter/GridVTKWriter.h"
+#include "GridGenerator/io/STLReaderWriter/STLReader.h"
+#include "GridGenerator/io/STLReaderWriter/STLWriter.h"
+#include "GridGenerator/io/SimulationFileWriter/SimulationFileWriter.h"
+
+//////////////////////////////////////////////////////////////////////////
 
-#include "VirtualFluids_GPU/LBM/Simulation.h"
 #include "VirtualFluids_GPU/Communication/Communicator.h"
-#include "VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.h"
 #include "VirtualFluids_GPU/DataStructureInitializer/GridProvider.h"
 #include "VirtualFluids_GPU/DataStructureInitializer/GridReaderFiles/GridReader.h"
-#include "VirtualFluids_GPU/Parameter/Parameter.h"
-#include "VirtualFluids_GPU/Output/FileWriter.h"
-
-#include "VirtualFluids_GPU/Kernel/Utilities/KernelFactory/KernelFactoryImp.h"
-#include "VirtualFluids_GPU/PreProcessor/PreProcessorFactory/PreProcessorFactoryImp.h"
+#include "VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.h"
 #include "VirtualFluids_GPU/Factories/BoundaryConditionFactory.h"
-
 #include "VirtualFluids_GPU/GPU/CudaMemoryManager.h"
+#include "VirtualFluids_GPU/LBM/Simulation.h"
+#include "VirtualFluids_GPU/Output/FileWriter.h"
+#include "VirtualFluids_GPU/Parameter/Parameter.h"
 
-#include "global.h"
-
-#include "geometries/Sphere/Sphere.h"
-#include "geometries/VerticalCylinder/VerticalCylinder.h"
-#include "geometries/Cuboid/Cuboid.h"
-#include "geometries/TriangularMesh/TriangularMesh.h"
-#include "geometries/Conglomerate/Conglomerate.h"
-#include "geometries/TriangularMesh/TriangularMeshStrategy.h"
-
-#include "grid/GridBuilder/LevelGridBuilder.h"
-#include "grid/GridBuilder/MultipleGridBuilder.h"
-#include "grid/BoundaryConditions/Side.h"
-#include "grid/BoundaryConditions/BoundaryCondition.h"
-#include "grid/GridFactory.h"
+#include <logger/Logger.h>
 
-#include "io/SimulationFileWriter/SimulationFileWriter.h"
-#include "io/GridVTKWriter/GridVTKWriter.h"
-#include "io/STLReaderWriter/STLReader.h"
-#include "io/STLReaderWriter/STLWriter.h"
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//          U s e r    s e t t i n g s
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-#include "utilities/math/Math.h"
-#include "utilities/communication.h"
-#include "utilities/transformator/TransformatorImp.h"
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // from https://stackoverflow.com/questions/865668/how-to-parse-command-line-arguments-in-c
@@ -94,8 +126,8 @@ bool useWale = false;
 
 std::string kernel( "CumulantK17Comp" );
 
-std::string path("F:/Work/Computations/out/TaylorGreen3DNew/"); //LEGOLAS
-//std::string path("E:/DrivenCavity/results/"); //TESLA03
+//std::string path("F:/Work/Computations/out/TaylorGreen3DNew/"); //LEGOLAS
+std::string path("D:/out/TGV_3D/"); //TESLA03
 
 std::string simulationName("TGV_3D");
 //////////////////////////////////////////////////////////////////////////
diff --git a/apps/gpu/LBM/TGV_3D_GridRef/CMakeLists.txt b/apps/gpu/LBM/TGV_3D_GridRef/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..efb4310669f9c0de7aa5cf3f1e4dffa00bd66cbf
--- /dev/null
+++ b/apps/gpu/LBM/TGV_3D_GridRef/CMakeLists.txt
@@ -0,0 +1,7 @@
+PROJECT(TGV_3D_GridRef LANGUAGES CUDA CXX)
+# Target built from the single source file; linked privately against basics, VirtualFluids_GPU and GridGenerator.
+vf_add_library(BUILDTYPE binary PRIVATE_LINK basics VirtualFluids_GPU GridGenerator FILES TGV_3D_GridRef.cpp)
+# Treat the .cpp source as CUDA code.
+set_source_files_properties(TGV_3D_GridRef.cpp PROPERTIES LANGUAGE CUDA)
+# Enable CUDA separable compilation for the target.
+set_target_properties(TGV_3D_GridRef PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
\ No newline at end of file
diff --git a/apps/gpu/LBM/TGV_3D_GridRef/TGV_3D_GridRef.cpp b/apps/gpu/LBM/TGV_3D_GridRef/TGV_3D_GridRef.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a88fee2e583a7cb227702ff19ada7daced1b1708
--- /dev/null
+++ b/apps/gpu/LBM/TGV_3D_GridRef/TGV_3D_GridRef.cpp
@@ -0,0 +1,399 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file TGV_3D_GridRef.cpp
+//! \ingroup Applications
+//! \author Martin Schoenherr
+//=======================================================================================
+#define _USE_MATH_DEFINES
+#include <exception>
+#include <filesystem>
+#include <fstream>
+#include <iostream>
+#include <math.h>
+#include <memory>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+
+#include "mpi.h"
+
+//////////////////////////////////////////////////////////////////////////
+
+#include "Core/DataTypes.h"
+#include "Core/LbmOrGks.h"
+#include "Core/Logger/Logger.h"
+#include "Core/VectorTypes.h"
+#include "PointerDefinitions.h"
+
+//////////////////////////////////////////////////////////////////////////
+
+#include "GridGenerator/geometries/Conglomerate/Conglomerate.h"
+#include "GridGenerator/geometries/TriangularMesh/TriangularMesh.h"
+#include "GridGenerator/grid/BoundaryConditions/BoundaryCondition.h"
+#include "GridGenerator/grid/BoundaryConditions/Side.h"
+#include "GridGenerator/grid/GridBuilder/LevelGridBuilder.h"
+#include "GridGenerator/grid/GridBuilder/MultipleGridBuilder.h"
+#include "GridGenerator/grid/GridFactory.h"
+
+#include "GridGenerator/io/GridVTKWriter/GridVTKWriter.h"
+#include "GridGenerator/io/STLReaderWriter/STLReader.h"
+#include "GridGenerator/io/STLReaderWriter/STLWriter.h"
+#include "GridGenerator/io/SimulationFileWriter/SimulationFileWriter.h"
+
+//////////////////////////////////////////////////////////////////////////
+
+#include "VirtualFluids_GPU/Communication/Communicator.h"
+#include "VirtualFluids_GPU/DataStructureInitializer/GridProvider.h"
+#include "VirtualFluids_GPU/DataStructureInitializer/GridReaderFiles/GridReader.h"
+#include "VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.h"
+#include "VirtualFluids_GPU/Factories/BoundaryConditionFactory.h"
+#include "VirtualFluids_GPU/Factories/GridScalingFactory.h"
+#include "VirtualFluids_GPU/GPU/CudaMemoryManager.h"
+#include "VirtualFluids_GPU/LBM/Simulation.h"
+#include "VirtualFluids_GPU/Output/FileWriter.h"
+#include "VirtualFluids_GPU/Parameter/Parameter.h"
+
+#include <logger/Logger.h>
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//          U s e r    s e t t i n g s
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// from https://stackoverflow.com/questions/865668/how-to-parse-command-line-arguments-in-c
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Returns the argument that directly follows `option` in [begin, end), or a
+// null pointer when `option` is absent or is the last argument.
+char* getCmdOption(char ** begin, char ** end, const std::string & option)
+{
+    char ** const match = std::find(begin, end, option);
+    if (match == end) return nullptr;
+    char ** const value = match + 1;
+    return (value == end) ? nullptr : *value;
+}
+
+// Reports whether `option` occurs verbatim among the arguments in [begin, end).
+bool cmdOptionExists(char** begin, char** end, const std::string& option) {
+    return !(std::find(begin, end, option) == end);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+//////////////////////////////////////////////////////////////////////////
+real Re =  1600.0; // Reynolds number: viscosity = nx / (2 * PI) * velocity / Re (option --Re)
+
+uint dtPerL = 500; // sets the lattice velocity: velocity = 64 / (dtPerL * 2 * PI) (option --dtPerL)
+
+uint nx = 64;      // coarse grid resolution: dx = 2 * PI / nx (option --nx)
+uint gpuIndex = 0; // device index passed to Parameter::setDevices (option --gpu)
+
+bool useLimiter = false; // false: quadric limiters are set to 1e6; true: "_Limiter" is appended to the output path (option --useLimiter)
+bool useWale = false;    // true: Parameter::setUseWale( true ) is called (option --useWale)
+
+std::string kernel( "CumulantK17CompChimRedesigned" ); // passed to Parameter::setMainKernel (option --kernel)
+
+std::string path("D:/out/TGV_3D/"); // output base path; kernel name and "SingleGPU" are appended in multipleLevel() (MOLLOK)
+
+std::string simulationName("TGV_3D_Gridref_noSqPress"); // output prefix; "_nx_<nx>_dtPerL_<dtPerL>_" is appended in multipleLevel()
+//////////////////////////////////////////////////////////////////////////
+// Runs the 3D Taylor-Green vortex on the periodic cube [-PI, PI]^3 with one refined cuboid (grid level 1); Parameter is initialised from the file at configPath.
+void multipleLevel(const std::string& configPath)
+{
+    logging::Logger::addStream(&std::cout);
+    logging::Logger::setDebugLevel(logging::Logger::Level::INFO_LOW);
+    logging::Logger::timeStamp(logging::Logger::ENABLE);
+    logging::Logger::enablePrintedRankNumbers(logging::Logger::ENABLE);
+
+    vf::gpu::Communicator& communicator = vf::gpu::Communicator::getInstance();
+
+    auto gridFactory = GridFactory::make();
+    //gridFactory->setTriangularMeshDiscretizationMethod(TriangularMeshDiscretizationMethod::RAYCASTING);
+    gridFactory->setTriangularMeshDiscretizationMethod(TriangularMeshDiscretizationMethod::POINT_IN_OBJECT);
+    //gridFactory->setTriangularMeshDiscretizationMethod(TriangularMeshDiscretizationMethod::POINT_UNDER_TRIANGLE);
+
+    auto gridBuilder = MultipleGridBuilder::makeShared(gridFactory);
+
+    vf::basics::ConfigurationFile config;
+    config.load(configPath);
+    SPtr<Parameter> para = std::make_shared<Parameter>(communicator.getNummberOfProcess(), communicator.getPID(), &config);
+    BoundaryConditionFactory bcFactory = BoundaryConditionFactory();
+    GridScalingFactory scalingFactory = GridScalingFactory();
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Flow parameters derived from the global settings nx, dtPerL and Re
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	const real PI = 3.141592653589793238462643383279;
+
+    real L = nx / ( 2.0 * PI ); // length scale; L / velocity is used below for the time step counts
+
+    const real velocity = 64.0 / ( dtPerL * 2.0 * PI ); // the 64 is hard-coded, so velocity does not change with nx
+
+    const real viscosity = nx / ( 2.0 * PI ) * velocity / Re; // i.e. L * velocity / Re
+
+    *logging::out << logging::Logger::INFO_HIGH << "velocity = " << velocity << " s\n"; // NOTE(review): the " s" suffix looks like a leftover unit - confirm
+
+    *logging::out << logging::Logger::INFO_HIGH << "viscosity = " << viscosity << "\n";
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	real dx = 2.0 * PI / real(nx);
+
+	gridBuilder->addCoarseGrid(-PI, -PI, -PI,
+								PI,  PI,  PI, dx);
+
+    gridBuilder->setNumberOfLayers(0, 0);
+
+    auto fineGrid = new Cuboid(-PI * 0.5, -PI * 0.5, -PI * 0.5, 
+                                     0.0,  PI * 0.5,       0.0);
+    // NOTE(review): fineGrid is a raw pointer that is never deleted here; presumably gridBuilder takes ownership - confirm
+    gridBuilder->addGrid(fineGrid, 1);
+
+	gridBuilder->setPeriodicBoundaryCondition(true, true, true);
+
+	gridBuilder->buildGrids(LBM, true); // buildGrids() has to be called before setting the BCs!!!!
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Output location and simulation parameters
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    scalingFactory.setScalingFactory(GridScalingFactory::GridScaling::ScaleCompressible);
+
+	//std::stringstream _path;
+ //   std::stringstream _prefix;
+
+ //   //_path << "F:/Work/Computations/TaylorGreenVortex_3D/TGV_LBM/" << nx << "_Re_1.6e4";
+ //   //_path << "F:/Work/Computations/TaylorGreenVortex_3D/TGV_LBM/" << nx << "_neqInit";
+ //   _path << "F:/Work/Computations/TaylorGreenVortex_3D/TGV_LBM/Re_1600/AA2016/" << nx << "_FD_O8";
+
+ //   //_path << "./results/AA2016/" << nx;
+ //   //_path << "./results/CumOne/" << nx;
+ //   //_path << "./results/F3_2018/" << nx;
+
+ //   _prefix << "TGV_3D_" << nx << "_" ;
+
+ //   para->setOutputPath(_path.str());
+ //   para->setOutputPrefix(_prefix.str());
+ //   para->setPathAndFilename(_path.str() + "/" + _prefix.str());
+
+    //////////////////////////////////////////////////////////////////////////
+    // Overwrite the global path with <path><kernel>SingleGPU[_Limiter]
+    {
+        std::stringstream _path;
+
+        _path << path;
+        _path << kernel;
+        _path << "SingleGPU";
+
+        if (useLimiter) _path << "_Limiter";
+
+        path = _path.str();
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    // Overwrite the global simulationName with <simulationName>_nx_<nx>_dtPerL_<dtPerL>_
+    {
+        std::stringstream _simulationName;
+
+        _simulationName << simulationName;
+        _simulationName << "_nx_" << nx;
+        _simulationName << "_dtPerL_" << dtPerL << "_";
+
+        simulationName = _simulationName.str();
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+
+    para->setDevices(std::vector<uint>{gpuIndex});
+
+    //////////////////////////////////////////////////////////////////////////
+
+    para->setOutputPath( path );
+    para->setOutputPrefix( simulationName );
+
+    para->setPrintFiles(true);
+    // End after 40 and write output every 5 multiples of lround(L / velocity) time steps
+    para->setTimestepEnd(40 * lround(L / velocity));
+    para->setTimestepOut(5 * lround(L / velocity));
+    //para->setTimestepOut(lround(L / velocity));
+ //   para->setTimestepEnd(2048);
+	//para->setTimestepOut(512);
+ //   para->setTimestepStartOut(500);
+
+    para->setVelocityLB( velocity );
+
+    para->setViscosityLB( viscosity );
+
+    para->setVelocityRatio( 1.0 / velocity );
+
+    para->setDensityRatio(1.0);
+
+    para->setInitialCondition( [&]( real coordX, real coordY, real coordZ, real& rho, real& vx, real& vy, real& vz){
+        // Taylor-Green vortex initial state; a, b, c scale the x, y, z coordinates inside sin/cos
+        real a = 1.0;
+        real b = 1.0;
+        real c = 1.0;
+
+        rho = 3.0 * ((velocity * velocity) / 16.0 * ( cos( 2.0 * a * coordX ) + cos( 2.0 * b * coordY ) ) * ( cos( 2.0 * c * coordZ ) + 2.0 ) );
+        vx  =  velocity * sin( a * coordX ) * cos( b * coordY ) * cos( c * coordZ );
+        vy  = -velocity * cos( a * coordX ) * sin( b * coordY ) * cos( c * coordZ );
+        vz  = 0.0;
+
+    } );
+
+    para->setMainKernel( kernel );
+    // Without useLimiter all three quadric limiters are set to 1e6 (presumably large enough to switch limiting off - confirm)
+    if( !useLimiter )
+        para->setQuadricLimiters( 1000000.0, 1000000.0, 1000000.0 );
+
+    if( useWale )
+        para->setUseWale( true );
+
+    para->setUseInitNeq( true );
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Create the memory manager and grid provider, then run the simulation
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    auto cudaMemoryManager = std::make_shared<CudaMemoryManager>(para);
+    SPtr<GridProvider> gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemoryManager, communicator);
+    //SPtr<GridProvider> gridGenerator = GridProvider::makeGridReader(FILEFORMAT::BINARY, para, cudaMemoryManager);
+
+    SPtr<FileWriter> fileWriter = SPtr<FileWriter>(new FileWriter()); // NOTE(review): not referenced again in this function
+    Simulation sim(para, cudaMemoryManager, communicator, *gridGenerator, &bcFactory, &scalingFactory);
+    sim.run();
+
+    //sim.addKineticEnergyAnalyzer( 10 );
+    //sim.addEnstrophyAnalyzer( 10 );
+
+    //sim.run();
+}
+
+
+//! Entry point: initialises MPI, applies the command line overrides (--Re, --nx, --dtPerL, --kernel,
+//! --gpu, --useLimiter, --useWale) to the user settings above and runs multipleLevel() with the
+//! config.txt located next to this source file. Always returns 0; errors are only reported.
+int main( int argc, char* argv[])
+{
+    MPI_Init(&argc, &argv);
+    std::string str, str2;
+    if ( argv != NULL )
+    {
+        try
+        {
+            // config.txt is read from the directory of this source file (compile-time __FILE__).
+            std::string targetPath( __FILE__ );
+
+#ifdef _WIN32
+            targetPath = targetPath.substr(0, targetPath.find_last_of('\\') + 1);
+#else
+            targetPath = targetPath.substr(0, targetPath.find_last_of('/') + 1);
+#endif
+
+            // Value following `option`, or nullptr if `option` is absent. An option given as the last
+            // argument has no value: getCmdOption() then returns a null pointer, which must not be
+            // passed to atof()/atoi() or assigned to a std::string, so it is reported instead.
+            bool argumentsValid = true;
+            auto optionValue = [&]( const std::string& option ) -> const char*
+            {
+                if( !cmdOptionExists( argv, argv+argc, option ) )
+                    return nullptr;
+                const char* value = getCmdOption( argv, argv+argc, option );
+                if( value == nullptr )
+                {
+                    std::cerr << "Missing value for command line option " << option << std::endl;
+                    argumentsValid = false;
+                }
+                return value;
+            };
+
+            if( const char* value = optionValue( "--Re" ) )
+                Re = atof( value );
+            if( const char* value = optionValue( "--nx" ) )
+                nx = atoi( value );
+            if( const char* value = optionValue( "--dtPerL" ) )
+                dtPerL = atoi( value );
+            if( const char* value = optionValue( "--kernel" ) )
+                kernel = value;
+            if( const char* value = optionValue( "--gpu" ) )
+                gpuIndex = atoi( value );
+
+            if( cmdOptionExists( argv, argv+argc, "--useLimiter" ) )
+                useLimiter = true;
+            if( cmdOptionExists( argv, argv+argc, "--useWale" ) )
+                useWale = true;
+
+            // Do not start a simulation with settings the user did not intend.
+            if( argumentsValid )
+                multipleLevel(targetPath + "config.txt");
+        }
+        catch (const std::bad_alloc& e)
+        {
+            *logging::out << logging::Logger::LOGGER_ERROR << "Bad Alloc:" << e.what() << "\n";
+        }
+        catch (const std::exception& e)
+        {
+            *logging::out << logging::Logger::LOGGER_ERROR << e.what() << "\n";
+        }
+        catch (...)
+        {
+            *logging::out << logging::Logger::LOGGER_ERROR << "Unknown exception!\n";
+        }
+    }
+
+   /*
+   MPE_Init_log() & MPE_Finish_log() are NOT needed when
+   liblmpe.a is linked with this program.  In that case,
+   MPI_Init() would have called MPE_Init_log() already.
+   */
+#if defined( MPI_LOGGING )
+   MPE_Init_log();
+#endif
+
+#if defined( MPI_LOGGING )
+   if ( argv != NULL )
+      MPE_Finish_log( argv[0] );
+   if ( str != "" )
+      MPE_Finish_log( str.c_str() );
+   else
+      MPE_Finish_log( "TestLog" );
+#endif
+
+   MPI_Finalize();
+   return 0;
+}
diff --git a/apps/gpu/LBM/TGV_3D_GridRef/config.txt b/apps/gpu/LBM/TGV_3D_GridRef/config.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ae6d3e9bc4be5403d151f3d59ffb13af7164abf0
--- /dev/null
+++ b/apps/gpu/LBM/TGV_3D_GridRef/config.txt
@@ -0,0 +1,36 @@
+##################################################
+#GPU Mapping
+##################################################
+#Devices="0 1 2 3"
+#NumberOfDevices=4
+
+##################################################
+#information for writing
+##################################################
+#Path="E:/DrivenCavity/results"
+#Path="F:/Work/Computations/out/DrivenCavity/"
+#Prefix="DrivenCavity" 
+#WriteGrid=true
+##################################################
+#information for reading
+##################################################
+#GridPath="E:/DrivenCavity/dummy"
+GridPath="F:/Work/Computations/out/TaylorGreen3DNew/grid"
+
+##################################################
+#number of grid levels
+##################################################
+NOGL=2
+
+##################################################
+#LBM Version
+##################################################
+#D3Qxx=27
+#MainKernelName=CumulantAA2016CompSP27
+
+##################################################
+#simulation parameter
+##################################################
+#TimeEnd=100000
+#TimeOut=1000 
+#TimeStartOut=0
\ No newline at end of file
diff --git a/apps/gpu/LBM/TGV_3D_MultiGPU/TGV_3D_MultiGPU.cpp b/apps/gpu/LBM/TGV_3D_MultiGPU/TGV_3D_MultiGPU.cpp
index 8c303dc07c911c363e892ce53f7bfe7f48e284d6..045c208274bc6bc216d25e8c2fa905916a52f87b 100644
--- a/apps/gpu/LBM/TGV_3D_MultiGPU/TGV_3D_MultiGPU.cpp
+++ b/apps/gpu/LBM/TGV_3D_MultiGPU/TGV_3D_MultiGPU.cpp
@@ -1,7 +1,38 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file TGV_3D_MultiGPU.cpp
+//! \ingroup TGV_3D_MultiGPU
+//! \author Martin Schoenherr
+//=======================================================================================
 //#define MPI_LOGGING
 
 //Martin Branch
-
 #include <mpi.h>
 #if defined( MPI_LOGGING )
 	#include <mpe.h>
@@ -97,7 +128,7 @@ bool useWale = false;
 int mpirank;
 int mpiWorldSize;
 
-std::string kernel( "CumulantK20Comp" );
+std::string kernel( "CumulantK17CompChim" );
 
 //std::string path("F:/Work/Computations/out/TaylorGreen3DNew/"); //LEGOLAS
 //std::string path("results/"); //PHOENIX
diff --git a/apps/gpu/LBM/WTG_RUB/WTG_RUB.cpp b/apps/gpu/LBM/WTG_RUB/WTG_RUB.cpp
index 06b3678d7c8ddd236c26a69686356fbe87c31db2..3e083afd690632dbaabdde5d00f2ab454d86032b 100644
--- a/apps/gpu/LBM/WTG_RUB/WTG_RUB.cpp
+++ b/apps/gpu/LBM/WTG_RUB/WTG_RUB.cpp
@@ -1,3 +1,35 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file WTG_RUB.cpp
+//! \ingroup Applications
+//! \author Martin Schoenherr
+//=======================================================================================
 #define _USE_MATH_DEFINES
 #include <math.h>
 #include <string>
@@ -15,15 +47,10 @@
 
 #include "Core/DataTypes.h"
 #include "PointerDefinitions.h"
-
 #include "Core/LbmOrGks.h"
-#include "Core/StringUtilities/StringUtil.h"
-
 #include "Core/VectorTypes.h"
 #include "Core/Logger/Logger.h"
 
-#include <basics/config/ConfigurationFile.h>
-
 //////////////////////////////////////////////////////////////////////////
 
 #include "GridGenerator/grid/GridBuilder/LevelGridBuilder.h"
diff --git a/metadata.xml b/metadata.xml
deleted file mode 100644
index 7cbae3ae7e1d5d7d48af2f0e5577253a89f953f5..0000000000000000000000000000000000000000
--- a/metadata.xml
+++ /dev/null
@@ -1,204 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<resource xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://datacite.org/schema/kernel-4" xsi:schemaLocation="http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4.3/metadata.xsd">
-	<identifier identifierType="DOI">PLACEHOLDER</identifier>
-	<titles>
-		<title xml:lang="en">VirtualFluids</title>
-	</titles>
-	<language>en</language>
-	<creators>
-		<creator>
-			<creatorName nameType="Personal">Krafczyk, Manfred</creatorName>
-			<givenName>Manfred</givenName>
-			<familyName>Krafczyk</familyName>
-			<nameIdentifier nameIdentifierScheme="ORCID">0000-0002-8509-0871</nameIdentifier>
-			<affiliation xml:lang="de">TU Braunschweig</affiliation>
-			<affiliation xml:lang="de">Institut für rechnergestützte Modellierung im Bauingenieurwesen</affiliation>
-		</creator>
-		<creator>
-			<creatorName nameType="Organizational">Institut für rechnergestützte Modellierung im Bauingenieurwesen</creatorName>
-			<affiliation xml:lang="de">TU Braunschweig</affiliation>
-		</creator>
-	</creators>
-	<publisher xml:lang="de">Institut für rechnergestützte Modellierung im Bauingenieurwesen</publisher>
-	<publicationYear>2021</publicationYear>
-	<resourceType resourceTypeGeneral="Software">Computational Fluid Dynamics Solver</resourceType>
-	<subjects>
-		<subject subjectScheme="DDC" schemeURI="https://www.oclc.org/en/dewey.html">532 Fluid Mechanics, liquid mechanics</subject>
-	</subjects>
-	<contributors>
-		<contributor contributorType="Researcher">
-			<contributorName>Ahrenholz, Benjamin</contributorName>
-			<givenName>Benjamin</givenName>
-			<familyName>Ahrenholz</familyName>
-		</contributor>
-
-		<contributor contributorType="Researcher">
-			<contributorName>Alihussein, Hussein</contributorName>
-			<givenName>Hussein</givenName>
-			<familyName>Alihussein</familyName>
-			<nameIdentifier nameIdentifierScheme="ORCID" schemeURI="http://orcid.org/">0000-0003-3656-7028</nameIdentifier>
-			<affiliation xml:lang="de">TU Braunschweig</affiliation>
-			<affiliation xml:lang="en">Institut für rechnergestützte Modellierung im Bauingenieurwesen</affiliation>
-		</contributor>
-
-		<contributor contributorType="Researcher">
-			<contributorName>Bindick, Sebastian</contributorName>
-			<givenName>Sebastian</givenName>
-			<familyName>Bindick</familyName>
-		</contributor>
-
-		<contributor contributorType="Researcher">
-			<contributorName>Brendel, Aileen</contributorName>
-			<givenName>Aileen</givenName>
-			<familyName>Brendel</familyName>
-		</contributor>
-
-		<contributor contributorType="Researcher">
-			<contributorName>Geier, Martin</contributorName>
-			<givenName>Martin</givenName>
-			<familyName>Geier</familyName>
-			<nameIdentifier nameIdentifierScheme="ORCID" schemeURI="http://orcid.org/">0000-0002-8367-9412</nameIdentifier>
-			<affiliation xml:lang="de">TU Braunschweig</affiliation>
-			<affiliation xml:lang="en">Institut für rechnergestützte Modellierung im Bauingenieurwesen</affiliation>
-		</contributor>
-
-		<contributor contributorType="Researcher">
-			<contributorName>Geller, Sebastian</contributorName>
-			<givenName>Sebastian</givenName>
-			<familyName>Geller</familyName>
-		</contributor>
-
-		<contributor contributorType="Researcher">
-			<contributorName>Goraki Fard, Ehsan</contributorName>
-			<givenName>Ehsan</givenName>
-			<familyName>Goraki Fard</familyName>
-		</contributor>
-
-		<contributor contributorType="Researcher">
-			<contributorName>Hegewald, Jan</contributorName>
-			<givenName>Jan</givenName>
-			<familyName>Hegewald</familyName>
-		</contributor>
-
-		<contributor contributorType="Researcher">
-			<contributorName>Janßen, Christian</contributorName>
-			<givenName>Christian</givenName>
-			<familyName>Janßen</familyName>
-		</contributor>
-
-		<contributor contributorType="Researcher">
-			<contributorName>Kutscher, Konstantin</contributorName>
-			<givenName>Konstantin</givenName>
-			<familyName>Kutscher</familyName>
-			<nameIdentifier nameIdentifierScheme="ORCID" schemeURI="http://orcid.org/">0000-0002-1099-1608</nameIdentifier>
-			<affiliation xml:lang="de">TU Braunschweig</affiliation>
-			<affiliation xml:lang="en">Institut für rechnergestützte Modellierung im Bauingenieurwesen</affiliation>
-		</contributor>
-
-		<contributor contributorType="Researcher">
-			<contributorName>Lenz, Stephan</contributorName>
-			<givenName>Stephan</givenName>
-			<familyName>Lenz</familyName>
-		</contributor>
-
-		<contributor contributorType="Researcher">
-			<contributorName>Linxweiler, Jan</contributorName>
-			<givenName>Jan</givenName>
-			<familyName>Linxweiler</familyName>
-			<nameIdentifier nameIdentifierScheme="ORCID" schemeURI="http://orcid.org/">0000-0002-2755-5087</nameIdentifier>
-			<affiliation xml:lang="de">TU Braunschweig</affiliation>
-			<affiliation xml:lang="en">Institut für rechnergestützte Modellierung im Bauingenieurwesen</affiliation>
-		</contributor>
-
-		<contributor contributorType="Researcher">
-			<contributorName>Lux, Lennard</contributorName>
-			<givenName>Lennard</givenName>
-			<familyName>Lux</familyName>
-		</contributor>
-
-		<contributor contributorType="Researcher">
-			<contributorName>Marcus, Sven</contributorName>
-			<givenName>Sven</givenName>
-			<familyName>Marcus</familyName>
-			<nameIdentifier nameIdentifierScheme="ORCID" schemeURI="http://orcid.org/">0000-0003-3689-2162</nameIdentifier>
-			<affiliation xml:lang="de">TU Braunschweig</affiliation>
-			<affiliation xml:lang="en">Universitätsbibliothek Braunschweig</affiliation>
-		</contributor>
-
-		<contributor contributorType="Researcher">
-			<contributorName>Peters, Sören</contributorName>
-			<givenName>Sören</givenName>
-			<familyName>Peters</familyName>
-			<affiliation xml:lang="de">TU Braunschweig</affiliation>
-			<affiliation xml:lang="en">Institut für rechnergestützte Modellierung im Bauingenieurwesen</affiliation>
-		</contributor>
-
-		<contributor contributorType="Researcher">
-			<contributorName>Safari, Hesameddin</contributorName>
-			<givenName>Hesameddin</givenName>
-			<familyName>Safari</familyName>
-		</contributor>
-
-		<contributor contributorType="Researcher">
-			<contributorName>Schönherr, Martin</contributorName>
-			<givenName>Martin</givenName>
-			<familyName>Schönherr</familyName>
-			<nameIdentifier nameIdentifierScheme="ORCID" schemeURI="http://orcid.org/">0000-0002-4774-1776</nameIdentifier>
-			<affiliation xml:lang="de">TU Braunschweig</affiliation>
-			<affiliation xml:lang="en">Institut für rechnergestützte Modellierung im Bauingenieurwesen</affiliation>
-		</contributor>
-
-		<contributor contributorType="Researcher">
-			<contributorName>Stiebler, Maik</contributorName>
-			<givenName>Maik</givenName>
-			<familyName>Stiebler</familyName>
-		</contributor>
-
-		<contributor contributorType="Researcher">
-			<contributorName>Textor, Sören</contributorName>
-			<givenName>Sören</givenName>
-			<familyName>Textor</familyName>
-		</contributor>
-
-		<contributor contributorType="Researcher">
-			<contributorName>Tölke, Jonas</contributorName>
-			<givenName>Jonas</givenName>
-			<familyName>Tölke</familyName>
-		</contributor>
-
-		<contributor contributorType="Researcher">
-			<contributorName>Uphoff, Sonja</contributorName>
-			<givenName>Sonja</givenName>
-			<familyName>Uphoff</familyName>
-		</contributor>
-
-		<contributor contributorType="Researcher">
-			<contributorName>Wellmann, Anna</contributorName>
-			<givenName>Anna</givenName>
-			<familyName>Wellmann</familyName>
-		</contributor>
-	</contributors>
-	<dates>
-		<date dateType="Created">2000</date>
-	</dates>
-	<formats>
-		<format>text/x-c</format>
-		<format>text/x-h</format>
-		<format>text/x-script.python</format>
-	</formats>
-	<relatedIdentifiers>
-		<relatedIdentifier relatedIdentifierType="URL" relationType="Requires" resourceTypeGeneral="Software">https://www.open-mpi.org/software/ompi/v4.1/</relatedIdentifier>
-		<relatedIdentifier relatedIdentifierType="URL" relationType="IsCompiledBy" resourceTypeGeneral="Software">https://cmake.org</relatedIdentifier>
-		<relatedIdentifier relatedIdentifierType="URL" relationType="IsCompiledBy" resourceTypeGeneral="Software">https://gcc.gnu.org</relatedIdentifier>
-		<relatedIdentifier relatedIdentifierType="URL" relationType="IsCompiledBy" resourceTypeGeneral="Software">https://clang.llvm.org</relatedIdentifier>
-		<relatedIdentifier relatedIdentifierType="URL" relationType="IsCompiledBy" resourceTypeGeneral="Software">https://visualstudio.microsoft.com/vs/features/cplusplus/</relatedIdentifier>
-	</relatedIdentifiers>
-	<rightsList>
-		<rights xml:lang="en" schemeURI="https://spdx.org/licenses/" rightsIdentifierScheme="SPDX" rightsIdentifier="GPL-3.0-only" rightsURI="https://www.gnu.org/licenses/gpl-3.0-standalone.html">GNU General Public License Version 3</rights>
-	</rightsList>
-	<descriptions>
-		<description descriptionType="Abstract">
-			VirtualFluids (VF) is a research code developed at the Institute for Computational Modeling in Civil Engineering (iRMB). The code is a Computational Fluid Dynamics (CFD) solver based on the Lattice Boltzmann Method (LBM) for turbulent, thermal, multiphase and multicomponent flow problems as well as for multi-field problems such as Fluid-Structure-interaction including distributed pre- and postprocessing capabilities for simulations with more than 100 billion degrees of freedom.
-		</description>
-	</descriptions>
-</resource>
diff --git a/pyproject.toml b/pyproject.toml
index 8fcb7926102d188b44d8c74084235b6f175edf80..257da6fd95d683081dbff865c864079eae9c675d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,2 +1,9 @@
 [build-system]
-requires = ["setuptools", "wheel", "scikit-build"]
\ No newline at end of file
+requires = [  # build-time requirements (PEP 518)
+    "setuptools>=42",
+    "scikit-build",
+    "cmake",
+    "ninja; platform_system!='Windows'"  # environment marker: ninja is not required on Windows
+]
+build-backend = "setup_builder"  # in-tree PEP 517 backend module, located through backend-path
+backend-path = ["utilities"]  # directory searched for the backend module named above
\ No newline at end of file
diff --git a/pythonbindings/CMakeLists.txt b/pythonbindings/CMakeLists.txt
index 5a84adef027fdfa2953e016693bb64570e48c1ef..815a4b59cf6c3e4e5ac4a7a72a5bd4e374d64c96 100644
--- a/pythonbindings/CMakeLists.txt
+++ b/pythonbindings/CMakeLists.txt
@@ -1,24 +1,45 @@
-project(VirtualFluidsPython LANGUAGES CUDA CXX)
+set(PYFLUIDS_LANGUAGES CXX)
+# CUDA is only requested as a project language when the GPU backend is built.
+if(BUILD_VF_GPU)
+    set(PYFLUIDS_LANGUAGES CUDA CXX)
+endif()
+
+project(VirtualFluidsPython LANGUAGES ${PYFLUIDS_LANGUAGES})
+# NOTE(review): presumably src/VirtualFluids.cpp branches on the VF_*_PYTHONBINDINGS definitions set below - confirm.
+pybind11_add_module(python_bindings MODULE src/VirtualFluids.cpp)
+# Write the module into the source tree as pythonbindings/pyfluids/bindings<suffix>. NOTE(review): multi-config generators append a per-config subdirectory unless a generator expression is used.
+set_target_properties(  python_bindings PROPERTIES
+                        LIBRARY_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/pythonbindings/pyfluids
+                        OUTPUT_NAME "bindings")
+# Libraries linked independently of the BUILD_VF_GPU / BUILD_VF_CPU switches.
+target_link_libraries(python_bindings PRIVATE basics logger mpi)
+
 IF(BUILD_VF_GPU)
-    pybind11_add_module(pyfluids src/VirtualFluidsModulesGPU.cpp)
-    set_source_files_properties(src/VirtualFluidsModulesGPU.cpp PROPERTIES LANGUAGE CUDA)
+    set_source_files_properties(src/VirtualFluids.cpp PROPERTIES LANGUAGE CUDA)  # compile the binding source as CUDA
 
-    target_link_libraries(pyfluids PRIVATE GridGenerator VirtualFluids_GPU basics lbmCuda logger)
-    target_include_directories(pyfluids PRIVATE ${VF_THIRD_DIR}/cuda_samples/)
+    target_include_directories(python_bindings PRIVATE ${VF_THIRD_DIR}/cuda_samples/)
+    target_compile_definitions(python_bindings PRIVATE VF_GPU_PYTHONBINDINGS)
 
+    target_link_libraries(python_bindings PRIVATE GridGenerator VirtualFluids_GPU lbm)
 ENDIF()
+
 IF(BUILD_VF_CPU)
-    pybind11_add_module(pyfluids src/VirtualFluidsModulesCPU.cpp)
-    pybind11_add_module(pymuparser src/muParser.cpp)
+    target_compile_definitions(python_bindings PRIVATE VF_METIS VF_MPI VF_CPU_PYTHONBINDINGS)
+    target_link_libraries(python_bindings PRIVATE simulationconfig VirtualFluidsCore muparser lbm)
+
+    # include bindings for muparsers
+    pybind11_add_module(pymuparser MODULE src/muParser.cpp)
 
     # TODO: Move this to MuParser CMakeLists.txt
     set_target_properties(muparser PROPERTIES POSITION_INDEPENDENT_CODE ON)
 
-    target_compile_definitions(pyfluids PRIVATE VF_METIS VF_MPI)
+    set_target_properties(  pymuparser PROPERTIES
+                            LIBRARY_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/pythonbindings/pymuparser
+                            OUTPUT_NAME "bindings")  # written as pythonbindings/pymuparser/bindings<suffix>
     target_compile_definitions(pymuparser PRIVATE VF_METIS VF_MPI)
-
-    target_link_libraries(pyfluids PRIVATE simulationconfig VirtualFluidsCore muparser basics)
     target_link_libraries(pymuparser PRIVATE muparser)
 ENDIF()
-target_include_directories(pyfluids PRIVATE ${CMAKE_SOURCE_DIR}/src/)
-target_include_directories(pyfluids PRIVATE ${CMAKE_BINARY_DIR})
\ No newline at end of file
+
+# Include paths applied to python_bindings regardless of the enabled backend.
+target_include_directories(python_bindings PRIVATE ${CMAKE_SOURCE_DIR}/src/)
+target_include_directories(python_bindings PRIVATE ${CMAKE_BINARY_DIR})
\ No newline at end of file
diff --git a/Python/boundary_layer/__init__.py b/pythonbindings/pyfluids-stubs/__init__.pyi
similarity index 100%
rename from Python/boundary_layer/__init__.py
rename to pythonbindings/pyfluids-stubs/__init__.pyi
diff --git a/pythonbindings/pyfluids-stubs/bindings/__init__.pyi b/pythonbindings/pyfluids-stubs/bindings/__init__.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..4e7f353eab97cc536f8f18e72319af1cd7a1916a
--- /dev/null
+++ b/pythonbindings/pyfluids-stubs/bindings/__init__.pyi
@@ -0,0 +1,38 @@
+r"""
+=======================================================================================
+ ____          ____    __    ______     __________   __      __       __        __
+ \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+      \    \  |    |   ________________________________________________________________
+       \    \ |    |  |  ______________________________________________________________|
+        \    \|    |  |  |         __          __     __     __     ______      _______
+         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+
+  This file is part of VirtualFluids. VirtualFluids is free software: you can
+  redistribute it and/or modify it under the terms of the GNU General Public
+  License as published by the Free Software Foundation, either version 3 of
+  the License, or (at your option) any later version.
+
+  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+  for more details.
+
+  You should have received a copy of the GNU General Public License along
+  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+
+! \file __init__.pyi
+! \ingroup bindings
+! \author Henry Korb
+=======================================================================================
+"""
+class ostream_redirect:  # context manager (defines __enter__/__exit__); presumably pybind11's C++ stream redirect - confirm
+    def __init__(self, stdout: bool = ..., stderr: bool = ...) -> None: ...  # both flags are optional booleans
+    def __enter__(self) -> None: ...  # returns None, so "with ... as x" binds None
+    def __exit__(self, *args) -> None: ...
diff --git a/pythonbindings/pyfluids-stubs/bindings/basics/__init__.pyi b/pythonbindings/pyfluids-stubs/bindings/basics/__init__.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..a41b7934ca706dc0db5bd6188fee3150456e0cd9
--- /dev/null
+++ b/pythonbindings/pyfluids-stubs/bindings/basics/__init__.pyi
@@ -0,0 +1,82 @@
+r"""
+=======================================================================================
+ ____          ____    __    ______     __________   __      __       __        __
+ \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+      \    \  |    |   ________________________________________________________________
+       \    \ |    |  |  ______________________________________________________________|
+        \    \|    |  |  |         __          __     __     __     ______      _______
+         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+
+  This file is part of VirtualFluids. VirtualFluids is free software: you can
+  redistribute it and/or modify it under the terms of the GNU General Public
+  License as published by the Free Software Foundation, either version 3 of
+  the License, or (at your option) any later version.
+
+  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+  for more details.
+
+  You should have received a copy of the GNU General Public License along
+  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+
+! \file __init__.pyi
+! \ingroup basics
+! \author Henry Korb
+=======================================================================================
+"""
+from typing import ClassVar
+
+from typing import overload
+
+class ConfigurationFile:  # typed key lookup; every get_*_value has an overload that also takes a default_value
+    def __init__(self) -> None: ...
+    def contains(self, key: str) -> bool: ...
+    @overload
+    def get_bool_value(self, key: str) -> bool: ...
+    @overload
+    def get_bool_value(self, key: str, default_value: bool) -> bool: ...
+    @overload
+    def get_double_value(self, key: str) -> float: ...  # double and float variants both map to Python float
+    @overload
+    def get_double_value(self, key: str, default_value: float) -> float: ...
+    @overload
+    def get_float_value(self, key: str) -> float: ...
+    @overload
+    def get_float_value(self, key: str, default_value: float) -> float: ...
+    @overload
+    def get_int_value(self, key: str) -> int: ...
+    @overload
+    def get_int_value(self, key: str, default_value: int) -> int: ...
+    @overload
+    def get_string_value(self, key: str) -> str: ...
+    @overload
+    def get_string_value(self, key: str, default_value: str) -> str: ...
+    @overload
+    def get_uint_value(self, key: str) -> int: ...  # unsigned variant is typed as plain int here
+    @overload
+    def get_uint_value(self, key: str, default_value: int) -> int: ...
+    def load(self, file: str) -> bool: ...  # returns a bool; presumably the load success - confirm
+
+class LbmOrGks:  # enum-like binding: members GKS and LBM are class-level instances, constructible from int
+    __members__: ClassVar[dict] = ...  # read-only
+    GKS: ClassVar[LbmOrGks] = ...
+    LBM: ClassVar[LbmOrGks] = ...
+    __entries: ClassVar[dict] = ...
+    def __init__(self, arg0: int) -> None: ...
+    def __eq__(self, arg0: object) -> bool: ...
+    def __getstate__(self) -> int: ...  # state is a plain int, mirrored by __setstate__
+    def __hash__(self) -> int: ...
+    def __index__(self) -> int: ...
+    def __int__(self) -> int: ...
+    def __ne__(self, arg0: object) -> bool: ...
+    def __setstate__(self, arg0: int) -> None: ...
+    @property
+    def name(self) -> str: ...  # read-only string property
diff --git a/pythonbindings/pyfluids-stubs/bindings/basics/logger.pyi b/pythonbindings/pyfluids-stubs/bindings/basics/logger.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..43938ff7646efd3c596ae29971cce39fed865fa6
--- /dev/null
+++ b/pythonbindings/pyfluids-stubs/bindings/basics/logger.pyi
@@ -0,0 +1,83 @@
+r"""
+=======================================================================================
+ ____          ____    __    ______     __________   __      __       __        __
+ \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+      \    \  |    |   ________________________________________________________________
+       \    \ |    |  |  ______________________________________________________________|
+        \    \|    |  |  |         __          __     __     __     ______      _______
+         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+
+  This file is part of VirtualFluids. VirtualFluids is free software: you can
+  redistribute it and/or modify it under the terms of the GNU General Public
+  License as published by the Free Software Foundation, either version 3 of
+  the License, or (at your option) any later version.
+
+  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+  for more details.
+
+  You should have received a copy of the GNU General Public License along
+  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+
+! \file logger.pyi
+! \ingroup basics
+! \author Henry Korb
+=======================================================================================
+"""
+from typing import Any, ClassVar
+
+log: None
+
+class Level:  # enum-like binding with five members, constructible from int
+    __members__: ClassVar[dict] = ...  # read-only
+    INFO_HIGH: ClassVar[Level] = ...
+    INFO_INTERMEDIATE: ClassVar[Level] = ...
+    INFO_LOW: ClassVar[Level] = ...
+    LOGGER_ERROR: ClassVar[Level] = ...
+    WARNING: ClassVar[Level] = ...
+    __entries: ClassVar[dict] = ...
+    def __init__(self, arg0: int) -> None: ...
+    def __eq__(self, arg0: object) -> bool: ...
+    def __getstate__(self) -> int: ...  # state is a plain int, mirrored by __setstate__
+    def __hash__(self) -> int: ...
+    def __index__(self) -> int: ...
+    def __int__(self) -> int: ...
+    def __ne__(self, arg0: object) -> bool: ...
+    def __setstate__(self, arg0: int) -> None: ...
+    @property
+    def name(self) -> str: ...  # read-only string property
+
+class Logger:  # all documented functionality is exposed through static methods
+    def __init__(self, *args, **kwargs) -> None: ...  # catch-all signature; presumably not meant to be instantiated - confirm
+    @staticmethod
+    def add_stdout() -> None: ...
+    @staticmethod
+    def enable_printed_rank_numbers(print: bool) -> None: ...  # parameter name shadows the builtin print inside the stub only
+    @staticmethod
+    def set_debug_level(level: int) -> None: ...  # takes a plain int, not a Level member
+    @staticmethod
+    def time_stamp(time_stemp: TimeStamp) -> None: ...  # NOTE(review): "time_stemp" looks like a typo; keep in sync with the binding's keyword name
+
+class TimeStamp:  # enum-like binding: members DISABLE and ENABLE, accepted by Logger.time_stamp
+    __members__: ClassVar[dict] = ...  # read-only
+    DISABLE: ClassVar[TimeStamp] = ...
+    ENABLE: ClassVar[TimeStamp] = ...
+    __entries: ClassVar[dict] = ...
+    def __init__(self, arg0: int) -> None: ...
+    def __eq__(self, arg0: object) -> bool: ...
+    def __getstate__(self) -> int: ...  # state is a plain int, mirrored by __setstate__
+    def __hash__(self) -> int: ...
+    def __index__(self) -> int: ...
+    def __int__(self) -> int: ...
+    def __ne__(self, arg0: object) -> bool: ...
+    def __setstate__(self, arg0: int) -> None: ...
+    @property
+    def name(self) -> str: ...  # read-only string property
diff --git a/pythonbindings/pyfluids-stubs/bindings/gpu/__init__.pyi b/pythonbindings/pyfluids-stubs/bindings/gpu/__init__.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..36c2fea76713e980bb95eb6726d778de8c9a6583
--- /dev/null
+++ b/pythonbindings/pyfluids-stubs/bindings/gpu/__init__.pyi
@@ -0,0 +1,436 @@
+r"""
+=======================================================================================
+ ____          ____    __    ______     __________   __      __       __        __
+ \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+      \    \  |    |   ________________________________________________________________
+       \    \ |    |  |  ______________________________________________________________|
+        \    \|    |  |  |         __          __     __     __     ______      _______
+         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+
+  This file is part of VirtualFluids. VirtualFluids is free software: you can
+  redistribute it and/or modify it under the terms of the GNU General Public
+  License as published by the Free Software Foundation, either version 3 of
+  the License, or (at your option) any later version.
+
+  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+  for more details.
+
+  You should have received a copy of the GNU General Public License along
+  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+
+! \file __init__.pyi
+! \ingroup gpu
+! \author Henry Korb
+=======================================================================================
+"""
+from typing import Any, Callable, ClassVar, List, Optional
+
+from typing import overload
+import numpy
+import pyfluids.bindings.basics
+import pyfluids.bindings.gpu.grid_generator as grid_generator
+
+class ActuatorFarm(PreCollisionInteractor):  # getters come in get_all_* and per-turbine get_turbine_* flavours
+    def __init__(self, number_of_blades_per_turbine: int, density: float, number_of_nodes_per_blade: int, epsilon: float, level: int, delta_t: float, delta_x: float, use_host_arrays: bool) -> None: ...
+    def add_turbine(self, posX: float, posY: float, posZ: float, diameter: float, omega: float, azimuth: float, yaw: float, bladeRadii: List[float]) -> None: ...
+    def calc_blade_forces(self) -> None: ...
+    def get_all_azimuths(self) -> numpy.ndarray[numpy.float32]: ...  # host-side getters return float32 numpy arrays
+    def get_all_blade_coords_x(self) -> numpy.ndarray[numpy.float32]: ...
+    def get_all_blade_coords_x_device(self) -> int: ...  # *_device getters return an int; presumably a raw device pointer - confirm
+    def get_all_blade_coords_y(self) -> numpy.ndarray[numpy.float32]: ...
+    def get_all_blade_coords_y_device(self) -> int: ...
+    def get_all_blade_coords_z(self) -> numpy.ndarray[numpy.float32]: ...
+    def get_all_blade_coords_z_device(self) -> int: ...
+    def get_all_blade_forces_x(self) -> numpy.ndarray[numpy.float32]: ...
+    def get_all_blade_forces_x_device(self) -> int: ...
+    def get_all_blade_forces_y(self) -> numpy.ndarray[numpy.float32]: ...
+    def get_all_blade_forces_y_device(self) -> int: ...
+    def get_all_blade_forces_z(self) -> numpy.ndarray[numpy.float32]: ...
+    def get_all_blade_forces_z_device(self) -> int: ...
+    def get_all_blade_radii(self) -> numpy.ndarray[numpy.float32]: ...
+    def get_all_blade_radii_device(self) -> int: ...
+    def get_all_blade_velocities_x(self) -> numpy.ndarray[numpy.float32]: ...
+    def get_all_blade_velocities_x_device(self) -> int: ...
+    def get_all_blade_velocities_y(self) -> numpy.ndarray[numpy.float32]: ...
+    def get_all_blade_velocities_y_device(self) -> int: ...
+    def get_all_blade_velocities_z(self) -> numpy.ndarray[numpy.float32]: ...
+    def get_all_blade_velocities_z_device(self) -> int: ...
+    def get_all_omegas(self) -> numpy.ndarray[numpy.float32]: ...
+    def get_all_turbine_pos_x(self) -> numpy.ndarray[numpy.float32]: ...
+    def get_all_turbine_pos_y(self) -> numpy.ndarray[numpy.float32]: ...
+    def get_all_turbine_pos_z(self) -> numpy.ndarray[numpy.float32]: ...
+    def get_all_yaws(self) -> numpy.ndarray[numpy.float32]: ...
+    def get_turbine_azimuth(self, turbine: int) -> float: ...  # per-turbine accessors take the turbine as an int
+    def get_turbine_blade_coords_x(self, turbine: int) -> numpy.ndarray[numpy.float32]: ...
+    def get_turbine_blade_coords_x_device(self, turbine: int) -> int: ...
+    def get_turbine_blade_coords_y(self, turbine: int) -> numpy.ndarray[numpy.float32]: ...
+    def get_turbine_blade_coords_y_device(self, turbine: int) -> int: ...
+    def get_turbine_blade_coords_z(self, turbine: int) -> numpy.ndarray[numpy.float32]: ...
+    def get_turbine_blade_coords_z_device(self, turbine: int) -> int: ...
+    def get_turbine_blade_forces_x(self, turbine: int) -> numpy.ndarray[numpy.float32]: ...
+    def get_turbine_blade_forces_x_device(self, turbine: int) -> int: ...
+    def get_turbine_blade_forces_y(self, turbine: int) -> numpy.ndarray[numpy.float32]: ...
+    def get_turbine_blade_forces_y_device(self, turbine: int) -> int: ...
+    def get_turbine_blade_forces_z(self, turbine: int) -> numpy.ndarray[numpy.float32]: ...
+    def get_turbine_blade_forces_z_device(self, turbine: int) -> int: ...
+    def get_turbine_blade_radii(self, turbine: int) -> numpy.ndarray[numpy.float32]: ...
+    def get_turbine_blade_radii_device(self, turbine: int) -> int: ...
+    def get_turbine_blade_velocities_x(self, turbine: int) -> numpy.ndarray[numpy.float32]: ...
+    def get_turbine_blade_velocities_x_device(self, turbine: int) -> int: ...
+    def get_turbine_blade_velocities_y(self, turbine: int) -> numpy.ndarray[numpy.float32]: ...
+    def get_turbine_blade_velocities_y_device(self, turbine: int) -> int: ...
+    def get_turbine_blade_velocities_z(self, turbine: int) -> numpy.ndarray[numpy.float32]: ...
+    def get_turbine_blade_velocities_z_device(self, turbine: int) -> int: ...
+    def get_turbine_omega(self, turbine: int) -> float: ...
+    def get_turbine_pos(self, turbine: int) -> numpy.ndarray[numpy.float32]: ...  # single array, unlike the per-axis get_all_turbine_pos_x/y/z
+    def get_turbine_yaw(self, turbine: int) -> float: ...
+    def set_all_azimuths(self, azimuths: numpy.ndarray[numpy.float32]) -> None: ...
+    def set_all_blade_coords(self, blade_coords_x: numpy.ndarray[numpy.float32], blade_coords_y: numpy.ndarray[numpy.float32], blade_coords_z: numpy.ndarray[numpy.float32]) -> None: ...  # setters take x, y and z arrays together
+    def set_all_blade_forces(self, blade_forces_x: numpy.ndarray[numpy.float32], blade_forces_y: numpy.ndarray[numpy.float32], blade_forces_z: numpy.ndarray[numpy.float32]) -> None: ...
+    def set_all_blade_velocities(self, blade_velocities_x: numpy.ndarray[numpy.float32], blade_velocities_y: numpy.ndarray[numpy.float32], blade_velocities_z: numpy.ndarray[numpy.float32]) -> None: ...
+    def set_all_omegas(self, omegas: numpy.ndarray[numpy.float32]) -> None: ...
+    def set_all_yaws(self, yaws: numpy.ndarray[numpy.float32]) -> None: ...
+    def set_turbine_azimuth(self, turbine: int, azimuth: float) -> None: ...
+    def set_turbine_blade_coords(self, turbine: int, blade_coords_x: numpy.ndarray[numpy.float32], blade_coords_y: numpy.ndarray[numpy.float32], blade_coords_z: numpy.ndarray[numpy.float32]) -> None: ...
+    def set_turbine_blade_forces(self, turbine: int, blade_forces_x: numpy.ndarray[numpy.float32], blade_forces_y: numpy.ndarray[numpy.float32], blade_forces_z: numpy.ndarray[numpy.float32]) -> None: ...
+    def set_turbine_blade_velocities(self, turbine: int, blade_velocities_x: numpy.ndarray[numpy.float32], blade_velocities_y: numpy.ndarray[numpy.float32], blade_velocities_z: numpy.ndarray[numpy.float32]) -> None: ...
+    def set_turbine_omega(self, turbine: int, omega: float) -> None: ...
+    def set_turbine_yaw(self, turbine: int, yaw: float) -> None: ...
+    @property
+    def delta_t(self) -> float: ...  # the properties below are read-only (no setters declared)
+    @property
+    def delta_x(self) -> float: ...
+    @property
+    def density(self) -> float: ...
+    @property
+    def number_of_blades_per_turbine(self) -> int: ...
+    @property
+    def number_of_indices(self) -> int: ...
+    @property
+    def number_of_nodes(self) -> int: ...
+    @property
+    def number_of_nodes_per_blade(self) -> int: ...
+    @property
+    def number_of_turbines(self) -> int: ...
+
+class BoundaryConditionFactory:  # one setter per boundary-condition kind; arguments are left untyped in this stub
+    def __init__(self) -> None: ...
+    def set_geometry_boundary_condition(self, boundary_condition_type) -> None: ...
+    def set_no_slip_boundary_condition(self, boundary_condition_type) -> None: ...  # presumably expects a NoSlipBC member - confirm
+    def set_precursor_boundary_condition(self, boundary_condition_type) -> None: ...  # presumably expects a PrecursorBC member - confirm
+    def set_pressure_boundary_condition(self, boundary_condition_type) -> None: ...  # presumably expects a PressureBC member - confirm
+    def set_slip_boundary_condition(self, boundary_condition_type) -> None: ...
+    def set_stress_boundary_condition(self, boundary_condition_type) -> None: ...
+    def set_velocity_boundary_condition(self, boundary_condition_type) -> None: ...
+
+class Communicator:  # instances are obtained through the static get_instance()
+    def __init__(self, *args, **kwargs) -> None: ...  # catch-all signature; presumably not meant to be called directly - confirm
+    @staticmethod
+    def get_instance() -> Communicator: ...
+    def get_number_of_process(self) -> int: ...
+    def get_pid(self) -> int: ...
+
+class CudaMemoryManager:  # exposes only its constructor; passed on to GridProvider.make_grid_generator and Simulation
+    def __init__(self, parameter: Parameter) -> None: ...
+
+class FileType:  # enum-like binding with the single member VTK, constructible from int
+    __members__: ClassVar[dict] = ...  # read-only
+    VTK: ClassVar[FileType] = ...
+    __entries: ClassVar[dict] = ...
+    def __init__(self, arg0: int) -> None: ...
+    def __eq__(self, arg0: object) -> bool: ...
+    def __getstate__(self) -> int: ...  # state is a plain int, mirrored by __setstate__
+    def __hash__(self) -> int: ...
+    def __index__(self) -> int: ...
+    def __int__(self) -> int: ...
+    def __ne__(self, arg0: object) -> bool: ...
+    def __setstate__(self, arg0: int) -> None: ...
+    @property
+    def name(self) -> str: ...  # read-only string property
+
+class GridProvider:  # instances are created through the static make_grid_generator factory
+    def __init__(self, *args, **kwargs) -> None: ...  # catch-all signature; presumably not meant to be called directly - confirm
+    @staticmethod
+    def make_grid_generator(builder: grid_generator.GridBuilder, para: Parameter, cuda_memory_manager: CudaMemoryManager, communicator: Communicator) -> GridProvider: ...
+
+class GridScaling:  # enum-like binding with three members, constructible from int
+    __members__: ClassVar[dict] = ...  # read-only
+    NotSpecified: ClassVar[GridScaling] = ...
+    ScaleCompressible: ClassVar[GridScaling] = ...
+    ScaleRhoSq: ClassVar[GridScaling] = ...
+    __entries: ClassVar[dict] = ...
+    def __init__(self, arg0: int) -> None: ...
+    def __eq__(self, arg0: object) -> bool: ...
+    def __getstate__(self) -> int: ...  # state is a plain int, mirrored by __setstate__
+    def __hash__(self) -> int: ...
+    def __index__(self) -> int: ...
+    def __int__(self) -> int: ...
+    def __ne__(self, arg0: object) -> bool: ...
+    def __setstate__(self, arg0: int) -> None: ...
+    @property
+    def name(self) -> str: ...  # read-only string property
+
+class GridScalingFactory:  # default-constructed, then configured through set_scaling_factory
+    def __init__(self) -> None: ...
+    def set_scaling_factory(self, scaling_type) -> None: ...  # untyped here; presumably expects a GridScaling member - confirm
+
+class NoSlipBC:  # enum-like binding with five members, constructible from int
+    __members__: ClassVar[dict] = ...  # read-only
+    NoSlip3rdMomentsCompressible: ClassVar[NoSlipBC] = ...
+    NoSlipBounceBack: ClassVar[NoSlipBC] = ...
+    NoSlipCompressible: ClassVar[NoSlipBC] = ...
+    NoSlipImplicitBounceBack: ClassVar[NoSlipBC] = ...
+    NoSlipIncompressible: ClassVar[NoSlipBC] = ...
+    __entries: ClassVar[dict] = ...
+    def __init__(self, arg0: int) -> None: ...
+    def __eq__(self, arg0: object) -> bool: ...
+    def __getstate__(self) -> int: ...  # state is a plain int, mirrored by __setstate__
+    def __hash__(self) -> int: ...
+    def __index__(self) -> int: ...
+    def __int__(self) -> int: ...
+    def __ne__(self, arg0: object) -> bool: ...
+    def __setstate__(self, arg0: int) -> None: ...
+    @property
+    def name(self) -> str: ...  # read-only string property
+
+class OutputVariable:  # enum-like binding (Distributions, Velocities); type of PrecursorWriter's output_variable argument
+    __members__: ClassVar[dict] = ...  # read-only
+    Distributions: ClassVar[OutputVariable] = ...
+    Velocities: ClassVar[OutputVariable] = ...
+    __entries: ClassVar[dict] = ...
+    def __init__(self, arg0: int) -> None: ...
+    def __eq__(self, arg0: object) -> bool: ...
+    def __getstate__(self) -> int: ...  # state is a plain int, mirrored by __setstate__
+    def __hash__(self) -> int: ...
+    def __index__(self) -> int: ...
+    def __int__(self) -> int: ...
+    def __ne__(self, arg0: object) -> bool: ...
+    def __setstate__(self, arg0: int) -> None: ...
+    @property
+    def name(self) -> str: ...  # read-only string property
+
+class Parameter:  # three constructor overloads: (number_of_processes, my_ID[, config_data]) or (config_data)
+    @overload
+    def __init__(self, number_of_processes: int, my_ID: int, config_data: Optional[pyfluids.bindings.basics.ConfigurationFile]) -> None: ...  # config_data may be None in this overload
+    @overload
+    def __init__(self, number_of_processes: int, my_ID: int) -> None: ...
+    @overload
+    def __init__(self, config_data: pyfluids.bindings.basics.ConfigurationFile) -> None: ...
+    def add_actuator(self, actuator: PreCollisionInteractor) -> None: ...  # actuators and probes share the PreCollisionInteractor type
+    def add_probe(self, probe: PreCollisionInteractor) -> None: ...
+    def get_SGS_constant(self) -> float: ...
+    def get_density_ratio(self) -> float: ...
+    def get_force_ratio(self) -> float: ...
+    def get_is_body_force(self) -> bool: ...
+    def get_output_path(self) -> str: ...
+    def get_output_prefix(self) -> str: ...
+    def get_velocity(self) -> float: ...
+    def get_velocity_ratio(self) -> float: ...
+    def get_viscosity(self) -> float: ...
+    def get_viscosity_ratio(self) -> float: ...
+    def set_AD_kernel(self, ad_kernel: str) -> None: ...  # kernels are selected by string name
+    def set_calc_turbulence_intensity(self, calc_velocity_and_fluctuations: bool) -> None: ...
+    def set_comp_on(self, is_comp: bool) -> None: ...
+    def set_density_ratio(self, density_ratio: float) -> None: ...
+    def set_devices(self, devices: List[int]) -> None: ...
+    def set_diff_on(self, is_diff: bool) -> None: ...
+    def set_forcing(self, forcing_x: float, forcing_y: float, forcing_z: float) -> None: ...
+    def set_has_wall_model_monitor(self, has_wall_monitor: bool) -> None: ...
+    def set_initial_condition(self, init_func: Callable[[float,float,float],List[float]]) -> None: ...  # callback maps three floats to a list of floats
+    def set_initial_condition_log_law(self, u_star: float, z0: float, velocity_ratio: float) -> None: ...
+    def set_initial_condition_perturbed_log_law(self, u_star: float, z0: float, length_x: float, length_z: float, height: float, velocity_ratio: float) -> None: ...
+    def set_initial_condition_uniform(self, velocity_x: float, velocity_y: float, velocity_z: float) -> None: ...
+    def set_is_body_force(self, is_body_force: bool) -> None: ...
+    def set_main_kernel(self, kernel: str) -> None: ...
+    def set_max_dev(self, max_dev: int) -> None: ...
+    def set_max_level(self, number_of_levels: int) -> None: ...
+    def set_outflow_pressure_correction_factor(self, correction_factor: float) -> None: ...
+    def set_output_path(self, o_path: str) -> None: ...
+    def set_output_prefix(self, o_prefix: str) -> None: ...
+    def set_print_files(self, print_files: bool) -> None: ...
+    def set_quadric_limiters(self, quadric_limiter_p: float, quadric_limiter_m: float, quadric_limiter_d: float) -> None: ...
+    def set_temperature_BC(self, temp_bc: float) -> None: ...
+    def set_temperature_init(self, temp: float) -> None: ...
+    def set_timestep_end(self, tend: int) -> None: ...  # timestep settings are integers
+    def set_timestep_of_coarse_level(self, timestep: int) -> None: ...
+    def set_timestep_out(self, tout: int) -> None: ...
+    def set_timestep_start_out(self, t_start_out: int) -> None: ...
+    def set_use_streams(self, use_streams: bool) -> None: ...
+    def set_velocity_LB(self, velocity: float) -> None: ...  # setter is *_LB while the getter is get_velocity
+    def set_velocity_ratio(self, velocity_ratio: float) -> None: ...
+    def set_viscosity_LB(self, viscosity: float) -> None: ...  # setter is *_LB while the getter is get_viscosity
+    def set_viscosity_ratio(self, viscosity_ratio: float) -> None: ...
+
+class PreCollisionInteractor:  # base class of ActuatorFarm and PrecursorWriter; accepted by Parameter.add_actuator/add_probe
+    def __init__(self, *args, **kwargs) -> None: ...  # catch-all signature; presumably not meant to be instantiated directly - confirm
+
+class PrecursorBC:  # enum-like binding with three members, constructible from int
+    __members__: ClassVar[dict] = ...  # read-only
+    DistributionsPrecursor: ClassVar[PrecursorBC] = ...
+    NotSpecified: ClassVar[PrecursorBC] = ...
+    VelocityPrecursor: ClassVar[PrecursorBC] = ...
+    __entries: ClassVar[dict] = ...
+    def __init__(self, arg0: int) -> None: ...
+    def __eq__(self, arg0: object) -> bool: ...
+    def __getstate__(self) -> int: ...  # state is a plain int, mirrored by __setstate__
+    def __hash__(self) -> int: ...
+    def __index__(self) -> int: ...
+    def __int__(self) -> int: ...
+    def __ne__(self, arg0: object) -> bool: ...
+    def __setstate__(self, arg0: int) -> None: ...
+    @property
+    def name(self) -> str: ...  # read-only string property
+
+class PrecursorWriter(PreCollisionInteractor):  # exposes only its constructor; output_variable takes an OutputVariable member
+    def __init__(self, filename: str, output_path: str, x_pos: float, y_min: float, y_max: float, z_min: float, z_max: float, t_start_out: int, t_save: int, output_variable: OutputVariable, max_timesteps_per_file: int) -> None: ...
+
+class PressureBC:  # enum-like binding with seven members, constructible from int
+    __members__: ClassVar[dict] = ...  # read-only
+    NotSpecified: ClassVar[PressureBC] = ...
+    OutflowNonReflective: ClassVar[PressureBC] = ...
+    OutflowNonReflectivePressureCorrection: ClassVar[PressureBC] = ...
+    PressureEquilibrium: ClassVar[PressureBC] = ...
+    PressureEquilibrium2: ClassVar[PressureBC] = ...
+    PressureNonEquilibriumCompressible: ClassVar[PressureBC] = ...
+    PressureNonEquilibriumIncompressible: ClassVar[PressureBC] = ...
+    __entries: ClassVar[dict] = ...
+    def __init__(self, arg0: int) -> None: ...
+    def __eq__(self, arg0: object) -> bool: ...
+    def __getstate__(self) -> int: ...  # state is a plain int, mirrored by __setstate__
+    def __hash__(self) -> int: ...
+    def __index__(self) -> int: ...
+    def __int__(self) -> int: ...
+    def __ne__(self, arg0: object) -> bool: ...
+    def __setstate__(self, arg0: int) -> None: ...
+    @property
+    def name(self) -> str: ...  # read-only string property
+
+class SideType:  # enumeration-style stub; each member below is a class-level SideType instance
+    __members__: ClassVar[dict] = ...  # read-only
+    GEOMETRY: ClassVar[SideType] = ...
+    MX: ClassVar[SideType] = ...
+    MY: ClassVar[SideType] = ...
+    MZ: ClassVar[SideType] = ...
+    PX: ClassVar[SideType] = ...
+    PY: ClassVar[SideType] = ...
+    PZ: ClassVar[SideType] = ...
+    __entries: ClassVar[dict] = ...
+    def __init__(self, arg0: int) -> None: ...  # constructed from an integer value
+    def __eq__(self, arg0: object) -> bool: ...
+    def __getstate__(self) -> int: ...  # state is exchanged as a plain int (see __setstate__)
+    def __hash__(self) -> int: ...
+    def __index__(self) -> int: ...
+    def __int__(self) -> int: ...  # converts back to int
+    def __ne__(self, arg0: object) -> bool: ...
+    def __setstate__(self, arg0: int) -> None: ...
+    @property
+    def name(self) -> str: ...  # read-only string property
+
+class Simulation:  # three constructor overloads; `communicator` is left unannotated in all of them
+    @overload  # variant with a GridScalingFactory and no TurbulenceModelFactory
+    def __init__(self, parameter: Parameter, memoryManager: CudaMemoryManager, communicator, gridProvider: GridProvider, bcFactory: BoundaryConditionFactory, gridScalingFactory: GridScalingFactory) -> None: ...
+    @overload  # shortest variant: neither TurbulenceModelFactory nor GridScalingFactory
+    def __init__(self, parameter: Parameter, memoryManager: CudaMemoryManager, communicator, gridProvider: GridProvider, bcFactory: BoundaryConditionFactory) -> None: ...
+    @overload  # variant with both a TurbulenceModelFactory and a GridScalingFactory
+    def __init__(self, parameter: Parameter, memoryManager: CudaMemoryManager, communicator, gridProvider: GridProvider, bcFactory: BoundaryConditionFactory, tmFactory: TurbulenceModelFactory, gridScalingFactory: GridScalingFactory) -> None: ...
+    def addEnstrophyAnalyzer(self, t_analyse: int) -> None: ...  # camelCase name, unlike the snake_case methods elsewhere in these stubs
+    def addKineticEnergyAnalyzer(self, t_analyse: int) -> None: ...
+    def run(self) -> None: ...
+
+class SlipBC:  # enumeration-style stub; each member below is a class-level SlipBC instance
+    __members__: ClassVar[dict] = ...  # read-only
+    NotSpecified: ClassVar[SlipBC] = ...
+    SlipBounceBack: ClassVar[SlipBC] = ...
+    SlipCompressible: ClassVar[SlipBC] = ...
+    SlipCompressibleTurbulentViscosity: ClassVar[SlipBC] = ...
+    SlipIncompressible: ClassVar[SlipBC] = ...
+    SlipPressureCompressibleTurbulentViscosity: ClassVar[SlipBC] = ...
+    __entries: ClassVar[dict] = ...
+    def __init__(self, arg0: int) -> None: ...  # constructed from an integer value
+    def __eq__(self, arg0: object) -> bool: ...
+    def __getstate__(self) -> int: ...  # state is exchanged as a plain int (see __setstate__)
+    def __hash__(self) -> int: ...
+    def __index__(self) -> int: ...
+    def __int__(self) -> int: ...  # converts back to int
+    def __ne__(self, arg0: object) -> bool: ...
+    def __setstate__(self, arg0: int) -> None: ...
+    @property
+    def name(self) -> str: ...  # read-only string property
+
+class StressBC:  # enumeration-style stub; each member below is a class-level StressBC instance
+    __members__: ClassVar[dict] = ...  # read-only
+    NotSpecified: ClassVar[StressBC] = ...
+    StressBounceBack: ClassVar[StressBC] = ...
+    StressCompressible: ClassVar[StressBC] = ...
+    StressPressureBounceBack: ClassVar[StressBC] = ...
+    __entries: ClassVar[dict] = ...
+    def __init__(self, arg0: int) -> None: ...  # constructed from an integer value
+    def __eq__(self, arg0: object) -> bool: ...
+    def __getstate__(self) -> int: ...  # state is exchanged as a plain int (see __setstate__)
+    def __hash__(self) -> int: ...
+    def __index__(self) -> int: ...
+    def __int__(self) -> int: ...  # converts back to int
+    def __ne__(self, arg0: object) -> bool: ...
+    def __setstate__(self, arg0: int) -> None: ...
+    @property
+    def name(self) -> str: ...  # read-only string property
+
+class TurbulenceModel:  # enumeration-style stub; each member below is a class-level TurbulenceModel instance
+    __members__: ClassVar[dict] = ...  # read-only
+    AMD: ClassVar[TurbulenceModel] = ...
+    NONE: ClassVar[TurbulenceModel] = ...
+    QR: ClassVar[TurbulenceModel] = ...
+    Smagorinsky: ClassVar[TurbulenceModel] = ...
+    __entries: ClassVar[dict] = ...
+    def __init__(self, arg0: int) -> None: ...  # constructed from an integer value
+    def __eq__(self, arg0: object) -> bool: ...
+    def __getstate__(self) -> int: ...  # state is exchanged as a plain int (see __setstate__)
+    def __hash__(self) -> int: ...
+    def __index__(self) -> int: ...
+    def __int__(self) -> int: ...  # converts back to int
+    def __ne__(self, arg0: object) -> bool: ...
+    def __setstate__(self, arg0: int) -> None: ...
+    @property
+    def name(self) -> str: ...  # read-only string property
+
+class TurbulenceModelFactory:  # constructed from a Parameter; configured through the setters below
+    def __init__(self, para: Parameter) -> None: ...
+    def read_config_file(self, config_data: pyfluids.bindings.basics.ConfigurationFile) -> None: ...  # takes a ConfigurationFile from the basics bindings
+    def set_model_constant(self, model_constant: float) -> None: ...
+    def set_turbulence_model(self, turbulence_model: TurbulenceModel) -> None: ...  # takes a TurbulenceModel member
+
+class VTKFileCollection(FileCollection):  # FileCollection subclass; FileCollection is declared further down in this stub
+    def __init__(self, prefix: str) -> None: ...
+
+class VelocityBC:  # enumeration-style stub; each member below is a class-level VelocityBC instance
+    __members__: ClassVar[dict] = ...  # read-only
+    NotSpecified: ClassVar[VelocityBC] = ...
+    VelocityAndPressureCompressible: ClassVar[VelocityBC] = ...
+    VelocityCompressible: ClassVar[VelocityBC] = ...
+    VelocityIncompressible: ClassVar[VelocityBC] = ...
+    VelocitySimpleBounceBackCompressible: ClassVar[VelocityBC] = ...
+    __entries: ClassVar[dict] = ...
+    def __init__(self, arg0: int) -> None: ...  # constructed from an integer value
+    def __eq__(self, arg0: object) -> bool: ...
+    def __getstate__(self) -> int: ...  # state is exchanged as a plain int (see __setstate__)
+    def __hash__(self) -> int: ...
+    def __index__(self) -> int: ...
+    def __int__(self) -> int: ...  # converts back to int
+    def __ne__(self, arg0: object) -> bool: ...
+    def __setstate__(self, arg0: int) -> None: ...
+    @property
+    def name(self) -> str: ...  # read-only string property
+
+class FileCollection:  # base type of VTKFileCollection and return type of create_file_collection
+    def __init__(self, *args, **kwargs) -> None: ...  # no concrete constructor signature is declared
+
+def create_file_collection(prefix: str, type: FileType) -> FileCollection: ...  # module-level factory; note the parameter named `type` shadows the builtin
diff --git a/pythonbindings/pyfluids-stubs/bindings/gpu/grid_generator.pyi b/pythonbindings/pyfluids-stubs/bindings/gpu/grid_generator.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..8d715e4b4cd49e6dbf92da3aedddbc4b869067c4
--- /dev/null
+++ b/pythonbindings/pyfluids-stubs/bindings/gpu/grid_generator.pyi
@@ -0,0 +1,100 @@
+r"""
+=======================================================================================
+ ____          ____    __    ______     __________   __      __       __        __
+ \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+      \    \  |    |   ________________________________________________________________
+       \    \ |    |  |  ______________________________________________________________|
+        \    \|    |  |  |         __          __     __     __     ______      _______
+         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+
+  This file is part of VirtualFluids. VirtualFluids is free software: you can
+  redistribute it and/or modify it under the terms of the GNU General Public
+  License as published by the Free Software Foundation, either version 3 of
+  the License, or (at your option) any later version.
+
+  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+  for more details.
+
+  You should have received a copy of the GNU General Public License along
+  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+
+! \file grid_generator.pyi
+! \ingroup gpu
+! \author Henry Korb
+=======================================================================================
+"""
+from typing import Any, List
+
+from typing import overload
+import pyfluids.bindings.basics
+import pyfluids.bindings.gpu
+
+class BoundingBox:  # built from six floats, ordered as min/max pairs per axis (x, y, z)
+    def __init__(self, min_x: float, max_x: float, min_y: float, max_y: float, min_z: float, max_z: float) -> None: ...
+
+class Conglomerate(Object):  # Object subclass; instances come from the make_shared() static factory
+    def __init__(self, *args, **kwargs) -> None: ...  # no concrete constructor signature is declared
+    def add(self, object: Object) -> None: ...  # parameter name `object` shadows the builtin
+    @staticmethod
+    def make_shared() -> Conglomerate: ...
+    def subtract(self, object: Object) -> None: ...
+
+class Cuboid(Object):  # Object subclass; arguments are ordered as all three minima first, then all three maxima
+    def __init__(self, min_x1: float, min_x2: float, min_x3: float, max_x1: float, max_x2: float, max_x3: float) -> None: ...
+
+class GridBuilder:  # base type of LevelGridBuilder
+    def __init__(self, *args, **kwargs) -> None: ...  # no concrete constructor signature is declared
+    def get_number_of_grid_levels(self) -> int: ...
+
+class GridFactory:  # obtained through the make() static factory; passed to MultipleGridBuilder.make_shared
+    def __init__(self, *args, **kwargs) -> None: ...  # no concrete constructor signature is declared
+    @staticmethod
+    def make() -> GridFactory: ...
+
+class LevelGridBuilder(GridBuilder):  # GridBuilder subclass exposing the boundary-condition setters
+    def __init__(self, *args, **kwargs) -> None: ...  # no concrete constructor signature is declared
+    def set_no_slip_boundary_condition(self, side_type: pyfluids.bindings.gpu.SideType) -> None: ...
+    def set_periodic_boundary_condition(self, periodic_x: bool, periodic_y: bool, periodic_z: bool) -> None: ...  # one flag per axis
+    def set_precursor_boundary_condition(self, side_type: pyfluids.bindings.gpu.SideType, file_collection: pyfluids.bindings.gpu.FileCollection, n_t_read: int, velocity_x: float = ..., velocity_y: float = ..., velocity_z: float = ..., file_level_to_grid_level_map: List[int] = ...) -> None: ...
+    def set_pressure_boundary_condition(self, side_type: pyfluids.bindings.gpu.SideType, rho: float) -> None: ...
+    def set_slip_boundary_condition(self, side_type: pyfluids.bindings.gpu.SideType, normal_x: float, normal_y: float, normal_z: float) -> None: ...
+    def set_stress_boundary_condition(self, side_type: pyfluids.bindings.gpu.SideType, normal_x: float, normal_y: float, normal_z: float, sampling_offset: int, z0: float, dx: float) -> None: ...
+    def set_velocity_boundary_condition(self, side_type: pyfluids.bindings.gpu.SideType, vx: float, vy: float, vz: float) -> None: ...
+
+class MultipleGridBuilder(LevelGridBuilder):  # LevelGridBuilder subclass; created via make_shared(grid_factory)
+    def __init__(self, *args, **kwargs) -> None: ...  # no concrete constructor signature is declared
+    def add_coarse_grid(self, start_x: float, start_y: float, start_z: float, end_x: float, end_y: float, end_z: float, delta: float) -> None: ...
+    @overload  # add_geometry: without an explicit level
+    def add_geometry(self, solid_object: Object) -> None: ...
+    @overload  # add_geometry: with an explicit level
+    def add_geometry(self, solid_object: Object, level: int) -> None: ...
+    @overload  # add_grid: without an explicit fine level
+    def add_grid(self, grid_shape: Object) -> None: ...
+    @overload  # add_grid: with an explicit fine level
+    def add_grid(self, grid_shape: Object, level_fine: int) -> None: ...
+    def build_grids(self, lbm_or_gks: pyfluids.bindings.basics.LbmOrGks, enable_thin_walls: bool) -> None: ...
+    def get_number_of_levels(self) -> int: ...  # distinct from the inherited GridBuilder.get_number_of_grid_levels
+    @staticmethod
+    def make_shared(grid_factory: GridFactory) -> MultipleGridBuilder: ...
+
+class Object:  # common base of Conglomerate, Cuboid, Sphere and TriangularMesh in this stub
+    def __init__(self, *args, **kwargs) -> None: ...  # no concrete constructor signature is declared
+
+class Sphere(Object):  # Object subclass; the stub declares make_shared() without parameters
+    def __init__(self, *args, **kwargs) -> None: ...  # no concrete constructor signature is declared
+    @staticmethod
+    def make_shared() -> Sphere: ...
+
+class TriangularMesh(Object):  # Object subclass; the stub declares make() without parameters
+    def __init__(self, *args, **kwargs) -> None: ...  # no concrete constructor signature is declared
+    @staticmethod
+    def make() -> TriangularMesh: ...
diff --git a/pythonbindings/pyfluids-stubs/bindings/gpu/probes.pyi b/pythonbindings/pyfluids-stubs/bindings/gpu/probes.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..af9c40078e6009efebda4450b5c5e23586aa1e83
--- /dev/null
+++ b/pythonbindings/pyfluids-stubs/bindings/gpu/probes.pyi
@@ -0,0 +1,85 @@
+r"""
+=======================================================================================
+ ____          ____    __    ______     __________   __      __       __        __
+ \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+      \    \  |    |   ________________________________________________________________
+       \    \ |    |  |  ______________________________________________________________|
+        \    \|    |  |  |         __          __     __     __     ______      _______
+         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+
+  This file is part of VirtualFluids. VirtualFluids is free software: you can
+  redistribute it and/or modify it under the terms of the GNU General Public
+  License as published by the Free Software Foundation, either version 3 of
+  the License, or (at your option) any later version.
+
+  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+  for more details.
+
+  You should have received a copy of the GNU General Public License along
+  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+
+! \file probes.pyi
+! \ingroup gpu
+! \author Henry Korb
+=======================================================================================
+"""
+from typing import ClassVar, List
+
+import pyfluids.bindings.gpu
+
+class PlanarAverageProbe(Probe):  # Probe subclass; plane_normal is passed as a string
+    def __init__(self, probe_name: str, output_path: str, t_start_avg: int, t_start_tmp_avg: int, t_avg: int, t_start_out: int, t_out: int, plane_normal: str) -> None: ...
+
+class PlaneProbe(Probe):  # Probe subclass; the plane is supplied separately through set_probe_plane
+    def __init__(self, probe_name: str, output_path: str, t_start_avg: int, t_avg: int, t_start_out: int, t_out: int) -> None: ...
+    def set_probe_plane(self, pos_x: float, pos_y: float, pos_z: float, delta_x: float, delta_y: float, delta_z: float) -> None: ...
+
+class PointProbe(Probe):  # Probe subclass; points are added after construction by one of the two methods below
+    def __init__(self, probe_name: str, output_path: str, t_start_avg: int, t_avg: int, t_start_out: int, t_out: int, output_timeseries: bool) -> None: ...
+    def add_probe_points_from_list(self, point_coords_x: List[float], point_coords_y: List[float], point_coords_z: List[float]) -> None: ...  # one coordinate list per axis
+    def add_probe_points_from_x_normal_plane(self, pos_x: float, pos0_y: float, pos0_z: float, pos1_y: float, pos1_z: float, n_y: int, n_z: int) -> None: ...
+
+class Probe(pyfluids.bindings.gpu.PreCollisionInteractor):  # base of the probe classes in this stub; derives from the gpu PreCollisionInteractor
+    def __init__(self, *args, **kwargs) -> None: ...  # no concrete constructor signature is declared
+    def add_all_available_statistics(self) -> None: ...
+    def add_statistic(self, variable: Statistic) -> None: ...  # takes a member of the Statistic enumeration below
+    def set_file_name_to_n_out(self) -> None: ...
+
+class Statistic:  # enumeration-style stub; each member below is a class-level Statistic instance
+    __members__: ClassVar[dict] = ...  # read-only
+    Instantaneous: ClassVar[Statistic] = ...
+    Means: ClassVar[Statistic] = ...
+    SpatialCovariances: ClassVar[Statistic] = ...
+    SpatialFlatness: ClassVar[Statistic] = ...
+    SpatialMeans: ClassVar[Statistic] = ...
+    SpatialSkewness: ClassVar[Statistic] = ...
+    SpatioTemporalCovariances: ClassVar[Statistic] = ...
+    SpatioTemporalFlatness: ClassVar[Statistic] = ...
+    SpatioTemporalMeans: ClassVar[Statistic] = ...
+    SpatioTemporalSkewness: ClassVar[Statistic] = ...
+    Variances: ClassVar[Statistic] = ...
+    __entries: ClassVar[dict] = ...
+    def __init__(self, arg0: int) -> None: ...  # constructed from an integer value
+    def __eq__(self, arg0: object) -> bool: ...
+    def __getstate__(self) -> int: ...  # state is exchanged as a plain int (see __setstate__)
+    def __hash__(self) -> int: ...
+    def __index__(self) -> int: ...
+    def __int__(self) -> int: ...  # converts back to int
+    def __ne__(self, arg0: object) -> bool: ...
+    def __setstate__(self, arg0: int) -> None: ...
+    @property
+    def name(self) -> str: ...  # read-only string property
+
+class WallModelProbe(Probe):  # Probe subclass with two additional boolean switches
+    def __init__(self, probe_name: str, output_path: str, t_start_avg: int, t_start_tmp_avg: int, t_avg: int, t_start_out: int, t_out: int) -> None: ...
+    def set_evaluate_pressure_gradient(self, eval_press_grad: bool) -> None: ...
+    def set_force_output_to_stress(self, output_stress: bool) -> None: ...
diff --git a/pythonbindings/pyfluids-stubs/bindings/lbm.pyi b/pythonbindings/pyfluids-stubs/bindings/lbm.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/pythonbindings/pyfluids-stubs/bindings/logger.pyi b/pythonbindings/pyfluids-stubs/bindings/logger.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..fe84eeb18f3245ef72ed023b2de9db7b9131d144
--- /dev/null
+++ b/pythonbindings/pyfluids-stubs/bindings/logger.pyi
@@ -0,0 +1,45 @@
+r"""
+=======================================================================================
+ ____          ____    __    ______     __________   __      __       __        __
+ \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+      \    \  |    |   ________________________________________________________________
+       \    \ |    |  |  ______________________________________________________________|
+        \    \|    |  |  |         __          __     __     __     ______      _______
+         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+
+  This file is part of VirtualFluids. VirtualFluids is free software: you can
+  redistribute it and/or modify it under the terms of the GNU General Public
+  License as published by the Free Software Foundation, either version 3 of
+  the License, or (at your option) any later version.
+
+  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+  for more details.
+
+  You should have received a copy of the GNU General Public License along
+  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+
+! \file logger.pyi
+! \ingroup bindings
+! \author Henry Korb
+=======================================================================================
+"""
+class Logger:  # exposes only static methods; no constructor is declared in this stub
+    @staticmethod
+    def change_log_path(path: str) -> None: ...
+    @staticmethod
+    def initialize_logger() -> None: ...
+
+def vf_log_critical(message: str) -> None: ...  # module-level helpers named after log levels; each takes the message text
+def vf_log_debug(message: str) -> None: ...
+def vf_log_info(message: str) -> None: ...
+def vf_log_trace(message: str) -> None: ...
+def vf_log_warning(message: str) -> None: ...
diff --git a/pythonbindings/pyfluids/__init__.py b/pythonbindings/pyfluids/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0537b758267e22a72e5030340de7b87d52f35c3
--- /dev/null
+++ b/pythonbindings/pyfluids/__init__.py
@@ -0,0 +1,54 @@
+r"""
+=======================================================================================
+ ____          ____    __    ______     __________   __      __       __        __
+ \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+      \    \  |    |   ________________________________________________________________
+       \    \ |    |  |  ______________________________________________________________|
+        \    \|    |  |  |         __          __     __     __     ______      _______
+         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+
+  This file is part of VirtualFluids. VirtualFluids is free software: you can
+  redistribute it and/or modify it under the terms of the GNU General Public
+  License as published by the Free Software Foundation, either version 3 of
+  the License, or (at your option) any later version.
+
+  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+  for more details.
+
+  You should have received a copy of the GNU General Public License along
+  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+
+! \file __init__.py
+! \ingroup pyfluids
+! \author Henry Korb
+=======================================================================================
+"""
+try:  # every name pulled from .bindings below is optional: a failed import is printed and then ignored
+    from .bindings import basics
+except ImportError:
+    print("Basics bindings not included")
+try:
+    from .bindings import logger
+except ImportError:
+    print("Logger bindings not included")
+try:
+    from .bindings import lbm
+except ImportError:
+    print("LBM bindings not included")
+try:
+    from .bindings import gpu
+except ImportError:
+    print("GPU bindings not included")  # NOTE(review): presumably hit when the module was built without GPU support - confirm
+try:
+    from .bindings import cpu
+except ImportError:
+    print("CPU bindings not included")  # NOTE(review): presumably hit when the module was built without CPU support - confirm
\ No newline at end of file
diff --git a/pythonbindings/pyfluids/py.typed b/pythonbindings/pyfluids/py.typed
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/pythonbindings/pymuparser/__init__.py b/pythonbindings/pymuparser/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..398069bcba03b3fe710d9d9a6398e9c530b19ee9
--- /dev/null
+++ b/pythonbindings/pymuparser/__init__.py
@@ -0,0 +1,38 @@
+r"""
+=======================================================================================
+ ____          ____    __    ______     __________   __      __       __        __
+ \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+      \    \  |    |   ________________________________________________________________
+       \    \ |    |  |  ______________________________________________________________|
+        \    \|    |  |  |         __          __     __     __     ______      _______
+         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+
+  This file is part of VirtualFluids. VirtualFluids is free software: you can
+  redistribute it and/or modify it under the terms of the GNU General Public
+  License as published by the Free Software Foundation, either version 3 of
+  the License, or (at your option) any later version.
+
+  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+  for more details.
+
+  You should have received a copy of the GNU General Public License along
+  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+
+! \file __init__.py
+! \ingroup pymuparser
+! \author Henry Korb
+=======================================================================================
+"""
+try:  # Parser comes from the compiled bindings module, which may not have been built
+    from .bindings import Parser
+except ImportError as e:  # keep the original error as the explicit cause of the friendlier one
+    raise ImportError("Pymuparser bindings were not built. Only included if VirtualFluids is built with VF_BUILD_CPU=ON.") from e
\ No newline at end of file
diff --git a/pythonbindings/src/VirtualFluids.cpp b/pythonbindings/src/VirtualFluids.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..20e5012e0af325440e502c704d6f372100306ab1
--- /dev/null
+++ b/pythonbindings/src/VirtualFluids.cpp
@@ -0,0 +1,63 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file VirtualFluids.cpp
+//! \ingroup src
+//! \author Henry Korb
+//=======================================================================================
+#include <pybind11/pybind11.h>
+#include "basics/basics.cpp"
+#include "lbm/lbm.cpp"
+#include "logger/logger.cpp"
+
+#ifdef VF_GPU_PYTHONBINDINGS
+#include "gpu/gpu.cpp"
+#endif
+#ifdef VF_CPU_PYTHONBINDINGS
+#include "cpu/cpu.cpp"
+#endif
+
+
+namespace py_bindings
+{
+    namespace py = pybind11;
+
+    PYBIND11_MODULE(bindings, m) // single extension module named "bindings"; replaces the two deleted pyfluids module files
+    {
+        py::add_ostream_redirect(m, "ostream_redirect"); // registers pybind11's stream-redirect helper under this name
+        basics::makeModule(m);
+        lbm::makeModule(m);
+        logging::makeModule(m);
+#ifdef VF_GPU_PYTHONBINDINGS
+        gpu::makeModule(m); // only compiled in when VF_GPU_PYTHONBINDINGS is defined
+#endif
+#ifdef VF_CPU_PYTHONBINDINGS
+        cpu::makeModule(m); // only compiled in when VF_CPU_PYTHONBINDINGS is defined
+#endif
+    }
+}
\ No newline at end of file
diff --git a/pythonbindings/src/VirtualFluidsModulesCPU.cpp b/pythonbindings/src/VirtualFluidsModulesCPU.cpp
deleted file mode 100644
index 2fba3da494f568f7d0d0a117a579a45c9c1b9245..0000000000000000000000000000000000000000
--- a/pythonbindings/src/VirtualFluidsModulesCPU.cpp
+++ /dev/null
@@ -1,12 +0,0 @@
-#include <pybind11/pybind11.h>
-#include "cpu/cpu.cpp"
-
-namespace py_bindings
-{
-    namespace py = pybind11;
-
-    PYBIND11_MODULE(pyfluids, m)
-    {
-        cpu::makeModule(m);
-    }
-}
\ No newline at end of file
diff --git a/pythonbindings/src/VirtualFluidsModulesGPU.cpp b/pythonbindings/src/VirtualFluidsModulesGPU.cpp
deleted file mode 100644
index b96971caf381faada76ee676cf60469492d055c2..0000000000000000000000000000000000000000
--- a/pythonbindings/src/VirtualFluidsModulesGPU.cpp
+++ /dev/null
@@ -1,19 +0,0 @@
-#include <pybind11/pybind11.h>
-#include "basics/basics.cpp"
-#include "lbm/lbm.cpp"
-#include "gpu/gpu.cpp"
-#include "logger/logger.cpp"
-
-namespace py_bindings
-{
-    namespace py = pybind11;
-
-    PYBIND11_MODULE(pyfluids, m)
-    {
-        basics::makeModule(m);
-        gpu::makeModule(m);
-        lbm::makeModule(m);
-        logging::makeModule(m);
-        py::add_ostream_redirect(m, "ostream_redirect");
-    }
-}
\ No newline at end of file
diff --git a/pythonbindings/src/basics/basics.cpp b/pythonbindings/src/basics/basics.cpp
index 381e345d78226b25ec3a77a14340d2ef1171c8c9..e67dfb05308511c8bf79d7e860299f062f317194 100644
--- a/pythonbindings/src/basics/basics.cpp
+++ b/pythonbindings/src/basics/basics.cpp
@@ -1,3 +1,35 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file basics.cpp
+//! \ingroup basics
+//! \author Henry Korb
+//=======================================================================================
 #include <pybind11/pybind11.h>
 #include "submodules/logger.cpp"
 #include "submodules/configuration_file.cpp"
diff --git a/pythonbindings/src/basics/submodules/configuration_file.cpp b/pythonbindings/src/basics/submodules/configuration_file.cpp
index f5a2f87135a17f5eda34a7467d95f9db6b1c21d1..7fcd48c34824b9370eeac1872c899bf980176a52 100644
--- a/pythonbindings/src/basics/submodules/configuration_file.cpp
+++ b/pythonbindings/src/basics/submodules/configuration_file.cpp
@@ -1,5 +1,37 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file configuration_file.cpp
+//! \ingroup submodules
+//! \author Henry Korb
+//=======================================================================================
 #include <pybind11/pybind11.h>
-#include <basics/config/ConfigurationFile.h>
+#include "basics/config/ConfigurationFile.h"
 
 namespace configuration
 {
@@ -9,6 +41,19 @@ namespace configuration
     {
         py::class_<vf::basics::ConfigurationFile>(parentModule, "ConfigurationFile")
         .def(py::init<>())
-        .def("load", &vf::basics::ConfigurationFile::load);
+        .def("load", &vf::basics::ConfigurationFile::load, py::arg("file"))
+        .def("contains", &vf::basics::ConfigurationFile::contains, py::arg("key"))
+        .def("get_int_value"   , static_cast<int         (vf::basics::ConfigurationFile::*)(const std::string&) const>(&vf::basics::ConfigurationFile::getValue), py::arg("key"))
+        .def("get_int_value"   , static_cast<int         (vf::basics::ConfigurationFile::*)(const std::string&, int        ) const>(&vf::basics::ConfigurationFile::getValue), py::arg("key"), py::arg("default_value"))
+        .def("get_uint_value"  , static_cast<uint        (vf::basics::ConfigurationFile::*)(const std::string&) const>(&vf::basics::ConfigurationFile::getValue), py::arg("key"))
+        .def("get_uint_value"  , static_cast<uint        (vf::basics::ConfigurationFile::*)(const std::string&, uint       ) const>(&vf::basics::ConfigurationFile::getValue), py::arg("key"), py::arg("default_value"))
+        .def("get_float_value" , static_cast<float       (vf::basics::ConfigurationFile::*)(const std::string&) const>(&vf::basics::ConfigurationFile::getValue), py::arg("key"))
+        .def("get_float_value" , static_cast<float       (vf::basics::ConfigurationFile::*)(const std::string&, float      ) const>(&vf::basics::ConfigurationFile::getValue), py::arg("key"), py::arg("default_value"))
+        .def("get_double_value", static_cast<double      (vf::basics::ConfigurationFile::*)(const std::string&) const>(&vf::basics::ConfigurationFile::getValue), py::arg("key"))
+        .def("get_double_value", static_cast<double      (vf::basics::ConfigurationFile::*)(const std::string&, double     ) const>(&vf::basics::ConfigurationFile::getValue), py::arg("key"), py::arg("default_value"))
+        .def("get_bool_value"  , static_cast<bool        (vf::basics::ConfigurationFile::*)(const std::string&) const>(&vf::basics::ConfigurationFile::getValue), py::arg("key"))
+        .def("get_bool_value"  , static_cast<bool        (vf::basics::ConfigurationFile::*)(const std::string&, bool       ) const>(&vf::basics::ConfigurationFile::getValue), py::arg("key"), py::arg("default_value"))
+        .def("get_string_value", static_cast<std::string (vf::basics::ConfigurationFile::*)(const std::string&) const>(&vf::basics::ConfigurationFile::getValue), py::arg("key"))
+        .def("get_string_value", static_cast<std::string (vf::basics::ConfigurationFile::*)(const std::string&, std::string) const>(&vf::basics::ConfigurationFile::getValue), py::arg("key"), py::arg("default_value"));
     }
 }
\ No newline at end of file
diff --git a/pythonbindings/src/basics/submodules/lbm_or_gks.cpp b/pythonbindings/src/basics/submodules/lbm_or_gks.cpp
index ed1deeca62fc57b7f44499b306e9f99b7f990604..d20cf2d1f631f6d36a80c36f1fb6c9c59d192090 100644
--- a/pythonbindings/src/basics/submodules/lbm_or_gks.cpp
+++ b/pythonbindings/src/basics/submodules/lbm_or_gks.cpp
@@ -1,3 +1,35 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file lbm_or_gks.cpp
+//! \ingroup submodules
+//! \author Henry Korb
+//=======================================================================================
 #include <pybind11/pybind11.h>
 #include "basics/Core/LbmOrGks.h"
 
diff --git a/pythonbindings/src/basics/submodules/logger.cpp b/pythonbindings/src/basics/submodules/logger.cpp
index d46648e349b44243581e083f3561e8a13648f3b2..fa7e00e4dca06581b7a14d2bcf2628ed6af60001 100644
--- a/pythonbindings/src/basics/submodules/logger.cpp
+++ b/pythonbindings/src/basics/submodules/logger.cpp
@@ -1,3 +1,35 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file logger.cpp
+//! \ingroup submodules
+//! \author Henry Korb
+//=======================================================================================
 #include <pybind11/pybind11.h>
 #include <pybind11/iostream.h>
 #include <basics/Core/Logger/Logger.h>
@@ -12,12 +44,12 @@ namespace logger
         py::module loggerModule = parentModule.def_submodule("logger");
 
         py::class_<logging::Logger>(loggerModule, "Logger")
-        .def("add_stdout", [](){
+        .def_static("add_stdout", [](){
             logging::Logger::addStream(&std::cout);
         })
-        .def("set_debug_level", &logging::Logger::setDebugLevel)
-        .def("time_stamp", &logging::Logger::timeStamp)
-        .def("enable_printed_rank_numbers", &logging::Logger::enablePrintedRankNumbers);
+        .def_static("set_debug_level", &logging::Logger::setDebugLevel)
+        .def_static("time_stamp", &logging::Logger::timeStamp, py::arg("time_stamp"))
+        .def_static("enable_printed_rank_numbers", &logging::Logger::enablePrintedRankNumbers, py::arg("print"));
 
         loggerModule.attr("log") = logging::out;
         py::enum_<logging::Logger::Level>(loggerModule, "Level")
diff --git a/pythonbindings/src/cpu/cpu.cpp b/pythonbindings/src/cpu/cpu.cpp
index 554de53b47446366693aed31d534f6145ebea8ba..75143d913596c74a26f25ce64f1e6d214a442e34 100644
--- a/pythonbindings/src/cpu/cpu.cpp
+++ b/pythonbindings/src/cpu/cpu.cpp
@@ -1,3 +1,35 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file cpu.cpp
+//! \ingroup cpu
+//! \author Sven Marcus, Henry Korb
+//=======================================================================================
 #include <pybind11/pybind11.h>
 #include "submodules/boundaryconditions.cpp"
 #include "submodules/simulationconfig.cpp"
diff --git a/pythonbindings/src/cpu/submodules/boundaryconditions.cpp b/pythonbindings/src/cpu/submodules/boundaryconditions.cpp
index 3bff7bc069ca20fe1c0cf3d1847b9714e0381505..ac9ec8605dec51e8374c850b1c1b58314674c426 100644
--- a/pythonbindings/src/cpu/submodules/boundaryconditions.cpp
+++ b/pythonbindings/src/cpu/submodules/boundaryconditions.cpp
@@ -1,3 +1,35 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file boundaryconditions.cpp
+//! \ingroup submodules
+//! \author Sven Marcus, Henry Korb
+//=======================================================================================
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 #include <BoundaryConditions/DensityBCAdapter.h>
diff --git a/pythonbindings/src/cpu/submodules/geometry.cpp b/pythonbindings/src/cpu/submodules/geometry.cpp
index b7ff4dd761258d41687589d2dd89c3479093753e..4c4c47b002b9c7451a8d788ba82c4a19b78ca96f 100644
--- a/pythonbindings/src/cpu/submodules/geometry.cpp
+++ b/pythonbindings/src/cpu/submodules/geometry.cpp
@@ -1,3 +1,35 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file geometry.cpp
+//! \ingroup submodules
+//! \author Sven Marcus, Henry Korb
+//=======================================================================================
 #include <pybind11/pybind11.h>
 #include <geometry3d/GbPoint3D.h>
 #include <geometry3d/GbObject3D.h>
diff --git a/pythonbindings/src/cpu/submodules/kernel.cpp b/pythonbindings/src/cpu/submodules/kernel.cpp
index fb291790632cc2041410f60a14fca8d966283343..b00d86579540a299e4bf3ed47bc09d4386f420a2 100644
--- a/pythonbindings/src/cpu/submodules/kernel.cpp
+++ b/pythonbindings/src/cpu/submodules/kernel.cpp
@@ -1,3 +1,35 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file kernel.cpp
+//! \ingroup submodules
+//! \author Sven Marcus, Henry Korb
+//=======================================================================================
 #include <memory>
 #include <pybind11/pybind11.h>
 #include <simulationconfig/KernelFactory.h>
diff --git a/pythonbindings/src/cpu/submodules/simulationconfig.cpp b/pythonbindings/src/cpu/submodules/simulationconfig.cpp
index 60af4e36af4dca67e9262dd9f5ee1f46d5b7bb58..09d91f44e85f03c6150c56ce5762e7629212fba0 100644
--- a/pythonbindings/src/cpu/submodules/simulationconfig.cpp
+++ b/pythonbindings/src/cpu/submodules/simulationconfig.cpp
@@ -1,3 +1,35 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file simulationconfig.cpp
+//! \ingroup submodules
+//! \author Sven Marcus, Henry Korb
+//=======================================================================================
 #include <pybind11/pybind11.h>
 #include <simulationconfig/Simulation.h>
 
diff --git a/pythonbindings/src/cpu/submodules/simulationparameters.cpp b/pythonbindings/src/cpu/submodules/simulationparameters.cpp
index acc272f2ee412cfbafd9007b4b18610cfd0a1e9b..b33d20f9e5d335a0ed381faf8786d88cc7642738 100644
--- a/pythonbindings/src/cpu/submodules/simulationparameters.cpp
+++ b/pythonbindings/src/cpu/submodules/simulationparameters.cpp
@@ -1,3 +1,35 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file simulationparameters.cpp
+//! \ingroup submodules
+//! \author Sven Marcus, Henry Korb
+//=======================================================================================
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 #include <complex>
diff --git a/pythonbindings/src/cpu/submodules/writer.cpp b/pythonbindings/src/cpu/submodules/writer.cpp
index d5ec527a27caf63d9a3066c51e1f675b307fe0b2..f1cfd8934c2da84266a93d5bcd91eb26f5f69d3f 100644
--- a/pythonbindings/src/cpu/submodules/writer.cpp
+++ b/pythonbindings/src/cpu/submodules/writer.cpp
@@ -1,3 +1,35 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file writer.cpp
+//! \ingroup submodules
+//! \author Sven Marcus, Henry Korb
+//=======================================================================================
 #include <pybind11/pybind11.h>
 #include <simulationconfig/WriterConfiguration.h>
 
diff --git a/pythonbindings/src/gpu/gpu.cpp b/pythonbindings/src/gpu/gpu.cpp
index dc110cd5e19a9aad4937f9c2133ddf74c0ddf9bf..9eb160ae7765f16a6437e343cb878bb4b80877bf 100644
--- a/pythonbindings/src/gpu/gpu.cpp
+++ b/pythonbindings/src/gpu/gpu.cpp
@@ -1,14 +1,50 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file gpu.cpp
+//! \ingroup gpu
+//! \author Henry Korb
+//=======================================================================================
 #include <pybind11/pybind11.h>
-#include "submodules/actuator_line.cpp"
 #include "submodules/pre_collision_interactor.cpp"
 #include "submodules/simulation.cpp"
 #include "submodules/parameter.cpp"
 #include "submodules/boundary_conditions.cpp"
 #include "submodules/communicator.cpp"
 #include "submodules/cuda_memory_manager.cpp"
+#include "submodules/probes.cpp"
+#include "submodules/precursor_writer.cpp"
 #include "submodules/grid_provider.cpp"
 #include "submodules/grid_generator.cpp"
-#include "submodules/probes.cpp"
+#include "submodules/turbulence_models.cpp"
+#include "submodules/transient_bc_setter.cpp"
+#include "submodules/actuator_farm.cpp"
+#include "submodules/grid_scaling_factory.cpp"
 
 namespace gpu
 {
@@ -20,13 +56,17 @@ namespace gpu
         simulation::makeModule(gpuModule);
         parameter::makeModule(gpuModule);
         pre_collision_interactor::makeModule(gpuModule);
-        actuator_line::makeModule(gpuModule);
+        actuator_farm::makeModule(gpuModule);
         boundary_conditions::makeModule(gpuModule);
+        transient_bc_setter::makeModule(gpuModule);
         communicator::makeModule(gpuModule); 
         cuda_memory_manager::makeModule(gpuModule);
-        grid_provider::makeModule(gpuModule);
         probes::makeModule(gpuModule);
+        precursor_writer::makeModule(gpuModule);
         grid_generator::makeModule(gpuModule);
+        grid_provider::makeModule(gpuModule);
+        turbulence_model::makeModule(gpuModule);
+        grid_scaling_factory::makeModule(gpuModule);
         return gpuModule;
     }
 }
\ No newline at end of file
diff --git a/pythonbindings/src/gpu/submodules/actuator_farm.cpp b/pythonbindings/src/gpu/submodules/actuator_farm.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a930616db3e0d0713bdf57157387d75d171603de
--- /dev/null
+++ b/pythonbindings/src/gpu/submodules/actuator_farm.cpp
@@ -0,0 +1,171 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file actuator_farm.cpp
+//! \ingroup submodules
+//! \author Henry Korb
+//=======================================================================================
+#include <pybind11/pybind11.h>
+#include <pybind11/numpy.h>
+#include <gpu/VirtualFluids_GPU/PreCollisionInteractor/ActuatorFarm.h>
+#include <gpu/VirtualFluids_GPU/PreCollisionInteractor/PreCollisionInteractor.h>
+class PyActuatorFarm : public ActuatorFarm // pybind11 trampoline: dispatches calcBladeForces() to a Python "calc_blade_forces" override, else to ActuatorFarm
+{
+public:
+    using ActuatorFarm::ActuatorFarm; // Inherit constructors so the bound py::init signature also constructs the trampoline
+    void calcBladeForces() override
+    { // NOTE: the trailing comma in the macro call is intentional; pybind11 requires it for overrides that take no arguments
+        PYBIND11_OVERRIDE_NAME(void, ActuatorFarm, "calc_blade_forces", calcBladeForces,);
+    }
+};
+namespace actuator_farm
+{
+    namespace py = pybind11;
+    //! Registers the ActuatorFarm class (with PyActuatorFarm as trampoline) and all of its accessors on parentModule.
+    void makeModule(py::module_ &parentModule)
+    {
+        using arr = py::array_t<float, py::array::c_style>; // numpy array type used by every array getter/setter below
+        // py::dynamic_attr() allows Python code to attach additional attributes to ActuatorFarm instances.
+        py::class_<ActuatorFarm, PreCollisionInteractor, PyActuatorFarm, std::shared_ptr<ActuatorFarm>>(parentModule, "ActuatorFarm", py::dynamic_attr())
+        .def(py::init<  const uint,
+                        const real,
+                        const uint,
+                        const real,
+                        int,
+                        const real,
+                        const real,
+                        const bool>(),
+                        py::arg("number_of_blades_per_turbine"),
+                        py::arg("density"),
+                        py::arg("number_of_nodes_per_blade"),
+                        py::arg("epsilon"),
+                        py::arg("level"),
+                        py::arg("delta_t"),
+                        py::arg("delta_x"),
+                        py::arg("use_host_arrays"))
+        .def_property_readonly("number_of_turbines", &ActuatorFarm::getNumberOfTurbines)
+        .def_property_readonly("number_of_nodes_per_blade", &ActuatorFarm::getNumberOfNodesPerBlade)
+        .def_property_readonly("number_of_blades_per_turbine", &ActuatorFarm::getNumberOfBladesPerTurbine)
+        .def_property_readonly("number_of_nodes", &ActuatorFarm::getNumberOfNodes)
+        .def_property_readonly("number_of_indices", &ActuatorFarm::getNumberOfIndices)
+        .def_property_readonly("density", &ActuatorFarm::getDensity)
+        .def_property_readonly("delta_t", &ActuatorFarm::getDeltaT)
+        .def_property_readonly("delta_x", &ActuatorFarm::getDeltaX)
+        // --- turbine setup ---
+        .def("add_turbine", &ActuatorFarm::addTurbine, py::arg("posX"), py::arg("posY"), py::arg("posZ"), py::arg("diameter"), py::arg("omega"), py::arg("azimuth"), py::arg("yaw"), py::arg("bladeRadii"))
+        // --- per-turbine scalars, and farm-wide arrays with one entry per turbine (copied into new numpy arrays) ---
+        .def("get_turbine_pos", [](ActuatorFarm& al, uint turbine){ real position[3] = {al.getTurbinePosX(turbine), al.getTurbinePosY(turbine), al.getTurbinePosZ(turbine)}; return arr(3,  position); }, py::arg("turbine"))
+        .def("get_turbine_azimuth", &ActuatorFarm::getTurbineAzimuth, py::arg("turbine"))
+        .def("get_turbine_yaw", &ActuatorFarm::getTurbineYaw, py::arg("turbine"))
+        .def("get_turbine_omega", &ActuatorFarm::getTurbineOmega, py::arg("turbine"))
+        .def("get_all_azimuths", [](ActuatorFarm& al){ return arr(al.getNumberOfTurbines(), al.getAllAzimuths()); } )
+        .def("get_all_yaws", [](ActuatorFarm& al){ return arr(al.getNumberOfTurbines(), al.getAllYaws()); } )
+        .def("get_all_omegas", [](ActuatorFarm& al){ return arr(al.getNumberOfTurbines(), al.getAllOmegas()); } )
+        .def("get_all_turbine_pos_x", [](ActuatorFarm& al){ return arr(al.getNumberOfTurbines(), al.getAllTurbinePosX()); } )
+        .def("get_all_turbine_pos_y", [](ActuatorFarm& al){ return arr(al.getNumberOfTurbines(), al.getAllTurbinePosY()); } )
+        .def("get_all_turbine_pos_z", [](ActuatorFarm& al){ return arr(al.getNumberOfTurbines(), al.getAllTurbinePosZ()); } )
+        // --- farm-wide blade arrays; radii are (turbines, nodes), all others (turbines, blades, nodes) ---
+        .def("get_all_blade_radii", [](ActuatorFarm& al){ return arr({al.getNumberOfTurbines(), al.getNumberOfNodesPerBlade()}, al.getAllBladeRadii()); } )
+        .def("get_all_blade_coords_x", [](ActuatorFarm& al){ return arr({al.getNumberOfTurbines(), al.getNumberOfBladesPerTurbine(), al.getNumberOfNodesPerBlade()}, al.getAllBladeCoordsX()); } )
+        .def("get_all_blade_coords_y", [](ActuatorFarm& al){ return arr({al.getNumberOfTurbines(), al.getNumberOfBladesPerTurbine(), al.getNumberOfNodesPerBlade()}, al.getAllBladeCoordsY()); } )
+        .def("get_all_blade_coords_z", [](ActuatorFarm& al){ return arr({al.getNumberOfTurbines(), al.getNumberOfBladesPerTurbine(), al.getNumberOfNodesPerBlade()}, al.getAllBladeCoordsZ()); } )
+        .def("get_all_blade_velocities_x", [](ActuatorFarm& al){ return arr({al.getNumberOfTurbines(), al.getNumberOfBladesPerTurbine(), al.getNumberOfNodesPerBlade()}, al.getAllBladeVelocitiesX()); } )
+        .def("get_all_blade_velocities_y", [](ActuatorFarm& al){ return arr({al.getNumberOfTurbines(), al.getNumberOfBladesPerTurbine(), al.getNumberOfNodesPerBlade()}, al.getAllBladeVelocitiesY()); } )
+        .def("get_all_blade_velocities_z", [](ActuatorFarm& al){ return arr({al.getNumberOfTurbines(), al.getNumberOfBladesPerTurbine(), al.getNumberOfNodesPerBlade()}, al.getAllBladeVelocitiesZ()); } )
+        .def("get_all_blade_forces_x", [](ActuatorFarm& al){ return arr({al.getNumberOfTurbines(), al.getNumberOfBladesPerTurbine(), al.getNumberOfNodesPerBlade()}, al.getAllBladeForcesX()); } )
+        .def("get_all_blade_forces_y", [](ActuatorFarm& al){ return arr({al.getNumberOfTurbines(), al.getNumberOfBladesPerTurbine(), al.getNumberOfNodesPerBlade()}, al.getAllBladeForcesY()); } )
+        .def("get_all_blade_forces_z", [](ActuatorFarm& al){ return arr({al.getNumberOfTurbines(), al.getNumberOfBladesPerTurbine(), al.getNumberOfNodesPerBlade()}, al.getAllBladeForcesZ()); } )
+        // --- blade arrays of one turbine; py::array_t copies from the pointer on the CPU, so these must use the host getters, not the *Device ones ---
+        .def("get_turbine_blade_radii", [](ActuatorFarm& al, uint turbine){ return arr(al.getNumberOfNodesPerBlade(), al.getTurbineBladeRadii(turbine)); } , py::arg("turbine"))
+        .def("get_turbine_blade_coords_x", [](ActuatorFarm& al, uint turbine){ return arr({al.getNumberOfBladesPerTurbine(), al.getNumberOfNodesPerBlade()}, al.getTurbineBladeCoordsX(turbine)); }, py::arg("turbine") )
+        .def("get_turbine_blade_coords_y", [](ActuatorFarm& al, uint turbine){ return arr({al.getNumberOfBladesPerTurbine(), al.getNumberOfNodesPerBlade()}, al.getTurbineBladeCoordsY(turbine)); }, py::arg("turbine") )
+        .def("get_turbine_blade_coords_z", [](ActuatorFarm& al, uint turbine){ return arr({al.getNumberOfBladesPerTurbine(), al.getNumberOfNodesPerBlade()}, al.getTurbineBladeCoordsZ(turbine)); }, py::arg("turbine") )
+        .def("get_turbine_blade_velocities_x", [](ActuatorFarm& al, uint turbine){ return arr({al.getNumberOfBladesPerTurbine(), al.getNumberOfNodesPerBlade()}, al.getTurbineBladeVelocitiesX(turbine)); }, py::arg("turbine") )
+        .def("get_turbine_blade_velocities_y", [](ActuatorFarm& al, uint turbine){ return arr({al.getNumberOfBladesPerTurbine(), al.getNumberOfNodesPerBlade()}, al.getTurbineBladeVelocitiesY(turbine)); }, py::arg("turbine") )
+        .def("get_turbine_blade_velocities_z", [](ActuatorFarm& al, uint turbine){ return arr({al.getNumberOfBladesPerTurbine(), al.getNumberOfNodesPerBlade()}, al.getTurbineBladeVelocitiesZ(turbine)); }, py::arg("turbine") )
+        .def("get_turbine_blade_forces_x", [](ActuatorFarm& al, uint turbine){ return arr({al.getNumberOfBladesPerTurbine(), al.getNumberOfNodesPerBlade()}, al.getTurbineBladeForcesX(turbine)); }, py::arg("turbine") )
+        .def("get_turbine_blade_forces_y", [](ActuatorFarm& al, uint turbine){ return arr({al.getNumberOfBladesPerTurbine(), al.getNumberOfNodesPerBlade()}, al.getTurbineBladeForcesY(turbine)); }, py::arg("turbine") )
+        .def("get_turbine_blade_forces_z", [](ActuatorFarm& al, uint turbine){ return arr({al.getNumberOfBladesPerTurbine(), al.getNumberOfNodesPerBlade()}, al.getTurbineBladeForcesZ(turbine)); }, py::arg("turbine") )
+        // --- farm-wide *Device pointers, returned as integer addresses (nothing is copied or dereferenced here) ---
+        .def("get_all_blade_radii_device", [](ActuatorFarm& al) -> intptr_t { return reinterpret_cast<intptr_t>(al.getAllBladeRadiiDevice()); } )
+        .def("get_all_blade_coords_x_device", [](ActuatorFarm& al) -> intptr_t { return reinterpret_cast<intptr_t> (al.getAllBladeCoordsXDevice()); } )
+        .def("get_all_blade_coords_y_device", [](ActuatorFarm& al) -> intptr_t { return reinterpret_cast<intptr_t> (al.getAllBladeCoordsYDevice()); } )
+        .def("get_all_blade_coords_z_device", [](ActuatorFarm& al) -> intptr_t { return reinterpret_cast<intptr_t> (al.getAllBladeCoordsZDevice()); } )
+        .def("get_all_blade_velocities_x_device", [](ActuatorFarm& al) -> intptr_t { return reinterpret_cast<intptr_t> (al.getAllBladeVelocitiesXDevice()); } )
+        .def("get_all_blade_velocities_y_device", [](ActuatorFarm& al) -> intptr_t { return reinterpret_cast<intptr_t> (al.getAllBladeVelocitiesYDevice()); } )
+        .def("get_all_blade_velocities_z_device", [](ActuatorFarm& al) -> intptr_t { return reinterpret_cast<intptr_t> (al.getAllBladeVelocitiesZDevice()); } )
+        .def("get_all_blade_forces_x_device", [](ActuatorFarm& al) -> intptr_t { return reinterpret_cast<intptr_t> (al.getAllBladeForcesXDevice()); } )
+        .def("get_all_blade_forces_y_device", [](ActuatorFarm& al) -> intptr_t { return reinterpret_cast<intptr_t> (al.getAllBladeForcesYDevice()); } )
+        .def("get_all_blade_forces_z_device", [](ActuatorFarm& al) -> intptr_t { return reinterpret_cast<intptr_t> (al.getAllBladeForcesZDevice()); } )
+        // --- *Device pointers of one turbine, returned as integer addresses (nothing is copied or dereferenced here) ---
+        .def("get_turbine_blade_radii_device", [](ActuatorFarm& al, uint turbine) -> intptr_t { return reinterpret_cast<intptr_t>(al.getTurbineBladeRadiiDevice(turbine)); }, py::arg("turbine") )
+        .def("get_turbine_blade_coords_x_device", [](ActuatorFarm& al, uint turbine) -> intptr_t { return reinterpret_cast<intptr_t>(al.getTurbineBladeCoordsXDevice(turbine)); }, py::arg("turbine") )
+        .def("get_turbine_blade_coords_y_device", [](ActuatorFarm& al, uint turbine) -> intptr_t { return reinterpret_cast<intptr_t>(al.getTurbineBladeCoordsYDevice(turbine)); }, py::arg("turbine") )
+        .def("get_turbine_blade_coords_z_device", [](ActuatorFarm& al, uint turbine) -> intptr_t { return reinterpret_cast<intptr_t>(al.getTurbineBladeCoordsZDevice(turbine)); }, py::arg("turbine") )
+        .def("get_turbine_blade_velocities_x_device", [](ActuatorFarm& al, uint turbine) -> intptr_t { return reinterpret_cast<intptr_t>(al.getTurbineBladeVelocitiesXDevice(turbine)); }, py::arg("turbine") )
+        .def("get_turbine_blade_velocities_y_device", [](ActuatorFarm& al, uint turbine) -> intptr_t { return reinterpret_cast<intptr_t>(al.getTurbineBladeVelocitiesYDevice(turbine)); }, py::arg("turbine") )
+        .def("get_turbine_blade_velocities_z_device", [](ActuatorFarm& al, uint turbine) -> intptr_t { return reinterpret_cast<intptr_t>(al.getTurbineBladeVelocitiesZDevice(turbine)); }, py::arg("turbine") )
+        .def("get_turbine_blade_forces_x_device", [](ActuatorFarm& al, uint turbine) -> intptr_t { return reinterpret_cast<intptr_t>(al.getTurbineBladeForcesXDevice(turbine)); }, py::arg("turbine") )
+        .def("get_turbine_blade_forces_y_device", [](ActuatorFarm& al, uint turbine) -> intptr_t { return reinterpret_cast<intptr_t>(al.getTurbineBladeForcesYDevice(turbine)); }, py::arg("turbine") )
+        .def("get_turbine_blade_forces_z_device", [](ActuatorFarm& al, uint turbine) -> intptr_t { return reinterpret_cast<intptr_t>(al.getTurbineBladeForcesZDevice(turbine)); }, py::arg("turbine") )
+        // --- farm-wide setters; the numpy buffer pointer is forwarded as-is, its length is not checked here ---
+        .def("set_all_azimuths", [](ActuatorFarm& al, arr azimuths){ al.setAllAzimuths(static_cast<float *>(azimuths.request().ptr)); }, py::arg("azimuths"))
+        .def("set_all_yaws", [](ActuatorFarm& al, arr yaws){ al.setAllYaws(static_cast<float *>(yaws.request().ptr)); }, py::arg("yaws"))
+        .def("set_all_omegas", [](ActuatorFarm& al, arr omegas){ al.setAllOmegas(static_cast<float *>(omegas.request().ptr)); }, py::arg("omegas"))
+        // --- per-turbine scalar setters ---
+        .def("set_turbine_azimuth", &ActuatorFarm::setTurbineAzimuth, py::arg("turbine"), py::arg("azimuth"))
+        .def("set_turbine_yaw", &ActuatorFarm::setTurbineYaw, py::arg("turbine"), py::arg("yaw"))
+        .def("set_turbine_omega", &ActuatorFarm::setTurbineOmega, py::arg("turbine"), py::arg("omega"))
+        // --- blade-array setters (farm-wide, then per turbine); buffer lengths are likewise not checked here ---
+        .def("set_all_blade_coords", [](ActuatorFarm& al, arr coordsX, arr coordsY, arr coordsZ)
+        {
+            al.setAllBladeCoords(static_cast<float *>(coordsX.request().ptr), static_cast<float *>(coordsY.request().ptr), static_cast<float *>(coordsZ.request().ptr));
+        }, py::arg("blade_coords_x"), py::arg("blade_coords_y"), py::arg("blade_coords_z") )
+        .def("set_all_blade_velocities", [](ActuatorFarm& al, arr velocitiesX, arr velocitiesY, arr velocitiesZ)
+        {
+            al.setAllBladeVelocities(static_cast<float *>(velocitiesX.request().ptr), static_cast<float *>(velocitiesY.request().ptr), static_cast<float *>(velocitiesZ.request().ptr));
+        }, py::arg("blade_velocities_x"), py::arg("blade_velocities_y"), py::arg("blade_velocities_z") )
+        .def("set_all_blade_forces", [](ActuatorFarm& al, arr forcesX, arr forcesY, arr forcesZ)
+        {
+            al.setAllBladeForces(static_cast<float *>(forcesX.request().ptr), static_cast<float *>(forcesY.request().ptr), static_cast<float *>(forcesZ.request().ptr));
+        }, py::arg("blade_forces_x"), py::arg("blade_forces_y"), py::arg("blade_forces_z") )
+        .def("set_turbine_blade_coords", [](ActuatorFarm& al, uint turbine, arr coordsX, arr coordsY, arr coordsZ)
+        {
+            al.setTurbineBladeCoords(turbine, static_cast<float *>(coordsX.request().ptr), static_cast<float *>(coordsY.request().ptr), static_cast<float *>(coordsZ.request().ptr));
+        }, py::arg("turbine"), py::arg("blade_coords_x"), py::arg("blade_coords_y"), py::arg("blade_coords_z") )
+        .def("set_turbine_blade_velocities", [](ActuatorFarm& al, uint turbine, arr velocitiesX, arr velocitiesY, arr velocitiesZ)
+        {
+            al.setTurbineBladeVelocities(turbine, static_cast<float *>(velocitiesX.request().ptr), static_cast<float *>(velocitiesY.request().ptr), static_cast<float *>(velocitiesZ.request().ptr));
+        }, py::arg("turbine"), py::arg("blade_velocities_x"), py::arg("blade_velocities_y"), py::arg("blade_velocities_z") )
+        .def("set_turbine_blade_forces", [](ActuatorFarm& al, uint turbine, arr forcesX, arr forcesY, arr forcesZ)
+        {
+            al.setTurbineBladeForces(turbine, static_cast<float *>(forcesX.request().ptr), static_cast<float *>(forcesY.request().ptr), static_cast<float *>(forcesZ.request().ptr));
+        }, py::arg("turbine"), py::arg("blade_forces_x"), py::arg("blade_forces_y"), py::arg("blade_forces_z") )
+        .def("calc_blade_forces", &ActuatorFarm::calcBladeForces);
+    }
+}
\ No newline at end of file
diff --git a/pythonbindings/src/gpu/submodules/actuator_line.cpp b/pythonbindings/src/gpu/submodules/actuator_line.cpp
deleted file mode 100644
index 3207fadbc37df38e53e00adcb9a86f0b8e82ba98..0000000000000000000000000000000000000000
--- a/pythonbindings/src/gpu/submodules/actuator_line.cpp
+++ /dev/null
@@ -1,72 +0,0 @@
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <pybind11/numpy.h>
-#include <gpu/VirtualFluids_GPU/PreCollisionInteractor/ActuatorLine.h>
-#include <gpu/VirtualFluids_GPU/PreCollisionInteractor/PreCollisionInteractor.h>
-class PyActuatorLine : public ActuatorLine 
-{
-public:
-    using ActuatorLine::ActuatorLine; // Inherit constructors
-    void calcBladeForces() override 
-    { 
-        PYBIND11_OVERRIDE_NAME(void, ActuatorLine, "calc_blade_forces", calcBladeForces,); 
-    }
-};
-namespace actuator_line
-{
-    namespace py = pybind11;
-
-    void makeModule(py::module_ &parentModule)
-    {
-        using arr = py::array_t<float, py::array::c_style>;
-        
-        py::class_<ActuatorLine, PreCollisionInteractor, PyActuatorLine, std::shared_ptr<ActuatorLine>>(parentModule, "ActuatorLine", py::dynamic_attr())
-        .def(py::init<  const uint,
-                        const real,
-                        const uint,
-                        const real,
-                        real, real, real,
-                        const real,
-                        int,
-                        const real,
-                        const real>(), 
-                        "n_blades", 
-                        "density", 
-                        "n_blade_nodes", 
-                        "epsilon",
-                        "turbine_pos_x", "turbine_pos_y", "turbine_pos_z", 
-                        "diameter", 
-                        "level", 
-                        "delta_t", 
-                        "delta_x")
-        .def_property("omega", &ActuatorLine::getOmega, &ActuatorLine::setOmega)
-        .def_property("azimuth", &ActuatorLine::getAzimuth, &ActuatorLine::setAzimuth)
-        .def_property("yaw", &ActuatorLine::getYaw, &ActuatorLine::setYaw)
-        .def_property_readonly("n_blades", &ActuatorLine::getNBlades)
-        .def_property_readonly("n_blade_nodes", &ActuatorLine::getNBladeNodes)
-        .def_property_readonly("n_nodes", &ActuatorLine::getNNodes)
-        .def_property_readonly("n_indices", &ActuatorLine::getNIndices)
-        .def_property_readonly("density", &ActuatorLine::getDensity)
-        .def_property_readonly("position_x", &ActuatorLine::getPositionX)
-        .def_property_readonly("position_y", &ActuatorLine::getPositionY)
-        .def_property_readonly("position_z", &ActuatorLine::getPositionZ)
-        .def_property_readonly("position", [](ActuatorLine& al){ real position[3] = {al.getPositionX(), al.getPositionY(), al.getPositionZ()}; return arr(3, position); } )
-        .def("get_radii", [](ActuatorLine& al){ return arr(al.getNBladeNodes(), al.getBladeRadii()); } )
-        .def("get_blade_coords_x", [](ActuatorLine& al){ return arr({al.getNBlades(), al.getNBladeNodes()}, al.getBladeCoordsX()); } )
-        .def("get_blade_coords_y", [](ActuatorLine& al){ return arr({al.getNBlades(), al.getNBladeNodes()}, al.getBladeCoordsY()); } )
-        .def("get_blade_coords_z", [](ActuatorLine& al){ return arr({al.getNBlades(), al.getNBladeNodes()}, al.getBladeCoordsZ()); } )        
-        .def("get_blade_velocities_x", [](ActuatorLine& al){ return arr({al.getNBlades(), al.getNBladeNodes()}, al.getBladeVelocitiesX()); } )
-        .def("get_blade_velocities_y", [](ActuatorLine& al){ return arr({al.getNBlades(), al.getNBladeNodes()}, al.getBladeVelocitiesY()); } )
-        .def("get_blade_velocities_z", [](ActuatorLine& al){ return arr({al.getNBlades(), al.getNBladeNodes()}, al.getBladeVelocitiesZ()); } )
-        .def("get_blade_forces_x", [](ActuatorLine& al){ return arr({al.getNBlades(), al.getNBladeNodes()}, al.getBladeForcesX()); } )
-        .def("get_blade_forces_y", [](ActuatorLine& al){ return arr({al.getNBlades(), al.getNBladeNodes()}, al.getBladeForcesY()); } )
-        .def("get_blade_forces_z", [](ActuatorLine& al){ return arr({al.getNBlades(), al.getNBladeNodes()}, al.getBladeForcesZ()); } )
-        .def("set_blade_coords", [](ActuatorLine& al, arr coordsX, arr coordsY, arr coordsZ){ 
-            al.setBladeCoords(static_cast<float *>(coordsX.request().ptr), static_cast<float *>(coordsY.request().ptr), static_cast<float *>(coordsZ.request().ptr)); } )
-        .def("set_blade_velocities", [](ActuatorLine& al, arr velocitiesX, arr velocitiesY, arr velocitiesZ){ 
-            al.setBladeVelocities(static_cast<float *>(velocitiesX.request().ptr), static_cast<float *>(velocitiesY.request().ptr), static_cast<float *>(velocitiesZ.request().ptr)); } )
-        .def("set_blade_forces", [](ActuatorLine& al, arr forcesX, arr forcesY, arr forcesZ){ 
-            al.setBladeForces(static_cast<float *>(forcesX.request().ptr), static_cast<float *>(forcesY.request().ptr), static_cast<float *>(forcesZ.request().ptr)); } )
-        .def("calc_blade_forces", &ActuatorLine::calcBladeForces);
-    }
-}
\ No newline at end of file
diff --git a/pythonbindings/src/gpu/submodules/boundary_conditions.cpp b/pythonbindings/src/gpu/submodules/boundary_conditions.cpp
index 8f941a8705c225275d25291205ebdaeef8de5c9e..865817bb16f7b164c40bdc066645fb2e1f1c842e 100644
--- a/pythonbindings/src/gpu/submodules/boundary_conditions.cpp
+++ b/pythonbindings/src/gpu/submodules/boundary_conditions.cpp
@@ -1,5 +1,38 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file boundary_conditions.cpp
+//! \ingroup submodules
+//! \author Henry Korb
+//=======================================================================================
 #include <pybind11/pybind11.h>
 #include <gpu/GridGenerator/grid/BoundaryConditions/Side.h>
+#include "gpu/VirtualFluids_GPU/Factories/BoundaryConditionFactory.h"
 
 namespace boundary_conditions
 {
@@ -14,7 +47,59 @@ namespace boundary_conditions
         .value("PY", SideType::PY)
         .value("MZ", SideType::MZ)
         .value("PZ", SideType::PZ)
-        .value("GEOMETRY", SideType::GEOMETRY)
-        .export_values();
+        .value("GEOMETRY", SideType::GEOMETRY);
+
+        py::class_<BoundaryConditionFactory>(parentModule, "BoundaryConditionFactory")
+        .def(py::init<>())
+        .def("set_velocity_boundary_condition", &BoundaryConditionFactory::setVelocityBoundaryCondition, py::arg("boundary_condition_type"))
+        .def("set_no_slip_boundary_condition", &BoundaryConditionFactory::setNoSlipBoundaryCondition, py::arg("boundary_condition_type"))
+        .def("set_slip_boundary_condition", &BoundaryConditionFactory::setSlipBoundaryCondition, py::arg("boundary_condition_type"))
+        .def("set_pressure_boundary_condition", &BoundaryConditionFactory::setPressureBoundaryCondition, py::arg("boundary_condition_type"))
+        .def("set_stress_boundary_condition", &BoundaryConditionFactory::setStressBoundaryCondition, py::arg("boundary_condition_type"))
+        .def("set_precursor_boundary_condition", &BoundaryConditionFactory::setPrecursorBoundaryCondition, py::arg("boundary_condition_type"))
+        .def("set_geometry_boundary_condition", &BoundaryConditionFactory::setGeometryBoundaryCondition, py::arg("boundary_condition_type"));
+
+        py::enum_<BoundaryConditionFactory::VelocityBC>(parentModule, "VelocityBC")
+        .value("VelocitySimpleBounceBackCompressible", BoundaryConditionFactory::VelocityBC::VelocitySimpleBounceBackCompressible)
+        .value("VelocityIncompressible", BoundaryConditionFactory::VelocityBC::VelocityIncompressible)
+        .value("VelocityCompressible", BoundaryConditionFactory::VelocityBC::VelocityCompressible)
+        .value("VelocityAndPressureCompressible", BoundaryConditionFactory::VelocityBC::VelocityAndPressureCompressible)
+        .value("NotSpecified", BoundaryConditionFactory::VelocityBC::NotSpecified);
+
+
+        py::enum_<BoundaryConditionFactory::NoSlipBC>(parentModule, "NoSlipBC")
+        .value("NoSlipImplicitBounceBack", BoundaryConditionFactory::NoSlipBC::NoSlipImplicitBounceBack)
+        .value("NoSlipBounceBack", BoundaryConditionFactory::NoSlipBC::NoSlipBounceBack)
+        .value("NoSlipIncompressible", BoundaryConditionFactory::NoSlipBC::NoSlipIncompressible)
+        .value("NoSlipCompressible", BoundaryConditionFactory::NoSlipBC::NoSlipCompressible)
+        .value("NoSlip3rdMomentsCompressible", BoundaryConditionFactory::NoSlipBC::NoSlip3rdMomentsCompressible);
+
+        py::enum_<BoundaryConditionFactory::SlipBC>(parentModule, "SlipBC")
+        .value("SlipIncompressible", BoundaryConditionFactory::SlipBC::SlipIncompressible)
+        .value("SlipCompressible", BoundaryConditionFactory::SlipBC::SlipCompressible)
+        .value("SlipBounceBack", BoundaryConditionFactory::SlipBC::SlipBounceBack)
+        .value("SlipCompressibleTurbulentViscosity", BoundaryConditionFactory::SlipBC::SlipCompressibleTurbulentViscosity)
+        .value("SlipPressureCompressibleTurbulentViscosity", BoundaryConditionFactory::SlipBC::SlipPressureCompressibleTurbulentViscosity)
+        .value("NotSpecified", BoundaryConditionFactory::SlipBC::NotSpecified);
+
+        py::enum_<BoundaryConditionFactory::PressureBC>(parentModule, "PressureBC")
+        .value("PressureEquilibrium", BoundaryConditionFactory::PressureBC::PressureEquilibrium)
+        .value("PressureEquilibrium2", BoundaryConditionFactory::PressureBC::PressureEquilibrium2)
+        .value("PressureNonEquilibriumIncompressible", BoundaryConditionFactory::PressureBC::PressureNonEquilibriumIncompressible)
+        .value("PressureNonEquilibriumCompressible", BoundaryConditionFactory::PressureBC::PressureNonEquilibriumCompressible)
+        .value("OutflowNonReflective", BoundaryConditionFactory::PressureBC::OutflowNonReflective)
+        .value("OutflowNonReflectivePressureCorrection", BoundaryConditionFactory::PressureBC::OutflowNonReflectivePressureCorrection)
+        .value("NotSpecified", BoundaryConditionFactory::PressureBC::NotSpecified);
+
+        py::enum_<BoundaryConditionFactory::StressBC>(parentModule, "StressBC")
+        .value("StressCompressible", BoundaryConditionFactory::StressBC::StressCompressible)
+        .value("StressBounceBack", BoundaryConditionFactory::StressBC::StressBounceBack)
+        .value("StressPressureBounceBack", BoundaryConditionFactory::StressBC::StressPressureBounceBack)
+        .value("NotSpecified", BoundaryConditionFactory::StressBC::NotSpecified);
+
+        py::enum_<BoundaryConditionFactory::PrecursorBC>(parentModule, "PrecursorBC")
+        .value("VelocityPrecursor", BoundaryConditionFactory::PrecursorBC::VelocityPrecursor)
+        .value("DistributionsPrecursor", BoundaryConditionFactory::PrecursorBC::DistributionsPrecursor)
+        .value("NotSpecified", BoundaryConditionFactory::PrecursorBC::NotSpecified);
     }
 }
\ No newline at end of file
diff --git a/pythonbindings/src/gpu/submodules/communicator.cpp b/pythonbindings/src/gpu/submodules/communicator.cpp
index edb36e2c2f774903590a16a0b406c721662827b1..26a57061933fbdbfe3447ec89eeb07116a9b974b 100644
--- a/pythonbindings/src/gpu/submodules/communicator.cpp
+++ b/pythonbindings/src/gpu/submodules/communicator.cpp
@@ -1,3 +1,35 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file communicator.cpp
+//! \ingroup submodules
+//! \author Henry Korb
+//=======================================================================================
 #include <pybind11/pybind11.h>
 #include <gpu/VirtualFluids_GPU/Communication/Communicator.h>
 
@@ -8,7 +40,7 @@ namespace communicator
     void makeModule(py::module_ &parentModule)
     {
         py::class_<vf::gpu::Communicator, std::unique_ptr<vf::gpu::Communicator, py::nodelete>>(parentModule, "Communicator")
-        .def("get_instance", &vf::gpu::Communicator::getInstance, py::return_value_policy::reference)
+        .def_static("get_instance", &vf::gpu::Communicator::getInstance, py::return_value_policy::reference) // bound as a static method: callable on the Python class without an instance
         .def("get_number_of_process", &vf::gpu::Communicator::getNummberOfProcess)
         .def("get_pid", &vf::gpu::Communicator::getPID);
     }
diff --git a/pythonbindings/src/gpu/submodules/cuda_memory_manager.cpp b/pythonbindings/src/gpu/submodules/cuda_memory_manager.cpp
index bf27080cb3cd050343ba42b0571827ed58870cfd..bbff4832cb73f47e3d1a5a6abd78e21da2473deb 100644
--- a/pythonbindings/src/gpu/submodules/cuda_memory_manager.cpp
+++ b/pythonbindings/src/gpu/submodules/cuda_memory_manager.cpp
@@ -1,3 +1,35 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file cuda_memory_manager.cpp
+//! \ingroup submodules
+//! \author Henry Korb
+//=======================================================================================
 #include <pybind11/pybind11.h>
 #include <gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h>
 #include <gpu/VirtualFluids_GPU/Parameter/Parameter.h>
@@ -10,6 +42,6 @@ namespace cuda_memory_manager
     void makeModule(py::module_ &parentModule)
     {
         py::class_<CudaMemoryManager, std::shared_ptr<CudaMemoryManager>>(parentModule, "CudaMemoryManager")
-        .def(py::init<std::shared_ptr<Parameter>>(), "parameter");
+        .def(py::init<std::shared_ptr<Parameter>>(), py::arg("parameter")); // py::arg names the keyword argument; a bare string here is taken by pybind11 as the docstring
     }
 }
\ No newline at end of file
diff --git a/pythonbindings/src/gpu/submodules/grid_generator.cpp b/pythonbindings/src/gpu/submodules/grid_generator.cpp
index 579c06c4e00cae9646ced8b554d71631eeb7e793..3e9fb5655e26ffa6053a205da5a3e3f0f2ecd49f 100644
--- a/pythonbindings/src/gpu/submodules/grid_generator.cpp
+++ b/pythonbindings/src/gpu/submodules/grid_generator.cpp
@@ -1,4 +1,37 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file grid_generator.cpp
+//! \ingroup submodules
+//! \author Henry Korb, Henrik Asmuth
+//=======================================================================================
 #include <pybind11/pybind11.h>
+#include "gpu/GridGenerator/utilities/communication.h"
 #include "gpu/GridGenerator/geometries/Object.h"
 #include "gpu/GridGenerator/geometries/BoundingBox/BoundingBox.h"
 #include "gpu/GridGenerator/geometries/Conglomerate/Conglomerate.h"
@@ -17,51 +50,63 @@ namespace grid_generator
     {  
         py::module gridGeneratorModule = parentModule.def_submodule("grid_generator");
 
+        //TODO: expose the CommunicationDirections enum to Python (the draft binding below is still commented out)
+        // py::enum_<CommunicationDirections>(gridGeneratorModule, "CommunicationDirections")
+        // .value("MX", CommunicationDirections::MX)
+        // .value("PX", CommunicationDirections::PX)
+        // .value("MY", CommunicationDirections::MY)
+        // .value("PY", CommunicationDirections::PY)
+        // .value("MZ", CommunicationDirections::MZ)
+        // .value("PZ", CommunicationDirections::PZ);
+
         py::class_<GridFactory, std::shared_ptr<GridFactory>>(gridGeneratorModule, "GridFactory")
-        .def("make", &GridFactory::make, py::return_value_policy::reference);
+        .def_static("make", &GridFactory::make, py::return_value_policy::reference);
 
-        py::class_<BoundingBox>(gridGeneratorModule, "BoundingBox")
-        .def(py::init<real, real, real, real, real, real>(),"min_x","max_x","min_y","max_y","min_z","max_z");
+        py::class_<BoundingBox, std::shared_ptr<BoundingBox>>(gridGeneratorModule, "BoundingBox")
+        .def(py::init<real, real, real, real, real, real>(), py::arg("min_x"), py::arg("max_x"), py::arg("min_y"), py::arg("max_y"), py::arg("min_z"), py::arg("max_z"));
 
         py::class_<Object, std::shared_ptr<Object>>(gridGeneratorModule, "Object");
         
         py::class_<Conglomerate, Object, std::shared_ptr<Conglomerate>>(gridGeneratorModule, "Conglomerate")
-        .def("make_shared", &Conglomerate::makeShared, py::return_value_policy::reference)
-        .def("add", &Conglomerate::add)
-        .def("subtract", &Conglomerate::subtract);
+        .def_static("make_shared", &Conglomerate::makeShared, py::return_value_policy::reference)
+        .def("add", &Conglomerate::add, py::arg("object"))
+        .def("subtract", &Conglomerate::subtract, py::arg("object"));
 
         py::class_<Cuboid, Object, std::shared_ptr<Cuboid>>(gridGeneratorModule, "Cuboid")
         .def(py::init<const double&, const double&, const double&, const double&, const double&, const double&>(),
-                        "min_x1", "min_x2", "min_x3", "max_x1", "max_x2", "max_x3");
+                        py::arg("min_x1"), py::arg("min_x2"), py::arg("min_x3"), py::arg("max_x1"), py::arg("max_x2"), py::arg("max_x3"));
 
         py::class_<Sphere, Object, std::shared_ptr<Sphere>>(gridGeneratorModule, "Sphere")
-        .def("make_shared", &Sphere::makeShared, py::return_value_policy::reference);
+        .def_static("make_shared", &Sphere::makeShared, py::return_value_policy::reference);
 
         py::class_<TriangularMesh, Object, std::shared_ptr<TriangularMesh>>(gridGeneratorModule, "TriangularMesh")
-        .def("make", &TriangularMesh::make, py::return_value_policy::reference);
+        .def_static("make", &TriangularMesh::make, py::return_value_policy::reference);
 
         py::class_<GridBuilder, std::shared_ptr<GridBuilder>>(gridGeneratorModule, "GridBuilder")
-        .def("get_number_of_grid_levels", &GridBuilder::getNumberOfGridLevels)
-        .def("get_grid", &GridBuilder::getGrid);
+        .def("get_number_of_grid_levels", &GridBuilder::getNumberOfGridLevels);
 
         py::class_<LevelGridBuilder, GridBuilder, std::shared_ptr<LevelGridBuilder>>(gridGeneratorModule, "LevelGridBuilder")
-        .def("get_grid", py::overload_cast<int, int>(&LevelGridBuilder::getGrid))
-        .def("set_slip_boundary_condition", &LevelGridBuilder::setSlipBoundaryCondition)
-        .def("set_velocity_boundary_condition", &LevelGridBuilder::setVelocityBoundaryCondition)
-        .def("set_pressure_boundary_condition", &LevelGridBuilder::setPressureBoundaryCondition)
-        .def("set_periodic_boundary_condition", &LevelGridBuilder::setPeriodicBoundaryCondition)
-        .def("set_no_slip_boundary_condition", &LevelGridBuilder::setNoSlipBoundaryCondition)
-        .def("set_stress_boundary_condition", &LevelGridBuilder::setStressBoundaryCondition);
+        .def("set_slip_boundary_condition", &LevelGridBuilder::setSlipBoundaryCondition, py::arg("side_type"), py::arg("normal_x"), py::arg("normal_y"), py::arg("normal_z"))
+        .def("set_velocity_boundary_condition", &LevelGridBuilder::setVelocityBoundaryCondition, py::arg("side_type"), py::arg("vx"), py::arg("vy"), py::arg("vz"))
+        .def("set_pressure_boundary_condition", &LevelGridBuilder::setPressureBoundaryCondition, py::arg("side_type"), py::arg("rho"))
+        .def("set_periodic_boundary_condition", &LevelGridBuilder::setPeriodicBoundaryCondition, py::arg("periodic_x"), py::arg("periodic_y"), py::arg("periodic_z"))
+        .def("set_no_slip_boundary_condition", &LevelGridBuilder::setNoSlipBoundaryCondition, py::arg("side_type"))
+        .def("set_precursor_boundary_condition", &LevelGridBuilder::setPrecursorBoundaryCondition, py::arg("side_type"), py::arg("file_collection"), py::arg("n_t_read"), py::arg("velocity_x")=0.0f, py::arg("velocity_y")=0.0f, py::arg("velocity_z")=0.0f, py::arg("file_level_to_grid_level_map")=std::vector<uint>())
+        .def("set_stress_boundary_condition", &LevelGridBuilder::setStressBoundaryCondition, py::arg("side_type"), py::arg("normal_x"), py::arg("normal_y"), py::arg("normal_z"), py::arg("sampling_offset"), py::arg("z0"), py::arg("dx"));
 
         py::class_<MultipleGridBuilder, LevelGridBuilder, std::shared_ptr<MultipleGridBuilder>>(gridGeneratorModule, "MultipleGridBuilder")
-        .def("make_shared", &MultipleGridBuilder::makeShared, py::return_value_policy::reference)
-        .def("add_coarse_grid", &MultipleGridBuilder::addCoarseGrid)
-        .def("add_grid", py::overload_cast<Object*>(&MultipleGridBuilder::addGrid))
-        .def("add_grid", py::overload_cast<Object*, uint>(&MultipleGridBuilder::addGrid))
-        .def("add_geometry", py::overload_cast<Object*>(&MultipleGridBuilder::addGeometry))
-        .def("add_geometry", py::overload_cast<Object*, uint>(&MultipleGridBuilder::addGeometry))
+        .def_static("make_shared", &MultipleGridBuilder::makeShared, py::return_value_policy::reference, py::arg("grid_factory"))
+        .def("add_coarse_grid", &MultipleGridBuilder::addCoarseGrid, py::arg("start_x"), py::arg("start_y"), py::arg("start_z"), py::arg("end_x"), py::arg("end_y"), py::arg("end_z"), py::arg("delta"))
+        .def("add_grid", py::overload_cast<Object*>(&MultipleGridBuilder::addGrid), py::arg("grid_shape"))
+        .def("add_grid", py::overload_cast<Object*, uint>(&MultipleGridBuilder::addGrid), py::arg("grid_shape"), py::arg("level_fine"))
+        .def("add_geometry", py::overload_cast<Object*>(&MultipleGridBuilder::addGeometry), py::arg("solid_object"))
+        .def("add_geometry", py::overload_cast<Object*, uint>(&MultipleGridBuilder::addGeometry), py::arg("solid_object"), py::arg("level"))
         .def("get_number_of_levels", &MultipleGridBuilder::getNumberOfLevels)
-        .def("build_grids", &MultipleGridBuilder::buildGrids);
+        .def("build_grids", &MultipleGridBuilder::buildGrids, py::arg("lbm_or_gks"), py::arg("enable_thin_walls"))
+        .def("set_subdomain_box", &MultipleGridBuilder::setSubDomainBox, py::arg("bounding_box"))
+        .def("find_communication_indices", &MultipleGridBuilder::findCommunicationIndices)
+        .def("set_communication_process", &MultipleGridBuilder::setCommunicationProcess)
+        .def("set_number_of_layers", &MultipleGridBuilder::setNumberOfLayers, py::arg("number_of_layers_fine"), py::arg("number_of_layers_between_levels"));
 
         return gridGeneratorModule;
     }
diff --git a/pythonbindings/src/gpu/submodules/grid_provider.cpp b/pythonbindings/src/gpu/submodules/grid_provider.cpp
index 02ff273e2cd1a2022943e19c9a48a447d9dfe54b..717e9d5cd82100636a5398c09662a0895ce8fb56 100644
--- a/pythonbindings/src/gpu/submodules/grid_provider.cpp
+++ b/pythonbindings/src/gpu/submodules/grid_provider.cpp
@@ -1,8 +1,36 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file grid_provider.cpp
+//! \author Henry Korb
+//=======================================================================================
 #include <pybind11/pybind11.h>
 #include "gpu/VirtualFluids_GPU/DataStructureInitializer/GridProvider.h"
-// #include <gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h>
-// #include <gpu/VirtualFluids_GPU/Parameter/Parameter.h>
-// #include "gpu/GridGenerator/grid/GridBuilder/GridBuilder.h"
 
 namespace grid_provider
 {
@@ -11,6 +39,6 @@ namespace grid_provider
     void makeModule(py::module_ &parentModule)
     {
         py::class_<GridProvider, std::shared_ptr<GridProvider>>(parentModule, "GridProvider")
-        .def("make_grid_generator", &GridProvider::makeGridGenerator, py::return_value_policy::reference);
+        .def_static("make_grid_generator", &GridProvider::makeGridGenerator, py::return_value_policy::reference, py::arg("builder"), py::arg("para"), py::arg("cuda_memory_manager"), py::arg("communicator")); // bound as a static method with keyword-named arguments
     }
 }
\ No newline at end of file
diff --git a/pythonbindings/src/gpu/submodules/grid_scaling_factory.cpp b/pythonbindings/src/gpu/submodules/grid_scaling_factory.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a3a572875a4695871c482a4308acab4214dbb481
--- /dev/null
+++ b/pythonbindings/src/gpu/submodules/grid_scaling_factory.cpp
@@ -0,0 +1,52 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file grid_scaling_factory.cpp
+//! \ingroup submodules
+//! \author Henry Korb
+//=======================================================================================
+#include <pybind11/pybind11.h>
+#include <gpu/VirtualFluids_GPU/Factories/GridScalingFactory.h>
+
+namespace grid_scaling_factory
+{
+    namespace py = pybind11;
+    //! Registers the GridScaling enum and the GridScalingFactory class on \p parentModule.
+    void makeModule(py::module_ &parentModule)
+    {
+        // Enum first: pybind11 renders signatures when def() runs, so types used by later bindings should already be registered (set_scaling_factory presumably takes a GridScaling).
+        py::enum_<GridScalingFactory::GridScaling>(parentModule, "GridScaling")
+        .value("ScaleCompressible", GridScalingFactory::GridScaling::ScaleCompressible)
+        .value("ScaleRhoSq", GridScalingFactory::GridScaling::ScaleRhoSq)
+        .value("NotSpecified", GridScalingFactory::GridScaling::NotSpecified);
+
+        py::class_<GridScalingFactory, std::shared_ptr<GridScalingFactory>>(parentModule, "GridScalingFactory")
+        .def(py::init<>())
+        .def("set_scaling_factory", &GridScalingFactory::setScalingFactory, py::arg("scaling_type"));
+    }
+}
\ No newline at end of file
diff --git a/pythonbindings/src/gpu/submodules/parameter.cpp b/pythonbindings/src/gpu/submodules/parameter.cpp
index 7b4e67f101e3928abbd4262557864ea1d0f45b02..a7c42223e6a5bfa3caa89c0879e4133fc4123ad0 100644
--- a/pythonbindings/src/gpu/submodules/parameter.cpp
+++ b/pythonbindings/src/gpu/submodules/parameter.cpp
@@ -1,10 +1,46 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file parameter.cpp
+//! \ingroup submodules
+//! \author Henry Korb
+//=======================================================================================
 #include <pybind11/pybind11.h>
 #include <pybind11/functional.h>
 #include <pybind11/stl.h>
 #include <gpu/VirtualFluids_GPU/Parameter/Parameter.h>
+#include "lbm/constants/NumericConstants.h"
 #include <basics/config/ConfigurationFile.h>
 #include <gpu/VirtualFluids_GPU/PreCollisionInteractor/PreCollisionInteractor.h>
 
+
+using namespace vf::lbm::constant;
+
 namespace parameter
 {
     namespace py = pybind11;
@@ -13,42 +49,44 @@ namespace parameter
     {
         py::class_<Parameter, std::shared_ptr<Parameter>>(parentModule, "Parameter")
         .def(py::init<
-                const vf::basics::ConfigurationFile&, 
                 int,
-                int
-                >(),
-                "config_data",
-                "number_of_processes",
-                "my_ID")
-        .def("set_forcing", &Parameter::setForcing)
-        .def("set_diff_on", &Parameter::setDiffOn)
-        .def("set_comp_on", &Parameter::setCompOn)
-        .def("set_max_level", &Parameter::setMaxLevel)
-        .def("set_t_end", &Parameter::setTEnd)
-        .def("set_t_out", &Parameter::setTOut)
-        .def("set_t_start_out", &Parameter::setTStartOut)
-        .def("set_timestep_of_coarse_level", &Parameter::setTimestepOfCoarseLevel)
-        .def("set_output_path", &Parameter::setOutputPath)
-        .def("set_output_prefix", &Parameter::setOutputPrefix)
-        .def("set_f_name", &Parameter::setFName)
-        .def("set_print_files", &Parameter::setPrintFiles)
-        .def("set_temperature_init", &Parameter::setTemperatureInit)
-        .def("set_temperature_BC", &Parameter::setTemperatureBC)
-        .def("set_viscosity", &Parameter::setViscosity)
-        .def("set_velocity", &Parameter::setVelocity)
-        .def("set_viscosity_ratio", &Parameter::setViscosityRatio)
-        .def("set_velocity_ratio", &Parameter::setVelocityRatio)
-        .def("set_density_ratio", &Parameter::setDensityRatio)
-        .def("set_devices", &Parameter::setDevices)
-        .def("set_is_body_force", &Parameter::setIsBodyForce)
-        .def("set_use_AMD", &Parameter::setUseAMD)
-        .def("set_use_Wale", &Parameter::setUseWale)
-        .def("set_SGS_constant", &Parameter::setSGSConstant)
-        .def("set_main_kernel", &Parameter::setMainKernel)
-        .def("set_AD_kernel", &Parameter::setADKernel)
-        .def("set_use_AMD", &Parameter::setUseAMD)
-        .def("set_use_Wale", &Parameter::setUseWale)
-        .def("set_SGS_constant", &Parameter::setSGSConstant)
+                int,
+                std::optional<const vf::basics::ConfigurationFile*>>(),
+                py::arg("number_of_processes"),
+                py::arg("my_ID"),
+                py::arg("config_data"))
+        .def(py::init<int, int>(),
+                py::arg("number_of_processes"),
+                py::arg("my_ID"))
+        .def(py::init<const vf::basics::ConfigurationFile*>(), py::arg("config_data"))
+        .def("set_forcing", &Parameter::setForcing, py::arg("forcing_x"), py::arg("forcing_y"), py::arg("forcing_z"))
+        .def("set_quadric_limiters", &Parameter::setQuadricLimiters, py::arg("quadric_limiter_p"), py::arg("quadric_limiter_m"), py::arg("quadric_limiter_d"))
+        .def("set_diff_on", &Parameter::setDiffOn, py::arg("is_diff"))
+        .def("set_comp_on", &Parameter::setCompOn, py::arg("is_comp"))
+        .def("set_max_level", &Parameter::setMaxLevel, py::arg("number_of_levels"))
+        .def("set_timestep_end", &Parameter::setTimestepEnd, py::arg("tend"))
+        .def("set_timestep_out", &Parameter::setTimestepOut, py::arg("tout"))
+        .def("set_timestep_start_out", &Parameter::setTimestepStartOut, py::arg("t_start_out"))
+        .def("set_timestep_of_coarse_level", &Parameter::setTimestepOfCoarseLevel, py::arg("timestep"))
+        .def("set_calc_turbulence_intensity", &Parameter::setCalcTurbulenceIntensity, py::arg("calc_velocity_and_fluctuations"))
+        .def("set_output_path", &Parameter::setOutputPath, py::arg("o_path"))
+        .def("set_output_prefix", &Parameter::setOutputPrefix, py::arg("o_prefix"))
+        .def("set_print_files", &Parameter::setPrintFiles, py::arg("print_files"))
+        .def("set_temperature_init", &Parameter::setTemperatureInit, py::arg("temp"))
+        .def("set_temperature_BC", &Parameter::setTemperatureBC, py::arg("temp_bc"))
+        .def("set_viscosity_LB", &Parameter::setViscosityLB, py::arg("viscosity"))
+        .def("set_velocity_LB", &Parameter::setVelocityLB, py::arg("velocity"))
+        .def("set_viscosity_ratio", &Parameter::setViscosityRatio, py::arg("viscosity_ratio"))
+        .def("set_velocity_ratio", &Parameter::setVelocityRatio, py::arg("velocity_ratio"))
+        .def("set_density_ratio", &Parameter::setDensityRatio, py::arg("density_ratio"))
+        .def("set_devices", &Parameter::setDevices, py::arg("devices"))
+        .def("set_max_dev", &Parameter::setMaxDev, py::arg("max_dev"))
+        .def("set_is_body_force", &Parameter::setIsBodyForce, py::arg("is_body_force"))
+        .def("set_use_streams", &Parameter::setUseStreams, py::arg("use_streams"))
+        .def("set_main_kernel", &Parameter::setMainKernel, py::arg("kernel"))
+        .def("set_AD_kernel", &Parameter::setADKernel, py::arg("ad_kernel"))
+        .def("set_has_wall_model_monitor", &Parameter::setHasWallModelMonitor, py::arg("has_wall_monitor"))
+        .def("set_outflow_pressure_correction_factor", &Parameter::setOutflowPressureCorrectionFactor, py::arg("correction_factor"))
         .def("set_initial_condition", [](Parameter &para, std::function<std::vector<float>(real, real, real)> &init_func)
         {
             para.setInitialCondition([init_func](real coordX, real coordY, real coordZ, real& rho, real& vx, real& vy, real& vz)
@@ -59,9 +97,46 @@ namespace parameter
                 vy = values[2];
                 vz = values[3];
             });
-        })
-        .def("add_actuator", &Parameter::addActuator)
-        .def("add_probe", &Parameter::addProbe)
+        }, py::arg("init_func"))
+        .def("set_initial_condition_uniform", [](Parameter &para, real velocity_x, real velocity_y, real velocity_z)
+        { // installs an initial condition that ignores the coordinates and writes the same state for every call
+            para.setInitialCondition([velocity_x, velocity_y, velocity_z](real coordX, real coordY, real coordZ, real& rho, real& vx, real& vy, real& vz) // must capture values explicitly! (captured by value; presumably because the callback is used after this binding lambda has returned - confirm)
+            { // results are written through the rho/vx/vy/vz reference parameters
+                rho = c0o1; // project constant; presumably 0 ("0 over 1") - confirm
+                vx = velocity_x;
+                vy = velocity_y;
+                vz = velocity_z;
+            });
+        }, py::arg("velocity_x"), py::arg("velocity_y"), py::arg("velocity_z")) // Python keyword names for the three velocity components
+        .def("set_initial_condition_log_law", [](Parameter &para, real u_star, real z0, real velocityRatio)
+        { // installs an initial condition with a logarithmic vx profile over coordZ; rho, vy and vz are set to c0o1
+            para.setInitialCondition(
+                [u_star, z0, velocityRatio](real coordX, real coordY, real coordZ, real& rho, real& vx, real& vy, real& vz)
+                { // results are written through the rho/vx/vy/vz reference parameters
+                    coordZ = coordZ > c0o1 ? coordZ : c0o1; // clamp coordZ from below at c0o1 before it enters the logarithm
+
+                    rho = c0o1;
+                    vx  = u_star/c4o10 * log(coordZ/z0+c1o1) / velocityRatio; // c4o10 is presumably the von Karman constant 0.4 - confirm
+                    vy = c0o1;
+                    vz = c0o1;
+                }
+            );
+        }, py::arg("u_star"), py::arg("z0"), py::arg("velocity_ratio")) // velocityRatio is exposed to Python as velocity_ratio
+        .def("set_initial_condition_perturbed_log_law", [](Parameter &para, real u_star, real z0, real L_x, real L_z, real H, real velocityRatio)
+        { // installs an initial condition made of the log-law vx profile plus sinusoidal terms in all three velocity components
+            para.setInitialCondition(
+                [u_star, z0, L_x, L_z, H, velocityRatio](real coordX, real coordY, real coordZ, real& rho, real& vx, real& vy, real& vz)
+                { // results are written through the rho/vx/vy/vz reference parameters
+                    coordZ = coordZ > c0o1 ? coordZ : c0o1; // clamp coordZ from below at c0o1 before it enters the logarithm
+                    rho = c0o1;
+                    vx  = (u_star/c4o10 * log(coordZ/z0+c1o1) + c2o1*sin(cPi*c16o1*coordX/L_x)*sin(cPi*c8o1*coordZ/H)/(pow(coordZ/H,c2o1)+c1o1)) / velocityRatio; // log-law term plus a sine product in coordX and coordZ, damped by (coordZ/H)^2 + 1
+                    vy  = c2o1*sin(cPi*c16o1*coordX/L_x)*sin(cPi*c8o1*coordZ/H)/(pow(coordZ/H,c2o1)+c1o1) / velocityRatio; // same sine term as in vx, without the log-law part
+                    vz  = c8o1*u_star/c4o10*(sin(cPi*c8o1*coordY/H)*sin(cPi*c8o1*coordZ/H)+sin(cPi*c8o1*coordX/L_x))/(pow(c1o2*L_z-coordZ, c2o1)+c1o1) / velocityRatio; // NOTE(review): the coordY sine is scaled by H and L_z appears only in this denominator - confirm intended
+                }
+            );
+        }, py::arg("u_star"), py::arg("z0"), py::arg("length_x"), py::arg("length_z"), py::arg("height"), py::arg("velocity_ratio")) // L_x, L_z and H are exposed to Python as length_x, length_z and height
+        .def("add_actuator", &Parameter::addActuator, py::arg("actuator"))
+        .def("add_probe", &Parameter::addProbe, py::arg("probe"))
         .def("get_output_path", &Parameter::getOutputPath)
         .def("get_output_prefix", &Parameter::getOutputPrefix)
         .def("get_velocity", &Parameter::getVelocity)
@@ -70,11 +145,9 @@ namespace parameter
         .def("get_viscosity_ratio", &Parameter::getViscosityRatio)
         .def("get_density_ratio", &Parameter::getDensityRatio)
         .def("get_force_ratio", &Parameter::getForceRatio)
-        .def("get_use_AMD", &Parameter::getUseAMD)
-        .def("get_use_Wale", &Parameter::getUseWale)
         .def("get_SGS_constant", &Parameter::getSGSConstant)
         .def("get_is_body_force", &Parameter::getIsBodyForce)
-        .def("set_has_wall_model_monitor", &Parameter::setHasWallModelMonitor)
         ;
+
     }
 }
\ No newline at end of file
diff --git a/pythonbindings/src/gpu/submodules/pre_collision_interactor.cpp b/pythonbindings/src/gpu/submodules/pre_collision_interactor.cpp
index 362ee1a8ce6112cfa9543f1b254e10f3e35822a1..308f6c37aada14c8c25c69245f603274ae2f18d8 100644
--- a/pythonbindings/src/gpu/submodules/pre_collision_interactor.cpp
+++ b/pythonbindings/src/gpu/submodules/pre_collision_interactor.cpp
@@ -1,3 +1,36 @@
+
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file pre_collision_interactor.cpp
+//! \ingroup submodules
+//! \author Henry Korb
+//=======================================================================================
 #include <pybind11/pybind11.h>
 #include <gpu/VirtualFluids_GPU/PreCollisionInteractor/PreCollisionInteractor.h>
 
diff --git a/pythonbindings/src/gpu/submodules/precursor_writer.cpp b/pythonbindings/src/gpu/submodules/precursor_writer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..64164ef9993d7b4f22bff2390b418718f7c3208f
--- /dev/null
+++ b/pythonbindings/src/gpu/submodules/precursor_writer.cpp
@@ -0,0 +1,67 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file precursor_writer.cpp
+//! \ingroup submodules
+//! \author Henry Korb
+//=======================================================================================
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <pybind11/numpy.h>
+#include <gpu/VirtualFluids_GPU/PreCollisionInteractor/PreCollisionInteractor.h>
+#include <gpu/VirtualFluids_GPU/PreCollisionInteractor/PrecursorWriter.h>
+
+namespace precursor_writer // pybind11 bindings for PrecursorWriter and its OutputVariable enum
+{
+    namespace py = pybind11;
+
+    void makeModule(py::module_ &parentModule) // registers the enum and the class directly on parentModule
+    {
+        py::enum_<OutputVariable>(parentModule, "OutputVariable") // passed to the constructor below as output_variable
+        .value("Velocities", OutputVariable::Velocities)
+        .value("Distributions", OutputVariable::Distributions);
+
+        py::class_<PrecursorWriter, PreCollisionInteractor, std::shared_ptr<PrecursorWriter>>(parentModule, "PrecursorWriter") // bound as a subclass of PreCollisionInteractor, held by shared_ptr
+        .def(py::init < std::string, // filename
+                        std::string, // output_path
+                        real, // x_pos
+                        real, real, // y_min, y_max
+                        real, real, // z_min, z_max
+                        uint, uint, // t_start_out, t_save
+                        OutputVariable, // output_variable
+                        uint>(), // max_timesteps_per_file
+                        py::arg("filename"),
+                        py::arg("output_path"), 
+                        py::arg("x_pos"),
+                        py::arg("y_min"), py::arg("y_max"),
+                        py::arg("z_min"), py::arg("z_max"),
+                        py::arg("t_start_out"), py::arg("t_save"), 
+                        py::arg("output_variable"), 
+                        py::arg("max_timesteps_per_file")); // keyword names pair positionally with the constructor types listed above
+    }
+}
\ No newline at end of file
diff --git a/pythonbindings/src/gpu/submodules/probes.cpp b/pythonbindings/src/gpu/submodules/probes.cpp
index 6993d9617d870922d7ed90ed9ecbebb8a797be25..7c26958df81a60f00c9909a91f5576a5931652d4 100644
--- a/pythonbindings/src/gpu/submodules/probes.cpp
+++ b/pythonbindings/src/gpu/submodules/probes.cpp
@@ -1,3 +1,35 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file probes.cpp
+//! \ingroup submodules
+//! \author Henry Korb
+//=======================================================================================
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 #include <gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/Probe.h>
@@ -29,7 +61,7 @@ namespace probes
         .value("SpatioTemporalFlatness", Statistic::SpatioTemporalFlatness);
 
         py::class_<Probe, PreCollisionInteractor, std::shared_ptr<Probe>>(probeModule, "Probe")
-        .def("add_statistic", &Probe::addStatistic)
+        .def("add_statistic", &Probe::addStatistic, py::arg("variable"))
         .def("set_file_name_to_n_out", &Probe::setFileNameToNOut)
         .def("add_all_available_statistics", &Probe::addAllAvailableStatistics);
 
@@ -41,14 +73,14 @@ namespace probes
                         uint, 
                         uint,
                         uint>(), 
-                        "probe_name",
-                        "output_path"
-                        "t_start_avg",
-                        "t_avg",
-                        "t_start_out",
-                        "t_out")
-        .def("add_probe_points_from_list", &PointProbe::addProbePointsFromList)
-        .def("add_probe_points_from_x_normal_plane", &PointProbe::addProbePointsFromXNormalPlane);
+                        py::arg("probe_name"),
+                        py::arg("output_path"),
+                        py::arg("t_start_avg"),
+                        py::arg("t_avg"),
+                        py::arg("t_start_out"),
+                        py::arg("t_out"))
+        .def("add_probe_points_from_list", &PointProbe::addProbePointsFromList, py::arg("point_coords_x"), py::arg("point_coords_y"), py::arg("point_coords_z"))
+        .def("add_probe_points_from_x_normal_plane", &PointProbe::addProbePointsFromXNormalPlane, py::arg("pos_x"), py::arg("pos0_y"), py::arg("pos0_z"), py::arg("pos1_y"), py::arg("pos1_z"), py::arg("n_y"), py::arg("n_z"));
 
         py::class_<PlaneProbe, Probe, std::shared_ptr<PlaneProbe>>(probeModule, "PlaneProbe")
         .def(py::init<
@@ -58,13 +90,13 @@ namespace probes
                         uint, 
                         uint,
                         uint>(), 
-                        "probe_name",
-                        "output_path"
-                        "t_start_avg",
-                        "t_avg",
-                        "t_start_out",
-                        "t_out")
-        .def("set_probe_plane", &PlaneProbe::setProbePlane);
+                        py::arg("probe_name"),
+                        py::arg("output_path"),
+                        py::arg("t_start_avg"),
+                        py::arg("t_avg"),
+                        py::arg("t_start_out"),
+                        py::arg("t_out"))
+        .def("set_probe_plane", &PlaneProbe::setProbePlane, py::arg("pos_x"), py::arg("pos_y"), py::arg("pos_z"), py::arg("delta_x"), py::arg("delta_y"), py::arg("delta_z"));
 
         py::class_<PlanarAverageProbe, Probe, std::shared_ptr<PlanarAverageProbe>>(probeModule, "PlanarAverageProbe")
         .def(py::init<
@@ -76,14 +108,14 @@ namespace probes
                         uint,
                         uint,
                         char>(),
-                        "probe_name",
-                        "output_path",
-                        "t_start_avg",
-                        "t_start_tmp_avg",
-                        "t_avg",
-                        "t_start_out",
-                        "t_out",
-                        "plane_normal");
+                        py::arg("probe_name"),
+                        py::arg("output_path"),
+                        py::arg("t_start_avg"),
+                        py::arg("t_start_tmp_avg"),
+                        py::arg("t_avg"),
+                        py::arg("t_start_out"),
+                        py::arg("t_out"),
+                        py::arg("plane_normal"));
 
 
         py::class_<WallModelProbe, Probe, std::shared_ptr<WallModelProbe>>(probeModule, "WallModelProbe")
@@ -95,15 +127,15 @@ namespace probes
                         uint,
                         uint,
                         uint>(), 
-                        "probe_name",
-                        "output_path"
-                        "t_start_avg",
-                        "t_start_tmp_avg",
-                        "t_avg",
-                        "t_start_out",
-                        "t_out")
-        .def("set_force_output_to_stress", &WallModelProbe::setForceOutputToStress)
-        .def("set_evaluate_pressure_gradient", &WallModelProbe::setEvaluatePressureGradient);
+                        py::arg("probe_name"),
+                        py::arg("output_path"),
+                        py::arg("t_start_avg"),
+                        py::arg("t_start_tmp_avg"),
+                        py::arg("t_avg"),
+                        py::arg("t_start_out"),
+                        py::arg("t_out"))
+        .def("set_force_output_to_stress", &WallModelProbe::setForceOutputToStress, py::arg("output_stress"))
+        .def("set_evaluate_pressure_gradient", &WallModelProbe::setEvaluatePressureGradient, py::arg("eval_press_grad"));
 
         return probeModule;
     }
diff --git a/pythonbindings/src/gpu/submodules/simulation.cpp b/pythonbindings/src/gpu/submodules/simulation.cpp
index b775d604ba41530223f22738c72785b2c15348b3..d32ef272a1fd26510439dde6ab3a9438d68009a7 100644
--- a/pythonbindings/src/gpu/submodules/simulation.cpp
+++ b/pythonbindings/src/gpu/submodules/simulation.cpp
@@ -1,3 +1,35 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file simulation.cpp
+//! \ingroup submodules
+//! \author Henry Korb
+//=======================================================================================
 #include <pybind11/pybind11.h>
 #include <gpu/VirtualFluids_GPU/LBM/Simulation.h>
 #include <gpu/VirtualFluids_GPU/Communication/Communicator.h>
@@ -8,6 +40,9 @@
 #include <gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h>
 #include <gpu/VirtualFluids_GPU/DataStructureInitializer/GridProvider.h>
 #include <gpu/VirtualFluids_GPU/Output/DataWriter.h>
+#include "gpu/VirtualFluids_GPU/Factories/BoundaryConditionFactory.h"
+#include "gpu/VirtualFluids_GPU/TurbulenceModels/TurbulenceModelFactory.h"
+#include "gpu/VirtualFluids_GPU/Factories/GridScalingFactory.h"
 
 namespace simulation
 {
@@ -20,13 +55,41 @@ namespace simulation
         .def(py::init<  std::shared_ptr<Parameter>,
                         std::shared_ptr<CudaMemoryManager>,
                         vf::gpu::Communicator &,
-                        GridProvider &>(), 
-                        "parameter",
-                        "memoryManager",
-                        "communicator",
-                        "gridProvider")
+                        GridProvider &,
+                        BoundaryConditionFactory*,
+                        GridScalingFactory*>(), 
+                        py::arg("parameter"),
+                        py::arg("memoryManager"),
+                        py::arg("communicator"),
+                        py::arg("gridProvider"),
+                        py::arg("bcFactory"),
+                        py::arg("gridScalingFactory"))
+        .def(py::init<  std::shared_ptr<Parameter>,
+                        std::shared_ptr<CudaMemoryManager>,
+                        vf::gpu::Communicator &,
+                        GridProvider &,
+                        BoundaryConditionFactory*>(), 
+                        py::arg("parameter"),
+                        py::arg("memoryManager"),
+                        py::arg("communicator"),
+                        py::arg("gridProvider"),
+                        py::arg("bcFactory"))
+        .def(py::init<  std::shared_ptr<Parameter>,
+                        std::shared_ptr<CudaMemoryManager>,
+                        vf::gpu::Communicator &,
+                        GridProvider &,
+                        BoundaryConditionFactory*,
+                        std::shared_ptr<TurbulenceModelFactory>,
+                        GridScalingFactory*>(), 
+                        py::arg("parameter"),
+                        py::arg("memoryManager"),
+                        py::arg("communicator"),
+                        py::arg("gridProvider"),
+                        py::arg("bcFactory"),
+                        py::arg("tmFactory"),
+                        py::arg("gridScalingFactory"))
         .def("run", &Simulation::run)
-        .def("addKineticEnergyAnalyzer", &Simulation::addKineticEnergyAnalyzer)
-        .def("addEnstrophyAnalyzer", &Simulation::addEnstrophyAnalyzer);
+        .def("addKineticEnergyAnalyzer", &Simulation::addKineticEnergyAnalyzer, py::arg("t_analyse"))
+        .def("addEnstrophyAnalyzer", &Simulation::addEnstrophyAnalyzer, py::arg("t_analyse"));
     }
 }
\ No newline at end of file
diff --git a/pythonbindings/src/gpu/submodules/transient_bc_setter.cpp b/pythonbindings/src/gpu/submodules/transient_bc_setter.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..89370ef4c1b91a0c8e480e968a1df3bd4fe540ca
--- /dev/null
+++ b/pythonbindings/src/gpu/submodules/transient_bc_setter.cpp
@@ -0,0 +1,52 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file transient_bc_setter.cpp
+//! \ingroup submodules
+//! \author Henry Korb
+//=======================================================================================
+#include <pybind11/pybind11.h>
+#include <gpu/GridGenerator/TransientBCSetter/TransientBCSetter.h>
+
+namespace transient_bc_setter // pybind11 bindings for the file collection types declared in TransientBCSetter.h
+{
+    namespace py = pybind11;
+
+    void makeModule(py::module_ &parentModule) // registers FileType, the file collection classes and create_file_collection on parentModule
+    {
+        py::enum_<FileType>(parentModule, "FileType")
+        .value("VTK", FileType::VTK);
+
+        py::class_<FileCollection, std::shared_ptr<FileCollection>>(parentModule, "FileCollection"); // base type; no constructor is exposed to Python
+
+        py::class_<VTKFileCollection, FileCollection, std::shared_ptr<VTKFileCollection>>(parentModule, "VTKFileCollection")
+        .def(py::init <std::string>(), py::arg("prefix"));
+
+        parentModule.def("create_file_collection", &createFileCollection, py::arg("prefix"), py::arg("type")); // bound after the classes: pybind11 renders the signature at bind time, so types registered later would show up as raw C++ names in the docstring
+    }
+}
\ No newline at end of file
diff --git a/pythonbindings/src/gpu/submodules/turbulence_models.cpp b/pythonbindings/src/gpu/submodules/turbulence_models.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..cfbb9e56127fee0cd90a482dde258d8b96389989
--- /dev/null
+++ b/pythonbindings/src/gpu/submodules/turbulence_models.cpp
@@ -0,0 +1,56 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file turbulence_models.cpp
+//! \ingroup submodules
+//! \author Henry Korb
+//=======================================================================================
+#include "pybind11/pybind11.h"
+#include "gpu/VirtualFluids_GPU/TurbulenceModels/TurbulenceModelFactory.h"
+#include "gpu/VirtualFluids_GPU/LBM/LB.h"
+
+namespace turbulence_model // pybind11 bindings for the TurbulenceModel enum and TurbulenceModelFactory
+{
+    namespace py = pybind11;
+
+    void makeModule(py::module_ &parentModule) // registers the enum and the factory class directly on parentModule
+    {
+        py::enum_<TurbulenceModel>(parentModule, "TurbulenceModel")
+        .value("Smagorinsky", TurbulenceModel::Smagorinsky)
+        .value("AMD", TurbulenceModel::AMD)
+        .value("QR", TurbulenceModel::QR)
+        .value("None", TurbulenceModel::None); // NOTE(review): `None` is a Python keyword, so `TurbulenceModel.None` is a syntax error in Python source; it should still be reachable via getattr(TurbulenceModel, "None") or TurbulenceModel.__members__["None"] - confirm
+
+        py::class_<TurbulenceModelFactory, std::shared_ptr<TurbulenceModelFactory>>(parentModule, "TurbulenceModelFactory") // held by shared_ptr
+        .def(py::init< std::shared_ptr<Parameter>>(), py::arg("para")) // constructed from a shared Parameter instance
+        .def("set_turbulence_model", &TurbulenceModelFactory::setTurbulenceModel, py::arg("turbulence_model"))
+        .def("set_model_constant", &TurbulenceModelFactory::setModelConstant, py::arg("model_constant"))
+        .def("read_config_file", &TurbulenceModelFactory::readConfigFile, py::arg("config_data"));
+
+    }
+}
\ No newline at end of file
diff --git a/pythonbindings/src/lbm/lbm.cpp b/pythonbindings/src/lbm/lbm.cpp
index 441b9ff372f4e4513fee58c4a8a1cd78d38582dd..90fd4a71b0101469666936c89974de316e0e2b18 100644
--- a/pythonbindings/src/lbm/lbm.cpp
+++ b/pythonbindings/src/lbm/lbm.cpp
@@ -1,3 +1,35 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file lbm.cpp
+//! \ingroup lbm
+//! \author Henry Korb
+//=======================================================================================
 #include <pybind11/pybind11.h>
 
 namespace lbm
diff --git a/pythonbindings/src/logger/logger.cpp b/pythonbindings/src/logger/logger.cpp
index 82ad3d92760ae38c0eb62b16be726e4eeaca08ac..555b502fa9a56299895de0fa6dd6cfeb66c15024 100644
--- a/pythonbindings/src/logger/logger.cpp
+++ b/pythonbindings/src/logger/logger.cpp
@@ -1,3 +1,35 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file logger.cpp
+//! \ingroup logger
+//! \author Henry Korb
+//=======================================================================================
 #include <pybind11/pybind11.h>
 #include <logger/Logger.h>
 
@@ -10,15 +42,15 @@ namespace logging
         py::module loggerModule = parentModule.def_submodule("logger");
 
         py::class_<vf::logging::Logger>(loggerModule, "Logger")
-        .def("initialize_logger", &vf::logging::Logger::initalizeLogger)
-        .def("change_log_path", &vf::logging::Logger::changeLogPath);
+        .def_static("initialize_logger", &vf::logging::Logger::initalizeLogger)
+        .def_static("change_log_path", &vf::logging::Logger::changeLogPath, py::arg("path"));
 
         // use f-strings (f"text {float}") in python for compounded messages
-        loggerModule.def("vf_log_trace", [](std::string arg){ VF_LOG_TRACE(arg); });        
-        loggerModule.def("vf_log_debug", [](std::string arg){ VF_LOG_DEBUG(arg); });        
-        loggerModule.def("vf_log_info", [](std::string arg){ VF_LOG_INFO(arg); });        
-        loggerModule.def("vf_log_warning", [](std::string arg){ VF_LOG_WARNING(arg); });        
-        loggerModule.def("vf_log_critical", [](std::string arg){ VF_LOG_CRITICAL(arg); });        
+        loggerModule.def("vf_log_trace", [](std::string message){ VF_LOG_TRACE(message); }, py::arg("message"));        
+        loggerModule.def("vf_log_debug", [](std::string message){ VF_LOG_DEBUG(message); }, py::arg("message"));        
+        loggerModule.def("vf_log_info", [](std::string message){ VF_LOG_INFO(message); }, py::arg("message"));        
+        loggerModule.def("vf_log_warning", [](std::string message){ VF_LOG_WARNING(message); }, py::arg("message"));        
+        loggerModule.def("vf_log_critical", [](std::string message){ VF_LOG_CRITICAL(message); }, py::arg("message"));        
 
         return loggerModule;
     }
diff --git a/pythonbindings/src/muParser.cpp b/pythonbindings/src/muParser.cpp
index 47408c2758fc92991f1be3113d78b8741215b152..eec39de0b72c21aaa924ea805414847aa9de4492 100644
--- a/pythonbindings/src/muParser.cpp
+++ b/pythonbindings/src/muParser.cpp
@@ -1,9 +1,41 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file muParser.cpp
+//! \ingroup src
+//! \author Henry Korb
+//=======================================================================================
 #include <pybind11/pybind11.h>
 #include <muParser.h>
 
 namespace py = pybind11;
 
-PYBIND11_MODULE(pymuparser, m) {
+PYBIND11_MODULE(bindings, m) {
     py::class_<mu::ParserBase>(m, "_ParserBase");
 
     py::class_<mu::Parser, mu::ParserBase>(m, "Parser")
diff --git a/regression-tests/driven_cavity_test.sh b/regression-tests/driven_cavity_test.sh
index e10a829d2680ab647ba0f66e0f2e85a70186007e..7f799facb4459ddafcd8b210a5477954af1444cb 100755
--- a/regression-tests/driven_cavity_test.sh
+++ b/regression-tests/driven_cavity_test.sh
@@ -7,8 +7,8 @@
 # build VirtualFluids accordingly to our specific test scenario.
 # in this case adding -DUSER_APPS="apps/gpu/LBM/DrivenCavity to the cmake command is not necessary, because the DrivenCavity is added to VirtualFluids by default.
 mkdir -p build
-cmake -B build --preset=gpu_make -DCMAKE_CUDA_ARCHITECTURES=75 #-DUSER_APPS="apps/gpu/LBM/DrivenCavity"
-cd build && make -j 8 && cd ..
+cmake -B build --preset=make_gpu -DCMAKE_BUILD_TYPE=Release -DCMAKE_CUDA_ARCHITECTURES=75 #-DUSER_APPS="apps/gpu/LBM/DrivenCavity"
+cmake --build build --parallel 8
 
 # execute VirtualFluids
 ./build/bin/DrivenCavity
@@ -21,4 +21,4 @@ PATH_TO_DIR=output/DrivenCavity
 PATH_TO_REFERENCE_DIR=regression-tests/reference_data/regression_tests/gpu/DrivenCavity_2Levels
 
 # execute fieldcompare (A more comprehensive manual can be found here https://gitlab.com/dglaeser/fieldcompare)
-fieldcompare dir $PATH_TO_DIR --reference $PATH_TO_REFERENCE_DIR --include-files "*.vtu"
\ No newline at end of file
+fieldcompare dir $PATH_TO_DIR $PATH_TO_REFERENCE_DIR --include-files "*.vtu"
diff --git a/regression-tests/driven_cavity_uniform_test.sh b/regression-tests/driven_cavity_uniform_test.sh
new file mode 100755
index 0000000000000000000000000000000000000000..95e2bab635d3a6a73fb514a1f67902083c98e5d3
--- /dev/null
+++ b/regression-tests/driven_cavity_uniform_test.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+#########################################
+# Driven Cavity Uniform Regression Test
+#########################################
+
+# build VirtualFluids according to our specific test scenario.
+# -DUSER_APPS="apps/gpu/LBM/DrivenCavityUniform" is passed so that the DrivenCavityUniform app executed below is part of the build.
+mkdir -p build
+cmake -B build --preset=make_gpu -DCMAKE_BUILD_TYPE=Release -DCMAKE_CUDA_ARCHITECTURES=75 -DUSER_APPS="apps/gpu/LBM/DrivenCavityUniform"
+cmake --build build --parallel 8
+
+# execute VirtualFluids
+./build/bin/DrivenCavityUniform
+
+
+# set the path to the produced data
+PATH_TO_DIR=output/DrivenCavity_uniform
+
+# set the path to the reference data.
+# `regression-tests/reference_data` is fixed; `regression_tests/gpu/DrivenCavity_uniform` must match the structure in https://github.com/irmb/test_data:
+PATH_TO_REFERENCE_DIR=regression-tests/reference_data/regression_tests/gpu/DrivenCavity_uniform
+
+# execute fieldcompare (A more comprehensive manual can be found here https://gitlab.com/dglaeser/fieldcompare)
+fieldcompare dir $PATH_TO_DIR $PATH_TO_REFERENCE_DIR --include-files "*.vtu"
diff --git a/regression-tests/multigpu_test/rocket.yml b/regression-tests/multigpu_test/rocket.yml
new file mode 100755
index 0000000000000000000000000000000000000000..f621b1349c042e02f2e834e697147da0822ffe1f
--- /dev/null
+++ b/regression-tests/multigpu_test/rocket.yml
@@ -0,0 +1,48 @@
+host: $PHOENIX_REMOTE_HOST
+user: $PHOENIX_REMOTE_USER
+private_keyfile: $PHOENIX_PRIVATE_KEY
+
+copy:
+  - from: regression-tests/multigpu_test/slurm.job
+    to: multigpu_test/slurm.job
+    overwrite: true
+
+  - from: "CMake/"
+    to: "multigpu_test/CMake/"
+    overwrite: true
+
+  - from: "3rdParty/"
+    to: "multigpu_test/3rdParty/"
+    overwrite: true
+
+  - from: "CMakeLists.txt"
+    to: "multigpu_test/CMakeLists.txt"
+    overwrite: true
+
+  - from: "gpu.cmake"
+    to: "multigpu_test/gpu.cmake"
+    overwrite: true
+
+  - from: "src/"
+    to: "multigpu_test/src/"
+    overwrite: true
+
+  - from: "CMakePresets.json"
+    to: "multigpu_test/CMakePresets.json"
+    overwrite: true
+
+  - from: "apps/gpu/LBM/"
+    to: "multigpu_test/apps/gpu/LBM/"
+    overwrite: true
+
+collect:
+  - from: multigpu_test/output/
+    to: output/results/
+    overwrite: true
+
+  - from: multigpu_test/slurmMultiGPU.out
+    to: output/slurmMultiGPU.out
+    overwrite: true
+
+sbatch: multigpu_test/slurm.job
+continue_if_job_fails: true
diff --git a/regression-tests/multigpu_test/slurm.job b/regression-tests/multigpu_test/slurm.job
new file mode 100755
index 0000000000000000000000000000000000000000..0ee0df46ab64bab6520f9f46fc939d5b3186fae7
--- /dev/null
+++ b/regression-tests/multigpu_test/slurm.job
@@ -0,0 +1,29 @@
+#!/bin/bash -l
+
+#SBATCH --partition=gpu01_queue
+#SBATCH --nodes=1
+#SBATCH --time=10:00:00
+#SBATCH --job-name=Cavity4GPU
+#SBATCH --ntasks-per-node=4
+#SBATCH --gres=gpu:4
+#SBATCH --output=multigpu_test/slurmMultiGPU.out
+##SBATCH --exclusive
+
+module purge 
+module load comp/ccache/4.1 # loads comp/gcc/9.3.0 
+module load mpi/openmpi/4.0.5_gcc_9.3/openmpi 
+module load cuda/11.3
+module load comp/git/2.27.0
+PATH=/home/irmb/tools/cmake-3.20.3-linux-x86_64/bin:$PATH
+
+module list
+
+cd multigpu_test
+mkdir -p build
+cd build
+cmake .. -DBUILD_VF_GPU=ON -DCMAKE_CUDA_ARCHITECTURES=60 -DUSER_APPS="apps/gpu/LBM/DrivenCavityMultiGPU"
+make -j 16
+cd ..
+mkdir -p output
+
+mpirun -np 4 "./build/bin/DrivenCavityMultiGPU" "configPhoenix4GPU.txt"
\ No newline at end of file
diff --git a/regression-tests/regression-tests.sh b/regression-tests/regression-tests.sh
index 5b7d227907594b727103be91d2382c05a07b9c6f..9f5dc8cf758b380709fcc9ad8020d1335f760f64 100755
--- a/regression-tests/regression-tests.sh
+++ b/regression-tests/regression-tests.sh
@@ -13,11 +13,10 @@ git clone https://github.com/irmb/test_data regression-tests/reference_data
 #    by cloning our meshio patch and fieldcompare into a venv
 python3 -m venv .venv
 source .venv/bin/activate
-pip install rich
-pip install git+https://github.com/soerenPeters/meshio@update-pyproject-version
-pip install git+https://gitlab.com/dglaeser/fieldcompare
+pip install fieldcompare
 
 # 3. Running the specific tests
+./regression-tests/driven_cavity_uniform_test.sh
 ./regression-tests/driven_cavity_test.sh
 
 
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..5894f9dec06953c3eeb909af96db9cb19d202d65
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,11 @@
+[metadata]
+name = pyfluids
+description = Python binding for VirtualFluids
+long_description = file: README.md
+long_description_content_type = text/markdown
+platforms = any
+url = https://git.rz.tu-bs.de/irmb/virtualfluids
+version = 0.1.0
+
+[options]
+python_requires = >=3.6
diff --git a/setup.py b/setup.py
index b26e1c13d09447d17f8e9fd6e2cd0d0671595bf3..530431b3775970b5222bc87d32bfb407363f95d6 100644
--- a/setup.py
+++ b/setup.py
@@ -1,137 +1,72 @@
-import os
-import re
 import sys
-import platform
-import subprocess
+from pathlib import Path
+from typing import List
 
-from setuptools import setup, Extension
-from setuptools.command.build_ext import build_ext
-from setuptools.command.install import install
-from setuptools.command.develop import develop
-from distutils.version import LooseVersion
+import skbuild
 
 """
-Install python wrapper of virtual fluids
-Install GPU backend with option --GPU
-(pass to pip via --install-option="--GPU")
+Install the Python wrapper of VirtualFluids.
+Install via python:
+    python setup.py install
+    set CMake flags via e.g. -DBUILD_VF_GPU:BOOL=ON
+    CMake flags have to be separated from the setup.py arguments by --
+    example: python setup.py install -- -DBUILD_VF_CPU:BOOL=ON
+or install via pip:
+    pip install .
+    for pip>21:
+        set CMake flags via --config-settings="-DBUILD_VF_GPU=ON"
+        example: pip install . --config-settings="-DBUILD_VF_GPU=ON"
+        each option has to be passed in individually, i.e. --config-settings="-DOPT1=ON" --config-settings="-DOPT2=OFF"
+    for pip<21:
+        set CMake flags via --global-option="-DBUILD_VF_GPU=ON"
+        example: pip install . --global-option="-DBUILD_VF_GPU=ON"
 """
 
-vf_cmake_args = [
-    "-DBUILD_VF_PYTHON_BINDINGS=ON",
-    "-DCMAKE_CXX_COMPILER_LAUNCHER=ccache",
-    "-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache",
-    "-DCMAKE_C_COMPILER_LAUNCHER=ccache",
-    "-DBUILD_SHARED_LIBS=OFF",
-    "-DBUILD_WARNINGS_AS_ERRORS=OFF"
-]
-
-vf_cpu_cmake_args = [
-    "-DBUILD_VF_DOUBLE_ACCURACY=ON",
-    "-DBUILD_VF_CPU:BOOL=ON",
-    "-DBUILD_VF_UNIT_TESTS:BOOL=ON",
-    "-DUSE_METIS=ON",
-    "-DUSE_MPI=ON"
-]
-
-vf_gpu_cmake_args = [
-    "-DBUILD_VF_DOUBLE_ACCURACY=OFF",
-    "-DBUILD_VF_GPU:BOOL=ON",
-    "-DBUILD_VF_UNIT_TESTS:BOOL=OFF",
-]
-
-GPU = False
-
-class CommandMixin:
-    user_options = [
-        ('GPU', None, 'compile pyfluids with GPU backend'),
+package_name = "pyfluids"
+target = "python_bindings"
+src_dir = "pythonbindings"
+stub_package = package_name+"-stubs"
+
+stub_dir = Path(src_dir)/stub_package
+
+
+def add_subfiles(dir_path: Path, suffix: str, root_dir: Path) -> List[str]:
+    files = []
+    for f in dir_path.iterdir():
+        if f.is_dir():
+            files.extend(add_subfiles(f, suffix, root_dir))
+        if f.is_file():
+            if f.suffix != suffix:
+                continue
+            files.append(str(f.relative_to(root_dir)))
+    return files
+
+def add_directory(dir_path: Path, suffix: str):
+    return add_subfiles(dir_path, suffix, dir_path)
+
+stub_files = add_directory(stub_dir, ".pyi")
+
+# hack to get config-args for installation with pip>21
+cmake_args = []
+if "config_args" in locals():
+    cmake_args.extend([f"{k}={v}" for k, v in locals()["config_args"].items()])
+
+cmake_args += [
+        f"-DPython3_ROOT_DIR={Path(sys.prefix)}",
+        "-DBUILD_VF_PYTHON_BINDINGS=ON",
+        "-DBUILD_SHARED_LIBS=OFF",
+        "-DBUILD_VF_DOUBLE_ACCURACY=OFF",
+        "-DBUILD_VF_UNIT_TESTS:BOOL=OFF",
+        "-DBUILD_WARNINGS_AS_ERRORS=OFF",
     ]
 
-    def initialize_options(self):
-        super().initialize_options()
-        self.GPU = False
-
-    def finalize_options(self):
-        super().finalize_options()
-
-    def run(self):
-        global GPU
-        GPU = GPU or self.GPU
-        super().run()
-
-
-class InstallCommand(CommandMixin, install):
-    user_options = getattr(install, 'user_options', []) + CommandMixin.user_options
-
-
-class DevelopCommand(CommandMixin, develop):
-    user_options = getattr(develop, 'user_options', []) + CommandMixin.user_options
-
-
-class CMakeExtension(Extension):
-    def __init__(self, name, sourcedir=''):
-        Extension.__init__(self, name, sources=[])
-        self.sourcedir = os.path.abspath(sourcedir)
-
-
-class CMakeBuild(CommandMixin, build_ext):
-    user_options = getattr(build_ext, 'user_options', []) + CommandMixin.user_options
-
-    def run(self):
-        super().run()
-        try:
-            out = subprocess.check_output(['cmake', '--version'])
-        except OSError:
-            raise RuntimeError("CMake must be installed to build the following extensions: " +
-                               ", ".join(e.name for e in self.extensions))
-
-        if platform.system() == "Windows":
-            cmake_version = LooseVersion(re.search(r'version\s*([\d.]+)', out.decode()).group(1))
-            if cmake_version < '3.1.0':
-                raise RuntimeError("CMake >= 3.1.0 is required on Windows")
-
-        for ext in self.extensions:
-            self.build_extension(ext)
-
-    def build_extension(self, ext):
-        extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name)))
-        # required for auto-detection of auxiliary "native" libs
-        if not extdir.endswith(os.path.sep):
-            extdir += os.path.sep
-
-        cmake_args = ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir,
-                      '-DPYTHON_EXECUTABLE=' + sys.executable]
-
-        cfg = 'Debug' if self.debug else 'Release'
-        build_args = ['--config', cfg]
-
-        if platform.system() == "Windows":
-            cmake_args += ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}'.format(cfg.upper(), extdir)]
-            if sys.maxsize > 2**32:
-                cmake_args += ['-A', 'x64']
-            build_args += ['--', '/m']
-        else:
-            cmake_args += ['-DCMAKE_BUILD_TYPE=' + cfg]
-            build_args += ['--', '-j2']
-
-        cmake_args.extend(vf_cmake_args)
-        cmake_args.extend(vf_gpu_cmake_args if GPU else vf_cpu_cmake_args)
-
-        env = os.environ.copy()
-        env['CXXFLAGS'] = '{} -DVERSION_INFO=\\"{}\\"'.format(env.get('CXXFLAGS', ''),
-                                                              self.distribution.get_version())
-        if not os.path.exists(self.build_temp):
-            os.makedirs(self.build_temp)
-        cmake_cache_file = self.build_temp+"/CMakeCache.txt"
-        if os.path.exists(cmake_cache_file):
-            os.remove(cmake_cache_file)
-        subprocess.check_call(['cmake', ext.sourcedir] + cmake_args, cwd=self.build_temp, env=env)
-        subprocess.check_call(['cmake', '--build', '.'] + build_args, cwd=self.build_temp)
-
-
-setup(
-    name='pyfluids',
-    version='0.0.1',
-    ext_modules=[CMakeExtension('pyfluids')],
-    cmdclass={"install": InstallCommand, "develop": DevelopCommand, "build_ext": CMakeBuild},
-    zip_safe=False,
+skbuild.setup(
+    name=package_name,
+    packages=[package_name, "pymuparser", "pyfluids-stubs"],
+    package_dir={"": src_dir},
+    cmake_args=cmake_args,
+    cmake_install_target=target,
+    package_data={  "pyfluids": ["py.typed"],
+                    "pyfluids-stubs": stub_files},
+    include_package_data=True,
 )
diff --git a/src/basics/basics/utilities/UbTuple.h b/src/basics/basics/utilities/UbTuple.h
index fe9c787cead38621beafab3d082122277bdcff73..228ab48898e5e61777d2fcc0061eb6f0434d5cad 100644
--- a/src/basics/basics/utilities/UbTuple.h
+++ b/src/basics/basics/utilities/UbTuple.h
@@ -597,6 +597,8 @@ inline UbTuple<T1, T2, T3, T4, T5, T6, T7, T8> makeUbTuple(T1 const &a1, T2 cons
 // some typedefs
 using UbTupleFloat2        = UbTuple<float, float>;
 using UbTupleFloat3        = UbTuple<float, float, float>;
+using UbTupleFloat4        = UbTuple<float, float, float, float>;
+using UbTupleFloat6        = UbTuple<float, float, float,float, float, float>;
 using UbTupleInt2          = UbTuple<int, int>;
 using UbTupleInt3          = UbTuple<int, int, int>;
 using UbTupleInt4          = UbTuple<int, int, int, int>;
diff --git a/src/basics/basics/writer/WbWriter.h b/src/basics/basics/writer/WbWriter.h
index 26d43464c03311a2cbc14cd4fc9fe717d4b01531..55dceb7cb4a64dc90f0677796cab52135b726f56 100644
--- a/src/basics/basics/writer/WbWriter.h
+++ b/src/basics/basics/writer/WbWriter.h
@@ -88,7 +88,12 @@ public:
     {
         throw UbException(UB_EXARGS, "not implemented for " + (std::string) typeid(*this).name());
     }
-
+    virtual std::string writeLinesWithLineData(const std::string & /*filename*/, std::vector<UbTupleFloat3> & /*nodes*/,
+                                               std::vector<UbTupleInt2> & /*lines*/, std::vector<std::string> & /*datanames*/,
+                                               std::vector<std::vector<float>> & /*celldata*/)
+    {
+        throw UbException(UB_EXARGS, "not implemented for " + (std::string) typeid(*this).name());
+    }
     //////////////////////////////////////////////////////////////////////////
     // triangles
     // cell numbering:
diff --git a/src/basics/basics/writer/WbWriterVtkXmlBinary.cpp b/src/basics/basics/writer/WbWriterVtkXmlBinary.cpp
index 6731fa56026ca284ad671cb6ce59000a609bbb8c..55c3541983ea4248512508146792832a34a1c563 100644
--- a/src/basics/basics/writer/WbWriterVtkXmlBinary.cpp
+++ b/src/basics/basics/writer/WbWriterVtkXmlBinary.cpp
@@ -34,6 +34,8 @@
 #include <basics/writer/WbWriterVtkXmlASCII.h>
 #include <basics/writer/WbWriterVtkXmlBinary.h>
 #include <cstring>
+#include <fstream>
+#include <string>
 
 using namespace std;
 
@@ -154,12 +156,13 @@ string WbWriterVtkXmlBinary::writeParallelFile(const string &filename, vector<st
 
     return vtkfilename;
 }
+
 /*===============================================================================*/
-string WbWriterVtkXmlBinary::writeLines(const string &filename, vector<UbTupleFloat3> &nodes,
-                                        vector<UbTupleInt2> &lines)
+
+// helper functions
+
+ofstream createFileStream(std::string vtkfilename)
 {
-    string vtkfilename = filename + getFileExtension();
-    UBLOG(logDEBUG1, "WbWriterVtkXmlBinary::writeLines to " << vtkfilename << " - start");
 
     ofstream out(vtkfilename.c_str(), ios::out | ios::binary);
     if (!out) {
@@ -172,89 +175,199 @@ string WbWriterVtkXmlBinary::writeLines(const string &filename, vector<UbTupleFl
         if (!out)
             throw UbException(UB_EXARGS, "couldn't open file " + vtkfilename);
     }
+    return out;
+}
 
-    int nofNodes = (int)nodes.size();
-    int nofCells = (int)lines.size();
-
-    int bytesPerByteVal      = 4; //==sizeof(int)
-    int bytesPoints          = 3 /*x1/x2/x3        */ * nofNodes * sizeof(float);
-    int bytesCellConnectivty = 2 /*nodes per line */ * nofCells * sizeof(int);
-    int bytesCellOffsets     = 1 /*offset per line */ * nofCells * sizeof(int);
-    int bytesCellTypes       = 1 /*type of line */ * nofCells * sizeof(unsigned char);
-
-    int offset = 0;
-    // VTK FILE
+void writeVtkHeader(ofstream &out, int numberOfNodes, int numberOfCells)
+{
     out << "<?xml version=\"1.0\"?>\n";
     out << "<VTKFile type=\"UnstructuredGrid\" version=\"0.1\" byte_order=\"LittleEndian\" >"
         << "\n";
     out << "   <UnstructuredGrid>"
         << "\n";
-    out << "      <Piece NumberOfPoints=\"" << nofNodes << "\" NumberOfCells=\"" << nofCells << "\">\n";
+    out << "      <Piece NumberOfPoints=\"" << numberOfNodes << "\" NumberOfCells=\"" << numberOfCells << "\">\n";
+}
 
-    // POINTS SECTION
+int writePointHeader(ofstream &out, int offset, int bytesPerByteVal, int bytesPoints)
+{
     out << "         <Points>\n";
     out << "            <DataArray type=\"Float32\" NumberOfComponents=\"3\" format=\"appended\" offset=\"" << offset
         << "\"  />\n";
     out << "         </Points>\n";
     offset += (bytesPerByteVal + bytesPoints);
+    return offset;
+}
 
-    // CELLS SECTION
+int writeCellHeader(ofstream &out, int offset, int bytesPerByteVal, int bytesCellConnectivity, int bytesCellOffsets,
+                    int bytesCellTypes)
+{
     out << "         <Cells>\n";
     out << "            <DataArray type=\"Int32\" Name=\"connectivity\" format=\"appended\" offset=\"" << offset
         << "\" />\n";
-    offset += (bytesPerByteVal + bytesCellConnectivty);
+    offset += (bytesPerByteVal + bytesCellConnectivity);
     out << "            <DataArray type=\"Int32\" Name=\"offsets\" format=\"appended\" offset=\"" << offset
         << "\" />\n";
     offset += (bytesPerByteVal + bytesCellOffsets);
     out << "            <DataArray type=\"UInt8\" Name=\"types\" format=\"appended\" offset=\"" << offset << "\" />\n ";
     offset += (bytesPerByteVal + bytesCellTypes);
     out << "         </Cells>\n";
+    return offset;
+}
 
+int writeDataHeader(ofstream &out, vector<string> &datanames, int offset, int bytesPerByteVal, int bytesScalarData)
+{
+    out << "         <CellData>\n";
+    for (size_t s = 0; s < datanames.size(); ++s) {
+        out << "            <DataArray type=\"Float32\" Name=\"" << datanames[s] << "\" format=\"appended\" offset=\""
+            << offset << "\" /> \n";
+        offset += (bytesPerByteVal + bytesScalarData);
+    }
+    out << "         </CellData>\n";
+    return offset;
+}
+
+void writeAppendDataHeader(ofstream &out)
+{
     out << "      </Piece>\n";
     out << "   </UnstructuredGrid>\n";
-
-    // AppendedData SECTION
     out << "   <AppendedData encoding=\"raw\">\n";
     out << "_";
+}
 
-    // POINTS SECTION
+void writePoints(ofstream &out, int bytesPerByteVal, int bytesPoints, vector<UbTupleFloat3> &nodes)
+{
     out.write((char *)&bytesPoints, bytesPerByteVal);
-    for (int n = 0; n < nofNodes; n++) {
+    for (int n = 0; n < (int)nodes.size(); n++) {
         out.write((char *)&val<1>(nodes[n]), sizeof(float));
         out.write((char *)&val<2>(nodes[n]), sizeof(float));
         out.write((char *)&val<3>(nodes[n]), sizeof(float));
     }
+}
 
-    // CELLS SECTION
-    // cellConnectivity
-    out.write((char *)&bytesCellConnectivty, bytesPerByteVal);
-    for (int c = 0; c < nofCells; c++) {
-        out.write((char *)&val<1>(lines[c]), sizeof(int));
-        out.write((char *)&val<2>(lines[c]), sizeof(int));
+void writeCellConnectivity(ofstream &out, int bytesPerByteVal, int bytesCellConnectivity, vector<UbTupleInt2> &cells)
+{
+    out.write((char *)&bytesCellConnectivity, bytesPerByteVal);
+    for (int c = 0; c < (int)cells.size(); c++) {
+        out.write((char *)&val<1>(cells[c]), sizeof(int));
+        out.write((char *)&val<2>(cells[c]), sizeof(int));
     }
+}
 
-    // cellOffsets
+void writeCellOffsets(ofstream &out, int bytesPerByteVal, int bytesCellOffsets, int numberOfCells)
+{
     out.write((char *)&bytesCellOffsets, bytesPerByteVal);
     int itmp;
-    for (int c = 1; c <= nofCells; c++) {
+    for (int c = 1; c <= numberOfCells; c++) {
         itmp = 2 * c;
         out.write((char *)&itmp, sizeof(int));
     }
+}
 
-    // cellTypes
+void writeCellTypes(ofstream &out, int bytesPerByteVal, int bytesCellTypes, int numberOfCells)
+{
     out.write((char *)&bytesCellTypes, bytesPerByteVal);
     unsigned char vtkCellType = 3;
-    for (int c = 0; c < nofCells; c++) {
+    for (int c = 0; c < numberOfCells; c++) {
         out.write((char *)&vtkCellType, sizeof(unsigned char));
     }
+}
+
+void writeCellData(ofstream &out, int bytesPerByteVal, int bytesScalarData, vector<string> &datanames,
+                   vector<vector<float>> &celldata)
+{
+    for (size_t s = 0; s < datanames.size(); ++s) {
+        out.write((char *)&bytesScalarData, bytesPerByteVal);
+        for (size_t d = 0; d < celldata[s].size(); ++d) {
+            // make a local copy so there is an addressable float to write (celldata already holds floats)
+            float tmp = (float)celldata[s][d];
+            out.write((char *)&tmp, sizeof(float));
+        }
+    }
+}
+
+void writeEndOfFile(ofstream &out)
+{
     out << "\n</AppendedData>\n";
     out << "</VTKFile>";
     out << endl;
     out.close();
+}
+
+/*===============================================================================*/
+string WbWriterVtkXmlBinary::writeLines(const string &filename, vector<UbTupleFloat3> &nodes,
+                                        vector<UbTupleInt2> &lines)
+{
+    string vtkfilename = filename + getFileExtension();
+    UBLOG(logDEBUG1, "WbWriterVtkXmlBinary::writeLines to " << vtkfilename << " - start");
+
+    ofstream out = createFileStream(vtkfilename);
+
+    int nofNodes = (int)nodes.size();
+    int nofCells = (int)lines.size();
+
+    int bytesPerByteVal = 4; //==sizeof(int)
+    int bytesPoints = 3 /*x1/x2/x3        */ * nofNodes * sizeof(float);
+    int bytesCellConnectivity = 2 /*nodes per line */ * nofCells * sizeof(int);
+    int bytesCellOffsets = 1 /*offset per line */ * nofCells * sizeof(int);
+    int bytesCellTypes = 1 /*type of line */ * nofCells * sizeof(unsigned char);
+
+    int offset = 0;
+
+    writeVtkHeader(out, nofNodes, nofCells);
+    offset = writePointHeader(out, offset, bytesPerByteVal, bytesPoints);
+    writeCellHeader(out, offset, bytesPerByteVal, bytesCellConnectivity, bytesCellOffsets, bytesCellTypes);
+    writeAppendDataHeader(out);
+
+    writePoints(out, bytesPerByteVal, bytesPoints, nodes);
+    writeCellConnectivity(out, bytesPerByteVal, bytesCellConnectivity, lines);
+    writeCellOffsets(out, bytesPerByteVal, bytesCellOffsets, nofCells);
+    writeCellTypes(out, bytesPerByteVal, bytesCellTypes, nofCells);
+    writeEndOfFile(out);
     UBLOG(logDEBUG1, "WbWriterVtkXmlBinary::writeLines to " << vtkfilename << " - end");
 
     return vtkfilename;
 }
+
+/*===============================================================================*/
+string WbWriterVtkXmlBinary::writeLinesWithLineData(const string &filename, vector<UbTupleFloat3> &nodes,
+                                                    vector<UbTupleInt2> &lines, vector<string> &datanames,
+                                                    vector<vector<float>> &celldata)
+{
+    string vtkfilename = filename + getFileExtension();
+    UBLOG(logDEBUG1, "WbWriterVtkXmlBinary::writeLinesWithLineData to " << vtkfilename << " - start");
+
+    ofstream out = createFileStream(vtkfilename);
+
+    int nofNodes = (int)nodes.size();
+    int nofCells = (int)lines.size();
+
+    int bytesPerByteVal = 4; //==sizeof(int)
+    int bytesPoints = 3 /*x1/x2/x3        */ * nofNodes * sizeof(float);
+    int bytesCellConnectivity = 2 /*nodes per line */ * nofCells * sizeof(int);
+    int bytesCellOffsets = 1 /*offset per line */ * nofCells * sizeof(int);
+    int bytesCellTypes = 1 /*type of line */ * nofCells * sizeof(unsigned char);
+    int bytesScalarData = 1 /*scalar        */ * nofCells * sizeof(float);
+
+    int offset = 0;
+
+    writeVtkHeader(out, nofNodes, nofCells);
+    offset = writePointHeader(out, offset, bytesPerByteVal, bytesPoints);
+    offset = writeCellHeader(out, offset, bytesPerByteVal, bytesCellConnectivity, bytesCellOffsets, bytesCellTypes);
+    writeDataHeader(out, datanames, offset, bytesPerByteVal, bytesScalarData);
+    writeAppendDataHeader(out);
+
+    writePoints(out, bytesPerByteVal, bytesPoints, nodes);
+    writeCellConnectivity(out, bytesPerByteVal, bytesCellConnectivity, lines);
+    writeCellOffsets(out, bytesPerByteVal, bytesCellOffsets, nofCells);
+    writeCellTypes(out, bytesPerByteVal, bytesCellTypes, nofCells);
+    writeCellData(out, bytesPerByteVal, bytesScalarData, datanames, celldata);
+    writeEndOfFile(out);
+
+    UBLOG(logDEBUG1, "WbWriterVtkXmlBinary::writeLinesWithLineData to " << vtkfilename << " - end");
+
+    return vtkfilename;
+}
+
 /*===============================================================================*/
 // std::string WbWriterVtkXmlBinary::writeLinesWithNodeData(const string& filename,vector<UbTupleFloat3 >& nodes,
 // vector<UbTupleInt2 >& lines, std::vector< std::string >& datanames, std::vector< std::vector< double > >& nodedata)
@@ -276,7 +389,7 @@ string WbWriterVtkXmlBinary::writeLines(const string &filename, vector<UbTupleFl
 //
 //   int bytesPerByteVal      = 4; //==sizeof(int)
 //   int bytesPoints          = 3 /*x1/x2/x3        */ * nofNodes * sizeof(float);
-//   int bytesCellConnectivty = 2 /*nodes per line  */ * nofCells * sizeof(int  );
+//   int bytesCellConnectivity = 2 /*nodes per line  */ * nofCells * sizeof(int  );
 //   int bytesCellOffsets     = 1 /*offset per line */ * nofCells * sizeof(int  );
 //   int bytesCellTypes       = 1 /*type of line    */ * nofCells * sizeof(unsigned char);
 //   int bytesScalarData      = 1 /*scalar          */ * nofNodes * sizeof(float);
@@ -296,7 +409,7 @@ string WbWriterVtkXmlBinary::writeLines(const string &filename, vector<UbTupleFl
 //   //CELLS SECTION
 //   out<<"         <Cells>\n";
 //   out<<"            <DataArray type=\"Int32\" Name=\"connectivity\" format=\"appended\" offset=\""<< offset <<"\"
-//   />\n"; offset += (bytesPerByteVal + bytesCellConnectivty); out<<"            <DataArray type=\"Int32\"
+//   />\n"; offset += (bytesPerByteVal + bytesCellConnectivity); out<<"            <DataArray type=\"Int32\"
 //   Name=\"offsets\" format=\"appended\" offset=\""<< offset <<"\" />\n"; offset += (bytesPerByteVal +
 //   bytesCellOffsets); out<<"            <DataArray type=\"UInt8\" Name=\"types\" format=\"appended\" offset=\""<<
 //   offset <<"\" />\n "; offset += (bytesPerByteVal + bytesCellTypes); out<<"         </Cells>\n";
@@ -328,7 +441,7 @@ string WbWriterVtkXmlBinary::writeLines(const string &filename, vector<UbTupleFl
 //
 //   //CELLS SECTION
 //   //cellConnectivity
-//   out.write( (char*)&bytesCellConnectivty, bytesPerByteVal );
+//   out.write( (char*)&bytesCellConnectivity, bytesPerByteVal );
 //   for(int c=0; c<nofCells; c++)
 //   {
 //      out.write( (char*)&val<1>(lines[c]), sizeof(int) );
@@ -397,7 +510,7 @@ string WbWriterVtkXmlBinary::writeTriangles(const string &filename, vector<UbTup
 
     int bytesPerByteVal      = 4; //==sizeof(int)
     int bytesPoints          = 3 /*x1/x2/x3 - coord    */ * nofNodes * sizeof(float);
-    int bytesCellConnectivty = 3 /*nodes per triangle  */ * nofCells * sizeof(int);
+    int bytesCellConnectivity = 3 /*nodes per triangle  */ * nofCells * sizeof(int);
     int bytesCellOffsets     = 1 /*offset per triangle */ * nofCells * sizeof(int);
     int bytesCellTypes       = 1 /*type of triangle    */ * nofCells * sizeof(unsigned char);
 
@@ -421,7 +534,7 @@ string WbWriterVtkXmlBinary::writeTriangles(const string &filename, vector<UbTup
     out << "         <Cells>\n";
     out << "            <DataArray type=\"Int32\" Name=\"connectivity\" format=\"appended\" offset=\"" << offset
         << "\" />\n";
-    offset += (bytesPerByteVal + bytesCellConnectivty);
+    offset += (bytesPerByteVal + bytesCellConnectivity);
     out << "            <DataArray type=\"Int32\" Name=\"offsets\" format=\"appended\" offset=\"" << offset
         << "\" />\n";
     offset += (bytesPerByteVal + bytesCellOffsets);
@@ -446,7 +559,7 @@ string WbWriterVtkXmlBinary::writeTriangles(const string &filename, vector<UbTup
 
     // CELLS SECTION
     // cellConnectivity
-    out.write((char *)&bytesCellConnectivty, bytesPerByteVal);
+    out.write((char *)&bytesCellConnectivity, bytesPerByteVal);
     for (int c = 0; c < nofCells; c++) {
         out.write((char *)&val<1>(triangles[c]), sizeof(int));
         out.write((char *)&val<2>(triangles[c]), sizeof(int));
@@ -502,7 +615,7 @@ string WbWriterVtkXmlBinary::writeTrianglesWithNodeData(const string &filename,
 
     int bytesPerByteVal      = 4; //==sizeof(int)
     int bytesPoints          = 3 /*x1/x2/x3        */ * nofNodes * sizeof(float);
-    int bytesCellConnectivty = 3 /*nodes per tri   */ * nofCells * sizeof(int);
+    int bytesCellConnectivity = 3 /*nodes per tri   */ * nofCells * sizeof(int);
     int bytesCellOffsets     = 1 /*offset per tri  */ * nofCells * sizeof(int);
     int bytesCellTypes       = 1 /*type of tri     */ * nofCells * sizeof(unsigned char);
     int bytesScalarData      = 1 /*scalar          */ * nofNodes * sizeof(float);
@@ -527,7 +640,7 @@ string WbWriterVtkXmlBinary::writeTrianglesWithNodeData(const string &filename,
     out << "         <Cells>\n";
     out << "            <DataArray type=\"Int32\" Name=\"connectivity\" format=\"appended\" offset=\"" << offset
         << "\" />\n";
-    offset += (bytesPerByteVal + bytesCellConnectivty);
+    offset += (bytesPerByteVal + bytesCellConnectivity);
     out << "            <DataArray type=\"Int32\" Name=\"offsets\" format=\"appended\" offset=\"" << offset
         << "\" />\n";
     offset += (bytesPerByteVal + bytesCellOffsets);
@@ -561,7 +674,7 @@ string WbWriterVtkXmlBinary::writeTrianglesWithNodeData(const string &filename,
 
     // CELLS SECTION
     // cellConnectivity
-    out.write((char *)&bytesCellConnectivty, bytesPerByteVal);
+    out.write((char *)&bytesCellConnectivity, bytesPerByteVal);
     for (int c = 0; c < nofCells; c++) {
         out.write((char *)&val<1>(cells[c]), sizeof(int));
         out.write((char *)&val<2>(cells[c]), sizeof(int));
@@ -625,7 +738,7 @@ string WbWriterVtkXmlBinary::writeQuads(const string &filename, vector<UbTupleFl
 
     int bytesPerByteVal      = 4; //==sizeof(int)
     int bytesPoints          = 3 /*x1/x2/x3        */ * nofNodes * sizeof(float);
-    int bytesCellConnectivty = 4 /*nodes per quad  */ * nofCells * sizeof(int);
+    int bytesCellConnectivity = 4 /*nodes per quad  */ * nofCells * sizeof(int);
     int bytesCellOffsets     = 1 /*offset per quad */ * nofCells * sizeof(int);
     int bytesCellTypes       = 1 /*type of quad    */ * nofCells * sizeof(unsigned char);
 
@@ -649,7 +762,7 @@ string WbWriterVtkXmlBinary::writeQuads(const string &filename, vector<UbTupleFl
     out << "         <Cells>\n";
     out << "            <DataArray type=\"Int32\" Name=\"connectivity\" format=\"appended\" offset=\"" << offset
         << "\" />\n";
-    offset += (bytesPerByteVal + bytesCellConnectivty);
+    offset += (bytesPerByteVal + bytesCellConnectivity);
     out << "            <DataArray type=\"Int32\" Name=\"offsets\" format=\"appended\" offset=\"" << offset
         << "\" />\n";
     offset += (bytesPerByteVal + bytesCellOffsets);
@@ -674,7 +787,7 @@ string WbWriterVtkXmlBinary::writeQuads(const string &filename, vector<UbTupleFl
 
     // CELLS SECTION
     // cellConnectivity
-    out.write((char *)&bytesCellConnectivty, bytesPerByteVal);
+    out.write((char *)&bytesCellConnectivity, bytesPerByteVal);
     for (int c = 0; c < nofCells; c++) {
         out.write((char *)&val<1>(cells[c]), sizeof(int));
         out.write((char *)&val<2>(cells[c]), sizeof(int));
@@ -730,7 +843,7 @@ string WbWriterVtkXmlBinary::writeQuadsWithNodeData(const string &filename, vect
 
     int bytesPerByteVal      = 4; //==sizeof(int)
     int bytesPoints          = 3 /*x1/x2/x3        */ * nofNodes * sizeof(float);
-    int bytesCellConnectivty = 4 /*nodes per quad  */ * nofCells * sizeof(int);
+    int bytesCellConnectivity = 4 /*nodes per quad  */ * nofCells * sizeof(int);
     int bytesCellOffsets     = 1 /*offset per quad */ * nofCells * sizeof(int);
     int bytesCellTypes       = 1 /*type of quad    */ * nofCells * sizeof(unsigned char);
     int bytesScalarData      = 1 /*scalar          */ * nofNodes * sizeof(float);
@@ -755,7 +868,7 @@ string WbWriterVtkXmlBinary::writeQuadsWithNodeData(const string &filename, vect
     out << "         <Cells>\n";
     out << "            <DataArray type=\"Int32\" Name=\"connectivity\" format=\"appended\" offset=\"" << offset
         << "\" />\n";
-    offset += (bytesPerByteVal + bytesCellConnectivty);
+    offset += (bytesPerByteVal + bytesCellConnectivity);
     out << "            <DataArray type=\"Int32\" Name=\"offsets\" format=\"appended\" offset=\"" << offset
         << "\" />\n";
     offset += (bytesPerByteVal + bytesCellOffsets);
@@ -789,7 +902,7 @@ string WbWriterVtkXmlBinary::writeQuadsWithNodeData(const string &filename, vect
 
     // CELLS SECTION
     // cellConnectivity
-    out.write((char *)&bytesCellConnectivty, bytesPerByteVal);
+    out.write((char *)&bytesCellConnectivity, bytesPerByteVal);
     for (int c = 0; c < nofCells; c++) {
         out.write((char *)&val<1>(cells[c]), sizeof(int));
         out.write((char *)&val<2>(cells[c]), sizeof(int));
@@ -855,7 +968,7 @@ string WbWriterVtkXmlBinary::writeQuadsWithCellData(const string &filename, vect
 
     int bytesPerByteVal      = 4; //==sizeof(int)
     int bytesPoints          = 3 /*x1/x2/x3        */ * nofNodes * sizeof(float);
-    int bytesCellConnectivty = 4 /*nodes per quad  */ * nofCells * sizeof(int);
+    int bytesCellConnectivity = 4 /*nodes per quad  */ * nofCells * sizeof(int);
     int bytesCellOffsets     = 1 /*offset per quad */ * nofCells * sizeof(int);
     int bytesCellTypes       = 1 /*type of quad    */ * nofCells * sizeof(unsigned char);
     int bytesScalarData      = 1 /*scalar          */ * nofCells * sizeof(float);
@@ -880,7 +993,7 @@ string WbWriterVtkXmlBinary::writeQuadsWithCellData(const string &filename, vect
     out << "         <Cells>\n";
     out << "            <DataArray type=\"Int32\" Name=\"connectivity\" format=\"appended\" offset=\"" << offset
         << "\" />\n";
-    offset += (bytesPerByteVal + bytesCellConnectivty);
+    offset += (bytesPerByteVal + bytesCellConnectivity);
     out << "            <DataArray type=\"Int32\" Name=\"offsets\" format=\"appended\" offset=\"" << offset
         << "\" />\n";
     offset += (bytesPerByteVal + bytesCellOffsets);
@@ -914,7 +1027,7 @@ string WbWriterVtkXmlBinary::writeQuadsWithCellData(const string &filename, vect
 
     // CELLS SECTION
     // cellConnectivity
-    out.write((char *)&bytesCellConnectivty, bytesPerByteVal);
+    out.write((char *)&bytesCellConnectivity, bytesPerByteVal);
     for (int c = 0; c < nofCells; c++) {
         out.write((char *)&val<1>(cells[c]), sizeof(int));
         out.write((char *)&val<2>(cells[c]), sizeof(int));
@@ -984,7 +1097,7 @@ string WbWriterVtkXmlBinary::writeQuadsWithNodeAndCellData(const string &filenam
 
     int bytesPerByteVal      = 4; //==sizeof(int)
     int bytesPoints          = 3 /*x1/x2/x3        */ * nofNodes * sizeof(float);
-    int bytesCellConnectivty = 4 /*nodes per quad  */ * nofCells * sizeof(int);
+    int bytesCellConnectivity = 4 /*nodes per quad  */ * nofCells * sizeof(int);
     int bytesCellOffsets     = 1 /*offset per quad */ * nofCells * sizeof(int);
     int bytesCellTypes       = 1 /*type of quad    */ * nofCells * sizeof(unsigned char);
     int bytesScalarDataPoint = 1 /*scalar          */ * nofNodes * sizeof(float);
@@ -1010,7 +1123,7 @@ string WbWriterVtkXmlBinary::writeQuadsWithNodeAndCellData(const string &filenam
     out << "         <Cells>\n";
     out << "            <DataArray type=\"Int32\" Name=\"connectivity\" format=\"appended\" offset=\"" << offset
         << "\" />\n";
-    offset += (bytesPerByteVal + bytesCellConnectivty);
+    offset += (bytesPerByteVal + bytesCellConnectivity);
     out << "            <DataArray type=\"Int32\" Name=\"offsets\" format=\"appended\" offset=\"" << offset
         << "\" />\n";
     offset += (bytesPerByteVal + bytesCellOffsets);
@@ -1052,7 +1165,7 @@ string WbWriterVtkXmlBinary::writeQuadsWithNodeAndCellData(const string &filenam
 
     // CELLS SECTION
     // cellConnectivity
-    out.write((char *)&bytesCellConnectivty, bytesPerByteVal);
+    out.write((char *)&bytesCellConnectivity, bytesPerByteVal);
     for (int c = 0; c < nofCells; c++) {
         out.write((char *)&val<1>(cells[c]), sizeof(int));
         out.write((char *)&val<2>(cells[c]), sizeof(int));
@@ -1128,7 +1241,7 @@ string WbWriterVtkXmlBinary::writeOctsWithCellData(const string &filename, vecto
 
     int bytesPerByteVal      = 4; //==sizeof(int)
     int bytesPoints          = 3 /*x1/x2/x3      */ * nofNodes * sizeof(float);
-    int bytesCellConnectivty = 8 /*nodes per oct */ * nofCells * sizeof(int);
+    int bytesCellConnectivity = 8 /*nodes per oct */ * nofCells * sizeof(int);
     int bytesCellOffsets     = 1 /*offset per oct*/ * nofCells * sizeof(int);
     int bytesCellTypes       = 1 /*type of oct   */ * nofCells * sizeof(unsigned char);
     int bytesScalarData      = 1 /*scalar        */ * nofCells * sizeof(float);
@@ -1153,7 +1266,7 @@ string WbWriterVtkXmlBinary::writeOctsWithCellData(const string &filename, vecto
     out << "         <Cells>\n";
     out << "            <DataArray type=\"Int32\" Name=\"connectivity\" format=\"appended\" offset=\"" << offset
         << "\" />\n";
-    offset += (bytesPerByteVal + bytesCellConnectivty);
+    offset += (bytesPerByteVal + bytesCellConnectivity);
     out << "            <DataArray type=\"Int32\" Name=\"offsets\" format=\"appended\" offset=\"" << offset
         << "\" />\n";
     offset += (bytesPerByteVal + bytesCellOffsets);
@@ -1187,7 +1300,7 @@ string WbWriterVtkXmlBinary::writeOctsWithCellData(const string &filename, vecto
 
     // CELLS SECTION
     // cellConnectivity
-    out.write((char *)&bytesCellConnectivty, bytesPerByteVal);
+    out.write((char *)&bytesCellConnectivity, bytesPerByteVal);
     for (int c = 0; c < nofCells; c++) {
         out.write((char *)&val<1>(cells[c]), sizeof(int));
         out.write((char *)&val<2>(cells[c]), sizeof(int));
@@ -1257,7 +1370,7 @@ string WbWriterVtkXmlBinary::writeOctsWithNodeData(const string &filename, vecto
 
     int bytesPerByteVal      = 4; //==sizeof(int)
     int bytesPoints          = 3 /*x1/x2/x3      */ * nofNodes * sizeof(float);
-    int bytesCellConnectivty = 8 /*nodes per oct */ * nofCells * sizeof(int);
+    int bytesCellConnectivity = 8 /*nodes per oct */ * nofCells * sizeof(int);
     int bytesCellOffsets     = 1 /*offset per oct*/ * nofCells * sizeof(int);
     int bytesCellTypes       = 1 /*type of oct   */ * nofCells * sizeof(unsigned char);
     int bytesScalarData      = 1 /*scalar        */ * nofNodes * sizeof(double);
@@ -1282,7 +1395,7 @@ string WbWriterVtkXmlBinary::writeOctsWithNodeData(const string &filename, vecto
     out << "         <Cells>\n";
     out << "            <DataArray type=\"Int32\" Name=\"connectivity\" format=\"appended\" offset=\"" << offset
         << "\" />\n";
-    offset += (bytesPerByteVal + bytesCellConnectivty);
+    offset += (bytesPerByteVal + bytesCellConnectivity);
     out << "            <DataArray type=\"Int32\" Name=\"offsets\" format=\"appended\" offset=\"" << offset
         << "\" />\n";
     offset += (bytesPerByteVal + bytesCellOffsets);
@@ -1316,7 +1429,7 @@ string WbWriterVtkXmlBinary::writeOctsWithNodeData(const string &filename, vecto
 
     // CELLS SECTION
     // cellConnectivity
-    out.write((char *)&bytesCellConnectivty, bytesPerByteVal);
+    out.write((char *)&bytesCellConnectivity, bytesPerByteVal);
     for (int c = 0; c < nofCells; c++) {
         out.write((char *)&val<1>(cells[c]), sizeof(int));
         out.write((char *)&val<2>(cells[c]), sizeof(int));
@@ -1386,7 +1499,7 @@ string WbWriterVtkXmlBinary::writeOcts(const string &filename, vector<UbTupleFlo
 
     int bytesPerByteVal      = 4; //==sizeof(int)
     int bytesPoints          = 3 /*x1/x2/x3      */ * nofNodes * sizeof(float);
-    int bytesCellConnectivty = 8 /*nodes per oct */ * nofCells * sizeof(int);
+    int bytesCellConnectivity = 8 /*nodes per oct */ * nofCells * sizeof(int);
     int bytesCellOffsets     = 1 /*offset per oct*/ * nofCells * sizeof(int);
     int bytesCellTypes       = 1 /*type of oct   */ * nofCells * sizeof(unsigned char);
     // int bytesScalarData      = 1 /*scalar        */ * nofNodes * sizeof(float);
@@ -1411,7 +1524,7 @@ string WbWriterVtkXmlBinary::writeOcts(const string &filename, vector<UbTupleFlo
     out << "         <Cells>\n";
     out << "            <DataArray type=\"Int32\" Name=\"connectivity\" format=\"appended\" offset=\"" << offset
         << "\" />\n";
-    offset += (bytesPerByteVal + bytesCellConnectivty);
+    offset += (bytesPerByteVal + bytesCellConnectivity);
     out << "            <DataArray type=\"Int32\" Name=\"offsets\" format=\"appended\" offset=\"" << offset
         << "\" />\n";
     offset += (bytesPerByteVal + bytesCellOffsets);
@@ -1436,7 +1549,7 @@ string WbWriterVtkXmlBinary::writeOcts(const string &filename, vector<UbTupleFlo
 
     // CELLS SECTION
     // cellConnectivity
-    out.write((char *)&bytesCellConnectivty, bytesPerByteVal);
+    out.write((char *)&bytesCellConnectivity, bytesPerByteVal);
     for (int c = 0; c < nofCells; c++) {
         out.write((char *)&val<1>(cells[c]), sizeof(int));
         out.write((char *)&val<2>(cells[c]), sizeof(int));
@@ -1491,7 +1604,7 @@ std::string WbWriterVtkXmlBinary::writeNodes(const std::string &filename, std::v
 
     int bytesPerByteVal      = 4; //==sizeof(int)
     int bytesPoints          = 3 /*x1/x2/x3        */ * nofNodes * sizeof(float);
-    int bytesCellConnectivty = 1 /*nodes per cell  */ * nofNodes * sizeof(int);
+    int bytesCellConnectivity = 1 /*nodes per cell  */ * nofNodes * sizeof(int);
     int bytesCellOffsets     = 1 /*offset per cell */ * nofNodes * sizeof(int);
     int bytesCellTypes       = 1 /*type of line    */ * nofNodes * sizeof(unsigned char);
 
@@ -1515,7 +1628,7 @@ std::string WbWriterVtkXmlBinary::writeNodes(const std::string &filename, std::v
     out << "         <Cells>\n";
     out << "            <DataArray type=\"Int32\" Name=\"connectivity\" format=\"appended\" offset=\"" << offset
         << "\" />\n";
-    offset += (bytesPerByteVal + bytesCellConnectivty);
+    offset += (bytesPerByteVal + bytesCellConnectivity);
     out << "            <DataArray type=\"Int32\" Name=\"offsets\" format=\"appended\" offset=\"" << offset
         << "\" />\n";
     offset += (bytesPerByteVal + bytesCellOffsets);
@@ -1540,7 +1653,7 @@ std::string WbWriterVtkXmlBinary::writeNodes(const std::string &filename, std::v
 
     // CELLS SECTION
     // cellConnectivity
-    out.write((char *)&bytesCellConnectivty, bytesPerByteVal);
+    out.write((char *)&bytesCellConnectivity, bytesPerByteVal);
     for (int c = 0; c < nofNodes; c++)
         out.write((char *)&c, sizeof(int));
 
@@ -1586,7 +1699,7 @@ std::string WbWriterVtkXmlBinary::writeNodesWithNodeData(const std::string &file
 
     int bytesPerByteVal      = 4; //==sizeof(int)
     int bytesPoints          = 3 /*x1/x2/x3       */ * nofNodes * sizeof(float);
-    int bytesCellConnectivty = 1 /*nodes per cell */ * nofNodes * sizeof(int);
+    int bytesCellConnectivity = 1 /*nodes per cell */ * nofNodes * sizeof(int);
     int bytesCellOffsets     = 1 /*offset per cell*/ * nofNodes * sizeof(int);
     int bytesCellTypes       = 1 /*type of oct    */ * nofNodes * sizeof(unsigned char);
     int bytesScalarData      = 1 /*scalar         */ * nofNodes * sizeof(double);
@@ -1611,7 +1724,7 @@ std::string WbWriterVtkXmlBinary::writeNodesWithNodeData(const std::string &file
     out << "         <Cells>\n";
     out << "            <DataArray type=\"Int32\" Name=\"connectivity\" format=\"appended\" offset=\"" << offset
         << "\" />\n";
-    offset += (bytesPerByteVal + bytesCellConnectivty);
+    offset += (bytesPerByteVal + bytesCellConnectivity);
     out << "            <DataArray type=\"Int32\" Name=\"offsets\" format=\"appended\" offset=\"" << offset
         << "\" />\n";
     offset += (bytesPerByteVal + bytesCellOffsets);
@@ -1645,7 +1758,7 @@ std::string WbWriterVtkXmlBinary::writeNodesWithNodeData(const std::string &file
 
     // CELLS SECTION
     // cellConnectivity
-    out.write((char *)&bytesCellConnectivty, bytesPerByteVal);
+    out.write((char *)&bytesCellConnectivity, bytesPerByteVal);
     for (int c = 0; c < nofNodes; c++)
         out.write((char *)&c, sizeof(int));
 
diff --git a/src/basics/basics/writer/WbWriterVtkXmlBinary.h b/src/basics/basics/writer/WbWriterVtkXmlBinary.h
index 421148d90497e3628ed274439c0b2fd7636b7fd2..0f2c31eda81ad0c1975c9715ac1b7fb37a06339b 100644
--- a/src/basics/basics/writer/WbWriterVtkXmlBinary.h
+++ b/src/basics/basics/writer/WbWriterVtkXmlBinary.h
@@ -93,6 +93,9 @@ public:
     // nodedata);
     // FIXME: hides function in base class
 
+    std::string writeLinesWithLineData(const std::string &filename, std::vector<UbTupleFloat3> &nodes, std::vector<UbTupleInt2> &lines,
+                                       std::vector<std::string> &datanames, std::vector<std::vector<float>> &celldata) override;
+
     //////////////////////////////////////////////////////////////////////////
     // triangles
     //                    2
diff --git a/src/basics/basics/writer/WbWriterVtkXmlImageBinary.cpp b/src/basics/basics/writer/WbWriterVtkXmlImageBinary.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..798b55919df9e24dbc71ecfded5fb8a913cff8cf
--- /dev/null
+++ b/src/basics/basics/writer/WbWriterVtkXmlImageBinary.cpp
@@ -0,0 +1,360 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file WbWriterVtkXmlImageBinary.cpp
+//! \ingroup writer
+//! \author Soeren Freudiger, Sebastian Geller, Henry Korb, Henrik Asmuth
+//=======================================================================================
+#include <basics/utilities/UbLogger.h>
+#include <basics/writer/WbWriterVtkXmlImageBinary.h>
+#include <cstring>
+
+using namespace std;
+
+/*===============================================================================*/
+const std::string WbWriterVtkXmlImageBinary::pvdEndTag = "   </Collection>\n</VTKFile>"; // closing tags of a .pvd file
+/*===============================================================================*/
+string WbWriterVtkXmlImageBinary::writeCollection(const string &filename, const vector<string> &filenames,
+                                                  const double &timeStep, const bool &sepGroups)
+{
+    // Writes a collection file (.pvd) that lists all given files for one time step; returns the .pvd path.
+    const string pvdPath = filename + ".pvd";
+    ofstream pvd(pvdPath.c_str());
+    if (!pvd) {
+        pvd.clear(); // reset the stream flags, otherwise the if(!pvd) below would still report a failure
+        const string directory = UbSystem::getPathFromString(pvdPath);
+        if (!directory.empty()) {
+            UbSystem::makeDirectory(directory);
+            pvd.open(pvdPath.c_str());
+        }
+        if (!pvd)
+            throw UbException(UB_EXARGS, "couldn't open file " + pvdPath);
+    }
+
+    // byte order as reported by UbSystem for the writing machine
+    const string byteOrder = UbSystem::isLittleEndian() ? "LittleEndian" : "BigEndian";
+    pvd << "<VTKFile type=\"Collection\" version=\"0.1\" byte_order=\"" << byteOrder << "\" >" << endl;
+    pvd << "   <Collection>" << endl;
+
+    // sepGroups: every file gets its own group (part stays 0); otherwise all files are parts of group 0
+    int groupId = 0;
+    int partId = 0;
+    for (const string &dataFile : filenames) {
+        pvd << "       <DataSet timestep=\"" << timeStep << "\" group=\"" << groupId << "\" part=\"" << partId
+            << "\" file=\"" << dataFile << "\"/>" << endl;
+        if (sepGroups)
+            ++groupId;
+        else
+            ++partId;
+    }
+    pvd << pvdEndTag; // closing tags, written without a trailing newline
+    pvd.close();
+
+    return pvdPath;
+}
+/*===============================================================================*/
+string WbWriterVtkXmlImageBinary::addFilesToCollection(const string &filename, const vector<string> &filenames,
+                                                       const double &timeStep, const bool &sepGroups)
+{
+    // Appends DataSet entries to an existing collection file; writes a new collection if none can be opened.
+    string pvdPath = filename;
+    fstream probe(pvdPath.c_str(), ios::in); // first try the name exactly as given ...
+    if (!probe) {
+        probe.clear();
+        pvdPath += ".pvd"; // ... then with the .pvd extension appended
+        probe.open(pvdPath.c_str(), ios::in);
+        if (!probe)
+            return this->writeCollection(filename, filenames, timeStep, sepGroups);
+    }
+
+    fstream pvd(pvdPath.c_str(), ios::in | ios::out);
+    pvd.seekp(-(int)pvdEndTag.size() - 1, ios_base::end); // position one character before the closing tags
+    int groupId = 0;
+    for (size_t part = 0; part < filenames.size(); ++part) {
+        pvd << "       <DataSet timestep=\"" << timeStep << "\" group=\"" << groupId << "\" part=\"" << part
+            << "\" file=\"" << filenames[part] << "\"/>" << endl;
+        if (sepGroups)
+            ++groupId;
+    }
+    pvd << pvdEndTag; // write the closing tags again behind the new entries
+
+    return pvdPath;
+}
+/*===============================================================================*/
+string WbWriterVtkXmlImageBinary::writeParallelFile(const string &filename, const UbTupleInt6 &wholeExtent,
+                                                    const UbTupleFloat3 &origin, const UbTupleFloat3 &spacing,
+                                                    vector<string> &pieceSources, vector<UbTupleInt6> &pieceExtents,
+                                                    vector<string> &pointDataNames, vector<string> &cellDataNames)
+{
+    // Writes the parallel image-data file (.pvti) that references one piece per entry of pieceSources.
+    const string pvtiPath = filename + ".pvti";
+    UBLOG(logDEBUG1, "WbWriterVtkXmlImageBinary::writeParallelFile to " << pvtiPath << " - start");
+
+    ofstream pvti(pvtiPath.c_str());
+    if (!pvti) {
+        pvti.clear(); // reset the stream flags, otherwise the if(!pvti) below would still report a failure
+        const string directory = UbSystem::getPathFromString(pvtiPath);
+        if (!directory.empty()) {
+            UbSystem::makeDirectory(directory);
+            pvti.open(pvtiPath.c_str());
+        }
+        if (!pvti)
+            throw UbException(UB_EXARGS, "couldn't open file " + pvtiPath);
+    }
+
+    // file header and grid description (the byte order is always written as LittleEndian here)
+    pvti << "<VTKFile type=\"PImageData\" version=\"0.1\" byte_order=\"LittleEndian\">" << "\n";
+    pvti << "  <PImageData " << "WholeExtent=\""
+         << val<1>(wholeExtent) << " " << val<2>(wholeExtent) << " " << val<3>(wholeExtent) << " "
+         << val<4>(wholeExtent) << " " << val<5>(wholeExtent) << " " << val<6>(wholeExtent) << "\" "
+         << "GhostLevel=\"0\" "
+         << "Origin=\"" << val<1>(origin) << " " << val<2>(origin) << " " << val<3>(origin) << "\" "
+         << "Spacing=\"" << val<1>(spacing) << " " << val<2>(spacing) << " " << val<3>(spacing) << "\" "
+         << "> \n";
+
+    // names of the point data arrays (the element is written even if there are none)
+    pvti << "    <PPointData>\n";
+    for (const string &arrayName : pointDataNames)
+        pvti << "      <PDataArray type=\"Float32\" Name=\"" << arrayName << "\"/>\n";
+    pvti << "    </PPointData>\n";
+
+    // names of the cell data arrays (the element is left out if there are none)
+    if (!cellDataNames.empty()) {
+        pvti << "    <PCellData>\n";
+        for (const string &arrayName : cellDataNames)
+            pvti << "      <PDataArray type=\"Float32\" Name=\"" << arrayName << "\"/>\n";
+        pvti << "    </PCellData>\n";
+    }
+
+    // one piece per source file; pieceExtents[s] is read as the extent of pieceSources[s]
+    for (size_t s = 0; s < pieceSources.size(); ++s) {
+        const UbTupleInt6 &pieceExtent = pieceExtents[s];
+        pvti << "    <Piece Extent=\""
+             << val<1>(pieceExtent) << " " << val<2>(pieceExtent) << " " << val<3>(pieceExtent) << " "
+             << val<4>(pieceExtent) << " " << val<5>(pieceExtent) << " " << val<6>(pieceExtent)
+             << "\" Source=\"" << pieceSources[s] << "\"/>\n";
+    }
+
+    // closing tags
+    pvti << "  </PImageData>\n";
+    pvti << "</VTKFile>";
+    pvti << endl;
+    pvti.close();
+    UBLOG(logDEBUG1, "WbWriterVtkXmlImageBinary::writeParallelFile to " << pvtiPath << " - end");
+
+    return pvtiPath;
+}
+/*===============================================================================*/
+string WbWriterVtkXmlImageBinary::writeOctsWithCellData(const string &filename, vector<UbTupleFloat3> &nodes,
+                                                        vector<UbTupleInt8> & /*cells*/, vector<string> &datanames,
+                                                        vector<vector<double>> &celldata)
+{
+    // Writes cell data only; the image layout is derived from the nodes and the cell list is not used.
+    const string imageFile = filename + getFileExtension();
+    UBLOG(logDEBUG1, "WbWriterVtkXmlImageBinary::writeOctsWithCellData to " << imageFile << " - start");
+
+    UbTupleInt6 gridExtent;
+    UbTupleFloat3 gridOrigin, gridSpacing;
+    getMetaDataOfImage(nodes, gridOrigin, gridSpacing, gridExtent);
+
+    vector<string> noPointDataNames; // empty: this file carries no point data
+    vector<vector<double>> noPointData;
+    this->writeData(imageFile, noPointDataNames, datanames, noPointData, celldata, gridExtent, gridOrigin,
+                    gridSpacing, gridExtent); // the same extent serves as whole extent and piece extent
+    UBLOG(logDEBUG1, "WbWriterVtkXmlImageBinary::writeOctsWithCellData to " << imageFile << " - end");
+
+    return imageFile;
+}
+/*===============================================================================*/
+string WbWriterVtkXmlImageBinary::writeOctsWithNodeData(const string &filename, vector<UbTupleFloat3> &nodes,
+                                                        vector<UbTupleUInt8> & /*cells*/, vector<string> &datanames,
+                                                        vector<vector<double>> &nodedata)
+{
+    // Writes node data only; the image layout is derived from the nodes and the cell list is not used.
+    const string imageFile = filename + getFileExtension();
+    UBLOG(logDEBUG1, "WbWriterVtkXmlImageBinary::writeOctsWithNodeData to " << imageFile << " - start");
+
+    UbTupleInt6 gridExtent;
+    UbTupleFloat3 gridOrigin, gridSpacing;
+    getMetaDataOfImage(nodes, gridOrigin, gridSpacing, gridExtent);
+
+    vector<string> noCellDataNames; // empty: this file carries no cell data
+    vector<vector<double>> noCellData;
+    this->writeData(imageFile, datanames, noCellDataNames, nodedata, noCellData, gridExtent, gridOrigin,
+                    gridSpacing, gridExtent); // the same extent serves as whole extent and piece extent
+
+    UBLOG(logDEBUG1, "WbWriterVtkXmlImageBinary::writeOctsWithNodeData to " << imageFile << " - end");
+
+    return imageFile;
+}
+/*===============================================================================*/
+string WbWriterVtkXmlImageBinary::writeNodesWithNodeData(const string &filename, vector<UbTupleFloat3> &nodes,
+                                                         vector<string> &datanames, vector<vector<double>> &nodedata)
+{
+    // Writes node data only; the image layout (origin, spacing, extent) is derived from the nodes.
+    const string imageFile = filename + getFileExtension();
+    UBLOG(logDEBUG1, "WbWriterVtkXmlImageBinary::writeNodesWithNodeData to " << imageFile << " - start");
+
+    UbTupleInt6 gridExtent;
+    UbTupleFloat3 gridOrigin, gridSpacing;
+    getMetaDataOfImage(nodes, gridOrigin, gridSpacing, gridExtent);
+    vector<string> noCellDataNames; // empty: this file carries no cell data
+    vector<vector<double>> noCellData;
+    this->writeData(imageFile, datanames, noCellDataNames, nodedata, noCellData, gridExtent, gridOrigin,
+                    gridSpacing, gridExtent); // the same extent serves as whole extent and piece extent
+
+    return imageFile;
+}
+
+//! Derives origin, spacing and extent of an image from a list of node coordinates.
+//! The first node is used as origin and the last node as the opposite corner of the image.
+//! \param nodes   node coordinates; at least two nodes are required
+//! \param origin  [out] coordinates of nodes[0]
+//! \param spacing [out] node distance in x, y and z
+//! \param extent  [out] first and last node index per direction (x-min, x-max, y-min, y-max, z-min, z-max)
+//! \throws UbException if fewer than two nodes are passed
+void WbWriterVtkXmlImageBinary::getMetaDataOfImage(vector<UbTupleFloat3> &nodes, UbTupleFloat3 &origin,
+                                                   UbTupleFloat3 &spacing, UbTupleInt6 &extent)
+{
+    // nodes[0] and nodes[1] are read unconditionally below, so shorter lists would be read out of bounds
+    if (nodes.size() < 2)
+        throw UbException(UB_EXARGS, "at least two nodes are needed to derive the image meta data");
+
+    int nofNodes = (int)nodes.size();
+    val<1>(origin) = val<1>(nodes[0]);
+    val<2>(origin) = val<2>(nodes[0]);
+    val<3>(origin) = val<3>(nodes[0]);
+
+    // edge lengths of the image in x and y, measured from the first to the last node
+    float l_x = val<1>(nodes[nofNodes-1])-val<1>(origin);
+    float l_y = val<2>(nodes[nofNodes-1])-val<2>(origin);
+
+    // NOTE(review): presumably the nodes are expected to be ordered x-fastest, then y, then z - confirm.
+    // nx and ny are numbers of intervals (length / spacing), not numbers of nodes; for an x-fastest
+    // ordering nodes[nx] would still lie in the first row - verify the indices used for the y/z spacing.
+    val<1>(spacing) = val<1>(nodes[1])-val<1>(nodes[0]);
+    int nx = (l_x) / val<1>(spacing);
+    val<2>(spacing) = val<2>(nodes[nx])-val<2>(nodes[0]);
+    int ny = (l_y) / val<2>(spacing);
+    val<3>(spacing) = val<3>(nodes[nx*ny])-val<3>(nodes[0]);
+
+    // extent in index space: coordinate of the first / last node divided by the spacing
+    // NOTE(review): a zero spacing in any direction makes these divisions ill-defined - confirm callers never pass such data
+    val<1>(extent) = val<1>(origin) / val<1>(spacing); val<2>(extent) = val<1>(nodes[nofNodes - 1]) / val<1>(spacing);
+    val<3>(extent) = val<2>(origin) / val<2>(spacing); val<4>(extent) = val<2>(nodes[nofNodes - 1]) / val<2>(spacing);
+    val<5>(extent) = val<3>(origin) / val<3>(spacing); val<6>(extent) = val<3>(nodes[nofNodes - 1]) / val<3>(spacing);
+
+}
+
+//! Writes a VTK XML ImageData file with one piece and raw appended Float64 data arrays.
+//! \param vtkfilename    name of the file to write; a missing directory is created
+//! \param pointDataNames names of the point data arrays
+//! \param cellDataNames  names of the cell data arrays
+//! \param nodedata       values of the point data arrays, at least one vector per point data name
+//! \param celldata       values of the cell data arrays, at least one vector per cell data name
+//! \param wholeExtent    written as WholeExtent attribute of the image
+//! \param origin         written as Origin attribute of the image
+//! \param spacing        written as Spacing attribute of the image
+//! \param extent         written as Extent attribute of the piece
+//! \param precision      precision set on the output stream (affects the textual origin and spacing)
+//! \throws UbException if the file cannot be opened or a named array has no values
+void WbWriterVtkXmlImageBinary::writeData(const string &vtkfilename, vector<string> &pointDataNames,
+                                          vector<string> &cellDataNames, vector<vector<double>> &nodedata,
+                                          vector<vector<double>> &celldata, UbTupleInt6 &wholeExtent,
+                                          UbTupleFloat3 &origin, UbTupleFloat3 &spacing, UbTupleInt6 &extent,
+                                          unsigned int precision)
+{
+    // every named array is indexed in the value containers below, so both must be long enough
+    if (nodedata.size() < pointDataNames.size())
+        throw UbException(UB_EXARGS, "fewer point data arrays than point data names for file " + vtkfilename);
+    if (celldata.size() < cellDataNames.size())
+        throw UbException(UB_EXARGS, "fewer cell data arrays than cell data names for file " + vtkfilename);
+
+    ofstream out(vtkfilename.c_str(), ios::out | ios::binary);
+    out.precision(precision);
+
+    if (!out) {
+        out.clear(); // reset the stream flags (otherwise the if(!out) below would still evaluate to true)
+        string path = UbSystem::getPathFromString(vtkfilename);
+        if (path.size() > 0) {
+            UbSystem::makeDirectory(path);
+            out.open(vtkfilename.c_str(), ios::out | ios::binary);
+        }
+        if (!out)
+            throw UbException(UB_EXARGS, "couldn't open file " + vtkfilename);
+    }
+
+    const int bytesPerByteVal = 4; //==sizeof(int)
+
+    // Size in bytes of one appended array. It is computed per array because point and cell arrays
+    // (or arrays of different lengths) do not share one size.
+    auto bytesOfArray = [](const vector<double> &values) { return (int)(values.size() * sizeof(double)); };
+
+    int offset = 0;
+
+    // VTK FILE
+    out << "<?xml version=\"1.0\"?>\n";
+    out << "<VTKFile type=\"ImageData\" version=\"0.1\" byte_order=\"LittleEndian\" >"
+        << "\n";
+    out << "   <ImageData "
+            << "WholeExtent=\"" << val<1>(wholeExtent) << " "
+                                << val<2>(wholeExtent) << " "
+                                << val<3>(wholeExtent) << " "
+                                << val<4>(wholeExtent) << " "
+                                << val<5>(wholeExtent) << " "
+                                << val<6>(wholeExtent) << "\" "
+            << "Origin=\""  << val<1>(origin) << " "
+                            << val<2>(origin) << " "
+                            << val<3>(origin) << "\" "
+            << "Spacing=\"" << val<1>(spacing) << " "
+                            << val<2>(spacing) << " "
+                            << val<3>(spacing) << "\""
+        << "> \n";
+    out << "      <Piece Extent=\"" << val<1>(extent) << " "
+                                    << val<2>(extent) << " "
+                                    << val<3>(extent) << " "
+                                    << val<4>(extent) << " "
+                                    << val<5>(extent) << " "
+                                    << val<6>(extent) << "\">\n";
+
+    // DATA SECTION
+    // each array occupies a 4-byte size header followed by its values in the appended block
+    if (pointDataNames.size() > 0) {
+        out << "         <PointData>\n";
+        for (size_t s = 0; s < pointDataNames.size(); ++s) {
+            out << "            <DataArray type=\"Float64\" Name=\"" << pointDataNames[s]
+                << "\" format=\"appended\" offset=\"" << offset << "\" /> \n";
+            offset += (bytesPerByteVal + bytesOfArray(nodedata[s]));
+        }
+        out << "         </PointData>\n";
+    }
+
+    if (cellDataNames.size() > 0) {
+        out << "         <CellData>\n";
+        for (size_t s = 0; s < cellDataNames.size(); ++s) {
+            out << "            <DataArray type=\"Float64\" Name=\"" << cellDataNames[s]
+                << "\" format=\"appended\" offset=\"" << offset << "\" /> \n";
+            offset += (bytesPerByteVal + bytesOfArray(celldata[s]));
+        }
+        out << "         </CellData>\n";
+    }
+
+    out << "      </Piece>\n";
+    out << "   </ImageData>\n";
+
+    // AppendedData SECTION
+    out << "   <AppendedData encoding=\"raw\">\n";
+    out << "_";
+
+    // DATA SECTION
+    // pointData
+    for (size_t s = 0; s < pointDataNames.size(); ++s) {
+        const int bytesScalarData = bytesOfArray(nodedata[s]);
+        out.write((const char *)&bytesScalarData, bytesPerByteVal);
+        out.write((const char *)nodedata[s].data(), (std::streamsize)(nodedata[s].size() * sizeof(double)));
+    }
+
+    // cellData
+    for (size_t s = 0; s < cellDataNames.size(); ++s) {
+        const int bytesScalarData = bytesOfArray(celldata[s]);
+        out.write((const char *)&bytesScalarData, bytesPerByteVal);
+        out.write((const char *)celldata[s].data(), (std::streamsize)(celldata[s].size() * sizeof(double)));
+    }
+    out << "\n   </AppendedData>\n";
+    out << "</VTKFile>";
+    out << endl;
+    out.close();
+}
diff --git a/src/basics/basics/writer/WbWriterVtkXmlImageBinary.h b/src/basics/basics/writer/WbWriterVtkXmlImageBinary.h
new file mode 100644
index 0000000000000000000000000000000000000000..c41ff442732e5f65db0f1dd1ec63e5c3ffca1486
--- /dev/null
+++ b/src/basics/basics/writer/WbWriterVtkXmlImageBinary.h
@@ -0,0 +1,110 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file WbWriterVtkXmlImageBinary.h
+//! \ingroup writer
+//! \author Soeren Freudiger, Sebastian Geller
+//=======================================================================================
+#ifndef WBWRITERVTKXMLIMAGEBINARY_H
+#define WBWRITERVTKXMLIMAGEBINARY_H
+
+#include <string>
+
+#include <basics/writer/WbWriter.h>
+
+#include "basics_export.h"
+
+//! \brief Singleton writer for VTK XML image data files (extension ".bin.vti").
+class BASICS_EXPORT WbWriterVtkXmlImageBinary : public WbWriter
+{
+public:
+    //! \return pointer to the single writer instance, created on first use
+    static WbWriterVtkXmlImageBinary *getInstance()
+    {
+        static WbWriterVtkXmlImageBinary theInstance;
+        return &theInstance;
+    }
+
+    // the writer is a singleton and therefore neither copyable nor assignable
+    WbWriterVtkXmlImageBinary(const WbWriterVtkXmlImageBinary &) = delete;
+    const WbWriterVtkXmlImageBinary &operator=(const WbWriterVtkXmlImageBinary &) = delete;
+
+private:
+    //! Checks the sizes of the fundamental types and throws an UbException on a mismatch.
+    WbWriterVtkXmlImageBinary() : WbWriter()
+    {
+        if (sizeof(unsigned char) != 1) {
+            throw UbException(UB_EXARGS, "machine error char  type mismatch");
+        }
+        if (sizeof(int) != 4) {
+            throw UbException(UB_EXARGS, "machine error int   type mismatch");
+        }
+        if (sizeof(float) != 4) {
+            throw UbException(UB_EXARGS, "machine error float type mismatch");
+        }
+    }
+
+    static const std::string pvdEndTag;
+
+public:
+    //! \return the extension appended to file names by the write methods
+    std::string getFileExtension() override { return ".bin.vti"; }
+
+    // write a metafile
+    std::string writeCollection(const std::string &filename,
+                                const std::vector<std::string> &filenames,
+                                const double &timestep,
+                                const bool &sepGroups);
+    std::string addFilesToCollection(const std::string &filename,
+                                     const std::vector<std::string> &filenames,
+                                     const double &timestep,
+                                     const bool &sepGroups);
+    std::string writeParallelFile(const std::string &filename,
+                                  const UbTupleInt6 &wholeExtent,
+                                  const UbTupleFloat3 &origin,
+                                  const UbTupleFloat3 &spacing,
+                                  std::vector<std::string> &pieceSources,
+                                  std::vector<UbTupleInt6> &pieceExtents,
+                                  std::vector<std::string> &pointDataNames,
+                                  std::vector<std::string> &cellDataNames);
+
+    //////////////////////////////////////////////////////////////////////////
+    // nodes
+    std::string writeNodesWithNodeData(const std::string &filename,
+                                       std::vector<UbTupleFloat3> &nodes,
+                                       std::vector<std::string> &datanames,
+                                       std::vector<std::vector<double>> &nodedata) override;
+
+    //////////////////////////////////////////////////////////////////////////
+    // octs
+    //     7 ---- 6
+    //    /|     /|
+    //   4 +--- 5 |
+    //   | |    | |
+    //   | 3 ---+ 2
+    //   |/     |/
+    //   0 ---- 1
+    std::string writeOctsWithCellData(const std::string &filename,
+                                      std::vector<UbTupleFloat3> &nodes,
+                                      std::vector<UbTupleInt8> &cells,
+                                      std::vector<std::string> &datanames,
+                                      std::vector<std::vector<double>> &celldata) override;
+    std::string writeOctsWithNodeData(const std::string &filename,
+                                      std::vector<UbTupleFloat3> &nodes,
+                                      std::vector<UbTupleUInt8> &cells,
+                                      std::vector<std::string> &datanames,
+                                      std::vector<std::vector<double>> &nodedata) override;
+
+    //! Writes a single image file from the given arrays and explicitly passed image meta data.
+    void writeData(const std::string &vtkfilename,
+                   std::vector<std::string> &pointDataNames,
+                   std::vector<std::string> &cellDataNames,
+                   std::vector<std::vector<double>> &nodedata,
+                   std::vector<std::vector<double>> &celldata,
+                   UbTupleInt6 &wholeExtent,
+                   UbTupleFloat3 &origin,
+                   UbTupleFloat3 &spacing,
+                   UbTupleInt6 &extent,
+                   unsigned int precision = 6);
+
+private:
+    //! Derives origin, spacing and extent of the image from the node coordinates.
+    void getMetaDataOfImage(std::vector<UbTupleFloat3> &nodes,
+                            UbTupleFloat3 &origin,
+                            UbTupleFloat3 &spacing,
+                            UbTupleInt6 &extent);
+};
+
+#endif // WBWRITERVTKXMLIMAGEBINARY_H
diff --git a/src/basics/config/ConfigurationFile.h b/src/basics/config/ConfigurationFile.h
index ef7e7c9f06f94cabb3ba9cbefe95c8ee75736958..4a53f7add85b9c6461fda0bab20fa6656eebc5d3 100644
--- a/src/basics/config/ConfigurationFile.h
+++ b/src/basics/config/ConfigurationFile.h
@@ -64,6 +64,10 @@ public:
    template<class T>
    T getValue(const std::string& key) const;
 
+   //! get the value stored under key; returns defaultValue if the key is not contained
+   template<class T>
+   T getValue(const std::string& key, T defaultValue) const;
+
 private:
    //! the container
    std::map<std::string, std::string> data;
@@ -138,6 +142,19 @@ T ConfigurationFile::getValue(const std::string& key) const
    return x;
 }
 
+//! Looks up key and falls back to defaultValue when the key is not contained.
+//! \return getValue<T>(key) if the key is contained, otherwise defaultValue
+template<class T>
+T ConfigurationFile::getValue(const std::string& key, T defaultValue) const
+{
+   if (!contains(key))
+      return defaultValue;
+
+   return getValue<T>(key);
+}
+
 }
 
 #endif
diff --git a/src/basics/tests/testUtilities.h b/src/basics/tests/testUtilities.h
index c70d9cc5c11633ded6b696d92692e3d4edf8d2ca..57606edc130b0471b957202420cb12859a9cde84 100644
--- a/src/basics/tests/testUtilities.h
+++ b/src/basics/tests/testUtilities.h
@@ -1,6 +1,8 @@
 #ifndef TESTUTILITIES_H
 #define TESTUTILITIES_H
 
+#include <gmock/gmock.h>
+
 inline auto RealEq = [](auto value) {
 #ifdef VF_DOUBLE_ACCURACY
     return testing::DoubleEq(value);
diff --git a/src/gpu/GksGpu/BoundaryConditions/BoundaryCondition.h b/src/gpu/GksGpu/BoundaryConditions/BoundaryCondition.h
index fe4078af95904fa5e1580b54f3aa2edbb006bd3d..9c3bac9c3e2795fa99f339461c6a7f2d16448696 100644
--- a/src/gpu/GksGpu/BoundaryConditions/BoundaryCondition.h
+++ b/src/gpu/GksGpu/BoundaryConditions/BoundaryCondition.h
@@ -47,13 +47,13 @@ struct GKSGPU_EXPORT BoundaryCondition : virtual public BoundaryConditionStruct,
     virtual bool isWall() = 0;
 
     virtual bool isFluxBC();
-    
+
     virtual bool isInsulated();
 
     virtual bool secondCellsNeeded();
 
     virtual void runBoundaryConditionKernel( const SPtr<DataBase> dataBase,
-                                             const Parameters parameters, 
+                                             const Parameters parameters,
                                              const uint level ) = 0;
 
     BoundaryConditionStruct toStruct()
diff --git a/src/gpu/GksGpu/CMakeLists.txt b/src/gpu/GksGpu/CMakeLists.txt
index 5dbc533cc5f45c006c29a12242350f0433518bbf..6db6cbac1ff60c76986c3c22cc8017300d4f71ea 100644
--- a/src/gpu/GksGpu/CMakeLists.txt
+++ b/src/gpu/GksGpu/CMakeLists.txt
@@ -1,6 +1,6 @@
 project(GksGpu LANGUAGES CUDA CXX)
 
-vf_add_library(PRIVATE_LINK basics lbmCuda GksMeshAdapter OpenMP::OpenMP_CXX MPI::MPI_CXX)
+vf_add_library(PRIVATE_LINK basics lbm GksMeshAdapter OpenMP::OpenMP_CXX MPI::MPI_CXX)
 
 target_include_directories(GksGpu PRIVATE "${VF_THIRD_DIR}/cuda_samples/")
 
diff --git a/src/gpu/GksMeshAdapter/CMakeLists.txt b/src/gpu/GksMeshAdapter/CMakeLists.txt
index b9a2d12df4d0bee9396a706c6636b5f4056b2d3a..8ac5e69513eca94710797db1f971b2461336b769 100644
--- a/src/gpu/GksMeshAdapter/CMakeLists.txt
+++ b/src/gpu/GksMeshAdapter/CMakeLists.txt
@@ -1,3 +1,3 @@
 project(GksMeshAdapter LANGUAGES CUDA CXX)
 
-vf_add_library(PRIVATE_LINK basics GridGenerator lbmCuda)
+vf_add_library(PRIVATE_LINK basics GridGenerator lbm)
diff --git a/src/gpu/GridGenerator/TransientBCSetter/TransientBCSetter.cpp b/src/gpu/GridGenerator/TransientBCSetter/TransientBCSetter.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5f3c4ad492b16c09b26acd00a624a54ad65dffda
--- /dev/null
+++ b/src/gpu/GridGenerator/TransientBCSetter/TransientBCSetter.cpp
@@ -0,0 +1,444 @@
+#include "TransientBCSetter.h"
+#include "GridGenerator/grid/Grid.h"
+#include "GridGenerator/grid/BoundaryConditions/BoundaryCondition.h"
+#include <logger/Logger.h>
+
+
+#include <math.h>
+#include <sstream>
+#include <fstream>
+#include <iostream>
+#include <algorithm>
+
+//! Creates the file collection implementation that belongs to the given file type.
+//! \param prefix passed on to the constructor of the collection
+//! \param type   file type selecting the implementation
+//! \return the new collection, or nullptr if no implementation exists for the type
+SPtr<FileCollection> createFileCollection(std::string prefix, FileType type)
+{
+    if(type == FileType::VTK)
+        return std::make_shared<VTKFileCollection>(prefix);
+
+    return nullptr;
+}
+
+//! Creates the reader implementation that belongs to the file type of the given collection.
+//! \param fileCollection collection the reader operates on; may be nullptr
+//! \param readLevel      passed on to the constructor of the reader
+//! \return the new reader, or nullptr if the collection is nullptr or its file type has no reader
+SPtr<TransientBCInputFileReader> createReaderForCollection(SPtr<FileCollection> fileCollection, uint readLevel)
+{
+    // createFileCollection() returns nullptr for unsupported types, so do not dereference blindly
+    if(!fileCollection)
+        return nullptr;
+
+    switch(fileCollection->getFileType())
+    {
+        case FileType::VTK:
+            return std::make_shared<VTKReader>(std::static_pointer_cast<VTKFileCollection>(fileCollection), readLevel);
+        default:
+            return nullptr;
+    }
+}
+
+//! Parses a whitespace separated list of numbers.
+//! The values are extracted directly as T, so integers and doubles are not rounded through float.
+//! \param s text containing the numbers
+//! \return all values that could be read up to the first token that is not a T
+template<typename T>
+std::vector<T> readStringToVector(std::string s)
+{
+    std::vector<T> out;
+    std::stringstream input(s);
+    T num;
+    while(input >> num)
+    {
+        out.push_back(num);
+    }
+    return out;
+}
+
+//! Extracts the text between the first '<' of a line and the next blank.
+//! Without a blank after the '<' the rest of the line is returned.
+//! \param line one line of an XML file
+//! \return the element name found in the line
+std::string readElement(std::string line)
+{
+    const size_t nameBegin = line.find('<') + 1;
+    const size_t nameEnd = line.find(' ', nameBegin);
+    return line.substr(nameBegin, nameEnd - nameBegin);
+}
+
+//! Reads the value of an attribute written as name="value" from a line of an XML file.
+//! Only whole attribute names are matched, e.g. "Extent" does not match inside "WholeExtent".
+//! \param line          one line of an XML file
+//! \param attributeName name of the attribute
+//! \return the text between the quotes, or an empty string if the attribute is not present
+std::string readAttribute(std::string line, std::string attributeName)
+{
+    const std::string key = attributeName + "=\"";
+    const std::string blanks = " \t\r\n";
+
+    size_t keyStart = line.find(key);
+    // skip hits that are only the tail of a longer attribute name
+    while(keyStart != std::string::npos && keyStart > 0 && blanks.find(line[keyStart-1]) == std::string::npos)
+        keyStart = line.find(key, keyStart+1);
+
+    // a missing attribute used to produce an arbitrary substring or an out_of_range exception
+    if(keyStart == std::string::npos)
+        return std::string();
+
+    const size_t attributeStart = keyStart + key.size();
+    // without a closing quote the rest of the line is returned
+    const size_t attributeLen = line.find("\"", attributeStart)-attributeStart;
+    return line.substr(attributeStart, attributeLen);
+}
+
+//! Reads the XML header of the file and stores grid meta data and the data array offsets.
+//! The header is expected line by line in this order: optional xml declaration, VTKFile, ImageData,
+//! Piece, opening data section, one DataArray per quantity, closing data section, </Piece>,
+//! </ImageData>, AppendedData.
+//! \throws std::runtime_error if the file cannot be opened or the header is incomplete
+void VTKFile::readHeader()
+{
+    //TODO make this more flexible
+    std::ifstream file(this->fileName);
+    if(!file)
+        throw std::runtime_error("VTKFile::readHeader(): could not open file " + this->fileName);
+
+    std::string line;
+
+    getline(file, line); // VTKFile
+    if(line.size() > 1 && line[1]=='?') getline(file, line); // ignore first line if xml version
+
+    getline(file, line); // ImageData
+    std::vector<float> origin = readStringToVector<float>(readAttribute(line, "Origin"));
+    std::vector<float> spacing = readStringToVector<float>(readAttribute(line, "Spacing"));
+
+    getline(file, line); // Piece
+    std::vector<int> pieceExtent = readStringToVector<int>(readAttribute(line, "Extent"));
+
+    // the values are indexed below, so make sure all of them were found
+    if(origin.size() < 3 || spacing.size() < 3 || pieceExtent.size() < 6)
+        throw std::runtime_error("VTKFile::readHeader(): incomplete Origin, Spacing or Extent in file " + this->fileName);
+
+    getline(file, line); // PointData
+
+    getline(file, line);
+    while(readElement(line) == "DataArray")
+    {
+        Quantity quant = Quantity();
+        quant.name = readAttribute(line, "Name");
+        quant.offset = std::stoi(readAttribute(line, "offset"));
+        this->quantities.push_back( quant );
+        getline(file, line);
+    }
+    getline(file, line); // </Piece
+    getline(file, line); // </ImageData
+    getline(file, line); // AppendedData
+
+    // tellg() reports -1 if one of the reads above failed, e.g. because the header is truncated
+    const std::streamoff appendedDataStart = file.tellg();
+    if(appendedDataStart < 0)
+        throw std::runtime_error("VTKFile::readHeader(): unexpected end of header in file " + this->fileName);
+
+    int offset = int(appendedDataStart)+sizeof(char)+4; // skip underscore and bytesPerVal
+
+    // turn the offsets relative to the appended data block into absolute file positions
+    for(auto& quantity: this->quantities)
+    {
+        quantity.offset += offset;
+    }
+
+    file.close();
+
+    this->deltaX = spacing[0];
+    this->deltaY = spacing[1];
+    this->deltaZ = spacing[2];
+
+    // the extent holds the first and last index per direction
+    this->nx = pieceExtent[1]-pieceExtent[0]+1;
+    this->ny = pieceExtent[3]-pieceExtent[2]+1;
+    this->nz = pieceExtent[5]-pieceExtent[4]+1;
+
+    this->minX = origin[0]+this->deltaX*pieceExtent[0]; this->maxX = (this->nx-1)*this->deltaX+this->minX;
+    this->minY = origin[1]+this->deltaY*pieceExtent[2]; this->maxY = (this->ny-1)*this->deltaY+this->minY;
+    this->minZ = origin[2]+this->deltaZ*pieceExtent[4]; this->maxZ = (this->nz-1)*this->deltaZ+this->minZ;
+    // printFileInfo();
+
+}
+
+//! Checks whether the first quantity of the file is NaN at one of the given indices.
+//! Only the values starting at the offset of the first quantity are inspected.
+//! \param readIndices indices of the values that are going to be read from the file
+//! \return true if at least one of the addressed values is NaN
+bool VTKFile::markNANs(std::vector<uint> readIndices)
+{
+    if(readIndices.empty() || this->quantities.empty())
+        return false;
+
+    std::ifstream buf(fileName.c_str(), std::ios::in | std::ios::binary);
+
+    // Load all values up to the largest requested index. The buffer has to be sized (not merely
+    // reserved), otherwise no value is ever inspected.
+    const size_t nValues = size_t(*std::max_element(readIndices.begin(), readIndices.end()))+1;
+    std::vector<double> tmp(nValues);
+    buf.seekg(this->quantities[0].offset);
+    buf.read((char*) tmp.data(), sizeof(double)*nValues);
+
+    // values that could not be read from the file are not inspected
+    const size_t nRead = size_t(buf.gcount())/sizeof(double);
+
+    return std::any_of(readIndices.begin(), readIndices.end(),
+                       [&](uint idx){ return idx < nRead && isnan(tmp[idx]); });
+}
+
+//! Reads the values of all quantities from the file into memory and marks the file as loaded.
+void VTKFile::loadFile()
+{
+    std::ifstream input(this->fileName.c_str(), std::ios::in | std::ios::binary);
+
+    for(auto& quantity: this->quantities)
+    {
+        // one double per grid point, stored contiguously starting at the offset of the quantity
+        quantity.values.resize(getNumberOfPoints());
+        char* target = reinterpret_cast<char*>(quantity.values.data());
+        input.seekg(quantity.offset);
+        input.read(target, this->getNumberOfPoints()*sizeof(double));
+    }
+
+    input.close();
+
+    this->loaded = true;
+}
+
+//! Releases the values of all quantities and marks the file as not loaded.
+void VTKFile::unloadFile()
+{
+    for(auto& quantity : this->quantities)
+    {
+        // swapping with an empty temporary also gives up the allocated memory
+        std::vector<double>().swap(quantity.values);
+    }
+    this->loaded = false;
+}
+
+//! Copies selected values of all quantities into a target array, loading the file first if necessary.
+//! \param data          target array; quantity j occupies the range starting at j*numberOfNodes
+//! \param numberOfNodes number of entries per quantity in data
+//! \param readIndices   indices of the values to read, relative to offsetRead
+//! \param writeIndices  indices in data to write to, relative to offsetWrite; determines the number of copied values
+//! \param offsetRead    added to every read index
+//! \param offsetWrite   added to every write index
+void VTKFile::getData(real *data, uint numberOfNodes, const std::vector<uint> &readIndices,
+                      const std::vector<uint> &writeIndices, uint offsetRead, uint offsetWrite)
+{
+    if(!this->loaded)
+        loadFile();
+
+    const size_t pointCount = writeIndices.size();
+
+    for(size_t q=0; q<this->quantities.size(); q++)
+    {
+        const std::vector<double>& source = this->quantities[q].values;
+        real* target = data + q*numberOfNodes;
+
+        for(size_t p=0; p<pointCount; p++)
+            target[offsetWrite+writeIndices[p]] = source[readIndices[p]+offsetRead];
+    }
+}
+
+//! Prints file name, grid size, origin, spacing and the offsets of all quantities to stdout.
+void VTKFile::printFileInfo()
+{
+    printf("file %s with \n nx %i ny %i nz %i \n origin %f %f %f \n spacing %f %f %f \n",
+            fileName.c_str(),
+            nx, ny, nz,
+            minX, minY, minZ,
+            deltaX, deltaY, deltaZ);
+
+    // iterate by reference, only name and offset are needed
+    for(const auto& quantity: this->quantities)
+        printf("\t quantity %s offset %i \n", quantity.name.c_str(), quantity.offset);
+}
+
+
+//! Collects all existing files of the collection, ordered by level, ID and part.
+//! Parts, IDs and levels are counted up from zero until the first file name that cannot be opened.
+void VTKFileCollection::findFiles()
+{
+    for(;;) // levels
+    {
+        std::vector<std::vector<VTKFile>> filesOnThisLevel;
+
+        for(;;) // IDs on this level
+        {
+            std::vector<VTKFile> filesWithThisId;
+
+            for(;;) // parts with this ID
+            {
+                std::string fname = makeFileName((int)files.size(), (int)filesOnThisLevel.size(), (int)filesWithThisId.size());
+                std::ifstream probe(fname);
+                if(!probe.good())
+                    break;
+                filesWithThisId.emplace_back(fname);
+            }
+
+            // an ID without any part marks the end of this level
+            if(filesWithThisId.empty())
+                break;
+
+            VF_LOG_INFO("VTKFileCollection found {} files with ID {} level {}", filesWithThisId.size(), filesOnThisLevel.size(), files.size() );
+            filesOnThisLevel.push_back(filesWithThisId);
+        }
+
+        // a level without any file marks the end of the collection
+        if(filesOnThisLevel.empty())
+            break;
+
+        files.push_back(filesOnThisLevel);
+    }
+
+    if(files.empty())
+        VF_LOG_CRITICAL("VTKFileCollection found no files!");
+}
+    
+//! Copies the four neighbor index lists into the given arrays, starting at writingOffset.
+void TransientBCInputFileReader::getNeighbors(uint* neighbor0PP, uint* neighbor0PM, uint* neighbor0MP, uint* neighbor0MM)
+{
+    std::copy(planeNeighbor0PP.begin(), planeNeighbor0PP.end(), neighbor0PP + writingOffset);
+    std::copy(planeNeighbor0PM.begin(), planeNeighbor0PM.end(), neighbor0PM + writingOffset);
+    std::copy(planeNeighbor0MP.begin(), planeNeighbor0MP.end(), neighbor0MP + writingOffset);
+    std::copy(planeNeighbor0MM.begin(), planeNeighbor0MM.end(), neighbor0MM + writingOffset);
+}
+
+//! Copies the four interpolation weight lists into the given arrays, starting at writingOffset.
+void TransientBCInputFileReader::getWeights(real* _weights0PP, real* _weights0PM, real* _weights0MP, real* _weights0MM)
+{
+    std::copy(weights0PP.begin(), weights0PP.end(), _weights0PP + writingOffset);
+    std::copy(weights0PM.begin(), weights0PM.end(), _weights0PM + writingOffset);
+    std::copy(weights0MP.begin(), weights0MP.end(), _weights0MP + writingOffset);
+    std::copy(weights0MM.begin(), weights0MM.end(), _weights0MM + writingOffset);
+}
+
+
+//! Sizes readIndices, writeIndices and nFile to one entry per level and, within a level, one entry per file ID.
+void VTKReader::initializeIndexVectors()
+{
+    const auto& filesPerLevel = this->fileCollection->files;
+    const size_t numberOfLevels = filesPerLevel.size();
+
+    this->readIndices.resize(numberOfLevels);
+    this->writeIndices.resize(numberOfLevels);
+    this->nFile.resize(numberOfLevels);
+
+    for(size_t lev=0; lev<numberOfLevels; lev++)
+    {
+        const size_t numberOfIds = filesPerLevel[lev].size();
+        this->readIndices[lev].resize(numberOfIds);
+        this->writeIndices[lev].resize(numberOfIds);
+        this->nFile[lev].resize(numberOfIds);
+    }
+}
+
+//! Determines for every grid point the four neighboring points in the precursor files and their weights.
+//! For each point one entry is appended to each of the four weight lists and the four neighbor lists.
+//! Points closer than max_diff to a file point count as exact match and use this single point for all
+//! four neighbors; otherwise the weights are inverse squared distances.
+//! \param coordsY y coordinates of the grid points (x in the file)
+//! \param coordsZ z coordinates of the grid points (y in the file)
+//! \throws std::runtime_error if the neighbors of a point are not found or if markNANs() reports a NaN
+void VTKReader::fillArrays(std::vector<real>& coordsY, std::vector<real>& coordsZ)
+{
+    this->nPoints = (uint)coordsY.size();
+    this->initializeIndexVectors();
+    real max_diff = 1e-4; // maximum distance between point on grid and precursor plane to count as exact match
+    real eps = 1e-7; // small number to avoid division by zero
+    bool perfect_match = true;
+
+    this->weights0PP.reserve(this->nPoints);
+    this->weights0PM.reserve(this->nPoints);
+    this->weights0MP.reserve(this->nPoints);
+    this->weights0MM.reserve(this->nPoints);
+
+    this->planeNeighbor0PP.reserve(this->nPoints);
+    this->planeNeighbor0PM.reserve(this->nPoints);
+    this->planeNeighbor0MP.reserve(this->nPoints);
+    this->planeNeighbor0MM.reserve(this->nPoints);
+
+    for(uint i=0; i<nPoints; i++)
+    {
+
+        real posY = coordsY[i];
+        real posZ = coordsZ[i];
+        bool found0PP = false, found0PM = false, found0MP = false, found0MM = false, foundAll = false;
+
+        uint level = this->readLevel;
+
+        // only the first part of every file ID is searched
+        for(int fileId=0; fileId<(int)this->fileCollection->files[level].size(); fileId++)
+        {
+            VTKFile &file = this->fileCollection->files[level][fileId][0];
+            if(!file.inBoundingBox(posY, posZ, 0.0f)) continue;
+
+            // y in simulation is x in precursor/file, z in simulation is y in precursor/file
+            // simulation -> file: N -> E, S -> W, T -> N, B -> S
+            int idx = file.findNeighborMMM(posY, posZ, 0.f);                            //!> index of nearest WSB neighbor on precursor file
+
+            if(idx!=-1)
+            {
+                // Filter for exact matches
+                if(abs(posY-file.getX(idx)) < max_diff && abs(posZ-file.getY(idx)) < max_diff)
+                {
+                    this->weights0PP.emplace_back(1e6f);
+                    this->weights0PM.emplace_back(0.f);
+                    this->weights0MP.emplace_back(0.f);
+                    this->weights0MM.emplace_back(0.f);
+                    uint writeIdx = this->getWriteIndex(level, fileId, idx);            //!> writeIdx: index on host/device array where precursor value will be written to after loading from file
+                    this->planeNeighbor0PP.push_back(writeIdx);                          //!> neighbor lists mapping where BC kernel should read from on host/device array
+                    this->planeNeighbor0PM.push_back(writeIdx);
+                    this->planeNeighbor0MP.push_back(writeIdx);
+                    this->planeNeighbor0MM.push_back(writeIdx);
+                    found0PP = true;
+                    found0PM = true;
+                    found0MM = true;
+                    found0MP = true;
+                }
+                else
+                {
+                    perfect_match = false;
+                }
+
+                if(!found0MM)
+                {
+                    found0MM = true;
+                    real dy = file.getX(idx)-posY;
+                    real dz = file.getY(idx)-posZ;
+                    this->weights0MM.emplace_back(1.f/(dy*dy+dz*dz+eps));
+                    this->planeNeighbor0MM.emplace_back(getWriteIndex(level, fileId, idx));
+                }
+
+            }
+
+            if(!found0PP) //NT in simulation is EN in precursor
+            {
+                int index = file.findNeighborPPM(posY, posZ, 0.f);
+                if(index!=-1)
+                {
+                    found0PP = true;
+                    real dy = file.getX(index)-posY;
+                    real dz = file.getY(index)-posZ;
+                    this->weights0PP.emplace_back(1.f/(dy*dy+dz*dz+eps));
+                    this->planeNeighbor0PP.emplace_back(getWriteIndex(level, fileId, index));
+                }
+            }
+
+            if(!found0PM) //NB in simulation is ES in precursor
+            {
+                int index = file.findNeighborPMM(posY, posZ, 0.f);
+                if(index!=-1)
+                {
+                    found0PM = true;
+                    real dy = file.getX(index)-posY;
+                    real dz = file.getY(index)-posZ;
+                    this->weights0PM.emplace_back(1.f/(dy*dy+dz*dz+eps));
+                    // the index belongs to the 0PM list; appending it to the 0PP list would leave
+                    // both neighbor lists out of step with their weights
+                    this->planeNeighbor0PM.emplace_back(getWriteIndex(level, fileId, index));
+                }
+            }
+
+            if(!found0MP) //ST in simulation is WN in precursor
+            {
+                int index = file.findNeighborMPM(posY, posZ, 0.f);
+                if(index!=-1)
+                {
+                    found0MP = true;
+                    real dy = file.getX(index)-posY;
+                    real dz = file.getY(index)-posZ;
+                    this->weights0MP.emplace_back(1.f/(dy*dy+dz*dz+eps));
+                    this->planeNeighbor0MP.emplace_back(getWriteIndex(level, fileId, index));
+                }
+            }
+
+            foundAll = found0PP && found0PM && found0MP && found0MM;
+
+            if(foundAll) break;
+        }
+
+        if(!foundAll)
+        {
+            VF_LOG_CRITICAL("Found no matching precursor neighbors for grid point at y={}, z={} \n", posY, posZ);
+            throw std::runtime_error("VTKReader::fillArrays(): Did not find neighbors in the FileCollection for all points");
+        }
+    }
+
+    if(perfect_match)
+        printf("Precursor was a perfect match \n");
+
+
+    // check the first part of every file for NaNs
+    for(size_t level=0; level<this->fileCollection->files.size(); level++){
+        for(size_t id=0; id<this->fileCollection->files[level].size(); id++){
+            if(this->fileCollection->files[level][id][0].markNANs(this->readIndices[level][id]))
+                throw std::runtime_error("Found a NAN in the precursor where a velocity is needed");
+    }}
+}
+
+//! Returns the index on the host/device array that holds the value of a point of a precursor file.
+//! A point requested for the first time is registered: its file index is appended to readIndices
+//! and the next free array index to writeIndices. Later requests for the same point reuse that index.
+//! \param level       level of the file
+//! \param id          ID of the file on this level
+//! \param linearIndex index of the point in the file
+//! \return index on the host/device array the value of the point is written to
+uint VTKReader::getWriteIndex(int level, int id, int linearIndex)
+{
+    std::vector<uint>& fileReadIndices = this->readIndices[level][id];
+    std::vector<uint>& fileWriteIndices = this->writeIndices[level][id];
+
+    // Points are identified by their index in the file, so the lookup has to run over the read
+    // indices; entries of both vectors at the same position belong together.
+    auto it = std::find(fileReadIndices.begin(), fileReadIndices.end(), static_cast<uint>(linearIndex));
+    if(it != fileReadIndices.end())
+        return fileWriteIndices[it-fileReadIndices.begin()];
+
+    const uint writeIdx = this->nPointsRead;
+    fileWriteIndices.push_back(writeIdx);               //!> index on host/device array where value from file will be written to
+    fileReadIndices.push_back(linearIndex);             //!> index in file that will be read from
+    this->nPointsRead++;
+    return writeIdx;
+}
+
+
+//! Copies the precursor values belonging to the given time into data.
+//! Only the files of readLevel are used. When time leaves the z range of the current file of an ID,
+//! the reader switches to the next file, unloads the previous one and starts loading the file after
+//! the next one asynchronously.
+//! \param data          target array, passed on to VTKFile::getData()
+//! \param numberOfNodes number of entries per quantity in data
+//! \param time          time for which the data is requested; compared against the z range of the files
+//! \throws std::runtime_error if there is no further file to switch to
+void VTKReader::getNextData(real* data, uint numberOfNodes, real time)
+{
+    uint level = this->readLevel;
+    for(size_t id=0; id<this->fileCollection->files[level].size(); id++)
+    {
+        size_t numberOfFiles = this->nFile[level][id];
+
+        if(!this->fileCollection->files[level][id][numberOfFiles].inZBounds(time))
+        {
+            numberOfFiles++;
+
+            printf("switching to precursor file no. %zu\n", numberOfFiles);
+            if(numberOfFiles == this->fileCollection->files[level][id].size())
+                throw std::runtime_error("Not enough Precursor Files to read");
+
+            this->fileCollection->files[level][id][numberOfFiles-1].unloadFile();
+            if(numberOfFiles+1<this->fileCollection->files[level][id].size())
+            {
+                VTKFile* nextFile = &this->fileCollection->files[level][id][numberOfFiles+1];
+                if(! nextFile->isLoaded())
+                {
+                    // waiting on a future without shared state is undefined, so only wait for a
+                    // load that has actually been started
+                    if(read.valid())
+                        read.wait();
+                    read = std::async(std::launch::async, [](VTKFile* file){ file->loadFile(); }, nextFile);
+                }
+            }
+        }
+
+        // NOTE(review): the file used below may still be loading in the background from an earlier
+        // switch - confirm that getData() cannot run concurrently with that load
+        VTKFile* file = &this->fileCollection->files[level][id][numberOfFiles];
+
+        int off = file->getClosestIdxZ(time)*file->getNumberOfPointsInXYPlane();
+        file->getData(data, numberOfNodes, this->readIndices[level][id], this->writeIndices[level][id], off, this->writingOffset);
+        this->nFile[level][id] = numberOfFiles;
+    }
+}
diff --git a/src/gpu/GridGenerator/TransientBCSetter/TransientBCSetter.h b/src/gpu/GridGenerator/TransientBCSetter/TransientBCSetter.h
new file mode 100644
index 0000000000000000000000000000000000000000..1663a3ff37ba1bb062647847462d4e364baed93b
--- /dev/null
+++ b/src/gpu/GridGenerator/TransientBCSetter/TransientBCSetter.h
@@ -0,0 +1,201 @@
+#ifndef TRANSIENTBCSETTER_H_
+#define TRANSIENTBCSETTER_H_
+
+#include "Core/DataTypes.h"
+#include <Core/StringUtilities/StringUtil.h>
+#include "PointerDefinitions.h"
+
+#include <string>
+#include <vector>
+#include <math.h>
+#include <sstream>
+#include <future>
+class Grid;
+namespace gg
+{
+    class BoundaryCondition;
+}
+
+
+enum class FileType // format tag reported by FileCollection::getFileType(); VTK is the only value defined here
+{
+    VTK
+};
+
+struct Quantity // one named value array; VTKFile keeps a std::vector of these
+{
+    std::string name;
+    int offset; // NOTE(review): presumably where this array's data starts inside the file - confirm in readHeader()/loadFile()
+    std::vector<double> values;
+};
+
+class VTKFile // one uniform-grid input file: the header is read on construction and the object starts in the not-loaded state
+{
+public:
+    explicit VTKFile(std::string _fileName):
+    fileName(_fileName)
+    {
+        readHeader();
+        this->loaded = false;
+        // printFileInfo();
+    };
+
+    void getData(real* data, uint numberOfNodes, const std::vector<uint>& readIndices, const std::vector<uint>& writeIndices, uint offsetRead, uint offsetWrite);
+    bool markNANs(std::vector<uint> readIndices);
+    bool inBoundingBox(real posX, real posY, real posZ){return  inXBounds(posX) && inYBounds(posY) && inZBounds(posZ); }; // all bounds tests are closed intervals [min, max]
+    bool inXBounds(real posX){ return posX<=maxX && posX>=minX; };
+    bool inYBounds(real posY){ return posY<=maxY && posY>=minY; };
+    bool inZBounds(real posZ){ return posZ<=maxZ && posZ>=minZ; };
+    int findNeighborMMM(real posX, real posY, real posZ){ int idx = getLinearIndex(getIdxM00(posX)  , getIdx0M0(posY)  , getIdx00M(posZ)  ); return (idx>=0) && (idx<nx*ny*nz) ? idx : -1; }; // findNeighborXYZ: per axis M = truncated index, P = that index + 1; -1 if the linear index is outside [0, nx*ny*nz)
+    int findNeighborMMP(real posX, real posY, real posZ){ int idx = getLinearIndex(getIdxM00(posX)  , getIdx0M0(posY)  , getIdx00M(posZ)+1); return (idx>=0) && (idx<nx*ny*nz) ? idx : -1; };
+    int findNeighborMPM(real posX, real posY, real posZ){ int idx = getLinearIndex(getIdxM00(posX)  , getIdx0M0(posY)+1, getIdx00M(posZ)  ); return (idx>=0) && (idx<nx*ny*nz) ? idx : -1; };
+    int findNeighborMPP(real posX, real posY, real posZ){ int idx = getLinearIndex(getIdxM00(posX)  , getIdx0M0(posY)+1, getIdx00M(posZ)+1); return (idx>=0) && (idx<nx*ny*nz) ? idx : -1; };
+    int findNeighborPMM(real posX, real posY, real posZ){ int idx = getLinearIndex(getIdxM00(posX)+1, getIdx0M0(posY)  , getIdx00M(posZ)  ); return (idx>=0) && (idx<nx*ny*nz) ? idx : -1; };
+    int findNeighborPMP(real posX, real posY, real posZ){ int idx = getLinearIndex(getIdxM00(posX)+1, getIdx0M0(posY)  , getIdx00M(posZ)+1); return (idx>=0) && (idx<nx*ny*nz) ? idx : -1; };
+    int findNeighborPPM(real posX, real posY, real posZ){ int idx = getLinearIndex(getIdxM00(posX)+1, getIdx0M0(posY)+1, getIdx00M(posZ)  ); return (idx>=0) && (idx<nx*ny*nz) ? idx : -1; };
+    int findNeighborPPP(real posX, real posY, real posZ){ int idx = getLinearIndex(getIdxM00(posX)+1, getIdx0M0(posY)+1, getIdx00M(posZ)+1); return (idx>=0) && (idx<nx*ny*nz) ? idx : -1; };
+    int getIdxX(int linearIdx){ return linearIdx%nx;}; // getIdxX/Y/Z invert getLinearIndex()
+    int getIdxY(int linearIdx){ return (linearIdx/nx)%ny;};
+    int getIdxZ(int linearIdx){ return linearIdx/(nx*ny); };
+    real getX(int linearIdx){ return getIdxX(linearIdx)*deltaX+minX; }; // coordinate of a grid point: index*delta + min
+    real getY(int linearIdx){ return getIdxY(linearIdx)*deltaY+minY; };
+    real getZ(int linearIdx){ return getIdxZ(linearIdx)*deltaZ+minZ; };
+    int getIdxM00(real posX){ return (posX-minX)/deltaX; }; // (pos-min)/delta truncated to int by the return conversion
+    int getIdx0M0(real posY){ return (posY-minY)/deltaY; };
+    int getIdx00M(real posZ){ return (posZ-minZ)/deltaZ; };
+    int getClosestIdxX(real posX){ int x = round((posX-minX)/deltaX); return x>=nx ? nx-1 : (x<0 ? 0 : x);}; // nearest index, clamped to the valid range [0, nx-1]
+    int getClosestIdxY(real posY){ int y = round((posY-minY)/deltaY); return y>=ny ? ny-1 : (y<0 ? 0 : y);}; // nearest index, clamped to the valid range [0, ny-1]
+    int getClosestIdxZ(real posZ){ int z = round((posZ-minZ)/deltaZ); return z>=nz ? nz-1 : (z<0 ? 0 : z);}; // nearest index, clamped to the valid range [0, nz-1]
+    int getLinearIndex(int idxX, int idxY, int idxZ){ return idxX + nx*(idxY+ny*idxZ); }; // x runs fastest, then y, then z
+    int getNumberOfPointsInXYPlane(){ return nx*ny; }
+    int getNumberOfPointsInYZPlane(){ return ny*nz; }
+    int getNumberOfPointsInXZPlane(){ return nx*nz; }
+    int getNumberOfPoints(){ return nx*ny*nz; }
+    size_t getNumberOfQuantities(){ return quantities.size(); }
+    void loadFile();
+    void unloadFile();
+    bool isLoaded(){return loaded;};
+
+
+private:
+    void readHeader();
+    void printFileInfo();
+
+public:
+
+private:
+    std::string fileName;
+    real minX, maxX, minY, maxY, minZ, maxZ; // bounding box used by the in*Bounds() tests
+    real deltaX, deltaY, deltaZ;             // grid spacing per axis
+    int nx, ny, nz;                          // number of grid points per axis
+    std::vector<Quantity> quantities;
+    bool loaded;                             // set to false by the constructor; reported by isLoaded()
+};
+
+class FileCollection // abstract base for a set of input files that share a file-name prefix
+{
+public:
+    FileCollection(std::string _prefix): 
+    prefix(_prefix){};
+
+    virtual ~FileCollection() = default;
+
+    virtual size_t getNumberOfQuantities() = 0; // implemented by the concrete collection
+
+    virtual FileType getFileType() = 0; // format tag of the concrete collection
+
+protected:
+    std::string prefix; // leading part of every file name (see VTKFileCollection::makeFileName)
+};
+
+
+class VTKFileCollection : public FileCollection // collection of VTKFile objects stored in `files`
+{
+public:
+    VTKFileCollection(std::string _prefix): 
+    FileCollection(_prefix)
+    {
+        findFiles(); // defined in the .cpp; NOTE(review): presumably populates `files` - confirm
+    };
+
+    FileType getFileType() override{ return FileType::VTK; };
+    size_t getNumberOfQuantities() override{ return files[0][0][0].getNumberOfQuantities(); } // NOTE(review): indexes files[0][0][0] unchecked - relies on findFiles() having found at least one file
+    
+
+private:
+    void findFiles();
+    std::string makeFileName(int level, int id, int part) // yields "<prefix>_lev_<level>_ID_<id>_File_<part>.bin.<suffix>"
+    { 
+        return prefix + "_lev_" + StringUtil::toString<int>(level)
+                    + "_ID_" +    StringUtil::toString<int>(id)
+                    + "_File_" +  StringUtil::toString<int>(part) 
+                    + ".bin." + suffix;
+    };
+
+
+public:
+    static const inline std::string suffix = "vti"; // file extension appended by makeFileName()
+    std::vector<std::vector<std::vector<VTKFile>>> files; // VTKReader::getNextData indexes this as files[level][id][fileNumber]
+};
+
+
+class TransientBCInputFileReader // abstract reader interface; VTKReader is the implementation declared in this header
+{
+public:
+    TransientBCInputFileReader()
+    { 
+        this->nPoints = 0; 
+        this->nPointsRead = 0;
+        this->writingOffset = 0;        
+    };
+    virtual ~TransientBCInputFileReader() = default;
+
+    virtual void getNextData(real* data, uint numberOfNodes, real time)=0; // writes the values for `time` into `data`
+    virtual void fillArrays(std::vector<real>& coordsY, std::vector<real>& coordsZ)=0;
+    uint getNPoints(){return nPoints; };
+    uint getNPointsRead(){return nPointsRead; };
+    size_t getNumberOfQuantities(){ return nQuantities; };
+    void setWritingOffset(uint offset){ this->writingOffset = offset; } // VTKReader forwards this value as offsetWrite to VTKFile::getData
+    void getNeighbors(uint* neighbor0PP, uint* neighbor0PM, uint* neighbor0MP, uint* neighbor0MM); // NOTE(review): presumably copies planeNeighbor* into the caller's arrays - defined in the .cpp
+    void getWeights(real* _weights0PP, real* _weights0PM, real* _weights0MP, real* _weights0MM);   // NOTE(review): presumably copies weights* into the caller's arrays - defined in the .cpp
+
+public:
+    std::vector<uint> planeNeighbor0PP,  planeNeighbor0PM, planeNeighbor0MP, planeNeighbor0MM;
+    std::vector<real> weights0PP, weights0PM, weights0MP,  weights0MM;
+
+protected:
+    uint nPoints, nPointsRead, writingOffset; // all three are set to 0 by the constructor
+    uint nReads=0;
+    size_t nQuantities=0; // overwritten by VTKReader's constructor from its file collection
+};
+
+
+class VTKReader : public TransientBCInputFileReader // reads one refinement level (readLevel) of a VTKFileCollection
+{
+public:
+    VTKReader(SPtr<VTKFileCollection> _fileCollection, uint _readLevel):
+    fileCollection(_fileCollection), 
+    readLevel(_readLevel)
+    {
+        this->nQuantities = fileCollection->getNumberOfQuantities();
+        read = std::async([](){}); // seed `read` with a no-op task so the future is valid: getNextData() calls read.wait() before scheduling a file load
+    };
+    void getNextData(real* data, uint numberOfNodes, real time) override;
+    void fillArrays(std::vector<real>& coordsY, std::vector<real>& coordsZ) override;
+private:  
+    uint getWriteIndex(int level, int id, int linearIdx);
+    void initializeIndexVectors();
+
+private:
+    std::vector<std::vector<std::vector<uint>>> readIndices, writeIndices; // [level][id] vectors are handed to VTKFile::getData
+    std::vector<std::vector<size_t>> nFile; // nFile[level][id]: number of the file currently read for that level/id
+    SPtr<VTKFileCollection> fileCollection;
+    uint readLevel; // level index used by getNextData() to select files[level]
+    std::future<void> read; // handle of the most recently scheduled asynchronous loadFile() call
+};
+
+
+SPtr<FileCollection> createFileCollection(std::string prefix, FileType type);
+SPtr<TransientBCInputFileReader> createReaderForCollection(SPtr<FileCollection> fileCollection, uint readLevel);
+
+#endif //TRANSIENTBCSETTER_H_
diff --git a/src/gpu/GridGenerator/grid/BoundaryConditions/BoundaryCondition.cpp b/src/gpu/GridGenerator/grid/BoundaryConditions/BoundaryCondition.cpp
index 5102f60fc295aadf4323a4b332bf3dd8f7f21dbf..b0fb2604946b83ead45c30adabbcfe8dc26fa656 100644
--- a/src/gpu/GridGenerator/grid/BoundaryConditions/BoundaryCondition.cpp
+++ b/src/gpu/GridGenerator/grid/BoundaryConditions/BoundaryCondition.cpp
@@ -36,12 +36,12 @@
 
 #include "grid/BoundaryConditions/Side.h"
 #include "grid/Grid.h"
+#include "GridGenerator/TransientBCSetter/TransientBCSetter.h"
 
 bool gg::BoundaryCondition::isSide( SideType side ) const
 {
     return this->side->whoAmI() == side;
 }
-
 //////////////////////////////////////////////////////////////////////////
 
 void VelocityBoundaryCondition::setVelocityProfile(
@@ -124,5 +124,4 @@ void StressBoundaryCondition::fillSamplingIndices(std::vector<SPtr<Grid> > grid,
         this->velocitySamplingIndices.push_back( grid[level]->transCoordToIndex(x_sampling, y_sampling, z_sampling) );
     }
     
-}
-
+}
\ No newline at end of file
diff --git a/src/gpu/GridGenerator/grid/BoundaryConditions/BoundaryCondition.h b/src/gpu/GridGenerator/grid/BoundaryConditions/BoundaryCondition.h
index 4a3990d9f815042297be76ae83a61268c8ad6815..22342aec9839afad9bb37b1b11812f6d1750ed7b 100644
--- a/src/gpu/GridGenerator/grid/BoundaryConditions/BoundaryCondition.h
+++ b/src/gpu/GridGenerator/grid/BoundaryConditions/BoundaryCondition.h
@@ -45,6 +45,8 @@ class Grid;
 class Side;
 enum class SideType;
 
+class TransientBCInputFileReader;
+
 namespace gg
 {
 class BoundaryCondition
@@ -63,6 +65,8 @@ public:
     bool isSide(SideType side) const;
 
     real getQ(uint index, uint dir) { return this->qs[index][dir]; }
+
+    void getCoords( SPtr<Grid> grid, std::vector<real>& x, std::vector<real>& y, std::vector<real>& z);
 };
 
 }
@@ -246,6 +250,7 @@ public:
     real getVy(uint index) { return this->vyList[index]; }
     real getVz(uint index) { return this->vzList[index]; }
 
+
     void setVelocityProfile( SPtr<Grid> grid, std::function<void(real,real,real,real&,real&,real&)> velocityProfile );
 };
 
@@ -329,5 +334,32 @@ public:
     real getNormalz(uint index) { return this->normalZList[index]; }
 };
 
+class PrecursorBoundaryCondition : public gg::BoundaryCondition // boundary condition fed by a TransientBCInputFileReader
+{
+public:
+    static SPtr<PrecursorBoundaryCondition> make(SPtr<TransientBCInputFileReader> reader, int timeStepsBetweenReads, real velocityX, real velocityY, real velocityZ) // factory: the constructor is private
+    {
+        return SPtr<PrecursorBoundaryCondition>(new PrecursorBoundaryCondition(reader, timeStepsBetweenReads, velocityX, velocityY, velocityZ));
+    }
 
+    SPtr<TransientBCInputFileReader> getReader(){ return reader; }
+    real getVelocityX() { return velocityX; }
+    real getVelocityY() { return velocityY; }
+    real getVelocityZ() { return velocityZ; }
+
+private:
+    // Initializers are listed in member declaration order (timeStepsBetweenReads, velocities, reader):
+    // that is the order C++ initializes members in, and listing them otherwise triggers -Wreorder.
+    PrecursorBoundaryCondition(SPtr<TransientBCInputFileReader> _reader, uint _timeStepsBetweenReads, real vx, real vy, real vz) : timeStepsBetweenReads(_timeStepsBetweenReads), velocityX(vx), velocityY(vy), velocityZ(vz), reader(_reader) { }
+    //! \return the type tag of this boundary condition; it reports itself as a velocity BC
+    virtual char getType() const override { return vf::gpu::BC_VELOCITY; }
+public:
+    uint timeStepsBetweenReads; //!< read data every nth timestep
+
+private:
+    real velocityX = 0.0;
+    real velocityY = 0.0;
+    real velocityZ = 0.0;
+    SPtr<TransientBCInputFileReader> reader;
+};
 #endif
\ No newline at end of file
diff --git a/src/gpu/GridGenerator/grid/BoundaryConditions/Side.cpp b/src/gpu/GridGenerator/grid/BoundaryConditions/Side.cpp
index 6c7bf8ca1853826d83fb6a713ffe03716bd2cf9a..ba4eea50ffb6bc136528db31207274d626fe9b15 100644
--- a/src/gpu/GridGenerator/grid/BoundaryConditions/Side.cpp
+++ b/src/gpu/GridGenerator/grid/BoundaryConditions/Side.cpp
@@ -1,28 +1,28 @@
 //=======================================================================================
-// ____          ____    __    ______     __________   __      __       __        __         
-// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |        
-//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |        
-//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |        
-//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____    
-//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|   
-//      \    \  |    |   ________________________________________________________________    
-//       \    \ |    |  |  ______________________________________________________________|   
-//        \    \|    |  |  |         __          __     __     __     ______      _______    
-//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)   
-//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______    
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
 //           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
-//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/   
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
 //
-//  This file is part of VirtualFluids. VirtualFluids is free software: you can 
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
 //  redistribute it and/or modify it under the terms of the GNU General Public
-//  License as published by the Free Software Foundation, either version 3 of 
+//  License as published by the Free Software Foundation, either version 3 of
 //  the License, or (at your option) any later version.
-//  
-//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT 
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
-//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 //  for more details.
-//  
+//
 //  You should have received a copy of the GNU General Public License along
 //  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
@@ -37,9 +37,24 @@
 #include "grid/NodeValues.h"
 
 #include "utilities/math/Math.h"
+#include <array>
+#include <cstddef>
+#include <vector>
 
 using namespace gg;
 
+std::array<real, 3> Side::getNormal() const
+{
+    std::array<real, 3> normal = { 0.0, 0.0, 0.0 }; // zero vector if getCoordinate() matches no axis, never indeterminate values
+    if(this->getCoordinate()==X_INDEX)
+        normal[0] = (real)this->getDirection(); // the side's direction goes on the axis selected by getCoordinate()
+    else if(this->getCoordinate()==Y_INDEX)
+        normal[1] = (real)this->getDirection();
+    else if(this->getCoordinate()==Z_INDEX)
+        normal[2] = (real)this->getDirection();
+    return normal;
+}
+
 void Side::addIndices(SPtr<Grid> grid, SPtr<BoundaryCondition> boundaryCondition, std::string coord, real constant,
                       real startInner, real endInner, real startOuter, real endOuter)
 {
@@ -49,11 +64,17 @@ void Side::addIndices(SPtr<Grid> grid, SPtr<BoundaryCondition> boundaryCondition
         {
             const uint index = getIndex(grid, coord, constant, v1, v2);
 
-            if ((index != INVALID_INDEX) && (  grid->getFieldEntry(index) == vf::gpu::FLUID
-                                            || grid->getFieldEntry(index) == vf::gpu::FLUID_CFC
-                                            || grid->getFieldEntry(index) == vf::gpu::FLUID_CFF
-                                            || grid->getFieldEntry(index) == vf::gpu::FLUID_FCC
-                                            || grid->getFieldEntry(index) == vf::gpu::FLUID_FCF ))
+            if(index == INVALID_INDEX)
+                continue;
+
+            // Claim the node for this BC if it is any kind of fluid node, or if it already
+            // carries a boundary condition (overlap of BCs on edge nodes).
+            if (   grid->getFieldEntry(index) == vf::gpu::FLUID
+                                            ||  grid->getFieldEntry(index) == vf::gpu::FLUID_CFC
+                                            ||  grid->getFieldEntry(index) == vf::gpu::FLUID_CFF
+                                            ||  grid->getFieldEntry(index) == vf::gpu::FLUID_FCC
+                                            ||  grid->getFieldEntry(index) == vf::gpu::FLUID_FCF
+                                            ||  grid->nodeHasBC(index) )
             {
                 grid->setFieldEntry(index, boundaryCondition->getType());
                 boundaryCondition->indices.push_back(index);
@@ -64,9 +85,12 @@ void Side::addIndices(SPtr<Grid> grid, SPtr<BoundaryCondition> boundaryCondition
 
                 boundaryCondition->patches.push_back(0);
             }
-
         }
     }
+
+    const auto currentBCSide = this->whoAmI();
+    if(currentBCSide != SideType::GEOMETRY)
+        grid->addBCalreadySet(currentBCSide);
 }
 
 void Side::setPressureNeighborIndices(SPtr<BoundaryCondition> boundaryCondition, SPtr<Grid> grid, const uint index)
@@ -119,50 +143,111 @@ void Side::setStressSamplingIndices(SPtr<BoundaryCondition> boundaryCondition, S
 
 void Side::setQs(SPtr<Grid> grid, SPtr<BoundaryCondition> boundaryCondition, uint index)
 {
-
     std::vector<real> qNode(grid->getEndDirection() + 1);
 
-    for (int dir = 0; dir <= grid->getEndDirection(); dir++)
-    {
-        real x,y,z;
-        grid->transIndexToCoords( index, x, y, z );
+    for (int dir = 0; dir <= grid->getEndDirection(); dir++) {
+        real x, y, z;
+        grid->transIndexToCoords(index, x, y, z);
 
-        real coords[3] = {x,y,z};
+        std::array<real, 3> coords = { x, y, z };
+        std::array<real, 3> neighborCoords = getNeighborCoordinates(grid.get(), coords, (size_t)dir);
 
-        real neighborX = x + grid->getDirection()[dir * DIMENSION + 0] * grid->getDelta();
-        real neighborY = y + grid->getDirection()[dir * DIMENSION + 1] * grid->getDelta();
-        real neighborZ = z + grid->getDirection()[dir * DIMENSION + 2] * grid->getDelta();
+        correctNeighborForPeriodicBoundaries(grid.get(), coords, neighborCoords);
 
-        // correct neighbor coordinates in case of periodic boundaries
-        if( grid->getPeriodicityX() && grid->getFieldEntry( grid->transCoordToIndex( neighborX, y, z ) ) == vf::gpu::STOPPER_OUT_OF_GRID_BOUNDARY )
-        {
-            if( neighborX > x ) neighborX = grid->getFirstFluidNode( coords, 0, grid->getStartX() );
-            else                neighborX = grid->getLastFluidNode ( coords, 0, grid->getEndX() );
-        }
+        const uint neighborIndex = grid->transCoordToIndex(neighborCoords[0], neighborCoords[1], neighborCoords[2]);
 
-        if( grid->getPeriodicityY() && grid->getFieldEntry( grid->transCoordToIndex( x, neighborY, z ) ) == vf::gpu::STOPPER_OUT_OF_GRID_BOUNDARY )
-        {
-            if( neighborY > y ) neighborY = grid->getFirstFluidNode( coords, 1, grid->getStartY() );
-            else                neighborY = grid->getLastFluidNode ( coords, 1, grid->getEndY() );
+        //! Only setting q's that partially point in the Side-normal direction
+        const bool alignedWithNormal = this->isAlignedWithMyNormal(grid.get(), dir);
+        if (grid->isStopperForBC(neighborIndex) && alignedWithNormal) {
+            qNode[dir] = 0.5;
+        } else {
+            qNode[dir] = -1.0;
         }
 
-        if( grid->getPeriodicityZ() && grid->getFieldEntry( grid->transCoordToIndex( x, y, neighborZ ) ) == vf::gpu::STOPPER_OUT_OF_GRID_BOUNDARY )
-        {
-            if( neighborZ > z ) neighborZ = grid->getFirstFluidNode( coords, 2, grid->getStartZ() );
-            else                neighborZ = grid->getLastFluidNode ( coords, 2, grid->getEndZ() );
+        // reset diagonals in case they were set by another bc
+        resetDiagonalsInCaseOfOtherBC(grid.get(), qNode, dir, coords);
+    }
+
+    boundaryCondition->qs.push_back(qNode);
+}
+
+std::array<real, 3> Side::getNeighborCoordinates(Grid *grid, const std::array<real, 3> &coordinates, size_t direction) const
+{
+    std::array<real, 3> shifted = coordinates; // start at the node, then move one grid spacing along `direction`
+    for (size_t axis = 0; axis < 3; axis++) shifted[axis] = coordinates[axis] + grid->getDirection()[direction * DIMENSION + axis] * grid->getDelta();
+    return shifted;
+}
+
+bool Side::neighborNormalToSideIsAStopper(Grid *grid, const std::array<real, 3> &coordinates, SideType side) const
+{
+    // look one node along the lattice direction mapped to `side` and ask the grid whether that node is a stopper for BCs
+    const std::array<real, 3> shifted = getNeighborCoordinates(grid, coordinates, sideToD3Q27.at(side));
+    return grid->isStopperForBC(grid->transCoordToIndex(shifted[0], shifted[1], shifted[2]));
+}
+
+void Side::resetDiagonalsInCaseOfOtherBC(Grid *grid, std::vector<real> &qNode, int dir,
+                                         const std::array<real, 3> &coordinates) const
+{
+    // When to reset a diagonal q to -1:
+    // - it is normal to another boundary condition which was already set
+    // - and it actually is influenced by the other bc:
+    //   We check if its neighbor in the regular direction to the other bc is a stopper. If it is a stopper, it is influenced by the other bc.
+
+    if (qNode[dir] == 0.5 && grid->getBCAlreadySet().size() > 0) {
+        for (int i = 0; i < (int)grid->getBCAlreadySet().size(); i++) {
+            SideType otherDir = grid->getBCAlreadySet()[i];
+
+            // only reset normals for nodes on edges and corners, not on faces
+            if (!neighborNormalToSideIsAStopper(grid, coordinates, otherDir))
+                continue;
+
+            const auto otherNormal = normals.at(otherDir);
+            if (isAlignedWithNormal(grid, dir, otherNormal)) {
+                qNode[dir] = -1.0;
+            }
         }
+    }
+}
 
-        uint neighborIndex = grid->transCoordToIndex( neighborX, neighborY, neighborZ );
-        if( grid->getFieldEntry(neighborIndex) == vf::gpu::STOPPER_OUT_OF_GRID_BOUNDARY ||
-            grid->getFieldEntry(neighborIndex) == vf::gpu::STOPPER_OUT_OF_GRID ||
-            grid->getFieldEntry(neighborIndex) == vf::gpu::STOPPER_SOLID )
-            qNode[dir] = 0.5;
+bool Side::isAlignedWithMyNormal(const Grid *grid, int dir) const
+{
+    // same test as isAlignedWithNormal(), using this side's own normal as the reference
+    return isAlignedWithNormal(grid, dir, this->getNormal());
+}
+
+bool Side::isAlignedWithNormal(const Grid *grid, int dir, const std::array<real, 3> &normal) const
+{
+    const auto first = dir * DIMENSION; // position of the first component of direction `dir` in the grid's direction table
+    const auto projection = normal[0] * grid->getDirection()[first + 0] + normal[1] * grid->getDirection()[first + 1] + normal[2] * grid->getDirection()[first + 2];
+    return projection > 0; // aligned when the dot product of normal and direction is strictly positive
+}
+
+void Side::correctNeighborForPeriodicBoundaries(const Grid *grid, std::array<real, 3>& coords, std::array<real, 3>& neighborCoords) const
+{
+    // correct neighbor coordinates in case of periodic boundaries
+    if (grid->getPeriodicityX() &&
+        grid->getFieldEntry(grid->transCoordToIndex(neighborCoords[0], coords[1], coords[2])) == vf::gpu::STOPPER_OUT_OF_GRID_BOUNDARY) {
+        if (neighborCoords[0] > coords[0])
+            neighborCoords[0] = grid->getFirstFluidNode(coords.data(), 0, grid->getStartX());
         else
-            qNode[dir] = -1.0;
+            neighborCoords[0] = grid->getLastFluidNode(coords.data(), 0, grid->getEndX());
+    }
 
+    if (grid->getPeriodicityY() &&
+        grid->getFieldEntry(grid->transCoordToIndex(coords[0], neighborCoords[1], coords[2])) == vf::gpu::STOPPER_OUT_OF_GRID_BOUNDARY) {
+        if (neighborCoords[1] > coords[1])
+            neighborCoords[1] = grid->getFirstFluidNode(coords.data(), 1, grid->getStartY());
+        else
+            neighborCoords[1] = grid->getLastFluidNode(coords.data(), 1, grid->getEndY());
     }
 
-    boundaryCondition->qs.push_back(qNode);
+    if (grid->getPeriodicityZ() &&
+        grid->getFieldEntry(grid->transCoordToIndex(coords[0], coords[1], neighborCoords[2])) == vf::gpu::STOPPER_OUT_OF_GRID_BOUNDARY) {
+        if (neighborCoords[2] > coords[2])
+            neighborCoords[2] = grid->getFirstFluidNode(coords.data(), 2, grid->getStartZ());
+        else
+            neighborCoords[2] = grid->getLastFluidNode(coords.data(), 2, grid->getEndZ());
+    }
 }
 
 uint Side::getIndex(SPtr<Grid> grid, std::string coord, real constant, real v1, real v2)
@@ -177,7 +262,7 @@ uint Side::getIndex(SPtr<Grid> grid, std::string coord, real constant, real v1,
 }
 
 
-void Geometry::addIndices(std::vector<SPtr<Grid> > grids, uint level, SPtr<BoundaryCondition> boundaryCondition)
+void Geometry::addIndices(const std::vector<SPtr<Grid>> &grids, uint level, SPtr<BoundaryCondition> boundaryCondition)
 {
     auto geometryBoundaryCondition = std::dynamic_pointer_cast<GeometryBoundaryCondition>(boundaryCondition);
 
@@ -190,7 +275,7 @@ void Geometry::addIndices(std::vector<SPtr<Grid> > grids, uint level, SPtr<Bound
 
         for (int dir = 0; dir <= grids[level]->getEndDirection(); dir++)
         {
-			const real q = grids[level]->getQValue(index, dir);
+            const real q = grids[level]->getQValue(index, dir);
 
             qNode[dir] = q;
 
@@ -218,7 +303,7 @@ void Geometry::addIndices(std::vector<SPtr<Grid> > grids, uint level, SPtr<Bound
 
 
 
-void MX::addIndices(std::vector<SPtr<Grid> > grid, uint level, SPtr<BoundaryCondition> boundaryCondition)
+void MX::addIndices(const std::vector<SPtr<Grid>> &grid, uint level, SPtr<BoundaryCondition> boundaryCondition)
 {
     real startInner = grid[level]->getStartY();
     real endInner = grid[level]->getEndY();
@@ -234,7 +319,7 @@ void MX::addIndices(std::vector<SPtr<Grid> > grid, uint level, SPtr<BoundaryCond
 
 }
 
-void PX::addIndices(std::vector<SPtr<Grid> > grid, uint level, SPtr<BoundaryCondition> boundaryCondition)
+void PX::addIndices(const std::vector<SPtr<Grid>> &grid, uint level, SPtr<BoundaryCondition> boundaryCondition)
 {
     real startInner = grid[level]->getStartY();
     real endInner = grid[level]->getEndY();
@@ -249,7 +334,7 @@ void PX::addIndices(std::vector<SPtr<Grid> > grid, uint level, SPtr<BoundaryCond
     Side::addIndices(grid[level], boundaryCondition, "x", coordinateNormal, startInner, endInner, startOuter, endOuter);
 }
 
-void MY::addIndices(std::vector<SPtr<Grid> > grid, uint level, SPtr<BoundaryCondition> boundaryCondition)
+void MY::addIndices(const std::vector<SPtr<Grid>> &grid, uint level, SPtr<BoundaryCondition> boundaryCondition)
 {
     real startInner = grid[level]->getStartX();
     real endInner = grid[level]->getEndX();
@@ -265,7 +350,7 @@ void MY::addIndices(std::vector<SPtr<Grid> > grid, uint level, SPtr<BoundaryCond
 }
 
 
-void PY::addIndices(std::vector<SPtr<Grid> > grid, uint level, SPtr<BoundaryCondition> boundaryCondition)
+void PY::addIndices(const std::vector<SPtr<Grid>> &grid, uint level, SPtr<BoundaryCondition> boundaryCondition)
 {
     real startInner = grid[level]->getStartX();
     real endInner = grid[level]->getEndX();
@@ -281,7 +366,7 @@ void PY::addIndices(std::vector<SPtr<Grid> > grid, uint level, SPtr<BoundaryCond
 }
 
 
-void MZ::addIndices(std::vector<SPtr<Grid> > grid, uint level, SPtr<BoundaryCondition> boundaryCondition)
+void MZ::addIndices(const std::vector<SPtr<Grid>> &grid, uint level, SPtr<BoundaryCondition> boundaryCondition)
 {
     real startInner = grid[level]->getStartX();
     real endInner = grid[level]->getEndX();
@@ -296,7 +381,7 @@ void MZ::addIndices(std::vector<SPtr<Grid> > grid, uint level, SPtr<BoundaryCond
     Side::addIndices(grid[level], boundaryCondition, "z", coordinateNormal, startInner, endInner, startOuter, endOuter);
 }
 
-void PZ::addIndices(std::vector<SPtr<Grid> > grid, uint level, SPtr<BoundaryCondition> boundaryCondition)
+void PZ::addIndices(const std::vector<SPtr<Grid>> &grid, uint level, SPtr<BoundaryCondition> boundaryCondition)
 {
     real startInner = grid[level]->getStartX();
     real endInner = grid[level]->getEndX();
@@ -307,6 +392,6 @@ void PZ::addIndices(std::vector<SPtr<Grid> > grid, uint level, SPtr<BoundaryCond
     real coordinateNormal = grid[level]->getEndZ() - grid[level]->getDelta();
 
     if( coordinateNormal < grid[0]->getEndZ() - grid[0]->getDelta() ) return;
-    
+
     Side::addIndices(grid[level], boundaryCondition, "z", coordinateNormal, startInner, endInner, startOuter, endOuter);
 }
diff --git a/src/gpu/GridGenerator/grid/BoundaryConditions/Side.h b/src/gpu/GridGenerator/grid/BoundaryConditions/Side.h
index 6df6bfccc9a39b80de3ac43d057a03945d035b34..624b3722a1c909ba26063b49565779b924d34adc 100644
--- a/src/gpu/GridGenerator/grid/BoundaryConditions/Side.h
+++ b/src/gpu/GridGenerator/grid/BoundaryConditions/Side.h
@@ -33,10 +33,14 @@
 #ifndef SIDE_H
 #define SIDE_H
 
+#include <cstddef>
 #include <string>
 #include <vector>
+#include <map>
+#include <array>
 
 #include "gpu/GridGenerator/global.h"
+#include "lbm/constants/D3Q27.h"
 
 #define X_INDEX 0
 #define Y_INDEX 1
@@ -59,37 +63,59 @@ enum class SideType
     MX, PX, MY, PY, MZ, PZ, GEOMETRY
 };
 
-
-
 class Side
 {
 public:
     virtual ~Side() = default;
-    virtual void addIndices(std::vector<SPtr<Grid> > grid, uint level, SPtr<gg::BoundaryCondition> boundaryCondition) = 0;
+    virtual void addIndices(const std::vector<SPtr<Grid>> &grid, uint level,
+                            SPtr<gg::BoundaryCondition> boundaryCondition) = 0;
 
     virtual int getCoordinate() const = 0;
     virtual int getDirection() const = 0;
 
     virtual SideType whoAmI() const = 0;
 
+    std::array<real, 3> getNormal() const;
+
 protected:
-    static void addIndices(SPtr<Grid> grid, SPtr<gg::BoundaryCondition> boundaryCondition, std::string coord, real constant,
+    void addIndices(SPtr<Grid> grid, SPtr<gg::BoundaryCondition> boundaryCondition, std::string coord, real constant,
                            real startInner, real endInner, real startOuter, real endOuter);
 
     static void setPressureNeighborIndices(SPtr<gg::BoundaryCondition> boundaryCondition, SPtr<Grid> grid, const uint index);
 
     static void setStressSamplingIndices(SPtr<gg::BoundaryCondition> boundaryCondition, SPtr<Grid> grid, const uint index);
 
-    static void setQs(SPtr<Grid> grid, SPtr<gg::BoundaryCondition> boundaryCondition, uint index);
+    void setQs(SPtr<Grid> grid, SPtr<gg::BoundaryCondition> boundaryCondition, uint index);
+
+    virtual void correctNeighborForPeriodicBoundaries(const Grid *grid, std::array<real, 3>& coords, std::array<real, 3>& neighbors) const;
+
+    virtual bool isAlignedWithMyNormal(const Grid *grid, int dir) const;
+    bool isAlignedWithNormal(const Grid *grid, int dir, const std::array<real, 3>& normal) const;
 
 private:
     static uint getIndex(SPtr<Grid> grid, std::string coord, real constant, real v1, real v2);
+    void resetDiagonalsInCaseOfOtherBC(Grid *grid, std::vector<real>& qNode, int dir, const std::array<real, 3> &coordinates) const;
+    std::array<real, 3> getNeighborCoordinates(Grid *grid, const std::array<real, 3> &coordinates,
+                                               size_t direction) const;
+    bool neighborNormalToSideIsAStopper(Grid *grid, const std::array<real, 3> &coordinates, SideType side) const;
+
+protected:
+    const std::map<SideType, const std::array<real, 3>> normals = {
+        { SideType::MX, { NEGATIVE_DIR, 0.0, 0.0 } }, { SideType::PX, { POSITIVE_DIR, 0.0, 0.0 } },
+        { SideType::MY, { 0.0, NEGATIVE_DIR, 0.0 } }, { SideType::PY, { 0.0, POSITIVE_DIR, 0.0 } },
+        { SideType::MZ, { 0.0, 0.0, NEGATIVE_DIR } }, { SideType::PZ, { 0.0, 0.0, POSITIVE_DIR } }
+    };
+    const std::map<SideType, size_t> sideToD3Q27 = {
+        { SideType::MX, vf::lbm::dir::DIR_M00 }, { SideType::PX, vf::lbm::dir::DIR_P00 },
+        { SideType::MY, vf::lbm::dir::DIR_0M0 }, { SideType::PY, vf::lbm::dir::DIR_0P0 },
+        { SideType::MZ, vf::lbm::dir::DIR_00M }, { SideType::PZ, vf::lbm::dir::DIR_00P }
+    };
 };
 
 class Geometry : public Side
 {
 public:
-    void addIndices(std::vector<SPtr<Grid> > grid, uint level, SPtr<gg::BoundaryCondition> boundaryCondition) override;
+    void addIndices(const std::vector<SPtr<Grid>> &grid, uint level, SPtr<gg::BoundaryCondition> boundaryCondition) override;
 
     int getCoordinate() const override
     {
@@ -110,7 +136,7 @@ public:
 class MX : public Side
 {
 public:
-    void addIndices(std::vector<SPtr<Grid> > grid, uint level, SPtr<gg::BoundaryCondition> boundaryCondition) override;
+    void addIndices(const std::vector<SPtr<Grid>> &grid, uint level, SPtr<gg::BoundaryCondition> boundaryCondition) override;
 
     int getCoordinate() const override
     {
@@ -131,7 +157,7 @@ public:
 class PX : public Side
 {
 public:
-    void addIndices(std::vector<SPtr<Grid> > grid, uint level, SPtr<gg::BoundaryCondition> boundaryCondition) override;
+    void addIndices(const std::vector<SPtr<Grid>> &grid, uint level, SPtr<gg::BoundaryCondition> boundaryCondition) override;
 
     int getCoordinate() const override
     {
@@ -153,7 +179,7 @@ public:
 class MY : public Side
 {
 public:
-    void addIndices(std::vector<SPtr<Grid> > grid, uint level, SPtr<gg::BoundaryCondition> boundaryCondition) override;
+    void addIndices(const std::vector<SPtr<Grid>> &grid, uint level, SPtr<gg::BoundaryCondition> boundaryCondition) override;
 
     int getCoordinate() const override
     {
@@ -174,7 +200,7 @@ public:
 class PY : public Side
 {
 public:
-    void addIndices(std::vector<SPtr<Grid> > grid, uint level, SPtr<gg::BoundaryCondition> boundaryCondition) override;
+    void addIndices(const std::vector<SPtr<Grid>> &grid, uint level, SPtr<gg::BoundaryCondition> boundaryCondition) override;
 
     int getCoordinate() const override
     {
@@ -196,7 +222,7 @@ public:
 class MZ : public Side
 {
 public:
-    void addIndices(std::vector<SPtr<Grid> > grid, uint level, SPtr<gg::BoundaryCondition> boundaryCondition) override;
+    void addIndices(const std::vector<SPtr<Grid>> &grid, uint level, SPtr<gg::BoundaryCondition> boundaryCondition) override;
 
     int getCoordinate() const override
     {
@@ -217,7 +243,7 @@ public:
 class PZ : public Side
 {
 public:
-    void addIndices(std::vector<SPtr<Grid> > grid, uint level, SPtr<gg::BoundaryCondition> boundaryCondition) override;
+    void addIndices(const std::vector<SPtr<Grid>> &grid, uint level, SPtr<gg::BoundaryCondition> boundaryCondition) override;
 
     int getCoordinate() const override
     {
diff --git a/src/gpu/GridGenerator/grid/BoundaryConditions/SideTest.cpp b/src/gpu/GridGenerator/grid/BoundaryConditions/SideTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..36a286a8766db4af7e109eb3f8d47add401779f9
--- /dev/null
+++ b/src/gpu/GridGenerator/grid/BoundaryConditions/SideTest.cpp
@@ -0,0 +1,873 @@
+#include "Side.h"
+#include "PointerDefinitions.h"
+#include "gpu/GridGenerator/grid/BoundaryConditions/BoundaryCondition.h"
+#include "grid/GridImp.h"
+#include "grid/NodeValues.h"
+#include "lbm/constants/D3Q27.h"
+#include "gmock/gmock.h"
+#include <algorithm>
+#include <gtest/gtest.h>
+#include <iostream>
+#include <memory>
+#include <stdexcept>
+#include <vector>
+
+using namespace vf::gpu;
+using namespace vf::lbm::dir;
+
+// Test double for Side. It re-exposes Side::setQs as a public member and lets
+// each test choose what getDirection(), getCoordinate() and whoAmI() return.
+class SideTestSpecificSubclass : public Side
+{
+public:
+    // Values handed back by the overridden queries further down.
+    int sideDirection = POSITIVE_DIR;
+    int coordinateDirection = X_INDEX;
+    SideType mySide = SideType::PX;
+
+    // Thin forwarder to the base-class implementation under test.
+    void setQs(SPtr<Grid> grid, SPtr<gg::BoundaryCondition> boundaryCondition, uint index)
+    {
+        Side::setQs(grid, boundaryCondition, index);
+    }
+
+private:
+    // Deliberately empty in this test double.
+    void correctNeighborForPeriodicBoundaries(const Grid *grid, std::array<real, 3>& coords, std::array<real, 3>& neighbors) const override
+    {
+    }
+
+    // Deliberately empty in this test double.
+    void addIndices(const std::vector<SPtr<Grid>> &grid, uint level, SPtr<gg::BoundaryCondition> boundaryCondition) override
+    {
+    }
+
+    // Returns the configured sideDirection.
+    int getDirection() const override { return sideDirection; }
+
+    // Returns the configured coordinateDirection.
+    int getCoordinate() const override { return coordinateDirection; }
+
+    // Returns the configured mySide.
+    SideType whoAmI() const override { return mySide; }
+};
+
+// Minimal GridImp stand-in whose queries return fixed values; only endDirection is configurable.
+class GridDouble : public GridImp
+{
+public:
+    // Returned by getEndDirection().
+    int endDirection = -1;
+
+    GridDouble()
+    {
+        this->distribution = DistributionHelper::getDistribution27();
+    }
+
+    // Reports the origin for every index.
+    void transIndexToCoords(uint index, real &x, real &y, real &z) const override
+    {
+        x = y = z = 0;
+    }
+
+    // Constant value of one.
+    real getDelta() const override { return 1.0; }
+
+    // Maps every coordinate triple to index 0.
+    uint transCoordToIndex(const real &x, const real &y, const real &z) const override
+    {
+        return 0;
+    }
+
+    // Every entry reads as STOPPER_OUT_OF_GRID_BOUNDARY.
+    char getFieldEntry(uint /*matrixIndex*/) const override
+    {
+        return STOPPER_OUT_OF_GRID_BOUNDARY;
+    }
+
+    int getEndDirection() const override
+    {
+        return endDirection;
+    }
+};
+
+// BoundaryCondition stand-in that lets the tests inspect and clear the stored Qs.
+class BoundaryConditionSpy : public gg::BoundaryCondition
+{
+public:
+    // Arbitrary type tag; no test in this file reads it.
+    char getType() const override { return 't'; }
+
+    // Read-only view of the q vectors held in the inherited `qs` member.
+    const std::vector<std::vector<real>> &getQs()
+    {
+        return this->qs;
+    }
+
+    // Drops all q vectors collected so far.
+    void resetQVector() { this->qs.clear(); }
+};
+
+// Fixture shared by all SideTestBC tests: a configurable side, a stub grid
+// and a boundary-condition spy.
+class SideTestBC : public testing::Test
+{
+protected:
+    SideTestSpecificSubclass side;
+    SPtr<GridDouble> grid{ std::make_shared<GridDouble>() };
+    SPtr<BoundaryConditionSpy> bc{ std::make_shared<BoundaryConditionSpy>() };
+    uint index = 0;
+    // Not referenced by any test in this file.
+    std::vector<real> noBC;
+
+    // Defaults to end direction 26 (27 entries); the 2D tests override it with 10.
+    void SetUp() override { grid->endDirection = 26; }
+};
+
+TEST_F(SideTestBC, setQs2D_whenSettingPX_setAllQsNormalToBC)
+{
+    side.sideDirection = POSITIVE_DIR;
+    side.coordinateDirection = X_INDEX;
+    grid->endDirection = 10;
+    // Act: run setQs for the node and fetch the entry it stored.
+    side.setQs(grid, bc, index);
+    const std::vector<real> computedQs = bc->getQs().front();
+    // Assert: 11 entries (directions 0..10); q = 0.5 on P00, PP0 and PM0, -1 elsewhere.
+    std::vector<real> wantedQs(11, -1);
+    wantedQs[DIR_P00] = 0.5;
+    wantedQs[DIR_PP0] = 0.5;
+    wantedQs[DIR_PM0] = 0.5;
+    EXPECT_THAT(computedQs, testing::Eq(wantedQs));
+}
+
+TEST_F(SideTestBC, setQs2D_givenPYhasBeenSet_thenSetPX_doNotSetSameQsAgain)
+{
+    side.sideDirection = POSITIVE_DIR;
+    side.coordinateDirection = X_INDEX;
+    grid->endDirection = 10;
+    grid->addBCalreadySet(SideType::PY);
+    // Act: run setQs for the node and fetch the entry it stored.
+    side.setQs(grid, bc, index);
+    const std::vector<real> computedQs = bc->getQs().front();
+    // Assert: 11 entries; q = 0.5 on P00 and PM0 only (PP0 is left at -1).
+    std::vector<real> wantedQs(11, -1);
+    wantedQs[DIR_P00] = 0.5;
+    wantedQs[DIR_PM0] = 0.5;
+    EXPECT_THAT(computedQs, testing::Eq(wantedQs));
+}
+
+TEST_F(SideTestBC, setQs3D_givenMXhasBeenSet_thenSetPX_setAllQsNormalToPX)
+{
+    side.coordinateDirection = X_INDEX;
+    side.sideDirection = POSITIVE_DIR;
+
+    // First pass: no BC has been registered on the grid yet.
+
+    side.setQs(grid, bc, index);
+    auto actualQs = bc->getQs()[0];
+
+    std::vector<real> expectedQs(27, -1);
+    expectedQs[DIR_P00] = 0.5;
+    expectedQs[DIR_PP0] = 0.5;
+    expectedQs[DIR_PM0] = 0.5;
+    expectedQs[DIR_P0P] = 0.5;
+    expectedQs[DIR_P0M] = 0.5;
+    expectedQs[DIR_PPP] = 0.5;
+    expectedQs[DIR_PMP] = 0.5;
+    expectedQs[DIR_PPM] = 0.5;
+    expectedQs[DIR_PMM] = 0.5;
+    EXPECT_THAT(actualQs, testing::Eq(expectedQs));
+
+    // Second pass: MX is registered as already set; the expected Qs must not change.
+
+    grid->addBCalreadySet(SideType::MX);
+    // Read the newest entry: index 0 would still be the first pass' result if setQs appends.
+    side.setQs(grid, bc, index);
+    actualQs = bc->getQs().back();
+
+    EXPECT_THAT(actualQs, testing::Eq(expectedQs));
+}
+
+TEST_F(SideTestBC, setQs3D_givenGeometryBCInVector_thenSetPX_throws)
+{
+    // A Geometry BC must not be put into this vector because it has no valid normal.
+    grid->addBCalreadySet(SideType::GEOMETRY);
+    const auto callSetQs = [&] { side.setQs(grid, bc, index); };
+    EXPECT_THROW(callSetQs(), std::out_of_range);
+}
+
+TEST_F(SideTestBC, setQs3D_whenSettingPX_setAllQsNormalToBC)
+{
+    side.sideDirection = POSITIVE_DIR;
+    side.coordinateDirection = X_INDEX;
+    // Act: run setQs for the node and fetch the entry it stored.
+    side.setQs(grid, bc, index);
+    const std::vector<real> computedQs = bc->getQs().front();
+    // Assert: q = 0.5 on all nine +x links; every other entry stays -1.
+    std::vector<real> wantedQs(27, -1);
+    wantedQs[DIR_P00] = 0.5;
+    wantedQs[DIR_PP0] = 0.5;
+    wantedQs[DIR_PM0] = 0.5;
+    wantedQs[DIR_P0P] = 0.5;
+    wantedQs[DIR_P0M] = 0.5;
+    wantedQs[DIR_PPP] = 0.5;
+    wantedQs[DIR_PMP] = 0.5;
+    wantedQs[DIR_PPM] = 0.5;
+    wantedQs[DIR_PMM] = 0.5;
+    EXPECT_THAT(computedQs, testing::Eq(wantedQs));
+}
+
+TEST_F(SideTestBC, setQs3D_givenPYhasBeenSet_thenSetPX_doNotSetSameQsAgain)
+{
+    side.sideDirection = POSITIVE_DIR;
+    side.coordinateDirection = X_INDEX;
+    grid->addBCalreadySet(SideType::PY);
+    // Act: run setQs for the node and fetch the entry it stored.
+    side.setQs(grid, bc, index);
+    const std::vector<real> computedQs = bc->getQs().front();
+    // Assert: q = 0.5 only on the +x links without a +y component; -1 elsewhere.
+    std::vector<real> wantedQs(27, -1);
+    wantedQs[DIR_P00] = 0.5;
+    wantedQs[DIR_PM0] = 0.5;
+    wantedQs[DIR_P0P] = 0.5;
+    wantedQs[DIR_P0M] = 0.5;
+    wantedQs[DIR_PMP] = 0.5;
+    wantedQs[DIR_PMM] = 0.5;
+    EXPECT_THAT(computedQs, testing::Eq(wantedQs));
+}
+
+TEST_F(SideTestBC, setQs3D_givenMYhasBeenSet_thenSetPX_doNotSetSameQsAgain)
+{
+    side.sideDirection = POSITIVE_DIR;
+    side.coordinateDirection = X_INDEX;
+    grid->addBCalreadySet(SideType::MY);
+    // Act: run setQs for the node and fetch the entry it stored.
+    side.setQs(grid, bc, index);
+    const std::vector<real> computedQs = bc->getQs().front();
+    // Assert: q = 0.5 only on the +x links without a -y component; -1 elsewhere.
+    std::vector<real> wantedQs(27, -1);
+    wantedQs[DIR_P00] = 0.5;
+    wantedQs[DIR_PP0] = 0.5;
+    wantedQs[DIR_P0P] = 0.5;
+    wantedQs[DIR_P0M] = 0.5;
+    wantedQs[DIR_PPP] = 0.5;
+    wantedQs[DIR_PPM] = 0.5;
+    EXPECT_THAT(computedQs, testing::Eq(wantedQs));
+}
+
+TEST_F(SideTestBC, setQs3D_givenPZhasBeenSet_thenSetPX_doNotSetSameQsAgain)
+{
+    side.sideDirection = POSITIVE_DIR;
+    side.coordinateDirection = X_INDEX;
+    grid->addBCalreadySet(SideType::PZ);
+    // Act: run setQs for the node and fetch the entry it stored.
+    side.setQs(grid, bc, index);
+    const std::vector<real> computedQs = bc->getQs().front();
+    // Assert: q = 0.5 only on the +x links without a +z component; -1 elsewhere.
+    std::vector<real> wantedQs(27, -1);
+    wantedQs[DIR_P00] = 0.5;
+    wantedQs[DIR_PP0] = 0.5;
+    wantedQs[DIR_PM0] = 0.5;
+    wantedQs[DIR_P0M] = 0.5;
+    wantedQs[DIR_PPM] = 0.5;
+    wantedQs[DIR_PMM] = 0.5;
+    EXPECT_THAT(computedQs, testing::Eq(wantedQs));
+}
+
+TEST_F(SideTestBC, setQs3D_givenMZhasBeenSet_thenSetPX_doNotSetSameQsAgain)
+{
+    side.sideDirection = POSITIVE_DIR;
+    side.coordinateDirection = X_INDEX;
+    grid->addBCalreadySet(SideType::MZ);
+    // Act: run setQs for the node and fetch the entry it stored.
+    side.setQs(grid, bc, index);
+    const std::vector<real> computedQs = bc->getQs().front();
+    // Assert: q = 0.5 only on the +x links without a -z component; -1 elsewhere.
+    std::vector<real> wantedQs(27, -1);
+    wantedQs[DIR_P00] = 0.5;
+    wantedQs[DIR_PP0] = 0.5;
+    wantedQs[DIR_PM0] = 0.5;
+    wantedQs[DIR_P0P] = 0.5;
+    wantedQs[DIR_PPP] = 0.5;
+    wantedQs[DIR_PMP] = 0.5;
+    EXPECT_THAT(computedQs, testing::Eq(wantedQs));
+}
+
+TEST_F(SideTestBC, setQs3D_givenPYandMZhaveBeenSet_thenSetPX_doNotSetSameQsAgain)
+{
+    side.sideDirection = POSITIVE_DIR;
+    side.coordinateDirection = X_INDEX;
+    grid->addBCalreadySet(SideType::PY);
+    grid->addBCalreadySet(SideType::MZ);
+    // Act: run setQs for the node and fetch the entry it stored.
+    side.setQs(grid, bc, index);
+    const std::vector<real> computedQs = bc->getQs().front();
+    // Assert: q = 0.5 only on the +x links with neither a +y nor a -z component.
+    std::vector<real> wantedQs(27, -1);
+    wantedQs[DIR_P00] = 0.5;
+    wantedQs[DIR_PM0] = 0.5;
+    wantedQs[DIR_P0P] = 0.5;
+    wantedQs[DIR_PMP] = 0.5;
+    EXPECT_THAT(computedQs, testing::Eq(wantedQs));
+}
+
+TEST_F(SideTestBC, setQs3D_givenPYandPZhaveBeenSet_thenSetPX_doNotSetSameQsAgain)
+{
+    side.sideDirection = POSITIVE_DIR;
+    side.coordinateDirection = X_INDEX;
+    grid->addBCalreadySet(SideType::PY);
+    grid->addBCalreadySet(SideType::PZ);
+    // Act: run setQs for the node and fetch the entry it stored.
+    side.setQs(grid, bc, index);
+    const std::vector<real> computedQs = bc->getQs().front();
+    // Assert: q = 0.5 only on the +x links with neither a +y nor a +z component.
+    std::vector<real> wantedQs(27, -1);
+    wantedQs[DIR_P00] = 0.5;
+    wantedQs[DIR_PM0] = 0.5;
+    wantedQs[DIR_P0M] = 0.5;
+    wantedQs[DIR_PMM] = 0.5;
+    EXPECT_THAT(computedQs, testing::Eq(wantedQs));
+}
+
+TEST_F(SideTestBC, setQs3D_givenMYandPZhaveBeenSet_thenSetPX_doNotSetSameQsAgain)
+{
+    side.sideDirection = POSITIVE_DIR;
+    side.coordinateDirection = X_INDEX;
+    grid->addBCalreadySet(SideType::MY);
+    grid->addBCalreadySet(SideType::PZ);
+    // Act: run setQs for the node and fetch the entry it stored.
+    side.setQs(grid, bc, index);
+    const std::vector<real> computedQs = bc->getQs().front();
+    // Assert: q = 0.5 only on the +x links with neither a -y nor a +z component.
+    std::vector<real> wantedQs(27, -1);
+    wantedQs[DIR_P00] = 0.5;
+    wantedQs[DIR_PP0] = 0.5;
+    wantedQs[DIR_P0M] = 0.5;
+    wantedQs[DIR_PPM] = 0.5;
+    EXPECT_THAT(computedQs, testing::Eq(wantedQs));
+}
+
+TEST_F(SideTestBC, setQs3D_givenMYandMZhaveBeenSet_thenSetPX_doNotSetSameQsAgain)
+{
+    side.sideDirection = POSITIVE_DIR;
+    side.coordinateDirection = X_INDEX;
+    grid->addBCalreadySet(SideType::MY);
+    grid->addBCalreadySet(SideType::MZ);
+    // Act: run setQs for the node and fetch the entry it stored.
+    side.setQs(grid, bc, index);
+    const std::vector<real> computedQs = bc->getQs().front();
+    // Assert: q = 0.5 only on the +x links with neither a -y nor a -z component.
+    std::vector<real> wantedQs(27, -1);
+    wantedQs[DIR_P00] = 0.5;
+    wantedQs[DIR_PP0] = 0.5;
+    wantedQs[DIR_P0P] = 0.5;
+    wantedQs[DIR_PPP] = 0.5;
+    EXPECT_THAT(computedQs, testing::Eq(wantedQs));
+}
+
+TEST_F(SideTestBC, setQs3D_whenSettingMX_setAllQsNormalToBC)
+{
+    side.sideDirection = NEGATIVE_DIR;
+    side.coordinateDirection = X_INDEX;
+    // Act: run setQs for the node and fetch the entry it stored.
+    side.setQs(grid, bc, index);
+    const std::vector<real> computedQs = bc->getQs().front();
+    // Assert: q = 0.5 on all nine -x links; every other entry stays -1.
+    std::vector<real> wantedQs(27, -1);
+    wantedQs[DIR_M00] = 0.5;
+    wantedQs[DIR_MP0] = 0.5;
+    wantedQs[DIR_MM0] = 0.5;
+    wantedQs[DIR_M0P] = 0.5;
+    wantedQs[DIR_M0M] = 0.5;
+    wantedQs[DIR_MPP] = 0.5;
+    wantedQs[DIR_MMP] = 0.5;
+    wantedQs[DIR_MPM] = 0.5;
+    wantedQs[DIR_MMM] = 0.5;
+    EXPECT_THAT(computedQs, testing::Eq(wantedQs));
+}
+
+TEST_F(SideTestBC, setQs3D_givenPYhasBeenSet_thenSetMX_doNotSetSameQsAgain)
+{
+    side.sideDirection = NEGATIVE_DIR;
+    side.coordinateDirection = X_INDEX;
+    grid->addBCalreadySet(SideType::PY);
+    // Act: run setQs for the node and fetch the entry it stored.
+    side.setQs(grid, bc, index);
+    const std::vector<real> computedQs = bc->getQs().front();
+    // Assert: q = 0.5 only on the -x links without a +y component; -1 elsewhere.
+    std::vector<real> wantedQs(27, -1);
+    wantedQs[DIR_M00] = 0.5;
+    wantedQs[DIR_MM0] = 0.5;
+    wantedQs[DIR_M0P] = 0.5;
+    wantedQs[DIR_M0M] = 0.5;
+    wantedQs[DIR_MMP] = 0.5;
+    wantedQs[DIR_MMM] = 0.5;
+    EXPECT_THAT(computedQs, testing::Eq(wantedQs));
+}
+
+TEST_F(SideTestBC, setQs3D_givenMYhasBeenSet_thenSetMX_doNotSetSameQsAgain)
+{
+    side.sideDirection = NEGATIVE_DIR;
+    side.coordinateDirection = X_INDEX;
+    grid->addBCalreadySet(SideType::MY);
+    // Act: run setQs for the node and fetch the entry it stored.
+    side.setQs(grid, bc, index);
+    const std::vector<real> computedQs = bc->getQs().front();
+    // Assert: q = 0.5 only on the -x links without a -y component; -1 elsewhere.
+    std::vector<real> wantedQs(27, -1);
+    wantedQs[DIR_M00] = 0.5;
+    wantedQs[DIR_MP0] = 0.5;
+    wantedQs[DIR_M0P] = 0.5;
+    wantedQs[DIR_M0M] = 0.5;
+    wantedQs[DIR_MPP] = 0.5;
+    wantedQs[DIR_MPM] = 0.5;
+    EXPECT_THAT(computedQs, testing::Eq(wantedQs));
+}
+
+TEST_F(SideTestBC, setQs3D_givenPZhasBeenSet_thenSetMX_doNotSetSameQsAgain)
+{
+    side.sideDirection = NEGATIVE_DIR;
+    side.coordinateDirection = X_INDEX;
+    grid->addBCalreadySet(SideType::PZ);
+    // Act: run setQs for the node and fetch the entry it stored.
+    side.setQs(grid, bc, index);
+    const std::vector<real> computedQs = bc->getQs().front();
+    // Assert: q = 0.5 only on the -x links without a +z component; -1 elsewhere.
+    std::vector<real> wantedQs(27, -1);
+    wantedQs[DIR_M00] = 0.5;
+    wantedQs[DIR_MP0] = 0.5;
+    wantedQs[DIR_MM0] = 0.5;
+    wantedQs[DIR_M0M] = 0.5;
+    wantedQs[DIR_MPM] = 0.5;
+    wantedQs[DIR_MMM] = 0.5;
+    EXPECT_THAT(computedQs, testing::Eq(wantedQs));
+}
+
+TEST_F(SideTestBC, setQs3D_givenMZhasBeenSet_thenSetMX_doNotSetSameQsAgain)
+{
+    side.sideDirection = NEGATIVE_DIR;
+    side.coordinateDirection = X_INDEX;
+    grid->addBCalreadySet(SideType::MZ);
+    // Act: run setQs for the node and fetch the entry it stored.
+    side.setQs(grid, bc, index);
+    const std::vector<real> computedQs = bc->getQs().front();
+    // Assert: q = 0.5 only on the -x links without a -z component; -1 elsewhere.
+    std::vector<real> wantedQs(27, -1);
+    wantedQs[DIR_M00] = 0.5;
+    wantedQs[DIR_MP0] = 0.5;
+    wantedQs[DIR_MM0] = 0.5;
+    wantedQs[DIR_M0P] = 0.5;
+    wantedQs[DIR_MPP] = 0.5;
+    wantedQs[DIR_MMP] = 0.5;
+    EXPECT_THAT(computedQs, testing::Eq(wantedQs));
+}
+
+TEST_F(SideTestBC, setQs3D_givenPYandMZhaveBeenSet_thenSetMX_doNotSetSameQsAgain)
+{
+    side.sideDirection = NEGATIVE_DIR;
+    side.coordinateDirection = X_INDEX;
+    grid->addBCalreadySet(SideType::PY);
+    grid->addBCalreadySet(SideType::MZ);
+    // Act: run setQs for the node and fetch the entry it stored.
+    side.setQs(grid, bc, index);
+    const std::vector<real> computedQs = bc->getQs().front();
+    // Assert: q = 0.5 only on the -x links with neither a +y nor a -z component.
+    std::vector<real> wantedQs(27, -1);
+    wantedQs[DIR_M00] = 0.5;
+    wantedQs[DIR_MM0] = 0.5;
+    wantedQs[DIR_M0P] = 0.5;
+    wantedQs[DIR_MMP] = 0.5;
+    EXPECT_THAT(computedQs, testing::Eq(wantedQs));
+}
+
+TEST_F(SideTestBC, setQs3D_givenPYandPZhaveBeenSet_thenSetMX_doNotSetSameQsAgain)
+{
+    side.sideDirection = NEGATIVE_DIR;
+    side.coordinateDirection = X_INDEX;
+    grid->addBCalreadySet(SideType::PY);
+    grid->addBCalreadySet(SideType::PZ);
+    // Act: run setQs for the node and fetch the entry it stored.
+    side.setQs(grid, bc, index);
+    const std::vector<real> computedQs = bc->getQs().front();
+    // Assert: q = 0.5 only on the -x links with neither a +y nor a +z component.
+    std::vector<real> wantedQs(27, -1);
+    wantedQs[DIR_M00] = 0.5;
+    wantedQs[DIR_MM0] = 0.5;
+    wantedQs[DIR_M0M] = 0.5;
+    wantedQs[DIR_MMM] = 0.5;
+    EXPECT_THAT(computedQs, testing::Eq(wantedQs));
+}
+
+TEST_F(SideTestBC, setQs3D_givenMYandPZhaveBeenSet_thenSetMX_doNotSetSameQsAgain)
+{
+    side.sideDirection = NEGATIVE_DIR;
+    side.coordinateDirection = X_INDEX;
+    grid->addBCalreadySet(SideType::MY);
+    grid->addBCalreadySet(SideType::PZ);
+    // Act: run setQs for the node and fetch the entry it stored.
+    side.setQs(grid, bc, index);
+    const std::vector<real> computedQs = bc->getQs().front();
+    // Assert: q = 0.5 only on the -x links with neither a -y nor a +z component.
+    std::vector<real> wantedQs(27, -1);
+    wantedQs[DIR_M00] = 0.5;
+    wantedQs[DIR_MP0] = 0.5;
+    wantedQs[DIR_M0M] = 0.5;
+    wantedQs[DIR_MPM] = 0.5;
+    EXPECT_THAT(computedQs, testing::Eq(wantedQs));
+}
+
+TEST_F(SideTestBC, setQs3D_givenMYandMZhaveBeenSet_thenSetMX_doNotSetSameQsAgain)
+{
+    side.sideDirection = NEGATIVE_DIR;
+    side.coordinateDirection = X_INDEX;
+    grid->addBCalreadySet(SideType::MY);
+    grid->addBCalreadySet(SideType::MZ);
+    // Act: run setQs for the node and fetch the entry it stored.
+    side.setQs(grid, bc, index);
+    const std::vector<real> computedQs = bc->getQs().front();
+    // Assert: q = 0.5 only on the -x links with neither a -y nor a -z component.
+    std::vector<real> wantedQs(27, -1);
+    wantedQs[DIR_M00] = 0.5;
+    wantedQs[DIR_MP0] = 0.5;
+    wantedQs[DIR_M0P] = 0.5;
+    wantedQs[DIR_MPP] = 0.5;
+    EXPECT_THAT(computedQs, testing::Eq(wantedQs));
+}
+
+TEST_F(SideTestBC, setQs3D_whenSettingMZ_setAllQsNormalToBC)
+{
+    side.sideDirection = NEGATIVE_DIR;
+    side.coordinateDirection = Z_INDEX;
+    // Act: run setQs for the node and fetch the entry it stored.
+    side.setQs(grid, bc, index);
+    const std::vector<real> computedQs = bc->getQs().front();
+    // Assert: q = 0.5 on all nine -z links; every other entry stays -1.
+    std::vector<real> wantedQs(27, -1);
+    wantedQs[DIR_00M] = 0.5;
+    wantedQs[DIR_P0M] = 0.5;
+    wantedQs[DIR_M0M] = 0.5;
+    wantedQs[DIR_0PM] = 0.5;
+    wantedQs[DIR_0MM] = 0.5;
+    wantedQs[DIR_PPM] = 0.5;
+    wantedQs[DIR_MPM] = 0.5;
+    wantedQs[DIR_PMM] = 0.5;
+    wantedQs[DIR_MMM] = 0.5;
+    EXPECT_THAT(computedQs, testing::Eq(wantedQs));
+}
+
+TEST_F(SideTestBC, setQs3D_givenMYhasBeenSet_thenSetMZ_doNotSetSameQsAgain)
+{
+    side.sideDirection = NEGATIVE_DIR;
+    side.coordinateDirection = Z_INDEX;
+    grid->addBCalreadySet(SideType::MY);
+    // Act: run setQs for the node and fetch the entry it stored.
+    side.setQs(grid, bc, index);
+    const std::vector<real> computedQs = bc->getQs().front();
+    // Assert: q = 0.5 only on the -z links without a -y component; -1 elsewhere.
+    std::vector<real> wantedQs(27, -1);
+    wantedQs[DIR_00M] = 0.5;
+    wantedQs[DIR_P0M] = 0.5;
+    wantedQs[DIR_M0M] = 0.5;
+    wantedQs[DIR_0PM] = 0.5;
+    wantedQs[DIR_PPM] = 0.5;
+    wantedQs[DIR_MPM] = 0.5;
+    EXPECT_THAT(computedQs, testing::Eq(wantedQs));
+}
+
+TEST_F(SideTestBC, setQs3D_givenPYhasBeenSet_thenSetMZ_doNotSetSameQsAgain)
+{
+    side.sideDirection = NEGATIVE_DIR;
+    side.coordinateDirection = Z_INDEX;
+    grid->addBCalreadySet(SideType::PY);
+    // Act: run setQs for the node and fetch the entry it stored.
+    side.setQs(grid, bc, index);
+    const std::vector<real> computedQs = bc->getQs().front();
+    // Assert: q = 0.5 only on the -z links without a +y component; -1 elsewhere.
+    std::vector<real> wantedQs(27, -1);
+    wantedQs[DIR_00M] = 0.5;
+    wantedQs[DIR_P0M] = 0.5;
+    wantedQs[DIR_M0M] = 0.5;
+    wantedQs[DIR_0MM] = 0.5;
+    wantedQs[DIR_PMM] = 0.5;
+    wantedQs[DIR_MMM] = 0.5;
+    EXPECT_THAT(computedQs, testing::Eq(wantedQs));
+}
+
+TEST_F(SideTestBC, setQs3D_givenPXhasBeenSet_thenSetMZ_doNotSetSameQsAgain)
+{
+    side.sideDirection = NEGATIVE_DIR;
+    side.coordinateDirection = Z_INDEX;
+    grid->addBCalreadySet(SideType::PX);
+    // Act: run setQs for the node and fetch the entry it stored.
+    side.setQs(grid, bc, index);
+    const std::vector<real> computedQs = bc->getQs().front();
+    // Assert: q = 0.5 only on the -z links without a +x component; -1 elsewhere.
+    std::vector<real> wantedQs(27, -1);
+    wantedQs[DIR_00M] = 0.5;
+    wantedQs[DIR_M0M] = 0.5;
+    wantedQs[DIR_0PM] = 0.5;
+    wantedQs[DIR_0MM] = 0.5;
+    wantedQs[DIR_MPM] = 0.5;
+    wantedQs[DIR_MMM] = 0.5;
+    EXPECT_THAT(computedQs, testing::Eq(wantedQs));
+}
+
+TEST_F(SideTestBC, setQs3D_givenMXhasBeenSet_thenSetMZ_doNotSetSameQsAgain)
+{
+    side.sideDirection = NEGATIVE_DIR;
+    side.coordinateDirection = Z_INDEX;
+    grid->addBCalreadySet(SideType::MX);
+    // Act: run setQs for the node and fetch the entry it stored.
+    side.setQs(grid, bc, index);
+    const std::vector<real> computedQs = bc->getQs().front();
+    // Assert: q = 0.5 only on the -z links without a -x component; -1 elsewhere.
+    std::vector<real> wantedQs(27, -1);
+    wantedQs[DIR_00M] = 0.5;
+    wantedQs[DIR_P0M] = 0.5;
+    wantedQs[DIR_0PM] = 0.5;
+    wantedQs[DIR_0MM] = 0.5;
+    wantedQs[DIR_PPM] = 0.5;
+    wantedQs[DIR_PMM] = 0.5;
+    EXPECT_THAT(computedQs, testing::Eq(wantedQs));
+}
+
+TEST_F(SideTestBC, setQs3D_givenMYandPXhaveBeenSet_thenSetMZ_doNotSetSameQsAgain)
+{
+    side.sideDirection = NEGATIVE_DIR;
+    side.coordinateDirection = Z_INDEX;
+    grid->addBCalreadySet(SideType::MY);
+    grid->addBCalreadySet(SideType::PX);
+    // Act: run setQs for the node and fetch the entry it stored.
+    side.setQs(grid, bc, index);
+    const std::vector<real> computedQs = bc->getQs().front();
+    // Assert: q = 0.5 only on the -z links with neither a -y nor a +x component.
+    std::vector<real> wantedQs(27, -1);
+    wantedQs[DIR_00M] = 0.5;
+    wantedQs[DIR_M0M] = 0.5;
+    wantedQs[DIR_0PM] = 0.5;
+    wantedQs[DIR_MPM] = 0.5;
+    EXPECT_THAT(computedQs, testing::Eq(wantedQs));
+}
+
+TEST_F(SideTestBC, setQs3D_givenMYandMXhaveBeenSet_thenSetMZ_doNotSetSameQsAgain)
+{
+    side.sideDirection = NEGATIVE_DIR;
+    side.coordinateDirection = Z_INDEX;
+    grid->addBCalreadySet(SideType::MY);
+    grid->addBCalreadySet(SideType::MX);
+    // Act: run setQs for the node and fetch the entry it stored.
+    side.setQs(grid, bc, index);
+    const std::vector<real> computedQs = bc->getQs().front();
+    // Assert: q = 0.5 only on the -z links with neither a -y nor a -x component.
+    std::vector<real> wantedQs(27, -1);
+    wantedQs[DIR_00M] = 0.5;
+    wantedQs[DIR_P0M] = 0.5;
+    wantedQs[DIR_0PM] = 0.5;
+    wantedQs[DIR_PPM] = 0.5;
+    EXPECT_THAT(computedQs, testing::Eq(wantedQs));
+}
+
+TEST_F(SideTestBC, setQs3D_givenPYandPXhaveBeenSet_thenSetMZ_doNotSetSameQsAgain)
+{
+    side.sideDirection = NEGATIVE_DIR;
+    side.coordinateDirection = Z_INDEX;
+    grid->addBCalreadySet(SideType::PY);
+    grid->addBCalreadySet(SideType::PX);
+    // Act: run setQs for the node and fetch the entry it stored.
+    side.setQs(grid, bc, index);
+    const std::vector<real> computedQs = bc->getQs().front();
+    // Assert: q = 0.5 only on the -z links with neither a +y nor a +x component.
+    std::vector<real> wantedQs(27, -1);
+    wantedQs[DIR_00M] = 0.5;
+    wantedQs[DIR_M0M] = 0.5;
+    wantedQs[DIR_0MM] = 0.5;
+    wantedQs[DIR_MMM] = 0.5;
+    EXPECT_THAT(computedQs, testing::Eq(wantedQs));
+}
+
+TEST_F(SideTestBC, setQs3D_givenPYandMXhaveBeenSet_thenSetMZ_doNotSetSameQsAgain)
+{
+    side.sideDirection = NEGATIVE_DIR;
+    side.coordinateDirection = Z_INDEX;
+    grid->addBCalreadySet(SideType::PY);
+    grid->addBCalreadySet(SideType::MX);
+    // Act: run setQs for the node and fetch the entry it stored.
+    side.setQs(grid, bc, index);
+    const std::vector<real> computedQs = bc->getQs().front();
+    // Assert: q = 0.5 only on the -z links with neither a +y nor a -x component.
+    std::vector<real> wantedQs(27, -1);
+    wantedQs[DIR_00M] = 0.5;
+    wantedQs[DIR_P0M] = 0.5;
+    wantedQs[DIR_0MM] = 0.5;
+    wantedQs[DIR_PMM] = 0.5;
+    EXPECT_THAT(computedQs, testing::Eq(wantedQs));
+}
+
+TEST_F(SideTestBC, setQs3D_whenSettingPZ_setAllQsNormalToBC)
+{
+    side.sideDirection = POSITIVE_DIR;
+    side.coordinateDirection = Z_INDEX;
+    // Act: run setQs for the node and fetch the entry it stored.
+    side.setQs(grid, bc, index);
+    const std::vector<real> computedQs = bc->getQs().front();
+    // Assert: q = 0.5 on all nine +z links; every other entry stays -1.
+    std::vector<real> wantedQs(27, -1);
+    wantedQs[DIR_00P] = 0.5;
+    wantedQs[DIR_P0P] = 0.5;
+    wantedQs[DIR_M0P] = 0.5;
+    wantedQs[DIR_0PP] = 0.5;
+    wantedQs[DIR_0MP] = 0.5;
+    wantedQs[DIR_PPP] = 0.5;
+    wantedQs[DIR_MPP] = 0.5;
+    wantedQs[DIR_PMP] = 0.5;
+    wantedQs[DIR_MMP] = 0.5;
+    EXPECT_THAT(computedQs, testing::Eq(wantedQs));
+}
+
+TEST_F(SideTestBC, setQs3D_givenMYhasBeenSet_thenSetPZ_doNotSetSameQsAgain)
+{
+    side.sideDirection = POSITIVE_DIR;
+    side.coordinateDirection = Z_INDEX;
+    grid->addBCalreadySet(SideType::MY);
+    // Act: run setQs for the node and fetch the entry it stored.
+    side.setQs(grid, bc, index);
+    const std::vector<real> computedQs = bc->getQs().front();
+    // Assert: q = 0.5 only on the +z links without a -y component; -1 elsewhere.
+    std::vector<real> wantedQs(27, -1);
+    wantedQs[DIR_00P] = 0.5;
+    wantedQs[DIR_P0P] = 0.5;
+    wantedQs[DIR_M0P] = 0.5;
+    wantedQs[DIR_0PP] = 0.5;
+    wantedQs[DIR_PPP] = 0.5;
+    wantedQs[DIR_MPP] = 0.5;
+    EXPECT_THAT(computedQs, testing::Eq(wantedQs));
+}
+
+TEST_F(SideTestBC, setQs3D_givenPYhasBeenSet_thenSetPZ_doNotSetSameQsAgain)
+{
+    side.sideDirection = POSITIVE_DIR;
+    side.coordinateDirection = Z_INDEX;
+    grid->addBCalreadySet(SideType::PY);
+    // Act: run setQs for the node and fetch the entry it stored.
+    side.setQs(grid, bc, index);
+    const std::vector<real> computedQs = bc->getQs().front();
+    // Assert: q = 0.5 only on the +z links without a +y component; -1 elsewhere.
+    std::vector<real> wantedQs(27, -1);
+    wantedQs[DIR_00P] = 0.5;
+    wantedQs[DIR_P0P] = 0.5;
+    wantedQs[DIR_M0P] = 0.5;
+    wantedQs[DIR_0MP] = 0.5;
+    wantedQs[DIR_PMP] = 0.5;
+    wantedQs[DIR_MMP] = 0.5;
+    EXPECT_THAT(computedQs, testing::Eq(wantedQs));
+}
+
+TEST_F(SideTestBC, setQs3D_givenPXhasBeenSet_thenSetPZ_doNotSetSameQsAgain)
+{
+    side.sideDirection = POSITIVE_DIR;
+    side.coordinateDirection = Z_INDEX;
+    grid->addBCalreadySet(SideType::PX);
+    // Act: run setQs for the node and fetch the entry it stored.
+    side.setQs(grid, bc, index);
+    const std::vector<real> computedQs = bc->getQs().front();
+    // Assert: q = 0.5 only on the +z links without a +x component; -1 elsewhere.
+    std::vector<real> wantedQs(27, -1);
+    wantedQs[DIR_00P] = 0.5;
+    wantedQs[DIR_M0P] = 0.5;
+    wantedQs[DIR_0PP] = 0.5;
+    wantedQs[DIR_0MP] = 0.5;
+    wantedQs[DIR_MPP] = 0.5;
+    wantedQs[DIR_MMP] = 0.5;
+    EXPECT_THAT(computedQs, testing::Eq(wantedQs));
+}
+
+TEST_F(SideTestBC, setQs3D_givenMXhasBeenSet_thenSetPZ_doNotSetSameQsAgain)
+{
+    side.sideDirection = POSITIVE_DIR;
+    side.coordinateDirection = Z_INDEX;
+    grid->addBCalreadySet(SideType::MX);
+    // Act: run setQs for the node and fetch the entry it stored.
+    side.setQs(grid, bc, index);
+    const std::vector<real> computedQs = bc->getQs().front();
+    // Assert: q = 0.5 only on the +z links without a -x component; -1 elsewhere.
+    std::vector<real> wantedQs(27, -1);
+    wantedQs[DIR_00P] = 0.5;
+    wantedQs[DIR_P0P] = 0.5;
+    wantedQs[DIR_0PP] = 0.5;
+    wantedQs[DIR_0MP] = 0.5;
+    wantedQs[DIR_PPP] = 0.5;
+    wantedQs[DIR_PMP] = 0.5;
+    EXPECT_THAT(computedQs, testing::Eq(wantedQs));
+}
+
+TEST_F(SideTestBC, setQs3D_givenMYandPXhaveBeenSet_thenSetPZ_doNotSetSameQsAgain)
+{
+    side.sideDirection = POSITIVE_DIR;
+    side.coordinateDirection = Z_INDEX;
+    grid->addBCalreadySet(SideType::MY);
+    grid->addBCalreadySet(SideType::PX);
+    // Act: run setQs for the node and fetch the entry it stored.
+    side.setQs(grid, bc, index);
+    const std::vector<real> computedQs = bc->getQs().front();
+    // Assert: q = 0.5 only on the +z links with neither a -y nor a +x component.
+    std::vector<real> wantedQs(27, -1);
+    wantedQs[DIR_00P] = 0.5;
+    wantedQs[DIR_M0P] = 0.5;
+    wantedQs[DIR_0PP] = 0.5;
+    wantedQs[DIR_MPP] = 0.5;
+    EXPECT_THAT(computedQs, testing::Eq(wantedQs));
+}
+
+TEST_F(SideTestBC, setQs3D_givenMYandMXhaveBeenSet_thenSetPZ_doNotSetSameQsAgain)
+{
+    side.sideDirection = POSITIVE_DIR;
+    side.coordinateDirection = Z_INDEX;
+    grid->addBCalreadySet(SideType::MY);
+    grid->addBCalreadySet(SideType::MX);
+    // Act: run setQs for the node and fetch the entry it stored.
+    side.setQs(grid, bc, index);
+    const std::vector<real> computedQs = bc->getQs().front();
+    // Assert: q = 0.5 only on the +z links with neither a -y nor a -x component.
+    std::vector<real> wantedQs(27, -1);
+    wantedQs[DIR_00P] = 0.5;
+    wantedQs[DIR_P0P] = 0.5;
+    wantedQs[DIR_0PP] = 0.5;
+    wantedQs[DIR_PPP] = 0.5;
+    EXPECT_THAT(computedQs, testing::Eq(wantedQs));
+}
+
+TEST_F(SideTestBC, setQs3D_givenPYandPXhaveBeenSet_thenSetPZ_doNotSetSameQsAgain)
+{
+    side.sideDirection = POSITIVE_DIR;
+    side.coordinateDirection = Z_INDEX;
+    grid->addBCalreadySet(SideType::PY);
+    grid->addBCalreadySet(SideType::PX);
+    // Act: run setQs for the node and fetch the entry it stored.
+    side.setQs(grid, bc, index);
+    const std::vector<real> computedQs = bc->getQs().front();
+    // Assert: q = 0.5 only on the +z links with neither a +y nor a +x component.
+    std::vector<real> wantedQs(27, -1);
+    wantedQs[DIR_00P] = 0.5;
+    wantedQs[DIR_M0P] = 0.5;
+    wantedQs[DIR_0MP] = 0.5;
+    wantedQs[DIR_MMP] = 0.5;
+    EXPECT_THAT(computedQs, testing::Eq(wantedQs));
+}
+
+TEST_F(SideTestBC, setQs3D_givenPYandMXhaveBeenSet_thenSetPZ_doNotSetSameQsAgain)
+{
+    side.sideDirection = POSITIVE_DIR;
+    side.coordinateDirection = Z_INDEX;
+    grid->addBCalreadySet(SideType::PY);
+    grid->addBCalreadySet(SideType::MX);
+    // Act: run setQs for the node and fetch the entry it stored.
+    side.setQs(grid, bc, index);
+    const std::vector<real> computedQs = bc->getQs().front();
+    // Assert: q = 0.5 only on the +z links with neither a +y nor a -x component.
+    std::vector<real> wantedQs(27, -1);
+    wantedQs[DIR_00P] = 0.5;
+    wantedQs[DIR_P0P] = 0.5;
+    wantedQs[DIR_0MP] = 0.5;
+    wantedQs[DIR_PMP] = 0.5;
+    EXPECT_THAT(computedQs, testing::Eq(wantedQs));
+}
diff --git a/src/gpu/GridGenerator/grid/Grid.h b/src/gpu/GridGenerator/grid/Grid.h
index 3f28120a5d969fcc5d7b2a3402a2169ff97c0cc3..ad2ce473fb65fe4414f6da5c4caf0d3e140b7e02 100644
--- a/src/gpu/GridGenerator/grid/Grid.h
+++ b/src/gpu/GridGenerator/grid/Grid.h
@@ -47,6 +47,7 @@ struct Triangle;
 class GridInterface;
 class Object;
 class BoundingBox;
+enum class SideType;
 
 class GRIDGENERATOR_EXPORT Grid
 {
@@ -84,6 +85,8 @@ public:
     virtual void getGridInterfaceIndices(uint* iCellCfc, uint* iCellCff, uint* iCellFcc, uint* iCellFcf) const = 0;
     virtual bool isSparseIndexInFluidNodeIndicesBorder(uint &sparseIndex) const = 0;
 
+    virtual bool isStopperForBC(uint index) const = 0;
+
     virtual int *getNeighborsX() const = 0;
     virtual int *getNeighborsY() const = 0;
     virtual int *getNeighborsZ() const = 0;
@@ -133,9 +136,9 @@ public:
     virtual void setPeriodicityY(bool periodicity) = 0;
     virtual void setPeriodicityZ(bool periodicity) = 0;
 
-    virtual bool getPeriodicityX() = 0;
-    virtual bool getPeriodicityY() = 0;
-    virtual bool getPeriodicityZ() = 0;
+    virtual bool getPeriodicityX() const = 0;
+    virtual bool getPeriodicityY() const = 0;
+    virtual bool getPeriodicityZ() const = 0;
 
     virtual void setEnableFixRefinementIntoTheWall(bool enableFixRefinementIntoTheWall) = 0;
 
@@ -170,6 +173,11 @@ public:
 
     virtual void repairCommunicationIndices(int direction) = 0;
 
+    virtual bool nodeHasBC(uint index) const = 0;
+
+    virtual std::vector<SideType> getBCAlreadySet() = 0;
+    virtual void addBCalreadySet(SideType side) = 0;
+
     // needed for CUDA Streams 
     virtual void findFluidNodeIndices(bool onlyBulk) = 0;
     virtual uint getNumberOfFluidNodes() const = 0;
@@ -178,6 +186,20 @@ public:
     virtual void findFluidNodeIndicesBorder() = 0;
     virtual uint getNumberOfFluidNodesBorder() const = 0;
     virtual void getFluidNodeIndicesBorder(uint *fluidNodeIndicesBorder) const = 0;
+
+    virtual void addFluidNodeIndicesMacroVars(std::vector<uint> _fluidNodeIndicesMacroVars) = 0;
+    virtual void addFluidNodeIndicesApplyBodyForce(std::vector<uint> _fluidNodeIndicesApplyBodyForce) = 0;
+    virtual void addFluidNodeIndicesAllFeatures(std::vector<uint> _fluidNodeIndicesAllFeatures) = 0;
+    virtual void sortFluidNodeIndicesMacroVars() = 0;
+    virtual void sortFluidNodeIndicesApplyBodyForce() = 0;
+    virtual void sortFluidNodeIndicesAllFeatures() = 0;
+
+    virtual uint getNumberOfFluidNodeIndicesMacroVars() const = 0;
+    virtual uint getNumberOfFluidNodeIndicesApplyBodyForce() const = 0;
+    virtual uint getNumberOfFluidNodeIndicesAllFeatures() const = 0; 
+    virtual void getFluidNodeIndicesMacroVars(uint *fluidNodeIndicesMacroVars) const = 0;
+    virtual void getFluidNodeIndicesApplyBodyForce(uint *fluidNodeIndicesApplyBodyForce) const = 0;
+    virtual void getFluidNodeIndicesAllFeatures(uint *fluidNodeIndicesAllFeatures) const = 0;
 };
 
 #endif
diff --git a/src/gpu/GridGenerator/grid/GridBuilder/GridBuilder.h b/src/gpu/GridGenerator/grid/GridBuilder/GridBuilder.h
index 739aef59f76a33fa67d472a77ef258469f5e411c..f3d850384816f6690e5ffc158bbdc5e1df0ab328 100644
--- a/src/gpu/GridGenerator/grid/GridBuilder/GridBuilder.h
+++ b/src/gpu/GridGenerator/grid/GridBuilder/GridBuilder.h
@@ -54,6 +54,7 @@ class GridWrapper;
 class Transformator;
 class ArrowTransformator;
 class PolyDataWriterWrapper;
+class TransientBCInputFileReader;
 
 class BoundingBox;
 class Grid;
@@ -113,6 +114,15 @@ public:
     virtual void getPressureValues(real *rho, int *indices, int *neighborIndices, int level) const = 0;
     virtual void getPressureQs(real *qs[27], int level) const                                      = 0;
 
+    virtual uint getPrecursorSize(int level) const              = 0;
+    virtual void getPrecursorValues(uint* neighbor0PP, uint* neighbor0PM, uint* neighbor0MP, uint* neighbor0MM, 
+                                    real* weights0PP, real* weights0PM, real* weights0MP, real* weights0MM, 
+                                    int* indices, std::vector<SPtr<TransientBCInputFileReader>>& reader, 
+                                    int& numberOfPrecursorNodes, size_t& numberOfQuantities, uint& timeStepsBetweenReads, 
+                                    real& velocityX, real& velocityY, real& velocityZ, int level) const = 0;
+
+    virtual void getPrecursorQs(real* qs[27], int level) const  = 0;
+
     virtual uint getGeometrySize(int level) const                                 = 0;
     virtual void getGeometryIndices(int *indices, int level) const                = 0;
     virtual void getGeometryQs(real *qs[27], int level) const                     = 0;
@@ -136,6 +146,21 @@ public:
     virtual void getReceiveIndices(int *sendIndices, int direction, int level) = 0;
 
     virtual void findFluidNodes(bool splitDomain) = 0;
+
+    virtual void addFluidNodeIndicesMacroVars(const std::vector<uint>& fluidNodeIndicesMacroVars, uint level)           = 0;
+    virtual void addFluidNodeIndicesApplyBodyForce(const std::vector<uint>& fluidNodeIndicesApplyBodyForce, uint level) = 0;
+    virtual void addFluidNodeIndicesAllFeatures(const std::vector<uint>& fluidNodeIndicesAllFeatures, uint level)       = 0;
+    virtual void sortFluidNodeIndicesMacroVars(uint level) = 0;
+    virtual void sortFluidNodeIndicesApplyBodyForce(uint level) = 0;
+    virtual void sortFluidNodeIndicesAllFeatures(uint level) = 0;
+    virtual uint getNumberOfFluidNodesMacroVars(uint level) const = 0;
+    virtual void getFluidNodeIndicesMacroVars(uint *fluidNodeIndicesMacroVars, int level) const = 0;
+    virtual uint getNumberOfFluidNodesApplyBodyForce(uint level) const = 0;
+    virtual void getFluidNodeIndicesApplyBodyForce(uint *fluidNodeIndicesApplyBodyForce, int level) const = 0;
+    virtual uint getNumberOfFluidNodesAllFeatures(uint level) const = 0;
+    virtual void getFluidNodeIndicesAllFeatures(uint *fluidNodeIndicesAllFeatures, int level) const = 0;
+
+
 };
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.cpp b/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.cpp
index 083b9a51e0b151f49922df456e968c4b204e4af7..003e6dcd223d2bf019c83f71349a9a7bec84efdc 100644
--- a/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.cpp
+++ b/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.cpp
@@ -1,28 +1,28 @@
 //=======================================================================================
-// ____          ____    __    ______     __________   __      __       __        __         
-// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |        
-//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |        
-//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |        
-//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____    
-//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|   
-//      \    \  |    |   ________________________________________________________________    
-//       \    \ |    |  |  ______________________________________________________________|   
-//        \    \|    |  |  |         __          __     __     __     ______      _______    
-//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)   
-//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______    
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
 //           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
-//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/   
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
 //
-//  This file is part of VirtualFluids. VirtualFluids is free software: you can 
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
 //  redistribute it and/or modify it under the terms of the GNU General Public
-//  License as published by the Free Software Foundation, either version 3 of 
+//  License as published by the Free Software Foundation, either version 3 of
 //  the License, or (at your option) any later version.
-//  
-//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT 
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
-//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 //  for more details.
-//  
+//
 //  You should have received a copy of the GNU General Public License along
 //  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
@@ -52,6 +52,8 @@
 #include "io/QLineWriter.h"
 #include "io/SimulationFileWriter/SimulationFileWriter.h"
 
+#include "TransientBCSetter/TransientBCSetter.h"
+
 #include "utilities/communication.h"
 #include "utilities/transformator/ArrowTransformator.h"
 
@@ -103,28 +105,33 @@ void LevelGridBuilder::setSlipGeometryBoundaryCondition(real normalX, real norma
 
     for (uint level = 0; level < getNumberOfGridLevels(); level++)
     {
-		if (boundaryConditions[level]->geometryBoundaryCondition != nullptr)
-		{
-			boundaryConditions[level]->geometryBoundaryCondition->normalX = normalX;
-			boundaryConditions[level]->geometryBoundaryCondition->normalY = normalY;
-			boundaryConditions[level]->geometryBoundaryCondition->normalZ = normalZ;
-			boundaryConditions[level]->geometryBoundaryCondition->side->addIndices(grids, level, boundaryConditions[level]->geometryBoundaryCondition);
+        if (boundaryConditions[level]->geometryBoundaryCondition != nullptr)
+        {
+            boundaryConditions[level]->geometryBoundaryCondition->normalX = normalX;
+            boundaryConditions[level]->geometryBoundaryCondition->normalY = normalY;
+            boundaryConditions[level]->geometryBoundaryCondition->normalZ = normalZ;
+            boundaryConditions[level]->geometryBoundaryCondition->side->addIndices(grids, level, boundaryConditions[level]->geometryBoundaryCondition);
 
             boundaryConditions[level]->geometryBoundaryCondition->fillSlipNormalLists();
 
             *logging::out << logging::Logger::INFO_INTERMEDIATE << "Set Geometry Slip BC on level " << level << " with " << (int)boundaryConditions[level]->geometryBoundaryCondition->indices.size() <<"\n";
-		}
+        }
     }
 }
 
-void LevelGridBuilder::setStressBoundaryCondition(  SideType sideType, 
-                                                    real nomalX, real normalY, real normalZ, 
-                                                    uint samplingOffset, real z0)
+//=======================================================================================
+//! \brief Set stress boundary condition using iMEM
+//! \param samplingOffset number of grid points above boundary where velocity for wall model is sampled
+//! \param z0 roughness length [m]
+//! \param dx dx of level 0 [m]
+//!
+void LevelGridBuilder::setStressBoundaryCondition(  SideType sideType,
+                                                    real nomalX, real normalY, real normalZ,
+                                                    uint samplingOffset, real z0, real dx)
 {
     for (uint level = 0; level < getNumberOfGridLevels(); level++)
     {
-        SPtr<StressBoundaryCondition> stressBoundaryCondition = StressBoundaryCondition::make(nomalX, normalY, normalZ, samplingOffset, z0);
-
+        SPtr<StressBoundaryCondition> stressBoundaryCondition = StressBoundaryCondition::make(nomalX, normalY, normalZ, samplingOffset, z0*pow(2.0f,level)/dx);
         auto side = SideFactory::make(sideType);
 
         stressBoundaryCondition->side = side;
@@ -171,17 +178,17 @@ void LevelGridBuilder::setVelocityGeometryBoundaryCondition(real vx, real vy, re
 
     for (uint level = 0; level < getNumberOfGridLevels(); level++)
     {
-		if (boundaryConditions[level]->geometryBoundaryCondition != nullptr)
-		{
-			boundaryConditions[level]->geometryBoundaryCondition->vx = vx;
-			boundaryConditions[level]->geometryBoundaryCondition->vy = vy;
-			boundaryConditions[level]->geometryBoundaryCondition->vz = vz;
-			boundaryConditions[level]->geometryBoundaryCondition->side->addIndices(grids, level, boundaryConditions[level]->geometryBoundaryCondition);
+        if (boundaryConditions[level]->geometryBoundaryCondition != nullptr)
+        {
+            boundaryConditions[level]->geometryBoundaryCondition->vx = vx;
+            boundaryConditions[level]->geometryBoundaryCondition->vy = vy;
+            boundaryConditions[level]->geometryBoundaryCondition->vz = vz;
+            boundaryConditions[level]->geometryBoundaryCondition->side->addIndices(grids, level, boundaryConditions[level]->geometryBoundaryCondition);
 
             boundaryConditions[level]->geometryBoundaryCondition->fillVelocityLists();
 
             *logging::out << logging::Logger::INFO_INTERMEDIATE << "Set Geometry Velocity BC on level " << level << " with " << (int)boundaryConditions[level]->geometryBoundaryCondition->indices.size() <<"\n";
-		}
+        }
     }
 }
 
@@ -223,7 +230,7 @@ void LevelGridBuilder::setNoSlipBoundaryCondition(SideType sideType)
             noSlipBoundaryCondition->fillVelocityLists();
 
             // now effectively just a wrapper for velocityBC with zero velocity. No distinction in Gridgenerator.
-            boundaryConditions[level]->velocityBoundaryConditions.push_back(noSlipBoundaryCondition); 
+            boundaryConditions[level]->velocityBoundaryConditions.push_back(noSlipBoundaryCondition);
         }
     }
 }
@@ -234,12 +241,45 @@ void LevelGridBuilder::setNoSlipGeometryBoundaryCondition()
 
     for (uint level = 0; level < getNumberOfGridLevels(); level++)
     {
-		if (boundaryConditions[level]->geometryBoundaryCondition != nullptr)
-		{
-			boundaryConditions[level]->geometryBoundaryCondition->side->addIndices(grids, level, boundaryConditions[level]->geometryBoundaryCondition);
+        if (boundaryConditions[level]->geometryBoundaryCondition != nullptr)
+        {
+            boundaryConditions[level]->geometryBoundaryCondition->side->addIndices(grids, level, boundaryConditions[level]->geometryBoundaryCondition);
 
             *logging::out << logging::Logger::INFO_INTERMEDIATE << "Set Geometry No-Slip BC on level " << level << " with " << (int)boundaryConditions[level]->geometryBoundaryCondition->indices.size() <<"\n";
-		}
+        }
+    }
+}
+
+void LevelGridBuilder::setPrecursorBoundaryCondition(SideType sideType, SPtr<FileCollection> fileCollection, int timeStepsBetweenReads,
+                                                        real velocityX, real velocityY, real velocityZ, std::vector<uint> fileLevelToGridLevelMap)
+{
+    // An empty map requests the identity mapping (entry i holds the value i);
+    // a user-supplied map must provide exactly one entry per grid level.
+    if (!fileLevelToGridLevelMap.empty())
+    {
+        if (fileLevelToGridLevelMap.size() != getNumberOfGridLevels())
+            throw std::runtime_error("In setPrecursorBoundaryCondition: fileLevelToGridLevelMap does not match with the number of levels");
+        *logging::out << logging::Logger::INFO_INTERMEDIATE << "Using user defined file to grid level mapping" << "\n";
+    }
+    else
+    {
+        *logging::out << logging::Logger::INFO_INTERMEDIATE << "Mapping precursor file levels to the corresponding grid levels" << "\n";
+        for (uint gridLevel = 0; gridLevel < getNumberOfGridLevels(); gridLevel++)
+            fileLevelToGridLevelMap.push_back(gridLevel);
+    }
+
+    // Build one precursor BC per grid level, collect its indices and register it on that level.
+    for (uint gridLevel = 0; gridLevel < getNumberOfGridLevels(); gridLevel++)
+    {
+        auto fileReader = createReaderForCollection(fileCollection, fileLevelToGridLevelMap[gridLevel]);
+        SPtr<PrecursorBoundaryCondition> precursorBC = PrecursorBoundaryCondition::make(fileReader, timeStepsBetweenReads, velocityX, velocityY, velocityZ);
+
+        precursorBC->side = SideFactory::make(sideType);
+        precursorBC->side->addIndices(grids, gridLevel, precursorBC);
+
+        boundaryConditions[gridLevel]->precursorBoundaryConditions.push_back(precursorBC);
+
+        *logging::out << logging::Logger::INFO_INTERMEDIATE << "Set Precursor BC on level " << gridLevel << " with " << (int)precursorBC->indices.size() << "\n";
     }
 }
 
@@ -373,9 +413,9 @@ std::shared_ptr<Grid> LevelGridBuilder::getGrid(int level, int box)
 void LevelGridBuilder::checkLevel(int level)
 {
     if (level >= (int)grids.size())
-    { 
+    {
         std::cout << "wrong level input... return to caller\n";
-        return; 
+        return;
     }
 }
 
@@ -386,16 +426,16 @@ void LevelGridBuilder::getDimensions(int &nx, int &ny, int &nz, const int level)
     nz = grids[level]->getNumberOfNodesZ();
 }
 
-void LevelGridBuilder::getNodeValues(real *xCoords, real *yCoords, real *zCoords, 
-                                     uint *neighborX, uint *neighborY, uint *neighborZ, uint *neighborNegative, 
+void LevelGridBuilder::getNodeValues(real *xCoords, real *yCoords, real *zCoords,
+                                     uint *neighborX, uint *neighborY, uint *neighborZ, uint *neighborNegative,
                                      uint *geo, const int level) const
 {
     grids[level]->getNodeValues(xCoords, yCoords, zCoords, neighborX, neighborY, neighborZ, neighborNegative, geo);
 }
 
 
-GRIDGENERATOR_EXPORT void LevelGridBuilder::getFluidNodeIndices(uint *fluidNodeIndices, const int level) const 
-{ 
+GRIDGENERATOR_EXPORT void LevelGridBuilder::getFluidNodeIndices(uint *fluidNodeIndices, const int level) const
+{
     grids[level]->getFluidNodeIndices(fluidNodeIndices);
 }
 
@@ -404,9 +444,9 @@ GRIDGENERATOR_EXPORT void LevelGridBuilder::getFluidNodeIndicesBorder(uint *flui
     grids[level]->getFluidNodeIndicesBorder(fluidNodeIndices);
 }
 
-uint LevelGridBuilder::getNumberOfFluidNodes(unsigned int level) const 
+uint LevelGridBuilder::getNumberOfFluidNodes(unsigned int level) const
 {
-    return grids[level]->getNumberOfFluidNodes(); 
+    return grids[level]->getNumberOfFluidNodes();
 }
 
 GRIDGENERATOR_EXPORT uint LevelGridBuilder::getNumberOfFluidNodesBorder(unsigned int level) const
@@ -432,7 +472,7 @@ void LevelGridBuilder::getSlipValues(real* normalX, real* normalY, real* normalZ
         for (uint index = 0; index < boundaryCondition->indices.size(); index++)
         {
             indices[allIndicesCounter] = grids[level]->getSparseIndex(boundaryCondition->indices[index]) + 1;
-            
+
             normalX[allIndicesCounter] = boundaryCondition->getNormalx(index);
             normalY[allIndicesCounter] = boundaryCondition->getNormaly(index);
             normalZ[allIndicesCounter] = boundaryCondition->getNormalz(index);
@@ -467,9 +507,9 @@ uint LevelGridBuilder::getStressSize(int level) const
     return size;
 }
 
-void LevelGridBuilder::getStressValues( real* normalX, real* normalY, real* normalZ, 
-                                        real* vx,      real* vy,      real* vz, 
-                                        real* vx1,     real* vy1,     real* vz1, 
+void LevelGridBuilder::getStressValues( real* normalX, real* normalY, real* normalZ,
+                                        real* vx,      real* vy,      real* vz,
+                                        real* vx1,     real* vy1,     real* vz1,
                                         int* indices, int* samplingIndices, int* samplingOffset, real* z0, int level) const
 {
 
@@ -525,7 +565,7 @@ void LevelGridBuilder::getVelocityValues(real* vx, real* vy, real* vz, int* indi
     {
         for (uint i = 0; i < (uint)boundaryCondition->indices.size(); i++)
         {
-            indices[allIndicesCounter] = grids[level]->getSparseIndex(boundaryCondition->indices[i]) +1;  
+            indices[allIndicesCounter] = grids[level]->getSparseIndex(boundaryCondition->indices[i]) +1;
 
             vx[allIndicesCounter] = boundaryCondition->getVx(i);
             vy[allIndicesCounter] = boundaryCondition->getVy(i);
@@ -594,11 +634,91 @@ void LevelGridBuilder::getPressureQs(real* qs[27], int level) const
     }
 }
 
+uint LevelGridBuilder::getPrecursorSize(int level) const
+{
+    // Total number of indices held by all precursor BCs registered on this level.
+    uint totalNodes = 0;
+    for (const auto& precursorBC : boundaryConditions[level]->precursorBoundaryConditions)
+        totalNodes += static_cast<uint>(precursorBC->indices.size());
+
+    return totalNodes;
+}
+
+void LevelGridBuilder::getPrecursorValues(  uint* neighbor0PP, uint* neighbor0PM, uint* neighbor0MP, uint* neighbor0MM,
+                                            real* weights0PP, real* weights0PM, real* weights0MP, real* weights0MM,
+                                            int* indices, std::vector<SPtr<TransientBCInputFileReader>>& reader,
+                                            int& numberOfPrecursorNodes, size_t& numberOfQuantities, uint& timeStepsBetweenReads,
+                                            real& velocityX, real& velocityY, real& velocityZ, int level) const
+{ // Collects the data of all precursor BCs on `level` into the caller-provided arrays and reference parameters; throws std::runtime_error on inconsistent or missing settings.
+    int allIndicesCounter = 0; // running number of entries written to `indices` across all BCs
+    int allNodesCounter = 0; // sum of getNPointsRead() over all readers
+    uint tmpTimeStepsBetweenReads = 0; // 0 doubles as "not set yet"
+    size_t tmpNumberOfQuantities = 0; // 0 doubles as "not set yet"
+
+    for (auto boundaryCondition : boundaryConditions[level]->precursorBoundaryConditions)
+    {
+        if( tmpTimeStepsBetweenReads == 0 ) // NOTE(review): a BC whose value is 0 leaves this unset, so a later non-zero BC is accepted without a mismatch error - confirm this is intended
+            tmpTimeStepsBetweenReads = boundaryCondition->timeStepsBetweenReads;
+        if( tmpTimeStepsBetweenReads != boundaryCondition->timeStepsBetweenReads )
+            throw std::runtime_error("All precursor boundary conditions must have the same timeStepsBetweenReads value");
+        auto BCreader = boundaryCondition->getReader();
+        BCreader->setWritingOffset(allIndicesCounter); // offset = number of indices contributed by the BCs handled before this one
+        reader.push_back(BCreader); // appended to the caller's vector; entries already in it are kept
+
+        std::vector<real> y, z;
+        real xTmp, yTmp, zTmp; // xTmp is filled below but never used; only y and z are handed to the reader
+        for(uint i = 0; i<boundaryCondition->indices.size(); i++)
+        {
+            indices[allIndicesCounter] = grids[level]->getSparseIndex(boundaryCondition->indices[i]) + 1; // stored as sparse index + 1
+            grids[level]->transIndexToCoords(boundaryCondition->indices[i], xTmp, yTmp, zTmp);
+            y.push_back(yTmp);
+            z.push_back(zTmp);
+            allIndicesCounter++;
+        }
+        BCreader->fillArrays(y, z); // receives the y/z coordinates of this BC's nodes only
+        BCreader->getNeighbors(neighbor0PP, neighbor0PM, neighbor0MP, neighbor0MM); // base pointers are passed unshifted; presumably the reader applies the writing offset itself - TODO confirm
+        BCreader->getWeights(weights0PP, weights0PM, weights0MP, weights0MM); // same unshifted base pointers as above
+        if(tmpNumberOfQuantities == 0)
+            tmpNumberOfQuantities = BCreader->getNumberOfQuantities();
+        if(tmpNumberOfQuantities != BCreader->getNumberOfQuantities())
+            throw std::runtime_error("All precursor files must have the same quantities.");
+        allNodesCounter += BCreader->getNPointsRead();
+        velocityX = boundaryCondition->getVelocityX(); // overwritten for every BC, so the values of the last BC win
+        velocityY = boundaryCondition->getVelocityY();
+        velocityZ = boundaryCondition->getVelocityZ();
+    }
+    numberOfPrecursorNodes = allNodesCounter;
+
+    if (tmpTimeStepsBetweenReads == 0) // also true when the level has no precursor BC at all (loop body never ran)
+        throw std::runtime_error("timeStepsBetweenReads of precursor needs to be larger than 0.");
+    timeStepsBetweenReads = tmpTimeStepsBetweenReads;
+
+    if (tmpNumberOfQuantities == 0) // same sentinel check as above, for the number of quantities
+        throw std::runtime_error("Number of quantities in precursor needs to be larger than 0.");
+    numberOfQuantities = tmpNumberOfQuantities;
+}
+
+void LevelGridBuilder::getPrecursorQs(real* qs[27], int level) const
+{
+    // Copies the q values of all precursor BCs on this level into qs, transposed
+    // from [node][direction] to [direction][node]; the nodes of successive BCs
+    // are appended one after another along the node dimension.
+    int writePos = 0;
+    for (const auto& precursorBC : boundaryConditions[level]->precursorBoundaryConditions)
+    {
+        for (uint node = 0; node < precursorBC->indices.size(); ++node, ++writePos)
+        {
+            for (int direction = 0; direction <= grids[level]->getEndDirection(); ++direction)
+                qs[direction][writePos] = precursorBC->qs[node][direction];
+        }
+    }
+}
+
 uint LevelGridBuilder::getGeometrySize(int level) const
 {
     if (boundaryConditions[level]->geometryBoundaryCondition)
         return  (uint)boundaryConditions[level]->geometryBoundaryCondition->indices.size();
-    
+
     return 0;
 }
 
@@ -619,9 +739,9 @@ void LevelGridBuilder::getGeometryValues(real* vx, real* vy, real* vz, int level
 {
     for (uint i = 0; i < boundaryConditions[level]->geometryBoundaryCondition->indices.size(); i++)
     {
-		vx[i] = boundaryConditions[level]->geometryBoundaryCondition->getVx(i);
-		vy[i] = boundaryConditions[level]->geometryBoundaryCondition->getVy(i);
-		vz[i] = boundaryConditions[level]->geometryBoundaryCondition->getVz(i);
+        vx[i] = boundaryConditions[level]->geometryBoundaryCondition->getVx(i);
+        vy[i] = boundaryConditions[level]->geometryBoundaryCondition->getVy(i);
+        vz[i] = boundaryConditions[level]->geometryBoundaryCondition->getVz(i);
     }
 }
 
@@ -636,7 +756,7 @@ void LevelGridBuilder::getGeometryQs(real* qs[27], int level) const
     }
 }
 
-void LevelGridBuilder::writeArrows(std::string fileName) const 
+void LevelGridBuilder::writeArrows(std::string fileName) const
 {
     QLineWriter::writeArrows(fileName, boundaryConditions[getNumberOfGridLevels() - 1]->geometryBoundaryCondition, grids[getNumberOfGridLevels() - 1]);
 }
@@ -674,4 +794,65 @@ void LevelGridBuilder::findFluidNodes(bool splitDomain)
     for (uint i = 0; i < grids.size(); i++)
         grids[i]->findFluidNodeIndices(splitDomain);
     *logging::out << logging::Logger::INFO_HIGH << "Done with findFluidNodes()\n";
-}
\ No newline at end of file
+}
+
+
+void LevelGridBuilder::addFluidNodeIndicesMacroVars(const std::vector<uint>& fluidNodeIndicesMacroVars, uint level)
+{ // Hands the given index list to the grid of `level`; `level` is used as an index without a range check.
+    grids[level]->addFluidNodeIndicesMacroVars(fluidNodeIndicesMacroVars);
+}
+
+void LevelGridBuilder::addFluidNodeIndicesApplyBodyForce(const std::vector<uint>& fluidNodeIndicesApplyBodyForce, uint level)
+{ // Hands the given index list to the grid of `level`; `level` is used as an index without a range check.
+    grids[level]->addFluidNodeIndicesApplyBodyForce(fluidNodeIndicesApplyBodyForce);
+}
+
+void LevelGridBuilder::addFluidNodeIndicesAllFeatures(const std::vector<uint>& fluidNodeIndicesAllFeatures, uint level)
+{ // Hands the given index list to the grid of `level`; `level` is used as an index without a range check.
+    grids[level]->addFluidNodeIndicesAllFeatures(fluidNodeIndicesAllFeatures);
+}
+
+void LevelGridBuilder::sortFluidNodeIndicesMacroVars(uint level)
+{ // Delegates the sort to the grid of `level`; `level` is used as an index without a range check.
+    grids[level]->sortFluidNodeIndicesMacroVars();
+}
+
+void LevelGridBuilder::sortFluidNodeIndicesApplyBodyForce(uint level)
+{ // Delegates the sort to the grid of `level`; `level` is used as an index without a range check.
+    grids[level]->sortFluidNodeIndicesApplyBodyForce();
+}
+
+void LevelGridBuilder::sortFluidNodeIndicesAllFeatures(uint level)
+{ // Delegates the sort to the grid of `level`; `level` is used as an index without a range check.
+    grids[level]->sortFluidNodeIndicesAllFeatures();
+}
+
+uint LevelGridBuilder::getNumberOfFluidNodesMacroVars(unsigned int level) const
+{ // Returns the count reported by the grid of `level`; `level` is used as an index without a range check.
+    return grids[level]->getNumberOfFluidNodeIndicesMacroVars();
+}
+
+void LevelGridBuilder::getFluidNodeIndicesMacroVars(uint *fluidNodeIndicesMacroVars, const int level) const
+{ // Passes the caller's buffer on to the grid of `level`; the required size is presumably getNumberOfFluidNodesMacroVars(level) - TODO confirm.
+    grids[level]->getFluidNodeIndicesMacroVars(fluidNodeIndicesMacroVars);
+}
+
+uint LevelGridBuilder::getNumberOfFluidNodesApplyBodyForce(unsigned int level) const
+{ // Returns the count reported by the grid of `level`; `level` is used as an index without a range check.
+    return grids[level]->getNumberOfFluidNodeIndicesApplyBodyForce();
+}
+
+void LevelGridBuilder::getFluidNodeIndicesApplyBodyForce(uint *fluidNodeIndicesApplyBodyForce, const int level) const
+{ // Passes the caller's buffer on to the grid of `level`; the required size is presumably getNumberOfFluidNodesApplyBodyForce(level) - TODO confirm.
+    grids[level]->getFluidNodeIndicesApplyBodyForce(fluidNodeIndicesApplyBodyForce);
+}
+
+uint LevelGridBuilder::getNumberOfFluidNodesAllFeatures(unsigned int level) const
+{ // Returns the count reported by the grid of `level`; `level` is used as an index without a range check.
+    return grids[level]->getNumberOfFluidNodeIndicesAllFeatures();
+}
+
+void LevelGridBuilder::getFluidNodeIndicesAllFeatures(uint *fluidNodeIndicesAllFeatures, const int level) const
+{ // Passes the caller's buffer on to the grid of `level`; the required size is presumably getNumberOfFluidNodesAllFeatures(level) - TODO confirm.
+    grids[level]->getFluidNodeIndicesAllFeatures(fluidNodeIndicesAllFeatures);
+}
diff --git a/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.h b/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.h
index afb027fc1665ab874523bf39ec2a05518d28f7a1..2e0eaf13080c46260de2a0c845fbf784a2cc3e09 100644
--- a/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.h
+++ b/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.h
@@ -1,28 +1,28 @@
 //=======================================================================================
-// ____          ____    __    ______     __________   __      __       __        __         
-// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |        
-//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |        
-//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |        
-//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____    
-//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|   
-//      \    \  |    |   ________________________________________________________________    
-//       \    \ |    |  |  ______________________________________________________________|   
-//        \    \|    |  |  |         __          __     __     __     ______      _______    
-//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)   
-//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______    
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
 //           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
-//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/   
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
 //
-//  This file is part of VirtualFluids. VirtualFluids is free software: you can 
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
 //  redistribute it and/or modify it under the terms of the GNU General Public
-//  License as published by the Free Software Foundation, either version 3 of 
+//  License as published by the Free Software Foundation, either version 3 of
 //  the License, or (at your option) any later version.
-//  
-//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT 
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
-//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 //  for more details.
-//  
+//
 //  You should have received a copy of the GNU General Public License along
 //  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
@@ -38,6 +38,8 @@
 #include <memory>
 #include <array>
 
+#include <lbm/constants/NumericConstants.h>
+
 #include "gpu/GridGenerator/global.h"
 
 #include "gpu/GridGenerator/grid/GridBuilder/GridBuilder.h"
@@ -45,6 +47,8 @@
 #include "gpu/GridGenerator/grid/GridInterface.h"
 #include "gpu/GridGenerator/grid/NodeValues.h"
 
+using namespace vf::lbm::constant;
+
 struct Vertex;
 class  Grid;
 class Transformator;
@@ -58,9 +62,11 @@ class SlipBoundaryCondition;
 class StressBoundaryCondition;
 class PressureBoundaryCondition;
 class GeometryBoundaryCondition;
+class PrecursorBoundaryCondition;
 enum class SideType;
 
-
+class TransientBCInputFileReader;
+class FileCollection;
 
 class LevelGridBuilder : public GridBuilder
 {
@@ -75,11 +81,14 @@ public:
     GRIDGENERATOR_EXPORT  ~LevelGridBuilder() override;
 
     GRIDGENERATOR_EXPORT void setSlipBoundaryCondition(SideType sideType, real nomalX, real normalY, real normalZ);
-    GRIDGENERATOR_EXPORT void setStressBoundaryCondition(SideType sideType, real nomalX, real normalY, real normalZ, uint samplingOffset, real z0);
+    GRIDGENERATOR_EXPORT void setStressBoundaryCondition(SideType sideType, real nomalX, real normalY, real normalZ, uint samplingOffset, real z0, real dx);
     GRIDGENERATOR_EXPORT void setVelocityBoundaryCondition(SideType sideType, real vx, real vy, real vz);
     GRIDGENERATOR_EXPORT void setPressureBoundaryCondition(SideType sideType, real rho);
     GRIDGENERATOR_EXPORT void setPeriodicBoundaryCondition(bool periodic_X, bool periodic_Y, bool periodic_Z);
     GRIDGENERATOR_EXPORT void setNoSlipBoundaryCondition(SideType sideType);
+    GRIDGENERATOR_EXPORT void setPrecursorBoundaryCondition(SideType sideType, SPtr<FileCollection> fileCollection, int timeStepsBetweenReads,
+                                                            real velocityX=c0o1, real velocityY=c0o1, real velocityZ=c0o1,
+                                                            std::vector<uint> fileLevelToGridLevelMap = {});
 
     GRIDGENERATOR_EXPORT void setEnableFixRefinementIntoTheWall(bool enableFixRefinementIntoTheWall);
 
@@ -97,7 +106,7 @@ public:
     GRIDGENERATOR_EXPORT virtual void getFluidNodeIndicesBorder(uint *fluidNodeIndices, const int level) const override;
 
     GRIDGENERATOR_EXPORT virtual void getNodeValues(real *xCoords, real *yCoords, real *zCoords,
-                                         uint *neighborX, uint *neighborY, uint *neighborZ, uint *neighborNegative, 
+                                         uint *neighborX, uint *neighborY, uint *neighborZ, uint *neighborNegative,
                                          uint *geo, const int level) const override;
     GRIDGENERATOR_EXPORT virtual void getDimensions(int &nx, int &ny, int &nz, const int level) const override;
 
@@ -107,12 +116,12 @@ public:
     GRIDGENERATOR_EXPORT virtual void getSlipQs(real* qs[27], int level) const override;
 
     GRIDGENERATOR_EXPORT uint getStressSize(int level) const override;
-    GRIDGENERATOR_EXPORT virtual void getStressValues(  real* normalX, real* normalY, real* normalZ, 
-                                                        real* vx,      real* vy,      real* vz, 
-                                                        real* vx1,     real* vy1,     real* vz1, 
+    GRIDGENERATOR_EXPORT virtual void getStressValues(  real* normalX, real* normalY, real* normalZ,
+                                                        real* vx,      real* vy,      real* vz,
+                                                        real* vx1,     real* vy1,     real* vz1,
                                                         int* indices, int* samplingIndices, int* samplingOffsets, real* z0, int level) const override;
     GRIDGENERATOR_EXPORT virtual void getStressQs(real* qs[27], int level) const override;
-        
+
     GRIDGENERATOR_EXPORT uint getVelocitySize(int level) const override;
     GRIDGENERATOR_EXPORT virtual void getVelocityValues(real* vx, real* vy, real* vz, int* indices, int level) const override;
     GRIDGENERATOR_EXPORT virtual void getVelocityQs(real* qs[27], int level) const override;
@@ -121,6 +130,14 @@ public:
     GRIDGENERATOR_EXPORT void getPressureValues(real* rho, int* indices, int* neighborIndices, int level) const override;
     GRIDGENERATOR_EXPORT virtual void getPressureQs(real* qs[27], int level) const override;
 
+    GRIDGENERATOR_EXPORT uint getPrecursorSize(int level) const override;
+    GRIDGENERATOR_EXPORT void getPrecursorValues(   uint* neighbor0PP, uint* neighbor0PM, uint* neighbor0MP, uint* neighbor0MM,
+                                                    real* weights0PP, real* weights0PM, real* weights0MP, real* weights0MM,
+                                                    int* indices, std::vector<SPtr<TransientBCInputFileReader>>& reader,
+                                                    int& numberOfPrecursorNodes, size_t& numberOfQuantities, uint& timeStepsBetweenReads,
+                                                    real& velocityX, real& velocityY, real& velocityZ, int level) const override;
+    GRIDGENERATOR_EXPORT virtual void getPrecursorQs(real* qs[27], int level) const override;
+
     GRIDGENERATOR_EXPORT virtual void getGeometryQs(real *qs[27], int level) const override;
     GRIDGENERATOR_EXPORT virtual uint getGeometrySize(int level) const override;
     GRIDGENERATOR_EXPORT virtual void getGeometryIndices(int *indices, int level) const override;
@@ -133,11 +150,11 @@ public:
     GRIDGENERATOR_EXPORT SPtr<GeometryBoundaryCondition> getGeometryBoundaryCondition(uint level) const override;
 
 protected:
-    
+
 
     struct BoundaryConditions
     {
-		BoundaryConditions() = default;
+        BoundaryConditions() = default;
 
         std::vector<SPtr<SlipBoundaryCondition>> slipBoundaryConditions;
 
@@ -149,13 +166,15 @@ protected:
 
         std::vector<SPtr<VelocityBoundaryCondition>> noSlipBoundaryConditions;
 
+        std::vector<SPtr<PrecursorBoundaryCondition>> precursorBoundaryConditions;
+
         SPtr<GeometryBoundaryCondition> geometryBoundaryCondition;
     };
     bool geometryHasValues = false;
 
     std::vector<std::shared_ptr<Grid> > grids;
     std::vector<SPtr<BoundaryConditions> > boundaryConditions;
-    
+
     std::array<uint, 6> communicationProcesses;
 
     void checkLevel(int level);
@@ -194,7 +213,21 @@ public:
 
     // needed for CUDA Streams MultiGPU (Communication Hiding)
     void findFluidNodes(bool splitDomain) override;
+
+    void addFluidNodeIndicesMacroVars(const std::vector<uint>& fluidNodeIndicesMacroVars, uint level) override;
+    void addFluidNodeIndicesApplyBodyForce(const std::vector<uint>& fluidNodeIndicesApplyBodyForce, uint level) override;
+    void addFluidNodeIndicesAllFeatures(const std::vector<uint>& fluidNodeIndicesAllFeatures, uint level) override;
+
+    void sortFluidNodeIndicesMacroVars(uint level) override;
+    void sortFluidNodeIndicesApplyBodyForce(uint level) override;
+    void sortFluidNodeIndicesAllFeatures(uint level) override;
+
+    uint getNumberOfFluidNodesMacroVars(unsigned int level) const override;
+    void getFluidNodeIndicesMacroVars(uint *fluidNodeIndicesMacroVars, const int level) const override;
+    uint getNumberOfFluidNodesApplyBodyForce(unsigned int level) const override;
+    void getFluidNodeIndicesApplyBodyForce(uint *fluidNodeIndicesApplyBodyForce, const int level) const override;
+    uint getNumberOfFluidNodesAllFeatures(unsigned int level) const override;
+    void getFluidNodeIndicesAllFeatures(uint *fluidNodeIndicesAllFeatures, const int level) const override;
 };
 
 #endif
-
diff --git a/src/gpu/GridGenerator/grid/GridImp.cpp b/src/gpu/GridGenerator/grid/GridImp.cpp
index 31bbf3ddc87184846fcb01a3e6631358b6a6f864..32cf9d07da87149695a5bf548ed357be2b2f71b4 100644
--- a/src/gpu/GridGenerator/grid/GridImp.cpp
+++ b/src/gpu/GridGenerator/grid/GridImp.cpp
@@ -1,28 +1,28 @@
 //=======================================================================================
-// ____          ____    __    ______     __________   __      __       __        __         
-// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |        
-//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |        
-//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |        
-//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____    
-//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|   
-//      \    \  |    |   ________________________________________________________________    
-//       \    \ |    |  |  ______________________________________________________________|   
-//        \    \|    |  |  |         __          __     __     __     ______      _______    
-//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)   
-//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______    
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
 //           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
-//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/   
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
 //
-//  This file is part of VirtualFluids. VirtualFluids is free software: you can 
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
 //  redistribute it and/or modify it under the terms of the GNU General Public
-//  License as published by the Free Software Foundation, either version 3 of 
+//  License as published by the Free Software Foundation, either version 3 of
 //  the License, or (at your option) any later version.
-//  
-//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT 
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
-//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 //  for more details.
-//  
+//
 //  You should have received a copy of the GNU General Public License along
 //  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
@@ -33,7 +33,6 @@
 #include "GridImp.h"
 
 #include <iostream>
-#include <omp.h>
 #include <sstream>
 # include <algorithm>
 #include <cmath>
@@ -61,8 +60,8 @@ int DIRECTIONS[DIR_END_MAX][DIMENSION];
 
 using namespace vf::gpu;
 
-GridImp::GridImp(Object* object, real startX, real startY, real startZ, real endX, real endY, real endZ, real delta, Distribution distribution, uint level) 
-            : object(object), 
+GridImp::GridImp(Object* object, real startX, real startY, real startZ, real endX, real endY, real endZ, real delta, Distribution distribution, uint level)
+            : object(object),
     startX(startX),
     startY(startY),
     startZ(startZ),
@@ -135,7 +134,7 @@ void GridImp::inital(const SPtr<Grid> fineGrid, uint numberOfLayers)
 #pragma omp parallel for
     for (int index = 0; index < (int)this->size; index++)
         this->initalNodeToOutOfGrid(index);
-    
+
     if( this->innerRegionFromFinerGrid ){
         *logging::out << logging::Logger::INFO_INTERMEDIATE << "Start setInnerBasedOnFinerGrid()\n";
         this->setInnerBasedOnFinerGrid(fineGrid);
@@ -147,12 +146,12 @@ void GridImp::inital(const SPtr<Grid> fineGrid, uint numberOfLayers)
 
     *logging::out << logging::Logger::INFO_INTERMEDIATE << "Start addOverlap()\n";
     this->addOverlap();
-    
+
     *logging::out << logging::Logger::INFO_INTERMEDIATE << "Start fixOddCells()\n";
 #pragma omp parallel for
     for (int index = 0; index < (int)this->size; index++)
         this->fixOddCell(index);
-    
+
     if( enableFixRefinementIntoTheWall )
     {
         *logging::out << logging::Logger::INFO_INTERMEDIATE << "Start fixRefinementIntoWall()\n";
@@ -180,12 +179,12 @@ void GridImp::inital(const SPtr<Grid> fineGrid, uint numberOfLayers)
             }
         }
     }
-    
+
     *logging::out << logging::Logger::INFO_INTERMEDIATE << "Start findEndOfGridStopperNodes()\n";
 #pragma omp parallel for
     for (int index = 0; index < (int)this->size; index++)
         this->findEndOfGridStopperNode(index);
-    
+
     *logging::out << logging::Logger::INFO_INTERMEDIATE
         << "Grid created: " << "from (" << this->startX << ", " << this->startY << ", " << this->startZ << ") to (" << this->endX << ", " << this->endY << ", " << this->endZ << ")\n"
         << "nodes: " << this->nx << " x " << this->ny << " x " << this->nz << " = " << this->size << "\n";
@@ -209,9 +208,9 @@ void GridImp::freeMemory()
     if( this->neighborIndexZ        != nullptr ) { delete[] this->neighborIndexZ;        this->neighborIndexZ        = nullptr; }
     if( this->neighborIndexNegative != nullptr ) { delete[] this->neighborIndexNegative; this->neighborIndexNegative = nullptr; }
     if( this->sparseIndices         != nullptr ) { delete[] this->sparseIndices;         this->sparseIndices         = nullptr; }
-	if( this->qIndices              != nullptr ) { delete[] this->qIndices;              this->qIndices              = nullptr; }
-	if( this->qValues               != nullptr ) { delete[] this->qValues;               this->qValues               = nullptr; }
-	if( this->qPatches              != nullptr ) { delete[] this->qPatches;              this->qPatches              = nullptr; }
+    if( this->qIndices              != nullptr ) { delete[] this->qIndices;              this->qIndices              = nullptr; }
+    if( this->qValues               != nullptr ) { delete[] this->qValues;               this->qValues               = nullptr; }
+    if( this->qPatches              != nullptr ) { delete[] this->qPatches;              this->qPatches              = nullptr; }
 
     field.freeMemory();
 }
@@ -254,7 +253,7 @@ void GridImp::discretize(Object* solidObject, char innerType, char outerType)
         this->sparseIndices[index] = index;
 
         if( this->getFieldEntry(index) == innerType ) continue;
-        
+
         real x, y, z;
         this->transIndexToCoords(index, x, y, z);
 
@@ -279,7 +278,7 @@ bool GridImp::isInside(const Cell& cell) const
 //    |       +-----+-----+-----+           | +-----+-----+-----+
 //    +---------+                           +---------+
 //               0     1     2                   0     1     2
-//              even      even                        even     
+//              even      even                        even
 //                   odd                        odd         odd
 //
 Cell GridImp::getOddCellFromIndex(uint index) const
@@ -349,7 +348,7 @@ void GridImp::addOverlap()
 void GridImp::setOverlapTmp( uint index )
 {
     if( this->field.is( index, INVALID_OUT_OF_GRID ) ){
-        
+
         if( this->hasNeighborOfType(index, FLUID) ){
             this->field.setFieldEntry( index, OVERLAP_TMP );
         }
@@ -380,7 +379,7 @@ void GridImp::fixRefinementIntoWall(uint xIndex, uint yIndex, uint zIndex, int d
     if(  this->xOddStart && ( dir == 1 || dir == -1 ) && ( xIndex % 2 == 0 && xIndex != 0 ) ) return;
     if(  this->yOddStart && ( dir == 2 || dir == -2 ) && ( yIndex % 2 == 0 && yIndex != 0 ) ) return;
     if(  this->zOddStart && ( dir == 3 || dir == -3 ) && ( zIndex % 2 == 0 && zIndex != 0 ) ) return;
-    
+
     //////////////////////////////////////////////////////////////////////////
 
     real dx{ 0.0 }, dy{ 0.0 }, dz{ 0.0 };
@@ -433,31 +432,31 @@ void GridImp::findStopperNode(uint index) // deprecated
 
 void GridImp::findEndOfGridStopperNode(uint index)
 {
-	if (isValidEndOfGridStopper(index)){
+    if (isValidEndOfGridStopper(index)){
         if( this->level != 0 )
-		    this->field.setFieldEntryToStopperOutOfGrid(index);
+            this->field.setFieldEntryToStopperOutOfGrid(index);
         else
             this->field.setFieldEntryToStopperOutOfGridBoundary(index);
     }
-    
-	if (isValidEndOfGridBoundaryStopper(index))
-		this->field.setFieldEntryToStopperOutOfGridBoundary(index);
+
+    if (isValidEndOfGridBoundaryStopper(index))
+        this->field.setFieldEntryToStopperOutOfGridBoundary(index);
 }
 
 void GridImp::findSolidStopperNode(uint index)
 {
-	if (isValidSolidStopper(index))
-		this->field.setFieldEntry(index, STOPPER_SOLID);
+    if (isValidSolidStopper(index))
+        this->field.setFieldEntry(index, STOPPER_SOLID);
 }
 
 void GridImp::findBoundarySolidNode(uint index)
 {
-	if (shouldBeBoundarySolidNode(index)) 
-	{
-		this->field.setFieldEntry(index, BC_SOLID);
-		this->qIndices[index] = this->numberOfSolidBoundaryNodes++;
-		//grid->setNumberOfSolidBoundaryNodes(grid->getNumberOfSolidBoundaryNodes() + 1);
-	}
+    if (shouldBeBoundarySolidNode(index))
+    {
+        this->field.setFieldEntry(index, BC_SOLID);
+        this->qIndices[index] = this->numberOfSolidBoundaryNodes++;
+        //grid->setNumberOfSolidBoundaryNodes(grid->getNumberOfSolidBoundaryNodes() + 1);
+    }
 }
 
 void GridImp::fixOddCell(uint index)
@@ -483,9 +482,9 @@ bool GridImp::isOutSideOfGrid(Cell &cell) const
 bool GridImp::contains(Cell &cell, char type) const
 {
     for (const auto point : cell) {
-		uint index = transCoordToIndex(point.x, point.y, point.z);
-		if (index == INVALID_INDEX)
-			continue;
+        uint index = transCoordToIndex(point.x, point.y, point.z);
+        if (index == INVALID_INDEX)
+            continue;
         if (field.is(index, type))
             return true;
     }
@@ -495,8 +494,8 @@ bool GridImp::contains(Cell &cell, char type) const
 bool GridImp::cellContainsOnly(Cell &cell, char type) const
 {
     for (const auto point : cell) {
-		uint index = transCoordToIndex(point.x, point.y, point.z);
-		if (index == INVALID_INDEX)
+        uint index = transCoordToIndex(point.x, point.y, point.z);
+        if (index == INVALID_INDEX)
             return false;
         if (!field.is(index, type))
             return false;
@@ -507,8 +506,8 @@ bool GridImp::cellContainsOnly(Cell &cell, char type) const
 bool GridImp::cellContainsOnly(Cell &cell, char typeA, char typeB) const
 {
     for (const auto point : cell) {
-		uint index = transCoordToIndex(point.x, point.y, point.z);
-		if (index == INVALID_INDEX)
+        uint index = transCoordToIndex(point.x, point.y, point.z);
+        if (index == INVALID_INDEX)
             return false;
         if (!field.is(index, typeA) && !field.is(index, typeB))
             return false;
@@ -524,91 +523,91 @@ const Object * GridImp::getObject() const
 void GridImp::setNodeTo(Cell &cell, char type)
 {
     for (const auto point : cell) {
-		uint index = transCoordToIndex(point.x, point.y, point.z);
-		if (index == INVALID_INDEX)
-			continue;
-		field.setFieldEntry(index, type);
+        uint index = transCoordToIndex(point.x, point.y, point.z);
+        if (index == INVALID_INDEX)
+            continue;
+        field.setFieldEntry(index, type);
     }
 }
 
 void GridImp::setNodeTo(uint index, char type)
 {
-	if( index != INVALID_INDEX )
-		field.setFieldEntry(index, type);
+    if( index != INVALID_INDEX )
+        field.setFieldEntry(index, type);
 }
 
 bool GridImp::isNode(uint index, char type) const
 {
     if( index != INVALID_INDEX )
-		return field.is(index, type);
+        return field.is(index, type);
 
     throw std::runtime_error("GridImp::isNode() -> index == INVALID_INDEX not supported.");
 }
 
 bool GridImp::isValidEndOfGridStopper(uint index) const
 {
-	// Lenz: also includes corner stopper nodes
-	if (!this->field.is(index, INVALID_OUT_OF_GRID))
-		return false;
+    // Lenz: also includes corner stopper nodes
+    if (!this->field.is(index, INVALID_OUT_OF_GRID))
+        return false;
 
-	return hasNeighborOfType(index, FLUID);
+    return hasNeighborOfType(index, FLUID);
 }
 
 bool GridImp::isValidEndOfGridBoundaryStopper(uint index) const
 {
-	// Lenz: also includes corner stopper nodes
-	if (!this->field.is(index, FLUID))
-		return false;
+    // Lenz: also includes corner stopper nodes
+    if (!this->field.is(index, FLUID))
+        return false;
 
-	return ! hasAllNeighbors(index);
+    return ! hasAllNeighbors(index);
 }
 
 bool GridImp::isValidSolidStopper(uint index) const
 {
-	// Lenz: also includes corner stopper nodes
-	if (!this->field.is(index, INVALID_SOLID))
-		return false;
+    // Lenz: also includes corner stopper nodes
+    if (!this->field.is(index, INVALID_SOLID))
+        return false;
 
-	return hasNeighborOfType(index, FLUID);
+    return hasNeighborOfType(index, FLUID);
 }
 
 bool GridImp::shouldBeBoundarySolidNode(uint index) const
 {
-	if (!this->field.is(index, FLUID))
-		return false;
+    if (!this->field.is(index, FLUID))
+        return false;
 
-	return hasNeighborOfType(index, STOPPER_SOLID);
+    return hasNeighborOfType(index, STOPPER_SOLID);
 }
 
 bool GridImp::hasAllNeighbors(uint index) const
 {
-	// new version by Lenz, utilizes the range based for loop for all directions
-	real x, y, z;
-	this->transIndexToCoords(index, x, y, z);
-	for (const auto dir : this->distribution) {
-		const uint neighborIndex = this->transCoordToIndex(x + dir[0] * this->getDelta(), y + dir[1] * this->getDelta(), z + dir[2] * this->getDelta());
+    // new version by Lenz, utilizes the range based for loop for all directions
+    real x, y, z;
+    this->transIndexToCoords(index, x, y, z);
+    for (const auto dir : this->distribution) {
+        const uint neighborIndex = this->transCoordToIndex(x + dir[0] * this->getDelta(), y + dir[1] * this->getDelta(), z + dir[2] * this->getDelta());
 
-		if (neighborIndex == INVALID_INDEX) return false;
-	}
+        if (neighborIndex == INVALID_INDEX) return false;
+    }
 
-	return true;
+    return true;
 }
 
 bool GridImp::hasNeighborOfType(uint index, char type) const
 {
-	// new version by Lenz, utilizes the range based for loop for all directions
-	real x, y, z;
-	this->transIndexToCoords(index, x, y, z);
-	for (const auto dir : this->distribution) {
-		const uint neighborIndex = this->transCoordToIndex(x + dir[0] * this->getDelta(), y + dir[1] * this->getDelta(), z + dir[2] * this->getDelta());
+    // new version by Lenz, utilizes the range based for loop for all directions
+    real x, y, z;
+    this->transIndexToCoords(index, x, y, z);
+    for (const auto dir : this->distribution) {
+        const uint neighborIndex = this->transCoordToIndex(x + dir[0] * this->getDelta(), y + dir[1] * this->getDelta(), z + dir[2] * this->getDelta());
 
-		if (neighborIndex == INVALID_INDEX) continue;
+        if (neighborIndex == INVALID_INDEX) continue;
 
-		if (this->field.is(neighborIndex, type))
-			return true;
-	}
+        if (this->field.is(neighborIndex, type))
+            return true;
+    }
 
-	return false;
+    return false;
 }
 
 bool GridImp::nodeInNextCellIs(int index, char type) const
@@ -630,13 +629,13 @@ bool GridImp::nodeInNextCellIs(int index, char type) const
 
     const uint indexXYZ = transCoordToIndex(neighborX, neighborY, neighborZ);
 
-	const bool typeX   = indexX   == INVALID_INDEX ? false : this->field.is(indexX, type);
-	const bool typeY   = indexY   == INVALID_INDEX ? false : this->field.is(indexY, type);
-	const bool typeXY  = indexXY  == INVALID_INDEX ? false : this->field.is(indexXY, type);
-	const bool typeZ   = indexZ   == INVALID_INDEX ? false : this->field.is(indexZ, type);
-	const bool typeYZ  = indexYZ  == INVALID_INDEX ? false : this->field.is(indexYZ, type);
-	const bool typeXZ  = indexXZ  == INVALID_INDEX ? false : this->field.is(indexXZ, type);
-	const bool typeXYZ = indexXYZ == INVALID_INDEX ? false : this->field.is(indexXYZ, type);
+    const bool typeX   = indexX   == INVALID_INDEX ? false : this->field.is(indexX, type);
+    const bool typeY   = indexY   == INVALID_INDEX ? false : this->field.is(indexY, type);
+    const bool typeXY  = indexXY  == INVALID_INDEX ? false : this->field.is(indexXY, type);
+    const bool typeZ   = indexZ   == INVALID_INDEX ? false : this->field.is(indexZ, type);
+    const bool typeYZ  = indexYZ  == INVALID_INDEX ? false : this->field.is(indexYZ, type);
+    const bool typeXZ  = indexXZ  == INVALID_INDEX ? false : this->field.is(indexXZ, type);
+    const bool typeXYZ = indexXYZ == INVALID_INDEX ? false : this->field.is(indexXYZ, type);
 
     return typeX || typeY || typeXY || typeZ || typeYZ
         || typeXZ || typeXYZ;
@@ -661,13 +660,13 @@ bool GridImp::nodeInPreviousCellIs(int index, char type) const
 
     const uint indexXYZ = transCoordToIndex(neighborX, neighborY, neighborZ);
 
-	const bool typeX   = indexX   == INVALID_INDEX ? false : this->field.is(indexX  , type);
-	const bool typeY   = indexY   == INVALID_INDEX ? false : this->field.is(indexY  , type);
-	const bool typeXY  = indexXY  == INVALID_INDEX ? false : this->field.is(indexXY , type);
-	const bool typeZ   = indexZ   == INVALID_INDEX ? false : this->field.is(indexZ  , type);
-	const bool typeYZ  = indexYZ  == INVALID_INDEX ? false : this->field.is(indexYZ , type);
-	const bool typeXZ  = indexXZ  == INVALID_INDEX ? false : this->field.is(indexXZ , type);
-	const bool typeXYZ = indexXYZ == INVALID_INDEX ? false : this->field.is(indexXYZ, type);
+    const bool typeX   = indexX   == INVALID_INDEX ? false : this->field.is(indexX  , type);
+    const bool typeY   = indexY   == INVALID_INDEX ? false : this->field.is(indexY  , type);
+    const bool typeXY  = indexXY  == INVALID_INDEX ? false : this->field.is(indexXY , type);
+    const bool typeZ   = indexZ   == INVALID_INDEX ? false : this->field.is(indexZ  , type);
+    const bool typeYZ  = indexYZ  == INVALID_INDEX ? false : this->field.is(indexYZ , type);
+    const bool typeXZ  = indexXZ  == INVALID_INDEX ? false : this->field.is(indexXZ , type);
+    const bool typeXYZ = indexXYZ == INVALID_INDEX ? false : this->field.is(indexXYZ, type);
 
     return typeX || typeY || typeXY || typeZ || typeYZ
         || typeXZ || typeXYZ;
@@ -678,8 +677,8 @@ bool GridImp::nodeInCellIs(Cell& cell, char type) const
     for (const auto node : cell)
     {
         const uint index = transCoordToIndex(node.x, node.y, node.z);
-		if (index == INVALID_INDEX)
-			continue;
+        if (index == INVALID_INDEX)
+            continue;
         if (field.is(index, type))
             return true;
     }
@@ -696,9 +695,9 @@ void GridImp::setCellTo(uint index, char type)
     for (const auto node : cell)
     {
         const uint nodeIndex = transCoordToIndex(node.x, node.y, node.z);
-		if (nodeIndex == INVALID_INDEX)
-			continue;
-		this->field.setFieldEntry(nodeIndex, type);
+        if (nodeIndex == INVALID_INDEX)
+            continue;
+        this->field.setFieldEntry(nodeIndex, type);
     }
 }
 
@@ -712,15 +711,21 @@ void GridImp::setNonStopperOutOfGridCellTo(uint index, char type)
     for (const auto node : cell)
     {
         const uint nodeIndex = transCoordToIndex(node.x, node.y, node.z);
-		if (nodeIndex == INVALID_INDEX)
-			continue;
+        if (nodeIndex == INVALID_INDEX)
+            continue;
 
-        if( this->getFieldEntry( nodeIndex ) != STOPPER_OUT_OF_GRID && 
+        if( this->getFieldEntry( nodeIndex ) != STOPPER_OUT_OF_GRID &&
             this->getFieldEntry( nodeIndex ) != STOPPER_OUT_OF_GRID_BOUNDARY )
             this->field.setFieldEntry(nodeIndex, type);
     }
 }
 
+bool GridImp::nodeHasBC(uint index) const // true if the node's field entry equals one of the BC_* types tested below
+{
+    const auto fieldEntry = getFieldEntry(index); // read the entry once and compare the cached value
+    return fieldEntry == vf::gpu::BC_PRESSURE || fieldEntry == vf::gpu::BC_VELOCITY || fieldEntry == vf::gpu::BC_NOSLIP ||
+           fieldEntry == vf::gpu::BC_SLIP     || fieldEntry == vf::gpu::BC_STRESS;
+}
 
 void GridImp::setPeriodicity(bool periodicityX, bool periodicityY, bool periodicityZ)
 {
@@ -744,17 +749,17 @@ void GridImp::setPeriodicityZ(bool periodicity)
     this->periodicityZ = periodicity;
 }
 
-bool GridImp::getPeriodicityX()
+bool GridImp::getPeriodicityX() const
 {
     return this->periodicityX;
 }
 
-bool GridImp::getPeriodicityY()
+bool GridImp::getPeriodicityY() const
 {
     return this->periodicityY;
 }
 
-bool GridImp::getPeriodicityZ()
+bool GridImp::getPeriodicityZ() const
 {
     return this->periodicityZ;
 }
@@ -770,7 +775,7 @@ uint GridImp::transCoordToIndex(const real &x, const real &y, const real &z) con
     const uint yIndex = getYIndex(y);
     const uint zIndex = getZIndex(z);
 
-	if (xIndex >= nx || yIndex >= ny || zIndex >= nz)
+    if (xIndex >= nx || yIndex >= ny || zIndex >= nz)
         return INVALID_INDEX;
 
     return xIndex + nx * (yIndex + ny * zIndex);
@@ -819,20 +824,20 @@ TriangularMeshDiscretizationStrategy * GridImp::getTriangularMeshDiscretizationS
 
 uint GridImp::getNumberOfSolidBoundaryNodes() const
 {
-	return this->numberOfSolidBoundaryNodes;
+    return this->numberOfSolidBoundaryNodes;
 }
 
 void GridImp::setNumberOfSolidBoundaryNodes(uint numberOfSolidBoundaryNodes)
 {
-	if (numberOfSolidBoundaryNodes < INVALID_INDEX)
-		this->numberOfSolidBoundaryNodes = numberOfSolidBoundaryNodes;
+    if (numberOfSolidBoundaryNodes < INVALID_INDEX)
+        this->numberOfSolidBoundaryNodes = numberOfSolidBoundaryNodes;
 }
 
 real GridImp::getQValue(const uint index, const uint dir) const
 {
-	const int qIndex = dir * this->numberOfSolidBoundaryNodes + this->qIndices[index];
+    const int qIndex = dir * this->numberOfSolidBoundaryNodes + this->qIndices[index];
 
-	return this->qValues[qIndex];
+    return this->qValues[qIndex];
 }
 
 uint GridImp::getQPatch(const uint index) const
@@ -858,7 +863,7 @@ void GridImp::findSparseIndices(SPtr<Grid> finerGrid)
 {
     *logging::out << logging::Logger::INFO_INTERMEDIATE << "Find sparse indices...";
     auto fineGrid = std::static_pointer_cast<GridImp>(finerGrid);
-    
+
     this->updateSparseIndices();
 
 #pragma omp parallel for
@@ -906,7 +911,7 @@ void GridImp::updateSparseIndices()
     sparseSize = size - removedNodes;
 }
 
-void GridImp::findFluidNodeIndices(bool splitDomain) 
+void GridImp::findFluidNodeIndices(bool splitDomain)
 {
     // find sparse index of all fluid nodes
     this->fluidNodeIndices.clear();
@@ -935,7 +940,7 @@ void GridImp::findFluidNodeIndicesBorder() {
     // resize fluidNodeIndicesBorder (for better performance in copy operation)
     size_t newSize = 0;
     for (CommunicationIndices& ci : this->communicationIndices)
-        newSize += ci.sendIndices.size();    
+        newSize += ci.sendIndices.size();
     this->fluidNodeIndicesBorder.reserve(newSize);
 
     // copy all send indices to fluidNodeIndicesBorder
@@ -968,7 +973,7 @@ void GridImp::setNeighborIndices(uint index)
         this->setStopperNeighborCoords(index);
         return;
     }
-     
+
     if (this->sparseIndices[index] == -1)
         return;
 
@@ -1002,9 +1007,9 @@ void GridImp::setStopperNeighborCoords(uint index)
     if (vf::Math::lessEqual(z + delta, endZ + (0.5 * delta)) && !this->field.isInvalidOutOfGrid(this->transCoordToIndex(x, y, z + delta)))
         neighborIndexZ[index] = getSparseIndex(x, y, z + delta);
 
-    if (vf::Math::greaterEqual(x - delta, endX) && 
-        vf::Math::greaterEqual(y - delta, endY) && 
-        vf::Math::greaterEqual(z - delta, endZ) && 
+    if (vf::Math::greaterEqual(x - delta, endX) &&
+        vf::Math::greaterEqual(y - delta, endY) &&
+        vf::Math::greaterEqual(z - delta, endZ) &&
         !this->field.isInvalidOutOfGrid(this->transCoordToIndex(x - delta, y - delta, z - delta)))
     {
         neighborIndexNegative[index] = getSparseIndex(x - delta, y - delta, z - delta);
@@ -1035,7 +1040,7 @@ real GridImp::getNeighborCoord(bool periodicity, real startCoord, real coords[3]
             return coords[direction] + delta;
 
     }
-    
+
     return coords[direction] + delta;
 }
 
@@ -1061,7 +1066,7 @@ real GridImp::getNegativeNeighborCoord(bool periodicity, real startCoord, real c
 
         return getLastFluidNode(coords, direction, startCoord);
     }
-    
+
     return coords[direction] - delta;
 }
 
@@ -1155,7 +1160,7 @@ void GridImp::limitToSubDomain(SPtr<BoundingBox> subDomainBox, LbmOrGks lbmOrGks
             if( lbmOrGks == LBM )
                 tmpSubDomainBox.extend(this->delta);
 
-            if (!tmpSubDomainBox.isInside(x, y, z) 
+            if (!tmpSubDomainBox.isInside(x, y, z)
                 && ( this->getFieldEntry(index) == FLUID ||
                      this->getFieldEntry(index) == FLUID_CFC ||
                      this->getFieldEntry(index) == FLUID_CFF ||
@@ -1184,13 +1189,13 @@ void GridImp::limitToSubDomain(SPtr<BoundingBox> subDomainBox, LbmOrGks lbmOrGks
 
 void GridImp::findGridInterfaceCF(uint index, GridImp& finerGrid, LbmOrGks lbmOrGks)
 {
-	if (lbmOrGks == LBM)
-	{
-		gridInterface->findInterfaceCF            (index, this, &finerGrid);
-		gridInterface->findBoundaryGridInterfaceCF(index, this, &finerGrid);
-	}
-	else if (lbmOrGks == GKS)
-		gridInterface->findInterfaceCF_GKS(index, this, &finerGrid);
+    if (lbmOrGks == LBM)
+    {
+        gridInterface->findInterfaceCF            (index, this, &finerGrid);
+        gridInterface->findBoundaryGridInterfaceCF(index, this, &finerGrid);
+    }
+    else if (lbmOrGks == GKS)
+        gridInterface->findInterfaceCF_GKS(index, this, &finerGrid);
 }
 
 void GridImp::findGridInterfaceFC(uint index, GridImp& finerGrid)
@@ -1217,16 +1222,16 @@ void GridImp::mesh(Object* object)
     if (triangularMesh)
         triangularMeshDiscretizationStrategy->discretize(triangularMesh, this, INVALID_SOLID, FLUID);
     else
-		//new method for geometric primitives (not cell based) to be implemented
+        //new method for geometric primitives (not cell based) to be implemented
         this->discretize(object, INVALID_SOLID, FLUID);
 
     this->closeNeedleCells();
 
-	#pragma omp parallel for
+    #pragma omp parallel for
     for (int index = 0; index < (int)this->size; index++)
         this->findSolidStopperNode(index);
 
-	//#pragma omp parallel for
+    //#pragma omp parallel for
     for (int index = 0; index < (int)this->size; index++) {
         this->findBoundarySolidNode(index);
     }
@@ -1359,7 +1364,7 @@ void GridImp::findQs(Object* object) //TODO: enable qs for primitive objects
         findQsPrimitive(object);
 }
 
-void GridImp::allocateQs() 
+void GridImp::allocateQs()
 {
     this->qPatches = new uint[this->getNumberOfSolidBoundaryNodes()];
 
@@ -1379,8 +1384,8 @@ void GridImp::findQs(TriangularMesh &triangularMesh)
 
     if( this->qComputationStage == qComputationStageType::ComputeQs )
         allocateQs();
-    
-    
+
+
 #pragma omp parallel for
     for (int i = 0; i < triangularMesh.size; i++)
         this->findQs(triangularMesh.triangles[i]);
@@ -1406,15 +1411,15 @@ void GridImp::findQs(Triangle &triangle)
                 //if (!field.isFluid(index))
                 //    continue;
 
-				if( index == INVALID_INDEX ) continue;
+                if( index == INVALID_INDEX ) continue;
 
                 const Vertex point(x, y, z);
 
                 if( this->qComputationStage == qComputationStageType::ComputeQs ){
                     if(this->field.is(index, BC_SOLID))
                     {
-					    calculateQs(index, point, triangle);
-				    }
+                        calculateQs(index, point, triangle);
+                    }
                 }
                 else if( this->qComputationStage == qComputationStageType::FindSolidBoundaryNodes )
                 {
@@ -1449,14 +1454,14 @@ void GridImp::findQsPrimitive(Object * object)
         real x,y,z;
 
         this->transIndexToCoords(index,x,y,z);
-        
+
         const Vertex point(x, y, z);
 
         if( this->qComputationStage == qComputationStageType::ComputeQs ){
             if(this->field.is(index, BC_SOLID))
             {
-				calculateQs(index, point, object);
-			}
+                calculateQs(index, point, object);
+            }
         }
         else if( this->qComputationStage == qComputationStageType::FindSolidBoundaryNodes )
         {
@@ -1477,66 +1482,66 @@ void GridImp::calculateQs(const uint index, const Vertex &point, Object* object)
 {
     Vertex pointOnTriangle, direction;
 
-	real subdistance;
-	int error;
-	for (int i = distribution.dir_start; i <= distribution.dir_end; i++)
-	{
-		direction = Vertex( real(distribution.dirs[i * DIMENSION + 0]), 
+    real subdistance;
+    int error;
+    for (int i = distribution.dir_start; i <= distribution.dir_end; i++)
+    {
+        direction = Vertex( real(distribution.dirs[i * DIMENSION + 0]),
                             real(distribution.dirs[i * DIMENSION + 1]),
-			                real(distribution.dirs[i * DIMENSION + 2]) );
+                            real(distribution.dirs[i * DIMENSION + 2]) );
 
-		uint neighborIndex = this->transCoordToIndex(point.x + direction.x * this->delta,
-													    point.y + direction.y * this->delta,
-													    point.z + direction.z * this->delta);
+        uint neighborIndex = this->transCoordToIndex(point.x + direction.x * this->delta,
+                                                        point.y + direction.y * this->delta,
+                                                        point.z + direction.z * this->delta);
 
-		if (neighborIndex == INVALID_INDEX) continue;
+        if (neighborIndex == INVALID_INDEX) continue;
 
-		error = object->getIntersection(point, direction, pointOnTriangle, subdistance);
+        error = object->getIntersection(point, direction, pointOnTriangle, subdistance);
 
-		subdistance /= this->delta;
+        subdistance /= this->delta;
 
-		if (error == 0 && vf::Math::lessEqual(subdistance, 1.0) && vf::Math::greaterEqual(subdistance, 0.0))
-		{
-			if ( -0.5        > this->qValues[i*this->numberOfSolidBoundaryNodes + this->qIndices[index]] ||
+        if (error == 0 && vf::Math::lessEqual(subdistance, 1.0) && vf::Math::greaterEqual(subdistance, 0.0))
+        {
+            if ( -0.5        > this->qValues[i*this->numberOfSolidBoundaryNodes + this->qIndices[index]] ||
                     subdistance < this->qValues[i*this->numberOfSolidBoundaryNodes + this->qIndices[index]] )
-			{
+            {
+
+                this->qValues[i*this->numberOfSolidBoundaryNodes + this->qIndices[index]] = subdistance;
 
-				this->qValues[i*this->numberOfSolidBoundaryNodes + this->qIndices[index]] = subdistance;
-                    
                 this->qPatches[ this->qIndices[index] ] = 0;
 
-			}
-		}
-	}
+            }
+        }
+    }
 }
 
 bool GridImp::checkIfAtLeastOneValidQ(const uint index, const Vertex &point, Object* object) const
 {
     Vertex pointOnTriangle, direction;
 
-	real subdistance;
-	int error;
-	for (int i = distribution.dir_start; i <= distribution.dir_end; i++)
-	{
-		direction = Vertex( real(distribution.dirs[i * DIMENSION + 0]), 
+    real subdistance;
+    int error;
+    for (int i = distribution.dir_start; i <= distribution.dir_end; i++)
+    {
+        direction = Vertex( real(distribution.dirs[i * DIMENSION + 0]),
                             real(distribution.dirs[i * DIMENSION + 1]),
-			                real(distribution.dirs[i * DIMENSION + 2]) );
+                            real(distribution.dirs[i * DIMENSION + 2]) );
 
-		uint neighborIndex = this->transCoordToIndex(point.x + direction.x * this->delta,
-													 point.y + direction.y * this->delta,
-													 point.z + direction.z * this->delta);
+        uint neighborIndex = this->transCoordToIndex(point.x + direction.x * this->delta,
+                                                     point.y + direction.y * this->delta,
+                                                     point.z + direction.z * this->delta);
 
-		if (neighborIndex == INVALID_INDEX) continue;
+        if (neighborIndex == INVALID_INDEX) continue;
 
-		error = object->getIntersection(point, direction, pointOnTriangle, subdistance);
+        error = object->getIntersection(point, direction, pointOnTriangle, subdistance);
 
-		subdistance /= this->delta;
+        subdistance /= this->delta;
 
-		if (error == 0 && vf::Math::lessEqual(subdistance, 1.0) && vf::Math::greaterEqual(subdistance, 0.0))
-		{
-			return true;
-		}
-	}
+        if (error == 0 && vf::Math::lessEqual(subdistance, 1.0) && vf::Math::greaterEqual(subdistance, 0.0))
+        {
+            return true;
+        }
+    }
     return false;
 }
 
@@ -1565,7 +1570,7 @@ void GridImp::calculateQs(const Vertex &point, const Triangle &triangle) const
 
         error = triangle.getTriangleIntersection(point, direction, pointOnTriangle, subdistance);
 
-		subdistance /= this->delta;
+        subdistance /= this->delta;
 
         if (error == 0 && subdistance < 1.0 && subdistance > 0.0)
         {
@@ -1577,81 +1582,80 @@ void GridImp::calculateQs(const Vertex &point, const Triangle &triangle) const
 
 void GridImp::calculateQs(const uint index, const Vertex &point, const Triangle &triangle) const
 {
-	Vertex pointOnTriangle, direction;
-	real subdistance;
-	int error;
-	for (int i = distribution.dir_start; i <= distribution.dir_end; i++)
-	{
+    Vertex pointOnTriangle, direction;
+    real subdistance;
+    int error;
+    for (int i = distribution.dir_start; i <= distribution.dir_end; i++)
+    {
 #if defined(__CUDA_ARCH__)
-		direction = Vertex(DIRECTIONS[i][0], DIRECTIONS[i][1], DIRECTIONS[i][2]);
+        direction = Vertex(DIRECTIONS[i][0], DIRECTIONS[i][1], DIRECTIONS[i][2]);
 #else
-		direction = Vertex( real(distribution.dirs[i * DIMENSION + 0]), 
+        direction = Vertex( real(distribution.dirs[i * DIMENSION + 0]),
                             real(distribution.dirs[i * DIMENSION + 1]),
-			                real(distribution.dirs[i * DIMENSION + 2]) );
+                            real(distribution.dirs[i * DIMENSION + 2]) );
 #endif
 
-		uint neighborIndex = this->transCoordToIndex(point.x + direction.x * this->delta,
-													 point.y + direction.y * this->delta,
-													 point.z + direction.z * this->delta);
+        uint neighborIndex = this->transCoordToIndex(point.x + direction.x * this->delta,
+                                                     point.y + direction.y * this->delta,
+                                                     point.z + direction.z * this->delta);
 
-		if (neighborIndex == INVALID_INDEX) continue;
+        if (neighborIndex == INVALID_INDEX) continue;
 
-		error = triangle.getTriangleIntersection(point, direction, pointOnTriangle, subdistance);
+        error = triangle.getTriangleIntersection(point, direction, pointOnTriangle, subdistance);
 
-		subdistance /= this->delta;
+        subdistance /= this->delta;
 
-		if (error == 0 && vf::Math::lessEqual(subdistance, 1.0) && vf::Math::greaterEqual(subdistance, 0.0))
-		{
-			if ( -0.5        > this->qValues[i*this->numberOfSolidBoundaryNodes + this->qIndices[index]] ||
+        if (error == 0 && vf::Math::lessEqual(subdistance, 1.0) && vf::Math::greaterEqual(subdistance, 0.0))
+        {
+            if ( -0.5        > this->qValues[i*this->numberOfSolidBoundaryNodes + this->qIndices[index]] ||
                  subdistance < this->qValues[i*this->numberOfSolidBoundaryNodes + this->qIndices[index]] )
-			{
-				this->qValues[i*this->numberOfSolidBoundaryNodes + this->qIndices[index]] = subdistance;
+            {
+                this->qValues[i*this->numberOfSolidBoundaryNodes + this->qIndices[index]] = subdistance;
 
                 this->qPatches[ this->qIndices[index] ] = triangle.patchIndex;
-			}
-		}
-	}
+            }
+        }
+    }
 }
 
 bool GridImp::checkIfAtLeastOneValidQ(const uint index, const Vertex & point, const Triangle & triangle) const
 {
-	Vertex pointOnTriangle, direction;
-	real subdistance;
-	int error;
-	for (int i = distribution.dir_start; i <= distribution.dir_end; i++)
-	{
+    Vertex pointOnTriangle, direction;
+    real subdistance;
+    int error;
+    for (int i = distribution.dir_start; i <= distribution.dir_end; i++)
+    {
 #if defined(__CUDA_ARCH__)
-		direction = Vertex(DIRECTIONS[i][0], DIRECTIONS[i][1], DIRECTIONS[i][2]);
+        direction = Vertex(DIRECTIONS[i][0], DIRECTIONS[i][1], DIRECTIONS[i][2]);
 #else
-		direction = Vertex(real(distribution.dirs[i * DIMENSION + 0]), 
+        direction = Vertex(real(distribution.dirs[i * DIMENSION + 0]),
                            real(distribution.dirs[i * DIMENSION + 1]),
-			               real(distribution.dirs[i * DIMENSION + 2]));
+                           real(distribution.dirs[i * DIMENSION + 2]));
 #endif
 
-		uint neighborIndex = this->transCoordToIndex(point.x + direction.x * this->delta,
-													 point.y + direction.y * this->delta,
-													 point.z + direction.z * this->delta);
-		if (neighborIndex == INVALID_INDEX) continue;
+        uint neighborIndex = this->transCoordToIndex(point.x + direction.x * this->delta,
+                                                     point.y + direction.y * this->delta,
+                                                     point.z + direction.z * this->delta);
+        if (neighborIndex == INVALID_INDEX) continue;
 
-		error = triangle.getTriangleIntersection(point, direction, pointOnTriangle, subdistance);
+        error = triangle.getTriangleIntersection(point, direction, pointOnTriangle, subdistance);
 
-		subdistance /= this->delta;
+        subdistance /= this->delta;
 
-		if (error == 0 && vf::Math::lessEqual(subdistance, 1.0) && vf::Math::greaterEqual(subdistance, 0.0))
-		{
-			return true;
-		}
-	}
+        if (error == 0 && vf::Math::lessEqual(subdistance, 1.0) && vf::Math::greaterEqual(subdistance, 0.0))
+        {
+            return true;
+        }
+    }
     return false;
 }
 
 void GridImp::findCommunicationIndices(int direction, SPtr<BoundingBox> subDomainBox, LbmOrGks lbmOrGks)
 {
     for( uint index = 0; index < this->size; index++ ){
-        
         real x, y, z;
         this->transIndexToCoords(index, x, y, z);
-    
+
         if( this->getFieldEntry(index) == INVALID_OUT_OF_GRID ||
             this->getFieldEntry(index) == INVALID_SOLID ||
             this->getFieldEntry(index) == INVALID_COARSE_UNDER_FINE ||
@@ -1660,7 +1664,6 @@ void GridImp::findCommunicationIndices(int direction, SPtr<BoundingBox> subDomai
 
         if( lbmOrGks == LBM && this->getFieldEntry(index) == STOPPER_OUT_OF_GRID_BOUNDARY ) continue;
         if( lbmOrGks == LBM && this->getFieldEntry(index) == STOPPER_SOLID ) continue;
-
         if( direction == CommunicationDirections::MX ) findCommunicationIndex( index, x, subDomainBox->minX, direction);
         if( direction == CommunicationDirections::PX ) findCommunicationIndex( index, x, subDomainBox->maxX, direction);
         if( direction == CommunicationDirections::MY ) findCommunicationIndex( index, y, subDomainBox->minY, direction);
@@ -1672,16 +1675,13 @@ void GridImp::findCommunicationIndices(int direction, SPtr<BoundingBox> subDomai
 
 void GridImp::findCommunicationIndex( uint index, real coordinate, real limit, int direction ){
     // negative direction get a negative sign
-    real s = ( direction % 2 == 0 ) ? ( -1.0 ) : ( 1.0 );  
-
+    real s = ( direction % 2 == 0 ) ? ( -1.0 ) : ( 1.0 );
 
-	if (std::abs(coordinate - (limit + s * 0.5 * this->delta)) < 0.1 * this->delta) {
-		this->communicationIndices[direction].receiveIndices.push_back(index);
-	}
+    if (std::abs(coordinate - (limit + s * 0.5 * this->delta)) < 0.1 * this->delta)
+        this->communicationIndices[direction].receiveIndices.push_back(index);
 
-	if (std::abs(coordinate - (limit - s * 0.5 * this->delta)) < 0.1 * this->delta) {
-		this->communicationIndices[direction].sendIndices.push_back(index);
-	}
+    if (std::abs(coordinate - (limit - s * 0.5 * this->delta)) < 0.1 * this->delta)
+        this->communicationIndices[direction].sendIndices.push_back(index);
 }
 
 bool GridImp::isSendNode(int index) const
@@ -1727,14 +1727,14 @@ uint GridImp::getReceiveIndex(int direction, uint index)
 
 void GridImp::repairCommunicationIndices(int direction)
 {
-    this->communicationIndices[direction].sendIndices.insert( this->communicationIndices[direction].sendIndices.end(), 
-                                                              this->communicationIndices[direction+1].sendIndices.begin(), 
+    this->communicationIndices[direction].sendIndices.insert( this->communicationIndices[direction].sendIndices.end(),
+                                                              this->communicationIndices[direction+1].sendIndices.begin(),
                                                               this->communicationIndices[direction+1].sendIndices.end() );
 
 
 
-    this->communicationIndices[direction+1].receiveIndices.insert( this->communicationIndices[direction+1].receiveIndices.end(), 
-                                                                 this->communicationIndices[direction].receiveIndices.begin(), 
+    this->communicationIndices[direction+1].receiveIndices.insert( this->communicationIndices[direction+1].receiveIndices.end(),
+                                                                 this->communicationIndices[direction].receiveIndices.begin(),
                                                                  this->communicationIndices[direction].receiveIndices.end() );
 
     this->communicationIndices[direction].receiveIndices = this->communicationIndices[direction+1].receiveIndices;
@@ -1839,19 +1839,19 @@ real GridImp::getMaximumOnNodes(const real &maxExact, const real &decimalStart,
     return maxNode;
 }
 
-uint GridImp::getXIndex(real x) const 
-{ 
-    return std::lround((x - startX) / delta); 
+uint GridImp::getXIndex(real x) const
+{
+    return std::lround((x - startX) / delta);
 }
 
 uint GridImp::getYIndex(real y) const
-{ 
-    return std::lround((y - startY) / delta); 
+{
+    return std::lround((y - startY) / delta);
 }
 
 uint GridImp::getZIndex(real z) const
-{ 
-    return std::lround((z - startZ) / delta); 
+{
+    return std::lround((z - startZ) / delta);
 }
 
 real GridImp::getDelta() const
@@ -1866,11 +1866,11 @@ uint GridImp::getSize() const
 
 uint GridImp::getSparseSize() const
 {
-    return this->sparseSize; 
+    return this->sparseSize;
 }
 
-uint GridImp::getNumberOfFluidNodes() const { 
-    return (uint)this->fluidNodeIndices.size(); 
+uint GridImp::getNumberOfFluidNodes() const {
+    return (uint)this->fluidNodeIndices.size();
 }
 
 Field GridImp::getField() const
@@ -2063,23 +2063,147 @@ void GridImp::getNodeValues(real *xCoords, real *yCoords, real *zCoords, uint *n
     }
 }
 
-void GridImp::getFluidNodeIndices(uint *fluidNodeIndices) const 
-{ 
+void GridImp::getFluidNodeIndices(uint *fluidNodeIndices) const
+{
     for (uint nodeNumber = 0; nodeNumber < (uint)this->fluidNodeIndices.size(); nodeNumber++)
         fluidNodeIndices[nodeNumber] = this->fluidNodeIndices[nodeNumber];
 }
 
-uint GridImp::getNumberOfFluidNodesBorder() const 
-{ 
-    return (uint)this->fluidNodeIndicesBorder.size(); 
+uint GridImp::getNumberOfFluidNodesBorder() const
+{
+    return (uint)this->fluidNodeIndicesBorder.size();
 }
 
-void GridImp::getFluidNodeIndicesBorder(uint *fluidNodeIndicesBorder) const 
+void GridImp::getFluidNodeIndicesBorder(uint *fluidNodeIndicesBorder) const
 {
     for (uint nodeNumber = 0; nodeNumber < (uint)this->fluidNodeIndicesBorder.size(); nodeNumber++)
         fluidNodeIndicesBorder[nodeNumber] = this->fluidNodeIndicesBorder[nodeNumber];
 }
 
+void GridImp::addFluidNodeIndicesMacroVars(std::vector<uint> _fluidNodeIndicesMacroVars)
+{
+    // Appends the given indices to fluidNodeIndicesMacroVars. A range insert() is used instead of an exact-size
+    // reserve() + copy so that repeated calls keep the vector's own growth policy instead of reallocating each time.
+    this->fluidNodeIndicesMacroVars.insert(this->fluidNodeIndicesMacroVars.end(), _fluidNodeIndicesMacroVars.begin(), _fluidNodeIndicesMacroVars.end());
+}
+
+void GridImp::addFluidNodeIndicesApplyBodyForce(std::vector<uint> _fluidNodeIndicesApplyBodyForce)
+{
+    // Appends the given indices to fluidNodeIndicesApplyBodyForce. A range insert() is used instead of an exact-size
+    // reserve() + copy so that repeated calls keep the vector's own growth policy instead of reallocating each time.
+    this->fluidNodeIndicesApplyBodyForce.insert(this->fluidNodeIndicesApplyBodyForce.end(),
+                                                _fluidNodeIndicesApplyBodyForce.begin(), _fluidNodeIndicesApplyBodyForce.end());
+}
+
+void GridImp::addFluidNodeIndicesAllFeatures(std::vector<uint> _fluidNodeIndicesAllFeatures)
+{
+    // Appends the given indices to fluidNodeIndicesAllFeatures. A range insert() is used instead of an exact-size
+    // reserve() + copy so that repeated calls keep the vector's own growth policy instead of reallocating each time.
+    this->fluidNodeIndicesAllFeatures.insert(this->fluidNodeIndicesAllFeatures.end(),
+                                             _fluidNodeIndicesAllFeatures.begin(), _fluidNodeIndicesAllFeatures.end());
+}
+
+void GridImp::sortFluidNodeIndicesMacroVars()
+{
+    // Sorts and de-duplicates fluidNodeIndicesMacroVars, drops the entries that are also contained in
+    // fluidNodeIndicesAllFeatures and finally removes the remaining entries from fluidNodeIndices.
+    if(this->fluidNodeIndicesMacroVars.size()>0)
+    {
+        std::sort(this->fluidNodeIndicesMacroVars.begin(), this->fluidNodeIndicesMacroVars.end());
+        // Remove duplicates
+        this->fluidNodeIndicesMacroVars.erase( std::unique( this->fluidNodeIndicesMacroVars.begin(), this->fluidNodeIndicesMacroVars.end() ), this->fluidNodeIndicesMacroVars.end() );
+        // Remove indices of fluidNodeIndicesAllFeatures from fluidNodeIndicesMacroVars.
+        // std::binary_search requires a sorted range, so fluidNodeIndicesAllFeatures must already be sorted here.
+        if(this->fluidNodeIndicesAllFeatures.size()>0)
+        {
+            this->fluidNodeIndicesMacroVars.erase( std::remove_if( this->fluidNodeIndicesMacroVars.begin(), this->fluidNodeIndicesMacroVars.end(),
+                                                                   [&](auto x){return std::binary_search(fluidNodeIndicesAllFeatures.begin(),fluidNodeIndicesAllFeatures.end(),x);} ),
+                                                   this->fluidNodeIndicesMacroVars.end() );
+        }
+
+        // Remove indices of fluidNodeIndicesMacroVars from fluidNodeIndices
+        this->fluidNodeIndices.erase( std::remove_if( this->fluidNodeIndices.begin(), this->fluidNodeIndices.end(),
+                                                      [&](auto x){return std::binary_search(fluidNodeIndicesMacroVars.begin(),fluidNodeIndicesMacroVars.end(),x);} ),
+                                      this->fluidNodeIndices.end() );
+    }
+}
+
+void GridImp::sortFluidNodeIndicesApplyBodyForce()
+{
+    // Sorts and de-duplicates fluidNodeIndicesApplyBodyForce, drops the entries that are also contained in
+    // fluidNodeIndicesAllFeatures and finally removes the remaining entries from fluidNodeIndices.
+    if(this->fluidNodeIndicesApplyBodyForce.size()>0)
+    {
+        std::sort(this->fluidNodeIndicesApplyBodyForce.begin(), this->fluidNodeIndicesApplyBodyForce.end());
+        // Remove duplicates
+        this->fluidNodeIndicesApplyBodyForce.erase( std::unique( this->fluidNodeIndicesApplyBodyForce.begin(), this->fluidNodeIndicesApplyBodyForce.end() ), this->fluidNodeIndicesApplyBodyForce.end() );
+        // Remove indices of fluidNodeIndicesAllFeatures from fluidNodeIndicesApplyBodyForce.
+        // std::binary_search requires a sorted range, so fluidNodeIndicesAllFeatures must already be sorted here.
+        if(this->fluidNodeIndicesAllFeatures.size()>0)
+        {
+            this->fluidNodeIndicesApplyBodyForce.erase( std::remove_if( this->fluidNodeIndicesApplyBodyForce.begin(), this->fluidNodeIndicesApplyBodyForce.end(),
+                                                                        [&](auto x){return std::binary_search(fluidNodeIndicesAllFeatures.begin(),fluidNodeIndicesAllFeatures.end(),x);} ),
+                                                        this->fluidNodeIndicesApplyBodyForce.end() );
+        }
+
+        // Remove indices of fluidNodeIndicesApplyBodyForce from fluidNodeIndices
+        this->fluidNodeIndices.erase( std::remove_if( this->fluidNodeIndices.begin(), this->fluidNodeIndices.end(),
+                                                      [&](auto x){return std::binary_search(fluidNodeIndicesApplyBodyForce.begin(),fluidNodeIndicesApplyBodyForce.end(),x);} ),
+                                      this->fluidNodeIndices.end() );
+    }
+}
+
+void GridImp::sortFluidNodeIndicesAllFeatures()
+{
+    // Sorts and de-duplicates fluidNodeIndicesAllFeatures, then removes its entries from fluidNodeIndices.
+    if(this->fluidNodeIndicesAllFeatures.size()>0)
+    {
+        std::sort(this->fluidNodeIndicesAllFeatures.begin(), this->fluidNodeIndicesAllFeatures.end());
+        // Remove duplicates
+        this->fluidNodeIndicesAllFeatures.erase( std::unique( this->fluidNodeIndicesAllFeatures.begin(), this->fluidNodeIndicesAllFeatures.end() ), this->fluidNodeIndicesAllFeatures.end() );
+        // Remove indices of fluidNodeIndicesAllFeatures from fluidNodeIndices
+        this->fluidNodeIndices.erase( std::remove_if( this->fluidNodeIndices.begin(), this->fluidNodeIndices.end(),
+                                                      [&](auto x){return std::binary_search(fluidNodeIndicesAllFeatures.begin(),fluidNodeIndicesAllFeatures.end(),x);} ),
+                                      this->fluidNodeIndices.end() );
+    }
+}
+
+uint GridImp::getNumberOfFluidNodeIndicesMacroVars() const {
+    return (uint)this->fluidNodeIndicesMacroVars.size();
+}
+
+uint GridImp::getNumberOfFluidNodeIndicesApplyBodyForce() const {
+    return (uint)this->fluidNodeIndicesApplyBodyForce.size();
+}
+
+uint GridImp::getNumberOfFluidNodeIndicesAllFeatures() const {
+    return (uint)this->fluidNodeIndicesAllFeatures.size();
+}
+
+void GridImp::getFluidNodeIndicesMacroVars(uint *_fluidNodeIndicesMacroVars) const
+{
+    std::copy(fluidNodeIndicesMacroVars.begin(), fluidNodeIndicesMacroVars.end(), _fluidNodeIndicesMacroVars);
+}
+void GridImp::getFluidNodeIndicesApplyBodyForce(uint *_fluidNodeIndicesApplyBodyForce) const
+{
+    std::copy(fluidNodeIndicesApplyBodyForce.begin(), fluidNodeIndicesApplyBodyForce.end(), _fluidNodeIndicesApplyBodyForce);
+}
+void GridImp::getFluidNodeIndicesAllFeatures(uint *_fluidNodeIndicesAllFeatures) const
+{
+    std::copy(fluidNodeIndicesAllFeatures.begin(), fluidNodeIndicesAllFeatures.end(), _fluidNodeIndicesAllFeatures);
+}
+
+
+std::vector<SideType> GridImp::getBCAlreadySet() {
+    return this->bcAlreadySet;
+}
+
+void GridImp::addBCalreadySet(SideType side)
+{
+    this->bcAlreadySet.push_back(side);
+}
+
+
 void GridImp::print() const
 {
     printf("min: (%2.4f, %2.4f, %2.4f), max: (%2.4f, %2.4f, %2.4f), size: %d, delta: %2.4f\n", startX, startY, startZ,
@@ -2087,3 +2211,10 @@ void GridImp::print() const
     if(this->gridInterface)
         this->gridInterface->print();
 }
+
+bool GridImp::isStopperForBC(uint index) const
+{
+    // True when the node's field entry is one of the three stopper types compared against below.
+    const auto fieldEntry = this->getFieldEntry(index);
+    return fieldEntry == vf::gpu::STOPPER_OUT_OF_GRID_BOUNDARY || fieldEntry == vf::gpu::STOPPER_OUT_OF_GRID || fieldEntry == vf::gpu::STOPPER_SOLID;
+}
diff --git a/src/gpu/GridGenerator/grid/GridImp.h b/src/gpu/GridGenerator/grid/GridImp.h
index edb5ca916bf68dcf992ea214dcddb2dc43810352..2cd322ebed78daaf135ad97b881923ca5831bbcd 100644
--- a/src/gpu/GridGenerator/grid/GridImp.h
+++ b/src/gpu/GridGenerator/grid/GridImp.h
@@ -34,6 +34,7 @@
 #define GRID_IMP_H
 
 #include <array>
+#include <vector>
 
 #include "Core/LbmOrGks.h"
 
@@ -52,6 +53,7 @@ class Object;
 class BoundingBox;
 class TriangularMeshDiscretizationStrategy;
 
+
 #ifdef __GNUC__
     #ifndef __clang__
         #pragma push
@@ -76,7 +78,7 @@ protected:
 
 public:
     static SPtr<GridImp> makeShared(Object* object, real startX, real startY, real startZ, real endX, real endY, real endZ, real delta, std::string d3Qxx, uint level);
-    virtual ~GridImp() = default;
+    ~GridImp() override = default;
 
 private:
     void initalNumberOfNodesAndSize();
@@ -92,6 +94,7 @@ private:
     bool nodeInPreviousCellIs(int index, char type) const;
     bool nodeInCellIs(Cell& cell, char type) const override;
 
+
     uint getXIndex(real x) const;
     uint getYIndex(real y) const;
     uint getZIndex(real z) const;
@@ -115,8 +118,11 @@ private:
 
     int *sparseIndices;
 
-    std::vector<uint> fluidNodeIndices;
-    std::vector<uint> fluidNodeIndicesBorder;
+    std::vector<uint> fluidNodeIndices;                 // run on CollisionTemplate::Default
+    std::vector<uint> fluidNodeIndicesBorder;           // run on subdomain border nodes (CollisionTemplate::SubDomainBorder)
+    std::vector<uint> fluidNodeIndicesMacroVars;        // run on CollisionTemplate::MacroVars
+    std::vector<uint> fluidNodeIndicesApplyBodyForce;   // run on CollisionTemplate::ApplyBodyForce
+    std::vector<uint> fluidNodeIndicesAllFeatures;      // run on CollisionTemplate::AllFeatures
 
 	uint *qIndices;     //maps from matrix index to qIndex
 	real *qValues;
@@ -132,6 +138,8 @@ private:
 
     bool enableFixRefinementIntoTheWall;
 
+    std::vector<SideType> bcAlreadySet;
+
 protected:
     Field field;
     int *neighborIndexX, *neighborIndexY, *neighborIndexZ, *neighborIndexNegative;
@@ -146,9 +154,9 @@ public:
     void setPeriodicityY(bool periodicity) override;
     void setPeriodicityZ(bool periodicity) override;
 
-    bool getPeriodicityX() override;
-    bool getPeriodicityY() override;
-    bool getPeriodicityZ() override;
+    bool getPeriodicityX() const override;
+    bool getPeriodicityY() const override;
+    bool getPeriodicityZ() const override;
 
     void setEnableFixRefinementIntoTheWall(bool enableFixRefinementIntoTheWall) override;
 
@@ -182,6 +190,9 @@ public:
 
     void setNumberOfLayers(uint numberOfLayers) override;
 
+    std::vector<SideType> getBCAlreadySet() override;
+    void addBCalreadySet(SideType side) override;
+
 public:
     Distribution distribution;
 
@@ -216,6 +227,7 @@ public:
     bool nodeInNextCellIs(int index, char type) const;
     bool hasAllNeighbors(uint index) const;
     bool hasNeighborOfType(uint index, char type) const;
+    bool nodeHasBC(uint index) const override;
     bool cellContainsOnly(Cell &cell, char type) const;
     bool cellContainsOnly(Cell &cell, char typeA, char typeB) const;
 
@@ -256,6 +268,8 @@ public:
     static void getGridInterface(uint *gridInterfaceList, const uint *oldGridInterfaceList, uint size);
 
     bool isSparseIndexInFluidNodeIndicesBorder(uint &sparseIndex) const override;
+    
+    bool isStopperForBC(uint index) const override;
 
     int *getNeighborsX() const override;
     int* getNeighborsY() const override;
@@ -273,7 +287,7 @@ public:
     void print() const;
 
 public:
-    virtual void findSparseIndices(SPtr<Grid> fineGrid) override;
+    void findSparseIndices(SPtr<Grid> fineGrid) override;
 
     void findForGridInterfaceNewIndices(SPtr<GridImp> fineGrid);
     void updateSparseIndices();
@@ -364,6 +378,19 @@ public:
     uint getNumberOfFluidNodesBorder() const override;
     void getFluidNodeIndicesBorder(uint *fluidNodeIndicesBorder) const override;
 
+    void addFluidNodeIndicesMacroVars(std::vector<uint> _fluidNodeIndicesMacroVars) override;
+    void addFluidNodeIndicesApplyBodyForce(std::vector<uint> _fluidNodeIndicesApplyBodyForce) override;
+    void addFluidNodeIndicesAllFeatures(std::vector<uint> _fluidNodeIndicesAllFeatures) override;
+    void sortFluidNodeIndicesMacroVars() override;
+    void sortFluidNodeIndicesApplyBodyForce() override;
+    void sortFluidNodeIndicesAllFeatures() override;
+
+    uint getNumberOfFluidNodeIndicesMacroVars() const override;
+    uint getNumberOfFluidNodeIndicesApplyBodyForce() const override;
+    uint getNumberOfFluidNodeIndicesAllFeatures() const override; 
+    void getFluidNodeIndicesMacroVars(uint *fluidNodeIndicesMacroVars) const override;
+    void getFluidNodeIndicesApplyBodyForce(uint *fluidNodeIndicesApplyBodyForce) const override;
+    void getFluidNodeIndicesAllFeatures(uint *fluidNodeIndicesAllFeatures) const override;
 
 public:
     struct CommunicationIndices {
diff --git a/src/gpu/GridGenerator/io/SimulationFileWriter/SimulationFileWriter.cpp b/src/gpu/GridGenerator/io/SimulationFileWriter/SimulationFileWriter.cpp
index 23fb0f4e7f3e16702e9cb2459606986af1032e49..0238434dc87b453dc21164577d8abd4ce1819793 100644
--- a/src/gpu/GridGenerator/io/SimulationFileWriter/SimulationFileWriter.cpp
+++ b/src/gpu/GridGenerator/io/SimulationFileWriter/SimulationFileWriter.cpp
@@ -417,10 +417,10 @@ void SimulationFileWriter::writeGridInterfaceToFile(SPtr<GridBuilder> builder, u
     }
 }
 
-void SimulationFileWriter::writeGridInterfaceToFile(const uint numberOfNodes, std::ofstream& coarseFile, uint* coarse, std::ofstream& fineFile, uint* fine)
+void SimulationFileWriter::writeGridInterfaceToFile(uint numberOfNodes, std::ofstream &coarseFile, uint *coarse,
+                                                    std::ofstream &fineFile, uint *fine)
 {
-    for (uint index = 0; index < numberOfNodes; index++)
-    {
+    for (uint index = 0; index < numberOfNodes; index++) {
         coarseFile << coarse[index] << " \n";
         fineFile << fine[index] << " \n";
     }
@@ -428,17 +428,15 @@ void SimulationFileWriter::writeGridInterfaceToFile(const uint numberOfNodes, st
     fineFile << "\n";
 }
 
-void SimulationFileWriter::writeGridInterfaceOffsetToFile(uint numberOfNodes, std::ofstream & offsetFile, real* offset_X, real* offset_Y, real* offset_Z)
+void SimulationFileWriter::writeGridInterfaceOffsetToFile(uint numberOfNodes, std::ofstream &offsetFile, real *offset_X,
+                                                          real *offset_Y, real *offset_Z)
 {
-    for (uint index = 0; index < numberOfNodes; index++)
-    {
+    for (uint index = 0; index < numberOfNodes; index++) {
         offsetFile << offset_X[index] << " " << offset_Y[index] << " " << offset_Z[index] << " \n";
     }
     offsetFile << "\n";
 }
 
-
-
 /*#################################################################################*/
 /*---------------------------------private methods---------------------------------*/
 /*---------------------------------------------------------------------------------*/
diff --git a/src/gpu/VirtualFluids_GPU/CMakeLists.txt b/src/gpu/VirtualFluids_GPU/CMakeLists.txt
index 759528e5346ba8d9899cb90eb64503b20a44c4fc..ed647cb406bca23ef90667b7d17171c7b3f46283 100644
--- a/src/gpu/VirtualFluids_GPU/CMakeLists.txt
+++ b/src/gpu/VirtualFluids_GPU/CMakeLists.txt
@@ -8,7 +8,7 @@ if(MSVC)
     set(additional_libraries ws2_32 Traffic) # ws_32 throws an error on Phoenix
 endif()
 
-vf_add_library(PUBLIC_LINK basics lbmCuda PRIVATE_LINK ${additional_libraries} GridGenerator MPI::MPI_CXX vf_cuda)
+vf_add_library(PUBLIC_LINK basics lbm PRIVATE_LINK ${additional_libraries} GridGenerator MPI::MPI_CXX vf_cuda)
 
 #SET(TPN_WIN32 "/EHsc")
 #https://stackoverflow.com/questions/6832666/lnk2019-when-including-asio-headers-solution-generated-with-cmake
diff --git a/src/gpu/VirtualFluids_GPU/Calculation/Calc2ndMoments.cpp b/src/gpu/VirtualFluids_GPU/Calculation/Calc2ndMoments.cpp
index f8f5c42b835a1a4ba55e378e624230bbb43dc05a..e3f344231dc9d5e19c09f7ce1fde7d31f1770232 100644
--- a/src/gpu/VirtualFluids_GPU/Calculation/Calc2ndMoments.cpp
+++ b/src/gpu/VirtualFluids_GPU/Calculation/Calc2ndMoments.cpp
@@ -17,11 +17,11 @@ void alloc2ndMoments(Parameter* para, CudaMemoryManager* cudaMemoryManager)
 
 void init2ndMoments(Parameter* para)
 {
-	for (int lev=para->getCoarse(); lev <= para->getFine(); lev++)
+	for (int lev = para->getCoarse(); lev <= para->getFine(); lev++)
 	{
 		//////////////////////////////////////////////////////////////////////////
 		//init host arrays
-		for (unsigned int pos=0;pos<para->getParH(lev)->numberOfNodes;pos++)
+		for (size_t pos = 0; pos < para->getParH(lev)->numberOfNodes; pos++)
 		{
 			para->getParH(lev)->kxyFromfcNEQ[pos]    = 0.0;
 			para->getParH(lev)->kyzFromfcNEQ[pos]    = 0.0;
@@ -116,7 +116,7 @@ void init3rdMoments(Parameter* para)
 	{
 		//////////////////////////////////////////////////////////////////////////
 		//init host arrays
-		for (unsigned int pos=0;pos<para->getParH(lev)->numberOfNodes;pos++)
+		for (size_t pos = 0; pos < para->getParH(lev)->numberOfNodes; pos++)
 		{
 			para->getParH(lev)->CUMbbb[pos] = 0.0;
 			para->getParH(lev)->CUMabc[pos] = 0.0;
@@ -198,7 +198,7 @@ void calc3rdMoments(Parameter* para, CudaMemoryManager* cudaMemoryManager)
 
 void allocHigherOrderMoments(Parameter* para, CudaMemoryManager* cudaMemoryManager)
 {
-	for (int lev=para->getCoarse(); lev <= para->getFine(); lev++)
+	for (int lev = para->getCoarse(); lev <= para->getFine(); lev++)
 	{
 		//////////////////////////////////////////////////////////////////////////
 		//allocation (device-memory + host-memory)
@@ -211,11 +211,11 @@ void allocHigherOrderMoments(Parameter* para, CudaMemoryManager* cudaMemoryManag
 
 void initHigherOrderMoments(Parameter* para)
 {
-	for (int lev=para->getCoarse(); lev <= para->getFine(); lev++)
+	for (int lev = para->getCoarse(); lev <= para->getFine(); lev++)
 	{
 		//////////////////////////////////////////////////////////////////////////
 		//init host arrays
-		for (unsigned int pos=0;pos<para->getParH(lev)->numberOfNodes;pos++)
+		for (size_t pos = 0; pos < para->getParH(lev)->numberOfNodes; pos++)
 		{
 			para->getParH(lev)->CUMcbb[pos] = 0.0;
 			para->getParH(lev)->CUMbcb[pos] = 0.0;
diff --git a/src/gpu/VirtualFluids_GPU/Calculation/CalcMedian.cpp b/src/gpu/VirtualFluids_GPU/Calculation/CalcMedian.cpp
index 77db571f7f10e0ea0bff827400270dd074d4e666..80a667f91976b745b619fed5d5763b5429a6559c 100644
--- a/src/gpu/VirtualFluids_GPU/Calculation/CalcMedian.cpp
+++ b/src/gpu/VirtualFluids_GPU/Calculation/CalcMedian.cpp
@@ -11,16 +11,16 @@
 
 void allocMedian(Parameter* para, CudaMemoryManager* cudaMemoryManager)
 {
-	for (int lev=para->getCoarse(); lev <= para->getFine(); lev++)
+	for (int lev = para->getCoarse(); lev <= para->getFine(); lev++)
 	{
 		cudaMemoryManager->cudaAllocMedianOut(lev);
-		for (unsigned int i = 0; i < para->getParH(lev)->numberOfNodes; i++)
+		for (size_t pos = 0; pos < para->getParH(lev)->numberOfNodes; pos++)
 		{
-			para->getParH(lev)->vx_SP_Med_Out[i]    = (real)0.0;
-			para->getParH(lev)->vy_SP_Med_Out[i]    = (real)0.0;
-			para->getParH(lev)->vz_SP_Med_Out[i]    = (real)0.0;
-			para->getParH(lev)->rho_SP_Med_Out[i]   = (real)0.0;
-			para->getParH(lev)->press_SP_Med_Out[i] = (real)0.0;
+			para->getParH(lev)->vx_SP_Med_Out[pos]    = (real)0.0;
+			para->getParH(lev)->vy_SP_Med_Out[pos]    = (real)0.0;
+			para->getParH(lev)->vz_SP_Med_Out[pos]    = (real)0.0;
+			para->getParH(lev)->rho_SP_Med_Out[pos]   = (real)0.0;
+			para->getParH(lev)->press_SP_Med_Out[pos] = (real)0.0;
 		}
 	}
 }
@@ -31,15 +31,15 @@ void allocMedian(Parameter* para, CudaMemoryManager* cudaMemoryManager)
 
 void calcMedian(Parameter* para, uint tdiff)
 {
-	for (int lev=para->getCoarse(); lev <= para->getFine(); lev++)
+	for (int lev = para->getCoarse(); lev <= para->getFine(); lev++)
 	{
-		for (uint i = 0; i < para->getParH(lev)->numberOfNodes; i++)
+		for (size_t pos = 0; pos < para->getParH(lev)->numberOfNodes; pos++)
 		{
-			para->getParH(lev)->vx_SP_Med_Out[i]    = para->getParH(lev)->vx_SP_Med[i]   / (real)tdiff;
-			para->getParH(lev)->vy_SP_Med_Out[i]    = para->getParH(lev)->vy_SP_Med[i]   / (real)tdiff;
-			para->getParH(lev)->vz_SP_Med_Out[i]    = para->getParH(lev)->vz_SP_Med[i]   / (real)tdiff;
-			para->getParH(lev)->rho_SP_Med_Out[i]   = para->getParH(lev)->rho_SP_Med[i]  / (real)tdiff;
-			para->getParH(lev)->press_SP_Med_Out[i] = para->getParH(lev)->press_SP_Med[i]/ (real)tdiff;
+			para->getParH(lev)->vx_SP_Med_Out[pos]    = para->getParH(lev)->vx_SP_Med[pos]   / (real)tdiff;
+			para->getParH(lev)->vy_SP_Med_Out[pos]    = para->getParH(lev)->vy_SP_Med[pos]   / (real)tdiff;
+			para->getParH(lev)->vz_SP_Med_Out[pos]    = para->getParH(lev)->vz_SP_Med[pos]   / (real)tdiff;
+			para->getParH(lev)->rho_SP_Med_Out[pos]   = para->getParH(lev)->rho_SP_Med[pos]  / (real)tdiff;
+			para->getParH(lev)->press_SP_Med_Out[pos] = para->getParH(lev)->press_SP_Med[pos]/ (real)tdiff;
 		}
 	}
 }
@@ -75,14 +75,14 @@ void allocMedianAD(Parameter* para, CudaMemoryManager* cudaMemoryManager)
 	for (int lev = para->getCoarse(); lev <= para->getFine(); lev++)
 	{
         cudaMemoryManager->cudaAllocMedianOutAD(lev);
-		for (unsigned int i = 0; i < para->getParH(lev)->numberOfNodes; i++)
+		for (size_t pos = 0; pos < para->getParH(lev)->numberOfNodes; pos++)
 		{
-			para->getParH(lev)->vx_SP_Med_Out[i]    = (real)0.0;
-			para->getParH(lev)->vy_SP_Med_Out[i]    = (real)0.0;
-			para->getParH(lev)->vz_SP_Med_Out[i]    = (real)0.0;
-			para->getParH(lev)->rho_SP_Med_Out[i]   = (real)0.0;
-			para->getParH(lev)->press_SP_Med_Out[i] = (real)0.0;
-			para->getParH(lev)->Conc_Med_Out[i]     = (real)0.0;
+			para->getParH(lev)->vx_SP_Med_Out[pos]    = (real)0.0;
+			para->getParH(lev)->vy_SP_Med_Out[pos]    = (real)0.0;
+			para->getParH(lev)->vz_SP_Med_Out[pos]    = (real)0.0;
+			para->getParH(lev)->rho_SP_Med_Out[pos]   = (real)0.0;
+			para->getParH(lev)->press_SP_Med_Out[pos] = (real)0.0;
+			para->getParH(lev)->Conc_Med_Out[pos]     = (real)0.0;
 		}
 	}
 }
@@ -95,14 +95,14 @@ void calcMedianAD(Parameter* para, uint tdiff)
 {
 	for (int lev = para->getCoarse(); lev <= para->getFine(); lev++)
 	{
-		for (uint i = 0; i < para->getParH(lev)->numberOfNodes; i++)
+		for (size_t pos = 0; pos < para->getParH(lev)->numberOfNodes; pos++)
 		{
-			para->getParH(lev)->vx_SP_Med_Out[i]    = para->getParH(lev)->vx_SP_Med[i]    / (real)tdiff;
-			para->getParH(lev)->vy_SP_Med_Out[i]    = para->getParH(lev)->vy_SP_Med[i]    / (real)tdiff;
-			para->getParH(lev)->vz_SP_Med_Out[i]    = para->getParH(lev)->vz_SP_Med[i]    / (real)tdiff;
-			para->getParH(lev)->rho_SP_Med_Out[i]   = para->getParH(lev)->rho_SP_Med[i]   / (real)tdiff;
-			para->getParH(lev)->press_SP_Med_Out[i] = para->getParH(lev)->press_SP_Med[i] / (real)tdiff;
-			para->getParH(lev)->Conc_Med_Out[i]     = para->getParH(lev)->Conc_Med[i]     / (real)tdiff;
+			para->getParH(lev)->vx_SP_Med_Out[pos]    = para->getParH(lev)->vx_SP_Med[pos]    / (real)tdiff;
+			para->getParH(lev)->vy_SP_Med_Out[pos]    = para->getParH(lev)->vy_SP_Med[pos]    / (real)tdiff;
+			para->getParH(lev)->vz_SP_Med_Out[pos]    = para->getParH(lev)->vz_SP_Med[pos]    / (real)tdiff;
+			para->getParH(lev)->rho_SP_Med_Out[pos]   = para->getParH(lev)->rho_SP_Med[pos]   / (real)tdiff;
+			para->getParH(lev)->press_SP_Med_Out[pos] = para->getParH(lev)->press_SP_Med[pos] / (real)tdiff;
+			para->getParH(lev)->Conc_Med_Out[pos]     = para->getParH(lev)->Conc_Med[pos]     / (real)tdiff;
 		}
 	}
 }
diff --git a/src/gpu/VirtualFluids_GPU/Calculation/CalcTurbulenceIntensity.cpp b/src/gpu/VirtualFluids_GPU/Calculation/CalcTurbulenceIntensity.cpp
index e91fb6f5c232bd98073a1c930149693f8af4b078..9572252965e1c619702370f8b9a3756bf035035e 100644
--- a/src/gpu/VirtualFluids_GPU/Calculation/CalcTurbulenceIntensity.cpp
+++ b/src/gpu/VirtualFluids_GPU/Calculation/CalcTurbulenceIntensity.cpp
@@ -25,32 +25,32 @@ void calcVelocityAndFluctuations(Parameter *para, CudaMemoryManager *cudaMemoryM
     for (int lev = para->getCoarse(); lev <= para->getFine(); lev++) {
         cudaMemoryManager->cudaCopyTurbulenceIntensityDH(lev, para->getParH(lev)->numberOfNodes);
 
-        for (uint i = 0; i < para->getParH(lev)->numberOfNodes; i++) {
+        for (size_t pos = 0; pos < para->getParH(lev)->numberOfNodes; pos++) {
             // mean velocity
-            para->getParH(lev)->vx_mean[i] = para->getParH(lev)->vx_mean[i] / (real)tdiff;
-            para->getParH(lev)->vy_mean[i] = para->getParH(lev)->vy_mean[i] / (real)tdiff;
-            para->getParH(lev)->vz_mean[i] = para->getParH(lev)->vz_mean[i] / (real)tdiff;
+            para->getParH(lev)->vx_mean[pos] = para->getParH(lev)->vx_mean[pos] / (real)tdiff;
+            para->getParH(lev)->vy_mean[pos] = para->getParH(lev)->vy_mean[pos] / (real)tdiff;
+            para->getParH(lev)->vz_mean[pos] = para->getParH(lev)->vz_mean[pos] / (real)tdiff;
 
             // fluctuations
-            para->getParH(lev)->vxx[i] = para->getParH(lev)->vxx[i] / (real)tdiff;
-            para->getParH(lev)->vyy[i] = para->getParH(lev)->vyy[i] / (real)tdiff;
-            para->getParH(lev)->vzz[i] = para->getParH(lev)->vzz[i] / (real)tdiff;
-            para->getParH(lev)->vxy[i] = para->getParH(lev)->vxy[i] / (real)tdiff;
-            para->getParH(lev)->vxz[i] = para->getParH(lev)->vxz[i] / (real)tdiff;
-            para->getParH(lev)->vyz[i] = para->getParH(lev)->vyz[i] / (real)tdiff;
-
-            para->getParH(lev)->vxx[i] =
-                para->getParH(lev)->vxx[i] - para->getParH(lev)->vx_mean[i] * para->getParH(lev)->vx_mean[i];
-            para->getParH(lev)->vyy[i] =
-                para->getParH(lev)->vyy[i] - para->getParH(lev)->vy_mean[i] * para->getParH(lev)->vy_mean[i];
-            para->getParH(lev)->vzz[i] =
-                para->getParH(lev)->vzz[i] - para->getParH(lev)->vz_mean[i] * para->getParH(lev)->vz_mean[i];
-            para->getParH(lev)->vxy[i] =
-                para->getParH(lev)->vxy[i] - para->getParH(lev)->vx_mean[i] * para->getParH(lev)->vy_mean[i];
-            para->getParH(lev)->vxz[i] =
-                para->getParH(lev)->vxz[i] - para->getParH(lev)->vx_mean[i] * para->getParH(lev)->vz_mean[i];
-            para->getParH(lev)->vyz[i] =
-                para->getParH(lev)->vyz[i] - para->getParH(lev)->vy_mean[i] * para->getParH(lev)->vz_mean[i];
+            para->getParH(lev)->vxx[pos] = para->getParH(lev)->vxx[pos] / (real)tdiff;
+            para->getParH(lev)->vyy[pos] = para->getParH(lev)->vyy[pos] / (real)tdiff;
+            para->getParH(lev)->vzz[pos] = para->getParH(lev)->vzz[pos] / (real)tdiff;
+            para->getParH(lev)->vxy[pos] = para->getParH(lev)->vxy[pos] / (real)tdiff;
+            para->getParH(lev)->vxz[pos] = para->getParH(lev)->vxz[pos] / (real)tdiff;
+            para->getParH(lev)->vyz[pos] = para->getParH(lev)->vyz[pos] / (real)tdiff;
+
+            para->getParH(lev)->vxx[pos] =
+                para->getParH(lev)->vxx[pos] - para->getParH(lev)->vx_mean[pos] * para->getParH(lev)->vx_mean[pos];
+            para->getParH(lev)->vyy[pos] =
+                para->getParH(lev)->vyy[pos] - para->getParH(lev)->vy_mean[pos] * para->getParH(lev)->vy_mean[pos];
+            para->getParH(lev)->vzz[pos] =
+                para->getParH(lev)->vzz[pos] - para->getParH(lev)->vz_mean[pos] * para->getParH(lev)->vz_mean[pos];
+            para->getParH(lev)->vxy[pos] =
+                para->getParH(lev)->vxy[pos] - para->getParH(lev)->vx_mean[pos] * para->getParH(lev)->vy_mean[pos];
+            para->getParH(lev)->vxz[pos] =
+                para->getParH(lev)->vxz[pos] - para->getParH(lev)->vx_mean[pos] * para->getParH(lev)->vz_mean[pos];
+            para->getParH(lev)->vyz[pos] =
+                para->getParH(lev)->vyz[pos] - para->getParH(lev)->vy_mean[pos] * para->getParH(lev)->vz_mean[pos];
         }
     }
 }
@@ -146,7 +146,7 @@ void writeAllTiDatafToFile(Parameter *para, uint timestep)
     }
 }
 
-void writeTiStuffToFile(Parameter *para, uint timestep, int sizeOfTiArray, std::vector<real *> &data,
+void writeTiStuffToFile(Parameter *para, uint timestep, unsigned long long sizeOfTiArray, std::vector<real *> &data,
                         std::vector<std::string> &datanames)
 {
     ////////////////////////////////////////////////////////////////////////
@@ -169,10 +169,10 @@ void writeTiStuffToFile(Parameter *para, uint timestep, int sizeOfTiArray, std::
     ostr << std::endl;
     ////////////////////////////////////////////////////////////////////////
     // fill file with data
-    for (int i = 0; i < sizeOfTiArray; i++) {
-        ostr << i;
+    for (size_t pos = 0; pos < sizeOfTiArray; pos++) {
+        ostr << pos;
         for (auto dataset : data)
-            ostr << "\t" << dataset[i];
+            ostr << "\t" << dataset[pos];
         ostr << std::endl;
     }
     ////////////////////////////////////////////////////////////////////////
diff --git a/src/gpu/VirtualFluids_GPU/Calculation/CalcTurbulenceIntensity.h b/src/gpu/VirtualFluids_GPU/Calculation/CalcTurbulenceIntensity.h
index f70973eb5921a17c3229a026623de2a0ef9f3ce4..a76c2d0dde99ad9fb3fd38137b6c72e5c3f5a6c3 100644
--- a/src/gpu/VirtualFluids_GPU/Calculation/CalcTurbulenceIntensity.h
+++ b/src/gpu/VirtualFluids_GPU/Calculation/CalcTurbulenceIntensity.h
@@ -18,7 +18,7 @@ void writeVeloFluctuationToFile(Parameter *para, uint timeste);
 void writeVeloMeansToFile(Parameter *para, uint timestep);
 void writeAllTiDatafToFile(Parameter *para, uint timestep);
 
-void writeTiStuffToFile(Parameter *para, uint timestep, int sizeOfTiArray, std::vector<real *> &data,
+void writeTiStuffToFile(Parameter *para, uint timestep, unsigned long long sizeOfTiArray, std::vector<real *> &data,
                   std::vector<std::string> &datanames);
 
 #endif
diff --git a/src/gpu/VirtualFluids_GPU/Calculation/CollisisionStrategy.cpp b/src/gpu/VirtualFluids_GPU/Calculation/CollisisionStrategy.cpp
index 4a14d19c10936f84379f332ef24f081f0ebb0cb7..49543f37df7fb54290f4ab6c09edb8d10c0b67be 100644
--- a/src/gpu/VirtualFluids_GPU/Calculation/CollisisionStrategy.cpp
+++ b/src/gpu/VirtualFluids_GPU/Calculation/CollisisionStrategy.cpp
@@ -39,8 +39,14 @@ void CollisionAndExchange_noStreams_indexKernel::operator()(UpdateGrid27 *update
     //!
     //! 1. run collision
     //!
-    updateGrid->collisionUsingIndices(level, t, para->getParD(level)->fluidNodeIndices,
-                                    para->getParD(level)->numberOfFluidNodes, -1);
+    for( CollisionTemplate tag: para->getParH(level)->allocatedBulkFluidNodeTags )
+    {
+        updateGrid->collisionUsingIndices(  level, t, 
+                                            para->getParD(level)->taggedFluidNodeIndices[tag],
+                                            para->getParD(level)->numberOfTaggedFluidNodes[tag],
+                                            tag,
+                                            CudaStreamIndex::Legacy);
+    }
 
     //! 2. exchange information between GPUs
     updateGrid->exchangeMultiGPU_noStreams_withPrepare(level, false);
@@ -61,28 +67,35 @@ void CollisionAndExchange_noStreams_oldKernel::operator()(UpdateGrid27 *updateGr
 
 void CollisionAndExchange_streams::operator()(UpdateGrid27 *updateGrid, Parameter *para, int level, unsigned int t)
 {
-    int borderStreamIndex = para->getStreamManager()->getBorderStreamIndex();
-    int bulkStreamIndex = para->getStreamManager()->getBulkStreamIndex();
-
     //! \details steps:
     //!
-    //! 1. run collision for nodes which are at the border of the gpus/processes
-    //!
-    updateGrid->collisionUsingIndices(level, t, para->getParD(level)->fluidNodeIndicesBorder,
-                                    para->getParD(level)->numberOfFluidNodesBorder, borderStreamIndex);
+    //! 1. run collision for nodes which are at the border of the gpus/processes, running with WriteMacroVars in case probes sample on these nodes
+    //!    
+    updateGrid->collisionUsingIndices(  level, t, 
+                                        para->getParD(level)->taggedFluidNodeIndices[CollisionTemplate::SubDomainBorder],
+                                        para->getParD(level)->numberOfTaggedFluidNodes[CollisionTemplate::SubDomainBorder], 
+                                        CollisionTemplate::WriteMacroVars,  
+                                        CudaStreamIndex::SubDomainBorder);
 
     //! 2. prepare the exchange between gpus (collect the send nodes for communication in a buffer on the gpu) and trigger bulk kernel execution when finished
     //!
-    updateGrid->prepareExchangeMultiGPU(level, borderStreamIndex);
+    updateGrid->prepareExchangeMultiGPU(level, CudaStreamIndex::SubDomainBorder);
     if (para->getUseStreams())
-        para->getStreamManager()->triggerStartBulkKernel(borderStreamIndex);
-
-    //! 3. launch the collision kernel for bulk nodes
-    //!
-    para->getStreamManager()->waitOnStartBulkKernelEvent(bulkStreamIndex);
-    updateGrid->collisionUsingIndices(level, t, para->getParD(level)->fluidNodeIndices,
-                                    para->getParD(level)->numberOfFluidNodes, bulkStreamIndex);
-
+        para->getStreamManager()->triggerStartBulkKernel(CudaStreamIndex::SubDomainBorder);
+
+    //! 3. launch the collision kernel for bulk nodes. This includes nodes with \p tag Default, WriteMacroVars, ApplyBodyForce,
+    //!    or AllFeatures. All assigned tags are listed in \p allocatedBulkFluidNodeTags during initialization in Simulation::init
+
+    para->getStreamManager()->waitOnStartBulkKernelEvent(CudaStreamIndex::Bulk);
+    
+    for( CollisionTemplate tag: para->getParH(level)->allocatedBulkFluidNodeTags )
+    {
+        updateGrid->collisionUsingIndices(  level, t, 
+                                            para->getParD(level)->taggedFluidNodeIndices[tag],
+                                            para->getParD(level)->numberOfTaggedFluidNodes[tag], 
+                                            tag,
+                                            CudaStreamIndex::Bulk);
+    }
     //! 4. exchange information between GPUs
-    updateGrid->exchangeMultiGPU(level, borderStreamIndex);
+    updateGrid->exchangeMultiGPU(level, CudaStreamIndex::SubDomainBorder);
 }
diff --git a/src/gpu/VirtualFluids_GPU/Calculation/ForceCalculations.cpp b/src/gpu/VirtualFluids_GPU/Calculation/ForceCalculations.cpp
index d62e8fee24dad1cde7ccd2044a5a5f9573f7ff82..cc1d2eb748b01835b46f5fc69f47ed3ddc17a28d 100644
--- a/src/gpu/VirtualFluids_GPU/Calculation/ForceCalculations.cpp
+++ b/src/gpu/VirtualFluids_GPU/Calculation/ForceCalculations.cpp
@@ -53,7 +53,7 @@ void ForceCalculations::calcPIDControllerForForce(Parameter* para, CudaMemoryMan
 	 {
 		 //////////////////////////////////////////////////////////////////////
 		 //measure the velocity
-		 int numberOfElements = para->getParH(lev)->numberOfNodes;
+		 unsigned long long numberOfElements = para->getParH(lev)->numberOfNodes;
 		 if (numberOfElements > 0)
 		 {
 			 CalcMacCompSP27(para->getParD(lev)->velocityX,
@@ -74,11 +74,11 @@ void ForceCalculations::calcPIDControllerForForce(Parameter* para, CudaMemoryMan
 			 cudaMemoryManager->cudaCopyPrint(lev);
 //			 para->cudaCopyForceVelo(i,numberOfElements);
 			 //////////////////////////////////////////////////////////////////
-			 for (int j = 0; j < numberOfElements; j++)
+			 for (size_t pos = 0; pos < numberOfElements; pos++)
 			 {
-				 tempVeloX += (double)para->getParH(lev)->velocityX[j];
-				 tempVeloY += (double)para->getParH(lev)->velocityY[j];
-				 tempVeloZ += (double)para->getParH(lev)->velocityZ[j];
+				 tempVeloX += (double)para->getParH(lev)->velocityX[pos];
+				 tempVeloY += (double)para->getParH(lev)->velocityY[pos];
+				 tempVeloZ += (double)para->getParH(lev)->velocityZ[pos];
 			 }
 			 tempVeloX /= (double)numberOfElements;
 			 tempVeloY /= (double)numberOfElements;
diff --git a/src/gpu/VirtualFluids_GPU/Calculation/RefinementStrategy.cpp b/src/gpu/VirtualFluids_GPU/Calculation/RefinementStrategy.cpp
index cd74216e1fbe7b718c72046ace4b7d2e7cf451fe..b8ca4e9c2020e17cd0192267ac5d931b510afc3a 100644
--- a/src/gpu/VirtualFluids_GPU/Calculation/RefinementStrategy.cpp
+++ b/src/gpu/VirtualFluids_GPU/Calculation/RefinementStrategy.cpp
@@ -38,67 +38,62 @@ void NoRefinement::operator()(UpdateGrid27 *updateGrid, Parameter *para, int lev
 
 void RefinementAndExchange_streams_exchangeInterface::operator()(UpdateGrid27 *updateGrid, Parameter *para, int level)
 {
-    int borderStreamIndex = para->getStreamManager()->getBorderStreamIndex();
-    int bulkStreamIndex = para->getStreamManager()->getBulkStreamIndex();
-
     //! \details steps:
     //!
     //! 1. Interpolation fine to coarse for nodes which are at the border of the gpus/processes
     //!
-    updateGrid->fineToCoarse(level, &para->getParD(level)->intFCBorder, para->getParD(level)->offFC, borderStreamIndex);
+    updateGrid->fineToCoarse(level, &para->getParD(level)->intFCBorder, para->getParD(level)->offFC, CudaStreamIndex::SubDomainBorder);
 
     //! 2. prepare the exchange between gpus (collect the send nodes for communication in a buffer on the gpu) and trigger bulk kernel execution when finished
     //!
-    updateGrid->prepareExchangeMultiGPUAfterFtoC(level, borderStreamIndex);
+    updateGrid->prepareExchangeMultiGPUAfterFtoC(level, CudaStreamIndex::SubDomainBorder);
     if (para->getUseStreams())
-        para->getStreamManager()->triggerStartBulkKernel(borderStreamIndex);
+        para->getStreamManager()->triggerStartBulkKernel(CudaStreamIndex::SubDomainBorder);
 
     //! 3. launch the bulk kernels for both interpolation processes (fine to coarse and coarse to fine)
     //!
-    para->getStreamManager()->waitOnStartBulkKernelEvent(bulkStreamIndex);
-    updateGrid->fineToCoarse(level, &para->getParD(level)->intFCBulk, para->getParD(level)->offFCBulk, bulkStreamIndex);
-    updateGrid->coarseToFine(level, &para->getParD(level)->intCFBulk, para->getParD(level)->offCFBulk, bulkStreamIndex);
+    para->getStreamManager()->waitOnStartBulkKernelEvent(CudaStreamIndex::Bulk);
+    updateGrid->fineToCoarse(level, &para->getParD(level)->intFCBulk, para->getParD(level)->offFCBulk, CudaStreamIndex::Bulk);
+    updateGrid->coarseToFine(level, &para->getParD(level)->intCFBulk, para->getParD(level)->offCFBulk, CudaStreamIndex::Bulk);
 
     //! 4. exchange information between GPUs (only nodes which are part of the interpolation)
     //!
-    updateGrid->exchangeMultiGPUAfterFtoC(level, borderStreamIndex);
+    updateGrid->exchangeMultiGPUAfterFtoC(level, CudaStreamIndex::SubDomainBorder);
 
     // 5. interpolation fine to coarse for nodes which are at the border of the gpus/processes
     //!
-    updateGrid->coarseToFine(level, &para->getParD(level)->intCFBorder, para->getParD(level)->offCF, borderStreamIndex);
+    updateGrid->coarseToFine(level, &para->getParD(level)->intCFBorder, para->getParD(level)->offCF, CudaStreamIndex::SubDomainBorder);
 
     cudaDeviceSynchronize();
 }
 
-void RefinementAndExchange_streams_exchangeAllNodes::operator()(UpdateGrid27 *updateGrid, Parameter *para, int level){
-    int borderStreamIndex = para->getStreamManager()->getBorderStreamIndex();
-    int bulkStreamIndex = para->getStreamManager()->getBulkStreamIndex();
-
+void RefinementAndExchange_streams_exchangeAllNodes::operator()(UpdateGrid27 *updateGrid, Parameter *para, int level)
+{
     //! \details steps:
     //!
     //! 1. interpolation fine to coarse for nodes which are at the border of the gpus/processes
     //!
-    updateGrid->fineToCoarse(level, &para->getParD(level)->intFCBorder, para->getParD(level)->offFC, borderStreamIndex);
+    updateGrid->fineToCoarse(level, &para->getParD(level)->intFCBorder, para->getParD(level)->offFC, CudaStreamIndex::SubDomainBorder);
 
     //! 2. prepare the exchange between gpus (collect the send nodes for communication in a buffer on the gpu) and trigger bulk kernel execution when finished
     //!
-    updateGrid->prepareExchangeMultiGPU(level, borderStreamIndex);
+    updateGrid->prepareExchangeMultiGPU(level, CudaStreamIndex::SubDomainBorder);
     if (para->getUseStreams())
-        para->getStreamManager()->triggerStartBulkKernel(borderStreamIndex);
+        para->getStreamManager()->triggerStartBulkKernel(CudaStreamIndex::SubDomainBorder);
 
     //! 3. launch the bulk kernels for both interpolation processes (fine to coarse and coarse to fine)
     //!
-    para->getStreamManager()->waitOnStartBulkKernelEvent(bulkStreamIndex);
-    updateGrid->fineToCoarse(level, &para->getParD(level)->intFCBulk, para->getParD(level)->offFCBulk, bulkStreamIndex);
-    updateGrid->coarseToFine(level, &para->getParD(level)->intCFBulk, para->getParD(level)->offCFBulk, bulkStreamIndex);
+    para->getStreamManager()->waitOnStartBulkKernelEvent(CudaStreamIndex::Bulk);
+    updateGrid->fineToCoarse(level, &para->getParD(level)->intFCBulk, para->getParD(level)->offFCBulk, CudaStreamIndex::Bulk);
+    updateGrid->coarseToFine(level, &para->getParD(level)->intCFBulk, para->getParD(level)->offCFBulk, CudaStreamIndex::Bulk);
 
     //! 4. exchange information between GPUs (all nodes)
     //!
-    updateGrid->exchangeMultiGPU(level, borderStreamIndex);
+    updateGrid->exchangeMultiGPU(level, CudaStreamIndex::SubDomainBorder);
 
     // 5. interpolation fine to coarse for nodes which are at the border of the gpus/processes
     //!
-    updateGrid->coarseToFine(level, &para->getParD(level)->intCFBorder, para->getParD(level)->offCF, borderStreamIndex);
+    updateGrid->coarseToFine(level, &para->getParD(level)->intCFBorder, para->getParD(level)->offCF, CudaStreamIndex::SubDomainBorder);
 
     cudaDeviceSynchronize();
 }
@@ -109,14 +104,14 @@ void RefinementAndExchange_noStreams_exchangeInterface::operator()(UpdateGrid27
     //!
     //! 1. interpolation fine to coarse
     //!
-    updateGrid->fineToCoarse(level, &para->getParD(level)->intFC, para->getParD(level)->offFC, -1);
+    updateGrid->fineToCoarse(level, &para->getParD(level)->intFC, para->getParD(level)->offFC, CudaStreamIndex::Legacy);
 
     //! 2. exchange information between GPUs (only nodes which are part of the interpolation)
     //!
     updateGrid->exchangeMultiGPU_noStreams_withPrepare(level, true);
 
     //! 3. interpolation coarse to fine
-    updateGrid->coarseToFine(level, &para->getParD(level)->intCF, para->getParD(level)->offCF, -1);
+    updateGrid->coarseToFine(level, &para->getParD(level)->intCF, para->getParD(level)->offCF, CudaStreamIndex::Legacy);
 }
 
 void RefinementAndExchange_noStreams_exchangeAllNodes::operator()(UpdateGrid27 *updateGrid, Parameter *para, int level)
@@ -125,14 +120,14 @@ void RefinementAndExchange_noStreams_exchangeAllNodes::operator()(UpdateGrid27 *
     //!
     //! 1. interpolation fine to coarse
     //!
-    updateGrid->fineToCoarse(level, &para->getParD(level)->intFC, para->getParD(level)->offFC, -1);
+    updateGrid->fineToCoarse(level, &para->getParD(level)->intFC, para->getParD(level)->offFC, CudaStreamIndex::Legacy);
 
     //! 2. exchange information between GPUs (all nodes)
     //!
     updateGrid->exchangeMultiGPU_noStreams_withPrepare(level, false);
 
     //! 3. interpolation coarse to fine
-    updateGrid->coarseToFine(level, &para->getParD(level)->intCF, para->getParD(level)->offCF, -1);
+    updateGrid->coarseToFine(level, &para->getParD(level)->intCF, para->getParD(level)->offCF, CudaStreamIndex::Legacy);
 }
 
 void Refinement_noExchange::operator()(UpdateGrid27 *updateGrid, Parameter *para, int level)
@@ -141,7 +136,7 @@ void Refinement_noExchange::operator()(UpdateGrid27 *updateGrid, Parameter *para
     //!
     //! 1. interpolation fine to coarse
     //!
-    updateGrid->fineToCoarse(level, &para->getParD(level)->intFC, para->getParD(level)->offFC, -1);
+    updateGrid->fineToCoarse(level, &para->getParD(level)->intFC, para->getParD(level)->offFC, CudaStreamIndex::Legacy);
     //! 2. interpolation coarse to fine
-    updateGrid->coarseToFine(level, &para->getParD(level)->intCF, para->getParD(level)->offCF, -1);
+    updateGrid->coarseToFine(level, &para->getParD(level)->intCF, para->getParD(level)->offCF, CudaStreamIndex::Legacy);
 }
diff --git a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp
index 296ab819c5538a6b6d6a6827b5c28cbc475af838..4136614dfbfc9e0d2fc1bf7f4b01624f94eabb6f 100644
--- a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp
+++ b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp
@@ -22,13 +22,17 @@ void UpdateGrid27::updateGrid(int level, unsigned int t)
         updateGrid(level + 1, t);
     }
 
+    //////////////////////////////////////////////////////////////////////////
+    
+    interactWithProbes(level, t);
+
     //////////////////////////////////////////////////////////////////////////
 
     collision(this, para.get(), level, t);
 
     //////////////////////////////////////////////////////////////////////////
 
-    postCollisionBC(level);
+    postCollisionBC(level, t);
 
     //////////////////////////////////////////////////////////////////////////
 
@@ -47,13 +51,14 @@ void UpdateGrid27::updateGrid(int level, unsigned int t)
 
     //////////////////////////////////////////////////////////////////////////
     if( level != para->getFine() )
-    {
+    {   
         refinement(this, para.get(), level);
     }
 
+    //////////////////////////////////////////////////////////////////////////
+    
     interactWithActuators(level, t);
 
-    interactWithProbes(level, t);
 }
 
 void UpdateGrid27::collisionAllNodes(int level, unsigned int t)
@@ -71,15 +76,16 @@ void UpdateGrid27::collisionAllNodes(int level, unsigned int t)
         collisionAdvectionDiffusion(level);
 }
 
-void UpdateGrid27::collisionUsingIndices(int level, unsigned int t, uint *fluidNodeIndices, uint numberOfFluidNodes, int stream)
+void UpdateGrid27::collisionUsingIndices(int level, unsigned int t, uint *taggedFluidNodeIndices, uint numberOfTaggedFluidNodes, CollisionTemplate collisionTemplate, CudaStreamIndex stream)
 {
-    if (fluidNodeIndices != nullptr && numberOfFluidNodes != 0)
-        kernels.at(level)->runOnIndices(fluidNodeIndices, numberOfFluidNodes, stream);
+    if (taggedFluidNodeIndices != nullptr && numberOfTaggedFluidNodes != 0)
+        kernels.at(level)->runOnIndices(taggedFluidNodeIndices, numberOfTaggedFluidNodes, collisionTemplate, stream);
     else
-        std::cout << "In collision: fluidNodeIndices or numberOfFluidNodes not definded"
+        std::cout << "In collision: fluidNodeIndices or numberOfFluidNodes not defined"
                       << std::endl;
 
     //////////////////////////////////////////////////////////////////////////
+    //! \todo: AD collision and porousMedia should be called separately, not in collisionUsingIndices
 
     if (para->getSimulatePorousMedia())
         collisionPorousMedia(level);
@@ -118,21 +124,21 @@ void UpdateGrid27::collisionAdvectionDiffusion(int level)
     this->adKernelManager->runADcollisionKernel(level);
 }
 
-void UpdateGrid27::prepareExchangeMultiGPU(int level, int streamIndex)
+void UpdateGrid27::prepareExchangeMultiGPU(int level, CudaStreamIndex streamIndex)
 {
     prepareExchangeCollDataXGPU27AllNodes(para.get(), level, streamIndex);
     prepareExchangeCollDataYGPU27AllNodes(para.get(), level, streamIndex);
     prepareExchangeCollDataZGPU27AllNodes(para.get(), level, streamIndex);
 }
 
-void UpdateGrid27::prepareExchangeMultiGPUAfterFtoC(int level, int streamIndex)
+void UpdateGrid27::prepareExchangeMultiGPUAfterFtoC(int level, CudaStreamIndex streamIndex)
 {
     prepareExchangeCollDataXGPU27AfterFtoC(para.get(), level, streamIndex);
     prepareExchangeCollDataYGPU27AfterFtoC(para.get(), level, streamIndex);
     prepareExchangeCollDataZGPU27AfterFtoC(para.get(), level, streamIndex);
 }
 
-void UpdateGrid27::exchangeMultiGPU(int level, int streamIndex)
+void UpdateGrid27::exchangeMultiGPU(int level, CudaStreamIndex streamIndex)
 {
     //////////////////////////////////////////////////////////////////////////
     // 3D domain decomposition
@@ -168,30 +174,30 @@ void UpdateGrid27::exchangeMultiGPU_noStreams_withPrepare(int level, bool useRed
     // 3D domain decomposition
     if (useReducedComm) {
         // X
-        prepareExchangeCollDataXGPU27AfterFtoC(para.get(), level, -1);
-        exchangeCollDataXGPU27AfterFtoC(para.get(), comm, cudaMemoryManager.get(), level, -1);
-        scatterNodesFromRecvBufferXGPU27AfterFtoC(para.get(), level, -1);
+        prepareExchangeCollDataXGPU27AfterFtoC(para.get(), level, CudaStreamIndex::Legacy);
+        exchangeCollDataXGPU27AfterFtoC(para.get(), comm, cudaMemoryManager.get(), level, CudaStreamIndex::Legacy);
+        scatterNodesFromRecvBufferXGPU27AfterFtoC(para.get(), level, CudaStreamIndex::Legacy);
         // Y
-        prepareExchangeCollDataYGPU27AfterFtoC(para.get(), level, -1);
-        exchangeCollDataYGPU27AfterFtoC(para.get(), comm, cudaMemoryManager.get(), level, -1);
-        scatterNodesFromRecvBufferYGPU27AfterFtoC(para.get(), level, -1);
+        prepareExchangeCollDataYGPU27AfterFtoC(para.get(), level, CudaStreamIndex::Legacy);
+        exchangeCollDataYGPU27AfterFtoC(para.get(), comm, cudaMemoryManager.get(), level, CudaStreamIndex::Legacy);
+        scatterNodesFromRecvBufferYGPU27AfterFtoC(para.get(), level, CudaStreamIndex::Legacy);
         // Z
-        prepareExchangeCollDataZGPU27AfterFtoC(para.get(), level, -1);
-        exchangeCollDataZGPU27AfterFtoC(para.get(), comm, cudaMemoryManager.get(), level, -1);
-        scatterNodesFromRecvBufferZGPU27AfterFtoC(para.get(), level, -1);
+        prepareExchangeCollDataZGPU27AfterFtoC(para.get(), level, CudaStreamIndex::Legacy);
+        exchangeCollDataZGPU27AfterFtoC(para.get(), comm, cudaMemoryManager.get(), level, CudaStreamIndex::Legacy);
+        scatterNodesFromRecvBufferZGPU27AfterFtoC(para.get(), level, CudaStreamIndex::Legacy);
     } else {
         // X
-        prepareExchangeCollDataXGPU27AllNodes(para.get(), level, -1);
-        exchangeCollDataXGPU27AllNodes(para.get(), comm, cudaMemoryManager.get(), level, -1);
-        scatterNodesFromRecvBufferXGPU27AllNodes(para.get(), level, -1);
+        prepareExchangeCollDataXGPU27AllNodes(para.get(), level, CudaStreamIndex::Legacy);
+        exchangeCollDataXGPU27AllNodes(para.get(), comm, cudaMemoryManager.get(), level, CudaStreamIndex::Legacy);
+        scatterNodesFromRecvBufferXGPU27AllNodes(para.get(), level, CudaStreamIndex::Legacy);
         // Y
-        prepareExchangeCollDataYGPU27AllNodes(para.get(), level, -1);
-        exchangeCollDataYGPU27AllNodes(para.get(), comm, cudaMemoryManager.get(), level, -1);
-        scatterNodesFromRecvBufferYGPU27AllNodes(para.get(), level, -1);
+        prepareExchangeCollDataYGPU27AllNodes(para.get(), level, CudaStreamIndex::Legacy);
+        exchangeCollDataYGPU27AllNodes(para.get(), comm, cudaMemoryManager.get(), level, CudaStreamIndex::Legacy);
+        scatterNodesFromRecvBufferYGPU27AllNodes(para.get(), level, CudaStreamIndex::Legacy);
         // Z
-        prepareExchangeCollDataZGPU27AllNodes(para.get(), level, -1);
-        exchangeCollDataZGPU27AllNodes(para.get(), comm, cudaMemoryManager.get(), level, -1);
-        scatterNodesFromRecvBufferZGPU27AllNodes(para.get(), level, -1);
+        prepareExchangeCollDataZGPU27AllNodes(para.get(), level, CudaStreamIndex::Legacy);
+        exchangeCollDataZGPU27AllNodes(para.get(), comm, cudaMemoryManager.get(), level, CudaStreamIndex::Legacy);
+        scatterNodesFromRecvBufferZGPU27AllNodes(para.get(), level, CudaStreamIndex::Legacy);
     }
 
     //////////////////////////////////////////////////////////////////////////
@@ -204,7 +210,7 @@ void UpdateGrid27::exchangeMultiGPU_noStreams_withPrepare(int level, bool useRed
         exchangePostCollDataADZGPU27(para.get(), comm, cudaMemoryManager.get(), level);
     }
 }
-void UpdateGrid27::exchangeMultiGPUAfterFtoC(int level, int streamIndex)
+void UpdateGrid27::exchangeMultiGPUAfterFtoC(int level, CudaStreamIndex streamIndex)
 {
     //////////////////////////////////////////////////////////////////////////
     // 3D domain decomposition
@@ -227,9 +233,10 @@ void UpdateGrid27::exchangeMultiGPUAfterFtoC(int level, int streamIndex)
     }
 }
 
-void UpdateGrid27::postCollisionBC(int level)
+void UpdateGrid27::postCollisionBC(int level, uint t)
 {
     //////////////////////////////////////////////////////////////////////////
+    // G E O M E T R Y
     // V E L O C I T Y (I N F L O W)
     this->bcKernelManager->runVelocityBCKernelPost(level);
 
@@ -257,6 +264,10 @@ void UpdateGrid27::postCollisionBC(int level)
     // P R E S S U R E
     this->bcKernelManager->runPressureBCKernelPost(level);
 
+    //////////////////////////////////////////////////////////////////////////
+    // P R E C U R S O R
+    this->bcKernelManager->runPrecursorBCKernelPost(level, t, cudaMemoryManager.get());
+
     //////////////////////////////////////////////////////////////////////////
     // A D V E C T I O N    D I F F U S I O N
     if (para->getDiffOn())
@@ -317,13 +328,12 @@ void UpdateGrid27::preCollisionBC(int level, unsigned int t)
     //////////////////////////////////////////////////////////////////////////////////
 }
 
-void UpdateGrid27::fineToCoarse(int level, InterpolationCellFC* icellFC, OffFC &offFC,
-                                int streamIndex)
+void UpdateGrid27::fineToCoarse(int level, InterpolationCellFC* icellFC, OffFC &offFC, CudaStreamIndex streamIndex)
 {
     gridScalingKernelManager->runFineToCoarseKernelLB(level, icellFC, offFC, streamIndex);
 
     if (para->getDiffOn()) {
-        if (streamIndex != -1) {
+        if (para->getStreamManager()->streamIsRegistered(streamIndex)) {
             printf("fineToCoarse Advection Diffusion not implemented"); // TODO
             return;
         }
@@ -331,14 +341,13 @@ void UpdateGrid27::fineToCoarse(int level, InterpolationCellFC* icellFC, OffFC &
     }
 }
 
-void UpdateGrid27::coarseToFine(int level, InterpolationCellCF* icellCF, OffCF &offCF,
-                                int streamIndex)
+void UpdateGrid27::coarseToFine(int level, InterpolationCellCF* icellCF, OffCF &offCF, CudaStreamIndex streamIndex)
 {
     this->gridScalingKernelManager->runCoarseToFineKernelLB(level, icellCF, offCF, streamIndex);
 
     if (para->getDiffOn())
     {
-        if (streamIndex != -1){
+        if(para->getStreamManager()->streamIsRegistered(streamIndex)){
             printf("CoarseToFineWithStream Advection Diffusion not implemented"); // TODO
             return;
         }
diff --git a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.h b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.h
index 8110923bf066412e2bb09ffa1f10efe3ddc983c7..8ce2cf5bfd72f9f53cdb35bc92502ee9ca0d3ad8 100644
--- a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.h
+++ b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.h
@@ -4,6 +4,7 @@
 #include "LBM/LB.h"
 #include "GPU/GPU_Interface.h"
 #include "Parameter/Parameter.h"
+#include "Parameter/CudaStreamManager.h"
 #include "GPU/CudaMemoryManager.h"
 #include "Communication/Communicator.h"
 #include "Calculation/PorousMedia.h"
@@ -15,7 +16,6 @@ class Kernel;
 class BoundaryConditionFactory;
 class GridScalingFactory;
 class TurbulenceModelFactory;
-
 class UpdateGrid27;
 using CollisionStrategy = std::function<void (UpdateGrid27* updateGrid, Parameter* para, int level, unsigned int t)>;
 using RefinementStrategy = std::function<void (UpdateGrid27* updateGrid, Parameter* para, int level)>;
@@ -31,21 +31,21 @@ public:
 
 private:
     void collisionAllNodes(int level, unsigned int t);
-    void collisionUsingIndices(int level, unsigned int t, uint *fluidNodeIndices = nullptr, uint numberOfFluidNodes = 0, int stream = -1);
+    void collisionUsingIndices(int level, unsigned int t, uint *taggedFluidNodeIndices = nullptr, uint numberOfTaggedFluidNodes = 0, CollisionTemplate collisionTemplate = CollisionTemplate::Default, CudaStreamIndex streamIndex=CudaStreamIndex::Legacy);
     void collisionAdvectionDiffusion(int level);
 
-    void postCollisionBC(int level);
+    void postCollisionBC(int level, unsigned int t);
     void preCollisionBC(int level, unsigned int t);
     void collisionPorousMedia(int level);
 
-    void fineToCoarse(int level, InterpolationCellFC* icellFC, OffFC &offFC, int streamIndex);
-    void coarseToFine(int level, InterpolationCellCF* icellCF, OffCF &offCF, int streamIndex);
+    void fineToCoarse(int level, InterpolationCellFC* icellFC, OffFC &offFC, CudaStreamIndex streamIndex);
+    void coarseToFine(int level, InterpolationCellCF* icellCF, OffCF &offCF, CudaStreamIndex streamIndex);
 
-    void prepareExchangeMultiGPU(int level, int streamIndex);
-    void prepareExchangeMultiGPUAfterFtoC(int level, int streamIndex);
+    void prepareExchangeMultiGPU(int level, CudaStreamIndex streamIndex);
+    void prepareExchangeMultiGPUAfterFtoC(int level, CudaStreamIndex streamIndex);
 
-    void exchangeMultiGPU(int level, int streamIndex);
-    void exchangeMultiGPUAfterFtoC(int level, int streamIndex);
+    void exchangeMultiGPU(int level, CudaStreamIndex streamIndex);
+    void exchangeMultiGPUAfterFtoC(int level, CudaStreamIndex streamIndex);
     void exchangeMultiGPU_noStreams_withPrepare(int level, bool useReducedComm);
 
     void swapBetweenEvenAndOddTimestep(int level);
@@ -60,6 +60,7 @@ private:
     friend class CollisionAndExchange_noStreams_indexKernel;
     friend class CollisionAndExchange_noStreams_oldKernel;
     friend class CollisionAndExchange_streams;
+    friend class CollisionAndExchange_noStreams_withReadWriteFlags;
 
     RefinementStrategy refinement;
     friend class RefinementAndExchange_streams_exchangeInterface;
diff --git a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp
index 36c250401e0775b3abcc7d25c0f89fde0556631e..00a7b45668e2050467f3d1122455dc74d0ad4f1c 100644
--- a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp
+++ b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp
@@ -11,12 +11,12 @@ using namespace vf::lbm::dir;
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // 3D domain decomposition: functions used by all directions
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-void collectNodesInSendBufferGPU(Parameter *para, int level, int streamIndex,
+void collectNodesInSendBufferGPU(Parameter *para, int level, CudaStreamIndex streamIndex,
                                  std::vector<ProcessNeighbor27> *sendProcessNeighbor,
                                  unsigned int numberOfSendProcessNeighbors)
 {
-    cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex);
-
+    cudaStream_t stream = para->getStreamManager()->getStream(streamIndex);
+    
     for (unsigned int i = 0; i < numberOfSendProcessNeighbors; i++) {
         GetSendFsPostDev27(para->getParD(level)->distributions.f[0], 
                            (*sendProcessNeighbor)[i].f[0],
@@ -32,11 +32,11 @@ void collectNodesInSendBufferGPU(Parameter *para, int level, int streamIndex,
     }
 }
 
-void scatterNodesFromRecvBufferGPU(Parameter *para, int level, int streamIndex,
+void scatterNodesFromRecvBufferGPU(Parameter *para, int level, CudaStreamIndex streamIndex,
                                    std::vector<ProcessNeighbor27> *recvProcessNeighborDev,
                                    unsigned int numberOfRecvProcessNeighbors)
 {
-    cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex);
+    cudaStream_t stream = para->getStreamManager()->getStream(streamIndex);
     for (unsigned int i = 0; i < numberOfRecvProcessNeighbors; i++) {
         SetRecvFsPostDev27(para->getParD(level)->distributions.f[0], 
                            (*recvProcessNeighborDev)[i].f[0],
@@ -105,22 +105,22 @@ void copyEdgeNodes(std::vector<LBMSimulationParameter::EdgeNodePositions> &edgeN
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // X
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-void prepareExchangeCollDataXGPU27AllNodes(Parameter *para, int level, int streamIndex)
+void prepareExchangeCollDataXGPU27AllNodes(Parameter *para, int level, CudaStreamIndex streamIndex)
 {
     collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborX,
                                 (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")));
 }
 
-void prepareExchangeCollDataXGPU27AfterFtoC(Parameter *para, int level, int streamIndex)
+void prepareExchangeCollDataXGPU27AfterFtoC(Parameter *para, int level, CudaStreamIndex streamIndex)
 {
     collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborsAfterFtoCX,
                                 (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")));
 }
 
 void exchangeCollDataXGPU27AllNodes(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager,
-                                    int level, int streamIndex)
+                                    int level, CudaStreamIndex streamIndex)
 {
-    exchangeCollDataXGPU27(para, comm, cudaMemoryManager, level, streamIndex, 
+    exchangeCollDataXGPU27(para, comm, cudaMemoryManager, level, streamIndex,
                            &para->getParD(level)->sendProcessNeighborX,
                            &para->getParD(level)->recvProcessNeighborX,
                            &para->getParH(level)->sendProcessNeighborX,
@@ -128,40 +128,40 @@ void exchangeCollDataXGPU27AllNodes(Parameter *para, vf::gpu::Communicator &comm
 }
 
 void exchangeCollDataXGPU27AfterFtoC(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager,
-                                     int level, int streamIndex)
+                                     int level, CudaStreamIndex streamIndex)
 {
-    exchangeCollDataXGPU27(para, comm, cudaMemoryManager, level, streamIndex, 
+    exchangeCollDataXGPU27(para, comm, cudaMemoryManager, level, streamIndex,
                            &para->getParD(level)->sendProcessNeighborsAfterFtoCX,
                            &para->getParD(level)->recvProcessNeighborsAfterFtoCX,
                            &para->getParH(level)->sendProcessNeighborsAfterFtoCX,
                            &para->getParH(level)->recvProcessNeighborsAfterFtoCX);
 }
 
-void scatterNodesFromRecvBufferXGPU27AllNodes(Parameter *para, int level, int streamIndex)
+void scatterNodesFromRecvBufferXGPU27AllNodes(Parameter *para, int level, CudaStreamIndex streamIndex)
 {
-    scatterNodesFromRecvBufferGPU(para, level, streamIndex, &para->getParD(level)->recvProcessNeighborX,
+    scatterNodesFromRecvBufferGPU(para, level, streamIndex,&para->getParD(level)->recvProcessNeighborX,
                                   (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")));
 }
 
-void scatterNodesFromRecvBufferXGPU27AfterFtoC(Parameter *para, int level, int streamIndex)
+void scatterNodesFromRecvBufferXGPU27AfterFtoC(Parameter *para, int level, CudaStreamIndex streamIndex)
 {
-    scatterNodesFromRecvBufferGPU(para, level, streamIndex, &para->getParD(level)->recvProcessNeighborsAfterFtoCX,
+    scatterNodesFromRecvBufferGPU(para, level, streamIndex,&para->getParD(level)->recvProcessNeighborsAfterFtoCX,
                                   (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")));
 }
 
-void exchangeCollDataXGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, int level,
-                            int streamIndex, 
+void exchangeCollDataXGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, 
+                            int level, CudaStreamIndex streamIndex,
                             std::vector<ProcessNeighbor27> *sendProcessNeighborDev,
                             std::vector<ProcessNeighbor27> *recvProcessNeighborDev,
                             std::vector<ProcessNeighbor27> *sendProcessNeighborHost,
                             std::vector<ProcessNeighbor27> *recvProcessNeighborHost)
 {
-    cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex);
+    cudaStream_t stream = para->getStreamManager()->getStream(streamIndex);
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //! \details steps: 
     //! 1. copy data from device to host
     for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
-        cudaMemoryManager->cudaCopyProcessNeighborXFsDH(level, i, (*sendProcessNeighborDev)[i].memsizeFs, streamIndex);
+        cudaMemoryManager->cudaCopyProcessNeighborXFsDH(level, i, (*sendProcessNeighborDev)[i].memsizeFs);
 
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //! 2. start non-blocking receive (MPI)
@@ -181,7 +181,7 @@ void exchangeCollDataXGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMe
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //! 7. copy received data from host to device
     for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
-        cudaMemoryManager->cudaCopyProcessNeighborXFsHD(level, i, (*recvProcessNeighborDev)[i].memsizeFs, streamIndex);
+        cudaMemoryManager->cudaCopyProcessNeighborXFsHD(level, i, (*recvProcessNeighborDev)[i].memsizeFs);
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -189,22 +189,22 @@ void exchangeCollDataXGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMe
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Y
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-void prepareExchangeCollDataYGPU27AllNodes(Parameter *para, int level, int streamIndex)
+void prepareExchangeCollDataYGPU27AllNodes(Parameter *para, int level, CudaStreamIndex streamIndex)
 {
     collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborY,
                                 (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")));
 }
 
-void prepareExchangeCollDataYGPU27AfterFtoC(Parameter *para, int level, int streamIndex)
+void prepareExchangeCollDataYGPU27AfterFtoC(Parameter *para, int level, CudaStreamIndex streamIndex)
 {
     collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborsAfterFtoCY,
                                 (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")));
 }
 
 void exchangeCollDataYGPU27AllNodes(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager,
-                                    int level, int streamIndex)
+                                    int level, CudaStreamIndex streamIndex)
 {
-    exchangeCollDataYGPU27(para, comm, cudaMemoryManager, level, streamIndex, 
+    exchangeCollDataYGPU27(para, comm, cudaMemoryManager, level, streamIndex,
                            &para->getParD(level)->sendProcessNeighborY,
                            &para->getParD(level)->recvProcessNeighborY, 
                            &para->getParH(level)->sendProcessNeighborY,
@@ -212,38 +212,39 @@ void exchangeCollDataYGPU27AllNodes(Parameter *para, vf::gpu::Communicator &comm
 }
 
 void exchangeCollDataYGPU27AfterFtoC(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager,
-                                     int level, int streamIndex)
+                                     int level, CudaStreamIndex streamIndex)
 {
-    exchangeCollDataYGPU27(para, comm, cudaMemoryManager, level, streamIndex, 
+    exchangeCollDataYGPU27(para, comm, cudaMemoryManager, level, streamIndex,
                            &para->getParD(level)->sendProcessNeighborsAfterFtoCY,
                            &para->getParD(level)->recvProcessNeighborsAfterFtoCY, 
                            &para->getParH(level)->sendProcessNeighborsAfterFtoCY,
                            &para->getParH(level)->recvProcessNeighborsAfterFtoCY);
 }
 
-void scatterNodesFromRecvBufferYGPU27AllNodes(Parameter *para, int level, int streamIndex)
+void scatterNodesFromRecvBufferYGPU27AllNodes(Parameter *para, int level, CudaStreamIndex streamIndex)
 {
     scatterNodesFromRecvBufferGPU(para, level, streamIndex, &para->getParD(level)->recvProcessNeighborY,
                                   (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")));
 }
 
-void scatterNodesFromRecvBufferYGPU27AfterFtoC(Parameter *para, int level, int streamIndex)
+void scatterNodesFromRecvBufferYGPU27AfterFtoC(Parameter *para, int level, CudaStreamIndex streamIndex)
 {
     scatterNodesFromRecvBufferGPU(para, level, streamIndex, &para->getParD(level)->recvProcessNeighborsAfterFtoCY,
                                   (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")));
 }
 
 void exchangeCollDataYGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, int level,
-                            int streamIndex, std::vector<ProcessNeighbor27> *sendProcessNeighborDev,
+                            CudaStreamIndex streamIndex,
+                            std::vector<ProcessNeighbor27> *sendProcessNeighborDev,
                             std::vector<ProcessNeighbor27> *recvProcessNeighborDev,
                             std::vector<ProcessNeighbor27> *sendProcessNeighborHost,
                             std::vector<ProcessNeighbor27> *recvProcessNeighborHost)
 {
-    cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex);
+    cudaStream_t stream = para->getStreamManager()->getStream(streamIndex); // use the stream selected by the caller
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     // copy Device to Host
     for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
-        cudaMemoryManager->cudaCopyProcessNeighborYFsDH(level, i, (*sendProcessNeighborDev)[i].memsizeFs, streamIndex);
+        cudaMemoryManager->cudaCopyProcessNeighborYFsDH(level, i, (*sendProcessNeighborDev)[i].memsizeFs);
 
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     startNonBlockingMpiReceive((unsigned int)(*sendProcessNeighborHost).size(), comm, recvProcessNeighborHost);
@@ -276,7 +277,7 @@ void exchangeCollDataYGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMe
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     // copy Host to Device
     for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) {
-        cudaMemoryManager->cudaCopyProcessNeighborYFsHD(level, i, (*recvProcessNeighborDev)[i].memsizeFs, streamIndex);
+        cudaMemoryManager->cudaCopyProcessNeighborYFsHD(level, i, (*recvProcessNeighborDev)[i].memsizeFs);
     }
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 }
@@ -285,61 +286,62 @@ void exchangeCollDataYGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMe
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Z
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-void prepareExchangeCollDataZGPU27AllNodes(Parameter *para, int level, int streamIndex)
+void prepareExchangeCollDataZGPU27AllNodes(Parameter *para, int level, CudaStreamIndex streamIndex)
 {
     collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborZ,
                                 (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")));
 }
 
-void prepareExchangeCollDataZGPU27AfterFtoC(Parameter *para, int level, int streamIndex)
+void prepareExchangeCollDataZGPU27AfterFtoC(Parameter *para, int level, CudaStreamIndex streamIndex)
 {
     collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborsAfterFtoCZ,
                                 (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")));
 }
 
 void exchangeCollDataZGPU27AllNodes(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager,
-                                    int level, int streamIndex)
+                                    int level, CudaStreamIndex streamIndex)
 {
-    exchangeCollDataZGPU27(para, comm, cudaMemoryManager, level, streamIndex, 
+    exchangeCollDataZGPU27(para, comm, cudaMemoryManager, level, streamIndex,
                            &para->getParD(level)->sendProcessNeighborZ,
                            &para->getParD(level)->recvProcessNeighborZ, 
                            &para->getParH(level)->sendProcessNeighborZ,
                            &para->getParH(level)->recvProcessNeighborZ);
 }
 void exchangeCollDataZGPU27AfterFtoC(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager,
-                                     int level, int streamIndex)
+                                     int level, CudaStreamIndex streamIndex)
 {
-    exchangeCollDataZGPU27(para, comm, cudaMemoryManager, level, streamIndex, 
+    exchangeCollDataZGPU27(para, comm, cudaMemoryManager, level, streamIndex,
                            &para->getParD(level)->sendProcessNeighborsAfterFtoCZ,
                            &para->getParD(level)->recvProcessNeighborsAfterFtoCZ, 
                            &para->getParH(level)->sendProcessNeighborsAfterFtoCZ,
                            &para->getParH(level)->recvProcessNeighborsAfterFtoCZ);
 }
 
-void scatterNodesFromRecvBufferZGPU27AllNodes(Parameter *para, int level, int streamIndex)
+void scatterNodesFromRecvBufferZGPU27AllNodes(Parameter *para, int level, CudaStreamIndex streamIndex)
 {
     scatterNodesFromRecvBufferGPU(para, level, streamIndex, &para->getParD(level)->recvProcessNeighborZ,
                                   (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")));
 }
 
-void scatterNodesFromRecvBufferZGPU27AfterFtoC(Parameter *para, int level, int streamIndex)
+void scatterNodesFromRecvBufferZGPU27AfterFtoC(Parameter *para, int level, CudaStreamIndex streamIndex)
 {
     scatterNodesFromRecvBufferGPU(para, level, streamIndex, &para->getParD(level)->recvProcessNeighborsAfterFtoCZ,
                                   (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")));
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-void exchangeCollDataZGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, int level,
-                            int streamIndex, std::vector<ProcessNeighbor27> *sendProcessNeighborDev,
+void exchangeCollDataZGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, int level, 
+                            CudaStreamIndex streamIndex,
+                            std::vector<ProcessNeighbor27> *sendProcessNeighborDev,
                             std::vector<ProcessNeighbor27> *recvProcessNeighborDev,
                             std::vector<ProcessNeighbor27> *sendProcessNeighborHost,
                             std::vector<ProcessNeighbor27> *recvProcessNeighborHost)
 {
-    cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex);
+    cudaStream_t stream = para->getStreamManager()->getStream(streamIndex);
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     // copy Device to Host
     for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
-        cudaMemoryManager->cudaCopyProcessNeighborZFsDH(level, i, (*sendProcessNeighborDev)[i].memsizeFs, streamIndex);
+        cudaMemoryManager->cudaCopyProcessNeighborZFsDH(level, i, (*sendProcessNeighborDev)[i].memsizeFs);
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     startNonBlockingMpiReceive((unsigned int)(*sendProcessNeighborHost).size(), comm, recvProcessNeighborHost);
     /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -386,7 +388,7 @@ void exchangeCollDataZGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMe
     // copy Host to Device
     for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
     {
-        cudaMemoryManager->cudaCopyProcessNeighborZFsHD(level, i, (*recvProcessNeighborDev)[i].memsizeFs, streamIndex);
+        cudaMemoryManager->cudaCopyProcessNeighborZFsHD(level, i, (*recvProcessNeighborDev)[i].memsizeFs);
     }
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 }
diff --git a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.h b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.h
index ec930ebbc06554e948204b74e79e0e25b85f57b5..8302ffdc47bfa012c47df00f90c2491039f4eaee 100644
--- a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.h
+++ b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.h
@@ -6,6 +6,7 @@
 #include "GPU/GPU_Interface.h"
 #include "LBM/LB.h"
 #include "Parameter/Parameter.h"
+#include "Parameter/CudaStreamManager.h"
 
 //! \file ExchangeData27.h
 //! \ingroup GPU
@@ -14,9 +15,9 @@
 
 //////////////////////////////////////////////////////////////////////////
 // 1D domain decomposition
-void exchangePreCollDataGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager,
+void exchangePreCollDataGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, 
                                          int level);
-void exchangePostCollDataGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager,
+void exchangePostCollDataGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, 
                                           int level);
 //////////////////////////////////////////////////////////////////////////
 // 3D domain decomposition
@@ -24,13 +25,13 @@ void exchangePostCollDataGPU27(Parameter *para, vf::gpu::Communicator &comm, Cud
 // functions used for all directions
 
 //! \brief Collect the send nodes in a buffer on the gpu
-void collectNodesInSendBufferGPU(Parameter *para, int level, int streamIndex,
-                                            std::vector<ProcessNeighbor27> *sendProcessNeighbor,
-                                            unsigned int numberOfSendProcessNeighbors);
+void collectNodesInSendBufferGPU(Parameter *para, int level, CudaStreamIndex streamIndex,
+                                 std::vector<ProcessNeighbor27> *sendProcessNeighbor,
+                                 unsigned int numberOfSendProcessNeighbors);
 //! \brief Distribute the receive nodes from the buffer on the gpu
-void scatterNodesFromRecvBufferGPU(Parameter *para, int level, int streamIndex,
-                                              std::vector<ProcessNeighbor27> *recvProcessNeighborDev,
-                                              unsigned int numberOfRecvProcessNeighbors);
+void scatterNodesFromRecvBufferGPU(Parameter *para, int level, CudaStreamIndex streamIndex,
+                                   std::vector<ProcessNeighbor27> *recvProcessNeighborDev,
+                                   unsigned int numberOfRecvProcessNeighbors);
 //! \brief Copy nodes which are part of the communication in multiple directions
 //! \details The nodes are copied from the receive buffer in one direction to the send buffer in another direction. The
 //! copy operation is conducted on the cpu. 
@@ -49,21 +50,20 @@ void copyEdgeNodes(std::vector<LBMSimulationParameter::EdgeNodePositions> &edgeN
 
 //! \brief Collect the send nodes for communication in the x direction in a buffer on the gpu
 //! \details Needed to exchange all nodes, used in the communication after collision step
-void prepareExchangeCollDataXGPU27AllNodes(Parameter *para, int level, int streamIndex);
+void prepareExchangeCollDataXGPU27AllNodes(Parameter *para, int level, CudaStreamIndex streamIndex);
 //! \brief Collect the send nodes for communication in the x direction in a buffer on the gpu
 //! \details Only exchange nodes which are part of the interpolation process on refined grids. This function is used in
 //! the exchange which takes place after the interpolation fine to coarse and before the interpolation coarse to fine.
 //! See [master thesis of Anna Wellmann]
-void prepareExchangeCollDataXGPU27AfterFtoC(Parameter *para, int level, int streamIndex);
+void prepareExchangeCollDataXGPU27AfterFtoC(Parameter *para, int level, CudaStreamIndex streamIndex);
 //! \brief Exchange routine in x direction for simulations on multiple gpus
 //! \details Send and receive the nodes from the communication buffers on the gpus.
 //! \param Communicator is needed for the communication between the processes with mpi
 //! \param CudaMemoryManager is needed for moving the data between host and device
-//! \param streamIndex is the index of a CUDA Stream, which is needed for communication hiding
 //! \param sendProcessNeighborDev, recvProcessNeighborDev, sendProcessNeighborHost, recvProcessNeighborHost are pointers
 //! to the send and receive arrays, both on the device and the host
 void exchangeCollDataXGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager,
-                                       int level, int streamIndex,
+                                       int level, CudaStreamIndex streamIndex,
                                        std::vector<ProcessNeighbor27> *sendProcessNeighborDev,
                                        std::vector<ProcessNeighbor27> *recvProcessNeighborDev,
                                        std::vector<ProcessNeighbor27> *sendProcessNeighborHost,
@@ -71,59 +71,59 @@ void exchangeCollDataXGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMe
 //! \brief Calls exchangeCollDataXGPU27() for exchanging all nodes
 //! \details Used in the communication after collision step
 void exchangeCollDataXGPU27AllNodes(Parameter *para, vf::gpu::Communicator &comm,
-                                               CudaMemoryManager *cudaMemoryManager, int level, int streamIndex);
+                                               CudaMemoryManager *cudaMemoryManager, int level, CudaStreamIndex streamIndex);
 //! \brief Calls exchangeCollDataXGPU27() for exchanging the nodes, which are part of the communication between the two
 //! interpolation processes on refined grids 
 //! \details Only exchange nodes which are part of the interpolation process on
 //! refined grids. This function is used in the exchange which takes place after the interpolation fine to coarse and
 //! before the interpolation coarse to fine. See [master thesis of Anna Wellmann]
 void exchangeCollDataXGPU27AfterFtoC(Parameter *para, vf::gpu::Communicator &comm,
-                                                CudaMemoryManager *cudaMemoryManager, int level, int streamIndex);
+                                                CudaMemoryManager *cudaMemoryManager, int level, CudaStreamIndex streamIndex);
 //! \brief Distribute the receive nodes (x direction) from the buffer on the gpu
 //! \details Needed to exchange all nodes, used in the communication after collision step
-void scatterNodesFromRecvBufferXGPU27AllNodes(Parameter *para, int level, int streamIndex);
+void scatterNodesFromRecvBufferXGPU27AllNodes(Parameter *para, int level, CudaStreamIndex streamIndex);
 //! \brief Distribute the receive nodes (x direction) from the buffer on the gpu
 //! \details Only exchange nodes which are part of the interpolation process on refined grids. This function is used in
 //! the exchange which takes place after the interpolation fine to coarse and before the interpolation coarse to fine.
 //! See [master thesis of Anna Wellmann]
-void scatterNodesFromRecvBufferXGPU27AfterFtoC(Parameter *para, int level, int streamIndex);
+void scatterNodesFromRecvBufferXGPU27AfterFtoC(Parameter *para, int level, CudaStreamIndex streamIndex);
 
 //////////////////////////////////////////////////////////////////////////
 // y
 
-void prepareExchangeCollDataYGPU27AllNodes(Parameter *para, int level, int streamIndex);
-void prepareExchangeCollDataYGPU27AfterFtoC(Parameter *para, int level, int streamIndex);
+void prepareExchangeCollDataYGPU27AllNodes(Parameter *para, int level, CudaStreamIndex streamIndex);
+void prepareExchangeCollDataYGPU27AfterFtoC(Parameter *para, int level, CudaStreamIndex streamIndex);
 
 void exchangeCollDataYGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager,
-                                       int level, int streamIndex,
+                                       int level, CudaStreamIndex streamIndex,
                                        std::vector<ProcessNeighbor27> *sendProcessNeighborDev,
                                        std::vector<ProcessNeighbor27> *recvProcessNeighborDev,
                                        std::vector<ProcessNeighbor27> *sendProcessNeighborHost,
-                                       std::vector<ProcessNeighbor27> *recvProcessNeighborHos);
+                                       std::vector<ProcessNeighbor27> *recvProcessNeighborHost);
 void exchangeCollDataYGPU27AllNodes(Parameter *para, vf::gpu::Communicator &comm,
-                                               CudaMemoryManager *cudaMemoryManager, int level, int streamIndex);
+                                               CudaMemoryManager *cudaMemoryManager, int level, CudaStreamIndex streamIndex);
 void exchangeCollDataYGPU27AfterFtoC(Parameter *para, vf::gpu::Communicator &comm,
-                                                CudaMemoryManager *cudaMemoryManager, int level, int streamIndex);
-void scatterNodesFromRecvBufferYGPU27AllNodes(Parameter *para, int level, int streamIndex);
-void scatterNodesFromRecvBufferYGPU27AfterFtoC(Parameter *para, int level, int streamIndex);
+                                                CudaMemoryManager *cudaMemoryManager, int level, CudaStreamIndex streamIndex);
+void scatterNodesFromRecvBufferYGPU27AllNodes(Parameter *para, int level, CudaStreamIndex streamIndex);
+void scatterNodesFromRecvBufferYGPU27AfterFtoC(Parameter *para, int level, CudaStreamIndex streamIndex);
 
 // z
-void prepareExchangeCollDataZGPU27AllNodes(Parameter *para, int level, int streamIndex);
-void prepareExchangeCollDataZGPU27AfterFtoC(Parameter *para, int level, int streamIndex);
+void prepareExchangeCollDataZGPU27AllNodes(Parameter *para, int level, CudaStreamIndex streamIndex);
+void prepareExchangeCollDataZGPU27AfterFtoC(Parameter *para, int level, CudaStreamIndex streamIndex);
 
 void exchangeCollDataZGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager,
-                                       int level, int streamIndex,
+                                       int level, CudaStreamIndex streamIndex,
                                        std::vector<ProcessNeighbor27> *sendProcessNeighborDev,
                                        std::vector<ProcessNeighbor27> *recvProcessNeighborDev,
                                        std::vector<ProcessNeighbor27> *sendProcessNeighborHost,
                                        std::vector<ProcessNeighbor27> *recvProcessNeighborHost);
 void exchangeCollDataZGPU27AllNodes(Parameter *para, vf::gpu::Communicator &comm,
-                                               CudaMemoryManager *cudaMemoryManager, int level, int streamIndex);
+                                               CudaMemoryManager *cudaMemoryManager, int level, CudaStreamIndex streamIndex);
 void exchangeCollDataZGPU27AfterFtoC(Parameter *para, vf::gpu::Communicator &comm,
-                                                CudaMemoryManager *cudaMemoryManager, int level, int streamIndex);
+                                                CudaMemoryManager *cudaMemoryManager, int level, CudaStreamIndex streamIndex);
 
-void scatterNodesFromRecvBufferZGPU27AllNodes(Parameter *para, int level, int streamIndex);
-void scatterNodesFromRecvBufferZGPU27AfterFtoC(Parameter *para, int level, int streamIndex);
+void scatterNodesFromRecvBufferZGPU27AllNodes(Parameter *para, int level, CudaStreamIndex streamIndex);
+void scatterNodesFromRecvBufferZGPU27AfterFtoC(Parameter *para, int level, CudaStreamIndex streamIndex);
 
 //////////////////////////////////////////////////////////////////////////
 // 3D domain decomposition convection diffusion
diff --git a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridProvider.cpp b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridProvider.cpp
index e197fb5c28611e77406b30ab39aa6af2f54b9ef5..3b511264e9c7edc80bbe367cac4a9b6d8725674b 100644
--- a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridProvider.cpp
+++ b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridProvider.cpp
@@ -19,34 +19,29 @@ std::shared_ptr<GridProvider> GridProvider::makeGridReader(FILEFORMAT format, st
     return std::shared_ptr<GridProvider>(new GridReader(format, para, cudaMemoryManager));
 }
 
-void GridProvider::setNumberOfNodes(const int numberOfNodes, const int level) const
+void GridProvider::setNumberOfNodes(uint numberOfNodes, int level) const
 {
-    para->getParH(level)->numberOfNodes = numberOfNodes;
-    para->getParD(level)->numberOfNodes = numberOfNodes;
-    para->getParH(level)->mem_size_real_SP = sizeof(real) * para->getParH(level)->numberOfNodes;
-    para->getParH(level)->mem_size_int_SP = sizeof(uint) * para->getParH(level)->numberOfNodes;
-    para->getParD(level)->mem_size_real_SP = sizeof(real) * para->getParD(level)->numberOfNodes;
-    para->getParD(level)->mem_size_int_SP = sizeof(uint) * para->getParD(level)->numberOfNodes;
+    para->getParH(level)->numberOfNodes          = (unsigned long long)numberOfNodes;
+    para->getParD(level)->numberOfNodes          = (unsigned long long)numberOfNodes;
+    para->getParH(level)->memSizeRealLBnodes     = sizeof(real) * para->getParH(level)->numberOfNodes;
+    para->getParD(level)->memSizeRealLBnodes     = sizeof(real) * para->getParD(level)->numberOfNodes;
+    para->getParH(level)->memSizeLonglongLBnodes = sizeof(unsigned long long) * para->getParH(level)->numberOfNodes;
+    para->getParD(level)->memSizeLonglongLBnodes = sizeof(unsigned long long) * para->getParD(level)->numberOfNodes;
 }
 
-void GridProvider::setNumberOfFluidNodes(const int numberOfNodes, const int level) const
+void GridProvider::setNumberOfTaggedFluidNodes(uint numberOfNodes, CollisionTemplate tag, int level) const
 {
-    para->getParH(level)->numberOfFluidNodes = numberOfNodes;
-    para->getParD(level)->numberOfFluidNodes = numberOfNodes;
+    para->getParH(level)->numberOfTaggedFluidNodes[tag] = numberOfNodes;
+    para->getParD(level)->numberOfTaggedFluidNodes[tag] = numberOfNodes;
 }
 
-void GridProvider::setNumberOfFluidNodesBorder(const int numberOfNodes, const int level) const {
-    para->getParH(level)->numberOfFluidNodesBorder = numberOfNodes;
-    para->getParD(level)->numberOfFluidNodesBorder = numberOfNodes;
-}
-
-void GridProvider::setInitalNodeValues(const int numberOfNodes, const int level) const
+void GridProvider::setInitalNodeValues(uint numberOfNodes, int level) const
 {
-    for (int j = 1; j <= numberOfNodes; j++)
+    for (uint pos = 1; pos <= numberOfNodes; pos++)
     {
-        const real coordX = para->getParH(level)->coordinateX[j];
-        const real coordY = para->getParH(level)->coordinateY[j];
-        const real coordZ = para->getParH(level)->coordinateZ[j];
+        const real coordX = para->getParH(level)->coordinateX[pos];
+        const real coordY = para->getParH(level)->coordinateY[pos];
+        const real coordZ = para->getParH(level)->coordinateZ[pos];
 
         real rho, vx, vy, vz;
 
@@ -63,40 +58,40 @@ void GridProvider::setInitalNodeValues(const int numberOfNodes, const int level)
             vz  = real(0.0);
         }
 
-        para->getParH(level)->rho[j] = rho; 
-        para->getParH(level)->velocityX[j]  = vx; 
-        para->getParH(level)->velocityY[j]  = vy;
-        para->getParH(level)->velocityZ[j]  = vz; 
+        para->getParH(level)->rho[pos] = rho; 
+        para->getParH(level)->velocityX[pos]  = vx; 
+        para->getParH(level)->velocityY[pos]  = vy;
+        para->getParH(level)->velocityZ[pos]  = vz; 
 
         //////////////////////////////////////////////////////////////////////////
 
         if (para->getCalcMedian()) {
-            para->getParH(level)->vx_SP_Med[j] = 0.0f;
-            para->getParH(level)->vy_SP_Med[j] = 0.0f;
-            para->getParH(level)->vz_SP_Med[j] = 0.0f;
-            para->getParH(level)->rho_SP_Med[j] = 0.0f;
-            para->getParH(level)->press_SP_Med[j] = 0.0f;
+            para->getParH(level)->vx_SP_Med[pos] = 0.0f;
+            para->getParH(level)->vy_SP_Med[pos] = 0.0f;
+            para->getParH(level)->vz_SP_Med[pos] = 0.0f;
+            para->getParH(level)->rho_SP_Med[pos] = 0.0f;
+            para->getParH(level)->press_SP_Med[pos] = 0.0f;
         }
         if (para->getUseWale()) {
-            para->getParH(level)->turbViscosity[j] = 0.0f;
+            para->getParH(level)->turbViscosity[pos] = 0.0f;
             //Debug
-            para->getParH(level)->gSij[j] = 0.0f;
-            para->getParH(level)->gSDij[j] = 0.0f;
-            para->getParH(level)->gDxvx[j] = 0.0f;
-            para->getParH(level)->gDyvx[j] = 0.0f;
-            para->getParH(level)->gDzvx[j] = 0.0f;
-            para->getParH(level)->gDxvy[j] = 0.0f;
-            para->getParH(level)->gDyvy[j] = 0.0f;
-            para->getParH(level)->gDzvy[j] = 0.0f;
-            para->getParH(level)->gDxvz[j] = 0.0f;
-            para->getParH(level)->gDyvz[j] = 0.0f;
-            para->getParH(level)->gDzvz[j] = 0.0f;
+            para->getParH(level)->gSij[pos] = 0.0f;
+            para->getParH(level)->gSDij[pos] = 0.0f;
+            para->getParH(level)->gDxvx[pos] = 0.0f;
+            para->getParH(level)->gDyvx[pos] = 0.0f;
+            para->getParH(level)->gDzvx[pos] = 0.0f;
+            para->getParH(level)->gDxvy[pos] = 0.0f;
+            para->getParH(level)->gDyvy[pos] = 0.0f;
+            para->getParH(level)->gDzvy[pos] = 0.0f;
+            para->getParH(level)->gDxvz[pos] = 0.0f;
+            para->getParH(level)->gDyvz[pos] = 0.0f;
+            para->getParH(level)->gDzvz[pos] = 0.0f;
         }
 
         if (para->getIsBodyForce()) {
-            para->getParH(level)->forceX_SP[j] = 0.0f;
-            para->getParH(level)->forceY_SP[j] = 0.0f;
-            para->getParH(level)->forceZ_SP[j] = 0.0f;
+            para->getParH(level)->forceX_SP[pos] = 0.0f;
+            para->getParH(level)->forceY_SP[pos] = 0.0f;
+            para->getParH(level)->forceZ_SP[pos] = 0.0f;
         }
     }
 
diff --git a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridProvider.h b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridProvider.h
index 5fc5826735643ec748da169160e782004d7e5fb7..007db1e0d8e27b3810aa38c089bae8069bbe5813 100644
--- a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridProvider.h
+++ b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridProvider.h
@@ -5,7 +5,7 @@
 #include <vector>
 #include <memory>
 
-
+#include "LBM/LB.h"
 #include "PointerDefinitions.h"
 #include "VirtualFluids_GPU_export.h"
 #include "gpu/GridGenerator/io/SimulationFileWriter/SimulationFileWriter.h"
@@ -24,34 +24,35 @@ public:
     static std::shared_ptr<GridProvider> makeGridGenerator(std::shared_ptr<GridBuilder> builder, std::shared_ptr<Parameter> para, std::shared_ptr<CudaMemoryManager> cudaMemoryManager, vf::gpu::Communicator& communicator);
     static std::shared_ptr<GridProvider> makeGridReader(FILEFORMAT format, std::shared_ptr<Parameter> para, std::shared_ptr<CudaMemoryManager> cudaMemoryManager);
 
-	virtual void allocArrays_CoordNeighborGeo() = 0;
-	virtual void allocArrays_BoundaryValues() = 0;
-	virtual void allocArrays_BoundaryQs() = 0;
+    virtual void allocArrays_CoordNeighborGeo() = 0;
+    virtual void allocArrays_BoundaryValues() = 0;
+    virtual void allocArrays_BoundaryQs() = 0;
     virtual void allocArrays_OffsetScale() = 0;
-    virtual void allocArrays_fluidNodeIndices() = 0;
-    virtual void allocArrays_fluidNodeIndicesBorder() = 0;
+    virtual void allocArrays_taggedFluidNodes() = 0;
+
+    virtual void tagFluidNodeIndices(const std::vector<uint>& taggedFluidNodeIndices, CollisionTemplate tag, uint level) = 0;
+    virtual void sortFluidNodeTags() = 0;
 
-	virtual void setDimensions() = 0;
-	virtual void setBoundingBox() = 0;
-	virtual void initPeriodicNeigh(std::vector<std::vector<std::vector<unsigned int> > > periodV, std::vector<std::vector<unsigned int> > periodIndex, std::string way) = 0;
+    virtual void setDimensions() = 0;
+    virtual void setBoundingBox() = 0;
+    virtual void initPeriodicNeigh(std::vector<std::vector<std::vector<unsigned int> > > periodV, std::vector<std::vector<unsigned int> > periodIndex, std::string way) = 0;
 
     virtual void allocAndCopyForcing();
     virtual void allocAndCopyQuadricLimiters();
     virtual void freeMemoryOnHost();
     virtual void cudaCopyDataToHost(int level);
 
-	virtual ~GridProvider() = default;
+    virtual ~GridProvider() = default;
     virtual void initalGridInformations() = 0;
 
 protected:
-	void setNumberOfNodes(const int numberOfNodes, const int level) const;
-    void setNumberOfFluidNodes(const int numberOfNodes, const int level) const;
-    void setNumberOfFluidNodesBorder(const int numberOfNodes, const int level) const;
-    virtual void setInitalNodeValues(const int numberOfNodes, const int level) const;
-
-	void setPressSizePerLevel(int level, int sizePerLevel) const;
-	void setVelocitySizePerLevel(int level, int sizePerLevel) const;
-	void setOutflowSizePerLevel(int level, int sizePerLevel) const;
+    void setNumberOfNodes(uint numberOfNodes, int level) const;
+    void setNumberOfTaggedFluidNodes(uint numberOfNodes, CollisionTemplate tag, int level) const;
+    virtual void setInitalNodeValues(uint numberOfNodes, int level) const;
+
+    void setPressSizePerLevel(int level, int sizePerLevel) const;
+    void setVelocitySizePerLevel(int level, int sizePerLevel) const;
+    void setOutflowSizePerLevel(int level, int sizePerLevel) const;
 
     std::shared_ptr<Parameter> para;
     std::shared_ptr<CudaMemoryManager> cudaMemoryManager;
diff --git a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderFiles/GridReader.cpp b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderFiles/GridReader.cpp
index fa432a1d9c3922b88e93588548db74083275ef1e..a1c8554cc4e262e9f1eca4204aed4ffcfd4c3a87 100644
--- a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderFiles/GridReader.cpp
+++ b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderFiles/GridReader.cpp
@@ -47,7 +47,7 @@ bool GridReader::getBinaer()
 
 void rearrangeGeometry(Parameter* para, int lev)
 {
-    for (uint index = 0; index < para->getParH(lev)->numberOfNodes; index++)
+    for (size_t index = 0; index < para->getParH(lev)->numberOfNodes; index++)
     {
         if (para->getParH(lev)->typeOfGridNode[index] == GEO_FLUID_OLD)
         {
@@ -74,11 +74,11 @@ void GridReader::allocArrays_CoordNeighborGeo()
 	uint numberOfNodesGlobal = 0;
 	std::cout << "Number of Nodes: " << std::endl;
 
-	for (uint level = 0; level <= maxLevel; level++) 
-	{		
-		int numberOfNodesPerLevel = coordX.getSize(level) + 1;
-		numberOfNodesGlobal += numberOfNodesPerLevel;
-		std::cout << "Level " << level << " = " << numberOfNodesPerLevel << " Nodes" << std::endl;
+    for (uint level = 0; level <= maxLevel; level++)
+    {
+        const uint numberOfNodesPerLevel = coordX.getSize(level) + 1;
+        numberOfNodesGlobal += numberOfNodesPerLevel;
+        std::cout << "Level " << level << " = " << numberOfNodesPerLevel << " Nodes" << std::endl;
 
 		setNumberOfNodes(numberOfNodesPerLevel, level);
 
@@ -130,9 +130,9 @@ void GridReader::allocArrays_BoundaryValues()
 
     for (uint i = 0; i < channelBoundaryConditions.size(); i++)
     {
-        if (     this->channelBoundaryConditions[i] == "velocity") { fillVelocityVectors(i); } 
-		else if (this->channelBoundaryConditions[i] == "pressure") { setPressureValues(i); } 
-		else if (this->channelBoundaryConditions[i] == "outflow")  { setOutflowValues(i);  }
+        if (     this->channelBoundaryConditions[i] == "velocity") { fillVelocityVectors(i); }
+        else if (this->channelBoundaryConditions[i] == "pressure") { setPressureValues(i); }
+        else if (this->channelBoundaryConditions[i] == "outflow")  { setOutflowValues(i);  }
     }
 
 	setVelocityValues();
@@ -218,16 +218,20 @@ void GridReader::allocArrays_OffsetScale()
     std::cout << "-----Ende OffsetScale------" << std::endl;
 }
 
-void GridReader::allocArrays_fluidNodeIndices() {
-    std::cout << "GridReader::allocArrays_fluidNodeIndices not implemented" << std::endl;
+void GridReader::allocArrays_taggedFluidNodes() {
+    std::cout << "GridReader::allocArrays_taggedFluidNodes not implemented" << std::endl;
 	// TODO
 }
 
-void GridReader::allocArrays_fluidNodeIndicesBorder() {
-    std::cout << "GridReader::allocArrays_fluidNodeIndicesBorder not implemented" << std::endl;
+void GridReader::tagFluidNodeIndices(const std::vector<uint>& taggedFluidNodeIndices, CollisionTemplate tag, uint level){
+    std::cout << "GridReader::tagFluidNodeIndices not implemented" << std::endl;
     // TODO
 }
 
+void GridReader::sortFluidNodeTags(){
+    std::cout << "GridReader::sortFluidNodeTags not implemented" << std::endl;
+    // TODO
+}
 
 void GridReader::setPressureValues(int channelSide) const
 {
@@ -281,23 +285,23 @@ void GridReader::fillVelocityVectors(int channelSide)
 			delete[] veloX_ValuesPerSide;
             delete[] veloY_ValuesPerSide;
             delete[] veloZ_ValuesPerSide;
-        }        
-	}
+        }
+    }
 
 
 }
 
-void GridReader::setVelocityValues() { 
+void GridReader::setVelocityValues() {
     for (int level = 0; level < (int)(velocityX_BCvalues.size()); level++) {
-        
-		int sizePerLevel = (int) velocityX_BCvalues[level].size();
+
+        int sizePerLevel = (int) velocityX_BCvalues[level].size();
         std::cout << "complete size velocity level " << level << " : " << sizePerLevel << std::endl;
         setVelocitySizePerLevel(level, sizePerLevel);
-        
-		if (sizePerLevel > 1) {
+
+        if (sizePerLevel > 1) {
             cudaMemoryManager->cudaAllocVeloBC(level);
             setVelocity(level, sizePerLevel);
-			cudaMemoryManager->cudaCopyVeloBC(level);
+            cudaMemoryManager->cudaCopyVeloBC(level);
         }
     }
 }
@@ -668,8 +672,8 @@ void GridReader::modifyQElement(std::shared_ptr<BoundaryQs> boundaryQ, unsigned
 /*------------------------------------------------------------------------------------------------*/
 /*---------------------------------------private q methods----------------------------------------*/
 /*------------------------------------------------------------------------------------------------*/
-void GridReader::initalVectorForQStruct(std::vector<std::vector<std::vector<real>>> &Qs, std::vector<std::vector<int>> &index, 
-										std::shared_ptr<BoundaryQs> boundaryQ, unsigned int level) const
+void GridReader::initalVectorForQStruct(std::vector<std::vector<std::vector<real>>> &Qs, std::vector<std::vector<int>> &index,
+                                        std::shared_ptr<BoundaryQs> boundaryQ, unsigned int level) const
 {
     boundaryQ->setValuesInVector(Qs, level);
     boundaryQ->setIndexInVector(index, level);
@@ -685,7 +689,7 @@ void GridReader::copyVectorsToQStruct(std::vector<std::vector<real>> &Qs,
 
 	for (int direction = 0; direction < para->getD3Qxx(); direction++) {
         for (size_t indexQ = 0; indexQ < sizeOfValues; indexQ++) {
-            qTemp.q27[direction][indexQ] = Qs[direction][indexQ]; 
+            qTemp.q27[direction][indexQ] = Qs[direction][indexQ];
         }
     }
 
@@ -847,46 +851,46 @@ void GridReader::setBoundingBox()
 
 void GridReader::initPeriodicNeigh(std::vector<std::vector<std::vector<unsigned int> > > periodV, std::vector<std::vector<unsigned int> > periodIndex,  std::string boundaryCondition)
 {
-	std::vector<unsigned int>neighVec;
-	std::vector<unsigned int>indexVec;
-	
-	int counter = 0;
-
-	for(unsigned int i=0; i<neighX->getLevel();i++) {
-		if(boundaryCondition =="periodic_y"){
-			neighVec = neighY->getVec(i);
-		} 
-		else if(boundaryCondition =="periodic_x"){
-			neighVec = neighX->getVec(i);
-		}
-		else if(boundaryCondition =="periodic_z"){
-			neighVec = neighZ->getVec(i);
-		}
-		else {
-			std::cout << "wrong String in periodicValue" << std::endl;
-			exit(1);
-		}
+    std::vector<unsigned int>neighVec;
+    std::vector<unsigned int>indexVec;
 
-		for (std::vector<unsigned int>::iterator it = periodIndex[i].begin(); it != periodIndex[i].end(); it++) {
-			if(periodV[i][0][counter] != 0) {
-				neighVec[*it]=periodV[i][0][counter];
-			}
+    int counter = 0;
 
-			counter++;
-		}
+    for(unsigned int i=0; i<neighX->getLevel();i++) {
+        if(boundaryCondition =="periodic_y"){
+            neighVec = neighY->getVec(i);
+        }
+        else if(boundaryCondition =="periodic_x"){
+            neighVec = neighX->getVec(i);
+        }
+        else if(boundaryCondition =="periodic_z"){
+            neighVec = neighZ->getVec(i);
+        }
+        else {
+            std::cout << "wrong String in periodicValue" << std::endl;
+            exit(1);
+        }
 
+        for (std::vector<unsigned int>::iterator it = periodIndex[i].begin(); it != periodIndex[i].end(); it++) {
+            if(periodV[i][0][counter] != 0) {
+                neighVec[*it]=periodV[i][0][counter];
+            }
 
-		if(boundaryCondition =="periodic_y"){
-			neighY->setVec(i, neighVec);
-		} 
-		else if(boundaryCondition =="periodic_x"){
-			neighX->setVec(i, neighVec);
-		}
-		else if(boundaryCondition =="periodic_z"){
-			neighZ->setVec(i, neighVec);
-		}
+            counter++;
+        }
 
-	}
+
+        if(boundaryCondition =="periodic_y"){
+            neighY->setVec(i, neighVec);
+        }
+        else if(boundaryCondition =="periodic_x"){
+            neighX->setVec(i, neighVec);
+        }
+        else if(boundaryCondition =="periodic_z"){
+            neighZ->setVec(i, neighVec);
+        }
+
+    }
 }
 
 void GridReader::makeReader(std::shared_ptr<Parameter> para)
@@ -917,9 +921,9 @@ void GridReader::makeReader(std::vector<std::shared_ptr<BoundaryQs> > &BC_Qs, st
 
 void GridReader::setChannelBoundaryCondition()
 {
-	for (std::size_t i = 0; i < channelDirections.size(); i++)
-	{
-		this->channelBoundaryConditions[i] = BC_Values[i]->getBoundaryCondition();
-		std::cout << this->channelDirections[i] << " Boundary: " << channelBoundaryConditions[i] << std::endl;
-	}
-}
\ No newline at end of file
+    for (std::size_t i = 0; i < channelDirections.size(); i++)
+    {
+        this->channelBoundaryConditions[i] = BC_Values[i]->getBoundaryCondition();
+        std::cout << this->channelDirections[i] << " Boundary: " << channelBoundaryConditions[i] << std::endl;
+    }
+}
diff --git a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderFiles/GridReader.h b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderFiles/GridReader.h
index 18efb6a7885191312ea4e2fbb22eb45162ab1de1..041d2c3ce94592f792c5a850eebd14c07f4db1b4 100644
--- a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderFiles/GridReader.h
+++ b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderFiles/GridReader.h
@@ -3,9 +3,9 @@
 
 #include "../GridProvider.h"
 
-#include <vector>
-#include <string>
 #include <memory>
+#include <string>
+#include <vector>
 
 #include "LBM/LB.h"
 
@@ -16,15 +16,14 @@ class BoundaryValues;
 class BoundaryQs;
 class CoordNeighborGeoV;
 
-class VIRTUALFLUIDS_GPU_EXPORT GridReader
-	: public GridProvider
+class VIRTUALFLUIDS_GPU_EXPORT GridReader : public GridProvider
 {
 private:
-	bool binaer;
-	std::vector<std::string> channelDirections;
-	std::vector<std::string> channelBoundaryConditions;
-	std::shared_ptr<CoordNeighborGeoV> neighX, neighY, neighZ, neighWSB;
-	std::vector<std::shared_ptr<BoundaryValues> > BC_Values;
+    bool binaer;
+    std::vector<std::string> channelDirections;
+    std::vector<std::string> channelBoundaryConditions;
+    std::shared_ptr<CoordNeighborGeoV> neighX, neighY, neighZ, neighWSB;
+    std::vector<std::shared_ptr<BoundaryValues>> BC_Values;
 
     std::vector<std::vector<real>> velocityX_BCvalues, velocityY_BCvalues, velocityZ_BCvalues;
     std::vector<std::vector<std::vector<real>>> velocityQs;
@@ -34,57 +33,62 @@ private:
     std::vector<std::vector<real>> outflowBCvalues;
 
 public:
-	GridReader(FILEFORMAT format, std::shared_ptr<Parameter> para, std::shared_ptr<CudaMemoryManager> cudaMemoryManager);
-    ~GridReader();
-	void allocArrays_CoordNeighborGeo() override;
-	void allocArrays_BoundaryValues() override;
+    GridReader(FILEFORMAT format, std::shared_ptr<Parameter> para,
+               std::shared_ptr<CudaMemoryManager> cudaMemoryManager);
+    ~GridReader() override;
+    void allocArrays_CoordNeighborGeo() override;
+    void allocArrays_BoundaryValues() override;
     void allocArrays_OffsetScale() override;
-    void allocArrays_fluidNodeIndices() override;
-    void allocArrays_fluidNodeIndicesBorder() override;
+    void allocArrays_taggedFluidNodes() override;
 
-	void initalValuesDomainDecompostion(int level);
+    void tagFluidNodeIndices(const std::vector<uint> &taggedFluidNodeIndices, CollisionTemplate tag, uint level) override;
 
-	void setChannelBoundaryCondition();
+    void sortFluidNodeTags() override;
 
-	void allocArrays_BoundaryQs() override;
-	bool getBinaer();
-	void setDimensions() override;
-	void setBoundingBox() override;
-	void initPeriodicNeigh(std::vector<std::vector<std::vector<unsigned int> > > periodV, std::vector<std::vector<unsigned int> > periodIndex, std::string way) override;
+    void initalValuesDomainDecompostion(int level);
+
+    void setChannelBoundaryCondition();
+
+    void allocArrays_BoundaryQs() override;
+    bool getBinaer();
+    void setDimensions() override;
+    void setBoundingBox() override;
+    void initPeriodicNeigh(std::vector<std::vector<std::vector<unsigned int>>> periodV,
+                           std::vector<std::vector<unsigned int>> periodIndex, std::string way) override;
 
 private:
-	void makeReader(std::shared_ptr<Parameter> para);
-	void makeReader(std::vector<std::shared_ptr<BoundaryQs> > &BC_Qs, std::shared_ptr<Parameter> para);
+    void makeReader(std::shared_ptr<Parameter> para);
+    void makeReader(std::vector<std::shared_ptr<BoundaryQs>> &BC_Qs, std::shared_ptr<Parameter> para);
 
-	void setPressureValues(int channelSide) const;
-	void setPressRhoBC(int sizePerLevel, int level, int channelSide) const;
+    void setPressureValues(int channelSide) const;
+    void setPressRhoBC(int sizePerLevel, int level, int channelSide) const;
 
-	void fillVelocityVectors(int channelSide);
+    void fillVelocityVectors(int channelSide);
     void setVelocityValues();
-	void setVelocity(int level, int sizePerLevel) const;
+    void setVelocity(int level, int sizePerLevel) const;
 
-	void setOutflowValues(int channelSide) const;
-	void setOutflow(int level, int sizePerLevel, int channelSide) const;
+    void setOutflowValues(int channelSide) const;
+    void setOutflow(int level, int sizePerLevel, int channelSide) const;
 
-
-	//void fillVelocityQVectors(int channelSide);
+    // void fillVelocityQVectors(int channelSide);
     void setPressQs(std::shared_ptr<BoundaryQs> boundaryQ) const;
-	void setVelocityQs(std::shared_ptr<BoundaryQs> boundaryQ);
-	void setOutflowQs(std::shared_ptr<BoundaryQs> boundaryQ) const;
-	void setNoSlipQs(std::shared_ptr<BoundaryQs> boundaryQ) const;
-	void setGeoQs(std::shared_ptr<BoundaryQs> boundaryQ) const;
-	void modifyQElement(std::shared_ptr<BoundaryQs> boundaryQ, unsigned int level) const;
+    void setVelocityQs(std::shared_ptr<BoundaryQs> boundaryQ);
+    void setOutflowQs(std::shared_ptr<BoundaryQs> boundaryQ) const;
+    void setNoSlipQs(std::shared_ptr<BoundaryQs> boundaryQ) const;
+    void setGeoQs(std::shared_ptr<BoundaryQs> boundaryQ) const;
+    void modifyQElement(std::shared_ptr<BoundaryQs> boundaryQ, unsigned int level) const;
 
-	void initalVectorForQStruct(std::vector<std::vector<std::vector<real>>> &Qs, std::vector<std::vector<int>> &index,
+    void initalVectorForQStruct(std::vector<std::vector<std::vector<real>>> &Qs, std::vector<std::vector<int>> &index,
                                 std::shared_ptr<BoundaryQs> boundaryQ, unsigned int level) const;
     void copyVectorsToQStruct(std::vector<std::vector<real>> &Qs, std::vector<int> &index,
                               QforBoundaryConditions &Q) const;
     void initalQStruct(QforBoundaryConditions &Q, std::shared_ptr<BoundaryQs> boundaryQ, unsigned int level) const;
-	void printQSize(std::string bc, std::shared_ptr<BoundaryQs> boundaryQ, unsigned int level) const;
-	void setSizeNoSlip(std::shared_ptr<BoundaryQs> boundaryQ, unsigned int level) const;
-	void setSizeGeoQs(std::shared_ptr<BoundaryQs> boundaryQ, unsigned int level) const;
-	void setQ27Size(QforBoundaryConditions &Q, real* QQ, unsigned int sizeQ) const;
-	bool hasQs(std::shared_ptr<BoundaryQs> boundaryQ, unsigned int level) const;
+    void printQSize(std::string bc, std::shared_ptr<BoundaryQs> boundaryQ, unsigned int level) const;
+    void setSizeNoSlip(std::shared_ptr<BoundaryQs> boundaryQ, unsigned int level) const;
+    void setSizeGeoQs(std::shared_ptr<BoundaryQs> boundaryQ, unsigned int level) const;
+    void setQ27Size(QforBoundaryConditions &Q, real *QQ, unsigned int sizeQ) const;
+    bool hasQs(std::shared_ptr<BoundaryQs> boundaryQ, unsigned int level) const;
+
 public:
     void initalGridInformations() override;
 };
diff --git a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.cpp b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.cpp
index 7f61b4357276f38d8fde71489dcf60348b402941..38a7eef7e356e2f2da4c1a819d8375035a37313a 100644
--- a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.cpp
+++ b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.cpp
@@ -1,5 +1,6 @@
 #include "GridGenerator.h"
 
+#include "LBM/LB.h"
 #include "Parameter/Parameter.h"
 #include "GridGenerator/grid/GridBuilder/GridBuilder.h"
 #include "GPU/CudaMemoryManager.h"
@@ -10,19 +11,24 @@
 #include <algorithm>
 #include "utilities/math/Math.h"
 #include "Output/QDebugWriter.hpp"
+#include "GridGenerator/TransientBCSetter/TransientBCSetter.h"
 
 #include "utilities/communication.h"
 #include "Communication/Communicator.h"
 
+#include <logger/Logger.h>
+
 using namespace vf::lbm::dir;
 
-GridGenerator::GridGenerator(std::shared_ptr<GridBuilder> builder, std::shared_ptr<Parameter> para, std::shared_ptr<CudaMemoryManager> cudaMemoryManager, vf::gpu::Communicator& communicator):
-    mpiProcessID(communicator.getPID()), builder(builder)
+GridGenerator::GridGenerator(std::shared_ptr<GridBuilder> builder, std::shared_ptr<Parameter> para,
+                             std::shared_ptr<CudaMemoryManager> cudaMemoryManager, vf::gpu::Communicator &communicator)
+    : mpiProcessID(communicator.getPID()), builder(builder)
 {
     this->para = para;
     this->cudaMemoryManager = cudaMemoryManager;
     this->indexRearrangement = std::make_unique<IndexRearrangementForStreams>(para, builder, communicator);
-    this->interpolationGrouper = std::make_unique<InterpolationCellGrouper>(para->getParHallLevels(), para->getParDallLevels(), builder);
+    this->interpolationGrouper =
+        std::make_unique<InterpolationCellGrouper>(para->getParHallLevels(), para->getParDallLevels(), builder);
 }
 
 GridGenerator::~GridGenerator() = default;
@@ -55,15 +61,15 @@ void GridGenerator::allocArrays_CoordNeighborGeo()
     std::cout << "Number of Level: " << numberOfLevels << std::endl;
     int numberOfNodesGlobal = 0;
     std::cout << "Number of Nodes: " << std::endl;
-    
-    for (uint level = 0; level < numberOfLevels; level++) 
+
+    for (uint level = 0; level < numberOfLevels; level++)
     {
-        const int numberOfNodesPerLevel = builder->getNumberOfNodes(level) + 1;
+        const uint numberOfNodesPerLevel = builder->getNumberOfNodes(level) + 1;
         numberOfNodesGlobal += numberOfNodesPerLevel;
         std::cout << "Level " << level << " = " << numberOfNodesPerLevel << " Nodes" << std::endl;
-    
+
         setNumberOfNodes(numberOfNodesPerLevel, level);
-    
+
         cudaMemoryManager->cudaAllocCoord(level);
         cudaMemoryManager->cudaAllocSP(level);
         //cudaMemoryManager->cudaAllocF3SP(level);
@@ -71,7 +77,7 @@ void GridGenerator::allocArrays_CoordNeighborGeo()
 
         if(para->getUseTurbulentViscosity())
             cudaMemoryManager->cudaAllocTurbulentViscosity(level);
-        
+
         if(para->getIsBodyForce())
             cudaMemoryManager->cudaAllocBodyForce(level);
 
@@ -100,28 +106,104 @@ void GridGenerator::allocArrays_CoordNeighborGeo()
     std::cout << "-----finish Coord, Neighbor, Geo------" << std::endl;
 }
 
-void GridGenerator::allocArrays_fluidNodeIndices() {
-    for (uint level = 0; level < builder->getNumberOfGridLevels(); level++) {
-        setNumberOfFluidNodes(builder->getNumberOfFluidNodes(level), level);
-        cudaMemoryManager->cudaAllocFluidNodeIndices(level);
-        builder->getFluidNodeIndices(para->getParH(level)->fluidNodeIndices, level);
-        cudaMemoryManager->cudaCopyFluidNodeIndices(level);
-    }    
+void GridGenerator::allocArrays_taggedFluidNodes() {
+    // Per grid level and per collision tag: set the node count from the builder, allocate the index array, fill it from the builder and copy it via cudaMemoryManager.
+    for (uint level = 0; level < builder->getNumberOfGridLevels(); level++)
+    {
+        for ( CollisionTemplate tag: all_CollisionTemplate )
+        {   //TODO: Need to add CollisionTemplate to GridBuilder to allow it as an argument and get rid of the individual get functions for fluid node indices... and clean up this mess
+            switch(tag)
+            {
+                case CollisionTemplate::Default:
+                    this->setNumberOfTaggedFluidNodes(builder->getNumberOfFluidNodes(level), CollisionTemplate::Default, level);
+                    cudaMemoryManager->cudaAllocTaggedFluidNodeIndices(CollisionTemplate::Default, level);
+                    builder->getFluidNodeIndices(para->getParH(level)->taggedFluidNodeIndices[CollisionTemplate::Default], level);
+                    cudaMemoryManager->cudaCopyTaggedFluidNodeIndices(CollisionTemplate::Default, level);
+                    if(para->getParH(level)->numberOfTaggedFluidNodes[tag]>0) // a tag is recorded as allocated only if it has at least one node
+                        para->getParH(level)->allocatedBulkFluidNodeTags.push_back(tag);
+                    break;
+                case CollisionTemplate::SubDomainBorder:
+                    this->setNumberOfTaggedFluidNodes(builder->getNumberOfFluidNodesBorder(level), CollisionTemplate::SubDomainBorder, level);
+                    cudaMemoryManager->cudaAllocTaggedFluidNodeIndices(CollisionTemplate::SubDomainBorder, level);
+                    builder->getFluidNodeIndicesBorder(para->getParH(level)->taggedFluidNodeIndices[CollisionTemplate::SubDomainBorder], level);
+                    cudaMemoryManager->cudaCopyTaggedFluidNodeIndices(CollisionTemplate::SubDomainBorder, level);
+                    break; // unlike the other cases, SubDomainBorder is never appended to allocatedBulkFluidNodeTags
+                case CollisionTemplate::WriteMacroVars:
+                    this->setNumberOfTaggedFluidNodes(builder->getNumberOfFluidNodesMacroVars(level), CollisionTemplate::WriteMacroVars, level);
+                    cudaMemoryManager->cudaAllocTaggedFluidNodeIndices(CollisionTemplate::WriteMacroVars, level);
+                    builder->getFluidNodeIndicesMacroVars(para->getParH(level)->taggedFluidNodeIndices[CollisionTemplate::WriteMacroVars], level);
+                    cudaMemoryManager->cudaCopyTaggedFluidNodeIndices(CollisionTemplate::WriteMacroVars, level);
+                    if(para->getParH(level)->numberOfTaggedFluidNodes[tag]>0)
+                        para->getParH(level)->allocatedBulkFluidNodeTags.push_back(tag);
+                    break;
+                case CollisionTemplate::ApplyBodyForce:
+                    this->setNumberOfTaggedFluidNodes(builder->getNumberOfFluidNodesApplyBodyForce(level), CollisionTemplate::ApplyBodyForce, level);
+                    cudaMemoryManager->cudaAllocTaggedFluidNodeIndices(CollisionTemplate::ApplyBodyForce, level);
+                    builder->getFluidNodeIndicesApplyBodyForce(para->getParH(level)->taggedFluidNodeIndices[CollisionTemplate::ApplyBodyForce], level);
+                    cudaMemoryManager->cudaCopyTaggedFluidNodeIndices(CollisionTemplate::ApplyBodyForce, level);
+                    if(para->getParH(level)->numberOfTaggedFluidNodes[tag]>0)
+                        para->getParH(level)->allocatedBulkFluidNodeTags.push_back(tag);
+                    break;
+                case CollisionTemplate::AllFeatures:
+                    this->setNumberOfTaggedFluidNodes(builder->getNumberOfFluidNodesAllFeatures(level), CollisionTemplate::AllFeatures, level);
+                    cudaMemoryManager->cudaAllocTaggedFluidNodeIndices(CollisionTemplate::AllFeatures, level);
+                    builder->getFluidNodeIndicesAllFeatures(para->getParH(level)->taggedFluidNodeIndices[CollisionTemplate::AllFeatures], level);
+                    cudaMemoryManager->cudaCopyTaggedFluidNodeIndices(CollisionTemplate::AllFeatures, level);
+                    if(para->getParH(level)->numberOfTaggedFluidNodes[tag]>0)
+                        para->getParH(level)->allocatedBulkFluidNodeTags.push_back(tag);
+                    break;
+                default: // any other tag: nothing is allocated or copied
+                    break;
+            }
+        }
+        VF_LOG_INFO("Number of tagged nodes on level {}:", level);
+        VF_LOG_INFO("Default: {}, Border: {}, WriteMacroVars: {}, ApplyBodyForce: {}, AllFeatures: {}",
+                    para->getParH(level)->numberOfTaggedFluidNodes[CollisionTemplate::Default],
+                    para->getParH(level)->numberOfTaggedFluidNodes[CollisionTemplate::SubDomainBorder],
+                    para->getParH(level)->numberOfTaggedFluidNodes[CollisionTemplate::WriteMacroVars],
+                    para->getParH(level)->numberOfTaggedFluidNodes[CollisionTemplate::ApplyBodyForce],
+                    para->getParH(level)->numberOfTaggedFluidNodes[CollisionTemplate::AllFeatures]    );
+    }
 }
 
-void GridGenerator::allocArrays_fluidNodeIndicesBorder() {
-    for (uint level = 0; level < builder->getNumberOfGridLevels(); level++) {
-        setNumberOfFluidNodesBorder(builder->getNumberOfFluidNodesBorder(level), level);
-        cudaMemoryManager->cudaAllocFluidNodeIndicesBorder(level);
-        builder->getFluidNodeIndicesBorder(para->getParH(level)->fluidNodeIndicesBorder, level);
-        cudaMemoryManager->cudaCopyFluidNodeIndicesBorder(level);
+// Forwards the given node indices to the grid builder's add-function that matches `tag`, for grid level `level`.
+// Only WriteMacroVars, ApplyBodyForce and AllFeatures are accepted; every other tag throws std::runtime_error.
+void GridGenerator::tagFluidNodeIndices(const std::vector<uint>& taggedFluidNodeIndices, CollisionTemplate tag, uint level) {
+    switch(tag)
+    {
+        case CollisionTemplate::WriteMacroVars:
+            builder->addFluidNodeIndicesMacroVars( taggedFluidNodeIndices, level );
+            break;
+        case CollisionTemplate::ApplyBodyForce:
+            builder->addFluidNodeIndicesApplyBodyForce( taggedFluidNodeIndices, level );
+            break;
+        case CollisionTemplate::AllFeatures:
+            builder->addFluidNodeIndicesAllFeatures( taggedFluidNodeIndices, level );
+            break;
+        case CollisionTemplate::Default:
+        case CollisionTemplate::SubDomainBorder:
+            // these two tags cannot be assigned through this function
+            throw std::runtime_error("Cannot tag fluid nodes as Default or SubDomainBorder!");
+        default:
+            throw std::runtime_error("Tagging fluid nodes with invalid tag!");
     }
+}
+
+void GridGenerator::sortFluidNodeTags() { // calls the builder's sortFluidNodeIndices* routines once per grid level
+    VF_LOG_INFO("Start sorting tagged fluid nodes...");
+    for (uint level = 0; level < builder->getNumberOfGridLevels(); level++)
+    {
+        builder->sortFluidNodeIndicesAllFeatures(level); // has to be called first! NOTE(review): reason not visible here - presumably the two sorts below depend on the AllFeatures list; confirm in GridBuilder
+        builder->sortFluidNodeIndicesMacroVars(level);
+        builder->sortFluidNodeIndicesApplyBodyForce(level);
+    }
+    VF_LOG_INFO("done.");
 }
 
 void GridGenerator::allocArrays_BoundaryValues()
 {
     std::cout << "------read BoundaryValues------" << std::endl;
-    int blocks = 0;
+    int blocks;
 
     for (uint level = 0; level < builder->getNumberOfGridLevels(); level++) {
         const auto numberOfPressureValues = int(builder->getPressureSize(level));
@@ -129,6 +211,7 @@ void GridGenerator::allocArrays_BoundaryValues()
 
         ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
         para->getParH(level)->pressureBC.numberOfBCnodes = 0;
+        para->getParD(level)->outflowPressureCorrectionFactor = para->getOutflowPressureCorrectionFactor();
         if (numberOfPressureValues > 1)
         {
             blocks = (numberOfPressureValues / para->getParH(level)->numberofthreads) + 1;
@@ -148,12 +231,12 @@ void GridGenerator::allocArrays_BoundaryValues()
 
         ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
         para->getParH(level)->slipBC.numberOfBCnodes = 0;
-        if (numberOfSlipValues > 1)
-        {
+        if (numberOfSlipValues > 1) {
             blocks = (numberOfSlipValues / para->getParH(level)->numberofthreads) + 1;
             para->getParH(level)->slipBC.numberOfBCnodes = blocks * para->getParH(level)->numberofthreads;
             cudaMemoryManager->cudaAllocSlipBC(level);
-            builder->getSlipValues(para->getParH(level)->slipBC.normalX, para->getParH(level)->slipBC.normalY, para->getParH(level)->slipBC.normalZ, para->getParH(level)->slipBC.k, level);
+            builder->getSlipValues(para->getParH(level)->slipBC.normalX, para->getParH(level)->slipBC.normalY,
+                                   para->getParH(level)->slipBC.normalZ, para->getParH(level)->slipBC.k, level);
             cudaMemoryManager->cudaCopySlipBC(level);
         }
         para->getParD(level)->slipBC.numberOfBCnodes = para->getParH(level)->slipBC.numberOfBCnodes;
@@ -173,11 +256,11 @@ void GridGenerator::allocArrays_BoundaryValues()
             para->getParH(level)->stressBC.numberOfBCnodes = blocks * para->getParH(level)->numberofthreads;
             cudaMemoryManager->cudaAllocStressBC(level);
             cudaMemoryManager->cudaAllocWallModel(level, para->getHasWallModelMonitor());
-            builder->getStressValues(   para->getParH(level)->stressBC.normalX,  para->getParH(level)->stressBC.normalY,  para->getParH(level)->stressBC.normalZ, 
+            builder->getStressValues(   para->getParH(level)->stressBC.normalX,  para->getParH(level)->stressBC.normalY,  para->getParH(level)->stressBC.normalZ,
                                         para->getParH(level)->stressBC.Vx,       para->getParH(level)->stressBC.Vy,       para->getParH(level)->stressBC.Vz,
                                         para->getParH(level)->stressBC.Vx1,      para->getParH(level)->stressBC.Vy1,      para->getParH(level)->stressBC.Vz1,
-                                        para->getParH(level)->stressBC.k,        para->getParH(level)->stressBC.kN,       
-                                        para->getParH(level)->wallModel.samplingOffset, para->getParH(level)->wallModel.z0, 
+                                        para->getParH(level)->stressBC.k,        para->getParH(level)->stressBC.kN,
+                                        para->getParH(level)->wallModel.samplingOffset, para->getParH(level)->wallModel.z0,
                                         level);
 
             cudaMemoryManager->cudaCopyStressBC(level);
@@ -187,7 +270,7 @@ void GridGenerator::allocArrays_BoundaryValues()
         para->getParH(level)->numberOfStressBCnodesRead = para->getParH(level)->stressBC.numberOfBCnodes * para->getD3Qxx();
         para->getParD(level)->numberOfStressBCnodesRead = para->getParH(level)->stressBC.numberOfBCnodes * para->getD3Qxx();
     }
-    
+
 
     for (uint level = 0; level < builder->getNumberOfGridLevels(); level++) {
         const auto numberOfVelocityValues = int(builder->getVelocitySize(level));
@@ -204,7 +287,8 @@ void GridGenerator::allocArrays_BoundaryValues()
             cudaMemoryManager->cudaAllocVeloBC(level);
             ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-            builder->getVelocityValues(para->getParH(level)->velocityBC.Vx, para->getParH(level)->velocityBC.Vy, para->getParH(level)->velocityBC.Vz, para->getParH(level)->velocityBC.k, level);
+            builder->getVelocityValues(para->getParH(level)->velocityBC.Vx, para->getParH(level)->velocityBC.Vy,
+                                       para->getParH(level)->velocityBC.Vz, para->getParH(level)->velocityBC.k, level);
 
             ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -242,6 +326,100 @@ void GridGenerator::allocArrays_BoundaryValues()
         para->getParD(level)->numberOfVeloBCnodesRead = para->getParH(level)->velocityBC.numberOfBCnodes * para->getD3Qxx();
     }
 
+    for (uint level = 0; level < builder->getNumberOfGridLevels(); level++) {
+        const auto numberOfPrecursorValues = int(builder->getPrecursorSize(level));
+        *logging::out << logging::Logger::INFO_INTERMEDIATE << "size precursor level " << level << " : " << numberOfPrecursorValues << "\n";
+        ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+        blocks = (numberOfPrecursorValues / para->getParH(level)->numberofthreads) + 1;
+        para->getParH(level)->precursorBC.sizeQ = blocks * para->getParH(level)->numberofthreads;
+        para->getParD(level)->precursorBC.sizeQ = para->getParH(level)->precursorBC.sizeQ;
+        ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+        para->getParH(level)->precursorBC.numberOfBCnodes = numberOfPrecursorValues;
+        para->getParD(level)->precursorBC.numberOfBCnodes = numberOfPrecursorValues;
+        para->getParH(level)->numberOfPrecursorBCnodesRead = numberOfPrecursorValues * para->getD3Qxx();
+        para->getParD(level)->numberOfPrecursorBCnodesRead = numberOfPrecursorValues * para->getD3Qxx();
+
+        if (numberOfPrecursorValues > 1)
+        {
+            ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+            cudaMemoryManager->cudaAllocPrecursorBC(level);
+            ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+            builder->getPrecursorValues(
+                    para->getParH(level)->precursorBC.planeNeighbor0PP, para->getParH(level)->precursorBC.planeNeighbor0PM,
+                    para->getParH(level)->precursorBC.planeNeighbor0MP, para->getParH(level)->precursorBC.planeNeighbor0MM,
+                    para->getParH(level)->precursorBC.weights0PP, para->getParH(level)->precursorBC.weights0PM,
+                    para->getParH(level)->precursorBC.weights0MP, para->getParH(level)->precursorBC.weights0MM,
+                    para->getParH(level)->precursorBC.k, para->getParH(level)->transientBCInputFileReader, para->getParH(level)->precursorBC.numberOfPrecursorNodes,
+                    para->getParH(level)->precursorBC.numberOfQuantities, para->getParH(level)->precursorBC.timeStepsBetweenReads,
+                    para->getParH(level)->precursorBC.velocityX, para->getParH(level)->precursorBC.velocityY, para->getParH(level)->precursorBC.velocityZ,
+                    level);
+            ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+            para->getParD(level)->precursorBC.numberOfPrecursorNodes = para->getParH(level)->precursorBC.numberOfPrecursorNodes;
+            para->getParD(level)->precursorBC.numberOfQuantities = para->getParH(level)->precursorBC.numberOfQuantities;
+            para->getParD(level)->precursorBC.timeStepsBetweenReads = para->getParH(level)->precursorBC.timeStepsBetweenReads;
+            para->getParD(level)->precursorBC.velocityX = para->getParH(level)->precursorBC.velocityX;
+            para->getParD(level)->precursorBC.velocityY = para->getParH(level)->precursorBC.velocityY;
+            para->getParD(level)->precursorBC.velocityZ = para->getParH(level)->precursorBC.velocityZ;
+
+            for(auto reader : para->getParH(level)->transientBCInputFileReader)
+            {
+                if(reader->getNumberOfQuantities() != para->getParD(level)->precursorBC.numberOfQuantities)
+                    throw std::runtime_error(
+                        "Number of quantities in reader and number of quantities needed for precursor don't match!");
+            }
+
+            cudaMemoryManager->cudaCopyPrecursorBC(level);
+            cudaMemoryManager->cudaAllocPrecursorData(level);
+
+            // read first timestep of precursor into next and copy to next on device
+            for(auto reader : para->getParH(level)->transientBCInputFileReader)
+            {
+                reader->getNextData(para->getParH(level)->precursorBC.next, para->getParH(level)->precursorBC.numberOfPrecursorNodes, 0);
+            }
+
+            cudaMemoryManager->cudaCopyPrecursorData(level);
+
+            //switch next with last pointers
+            real* tmp = para->getParD(level)->precursorBC.last;
+            para->getParD(level)->precursorBC.last = para->getParD(level)->precursorBC.next;
+            para->getParD(level)->precursorBC.next = tmp;
+
+            //read second timestep of precursor into next and copy next to device
+            real nextTime = para->getParD(level)->precursorBC.timeStepsBetweenReads*pow(2,-((real)level))*para->getTimeRatio();
+            for(auto reader : para->getParH(level)->transientBCInputFileReader)
+            {
+                reader->getNextData(para->getParH(level)->precursorBC.next, para->getParH(level)->precursorBC.numberOfPrecursorNodes, nextTime);
+            }
+
+            cudaMemoryManager->cudaCopyPrecursorData(level);
+
+            para->getParD(level)->precursorBC.nPrecursorReads = 1;
+
+
+            //switch next with current pointers
+            tmp = para->getParD(level)->precursorBC.current;
+            para->getParD(level)->precursorBC.current = para->getParD(level)->precursorBC.next;
+            para->getParD(level)->precursorBC.next = tmp;
+
+            //start usual cycle of loading, i.e. read velocities of timestep after current and copy asynchronously to device
+            for(auto reader : para->getParH(level)->transientBCInputFileReader)
+            {
+                reader->getNextData(para->getParH(level)->precursorBC.next, para->getParH(level)->precursorBC.numberOfPrecursorNodes, 2*nextTime);
+            }
+
+            cudaMemoryManager->cudaCopyPrecursorData(level);
+
+            para->getParD(level)->precursorBC.nPrecursorReads = 2;
+        }
+
+        ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+        // advection - diffusion stuff
+        if (para->getDiffOn()==true){
+            throw std::runtime_error(" Advection Diffusion not implemented for Precursor!");
+        }
+        //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    }
+
 
 
     if (builder->hasGeometryValues()) {
@@ -303,7 +481,7 @@ void GridGenerator::initalValuesDomainDecompostion()
     if (para->getNumprocs() < 2)
         return;
     if ((para->getNumprocs() > 1) /*&& (procNeighborsSendX.size() == procNeighborsRecvX.size())*/) {
-        
+
         // direction has to be changed in case of periodic BCs and multiple sub domains
         std::vector<int> fillOrder = { 0, 1, 2, 3, 4, 5 };
 
@@ -383,7 +561,7 @@ void GridGenerator::initalValuesDomainDecompostion()
                         builder->getReceiveIndices(para->getParH(level)->recvProcessNeighborX[indexProcessNeighbor].index, direction,
                                                    level);
                         if (level != builder->getNumberOfGridLevels() - 1 && para->useReducedCommunicationAfterFtoC)
-                            indexRearrangement->initCommunicationArraysForCommAfterFinetoCoarseX(level, indexProcessNeighbor, direction);             
+                            indexRearrangement->initCommunicationArraysForCommAfterFinetoCoarseX(level, indexProcessNeighbor, direction);
                         ////////////////////////////////////////////////////////////////////////////////////////
                         cudaMemoryManager->cudaCopyProcessNeighborXIndex(level, indexProcessNeighbor);
                         ////////////////////////////////////////////////////////////////////////////////////////
@@ -446,7 +624,7 @@ void GridGenerator::initalValuesDomainDecompostion()
                         ////////////////////////////////////////////////////////////////////////////////////////
                         // malloc on host and device
                         cudaMemoryManager->cudaAllocProcessNeighborY(level, indexProcessNeighbor);
-                        ////////////////////////////////////////////////////////////////////////////////////////                        
+                        ////////////////////////////////////////////////////////////////////////////////////////
                         // init index arrays
                         builder->getSendIndices(para->getParH(level)->sendProcessNeighborY[indexProcessNeighbor].index, direction, level);
                         builder->getReceiveIndices(para->getParH(level)->recvProcessNeighborY[indexProcessNeighbor].index, direction,
@@ -465,7 +643,7 @@ void GridGenerator::initalValuesDomainDecompostion()
 
                     if (tempSend > 0) {
                         int indexProcessNeighbor = (int)para->getParH(level)->sendProcessNeighborZ.size();
-    
+
                         para->getParH(level)->sendProcessNeighborZ.emplace_back();
                         para->getParD(level)->sendProcessNeighborZ.emplace_back();
                         para->getParH(level)->recvProcessNeighborZ.emplace_back();
@@ -755,9 +933,9 @@ void GridGenerator::allocArrays_BoundaryQs()
             //preprocessing
             real* QQ = para->getParH(i)->pressureBC.q27[0];
             unsigned int sizeQ = para->getParH(i)->pressureBC.numberOfBCnodes;
-            QforBoundaryConditions Q;
+            QforBoundaryConditions &Q = para->getParH(i)->pressureBC;
             getPointersToBoundaryConditions(Q, QQ, sizeQ);
-            
+
             builder->getPressureQs(Q.q27, i);
 
 
@@ -802,9 +980,9 @@ void GridGenerator::allocArrays_BoundaryQs()
             //preprocessing
             real* QQ = para->getParH(i)->slipBC.q27[0];
             unsigned int sizeQ = para->getParH(i)->slipBC.numberOfBCnodes;
-            QforBoundaryConditions Q;
+            QforBoundaryConditions &Q = para->getParH(i)->slipBC;
             getPointersToBoundaryConditions(Q, QQ, sizeQ);
-            
+
             builder->getSlipQs(Q.q27, i);
             ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
             cudaMemoryManager->cudaCopySlipBC(i);
@@ -822,9 +1000,9 @@ void GridGenerator::allocArrays_BoundaryQs()
             //preprocessing
             real* QQ = para->getParH(i)->stressBC.q27[0];
             unsigned int sizeQ = para->getParH(i)->stressBC.numberOfBCnodes;
-            QforBoundaryConditions Q;
+            QforBoundaryConditions &Q = para->getParH(i)->stressBC;
             getPointersToBoundaryConditions(Q, QQ, sizeQ);
-            
+
             builder->getStressQs(Q.q27, i);
             ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
             cudaMemoryManager->cudaCopyStressBC(i);
@@ -842,7 +1020,7 @@ void GridGenerator::allocArrays_BoundaryQs()
             //preprocessing
             real* QQ = para->getParH(i)->velocityBC.q27[0];
             unsigned int sizeQ = para->getParH(i)->velocityBC.numberOfBCnodes;
-            QforBoundaryConditions Q;
+            QforBoundaryConditions &Q = para->getParH(i)->velocityBC;
             getPointersToBoundaryConditions(Q, QQ, sizeQ);
             builder->getVelocityQs(Q.q27, i);
 
@@ -874,6 +1052,50 @@ void GridGenerator::allocArrays_BoundaryQs()
         }
     }
 
+    // Precursor boundary condition: for every grid level that has precursor nodes, let the builder fill the
+    // sub-grid distances of precursorBC on the host and then hand the level to cudaCopyPrecursorBC.
+    for (uint i = 0; i < builder->getNumberOfGridLevels(); i++) {
+        const auto numberOfPrecursorNodes = int(builder->getPrecursorSize(i));
+        if (numberOfPrecursorNodes > 0)
+        {
+            std::cout << "size precursor level " << i << " : " << numberOfPrecursorNodes << std::endl;
+            real* QQ = para->getParH(i)->precursorBC.q27[0];
+            unsigned int sizeQ = para->getParH(i)->precursorBC.numberOfBCnodes;
+            QforBoundaryConditions Q;
+            getPointersToBoundaryConditions(Q, QQ, sizeQ);
+
+            builder->getPrecursorQs(Q.q27, i);
+
+            if (para->getDiffOn()) {
+                throw std::runtime_error("Advection diffusion not implemented for Precursor!");
+                // Unreachable after the throw. The commented-out code below uses velocity-BC names (TempVel, numberOfVelocityNodes), not precursor ones.
+                // para->getParH(i)->TempVel.kTemp = numberOfVelocityNodes;
+                // para->getParD(i)->TempVel.kTemp = numberOfVelocityNodes;
+                // std::cout << "Groesse TempVel.kTemp = " << para->getParH(i)->TempPress.kTemp << std::endl;
+                // std::cout << "getTemperatureInit = " << para->getTemperatureInit() << std::endl;
+                // std::cout << "getTemperatureBC = " << para->getTemperatureBC() << std::endl;
+                // //////////////////////////////////////////////////////////////////////////
+                // cudaMemoryManager->cudaAllocTempVeloBC(i);
+                // //cout << "nach alloc " << std::endl;
+                // //////////////////////////////////////////////////////////////////////////
+                // for (int m = 0; m < numberOfVelocityNodes; m++)
+                // {
+                //     para->getParH(i)->TempVel.temp[m] = para->getTemperatureInit();
+                //     para->getParH(i)->TempVel.tempPulse[m] = para->getTemperatureBC();
+                //     para->getParH(i)->TempVel.velo[m] = para->getVelocity();
+                //     para->getParH(i)->TempVel.k[m] = para->getParH(i)->Qinflow.k[m];
+                // }
+                // //////////////////////////////////////////////////////////////////////////
+                // //cout << "vor copy " << std::endl;
+                // cudaMemoryManager->cudaCopyTempVeloBCHD(i);
+                // //cout << "nach copy " << std::endl;
+                //////////////////////////////////////////////////////////////////////////
+            }
+            cudaMemoryManager->cudaCopyPrecursorBC(i);
+        }
+    }
+
+
 
     for (uint i = 0; i < builder->getNumberOfGridLevels(); i++) {
         const int numberOfGeometryNodes = builder->getGeometrySize(i);
@@ -898,7 +1120,7 @@ void GridGenerator::allocArrays_BoundaryQs()
             //preprocessing
             real* QQ = para->getParH(i)->geometryBC.q27[0];
             unsigned int sizeQ = para->getParH(i)->geometryBC.numberOfBCnodes;
-            QforBoundaryConditions Q;
+            QforBoundaryConditions &Q = para->getParH(i)->geometryBC;
             getPointersToBoundaryConditions(Q, QQ, sizeQ);
             //////////////////////////////////////////////////////////////////
 
@@ -948,7 +1170,7 @@ void GridGenerator::allocArrays_BoundaryQs()
 
 void GridGenerator::allocArrays_OffsetScale()
 {
-    for (uint level = 0; level < builder->getNumberOfGridLevels() - 1; level++) 
+    for (uint level = 0; level < builder->getNumberOfGridLevels() - 1; level++)
     {
         const uint numberOfNodesPerLevelCF = builder->getNumberOfNodesCF(level);
         const uint numberOfNodesPerLevelFC = builder->getNumberOfNodesFC(level);
@@ -987,7 +1209,7 @@ void GridGenerator::allocArrays_OffsetScale()
         builder->getOffsetCF(para->getParH(level)->offCF.xOffCF, para->getParH(level)->offCF.yOffCF, para->getParH(level)->offCF.zOffCF, level);
         builder->getOffsetFC(para->getParH(level)->offFC.xOffFC, para->getParH(level)->offFC.yOffFC, para->getParH(level)->offFC.zOffFC, level);
         builder->getGridInterfaceIndices(para->getParH(level)->intCF.ICellCFC, para->getParH(level)->intCF.ICellCFF, para->getParH(level)->intFC.ICellFCC, para->getParH(level)->intFC.ICellFCF, level);
-        
+
         if (para->getUseStreams() || para->getNumprocs() > 1) {
             // split fine-to-coarse indices into border and bulk
             interpolationGrouper->splitFineToCoarseIntoBorderAndBulk(level);
@@ -1060,8 +1282,8 @@ std::string GridGenerator::verifyNeighborIndices(int level) const
     int wrongNeighbors = 0;
     int stopperNodes = 0;
 
-    for (uint index = 0; index < para->getParH(level)->numberOfNodes; index++)
-        oss << verifyNeighborIndex(level, index, invalidNodes, stopperNodes, wrongNeighbors);
+    for (size_t index = 0; index < para->getParH(level)->numberOfNodes; index++)
+        oss << verifyNeighborIndex(level, (int)index, invalidNodes, stopperNodes, wrongNeighbors);
 
 
     oss << "invalid nodes found: " << invalidNodes << "\n";
@@ -1090,7 +1312,7 @@ std::string GridGenerator::verifyNeighborIndex(int level, int index , int &inval
 
     //std::cout << para->getParH(level)->coordinateX[1] << ", " << para->getParH(level)->coordinateY[1] << ", " << para->getParH(level)->coordinateZ[1] << std::endl;
     //std::cout << para->getParH(level)->coordinateX[para->getParH(level)->numberOfNodes - 1] << ", " << para->getParH(level)->coordinateY[para->getParH(level)->numberOfNodes - 1] << ", " << para->getParH(level)->coordinateZ[para->getParH(level)->numberOfNodes - 1] << std::endl;
-    
+
     real maxX = para->getParH(level)->coordinateX[para->getParH(level)->numberOfNodes - 1] - delta;
     real maxY = para->getParH(level)->coordinateY[para->getParH(level)->numberOfNodes - 1] - delta;
     real maxZ = para->getParH(level)->coordinateZ[para->getParH(level)->numberOfNodes - 1] - delta;
@@ -1131,8 +1353,8 @@ std::string GridGenerator::checkNeighbor(int level, real x, real y, real z, int
 
     if (!neighborValid) {
         oss << "NeighborX invalid from: (" << x << ", " << y << ", " << z << "), index: " << index << ", "
-            << direction << " neighborIndex: " << neighborIndex << 
-            ", actual neighborCoords : (" << neighborCoordX << ", " << neighborCoordY << ", " << neighborCoordZ << 
+            << direction << " neighborIndex: " << neighborIndex <<
+            ", actual neighborCoords : (" << neighborCoordX << ", " << neighborCoordY << ", " << neighborCoordZ <<
             "), expected neighborCoords : (" << neighborX << ", " << neighborY << ", " << neighborZ << ")\n";
         numberOfWrongNeihgbors++;
     }
@@ -1140,31 +1362,31 @@ std::string GridGenerator::checkNeighbor(int level, real x, real y, real z, int
 }
 
 void GridGenerator::getPointersToBoundaryConditions(QforBoundaryConditions& boundaryConditionStruct, real* subgridDistances, const unsigned int numberOfBCnodes){
-    boundaryConditionStruct.q27[DIR_P00] =    &subgridDistances[DIR_P00   * numberOfBCnodes];
-    boundaryConditionStruct.q27[DIR_M00] =    &subgridDistances[DIR_M00   * numberOfBCnodes];
-    boundaryConditionStruct.q27[DIR_0P0] =    &subgridDistances[DIR_0P0   * numberOfBCnodes];
-    boundaryConditionStruct.q27[DIR_0M0] =    &subgridDistances[DIR_0M0   * numberOfBCnodes];
-    boundaryConditionStruct.q27[DIR_00P] =    &subgridDistances[DIR_00P   * numberOfBCnodes];
-    boundaryConditionStruct.q27[DIR_00M] =    &subgridDistances[DIR_00M   * numberOfBCnodes];
-    boundaryConditionStruct.q27[DIR_PP0] =   &subgridDistances[DIR_PP0  * numberOfBCnodes];
-    boundaryConditionStruct.q27[DIR_MM0] =   &subgridDistances[DIR_MM0  * numberOfBCnodes];
-    boundaryConditionStruct.q27[DIR_PM0] =   &subgridDistances[DIR_PM0  * numberOfBCnodes];
-    boundaryConditionStruct.q27[DIR_MP0] =   &subgridDistances[DIR_MP0  * numberOfBCnodes];
-    boundaryConditionStruct.q27[DIR_P0P] =   &subgridDistances[DIR_P0P  * numberOfBCnodes];
-    boundaryConditionStruct.q27[DIR_M0M] =   &subgridDistances[DIR_M0M  * numberOfBCnodes];
-    boundaryConditionStruct.q27[DIR_P0M] =   &subgridDistances[DIR_P0M  * numberOfBCnodes];
-    boundaryConditionStruct.q27[DIR_M0P] =   &subgridDistances[DIR_M0P  * numberOfBCnodes];
-    boundaryConditionStruct.q27[DIR_0PP] =   &subgridDistances[DIR_0PP  * numberOfBCnodes];
-    boundaryConditionStruct.q27[DIR_0MM] =   &subgridDistances[DIR_0MM  * numberOfBCnodes];
-    boundaryConditionStruct.q27[DIR_0PM] =   &subgridDistances[DIR_0PM  * numberOfBCnodes];
-    boundaryConditionStruct.q27[DIR_0MP] =   &subgridDistances[DIR_0MP  * numberOfBCnodes];
-    boundaryConditionStruct.q27[DIR_000] = &subgridDistances[DIR_000* numberOfBCnodes];
-    boundaryConditionStruct.q27[DIR_PPP] =  &subgridDistances[DIR_PPP * numberOfBCnodes];
-    boundaryConditionStruct.q27[DIR_MMP] =  &subgridDistances[DIR_MMP * numberOfBCnodes];
-    boundaryConditionStruct.q27[DIR_PMP] =  &subgridDistances[DIR_PMP * numberOfBCnodes];
-    boundaryConditionStruct.q27[DIR_MPP] =  &subgridDistances[DIR_MPP * numberOfBCnodes];
-    boundaryConditionStruct.q27[DIR_PPM] =  &subgridDistances[DIR_PPM * numberOfBCnodes];
-    boundaryConditionStruct.q27[DIR_MMM] =  &subgridDistances[DIR_MMM * numberOfBCnodes];
-    boundaryConditionStruct.q27[DIR_PMM] =  &subgridDistances[DIR_PMM * numberOfBCnodes];
-    boundaryConditionStruct.q27[DIR_MPM] =  &subgridDistances[DIR_MPM * numberOfBCnodes];
-}
\ No newline at end of file
+    boundaryConditionStruct.q27[DIR_P00] = &subgridDistances[DIR_P00 * numberOfBCnodes]; // q27[dir] = address of subgridDistances[dir * numberOfBCnodes]; same pattern for all 27 directions below
+    boundaryConditionStruct.q27[DIR_M00] = &subgridDistances[DIR_M00 * numberOfBCnodes];
+    boundaryConditionStruct.q27[DIR_0P0] = &subgridDistances[DIR_0P0 * numberOfBCnodes];
+    boundaryConditionStruct.q27[DIR_0M0] = &subgridDistances[DIR_0M0 * numberOfBCnodes];
+    boundaryConditionStruct.q27[DIR_00P] = &subgridDistances[DIR_00P * numberOfBCnodes];
+    boundaryConditionStruct.q27[DIR_00M] = &subgridDistances[DIR_00M * numberOfBCnodes];
+    boundaryConditionStruct.q27[DIR_PP0] = &subgridDistances[DIR_PP0 * numberOfBCnodes];
+    boundaryConditionStruct.q27[DIR_MM0] = &subgridDistances[DIR_MM0 * numberOfBCnodes];
+    boundaryConditionStruct.q27[DIR_PM0] = &subgridDistances[DIR_PM0 * numberOfBCnodes];
+    boundaryConditionStruct.q27[DIR_MP0] = &subgridDistances[DIR_MP0 * numberOfBCnodes];
+    boundaryConditionStruct.q27[DIR_P0P] = &subgridDistances[DIR_P0P * numberOfBCnodes];
+    boundaryConditionStruct.q27[DIR_M0M] = &subgridDistances[DIR_M0M * numberOfBCnodes];
+    boundaryConditionStruct.q27[DIR_P0M] = &subgridDistances[DIR_P0M * numberOfBCnodes];
+    boundaryConditionStruct.q27[DIR_M0P] = &subgridDistances[DIR_M0P * numberOfBCnodes];
+    boundaryConditionStruct.q27[DIR_0PP] = &subgridDistances[DIR_0PP * numberOfBCnodes];
+    boundaryConditionStruct.q27[DIR_0MM] = &subgridDistances[DIR_0MM * numberOfBCnodes];
+    boundaryConditionStruct.q27[DIR_0PM] = &subgridDistances[DIR_0PM * numberOfBCnodes];
+    boundaryConditionStruct.q27[DIR_0MP] = &subgridDistances[DIR_0MP * numberOfBCnodes];
+    boundaryConditionStruct.q27[DIR_000] = &subgridDistances[DIR_000 * numberOfBCnodes];
+    boundaryConditionStruct.q27[DIR_PPP] = &subgridDistances[DIR_PPP * numberOfBCnodes];
+    boundaryConditionStruct.q27[DIR_MMP] = &subgridDistances[DIR_MMP * numberOfBCnodes];
+    boundaryConditionStruct.q27[DIR_PMP] = &subgridDistances[DIR_PMP * numberOfBCnodes];
+    boundaryConditionStruct.q27[DIR_MPP] = &subgridDistances[DIR_MPP * numberOfBCnodes];
+    boundaryConditionStruct.q27[DIR_PPM] = &subgridDistances[DIR_PPM * numberOfBCnodes];
+    boundaryConditionStruct.q27[DIR_MMM] = &subgridDistances[DIR_MMM * numberOfBCnodes];
+    boundaryConditionStruct.q27[DIR_PMM] = &subgridDistances[DIR_PMM * numberOfBCnodes];
+    boundaryConditionStruct.q27[DIR_MPM] = &subgridDistances[DIR_MPM * numberOfBCnodes];
+}
diff --git a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.h b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.h
index d2f56e1df4ee5658c61b8e8a3e94a820d1a4f2f1..c97ed02a64da1d5fafa18150c75d149f96484d44 100644
--- a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.h
+++ b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.h
@@ -40,6 +40,7 @@
 
 #include "LBM/LB.h"
 
+
 class Parameter;
 class GridBuilder;
 class IndexRearrangementForStreams;
@@ -75,8 +76,10 @@ public:
     //! \brief allocates and initialized the sub-grid distances at the boundary conditions
     void allocArrays_BoundaryQs() override;
     void allocArrays_OffsetScale() override;
-    void allocArrays_fluidNodeIndices() override;
-    void allocArrays_fluidNodeIndicesBorder() override;
+    void allocArrays_taggedFluidNodes() override;
+
+    void tagFluidNodeIndices(const std::vector<uint>& taggedFluidNodeIndices, CollisionTemplate tag, uint level) override;
+    void sortFluidNodeTags() override;
 
     virtual void setDimensions() override;
     virtual void setBoundingBox() override;
diff --git a/src/gpu/VirtualFluids_GPU/Factories/BoundaryConditionFactory.cpp b/src/gpu/VirtualFluids_GPU/Factories/BoundaryConditionFactory.cpp
index bff054eb174a0f5fa34119deedde6f1c9733d83c..b1c398638cff1ec1b6d52f59f8e773183e270331 100644
--- a/src/gpu/VirtualFluids_GPU/Factories/BoundaryConditionFactory.cpp
+++ b/src/gpu/VirtualFluids_GPU/Factories/BoundaryConditionFactory.cpp
@@ -35,6 +35,11 @@ void BoundaryConditionFactory::setStressBoundaryCondition(const StressBC boundar
     this->stressBoundaryCondition = boundaryConditionType;
 }
 
+void BoundaryConditionFactory::setPrecursorBoundaryCondition(const PrecursorBC boundaryConditionType)
+{
+    this->precursorBoundaryCondition = boundaryConditionType; // stored choice is read by getPrecursorBoundaryConditionPost()
+}
+
 boundaryCondition BoundaryConditionFactory::getVelocityBoundaryConditionPost(bool isGeometryBC) const
 {
     const VelocityBC &boundaryCondition =
@@ -132,6 +137,22 @@ boundaryCondition BoundaryConditionFactory::getPressureBoundaryConditionPre() co
         case PressureBC::OutflowNonReflective:
             return QPressNoRhoDev27;
             break;
+        case PressureBC::OutflowNonReflectivePressureCorrection:
+            return QPressZeroRhoOutflowDev27;
+        default:
+            return nullptr;
+    }
+}
+
+precursorBoundaryConditionFunc BoundaryConditionFactory::getPrecursorBoundaryConditionPost() const
+{
+    switch (this->precursorBoundaryCondition) { // maps the stored PrecursorBC to its boundary-condition function; other values hit the default and yield nullptr
+        case PrecursorBC::VelocityPrecursor:
+            return QPrecursorDevCompZeroPress;
+            break; // unreachable after the return
+        case PrecursorBC::DistributionsPrecursor:
+            return PrecursorDevDistributions;
+            break; // unreachable after the return
         default:
             return nullptr;
     }
diff --git a/src/gpu/VirtualFluids_GPU/Factories/BoundaryConditionFactory.h b/src/gpu/VirtualFluids_GPU/Factories/BoundaryConditionFactory.h
index 9d6872c4847be72dff4be7137b774c8082e39e34..c6877cbfeffe5b32c0c2d336e46b02d68cd946a3 100644
--- a/src/gpu/VirtualFluids_GPU/Factories/BoundaryConditionFactory.h
+++ b/src/gpu/VirtualFluids_GPU/Factories/BoundaryConditionFactory.h
@@ -42,11 +42,13 @@
 #include "Parameter/Parameter.h"
 #include "gpu/GridGenerator/grid/BoundaryConditions/Side.h"
 
+
 struct LBMSimulationParameter;
 class Parameter;
 
 using boundaryCondition = std::function<void(LBMSimulationParameter *, QforBoundaryConditions *)>;
 using boundaryConditionWithParameter = std::function<void(Parameter *, QforBoundaryConditions *, const int level)>;
+using precursorBoundaryConditionFunc = std::function<void(LBMSimulationParameter *, QforPrecursorBoundaryConditions *, real timeRatio, real velocityRatio)>;
 
 class BoundaryConditionFactory
 {
@@ -109,6 +111,8 @@ public:
         PressureNonEquilibriumCompressible,
         //! - OutflowNonReflective = outflow boundary condition, should be combined with VelocityAndPressureCompressible
         OutflowNonReflective,
+        //! - OutflowNonReflectivePressureCorrection = like OutflowNonReflective, but also reduces pressure overshoot
+        OutflowNonReflectivePressureCorrection,
         //! - NotSpecified =  the user did not set a boundary condition
         NotSpecified
     };
@@ -128,11 +132,21 @@ public:
     // enum class OutflowBoundaryCondition {};  // TODO:
     // https://git.rz.tu-bs.de/m.schoenherr/VirtualFluids_dev/-/issues/16
 
+    enum class PrecursorBC { // selectable precursor boundary conditions, see getPrecursorBoundaryConditionPost()
+        //! - VelocityPrecursor = getPrecursorBoundaryConditionPost() returns QPrecursorDevCompZeroPress
+        VelocityPrecursor,
+        //! - DistributionsPrecursor = getPrecursorBoundaryConditionPost() returns PrecursorDevDistributions
+        DistributionsPrecursor,
+        //! - NotSpecified = the user did not set a boundary condition
+        NotSpecified
+    };
+
     void setVelocityBoundaryCondition(const BoundaryConditionFactory::VelocityBC boundaryConditionType);
     void setNoSlipBoundaryCondition(const BoundaryConditionFactory::NoSlipBC boundaryConditionType);
     void setSlipBoundaryCondition(const BoundaryConditionFactory::SlipBC boundaryConditionType);
     void setPressureBoundaryCondition(const BoundaryConditionFactory::PressureBC boundaryConditionType);
     void setStressBoundaryCondition(const BoundaryConditionFactory::StressBC boundaryConditionType);
+    void setPrecursorBoundaryCondition(const BoundaryConditionFactory::PrecursorBC boundaryConditionType);
     //! \brief set a boundary condition for the geometry
     //! param boundaryConditionType: a velocity, no-slip or slip boundary condition
     //! \details suggestions for boundaryConditionType:
@@ -152,6 +166,8 @@ public:
     [[nodiscard]] boundaryCondition getSlipBoundaryConditionPost(bool isGeometryBC = false) const;
     [[nodiscard]] boundaryCondition getPressureBoundaryConditionPre() const;
     [[nodiscard]] boundaryCondition getGeometryBoundaryConditionPost() const;
+    [[nodiscard]] precursorBoundaryConditionFunc getPrecursorBoundaryConditionPost() const;
+
 
     [[nodiscard]] boundaryConditionWithParameter getStressBoundaryConditionPost() const;
 
@@ -162,6 +178,7 @@ private:
     PressureBC pressureBoundaryCondition = PressureBC::NotSpecified;
     std::variant<VelocityBC, NoSlipBC, SlipBC> geometryBoundaryCondition = NoSlipBC::NoSlipImplicitBounceBack;
     StressBC stressBoundaryCondition = StressBC::NotSpecified;
+    PrecursorBC precursorBoundaryCondition = PrecursorBC::NotSpecified;
 
     // OutflowBoundaryConditon outflowBC // TODO: https://git.rz.tu-bs.de/m.schoenherr/VirtualFluids_dev/-/issues/16
 };
diff --git a/src/gpu/VirtualFluids_GPU/GPU/AdvectionDiffusion27chim.cu b/src/gpu/VirtualFluids_GPU/GPU/AdvectionDiffusion27chim.cu
index 8f54358e04063c9063c873caf02a86e76bb7f936..04f6afe4cf9ebd99dc293ded16f55a56f0d77036 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/AdvectionDiffusion27chim.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/AdvectionDiffusion27chim.cu
@@ -74,7 +74,7 @@ __global__ void Factorized_Central_Moments_Advection_Diffusion_Device_Kernel(
 	uint* neighborZ,
 	real* distributions,
 	real* distributionsAD,
-	int size_Mat,
+	unsigned long long numberOfLBnodes,
 	real* forces,
 	bool isEvenTimestep)
 {
@@ -100,7 +100,7 @@ __global__ void Factorized_Central_Moments_Advection_Diffusion_Device_Kernel(
 
 	//////////////////////////////////////////////////////////////////////////
 	// run for all indices in size_Mat and fluid nodes
-	if ((k < size_Mat) && (typeOfGridNode[k] == GEO_FLUID))
+	if ((k < numberOfLBnodes) && (typeOfGridNode[k] == GEO_FLUID))
 	{
 		//////////////////////////////////////////////////////////////////////////
 		//! - Read distributions: style of reading and writing the distributions from/to stored arrays dependent on timestep is based on the esoteric twist algorithm \ref
@@ -109,125 +109,125 @@ __global__ void Factorized_Central_Moments_Advection_Diffusion_Device_Kernel(
 		Distributions27 dist;
 		if (isEvenTimestep)
 		{
-			dist.f[DIR_P00   ] = &distributions[DIR_P00   *size_Mat];
-			dist.f[DIR_M00   ] = &distributions[DIR_M00   *size_Mat];
-			dist.f[DIR_0P0   ] = &distributions[DIR_0P0   *size_Mat];
-			dist.f[DIR_0M0   ] = &distributions[DIR_0M0   *size_Mat];
-			dist.f[DIR_00P   ] = &distributions[DIR_00P   *size_Mat];
-			dist.f[DIR_00M   ] = &distributions[DIR_00M   *size_Mat];
-			dist.f[DIR_PP0  ] = &distributions[DIR_PP0  *size_Mat];
-			dist.f[DIR_MM0  ] = &distributions[DIR_MM0  *size_Mat];
-			dist.f[DIR_PM0  ] = &distributions[DIR_PM0  *size_Mat];
-			dist.f[DIR_MP0  ] = &distributions[DIR_MP0  *size_Mat];
-			dist.f[DIR_P0P  ] = &distributions[DIR_P0P  *size_Mat];
-			dist.f[DIR_M0M  ] = &distributions[DIR_M0M  *size_Mat];
-			dist.f[DIR_P0M  ] = &distributions[DIR_P0M  *size_Mat];
-			dist.f[DIR_M0P  ] = &distributions[DIR_M0P  *size_Mat];
-			dist.f[DIR_0PP  ] = &distributions[DIR_0PP  *size_Mat];
-			dist.f[DIR_0MM  ] = &distributions[DIR_0MM  *size_Mat];
-			dist.f[DIR_0PM  ] = &distributions[DIR_0PM  *size_Mat];
-			dist.f[DIR_0MP  ] = &distributions[DIR_0MP  *size_Mat];
-			dist.f[DIR_000] = &distributions[DIR_000*size_Mat];
-			dist.f[DIR_PPP ] = &distributions[DIR_PPP *size_Mat];
-			dist.f[DIR_MMP ] = &distributions[DIR_MMP *size_Mat];
-			dist.f[DIR_PMP ] = &distributions[DIR_PMP *size_Mat];
-			dist.f[DIR_MPP ] = &distributions[DIR_MPP *size_Mat];
-			dist.f[DIR_PPM ] = &distributions[DIR_PPM *size_Mat];
-			dist.f[DIR_MMM ] = &distributions[DIR_MMM *size_Mat];
-			dist.f[DIR_PMM ] = &distributions[DIR_PMM *size_Mat];
-			dist.f[DIR_MPM ] = &distributions[DIR_MPM *size_Mat];
+			dist.f[DIR_P00] = &distributions[DIR_P00 * numberOfLBnodes];
+			dist.f[DIR_M00] = &distributions[DIR_M00 * numberOfLBnodes];
+			dist.f[DIR_0P0] = &distributions[DIR_0P0 * numberOfLBnodes];
+			dist.f[DIR_0M0] = &distributions[DIR_0M0 * numberOfLBnodes];
+			dist.f[DIR_00P] = &distributions[DIR_00P * numberOfLBnodes];
+			dist.f[DIR_00M] = &distributions[DIR_00M * numberOfLBnodes];
+			dist.f[DIR_PP0] = &distributions[DIR_PP0 * numberOfLBnodes];
+			dist.f[DIR_MM0] = &distributions[DIR_MM0 * numberOfLBnodes];
+			dist.f[DIR_PM0] = &distributions[DIR_PM0 * numberOfLBnodes];
+			dist.f[DIR_MP0] = &distributions[DIR_MP0 * numberOfLBnodes];
+			dist.f[DIR_P0P] = &distributions[DIR_P0P * numberOfLBnodes];
+			dist.f[DIR_M0M] = &distributions[DIR_M0M * numberOfLBnodes];
+			dist.f[DIR_P0M] = &distributions[DIR_P0M * numberOfLBnodes];
+			dist.f[DIR_M0P] = &distributions[DIR_M0P * numberOfLBnodes];
+			dist.f[DIR_0PP] = &distributions[DIR_0PP * numberOfLBnodes];
+			dist.f[DIR_0MM] = &distributions[DIR_0MM * numberOfLBnodes];
+			dist.f[DIR_0PM] = &distributions[DIR_0PM * numberOfLBnodes];
+			dist.f[DIR_0MP] = &distributions[DIR_0MP * numberOfLBnodes];
+			dist.f[DIR_000] = &distributions[DIR_000 * numberOfLBnodes];
+			dist.f[DIR_PPP] = &distributions[DIR_PPP * numberOfLBnodes];
+			dist.f[DIR_MMP] = &distributions[DIR_MMP * numberOfLBnodes];
+			dist.f[DIR_PMP] = &distributions[DIR_PMP * numberOfLBnodes];
+			dist.f[DIR_MPP] = &distributions[DIR_MPP * numberOfLBnodes];
+			dist.f[DIR_PPM] = &distributions[DIR_PPM * numberOfLBnodes];
+			dist.f[DIR_MMM] = &distributions[DIR_MMM * numberOfLBnodes];
+			dist.f[DIR_PMM] = &distributions[DIR_PMM * numberOfLBnodes];
+			dist.f[DIR_MPM] = &distributions[DIR_MPM * numberOfLBnodes];
 		}
 		else
 		{
-			dist.f[DIR_M00   ] = &distributions[DIR_P00   *size_Mat];
-			dist.f[DIR_P00   ] = &distributions[DIR_M00   *size_Mat];
-			dist.f[DIR_0M0   ] = &distributions[DIR_0P0   *size_Mat];
-			dist.f[DIR_0P0   ] = &distributions[DIR_0M0   *size_Mat];
-			dist.f[DIR_00M   ] = &distributions[DIR_00P   *size_Mat];
-			dist.f[DIR_00P   ] = &distributions[DIR_00M   *size_Mat];
-			dist.f[DIR_MM0  ] = &distributions[DIR_PP0  *size_Mat];
-			dist.f[DIR_PP0  ] = &distributions[DIR_MM0  *size_Mat];
-			dist.f[DIR_MP0  ] = &distributions[DIR_PM0  *size_Mat];
-			dist.f[DIR_PM0  ] = &distributions[DIR_MP0  *size_Mat];
-			dist.f[DIR_M0M  ] = &distributions[DIR_P0P  *size_Mat];
-			dist.f[DIR_P0P  ] = &distributions[DIR_M0M  *size_Mat];
-			dist.f[DIR_M0P  ] = &distributions[DIR_P0M  *size_Mat];
-			dist.f[DIR_P0M  ] = &distributions[DIR_M0P  *size_Mat];
-			dist.f[DIR_0MM  ] = &distributions[DIR_0PP  *size_Mat];
-			dist.f[DIR_0PP  ] = &distributions[DIR_0MM  *size_Mat];
-			dist.f[DIR_0MP  ] = &distributions[DIR_0PM  *size_Mat];
-			dist.f[DIR_0PM  ] = &distributions[DIR_0MP  *size_Mat];
-			dist.f[DIR_000] = &distributions[DIR_000*size_Mat];
-			dist.f[DIR_MMM ] = &distributions[DIR_PPP *size_Mat];
-			dist.f[DIR_PPM ] = &distributions[DIR_MMP *size_Mat];
-			dist.f[DIR_MPM ] = &distributions[DIR_PMP *size_Mat];
-			dist.f[DIR_PMM ] = &distributions[DIR_MPP *size_Mat];
-			dist.f[DIR_MMP ] = &distributions[DIR_PPM *size_Mat];
-			dist.f[DIR_PPP ] = &distributions[DIR_MMM *size_Mat];
-			dist.f[DIR_MPP ] = &distributions[DIR_PMM *size_Mat];
-			dist.f[DIR_PMP ] = &distributions[DIR_MPM *size_Mat];
+			dist.f[DIR_M00] = &distributions[DIR_P00 * numberOfLBnodes];
+			dist.f[DIR_P00] = &distributions[DIR_M00 * numberOfLBnodes];
+			dist.f[DIR_0M0] = &distributions[DIR_0P0 * numberOfLBnodes];
+			dist.f[DIR_0P0] = &distributions[DIR_0M0 * numberOfLBnodes];
+			dist.f[DIR_00M] = &distributions[DIR_00P * numberOfLBnodes];
+			dist.f[DIR_00P] = &distributions[DIR_00M * numberOfLBnodes];
+			dist.f[DIR_MM0] = &distributions[DIR_PP0 * numberOfLBnodes];
+			dist.f[DIR_PP0] = &distributions[DIR_MM0 * numberOfLBnodes];
+			dist.f[DIR_MP0] = &distributions[DIR_PM0 * numberOfLBnodes];
+			dist.f[DIR_PM0] = &distributions[DIR_MP0 * numberOfLBnodes];
+			dist.f[DIR_M0M] = &distributions[DIR_P0P * numberOfLBnodes];
+			dist.f[DIR_P0P] = &distributions[DIR_M0M * numberOfLBnodes];
+			dist.f[DIR_M0P] = &distributions[DIR_P0M * numberOfLBnodes];
+			dist.f[DIR_P0M] = &distributions[DIR_M0P * numberOfLBnodes];
+			dist.f[DIR_0MM] = &distributions[DIR_0PP * numberOfLBnodes];
+			dist.f[DIR_0PP] = &distributions[DIR_0MM * numberOfLBnodes];
+			dist.f[DIR_0MP] = &distributions[DIR_0PM * numberOfLBnodes];
+			dist.f[DIR_0PM] = &distributions[DIR_0MP * numberOfLBnodes];
+			dist.f[DIR_000] = &distributions[DIR_000 * numberOfLBnodes];
+			dist.f[DIR_MMM] = &distributions[DIR_PPP * numberOfLBnodes];
+			dist.f[DIR_PPM] = &distributions[DIR_MMP * numberOfLBnodes];
+			dist.f[DIR_MPM] = &distributions[DIR_PMP * numberOfLBnodes];
+			dist.f[DIR_PMM] = &distributions[DIR_MPP * numberOfLBnodes];
+			dist.f[DIR_MMP] = &distributions[DIR_PPM * numberOfLBnodes];
+			dist.f[DIR_PPP] = &distributions[DIR_MMM * numberOfLBnodes];
+			dist.f[DIR_MPP] = &distributions[DIR_PMM * numberOfLBnodes];
+			dist.f[DIR_PMP] = &distributions[DIR_MPM * numberOfLBnodes];
 		}
 		////////////////////////////////////////////////////////////////////////////////
 		Distributions27 distAD;
 		if (isEvenTimestep)
 		{
-			distAD.f[DIR_P00   ] = &distributionsAD[DIR_P00   *size_Mat];
-			distAD.f[DIR_M00   ] = &distributionsAD[DIR_M00   *size_Mat];
-			distAD.f[DIR_0P0   ] = &distributionsAD[DIR_0P0   *size_Mat];
-			distAD.f[DIR_0M0   ] = &distributionsAD[DIR_0M0   *size_Mat];
-			distAD.f[DIR_00P   ] = &distributionsAD[DIR_00P   *size_Mat];
-			distAD.f[DIR_00M   ] = &distributionsAD[DIR_00M   *size_Mat];
-			distAD.f[DIR_PP0  ] = &distributionsAD[DIR_PP0  *size_Mat];
-			distAD.f[DIR_MM0  ] = &distributionsAD[DIR_MM0  *size_Mat];
-			distAD.f[DIR_PM0  ] = &distributionsAD[DIR_PM0  *size_Mat];
-			distAD.f[DIR_MP0  ] = &distributionsAD[DIR_MP0  *size_Mat];
-			distAD.f[DIR_P0P  ] = &distributionsAD[DIR_P0P  *size_Mat];
-			distAD.f[DIR_M0M  ] = &distributionsAD[DIR_M0M  *size_Mat];
-			distAD.f[DIR_P0M  ] = &distributionsAD[DIR_P0M  *size_Mat];
-			distAD.f[DIR_M0P  ] = &distributionsAD[DIR_M0P  *size_Mat];
-			distAD.f[DIR_0PP  ] = &distributionsAD[DIR_0PP  *size_Mat];
-			distAD.f[DIR_0MM  ] = &distributionsAD[DIR_0MM  *size_Mat];
-			distAD.f[DIR_0PM  ] = &distributionsAD[DIR_0PM  *size_Mat];
-			distAD.f[DIR_0MP  ] = &distributionsAD[DIR_0MP  *size_Mat];
-			distAD.f[DIR_000] = &distributionsAD[DIR_000*size_Mat];
-			distAD.f[DIR_PPP ] = &distributionsAD[DIR_PPP *size_Mat];
-			distAD.f[DIR_MMP ] = &distributionsAD[DIR_MMP *size_Mat];
-			distAD.f[DIR_PMP ] = &distributionsAD[DIR_PMP *size_Mat];
-			distAD.f[DIR_MPP ] = &distributionsAD[DIR_MPP *size_Mat];
-			distAD.f[DIR_PPM ] = &distributionsAD[DIR_PPM *size_Mat];
-			distAD.f[DIR_MMM ] = &distributionsAD[DIR_MMM *size_Mat];
-			distAD.f[DIR_PMM ] = &distributionsAD[DIR_PMM *size_Mat];
-			distAD.f[DIR_MPM ] = &distributionsAD[DIR_MPM *size_Mat];
+			distAD.f[DIR_P00] = &distributionsAD[DIR_P00 * numberOfLBnodes];
+			distAD.f[DIR_M00] = &distributionsAD[DIR_M00 * numberOfLBnodes];
+			distAD.f[DIR_0P0] = &distributionsAD[DIR_0P0 * numberOfLBnodes];
+			distAD.f[DIR_0M0] = &distributionsAD[DIR_0M0 * numberOfLBnodes];
+			distAD.f[DIR_00P] = &distributionsAD[DIR_00P * numberOfLBnodes];
+			distAD.f[DIR_00M] = &distributionsAD[DIR_00M * numberOfLBnodes];
+			distAD.f[DIR_PP0] = &distributionsAD[DIR_PP0 * numberOfLBnodes];
+			distAD.f[DIR_MM0] = &distributionsAD[DIR_MM0 * numberOfLBnodes];
+			distAD.f[DIR_PM0] = &distributionsAD[DIR_PM0 * numberOfLBnodes];
+			distAD.f[DIR_MP0] = &distributionsAD[DIR_MP0 * numberOfLBnodes];
+			distAD.f[DIR_P0P] = &distributionsAD[DIR_P0P * numberOfLBnodes];
+			distAD.f[DIR_M0M] = &distributionsAD[DIR_M0M * numberOfLBnodes];
+			distAD.f[DIR_P0M] = &distributionsAD[DIR_P0M * numberOfLBnodes];
+			distAD.f[DIR_M0P] = &distributionsAD[DIR_M0P * numberOfLBnodes];
+			distAD.f[DIR_0PP] = &distributionsAD[DIR_0PP * numberOfLBnodes];
+			distAD.f[DIR_0MM] = &distributionsAD[DIR_0MM * numberOfLBnodes];
+			distAD.f[DIR_0PM] = &distributionsAD[DIR_0PM * numberOfLBnodes];
+			distAD.f[DIR_0MP] = &distributionsAD[DIR_0MP * numberOfLBnodes];
+			distAD.f[DIR_000] = &distributionsAD[DIR_000 * numberOfLBnodes];
+			distAD.f[DIR_PPP] = &distributionsAD[DIR_PPP * numberOfLBnodes];
+			distAD.f[DIR_MMP] = &distributionsAD[DIR_MMP * numberOfLBnodes];
+			distAD.f[DIR_PMP] = &distributionsAD[DIR_PMP * numberOfLBnodes];
+			distAD.f[DIR_MPP] = &distributionsAD[DIR_MPP * numberOfLBnodes];
+			distAD.f[DIR_PPM] = &distributionsAD[DIR_PPM * numberOfLBnodes];
+			distAD.f[DIR_MMM] = &distributionsAD[DIR_MMM * numberOfLBnodes];
+			distAD.f[DIR_PMM] = &distributionsAD[DIR_PMM * numberOfLBnodes];
+			distAD.f[DIR_MPM] = &distributionsAD[DIR_MPM * numberOfLBnodes];
 		}
 		else
 		{
-			distAD.f[DIR_M00   ] = &distributionsAD[DIR_P00   *size_Mat];
-			distAD.f[DIR_P00   ] = &distributionsAD[DIR_M00   *size_Mat];
-			distAD.f[DIR_0M0   ] = &distributionsAD[DIR_0P0   *size_Mat];
-			distAD.f[DIR_0P0   ] = &distributionsAD[DIR_0M0   *size_Mat];
-			distAD.f[DIR_00M   ] = &distributionsAD[DIR_00P   *size_Mat];
-			distAD.f[DIR_00P   ] = &distributionsAD[DIR_00M   *size_Mat];
-			distAD.f[DIR_MM0  ] = &distributionsAD[DIR_PP0  *size_Mat];
-			distAD.f[DIR_PP0  ] = &distributionsAD[DIR_MM0  *size_Mat];
-			distAD.f[DIR_MP0  ] = &distributionsAD[DIR_PM0  *size_Mat];
-			distAD.f[DIR_PM0  ] = &distributionsAD[DIR_MP0  *size_Mat];
-			distAD.f[DIR_M0M  ] = &distributionsAD[DIR_P0P  *size_Mat];
-			distAD.f[DIR_P0P  ] = &distributionsAD[DIR_M0M  *size_Mat];
-			distAD.f[DIR_M0P  ] = &distributionsAD[DIR_P0M  *size_Mat];
-			distAD.f[DIR_P0M  ] = &distributionsAD[DIR_M0P  *size_Mat];
-			distAD.f[DIR_0MM  ] = &distributionsAD[DIR_0PP  *size_Mat];
-			distAD.f[DIR_0PP  ] = &distributionsAD[DIR_0MM  *size_Mat];
-			distAD.f[DIR_0MP  ] = &distributionsAD[DIR_0PM  *size_Mat];
-			distAD.f[DIR_0PM  ] = &distributionsAD[DIR_0MP  *size_Mat];
-			distAD.f[DIR_000] = &distributionsAD[DIR_000*size_Mat];
-			distAD.f[DIR_MMM ] = &distributionsAD[DIR_PPP *size_Mat];
-			distAD.f[DIR_PPM ] = &distributionsAD[DIR_MMP *size_Mat];
-			distAD.f[DIR_MPM ] = &distributionsAD[DIR_PMP *size_Mat];
-			distAD.f[DIR_PMM ] = &distributionsAD[DIR_MPP *size_Mat];
-			distAD.f[DIR_MMP ] = &distributionsAD[DIR_PPM *size_Mat];
-			distAD.f[DIR_PPP ] = &distributionsAD[DIR_MMM *size_Mat];
-			distAD.f[DIR_MPP ] = &distributionsAD[DIR_PMM *size_Mat];
-			distAD.f[DIR_PMP ] = &distributionsAD[DIR_MPM *size_Mat];
+			distAD.f[DIR_M00] = &distributionsAD[DIR_P00 * numberOfLBnodes];
+			distAD.f[DIR_P00] = &distributionsAD[DIR_M00 * numberOfLBnodes];
+			distAD.f[DIR_0M0] = &distributionsAD[DIR_0P0 * numberOfLBnodes];
+			distAD.f[DIR_0P0] = &distributionsAD[DIR_0M0 * numberOfLBnodes];
+			distAD.f[DIR_00M] = &distributionsAD[DIR_00P * numberOfLBnodes];
+			distAD.f[DIR_00P] = &distributionsAD[DIR_00M * numberOfLBnodes];
+			distAD.f[DIR_MM0] = &distributionsAD[DIR_PP0 * numberOfLBnodes];
+			distAD.f[DIR_PP0] = &distributionsAD[DIR_MM0 * numberOfLBnodes];
+			distAD.f[DIR_MP0] = &distributionsAD[DIR_PM0 * numberOfLBnodes];
+			distAD.f[DIR_PM0] = &distributionsAD[DIR_MP0 * numberOfLBnodes];
+			distAD.f[DIR_M0M] = &distributionsAD[DIR_P0P * numberOfLBnodes];
+			distAD.f[DIR_P0P] = &distributionsAD[DIR_M0M * numberOfLBnodes];
+			distAD.f[DIR_M0P] = &distributionsAD[DIR_P0M * numberOfLBnodes];
+			distAD.f[DIR_P0M] = &distributionsAD[DIR_M0P * numberOfLBnodes];
+			distAD.f[DIR_0MM] = &distributionsAD[DIR_0PP * numberOfLBnodes];
+			distAD.f[DIR_0PP] = &distributionsAD[DIR_0MM * numberOfLBnodes];
+			distAD.f[DIR_0MP] = &distributionsAD[DIR_0PM * numberOfLBnodes];
+			distAD.f[DIR_0PM] = &distributionsAD[DIR_0MP * numberOfLBnodes];
+			distAD.f[DIR_000] = &distributionsAD[DIR_000 * numberOfLBnodes];
+			distAD.f[DIR_MMM] = &distributionsAD[DIR_PPP * numberOfLBnodes];
+			distAD.f[DIR_PPM] = &distributionsAD[DIR_MMP * numberOfLBnodes];
+			distAD.f[DIR_MPM] = &distributionsAD[DIR_PMP * numberOfLBnodes];
+			distAD.f[DIR_PMM] = &distributionsAD[DIR_MPP * numberOfLBnodes];
+			distAD.f[DIR_MMP] = &distributionsAD[DIR_PPM * numberOfLBnodes];
+			distAD.f[DIR_PPP] = &distributionsAD[DIR_MMM * numberOfLBnodes];
+			distAD.f[DIR_MPP] = &distributionsAD[DIR_PMM * numberOfLBnodes];
+			distAD.f[DIR_PMP] = &distributionsAD[DIR_MPM * numberOfLBnodes];
 		}
 		////////////////////////////////////////////////////////////////////////////////
 		//! - Set neighbor indices (necessary for indirect addressing)
@@ -241,63 +241,63 @@ __global__ void Factorized_Central_Moments_Advection_Diffusion_Device_Kernel(
 		////////////////////////////////////////////////////////////////////////////////////
 		//! - Set local distributions Fluid
 		//!
-		real fcbb = (dist.f[DIR_P00   ])[k];
-		real fabb = (dist.f[DIR_M00   ])[kw];
-		real fbcb = (dist.f[DIR_0P0   ])[k];
-		real fbab = (dist.f[DIR_0M0   ])[ks];
-		real fbbc = (dist.f[DIR_00P   ])[k];
-		real fbba = (dist.f[DIR_00M   ])[kb];
-		real fccb = (dist.f[DIR_PP0  ])[k];
-		real faab = (dist.f[DIR_MM0  ])[ksw];
-		real fcab = (dist.f[DIR_PM0  ])[ks];
-		real facb = (dist.f[DIR_MP0  ])[kw];
-		real fcbc = (dist.f[DIR_P0P  ])[k];
-		real faba = (dist.f[DIR_M0M  ])[kbw];
-		real fcba = (dist.f[DIR_P0M  ])[kb];
-		real fabc = (dist.f[DIR_M0P  ])[kw];
-		real fbcc = (dist.f[DIR_0PP  ])[k];
-		real fbaa = (dist.f[DIR_0MM  ])[kbs];
-		real fbca = (dist.f[DIR_0PM  ])[kb];
-		real fbac = (dist.f[DIR_0MP  ])[ks];
+		real fcbb = (dist.f[DIR_P00])[k];
+		real fabb = (dist.f[DIR_M00])[kw];
+		real fbcb = (dist.f[DIR_0P0])[k];
+		real fbab = (dist.f[DIR_0M0])[ks];
+		real fbbc = (dist.f[DIR_00P])[k];
+		real fbba = (dist.f[DIR_00M])[kb];
+		real fccb = (dist.f[DIR_PP0])[k];
+		real faab = (dist.f[DIR_MM0])[ksw];
+		real fcab = (dist.f[DIR_PM0])[ks];
+		real facb = (dist.f[DIR_MP0])[kw];
+		real fcbc = (dist.f[DIR_P0P])[k];
+		real faba = (dist.f[DIR_M0M])[kbw];
+		real fcba = (dist.f[DIR_P0M])[kb];
+		real fabc = (dist.f[DIR_M0P])[kw];
+		real fbcc = (dist.f[DIR_0PP])[k];
+		real fbaa = (dist.f[DIR_0MM])[kbs];
+		real fbca = (dist.f[DIR_0PM])[kb];
+		real fbac = (dist.f[DIR_0MP])[ks];
 		real fbbb = (dist.f[DIR_000])[k];
-		real fccc = (dist.f[DIR_PPP ])[k];
-		real faac = (dist.f[DIR_MMP ])[ksw];
-		real fcac = (dist.f[DIR_PMP ])[ks];
-		real facc = (dist.f[DIR_MPP ])[kw];
-		real fcca = (dist.f[DIR_PPM ])[kb];
-		real faaa = (dist.f[DIR_MMM ])[kbsw];
-		real fcaa = (dist.f[DIR_PMM ])[kbs];
-		real faca = (dist.f[DIR_MPM ])[kbw];
+		real fccc = (dist.f[DIR_PPP])[k];
+		real faac = (dist.f[DIR_MMP])[ksw];
+		real fcac = (dist.f[DIR_PMP])[ks];
+		real facc = (dist.f[DIR_MPP])[kw];
+		real fcca = (dist.f[DIR_PPM])[kb];
+		real faaa = (dist.f[DIR_MMM])[kbsw];
+		real fcaa = (dist.f[DIR_PMM])[kbs];
+		real faca = (dist.f[DIR_MPM])[kbw];
 		////////////////////////////////////////////////////////////////////////////////////
 		//! - Set local distributions Advection Diffusion
 		//!
-		real mfcbb = (distAD.f[DIR_P00   ])[k];
-		real mfabb = (distAD.f[DIR_M00   ])[kw];
-		real mfbcb = (distAD.f[DIR_0P0   ])[k];
-		real mfbab = (distAD.f[DIR_0M0   ])[ks];
-		real mfbbc = (distAD.f[DIR_00P   ])[k];
-		real mfbba = (distAD.f[DIR_00M   ])[kb];
-		real mfccb = (distAD.f[DIR_PP0  ])[k];
-		real mfaab = (distAD.f[DIR_MM0  ])[ksw];
-		real mfcab = (distAD.f[DIR_PM0  ])[ks];
-		real mfacb = (distAD.f[DIR_MP0  ])[kw];
-		real mfcbc = (distAD.f[DIR_P0P  ])[k];
-		real mfaba = (distAD.f[DIR_M0M  ])[kbw];
-		real mfcba = (distAD.f[DIR_P0M  ])[kb];
-		real mfabc = (distAD.f[DIR_M0P  ])[kw];
-		real mfbcc = (distAD.f[DIR_0PP  ])[k];
-		real mfbaa = (distAD.f[DIR_0MM  ])[kbs];
-		real mfbca = (distAD.f[DIR_0PM  ])[kb];
-		real mfbac = (distAD.f[DIR_0MP  ])[ks];
+		real mfcbb = (distAD.f[DIR_P00])[k];
+		real mfabb = (distAD.f[DIR_M00])[kw];
+		real mfbcb = (distAD.f[DIR_0P0])[k];
+		real mfbab = (distAD.f[DIR_0M0])[ks];
+		real mfbbc = (distAD.f[DIR_00P])[k];
+		real mfbba = (distAD.f[DIR_00M])[kb];
+		real mfccb = (distAD.f[DIR_PP0])[k];
+		real mfaab = (distAD.f[DIR_MM0])[ksw];
+		real mfcab = (distAD.f[DIR_PM0])[ks];
+		real mfacb = (distAD.f[DIR_MP0])[kw];
+		real mfcbc = (distAD.f[DIR_P0P])[k];
+		real mfaba = (distAD.f[DIR_M0M])[kbw];
+		real mfcba = (distAD.f[DIR_P0M])[kb];
+		real mfabc = (distAD.f[DIR_M0P])[kw];
+		real mfbcc = (distAD.f[DIR_0PP])[k];
+		real mfbaa = (distAD.f[DIR_0MM])[kbs];
+		real mfbca = (distAD.f[DIR_0PM])[kb];
+		real mfbac = (distAD.f[DIR_0MP])[ks];
 		real mfbbb = (distAD.f[DIR_000])[k];
-		real mfccc = (distAD.f[DIR_PPP ])[k];
-		real mfaac = (distAD.f[DIR_MMP ])[ksw];
-		real mfcac = (distAD.f[DIR_PMP ])[ks];
-		real mfacc = (distAD.f[DIR_MPP ])[kw];
-		real mfcca = (distAD.f[DIR_PPM ])[kb];
-		real mfaaa = (distAD.f[DIR_MMM ])[kbsw];
-		real mfcaa = (distAD.f[DIR_PMM ])[kbs];
-		real mfaca = (distAD.f[DIR_MPM ])[kbw];
+		real mfccc = (distAD.f[DIR_PPP])[k];
+		real mfaac = (distAD.f[DIR_MMP])[ksw];
+		real mfcac = (distAD.f[DIR_PMP])[ks];
+		real mfacc = (distAD.f[DIR_MPP])[kw];
+		real mfcca = (distAD.f[DIR_PPM])[kb];
+		real mfaaa = (distAD.f[DIR_MMM])[kbsw];
+		real mfcaa = (distAD.f[DIR_PMM])[kbs];
+		real mfaca = (distAD.f[DIR_MPM])[kbw];
 		////////////////////////////////////////////////////////////////////////////////////
 		//! - Calculate density and velocity using pyramid summation for low round-off errors as in Eq. (J1)-(J3) \ref
 		//! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015), DOI:10.1016/j.camwa.2015.05.001 ]</b></a>
@@ -503,33 +503,33 @@ __global__ void Factorized_Central_Moments_Advection_Diffusion_Device_Kernel(
 		//! stored arrays dependent on timestep is based on the esoteric twist algorithm
 		//! <a href="https://doi.org/10.3390/computation5020019"><b>[ M. Geier et al. (2017), DOI:10.3390/computation5020019 ]</b></a>
 		//!
-		(distAD.f[DIR_P00   ])[k   ] = mfabb;
-		(distAD.f[DIR_M00   ])[kw  ] = mfcbb;
-		(distAD.f[DIR_0P0   ])[k   ] = mfbab;
-		(distAD.f[DIR_0M0   ])[ks  ] = mfbcb;
-		(distAD.f[DIR_00P   ])[k   ] = mfbba;
-		(distAD.f[DIR_00M   ])[kb  ] = mfbbc;
-		(distAD.f[DIR_PP0  ])[k   ] = mfaab;
-		(distAD.f[DIR_MM0  ])[ksw ] = mfccb;
-		(distAD.f[DIR_PM0  ])[ks  ] = mfacb;
-		(distAD.f[DIR_MP0  ])[kw  ] = mfcab;
-		(distAD.f[DIR_P0P  ])[k   ] = mfaba;
-		(distAD.f[DIR_M0M  ])[kbw ] = mfcbc;
-		(distAD.f[DIR_P0M  ])[kb  ] = mfabc;
-		(distAD.f[DIR_M0P  ])[kw  ] = mfcba;
-		(distAD.f[DIR_0PP  ])[k   ] = mfbaa;
-		(distAD.f[DIR_0MM  ])[kbs ] = mfbcc;
-		(distAD.f[DIR_0PM  ])[kb  ] = mfbac;
-		(distAD.f[DIR_0MP  ])[ks  ] = mfbca;
+		(distAD.f[DIR_P00])[k   ] = mfabb;
+		(distAD.f[DIR_M00])[kw  ] = mfcbb;
+		(distAD.f[DIR_0P0])[k   ] = mfbab;
+		(distAD.f[DIR_0M0])[ks  ] = mfbcb;
+		(distAD.f[DIR_00P])[k   ] = mfbba;
+		(distAD.f[DIR_00M])[kb  ] = mfbbc;
+		(distAD.f[DIR_PP0])[k   ] = mfaab;
+		(distAD.f[DIR_MM0])[ksw ] = mfccb;
+		(distAD.f[DIR_PM0])[ks  ] = mfacb;
+		(distAD.f[DIR_MP0])[kw  ] = mfcab;
+		(distAD.f[DIR_P0P])[k   ] = mfaba;
+		(distAD.f[DIR_M0M])[kbw ] = mfcbc;
+		(distAD.f[DIR_P0M])[kb  ] = mfabc;
+		(distAD.f[DIR_M0P])[kw  ] = mfcba;
+		(distAD.f[DIR_0PP])[k   ] = mfbaa;
+		(distAD.f[DIR_0MM])[kbs ] = mfbcc;
+		(distAD.f[DIR_0PM])[kb  ] = mfbac;
+		(distAD.f[DIR_0MP])[ks  ] = mfbca;
 		(distAD.f[DIR_000])[k   ] = mfbbb;
-		(distAD.f[DIR_PPP ])[k   ] = mfaaa;
-		(distAD.f[DIR_PMP ])[ks  ] = mfaca;
-		(distAD.f[DIR_PPM ])[kb  ] = mfaac;
-		(distAD.f[DIR_PMM ])[kbs ] = mfacc;
-		(distAD.f[DIR_MPP ])[kw  ] = mfcaa;
-		(distAD.f[DIR_MMP ])[ksw ] = mfcca;
-		(distAD.f[DIR_MPM ])[kbw ] = mfcac;
-		(distAD.f[DIR_MMM ])[kbsw] = mfccc;
+		(distAD.f[DIR_PPP])[k   ] = mfaaa;
+		(distAD.f[DIR_PMP])[ks  ] = mfaca;
+		(distAD.f[DIR_PPM])[kb  ] = mfaac;
+		(distAD.f[DIR_PMM])[kbs ] = mfacc;
+		(distAD.f[DIR_MPP])[kw  ] = mfcaa;
+		(distAD.f[DIR_MMP])[ksw ] = mfcca;
+		(distAD.f[DIR_MPM])[kbw ] = mfcac;
+		(distAD.f[DIR_MMM])[kbsw] = mfccc;
 	}
 }
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/src/gpu/VirtualFluids_GPU/GPU/AdvectionDiffusionBCs27.cu b/src/gpu/VirtualFluids_GPU/GPU/AdvectionDiffusionBCs27.cu
index ecf98a7494a0a5e1c81c1040917e941f066605e6..116ce20389985e0efa650598108224b2e3e25221 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/AdvectionDiffusionBCs27.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/AdvectionDiffusionBCs27.cu
@@ -20,91 +20,91 @@ __global__ void QADPress7(  real* DD,
                                        unsigned int* neighborX,
                                        unsigned int* neighborY,
                                        unsigned int* neighborZ,
-                                       unsigned int size_Mat, 
+                                       unsigned long long numberOfLBnodes, 
                                        bool isEvenTimestep)
 {
    Distributions27 D;
    if (isEvenTimestep==true)
    {
-      D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+      D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
    } 
    else
    {
-      D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+      D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
    }
 
    Distributions7 D7;
    if (isEvenTimestep==true)
    {
-      D7.f[0] = &DD7[0*size_Mat];
-      D7.f[1] = &DD7[1*size_Mat];
-      D7.f[2] = &DD7[2*size_Mat];
-      D7.f[3] = &DD7[3*size_Mat];
-      D7.f[4] = &DD7[4*size_Mat];
-      D7.f[5] = &DD7[5*size_Mat];
-      D7.f[6] = &DD7[6*size_Mat];
+      D7.f[0] = &DD7[0*numberOfLBnodes];
+      D7.f[1] = &DD7[1*numberOfLBnodes];
+      D7.f[2] = &DD7[2*numberOfLBnodes];
+      D7.f[3] = &DD7[3*numberOfLBnodes];
+      D7.f[4] = &DD7[4*numberOfLBnodes];
+      D7.f[5] = &DD7[5*numberOfLBnodes];
+      D7.f[6] = &DD7[6*numberOfLBnodes];
    }
    else
    {
-      D7.f[0] = &DD7[0*size_Mat];
-      D7.f[2] = &DD7[1*size_Mat];
-      D7.f[1] = &DD7[2*size_Mat];
-      D7.f[4] = &DD7[3*size_Mat];
-      D7.f[3] = &DD7[4*size_Mat];
-      D7.f[6] = &DD7[5*size_Mat];
-      D7.f[5] = &DD7[6*size_Mat];
+      D7.f[0] = &DD7[0*numberOfLBnodes];
+      D7.f[2] = &DD7[1*numberOfLBnodes];
+      D7.f[1] = &DD7[2*numberOfLBnodes];
+      D7.f[4] = &DD7[3*numberOfLBnodes];
+      D7.f[3] = &DD7[4*numberOfLBnodes];
+      D7.f[6] = &DD7[5*numberOfLBnodes];
+      D7.f[5] = &DD7[6*numberOfLBnodes];
    }
 
 
@@ -128,24 +128,24 @@ __global__ void QADPress7(  real* DD,
       //         *q_dirTNE, *q_dirTSW, *q_dirTSE, *q_dirTNW, *q_dirBNE, *q_dirBSW,
       //         *q_dirBSE, *q_dirBNW;
 
-      q_dirE   = &QQ[DIR_P00   * numberOfBCnodes];
-      q_dirW   = &QQ[DIR_M00   * numberOfBCnodes];
-      q_dirN   = &QQ[DIR_0P0   * numberOfBCnodes];
-      q_dirS   = &QQ[DIR_0M0   * numberOfBCnodes];
-      q_dirT   = &QQ[DIR_00P   * numberOfBCnodes];
-      q_dirB   = &QQ[DIR_00M   * numberOfBCnodes];
-      //q_dirNE  = &QQ[DIR_PP0  * numberOfBCnodes];
-      //q_dirSW  = &QQ[DIR_MM0  * numberOfBCnodes];
-      //q_dirSE  = &QQ[DIR_PM0  * numberOfBCnodes];
-      //q_dirNW  = &QQ[DIR_MP0  * numberOfBCnodes];
-      //q_dirTE  = &QQ[DIR_P0P  * numberOfBCnodes];
-      //q_dirBW  = &QQ[DIR_M0M  * numberOfBCnodes];
-      //q_dirBE  = &QQ[DIR_P0M  * numberOfBCnodes];
-      //q_dirTW  = &QQ[DIR_M0P  * numberOfBCnodes];
-      //q_dirTN  = &QQ[DIR_0PP  * numberOfBCnodes];
-      //q_dirBS  = &QQ[DIR_0MM  * numberOfBCnodes];
-      //q_dirBN  = &QQ[DIR_0PM  * numberOfBCnodes];
-      //q_dirTS  = &QQ[DIR_0MP  * numberOfBCnodes];
+      q_dirE   = &QQ[DIR_P00 * numberOfBCnodes];
+      q_dirW   = &QQ[DIR_M00 * numberOfBCnodes];
+      q_dirN   = &QQ[DIR_0P0 * numberOfBCnodes];
+      q_dirS   = &QQ[DIR_0M0 * numberOfBCnodes];
+      q_dirT   = &QQ[DIR_00P * numberOfBCnodes];
+      q_dirB   = &QQ[DIR_00M * numberOfBCnodes];
+      //q_dirNE  = &QQ[DIR_PP0 * numberOfBCnodes];
+      //q_dirSW  = &QQ[DIR_MM0 * numberOfBCnodes];
+      //q_dirSE  = &QQ[DIR_PM0 * numberOfBCnodes];
+      //q_dirNW  = &QQ[DIR_MP0 * numberOfBCnodes];
+      //q_dirTE  = &QQ[DIR_P0P * numberOfBCnodes];
+      //q_dirBW  = &QQ[DIR_M0M * numberOfBCnodes];
+      //q_dirBE  = &QQ[DIR_P0M * numberOfBCnodes];
+      //q_dirTW  = &QQ[DIR_M0P * numberOfBCnodes];
+      //q_dirTN  = &QQ[DIR_0PP * numberOfBCnodes];
+      //q_dirBS  = &QQ[DIR_0MM * numberOfBCnodes];
+      //q_dirBN  = &QQ[DIR_0PM * numberOfBCnodes];
+      //q_dirTS  = &QQ[DIR_0MP * numberOfBCnodes];
       //q_dirTNE = &QQ[DIR_PPP * numberOfBCnodes];
       //q_dirTSW = &QQ[DIR_MMP * numberOfBCnodes];
       //q_dirTSE = &QQ[DIR_PMP * numberOfBCnodes];
@@ -188,32 +188,32 @@ __global__ void QADPress7(  real* DD,
       real f_E,  f_W,  f_N,  f_S,  f_T,  f_B,   f_NE,  f_SW,  f_SE,  f_NW,  f_TE,  f_BW,  f_BE,
          f_TW, f_TN, f_BS, f_BN, f_TS, f_TNE, f_TSW, f_TSE, f_TNW, f_BNE, f_BSW, f_BSE, f_BNW;
 
-      f_W    = (D.f[DIR_P00   ])[ke   ];
-      f_E    = (D.f[DIR_M00   ])[kw   ];
-      f_S    = (D.f[DIR_0P0   ])[kn   ];
-      f_N    = (D.f[DIR_0M0   ])[ks   ];
-      f_B    = (D.f[DIR_00P   ])[kt   ];
-      f_T    = (D.f[DIR_00M   ])[kb   ];
-      f_SW   = (D.f[DIR_PP0  ])[kne  ];
-      f_NE   = (D.f[DIR_MM0  ])[ksw  ];
-      f_NW   = (D.f[DIR_PM0  ])[kse  ];
-      f_SE   = (D.f[DIR_MP0  ])[knw  ];
-      f_BW   = (D.f[DIR_P0P  ])[kte  ];
-      f_TE   = (D.f[DIR_M0M  ])[kbw  ];
-      f_TW   = (D.f[DIR_P0M  ])[kbe  ];
-      f_BE   = (D.f[DIR_M0P  ])[ktw  ];
-      f_BS   = (D.f[DIR_0PP  ])[ktn  ];
-      f_TN   = (D.f[DIR_0MM  ])[kbs  ];
-      f_TS   = (D.f[DIR_0PM  ])[kbn  ];
-      f_BN   = (D.f[DIR_0MP  ])[kts  ];
-      f_BSW  = (D.f[DIR_PPP ])[ktne ];
-      f_BNE  = (D.f[DIR_MMP ])[ktsw ];
-      f_BNW  = (D.f[DIR_PMP ])[ktse ];
-      f_BSE  = (D.f[DIR_MPP ])[ktnw ];
-      f_TSW  = (D.f[DIR_PPM ])[kbne ];
-      f_TNE  = (D.f[DIR_MMM ])[kbsw ];
-      f_TNW  = (D.f[DIR_PMM ])[kbse ];
-      f_TSE  = (D.f[DIR_MPM ])[kbnw ];
+      f_W    = (D.f[DIR_P00])[ke   ];
+      f_E    = (D.f[DIR_M00])[kw   ];
+      f_S    = (D.f[DIR_0P0])[kn   ];
+      f_N    = (D.f[DIR_0M0])[ks   ];
+      f_B    = (D.f[DIR_00P])[kt   ];
+      f_T    = (D.f[DIR_00M])[kb   ];
+      f_SW   = (D.f[DIR_PP0])[kne  ];
+      f_NE   = (D.f[DIR_MM0])[ksw  ];
+      f_NW   = (D.f[DIR_PM0])[kse  ];
+      f_SE   = (D.f[DIR_MP0])[knw  ];
+      f_BW   = (D.f[DIR_P0P])[kte  ];
+      f_TE   = (D.f[DIR_M0M])[kbw  ];
+      f_TW   = (D.f[DIR_P0M])[kbe  ];
+      f_BE   = (D.f[DIR_M0P])[ktw  ];
+      f_BS   = (D.f[DIR_0PP])[ktn  ];
+      f_TN   = (D.f[DIR_0MM])[kbs  ];
+      f_TS   = (D.f[DIR_0PM])[kbn  ];
+      f_BN   = (D.f[DIR_0MP])[kts  ];
+      f_BSW  = (D.f[DIR_PPP])[ktne ];
+      f_BNE  = (D.f[DIR_MMP])[ktsw ];
+      f_BNW  = (D.f[DIR_PMP])[ktse ];
+      f_BSE  = (D.f[DIR_MPP])[ktnw ];
+      f_TSW  = (D.f[DIR_PPM])[kbne ];
+      f_TNE  = (D.f[DIR_MMM])[kbsw ];
+      f_TNW  = (D.f[DIR_PMM])[kbse ];
+      f_TSE  = (D.f[DIR_MPM])[kbnw ];
       ////////////////////////////////////////////////////////////////////////////////
       /*real drho*/;
       //real vx1_Inflow   = zero;
@@ -293,23 +293,23 @@ __global__ void QADPress7(  real* DD,
       //pointertausch
       if (isEvenTimestep==false)
       {
-         D7.f[0] = &DD7[0*size_Mat];
-         D7.f[1] = &DD7[1*size_Mat];
-         D7.f[2] = &DD7[2*size_Mat];
-         D7.f[3] = &DD7[3*size_Mat];
-         D7.f[4] = &DD7[4*size_Mat];
-         D7.f[5] = &DD7[5*size_Mat];
-         D7.f[6] = &DD7[6*size_Mat];
+         D7.f[0] = &DD7[0*numberOfLBnodes];
+         D7.f[1] = &DD7[1*numberOfLBnodes];
+         D7.f[2] = &DD7[2*numberOfLBnodes];
+         D7.f[3] = &DD7[3*numberOfLBnodes];
+         D7.f[4] = &DD7[4*numberOfLBnodes];
+         D7.f[5] = &DD7[5*numberOfLBnodes];
+         D7.f[6] = &DD7[6*numberOfLBnodes];
       }
       else
       {
-         D7.f[0] = &DD7[0*size_Mat];
-         D7.f[2] = &DD7[1*size_Mat];
-         D7.f[1] = &DD7[2*size_Mat];
-         D7.f[4] = &DD7[3*size_Mat];
-         D7.f[3] = &DD7[4*size_Mat];
-         D7.f[6] = &DD7[5*size_Mat];
-         D7.f[5] = &DD7[6*size_Mat];
+         D7.f[0] = &DD7[0*numberOfLBnodes];
+         D7.f[2] = &DD7[1*numberOfLBnodes];
+         D7.f[1] = &DD7[2*numberOfLBnodes];
+         D7.f[4] = &DD7[3*numberOfLBnodes];
+         D7.f[3] = &DD7[4*numberOfLBnodes];
+         D7.f[6] = &DD7[5*numberOfLBnodes];
+         D7.f[5] = &DD7[6*numberOfLBnodes];
       }
 
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -461,131 +461,131 @@ __global__ void QADPress27( real* DD,
                                        unsigned int* neighborX,
                                        unsigned int* neighborY,
                                        unsigned int* neighborZ,
-                                       unsigned int size_Mat, 
+                                       unsigned long long numberOfLBnodes, 
                                        bool isEvenTimestep)
 {
    Distributions27 D;
    if (isEvenTimestep==true)
    {
-      D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+      D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
    } 
    else
    {
-      D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+      D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
    }
 
    Distributions27 D27;
    if (isEvenTimestep==true)
    {
-      D27.f[DIR_P00   ] = &DD27[DIR_P00   *size_Mat];
-      D27.f[DIR_M00   ] = &DD27[DIR_M00   *size_Mat];
-      D27.f[DIR_0P0   ] = &DD27[DIR_0P0   *size_Mat];
-      D27.f[DIR_0M0   ] = &DD27[DIR_0M0   *size_Mat];
-      D27.f[DIR_00P   ] = &DD27[DIR_00P   *size_Mat];
-      D27.f[DIR_00M   ] = &DD27[DIR_00M   *size_Mat];
-      D27.f[DIR_PP0  ] = &DD27[DIR_PP0  *size_Mat];
-      D27.f[DIR_MM0  ] = &DD27[DIR_MM0  *size_Mat];
-      D27.f[DIR_PM0  ] = &DD27[DIR_PM0  *size_Mat];
-      D27.f[DIR_MP0  ] = &DD27[DIR_MP0  *size_Mat];
-      D27.f[DIR_P0P  ] = &DD27[DIR_P0P  *size_Mat];
-      D27.f[DIR_M0M  ] = &DD27[DIR_M0M  *size_Mat];
-      D27.f[DIR_P0M  ] = &DD27[DIR_P0M  *size_Mat];
-      D27.f[DIR_M0P  ] = &DD27[DIR_M0P  *size_Mat];
-      D27.f[DIR_0PP  ] = &DD27[DIR_0PP  *size_Mat];
-      D27.f[DIR_0MM  ] = &DD27[DIR_0MM  *size_Mat];
-      D27.f[DIR_0PM  ] = &DD27[DIR_0PM  *size_Mat];
-      D27.f[DIR_0MP  ] = &DD27[DIR_0MP  *size_Mat];
-      D27.f[DIR_000] = &DD27[DIR_000*size_Mat];
-      D27.f[DIR_PPP ] = &DD27[DIR_PPP *size_Mat];
-      D27.f[DIR_MMP ] = &DD27[DIR_MMP *size_Mat];
-      D27.f[DIR_PMP ] = &DD27[DIR_PMP *size_Mat];
-      D27.f[DIR_MPP ] = &DD27[DIR_MPP *size_Mat];
-      D27.f[DIR_PPM ] = &DD27[DIR_PPM *size_Mat];
-      D27.f[DIR_MMM ] = &DD27[DIR_MMM *size_Mat];
-      D27.f[DIR_PMM ] = &DD27[DIR_PMM *size_Mat];
-      D27.f[DIR_MPM ] = &DD27[DIR_MPM *size_Mat];
+      D27.f[DIR_P00] = &DD27[DIR_P00 * numberOfLBnodes];
+      D27.f[DIR_M00] = &DD27[DIR_M00 * numberOfLBnodes];
+      D27.f[DIR_0P0] = &DD27[DIR_0P0 * numberOfLBnodes];
+      D27.f[DIR_0M0] = &DD27[DIR_0M0 * numberOfLBnodes];
+      D27.f[DIR_00P] = &DD27[DIR_00P * numberOfLBnodes];
+      D27.f[DIR_00M] = &DD27[DIR_00M * numberOfLBnodes];
+      D27.f[DIR_PP0] = &DD27[DIR_PP0 * numberOfLBnodes];
+      D27.f[DIR_MM0] = &DD27[DIR_MM0 * numberOfLBnodes];
+      D27.f[DIR_PM0] = &DD27[DIR_PM0 * numberOfLBnodes];
+      D27.f[DIR_MP0] = &DD27[DIR_MP0 * numberOfLBnodes];
+      D27.f[DIR_P0P] = &DD27[DIR_P0P * numberOfLBnodes];
+      D27.f[DIR_M0M] = &DD27[DIR_M0M * numberOfLBnodes];
+      D27.f[DIR_P0M] = &DD27[DIR_P0M * numberOfLBnodes];
+      D27.f[DIR_M0P] = &DD27[DIR_M0P * numberOfLBnodes];
+      D27.f[DIR_0PP] = &DD27[DIR_0PP * numberOfLBnodes];
+      D27.f[DIR_0MM] = &DD27[DIR_0MM * numberOfLBnodes];
+      D27.f[DIR_0PM] = &DD27[DIR_0PM * numberOfLBnodes];
+      D27.f[DIR_0MP] = &DD27[DIR_0MP * numberOfLBnodes];
+      D27.f[DIR_000] = &DD27[DIR_000 * numberOfLBnodes];
+      D27.f[DIR_PPP] = &DD27[DIR_PPP * numberOfLBnodes];
+      D27.f[DIR_MMP] = &DD27[DIR_MMP * numberOfLBnodes];
+      D27.f[DIR_PMP] = &DD27[DIR_PMP * numberOfLBnodes];
+      D27.f[DIR_MPP] = &DD27[DIR_MPP * numberOfLBnodes];
+      D27.f[DIR_PPM] = &DD27[DIR_PPM * numberOfLBnodes];
+      D27.f[DIR_MMM] = &DD27[DIR_MMM * numberOfLBnodes];
+      D27.f[DIR_PMM] = &DD27[DIR_PMM * numberOfLBnodes];
+      D27.f[DIR_MPM] = &DD27[DIR_MPM * numberOfLBnodes];
    } 
    else
    {
-      D27.f[DIR_M00   ] = &DD27[DIR_P00   *size_Mat];
-      D27.f[DIR_P00   ] = &DD27[DIR_M00   *size_Mat];
-      D27.f[DIR_0M0   ] = &DD27[DIR_0P0   *size_Mat];
-      D27.f[DIR_0P0   ] = &DD27[DIR_0M0   *size_Mat];
-      D27.f[DIR_00M   ] = &DD27[DIR_00P   *size_Mat];
-      D27.f[DIR_00P   ] = &DD27[DIR_00M   *size_Mat];
-      D27.f[DIR_MM0  ] = &DD27[DIR_PP0  *size_Mat];
-      D27.f[DIR_PP0  ] = &DD27[DIR_MM0  *size_Mat];
-      D27.f[DIR_MP0  ] = &DD27[DIR_PM0  *size_Mat];
-      D27.f[DIR_PM0  ] = &DD27[DIR_MP0  *size_Mat];
-      D27.f[DIR_M0M  ] = &DD27[DIR_P0P  *size_Mat];
-      D27.f[DIR_P0P  ] = &DD27[DIR_M0M  *size_Mat];
-      D27.f[DIR_M0P  ] = &DD27[DIR_P0M  *size_Mat];
-      D27.f[DIR_P0M  ] = &DD27[DIR_M0P  *size_Mat];
-      D27.f[DIR_0MM  ] = &DD27[DIR_0PP  *size_Mat];
-      D27.f[DIR_0PP  ] = &DD27[DIR_0MM  *size_Mat];
-      D27.f[DIR_0MP  ] = &DD27[DIR_0PM  *size_Mat];
-      D27.f[DIR_0PM  ] = &DD27[DIR_0MP  *size_Mat];
-      D27.f[DIR_000] = &DD27[DIR_000*size_Mat];
-      D27.f[DIR_PPP ] = &DD27[DIR_MMM *size_Mat];
-      D27.f[DIR_MMP ] = &DD27[DIR_PPM *size_Mat];
-      D27.f[DIR_PMP ] = &DD27[DIR_MPM *size_Mat];
-      D27.f[DIR_MPP ] = &DD27[DIR_PMM *size_Mat];
-      D27.f[DIR_PPM ] = &DD27[DIR_MMP *size_Mat];
-      D27.f[DIR_MMM ] = &DD27[DIR_PPP *size_Mat];
-      D27.f[DIR_PMM ] = &DD27[DIR_MPP *size_Mat];
-      D27.f[DIR_MPM ] = &DD27[DIR_PMP *size_Mat];
+      D27.f[DIR_M00] = &DD27[DIR_P00 * numberOfLBnodes];
+      D27.f[DIR_P00] = &DD27[DIR_M00 * numberOfLBnodes];
+      D27.f[DIR_0M0] = &DD27[DIR_0P0 * numberOfLBnodes];
+      D27.f[DIR_0P0] = &DD27[DIR_0M0 * numberOfLBnodes];
+      D27.f[DIR_00M] = &DD27[DIR_00P * numberOfLBnodes];
+      D27.f[DIR_00P] = &DD27[DIR_00M * numberOfLBnodes];
+      D27.f[DIR_MM0] = &DD27[DIR_PP0 * numberOfLBnodes];
+      D27.f[DIR_PP0] = &DD27[DIR_MM0 * numberOfLBnodes];
+      D27.f[DIR_MP0] = &DD27[DIR_PM0 * numberOfLBnodes];
+      D27.f[DIR_PM0] = &DD27[DIR_MP0 * numberOfLBnodes];
+      D27.f[DIR_M0M] = &DD27[DIR_P0P * numberOfLBnodes];
+      D27.f[DIR_P0P] = &DD27[DIR_M0M * numberOfLBnodes];
+      D27.f[DIR_M0P] = &DD27[DIR_P0M * numberOfLBnodes];
+      D27.f[DIR_P0M] = &DD27[DIR_M0P * numberOfLBnodes];
+      D27.f[DIR_0MM] = &DD27[DIR_0PP * numberOfLBnodes];
+      D27.f[DIR_0PP] = &DD27[DIR_0MM * numberOfLBnodes];
+      D27.f[DIR_0MP] = &DD27[DIR_0PM * numberOfLBnodes];
+      D27.f[DIR_0PM] = &DD27[DIR_0MP * numberOfLBnodes];
+      D27.f[DIR_000] = &DD27[DIR_000 * numberOfLBnodes];
+      D27.f[DIR_PPP] = &DD27[DIR_MMM * numberOfLBnodes];
+      D27.f[DIR_MMP] = &DD27[DIR_PPM * numberOfLBnodes];
+      D27.f[DIR_PMP] = &DD27[DIR_MPM * numberOfLBnodes];
+      D27.f[DIR_MPP] = &DD27[DIR_PMM * numberOfLBnodes];
+      D27.f[DIR_PPM] = &DD27[DIR_MMP * numberOfLBnodes];
+      D27.f[DIR_MMM] = &DD27[DIR_PPP * numberOfLBnodes];
+      D27.f[DIR_PMM] = &DD27[DIR_MPP * numberOfLBnodes];
+      D27.f[DIR_MPM] = &DD27[DIR_PMP * numberOfLBnodes];
    }
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  x = threadIdx.x;  // Globaler x-Index 
@@ -606,24 +606,24 @@ __global__ void QADPress27( real* DD,
          *q_dirBE,  *q_dirTW,  *q_dirTN,  *q_dirBS,  *q_dirBN,  *q_dirTS,
          *q_dirTNE, *q_dirTSW, *q_dirTSE, *q_dirTNW, *q_dirBNE, *q_dirBSW,
          *q_dirBSE, *q_dirBNW; 
-      q_dirE   = &QQ[DIR_P00   * numberOfBCnodes];
-      q_dirW   = &QQ[DIR_M00   * numberOfBCnodes];
-      q_dirN   = &QQ[DIR_0P0   * numberOfBCnodes];
-      q_dirS   = &QQ[DIR_0M0   * numberOfBCnodes];
-      q_dirT   = &QQ[DIR_00P   * numberOfBCnodes];
-      q_dirB   = &QQ[DIR_00M   * numberOfBCnodes];
-      q_dirNE  = &QQ[DIR_PP0  * numberOfBCnodes];
-      q_dirSW  = &QQ[DIR_MM0  * numberOfBCnodes];
-      q_dirSE  = &QQ[DIR_PM0  * numberOfBCnodes];
-      q_dirNW  = &QQ[DIR_MP0  * numberOfBCnodes];
-      q_dirTE  = &QQ[DIR_P0P  * numberOfBCnodes];
-      q_dirBW  = &QQ[DIR_M0M  * numberOfBCnodes];
-      q_dirBE  = &QQ[DIR_P0M  * numberOfBCnodes];
-      q_dirTW  = &QQ[DIR_M0P  * numberOfBCnodes];
-      q_dirTN  = &QQ[DIR_0PP  * numberOfBCnodes];
-      q_dirBS  = &QQ[DIR_0MM  * numberOfBCnodes];
-      q_dirBN  = &QQ[DIR_0PM  * numberOfBCnodes];
-      q_dirTS  = &QQ[DIR_0MP  * numberOfBCnodes];
+      q_dirE   = &QQ[DIR_P00 * numberOfBCnodes];
+      q_dirW   = &QQ[DIR_M00 * numberOfBCnodes];
+      q_dirN   = &QQ[DIR_0P0 * numberOfBCnodes];
+      q_dirS   = &QQ[DIR_0M0 * numberOfBCnodes];
+      q_dirT   = &QQ[DIR_00P * numberOfBCnodes];
+      q_dirB   = &QQ[DIR_00M * numberOfBCnodes];
+      q_dirNE  = &QQ[DIR_PP0 * numberOfBCnodes];
+      q_dirSW  = &QQ[DIR_MM0 * numberOfBCnodes];
+      q_dirSE  = &QQ[DIR_PM0 * numberOfBCnodes];
+      q_dirNW  = &QQ[DIR_MP0 * numberOfBCnodes];
+      q_dirTE  = &QQ[DIR_P0P * numberOfBCnodes];
+      q_dirBW  = &QQ[DIR_M0M * numberOfBCnodes];
+      q_dirBE  = &QQ[DIR_P0M * numberOfBCnodes];
+      q_dirTW  = &QQ[DIR_M0P * numberOfBCnodes];
+      q_dirTN  = &QQ[DIR_0PP * numberOfBCnodes];
+      q_dirBS  = &QQ[DIR_0MM * numberOfBCnodes];
+      q_dirBN  = &QQ[DIR_0PM * numberOfBCnodes];
+      q_dirTS  = &QQ[DIR_0MP * numberOfBCnodes];
       q_dirTNE = &QQ[DIR_PPP * numberOfBCnodes];
       q_dirTSW = &QQ[DIR_MMP * numberOfBCnodes];
       q_dirTSE = &QQ[DIR_PMP * numberOfBCnodes];
@@ -663,33 +663,33 @@ __global__ void QADPress27( real* DD,
       unsigned int ktne = KQK;
       unsigned int kbsw = neighborZ[ksw];
       ////////////////////////////////////////////////////////////////////////////////
-      real f_W    = (D.f[DIR_P00   ])[ke   ];
-      real f_E    = (D.f[DIR_M00   ])[kw   ];
-      real f_S    = (D.f[DIR_0P0   ])[kn   ];
-      real f_N    = (D.f[DIR_0M0   ])[ks   ];
-      real f_B    = (D.f[DIR_00P   ])[kt   ];
-      real f_T    = (D.f[DIR_00M   ])[kb   ];
-      real f_SW   = (D.f[DIR_PP0  ])[kne  ];
-      real f_NE   = (D.f[DIR_MM0  ])[ksw  ];
-      real f_NW   = (D.f[DIR_PM0  ])[kse  ];
-      real f_SE   = (D.f[DIR_MP0  ])[knw  ];
-      real f_BW   = (D.f[DIR_P0P  ])[kte  ];
-      real f_TE   = (D.f[DIR_M0M  ])[kbw  ];
-      real f_TW   = (D.f[DIR_P0M  ])[kbe  ];
-      real f_BE   = (D.f[DIR_M0P  ])[ktw  ];
-      real f_BS   = (D.f[DIR_0PP  ])[ktn  ];
-      real f_TN   = (D.f[DIR_0MM  ])[kbs  ];
-      real f_TS   = (D.f[DIR_0PM  ])[kbn  ];
-      real f_BN   = (D.f[DIR_0MP  ])[kts  ];
+      real f_W    = (D.f[DIR_P00])[ke   ];
+      real f_E    = (D.f[DIR_M00])[kw   ];
+      real f_S    = (D.f[DIR_0P0])[kn   ];
+      real f_N    = (D.f[DIR_0M0])[ks   ];
+      real f_B    = (D.f[DIR_00P])[kt   ];
+      real f_T    = (D.f[DIR_00M])[kb   ];
+      real f_SW   = (D.f[DIR_PP0])[kne  ];
+      real f_NE   = (D.f[DIR_MM0])[ksw  ];
+      real f_NW   = (D.f[DIR_PM0])[kse  ];
+      real f_SE   = (D.f[DIR_MP0])[knw  ];
+      real f_BW   = (D.f[DIR_P0P])[kte  ];
+      real f_TE   = (D.f[DIR_M0M])[kbw  ];
+      real f_TW   = (D.f[DIR_P0M])[kbe  ];
+      real f_BE   = (D.f[DIR_M0P])[ktw  ];
+      real f_BS   = (D.f[DIR_0PP])[ktn  ];
+      real f_TN   = (D.f[DIR_0MM])[kbs  ];
+      real f_TS   = (D.f[DIR_0PM])[kbn  ];
+      real f_BN   = (D.f[DIR_0MP])[kts  ];
       real f_ZERO = (D.f[DIR_000])[kzero];
-      real f_BSW  = (D.f[DIR_PPP ])[ktne ];
-      real f_BNE  = (D.f[DIR_MMP ])[ktsw ];
-      real f_BNW  = (D.f[DIR_PMP ])[ktse ];
-      real f_BSE  = (D.f[DIR_MPP ])[ktnw ];
-      real f_TSW  = (D.f[DIR_PPM ])[kbne ];
-      real f_TNE  = (D.f[DIR_MMM ])[kbsw ];
-      real f_TNW  = (D.f[DIR_PMM ])[kbse ];
-      real f_TSE  = (D.f[DIR_MPM ])[kbnw ];
+      real f_BSW  = (D.f[DIR_PPP])[ktne ];
+      real f_BNE  = (D.f[DIR_MMP])[ktsw ];
+      real f_BNW  = (D.f[DIR_PMP])[ktse ];
+      real f_BSE  = (D.f[DIR_MPP])[ktnw ];
+      real f_TSW  = (D.f[DIR_PPM])[kbne ];
+      real f_TNE  = (D.f[DIR_MMM])[kbsw ];
+      real f_TNW  = (D.f[DIR_PMM])[kbse ];
+      real f_TSE  = (D.f[DIR_MPM])[kbnw ];
       ////////////////////////////////////////////////////////////////////////////////
       real vx1, vx2, vx3, /*drho, feq,*/ q;
       //drho   = f_TSE + f_TNW + f_TNE + f_TSW + f_BSE + f_BNW + f_BNE + f_BSW +
@@ -715,33 +715,33 @@ __global__ void QADPress27( real* DD,
       vx2            =  OORho*((f_TNE-f_BSW)+(f_BNE-f_TSW)+(f_BNW-f_TSE)+(f_TNW-f_BSE) +(f_NE-f_SW)+(f_NW-f_SE)+(f_TN-f_BS)+(f_BN-f_TS)+(f_N-f_S));
       vx3            =  OORho*((f_TNE-f_BSW)+(f_TSW-f_BNE)+(f_TSE-f_BNW)+(f_TNW-f_BSE) +(f_TE-f_BW)+(f_TW-f_BE)+(f_TN-f_BS)+(f_TS-f_BN)+(f_T-f_B));
       ////////////////////////////////////////////////////////////////////////////////
-      real f27_W    = (D27.f[DIR_P00   ])[ke   ];
-      real f27_E    = (D27.f[DIR_M00   ])[kw   ];
-      real f27_S    = (D27.f[DIR_0P0   ])[kn   ];
-      real f27_N    = (D27.f[DIR_0M0   ])[ks   ];
-      real f27_B    = (D27.f[DIR_00P   ])[kt   ];
-      real f27_T    = (D27.f[DIR_00M   ])[kb   ];
-      real f27_SW   = (D27.f[DIR_PP0  ])[kne  ];
-      real f27_NE   = (D27.f[DIR_MM0  ])[ksw  ];
-      real f27_NW   = (D27.f[DIR_PM0  ])[kse  ];
-      real f27_SE   = (D27.f[DIR_MP0  ])[knw  ];
-      real f27_BW   = (D27.f[DIR_P0P  ])[kte  ];
-      real f27_TE   = (D27.f[DIR_M0M  ])[kbw  ];
-      real f27_TW   = (D27.f[DIR_P0M  ])[kbe  ];
-      real f27_BE   = (D27.f[DIR_M0P  ])[ktw  ];
-      real f27_BS   = (D27.f[DIR_0PP  ])[ktn  ];
-      real f27_TN   = (D27.f[DIR_0MM  ])[kbs  ];
-      real f27_TS   = (D27.f[DIR_0PM  ])[kbn  ];
-      real f27_BN   = (D27.f[DIR_0MP  ])[kts  ];
+      real f27_W    = (D27.f[DIR_P00])[ke   ];
+      real f27_E    = (D27.f[DIR_M00])[kw   ];
+      real f27_S    = (D27.f[DIR_0P0])[kn   ];
+      real f27_N    = (D27.f[DIR_0M0])[ks   ];
+      real f27_B    = (D27.f[DIR_00P])[kt   ];
+      real f27_T    = (D27.f[DIR_00M])[kb   ];
+      real f27_SW   = (D27.f[DIR_PP0])[kne  ];
+      real f27_NE   = (D27.f[DIR_MM0])[ksw  ];
+      real f27_NW   = (D27.f[DIR_PM0])[kse  ];
+      real f27_SE   = (D27.f[DIR_MP0])[knw  ];
+      real f27_BW   = (D27.f[DIR_P0P])[kte  ];
+      real f27_TE   = (D27.f[DIR_M0M])[kbw  ];
+      real f27_TW   = (D27.f[DIR_P0M])[kbe  ];
+      real f27_BE   = (D27.f[DIR_M0P])[ktw  ];
+      real f27_BS   = (D27.f[DIR_0PP])[ktn  ];
+      real f27_TN   = (D27.f[DIR_0MM])[kbs  ];
+      real f27_TS   = (D27.f[DIR_0PM])[kbn  ];
+      real f27_BN   = (D27.f[DIR_0MP])[kts  ];
       real f27_ZERO = (D27.f[DIR_000])[kzero];
-      real f27_BSW  = (D27.f[DIR_PPP ])[ktne ];
-      real f27_BNE  = (D27.f[DIR_MMP ])[ktsw ];
-      real f27_BNW  = (D27.f[DIR_PMP ])[ktse ];
-      real f27_BSE  = (D27.f[DIR_MPP ])[ktnw ];
-      real f27_TSW  = (D27.f[DIR_PPM ])[kbne ];
-      real f27_TNE  = (D27.f[DIR_MMM ])[kbsw ];
-      real f27_TNW  = (D27.f[DIR_PMM ])[kbse ];
-      real f27_TSE  = (D27.f[DIR_MPM ])[kbnw ];
+      real f27_BSW  = (D27.f[DIR_PPP])[ktne ];
+      real f27_BNE  = (D27.f[DIR_MMP])[ktsw ];
+      real f27_BNW  = (D27.f[DIR_PMP])[ktse ];
+      real f27_BSE  = (D27.f[DIR_MPP])[ktnw ];
+      real f27_TSW  = (D27.f[DIR_PPM])[kbne ];
+      real f27_TNE  = (D27.f[DIR_MMM])[kbsw ];
+      real f27_TNW  = (D27.f[DIR_PMM])[kbse ];
+      real f27_TSE  = (D27.f[DIR_MPM])[kbnw ];
       ////////////////////////////////////////////////////////////////////////////////
       real cu_sq=c3o2*(vx1*vx1+vx2*vx2+vx3*vx3);
       ////////////////////////////////////////////////////////////////////////////////
@@ -849,86 +849,86 @@ __global__ void QADPress27( real* DD,
       //////////////////////////////////////////////////////////////////////////
       if (isEvenTimestep==false)
       {
-         D27.f[DIR_P00   ] = &DD27[DIR_P00   *size_Mat];
-         D27.f[DIR_M00   ] = &DD27[DIR_M00   *size_Mat];
-         D27.f[DIR_0P0   ] = &DD27[DIR_0P0   *size_Mat];
-         D27.f[DIR_0M0   ] = &DD27[DIR_0M0   *size_Mat];
-         D27.f[DIR_00P   ] = &DD27[DIR_00P   *size_Mat];
-         D27.f[DIR_00M   ] = &DD27[DIR_00M   *size_Mat];
-         D27.f[DIR_PP0  ] = &DD27[DIR_PP0  *size_Mat];
-         D27.f[DIR_MM0  ] = &DD27[DIR_MM0  *size_Mat];
-         D27.f[DIR_PM0  ] = &DD27[DIR_PM0  *size_Mat];
-         D27.f[DIR_MP0  ] = &DD27[DIR_MP0  *size_Mat];
-         D27.f[DIR_P0P  ] = &DD27[DIR_P0P  *size_Mat];
-         D27.f[DIR_M0M  ] = &DD27[DIR_M0M  *size_Mat];
-         D27.f[DIR_P0M  ] = &DD27[DIR_P0M  *size_Mat];
-         D27.f[DIR_M0P  ] = &DD27[DIR_M0P  *size_Mat];
-         D27.f[DIR_0PP  ] = &DD27[DIR_0PP  *size_Mat];
-         D27.f[DIR_0MM  ] = &DD27[DIR_0MM  *size_Mat];
-         D27.f[DIR_0PM  ] = &DD27[DIR_0PM  *size_Mat];
-         D27.f[DIR_0MP  ] = &DD27[DIR_0MP  *size_Mat];
-         D27.f[DIR_000] = &DD27[DIR_000*size_Mat];
-         D27.f[DIR_PPP ] = &DD27[DIR_PPP *size_Mat];
-         D27.f[DIR_MMP ] = &DD27[DIR_MMP *size_Mat];
-         D27.f[DIR_PMP ] = &DD27[DIR_PMP *size_Mat];
-         D27.f[DIR_MPP ] = &DD27[DIR_MPP *size_Mat];
-         D27.f[DIR_PPM ] = &DD27[DIR_PPM *size_Mat];
-         D27.f[DIR_MMM ] = &DD27[DIR_MMM *size_Mat];
-         D27.f[DIR_PMM ] = &DD27[DIR_PMM *size_Mat];
-         D27.f[DIR_MPM ] = &DD27[DIR_MPM *size_Mat];
+         D27.f[DIR_P00] = &DD27[DIR_P00 * numberOfLBnodes];
+         D27.f[DIR_M00] = &DD27[DIR_M00 * numberOfLBnodes];
+         D27.f[DIR_0P0] = &DD27[DIR_0P0 * numberOfLBnodes];
+         D27.f[DIR_0M0] = &DD27[DIR_0M0 * numberOfLBnodes];
+         D27.f[DIR_00P] = &DD27[DIR_00P * numberOfLBnodes];
+         D27.f[DIR_00M] = &DD27[DIR_00M * numberOfLBnodes];
+         D27.f[DIR_PP0] = &DD27[DIR_PP0 * numberOfLBnodes];
+         D27.f[DIR_MM0] = &DD27[DIR_MM0 * numberOfLBnodes];
+         D27.f[DIR_PM0] = &DD27[DIR_PM0 * numberOfLBnodes];
+         D27.f[DIR_MP0] = &DD27[DIR_MP0 * numberOfLBnodes];
+         D27.f[DIR_P0P] = &DD27[DIR_P0P * numberOfLBnodes];
+         D27.f[DIR_M0M] = &DD27[DIR_M0M * numberOfLBnodes];
+         D27.f[DIR_P0M] = &DD27[DIR_P0M * numberOfLBnodes];
+         D27.f[DIR_M0P] = &DD27[DIR_M0P * numberOfLBnodes];
+         D27.f[DIR_0PP] = &DD27[DIR_0PP * numberOfLBnodes];
+         D27.f[DIR_0MM] = &DD27[DIR_0MM * numberOfLBnodes];
+         D27.f[DIR_0PM] = &DD27[DIR_0PM * numberOfLBnodes];
+         D27.f[DIR_0MP] = &DD27[DIR_0MP * numberOfLBnodes];
+         D27.f[DIR_000] = &DD27[DIR_000 * numberOfLBnodes];
+         D27.f[DIR_PPP] = &DD27[DIR_PPP * numberOfLBnodes];
+         D27.f[DIR_MMP] = &DD27[DIR_MMP * numberOfLBnodes];
+         D27.f[DIR_PMP] = &DD27[DIR_PMP * numberOfLBnodes];
+         D27.f[DIR_MPP] = &DD27[DIR_MPP * numberOfLBnodes];
+         D27.f[DIR_PPM] = &DD27[DIR_PPM * numberOfLBnodes];
+         D27.f[DIR_MMM] = &DD27[DIR_MMM * numberOfLBnodes];
+         D27.f[DIR_PMM] = &DD27[DIR_PMM * numberOfLBnodes];
+         D27.f[DIR_MPM] = &DD27[DIR_MPM * numberOfLBnodes];
       } 
       else
       {
-         D27.f[DIR_M00   ] = &DD27[DIR_P00   *size_Mat];
-         D27.f[DIR_P00   ] = &DD27[DIR_M00   *size_Mat];
-         D27.f[DIR_0M0   ] = &DD27[DIR_0P0   *size_Mat];
-         D27.f[DIR_0P0   ] = &DD27[DIR_0M0   *size_Mat];
-         D27.f[DIR_00M   ] = &DD27[DIR_00P   *size_Mat];
-         D27.f[DIR_00P   ] = &DD27[DIR_00M   *size_Mat];
-         D27.f[DIR_MM0  ] = &DD27[DIR_PP0  *size_Mat];
-         D27.f[DIR_PP0  ] = &DD27[DIR_MM0  *size_Mat];
-         D27.f[DIR_MP0  ] = &DD27[DIR_PM0  *size_Mat];
-         D27.f[DIR_PM0  ] = &DD27[DIR_MP0  *size_Mat];
-         D27.f[DIR_M0M  ] = &DD27[DIR_P0P  *size_Mat];
-         D27.f[DIR_P0P  ] = &DD27[DIR_M0M  *size_Mat];
-         D27.f[DIR_M0P  ] = &DD27[DIR_P0M  *size_Mat];
-         D27.f[DIR_P0M  ] = &DD27[DIR_M0P  *size_Mat];
-         D27.f[DIR_0MM  ] = &DD27[DIR_0PP  *size_Mat];
-         D27.f[DIR_0PP  ] = &DD27[DIR_0MM  *size_Mat];
-         D27.f[DIR_0MP  ] = &DD27[DIR_0PM  *size_Mat];
-         D27.f[DIR_0PM  ] = &DD27[DIR_0MP  *size_Mat];
-         D27.f[DIR_000] = &DD27[DIR_000*size_Mat];
-         D27.f[DIR_PPP ] = &DD27[DIR_MMM *size_Mat];
-         D27.f[DIR_MMP ] = &DD27[DIR_PPM *size_Mat];
-         D27.f[DIR_PMP ] = &DD27[DIR_MPM *size_Mat];
-         D27.f[DIR_MPP ] = &DD27[DIR_PMM *size_Mat];
-         D27.f[DIR_PPM ] = &DD27[DIR_MMP *size_Mat];
-         D27.f[DIR_MMM ] = &DD27[DIR_PPP *size_Mat];
-         D27.f[DIR_PMM ] = &DD27[DIR_MPP *size_Mat];
-         D27.f[DIR_MPM ] = &DD27[DIR_PMP *size_Mat];
+         D27.f[DIR_M00] = &DD27[DIR_P00 * numberOfLBnodes];
+         D27.f[DIR_P00] = &DD27[DIR_M00 * numberOfLBnodes];
+         D27.f[DIR_0M0] = &DD27[DIR_0P0 * numberOfLBnodes];
+         D27.f[DIR_0P0] = &DD27[DIR_0M0 * numberOfLBnodes];
+         D27.f[DIR_00M] = &DD27[DIR_00P * numberOfLBnodes];
+         D27.f[DIR_00P] = &DD27[DIR_00M * numberOfLBnodes];
+         D27.f[DIR_MM0] = &DD27[DIR_PP0 * numberOfLBnodes];
+         D27.f[DIR_PP0] = &DD27[DIR_MM0 * numberOfLBnodes];
+         D27.f[DIR_MP0] = &DD27[DIR_PM0 * numberOfLBnodes];
+         D27.f[DIR_PM0] = &DD27[DIR_MP0 * numberOfLBnodes];
+         D27.f[DIR_M0M] = &DD27[DIR_P0P * numberOfLBnodes];
+         D27.f[DIR_P0P] = &DD27[DIR_M0M * numberOfLBnodes];
+         D27.f[DIR_M0P] = &DD27[DIR_P0M * numberOfLBnodes];
+         D27.f[DIR_P0M] = &DD27[DIR_M0P * numberOfLBnodes];
+         D27.f[DIR_0MM] = &DD27[DIR_0PP * numberOfLBnodes];
+         D27.f[DIR_0PP] = &DD27[DIR_0MM * numberOfLBnodes];
+         D27.f[DIR_0MP] = &DD27[DIR_0PM * numberOfLBnodes];
+         D27.f[DIR_0PM] = &DD27[DIR_0MP * numberOfLBnodes];
+         D27.f[DIR_000] = &DD27[DIR_000 * numberOfLBnodes];
+         D27.f[DIR_PPP] = &DD27[DIR_MMM * numberOfLBnodes];
+         D27.f[DIR_MMP] = &DD27[DIR_PPM * numberOfLBnodes];
+         D27.f[DIR_PMP] = &DD27[DIR_MPM * numberOfLBnodes];
+         D27.f[DIR_MPP] = &DD27[DIR_PMM * numberOfLBnodes];
+         D27.f[DIR_PPM] = &DD27[DIR_MMP * numberOfLBnodes];
+         D27.f[DIR_MMM] = &DD27[DIR_PPP * numberOfLBnodes];
+         D27.f[DIR_PMM] = &DD27[DIR_MPP * numberOfLBnodes];
+         D27.f[DIR_MPM] = &DD27[DIR_PMP * numberOfLBnodes];
       }
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       //Test
       //(D.f[DIR_000])[k]=c1o10;
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-      q = q_dirE[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M00  ])[kw  ]=(c2o1*feqW27_W  -(f27_E  *(q*omegaD-c1o1)-omegaD*feq27_E  *(q-c1o1))/(omegaD-c1o1)+f27_W  *q)/(q+c1o1);
-      q = q_dirW[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P00  ])[ke  ]=(c2o1*feqW27_E  -(f27_W  *(q*omegaD-c1o1)-omegaD*feq27_W  *(q-c1o1))/(omegaD-c1o1)+f27_E  *q)/(q+c1o1);
-      q = q_dirN[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0M0  ])[ks  ]=(c2o1*feqW27_S  -(f27_N  *(q*omegaD-c1o1)-omegaD*feq27_N  *(q-c1o1))/(omegaD-c1o1)+f27_S  *q)/(q+c1o1);
-      q = q_dirS[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0P0  ])[kn  ]=(c2o1*feqW27_N  -(f27_S  *(q*omegaD-c1o1)-omegaD*feq27_S  *(q-c1o1))/(omegaD-c1o1)+f27_N  *q)/(q+c1o1);
-      q = q_dirT[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_00M  ])[kb  ]=(c2o1*feqW27_B  -(f27_T  *(q*omegaD-c1o1)-omegaD*feq27_T  *(q-c1o1))/(omegaD-c1o1)+f27_B  *q)/(q+c1o1);
-      q = q_dirB[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_00P  ])[kt  ]=(c2o1*feqW27_T  -(f27_B  *(q*omegaD-c1o1)-omegaD*feq27_B  *(q-c1o1))/(omegaD-c1o1)+f27_T  *q)/(q+c1o1);
-      q = q_dirNE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MM0 ])[ksw ]=(c2o1*feqW27_SW -(f27_NE *(q*omegaD-c1o1)-omegaD*feq27_NE *(q-c1o1))/(omegaD-c1o1)+f27_SW *q)/(q+c1o1);
-      q = q_dirSW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PP0 ])[kne ]=(c2o1*feqW27_NE -(f27_SW *(q*omegaD-c1o1)-omegaD*feq27_SW *(q-c1o1))/(omegaD-c1o1)+f27_NE *q)/(q+c1o1);
-      q = q_dirSE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MP0 ])[knw ]=(c2o1*feqW27_NW -(f27_SE *(q*omegaD-c1o1)-omegaD*feq27_SE *(q-c1o1))/(omegaD-c1o1)+f27_NW *q)/(q+c1o1);
-      q = q_dirNW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PM0 ])[kse ]=(c2o1*feqW27_SE -(f27_NW *(q*omegaD-c1o1)-omegaD*feq27_NW *(q-c1o1))/(omegaD-c1o1)+f27_SE *q)/(q+c1o1);
-      q = q_dirTE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M0M ])[kbw ]=(c2o1*feqW27_BW -(f27_TE *(q*omegaD-c1o1)-omegaD*feq27_TE *(q-c1o1))/(omegaD-c1o1)+f27_BW *q)/(q+c1o1);
-      q = q_dirBW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P0P ])[kte ]=(c2o1*feqW27_TE -(f27_BW *(q*omegaD-c1o1)-omegaD*feq27_BW *(q-c1o1))/(omegaD-c1o1)+f27_TE *q)/(q+c1o1);
-      q = q_dirBE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M0P ])[ktw ]=(c2o1*feqW27_TW -(f27_BE *(q*omegaD-c1o1)-omegaD*feq27_BE *(q-c1o1))/(omegaD-c1o1)+f27_TW *q)/(q+c1o1);
-      q = q_dirTW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P0M ])[kbe ]=(c2o1*feqW27_BE -(f27_TW *(q*omegaD-c1o1)-omegaD*feq27_TW *(q-c1o1))/(omegaD-c1o1)+f27_BE *q)/(q+c1o1);
-      q = q_dirTN[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0MM ])[kbs ]=(c2o1*feqW27_BS -(f27_TN *(q*omegaD-c1o1)-omegaD*feq27_TN *(q-c1o1))/(omegaD-c1o1)+f27_BS *q)/(q+c1o1);
-      q = q_dirBS[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0PP ])[ktn ]=(c2o1*feqW27_TN -(f27_BS *(q*omegaD-c1o1)-omegaD*feq27_BS *(q-c1o1))/(omegaD-c1o1)+f27_TN *q)/(q+c1o1);
-      q = q_dirBN[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0MP ])[kts ]=(c2o1*feqW27_TS -(f27_BN *(q*omegaD-c1o1)-omegaD*feq27_BN *(q-c1o1))/(omegaD-c1o1)+f27_TS *q)/(q+c1o1);
-      q = q_dirTS[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0PM ])[kbn ]=(c2o1*feqW27_BN -(f27_TS *(q*omegaD-c1o1)-omegaD*feq27_TS *(q-c1o1))/(omegaD-c1o1)+f27_BN *q)/(q+c1o1);
+      q = q_dirE[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M00])[kw  ]=(c2o1*feqW27_W  -(f27_E  *(q*omegaD-c1o1)-omegaD*feq27_E  *(q-c1o1))/(omegaD-c1o1)+f27_W  *q)/(q+c1o1);
+      q = q_dirW[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P00])[ke  ]=(c2o1*feqW27_E  -(f27_W  *(q*omegaD-c1o1)-omegaD*feq27_W  *(q-c1o1))/(omegaD-c1o1)+f27_E  *q)/(q+c1o1);
+      q = q_dirN[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0M0])[ks  ]=(c2o1*feqW27_S  -(f27_N  *(q*omegaD-c1o1)-omegaD*feq27_N  *(q-c1o1))/(omegaD-c1o1)+f27_S  *q)/(q+c1o1);
+      q = q_dirS[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0P0])[kn  ]=(c2o1*feqW27_N  -(f27_S  *(q*omegaD-c1o1)-omegaD*feq27_S  *(q-c1o1))/(omegaD-c1o1)+f27_N  *q)/(q+c1o1);
+      q = q_dirT[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_00M])[kb  ]=(c2o1*feqW27_B  -(f27_T  *(q*omegaD-c1o1)-omegaD*feq27_T  *(q-c1o1))/(omegaD-c1o1)+f27_B  *q)/(q+c1o1);
+      q = q_dirB[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_00P])[kt  ]=(c2o1*feqW27_T  -(f27_B  *(q*omegaD-c1o1)-omegaD*feq27_B  *(q-c1o1))/(omegaD-c1o1)+f27_T  *q)/(q+c1o1);
+      q = q_dirNE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MM0])[ksw ]=(c2o1*feqW27_SW -(f27_NE *(q*omegaD-c1o1)-omegaD*feq27_NE *(q-c1o1))/(omegaD-c1o1)+f27_SW *q)/(q+c1o1);
+      q = q_dirSW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PP0])[kne ]=(c2o1*feqW27_NE -(f27_SW *(q*omegaD-c1o1)-omegaD*feq27_SW *(q-c1o1))/(omegaD-c1o1)+f27_NE *q)/(q+c1o1);
+      q = q_dirSE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MP0])[knw ]=(c2o1*feqW27_NW -(f27_SE *(q*omegaD-c1o1)-omegaD*feq27_SE *(q-c1o1))/(omegaD-c1o1)+f27_NW *q)/(q+c1o1);
+      q = q_dirNW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PM0])[kse ]=(c2o1*feqW27_SE -(f27_NW *(q*omegaD-c1o1)-omegaD*feq27_NW *(q-c1o1))/(omegaD-c1o1)+f27_SE *q)/(q+c1o1);
+      q = q_dirTE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M0M])[kbw ]=(c2o1*feqW27_BW -(f27_TE *(q*omegaD-c1o1)-omegaD*feq27_TE *(q-c1o1))/(omegaD-c1o1)+f27_BW *q)/(q+c1o1);
+      q = q_dirBW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P0P])[kte ]=(c2o1*feqW27_TE -(f27_BW *(q*omegaD-c1o1)-omegaD*feq27_BW *(q-c1o1))/(omegaD-c1o1)+f27_TE *q)/(q+c1o1);
+      q = q_dirBE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M0P])[ktw ]=(c2o1*feqW27_TW -(f27_BE *(q*omegaD-c1o1)-omegaD*feq27_BE *(q-c1o1))/(omegaD-c1o1)+f27_TW *q)/(q+c1o1);
+      q = q_dirTW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P0M])[kbe ]=(c2o1*feqW27_BE -(f27_TW *(q*omegaD-c1o1)-omegaD*feq27_TW *(q-c1o1))/(omegaD-c1o1)+f27_BE *q)/(q+c1o1);
+      q = q_dirTN[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0MM])[kbs ]=(c2o1*feqW27_BS -(f27_TN *(q*omegaD-c1o1)-omegaD*feq27_TN *(q-c1o1))/(omegaD-c1o1)+f27_BS *q)/(q+c1o1);
+      q = q_dirBS[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0PP])[ktn ]=(c2o1*feqW27_TN -(f27_BS *(q*omegaD-c1o1)-omegaD*feq27_BS *(q-c1o1))/(omegaD-c1o1)+f27_TN *q)/(q+c1o1);
+      q = q_dirBN[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0MP])[kts ]=(c2o1*feqW27_TS -(f27_BN *(q*omegaD-c1o1)-omegaD*feq27_BN *(q-c1o1))/(omegaD-c1o1)+f27_TS *q)/(q+c1o1);
+      q = q_dirTS[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0PM])[kbn ]=(c2o1*feqW27_BN -(f27_TS *(q*omegaD-c1o1)-omegaD*feq27_TS *(q-c1o1))/(omegaD-c1o1)+f27_BN *q)/(q+c1o1);
       q = q_dirTNE[k]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MMM])[kbsw]=(c2o1*feqW27_BSW-(f27_TNE*(q*omegaD-c1o1)-omegaD*feq27_TNE*(q-c1o1))/(omegaD-c1o1)+f27_BSW*q)/(q+c1o1);
       q = q_dirBSW[k]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PPP])[ktne]=(c2o1*feqW27_TNE-(f27_BSW*(q*omegaD-c1o1)-omegaD*feq27_BSW*(q-c1o1))/(omegaD-c1o1)+f27_TNE*q)/(q+c1o1);
       q = q_dirBNE[k]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MMP])[ktsw]=(c2o1*feqW27_TSW-(f27_BNE*(q*omegaD-c1o1)-omegaD*feq27_BNE*(q-c1o1))/(omegaD-c1o1)+f27_TSW*q)/(q+c1o1);
@@ -989,132 +989,132 @@ __global__ void QADPressNEQNeighbor27(
 													unsigned int* neighborX,
 													unsigned int* neighborY,
 													unsigned int* neighborZ,
-													unsigned int size_Mat,
+													unsigned long long numberOfLBnodes,
 													bool isEvenTimestep
 												)
 {
 	Distributions27 D;
 	if (isEvenTimestep == true)
 	{
-		D.f[DIR_P00] = &DD[DIR_P00   *size_Mat];
-		D.f[DIR_M00] = &DD[DIR_M00   *size_Mat];
-		D.f[DIR_0P0] = &DD[DIR_0P0   *size_Mat];
-		D.f[DIR_0M0] = &DD[DIR_0M0   *size_Mat];
-		D.f[DIR_00P] = &DD[DIR_00P   *size_Mat];
-		D.f[DIR_00M] = &DD[DIR_00M   *size_Mat];
-		D.f[DIR_PP0] = &DD[DIR_PP0  *size_Mat];
-		D.f[DIR_MM0] = &DD[DIR_MM0  *size_Mat];
-		D.f[DIR_PM0] = &DD[DIR_PM0  *size_Mat];
-		D.f[DIR_MP0] = &DD[DIR_MP0  *size_Mat];
-		D.f[DIR_P0P] = &DD[DIR_P0P  *size_Mat];
-		D.f[DIR_M0M] = &DD[DIR_M0M  *size_Mat];
-		D.f[DIR_P0M] = &DD[DIR_P0M  *size_Mat];
-		D.f[DIR_M0P] = &DD[DIR_M0P  *size_Mat];
-		D.f[DIR_0PP] = &DD[DIR_0PP  *size_Mat];
-		D.f[DIR_0MM] = &DD[DIR_0MM  *size_Mat];
-		D.f[DIR_0PM] = &DD[DIR_0PM  *size_Mat];
-		D.f[DIR_0MP] = &DD[DIR_0MP  *size_Mat];
-		D.f[DIR_000] = &DD[DIR_000*size_Mat];
-		D.f[DIR_PPP] = &DD[DIR_PPP *size_Mat];
-		D.f[DIR_MMP] = &DD[DIR_MMP *size_Mat];
-		D.f[DIR_PMP] = &DD[DIR_PMP *size_Mat];
-		D.f[DIR_MPP] = &DD[DIR_MPP *size_Mat];
-		D.f[DIR_PPM] = &DD[DIR_PPM *size_Mat];
-		D.f[DIR_MMM] = &DD[DIR_MMM *size_Mat];
-		D.f[DIR_PMM] = &DD[DIR_PMM *size_Mat];
-		D.f[DIR_MPM] = &DD[DIR_MPM *size_Mat];
+		D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+		D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+		D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+		D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+		D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+		D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+		D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+		D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+		D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+		D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+		D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+		D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+		D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+		D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+		D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+		D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+		D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+		D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+		D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+		D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+		D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+		D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+		D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+		D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+		D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+		D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+		D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
 	}
 	else
 	{
-		D.f[DIR_M00] = &DD[DIR_P00   *size_Mat];
-		D.f[DIR_P00] = &DD[DIR_M00   *size_Mat];
-		D.f[DIR_0M0] = &DD[DIR_0P0   *size_Mat];
-		D.f[DIR_0P0] = &DD[DIR_0M0   *size_Mat];
-		D.f[DIR_00M] = &DD[DIR_00P   *size_Mat];
-		D.f[DIR_00P] = &DD[DIR_00M   *size_Mat];
-		D.f[DIR_MM0] = &DD[DIR_PP0  *size_Mat];
-		D.f[DIR_PP0] = &DD[DIR_MM0  *size_Mat];
-		D.f[DIR_MP0] = &DD[DIR_PM0  *size_Mat];
-		D.f[DIR_PM0] = &DD[DIR_MP0  *size_Mat];
-		D.f[DIR_M0M] = &DD[DIR_P0P  *size_Mat];
-		D.f[DIR_P0P] = &DD[DIR_M0M  *size_Mat];
-		D.f[DIR_M0P] = &DD[DIR_P0M  *size_Mat];
-		D.f[DIR_P0M] = &DD[DIR_M0P  *size_Mat];
-		D.f[DIR_0MM] = &DD[DIR_0PP  *size_Mat];
-		D.f[DIR_0PP] = &DD[DIR_0MM  *size_Mat];
-		D.f[DIR_0MP] = &DD[DIR_0PM  *size_Mat];
-		D.f[DIR_0PM] = &DD[DIR_0MP  *size_Mat];
-		D.f[DIR_000] = &DD[DIR_000*size_Mat];
-		D.f[DIR_PPP] = &DD[DIR_MMM *size_Mat];
-		D.f[DIR_MMP] = &DD[DIR_PPM *size_Mat];
-		D.f[DIR_PMP] = &DD[DIR_MPM *size_Mat];
-		D.f[DIR_MPP] = &DD[DIR_PMM *size_Mat];
-		D.f[DIR_PPM] = &DD[DIR_MMP *size_Mat];
-		D.f[DIR_MMM] = &DD[DIR_PPP *size_Mat];
-		D.f[DIR_PMM] = &DD[DIR_MPP *size_Mat];
-		D.f[DIR_MPM] = &DD[DIR_PMP *size_Mat];
+		D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+		D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+		D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+		D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+		D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+		D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+		D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+		D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+		D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+		D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+		D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+		D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+		D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+		D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+		D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+		D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+		D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+		D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+		D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+		D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+		D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+		D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+		D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+		D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+		D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+		D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+		D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
 	}
 
 	Distributions27 D27;
 	if (isEvenTimestep == true)
 	{
-		D27.f[DIR_P00] = &DD27[DIR_P00   *size_Mat];
-		D27.f[DIR_M00] = &DD27[DIR_M00   *size_Mat];
-		D27.f[DIR_0P0] = &DD27[DIR_0P0   *size_Mat];
-		D27.f[DIR_0M0] = &DD27[DIR_0M0   *size_Mat];
-		D27.f[DIR_00P] = &DD27[DIR_00P   *size_Mat];
-		D27.f[DIR_00M] = &DD27[DIR_00M   *size_Mat];
-		D27.f[DIR_PP0] = &DD27[DIR_PP0  *size_Mat];
-		D27.f[DIR_MM0] = &DD27[DIR_MM0  *size_Mat];
-		D27.f[DIR_PM0] = &DD27[DIR_PM0  *size_Mat];
-		D27.f[DIR_MP0] = &DD27[DIR_MP0  *size_Mat];
-		D27.f[DIR_P0P] = &DD27[DIR_P0P  *size_Mat];
-		D27.f[DIR_M0M] = &DD27[DIR_M0M  *size_Mat];
-		D27.f[DIR_P0M] = &DD27[DIR_P0M  *size_Mat];
-		D27.f[DIR_M0P] = &DD27[DIR_M0P  *size_Mat];
-		D27.f[DIR_0PP] = &DD27[DIR_0PP  *size_Mat];
-		D27.f[DIR_0MM] = &DD27[DIR_0MM  *size_Mat];
-		D27.f[DIR_0PM] = &DD27[DIR_0PM  *size_Mat];
-		D27.f[DIR_0MP] = &DD27[DIR_0MP  *size_Mat];
-		D27.f[DIR_000] = &DD27[DIR_000*size_Mat];
-		D27.f[DIR_PPP] = &DD27[DIR_PPP *size_Mat];
-		D27.f[DIR_MMP] = &DD27[DIR_MMP *size_Mat];
-		D27.f[DIR_PMP] = &DD27[DIR_PMP *size_Mat];
-		D27.f[DIR_MPP] = &DD27[DIR_MPP *size_Mat];
-		D27.f[DIR_PPM] = &DD27[DIR_PPM *size_Mat];
-		D27.f[DIR_MMM] = &DD27[DIR_MMM *size_Mat];
-		D27.f[DIR_PMM] = &DD27[DIR_PMM *size_Mat];
-		D27.f[DIR_MPM] = &DD27[DIR_MPM *size_Mat];
+		D27.f[DIR_P00] = &DD27[DIR_P00 * numberOfLBnodes];
+		D27.f[DIR_M00] = &DD27[DIR_M00 * numberOfLBnodes];
+		D27.f[DIR_0P0] = &DD27[DIR_0P0 * numberOfLBnodes];
+		D27.f[DIR_0M0] = &DD27[DIR_0M0 * numberOfLBnodes];
+		D27.f[DIR_00P] = &DD27[DIR_00P * numberOfLBnodes];
+		D27.f[DIR_00M] = &DD27[DIR_00M * numberOfLBnodes];
+		D27.f[DIR_PP0] = &DD27[DIR_PP0 * numberOfLBnodes];
+		D27.f[DIR_MM0] = &DD27[DIR_MM0 * numberOfLBnodes];
+		D27.f[DIR_PM0] = &DD27[DIR_PM0 * numberOfLBnodes];
+		D27.f[DIR_MP0] = &DD27[DIR_MP0 * numberOfLBnodes];
+		D27.f[DIR_P0P] = &DD27[DIR_P0P * numberOfLBnodes];
+		D27.f[DIR_M0M] = &DD27[DIR_M0M * numberOfLBnodes];
+		D27.f[DIR_P0M] = &DD27[DIR_P0M * numberOfLBnodes];
+		D27.f[DIR_M0P] = &DD27[DIR_M0P * numberOfLBnodes];
+		D27.f[DIR_0PP] = &DD27[DIR_0PP * numberOfLBnodes];
+		D27.f[DIR_0MM] = &DD27[DIR_0MM * numberOfLBnodes];
+		D27.f[DIR_0PM] = &DD27[DIR_0PM * numberOfLBnodes];
+		D27.f[DIR_0MP] = &DD27[DIR_0MP * numberOfLBnodes];
+		D27.f[DIR_000] = &DD27[DIR_000 * numberOfLBnodes];
+		D27.f[DIR_PPP] = &DD27[DIR_PPP * numberOfLBnodes];
+		D27.f[DIR_MMP] = &DD27[DIR_MMP * numberOfLBnodes];
+		D27.f[DIR_PMP] = &DD27[DIR_PMP * numberOfLBnodes];
+		D27.f[DIR_MPP] = &DD27[DIR_MPP * numberOfLBnodes];
+		D27.f[DIR_PPM] = &DD27[DIR_PPM * numberOfLBnodes];
+		D27.f[DIR_MMM] = &DD27[DIR_MMM * numberOfLBnodes];
+		D27.f[DIR_PMM] = &DD27[DIR_PMM * numberOfLBnodes];
+		D27.f[DIR_MPM] = &DD27[DIR_MPM * numberOfLBnodes];
 	}
 	else
 	{
-		D27.f[DIR_M00] = &DD27[DIR_P00   *size_Mat];
-		D27.f[DIR_P00] = &DD27[DIR_M00   *size_Mat];
-		D27.f[DIR_0M0] = &DD27[DIR_0P0   *size_Mat];
-		D27.f[DIR_0P0] = &DD27[DIR_0M0   *size_Mat];
-		D27.f[DIR_00M] = &DD27[DIR_00P   *size_Mat];
-		D27.f[DIR_00P] = &DD27[DIR_00M   *size_Mat];
-		D27.f[DIR_MM0] = &DD27[DIR_PP0  *size_Mat];
-		D27.f[DIR_PP0] = &DD27[DIR_MM0  *size_Mat];
-		D27.f[DIR_MP0] = &DD27[DIR_PM0  *size_Mat];
-		D27.f[DIR_PM0] = &DD27[DIR_MP0  *size_Mat];
-		D27.f[DIR_M0M] = &DD27[DIR_P0P  *size_Mat];
-		D27.f[DIR_P0P] = &DD27[DIR_M0M  *size_Mat];
-		D27.f[DIR_M0P] = &DD27[DIR_P0M  *size_Mat];
-		D27.f[DIR_P0M] = &DD27[DIR_M0P  *size_Mat];
-		D27.f[DIR_0MM] = &DD27[DIR_0PP  *size_Mat];
-		D27.f[DIR_0PP] = &DD27[DIR_0MM  *size_Mat];
-		D27.f[DIR_0MP] = &DD27[DIR_0PM  *size_Mat];
-		D27.f[DIR_0PM] = &DD27[DIR_0MP  *size_Mat];
-		D27.f[DIR_000] = &DD27[DIR_000*size_Mat];
-		D27.f[DIR_PPP] = &DD27[DIR_MMM *size_Mat];
-		D27.f[DIR_MMP] = &DD27[DIR_PPM *size_Mat];
-		D27.f[DIR_PMP] = &DD27[DIR_MPM *size_Mat];
-		D27.f[DIR_MPP] = &DD27[DIR_PMM *size_Mat];
-		D27.f[DIR_PPM] = &DD27[DIR_MMP *size_Mat];
-		D27.f[DIR_MMM] = &DD27[DIR_PPP *size_Mat];
-		D27.f[DIR_PMM] = &DD27[DIR_MPP *size_Mat];
-		D27.f[DIR_MPM] = &DD27[DIR_PMP *size_Mat];
+		D27.f[DIR_M00] = &DD27[DIR_P00 * numberOfLBnodes];
+		D27.f[DIR_P00] = &DD27[DIR_M00 * numberOfLBnodes];
+		D27.f[DIR_0M0] = &DD27[DIR_0P0 * numberOfLBnodes];
+		D27.f[DIR_0P0] = &DD27[DIR_0M0 * numberOfLBnodes];
+		D27.f[DIR_00M] = &DD27[DIR_00P * numberOfLBnodes];
+		D27.f[DIR_00P] = &DD27[DIR_00M * numberOfLBnodes];
+		D27.f[DIR_MM0] = &DD27[DIR_PP0 * numberOfLBnodes];
+		D27.f[DIR_PP0] = &DD27[DIR_MM0 * numberOfLBnodes];
+		D27.f[DIR_MP0] = &DD27[DIR_PM0 * numberOfLBnodes];
+		D27.f[DIR_PM0] = &DD27[DIR_MP0 * numberOfLBnodes];
+		D27.f[DIR_M0M] = &DD27[DIR_P0P * numberOfLBnodes];
+		D27.f[DIR_P0P] = &DD27[DIR_M0M * numberOfLBnodes];
+		D27.f[DIR_M0P] = &DD27[DIR_P0M * numberOfLBnodes];
+		D27.f[DIR_P0M] = &DD27[DIR_M0P * numberOfLBnodes];
+		D27.f[DIR_0MM] = &DD27[DIR_0PP * numberOfLBnodes];
+		D27.f[DIR_0PP] = &DD27[DIR_0MM * numberOfLBnodes];
+		D27.f[DIR_0MP] = &DD27[DIR_0PM * numberOfLBnodes];
+		D27.f[DIR_0PM] = &DD27[DIR_0MP * numberOfLBnodes];
+		D27.f[DIR_000] = &DD27[DIR_000 * numberOfLBnodes];
+		D27.f[DIR_PPP] = &DD27[DIR_MMM * numberOfLBnodes];
+		D27.f[DIR_MMP] = &DD27[DIR_PPM * numberOfLBnodes];
+		D27.f[DIR_PMP] = &DD27[DIR_MPM * numberOfLBnodes];
+		D27.f[DIR_MPP] = &DD27[DIR_PMM * numberOfLBnodes];
+		D27.f[DIR_PPM] = &DD27[DIR_MMP * numberOfLBnodes];
+		D27.f[DIR_MMM] = &DD27[DIR_PPP * numberOfLBnodes];
+		D27.f[DIR_PMM] = &DD27[DIR_MPP * numberOfLBnodes];
+		D27.f[DIR_MPM] = &DD27[DIR_PMP * numberOfLBnodes];
 	}
 	////////////////////////////////////////////////////////////////////////////////
 	const unsigned  x = threadIdx.x;  // Globaler x-Index 
@@ -1345,33 +1345,33 @@ __global__ void QADPressNEQNeighbor27(
 		unsigned int kNbsw = neighborZ[kNsw];
 		////////////////////////////////////////////////////////////////////////////////
 		//update distributions at neighbor nodes
-        (D27.f[DIR_P00   ])[kNe   ] = f27_W   ;  
-        (D27.f[DIR_M00   ])[kNw   ] = f27_E   ;	
-        (D27.f[DIR_0P0   ])[kNn   ] = f27_S   ;	
-        (D27.f[DIR_0M0   ])[kNs   ] = f27_N   ;	
-        (D27.f[DIR_00P   ])[kNt   ] = f27_B   ;	
-        (D27.f[DIR_00M   ])[kNb   ] = f27_T   ;	
-        (D27.f[DIR_PP0  ])[kNne  ] = f27_SW  ;	
-        (D27.f[DIR_MM0  ])[kNsw  ] = f27_NE  ;	
-        (D27.f[DIR_PM0  ])[kNse  ] = f27_NW  ;	
-        (D27.f[DIR_MP0  ])[kNnw  ] = f27_SE  ;	
-        (D27.f[DIR_P0P  ])[kNte  ] = f27_BW  ;	
-        (D27.f[DIR_M0M  ])[kNbw  ] = f27_TE  ;	
-        (D27.f[DIR_P0M  ])[kNbe  ] = f27_TW  ;	
-        (D27.f[DIR_M0P  ])[kNtw  ] = f27_BE  ;	
-        (D27.f[DIR_0PP  ])[kNtn  ] = f27_BS  ;	
-        (D27.f[DIR_0MM  ])[kNbs  ] = f27_TN  ;	
-        (D27.f[DIR_0PM  ])[kNbn  ] = f27_TS  ;	
-        (D27.f[DIR_0MP  ])[kNts  ] = f27_BN  ;	
+        (D27.f[DIR_P00])[kNe   ] = f27_W   ;  
+        (D27.f[DIR_M00])[kNw   ] = f27_E   ;	
+        (D27.f[DIR_0P0])[kNn   ] = f27_S   ;	
+        (D27.f[DIR_0M0])[kNs   ] = f27_N   ;	
+        (D27.f[DIR_00P])[kNt   ] = f27_B   ;	
+        (D27.f[DIR_00M])[kNb   ] = f27_T   ;	
+        (D27.f[DIR_PP0])[kNne  ] = f27_SW  ;	
+        (D27.f[DIR_MM0])[kNsw  ] = f27_NE  ;	
+        (D27.f[DIR_PM0])[kNse  ] = f27_NW  ;	
+        (D27.f[DIR_MP0])[kNnw  ] = f27_SE  ;	
+        (D27.f[DIR_P0P])[kNte  ] = f27_BW  ;	
+        (D27.f[DIR_M0M])[kNbw  ] = f27_TE  ;	
+        (D27.f[DIR_P0M])[kNbe  ] = f27_TW  ;	
+        (D27.f[DIR_M0P])[kNtw  ] = f27_BE  ;	
+        (D27.f[DIR_0PP])[kNtn  ] = f27_BS  ;	
+        (D27.f[DIR_0MM])[kNbs  ] = f27_TN  ;	
+        (D27.f[DIR_0PM])[kNbn  ] = f27_TS  ;	
+        (D27.f[DIR_0MP])[kNts  ] = f27_BN  ;	
         (D27.f[DIR_000])[kNzero] = f27_ZERO;	
-        (D27.f[DIR_PPP ])[kNtne ] = f27_BSW ;	
-        (D27.f[DIR_MMP ])[kNtsw ] = f27_BNE ;	
-        (D27.f[DIR_PMP ])[kNtse ] = f27_BNW ;	
-        (D27.f[DIR_MPP ])[kNtnw ] = f27_BSE ;	
-        (D27.f[DIR_PPM ])[kNbne ] = f27_TSW ;	
-        (D27.f[DIR_MMM ])[kNbsw ] = f27_TNE ;	
-        (D27.f[DIR_PMM ])[kNbse ] = f27_TNW ;	
-        (D27.f[DIR_MPM ])[kNbnw ] = f27_TSE ;       
+        (D27.f[DIR_PPP])[kNtne ] = f27_BSW ;	
+        (D27.f[DIR_MMP])[kNtsw ] = f27_BNE ;	
+        (D27.f[DIR_PMP])[kNtse ] = f27_BNW ;	
+        (D27.f[DIR_MPP])[kNtnw ] = f27_BSE ;	
+        (D27.f[DIR_PPM])[kNbne ] = f27_TSW ;	
+        (D27.f[DIR_MMM])[kNbsw ] = f27_TNE ;	
+        (D27.f[DIR_PMM])[kNbse ] = f27_TNW ;	
+        (D27.f[DIR_MPM])[kNbnw ] = f27_TSE ;       
 	}
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -1427,91 +1427,91 @@ __global__ void QADVel7( real* DD,
                                     unsigned int* neighborX,
                                     unsigned int* neighborY,
                                     unsigned int* neighborZ,
-                                    unsigned int size_Mat, 
+                                    unsigned long long numberOfLBnodes, 
                                     bool isEvenTimestep)
 {
    Distributions27 D;
    if (isEvenTimestep==true)
    {
-      D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+      D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
    } 
    else
    {
-      D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+      D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
    }
 
    Distributions7 D7;
    if (isEvenTimestep==true)
    {
-      D7.f[0] = &DD7[0*size_Mat];
-      D7.f[1] = &DD7[1*size_Mat];
-      D7.f[2] = &DD7[2*size_Mat];
-      D7.f[3] = &DD7[3*size_Mat];
-      D7.f[4] = &DD7[4*size_Mat];
-      D7.f[5] = &DD7[5*size_Mat];
-      D7.f[6] = &DD7[6*size_Mat];
+      D7.f[0] = &DD7[0*numberOfLBnodes];
+      D7.f[1] = &DD7[1*numberOfLBnodes];
+      D7.f[2] = &DD7[2*numberOfLBnodes];
+      D7.f[3] = &DD7[3*numberOfLBnodes];
+      D7.f[4] = &DD7[4*numberOfLBnodes];
+      D7.f[5] = &DD7[5*numberOfLBnodes];
+      D7.f[6] = &DD7[6*numberOfLBnodes];
    }
    else
    {
-      D7.f[0] = &DD7[0*size_Mat];
-      D7.f[2] = &DD7[1*size_Mat];
-      D7.f[1] = &DD7[2*size_Mat];
-      D7.f[4] = &DD7[3*size_Mat];
-      D7.f[3] = &DD7[4*size_Mat];
-      D7.f[6] = &DD7[5*size_Mat];
-      D7.f[5] = &DD7[6*size_Mat];
+      D7.f[0] = &DD7[0*numberOfLBnodes];
+      D7.f[2] = &DD7[1*numberOfLBnodes];
+      D7.f[1] = &DD7[2*numberOfLBnodes];
+      D7.f[4] = &DD7[3*numberOfLBnodes];
+      D7.f[3] = &DD7[4*numberOfLBnodes];
+      D7.f[6] = &DD7[5*numberOfLBnodes];
+      D7.f[5] = &DD7[6*numberOfLBnodes];
    }
 
 
@@ -1531,12 +1531,12 @@ __global__ void QADVel7( real* DD,
       //////////////////////////////////////////////////////////////////////////////////
       real  *q_dirE,   *q_dirW,   *q_dirN,   *q_dirS,   *q_dirT,   *q_dirB;//, 
 
-      q_dirE   = &QQ[DIR_P00   * numberOfBCnodes];
-      q_dirW   = &QQ[DIR_M00   * numberOfBCnodes];
-      q_dirN   = &QQ[DIR_0P0   * numberOfBCnodes];
-      q_dirS   = &QQ[DIR_0M0   * numberOfBCnodes];
-      q_dirT   = &QQ[DIR_00P   * numberOfBCnodes];
-      q_dirB   = &QQ[DIR_00M   * numberOfBCnodes];
+      q_dirE   = &QQ[DIR_P00 * numberOfBCnodes];
+      q_dirW   = &QQ[DIR_M00 * numberOfBCnodes];
+      q_dirN   = &QQ[DIR_0P0 * numberOfBCnodes];
+      q_dirS   = &QQ[DIR_0M0 * numberOfBCnodes];
+      q_dirT   = &QQ[DIR_00P * numberOfBCnodes];
+      q_dirB   = &QQ[DIR_00M * numberOfBCnodes];
       //////////////////////////////////////////////////////////////////////////////////
       //index
       unsigned int KQK  = k_Q[k];
@@ -1571,32 +1571,32 @@ __global__ void QADVel7( real* DD,
       real f_E,  f_W,  f_N,  f_S,  f_T,  f_B,   f_NE,  f_SW,  f_SE,  f_NW,  f_TE,  f_BW,  f_BE,
          f_TW, f_TN, f_BS, f_BN, f_TS, f_TNE, f_TSW, f_TSE, f_TNW, f_BNE, f_BSW, f_BSE, f_BNW;
 
-      f_W    = (D.f[DIR_P00   ])[ke   ];
-      f_E    = (D.f[DIR_M00   ])[kw   ];
-      f_S    = (D.f[DIR_0P0   ])[kn   ];
-      f_N    = (D.f[DIR_0M0   ])[ks   ];
-      f_B    = (D.f[DIR_00P   ])[kt   ];
-      f_T    = (D.f[DIR_00M   ])[kb   ];
-      f_SW   = (D.f[DIR_PP0  ])[kne  ];
-      f_NE   = (D.f[DIR_MM0  ])[ksw  ];
-      f_NW   = (D.f[DIR_PM0  ])[kse  ];
-      f_SE   = (D.f[DIR_MP0  ])[knw  ];
-      f_BW   = (D.f[DIR_P0P  ])[kte  ];
-      f_TE   = (D.f[DIR_M0M  ])[kbw  ];
-      f_TW   = (D.f[DIR_P0M  ])[kbe  ];
-      f_BE   = (D.f[DIR_M0P  ])[ktw  ];
-      f_BS   = (D.f[DIR_0PP  ])[ktn  ];
-      f_TN   = (D.f[DIR_0MM  ])[kbs  ];
-      f_TS   = (D.f[DIR_0PM  ])[kbn  ];
-      f_BN   = (D.f[DIR_0MP  ])[kts  ];
-      f_BSW  = (D.f[DIR_PPP ])[ktne ];
-      f_BNE  = (D.f[DIR_MMP ])[ktsw ];
-      f_BNW  = (D.f[DIR_PMP ])[ktse ];
-      f_BSE  = (D.f[DIR_MPP ])[ktnw ];
-      f_TSW  = (D.f[DIR_PPM ])[kbne ];
-      f_TNE  = (D.f[DIR_MMM ])[kbsw ];
-      f_TNW  = (D.f[DIR_PMM ])[kbse ];
-      f_TSE  = (D.f[DIR_MPM ])[kbnw ];
+      f_W    = (D.f[DIR_P00])[ke   ];
+      f_E    = (D.f[DIR_M00])[kw   ];
+      f_S    = (D.f[DIR_0P0])[kn   ];
+      f_N    = (D.f[DIR_0M0])[ks   ];
+      f_B    = (D.f[DIR_00P])[kt   ];
+      f_T    = (D.f[DIR_00M])[kb   ];
+      f_SW   = (D.f[DIR_PP0])[kne  ];
+      f_NE   = (D.f[DIR_MM0])[ksw  ];
+      f_NW   = (D.f[DIR_PM0])[kse  ];
+      f_SE   = (D.f[DIR_MP0])[knw  ];
+      f_BW   = (D.f[DIR_P0P])[kte  ];
+      f_TE   = (D.f[DIR_M0M])[kbw  ];
+      f_TW   = (D.f[DIR_P0M])[kbe  ];
+      f_BE   = (D.f[DIR_M0P])[ktw  ];
+      f_BS   = (D.f[DIR_0PP])[ktn  ];
+      f_TN   = (D.f[DIR_0MM])[kbs  ];
+      f_TS   = (D.f[DIR_0PM])[kbn  ];
+      f_BN   = (D.f[DIR_0MP])[kts  ];
+      f_BSW  = (D.f[DIR_PPP])[ktne ];
+      f_BNE  = (D.f[DIR_MMP])[ktsw ];
+      f_BNW  = (D.f[DIR_PMP])[ktse ];
+      f_BSE  = (D.f[DIR_MPP])[ktnw ];
+      f_TSW  = (D.f[DIR_PPM])[kbne ];
+      f_TNE  = (D.f[DIR_MMM])[kbsw ];
+      f_TNW  = (D.f[DIR_PMM])[kbse ];
+      f_TSE  = (D.f[DIR_MPM])[kbnw ];
       ////////////////////////////////////////////////////////////////////////////////
       /*real drho*/;
       real vx1_Inflow   = c0o1;
@@ -1676,23 +1676,23 @@ __global__ void QADVel7( real* DD,
       //pointertausch
       if (isEvenTimestep==false)
       {
-         D7.f[0] = &DD7[0*size_Mat];
-         D7.f[1] = &DD7[1*size_Mat];
-         D7.f[2] = &DD7[2*size_Mat];
-         D7.f[3] = &DD7[3*size_Mat];
-         D7.f[4] = &DD7[4*size_Mat];
-         D7.f[5] = &DD7[5*size_Mat];
-         D7.f[6] = &DD7[6*size_Mat];
+         D7.f[0] = &DD7[0*numberOfLBnodes];
+         D7.f[1] = &DD7[1*numberOfLBnodes];
+         D7.f[2] = &DD7[2*numberOfLBnodes];
+         D7.f[3] = &DD7[3*numberOfLBnodes];
+         D7.f[4] = &DD7[4*numberOfLBnodes];
+         D7.f[5] = &DD7[5*numberOfLBnodes];
+         D7.f[6] = &DD7[6*numberOfLBnodes];
       }
       else
       {
-         D7.f[0] = &DD7[0*size_Mat];
-         D7.f[2] = &DD7[1*size_Mat];
-         D7.f[1] = &DD7[2*size_Mat];
-         D7.f[4] = &DD7[3*size_Mat];
-         D7.f[3] = &DD7[4*size_Mat];
-         D7.f[6] = &DD7[5*size_Mat];
-         D7.f[5] = &DD7[6*size_Mat];
+         D7.f[0] = &DD7[0*numberOfLBnodes];
+         D7.f[2] = &DD7[1*numberOfLBnodes];
+         D7.f[1] = &DD7[2*numberOfLBnodes];
+         D7.f[4] = &DD7[3*numberOfLBnodes];
+         D7.f[3] = &DD7[4*numberOfLBnodes];
+         D7.f[6] = &DD7[5*numberOfLBnodes];
+         D7.f[5] = &DD7[6*numberOfLBnodes];
       }
 
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -1844,131 +1844,131 @@ __global__ void QADVel27(real* DD,
                                     unsigned int* neighborX,
                                     unsigned int* neighborY,
                                     unsigned int* neighborZ,
-                                    unsigned int size_Mat, 
+                                    unsigned long long numberOfLBnodes, 
                                     bool isEvenTimestep)
 {
    Distributions27 D;
    if (isEvenTimestep==true)
    {
-      D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+      D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
    } 
    else
    {
-      D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+      D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
    }
 
    Distributions27 D27;
    if (isEvenTimestep==true)
    {
-      D27.f[DIR_P00   ] = &DD27[DIR_P00   *size_Mat];
-      D27.f[DIR_M00   ] = &DD27[DIR_M00   *size_Mat];
-      D27.f[DIR_0P0   ] = &DD27[DIR_0P0   *size_Mat];
-      D27.f[DIR_0M0   ] = &DD27[DIR_0M0   *size_Mat];
-      D27.f[DIR_00P   ] = &DD27[DIR_00P   *size_Mat];
-      D27.f[DIR_00M   ] = &DD27[DIR_00M   *size_Mat];
-      D27.f[DIR_PP0  ] = &DD27[DIR_PP0  *size_Mat];
-      D27.f[DIR_MM0  ] = &DD27[DIR_MM0  *size_Mat];
-      D27.f[DIR_PM0  ] = &DD27[DIR_PM0  *size_Mat];
-      D27.f[DIR_MP0  ] = &DD27[DIR_MP0  *size_Mat];
-      D27.f[DIR_P0P  ] = &DD27[DIR_P0P  *size_Mat];
-      D27.f[DIR_M0M  ] = &DD27[DIR_M0M  *size_Mat];
-      D27.f[DIR_P0M  ] = &DD27[DIR_P0M  *size_Mat];
-      D27.f[DIR_M0P  ] = &DD27[DIR_M0P  *size_Mat];
-      D27.f[DIR_0PP  ] = &DD27[DIR_0PP  *size_Mat];
-      D27.f[DIR_0MM  ] = &DD27[DIR_0MM  *size_Mat];
-      D27.f[DIR_0PM  ] = &DD27[DIR_0PM  *size_Mat];
-      D27.f[DIR_0MP  ] = &DD27[DIR_0MP  *size_Mat];
-      D27.f[DIR_000] = &DD27[DIR_000*size_Mat];
-      D27.f[DIR_PPP ] = &DD27[DIR_PPP *size_Mat];
-      D27.f[DIR_MMP ] = &DD27[DIR_MMP *size_Mat];
-      D27.f[DIR_PMP ] = &DD27[DIR_PMP *size_Mat];
-      D27.f[DIR_MPP ] = &DD27[DIR_MPP *size_Mat];
-      D27.f[DIR_PPM ] = &DD27[DIR_PPM *size_Mat];
-      D27.f[DIR_MMM ] = &DD27[DIR_MMM *size_Mat];
-      D27.f[DIR_PMM ] = &DD27[DIR_PMM *size_Mat];
-      D27.f[DIR_MPM ] = &DD27[DIR_MPM *size_Mat];
+      D27.f[DIR_P00] = &DD27[DIR_P00 * numberOfLBnodes];
+      D27.f[DIR_M00] = &DD27[DIR_M00 * numberOfLBnodes];
+      D27.f[DIR_0P0] = &DD27[DIR_0P0 * numberOfLBnodes];
+      D27.f[DIR_0M0] = &DD27[DIR_0M0 * numberOfLBnodes];
+      D27.f[DIR_00P] = &DD27[DIR_00P * numberOfLBnodes];
+      D27.f[DIR_00M] = &DD27[DIR_00M * numberOfLBnodes];
+      D27.f[DIR_PP0] = &DD27[DIR_PP0 * numberOfLBnodes];
+      D27.f[DIR_MM0] = &DD27[DIR_MM0 * numberOfLBnodes];
+      D27.f[DIR_PM0] = &DD27[DIR_PM0 * numberOfLBnodes];
+      D27.f[DIR_MP0] = &DD27[DIR_MP0 * numberOfLBnodes];
+      D27.f[DIR_P0P] = &DD27[DIR_P0P * numberOfLBnodes];
+      D27.f[DIR_M0M] = &DD27[DIR_M0M * numberOfLBnodes];
+      D27.f[DIR_P0M] = &DD27[DIR_P0M * numberOfLBnodes];
+      D27.f[DIR_M0P] = &DD27[DIR_M0P * numberOfLBnodes];
+      D27.f[DIR_0PP] = &DD27[DIR_0PP * numberOfLBnodes];
+      D27.f[DIR_0MM] = &DD27[DIR_0MM * numberOfLBnodes];
+      D27.f[DIR_0PM] = &DD27[DIR_0PM * numberOfLBnodes];
+      D27.f[DIR_0MP] = &DD27[DIR_0MP * numberOfLBnodes];
+      D27.f[DIR_000] = &DD27[DIR_000 * numberOfLBnodes];
+      D27.f[DIR_PPP] = &DD27[DIR_PPP * numberOfLBnodes];
+      D27.f[DIR_MMP] = &DD27[DIR_MMP * numberOfLBnodes];
+      D27.f[DIR_PMP] = &DD27[DIR_PMP * numberOfLBnodes];
+      D27.f[DIR_MPP] = &DD27[DIR_MPP * numberOfLBnodes];
+      D27.f[DIR_PPM] = &DD27[DIR_PPM * numberOfLBnodes];
+      D27.f[DIR_MMM] = &DD27[DIR_MMM * numberOfLBnodes];
+      D27.f[DIR_PMM] = &DD27[DIR_PMM * numberOfLBnodes];
+      D27.f[DIR_MPM] = &DD27[DIR_MPM * numberOfLBnodes];
    } 
    else
    {
-      D27.f[DIR_M00   ] = &DD27[DIR_P00   *size_Mat];
-      D27.f[DIR_P00   ] = &DD27[DIR_M00   *size_Mat];
-      D27.f[DIR_0M0   ] = &DD27[DIR_0P0   *size_Mat];
-      D27.f[DIR_0P0   ] = &DD27[DIR_0M0   *size_Mat];
-      D27.f[DIR_00M   ] = &DD27[DIR_00P   *size_Mat];
-      D27.f[DIR_00P   ] = &DD27[DIR_00M   *size_Mat];
-      D27.f[DIR_MM0  ] = &DD27[DIR_PP0  *size_Mat];
-      D27.f[DIR_PP0  ] = &DD27[DIR_MM0  *size_Mat];
-      D27.f[DIR_MP0  ] = &DD27[DIR_PM0  *size_Mat];
-      D27.f[DIR_PM0  ] = &DD27[DIR_MP0  *size_Mat];
-      D27.f[DIR_M0M  ] = &DD27[DIR_P0P  *size_Mat];
-      D27.f[DIR_P0P  ] = &DD27[DIR_M0M  *size_Mat];
-      D27.f[DIR_M0P  ] = &DD27[DIR_P0M  *size_Mat];
-      D27.f[DIR_P0M  ] = &DD27[DIR_M0P  *size_Mat];
-      D27.f[DIR_0MM  ] = &DD27[DIR_0PP  *size_Mat];
-      D27.f[DIR_0PP  ] = &DD27[DIR_0MM  *size_Mat];
-      D27.f[DIR_0MP  ] = &DD27[DIR_0PM  *size_Mat];
-      D27.f[DIR_0PM  ] = &DD27[DIR_0MP  *size_Mat];
-      D27.f[DIR_000] = &DD27[DIR_000*size_Mat];
-      D27.f[DIR_PPP ] = &DD27[DIR_MMM *size_Mat];
-      D27.f[DIR_MMP ] = &DD27[DIR_PPM *size_Mat];
-      D27.f[DIR_PMP ] = &DD27[DIR_MPM *size_Mat];
-      D27.f[DIR_MPP ] = &DD27[DIR_PMM *size_Mat];
-      D27.f[DIR_PPM ] = &DD27[DIR_MMP *size_Mat];
-      D27.f[DIR_MMM ] = &DD27[DIR_PPP *size_Mat];
-      D27.f[DIR_PMM ] = &DD27[DIR_MPP *size_Mat];
-      D27.f[DIR_MPM ] = &DD27[DIR_PMP *size_Mat];
+      D27.f[DIR_M00] = &DD27[DIR_P00 * numberOfLBnodes];
+      D27.f[DIR_P00] = &DD27[DIR_M00 * numberOfLBnodes];
+      D27.f[DIR_0M0] = &DD27[DIR_0P0 * numberOfLBnodes];
+      D27.f[DIR_0P0] = &DD27[DIR_0M0 * numberOfLBnodes];
+      D27.f[DIR_00M] = &DD27[DIR_00P * numberOfLBnodes];
+      D27.f[DIR_00P] = &DD27[DIR_00M * numberOfLBnodes];
+      D27.f[DIR_MM0] = &DD27[DIR_PP0 * numberOfLBnodes];
+      D27.f[DIR_PP0] = &DD27[DIR_MM0 * numberOfLBnodes];
+      D27.f[DIR_MP0] = &DD27[DIR_PM0 * numberOfLBnodes];
+      D27.f[DIR_PM0] = &DD27[DIR_MP0 * numberOfLBnodes];
+      D27.f[DIR_M0M] = &DD27[DIR_P0P * numberOfLBnodes];
+      D27.f[DIR_P0P] = &DD27[DIR_M0M * numberOfLBnodes];
+      D27.f[DIR_M0P] = &DD27[DIR_P0M * numberOfLBnodes];
+      D27.f[DIR_P0M] = &DD27[DIR_M0P * numberOfLBnodes];
+      D27.f[DIR_0MM] = &DD27[DIR_0PP * numberOfLBnodes];
+      D27.f[DIR_0PP] = &DD27[DIR_0MM * numberOfLBnodes];
+      D27.f[DIR_0MP] = &DD27[DIR_0PM * numberOfLBnodes];
+      D27.f[DIR_0PM] = &DD27[DIR_0MP * numberOfLBnodes];
+      D27.f[DIR_000] = &DD27[DIR_000 * numberOfLBnodes];
+      D27.f[DIR_PPP] = &DD27[DIR_MMM * numberOfLBnodes];
+      D27.f[DIR_MMP] = &DD27[DIR_PPM * numberOfLBnodes];
+      D27.f[DIR_PMP] = &DD27[DIR_MPM * numberOfLBnodes];
+      D27.f[DIR_MPP] = &DD27[DIR_PMM * numberOfLBnodes];
+      D27.f[DIR_PPM] = &DD27[DIR_MMP * numberOfLBnodes];
+      D27.f[DIR_MMM] = &DD27[DIR_PPP * numberOfLBnodes];
+      D27.f[DIR_PMM] = &DD27[DIR_MPP * numberOfLBnodes];
+      D27.f[DIR_MPM] = &DD27[DIR_PMP * numberOfLBnodes];
    }
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  x = threadIdx.x;  // Globaler x-Index 
@@ -1989,24 +1989,24 @@ __global__ void QADVel27(real* DD,
             *q_dirBE,  *q_dirTW,  *q_dirTN,  *q_dirBS,  *q_dirBN,  *q_dirTS,
             *q_dirTNE, *q_dirTSW, *q_dirTSE, *q_dirTNW, *q_dirBNE, *q_dirBSW,
             *q_dirBSE, *q_dirBNW; 
-      q_dirE   = &QQ[DIR_P00   * numberOfBCnodes];
-      q_dirW   = &QQ[DIR_M00   * numberOfBCnodes];
-      q_dirN   = &QQ[DIR_0P0   * numberOfBCnodes];
-      q_dirS   = &QQ[DIR_0M0   * numberOfBCnodes];
-      q_dirT   = &QQ[DIR_00P   * numberOfBCnodes];
-      q_dirB   = &QQ[DIR_00M   * numberOfBCnodes];
-      q_dirNE  = &QQ[DIR_PP0  * numberOfBCnodes];
-      q_dirSW  = &QQ[DIR_MM0  * numberOfBCnodes];
-      q_dirSE  = &QQ[DIR_PM0  * numberOfBCnodes];
-      q_dirNW  = &QQ[DIR_MP0  * numberOfBCnodes];
-      q_dirTE  = &QQ[DIR_P0P  * numberOfBCnodes];
-      q_dirBW  = &QQ[DIR_M0M  * numberOfBCnodes];
-      q_dirBE  = &QQ[DIR_P0M  * numberOfBCnodes];
-      q_dirTW  = &QQ[DIR_M0P  * numberOfBCnodes];
-      q_dirTN  = &QQ[DIR_0PP  * numberOfBCnodes];
-      q_dirBS  = &QQ[DIR_0MM  * numberOfBCnodes];
-      q_dirBN  = &QQ[DIR_0PM  * numberOfBCnodes];
-      q_dirTS  = &QQ[DIR_0MP  * numberOfBCnodes];
+      q_dirE   = &QQ[DIR_P00 * numberOfBCnodes];
+      q_dirW   = &QQ[DIR_M00 * numberOfBCnodes];
+      q_dirN   = &QQ[DIR_0P0 * numberOfBCnodes];
+      q_dirS   = &QQ[DIR_0M0 * numberOfBCnodes];
+      q_dirT   = &QQ[DIR_00P * numberOfBCnodes];
+      q_dirB   = &QQ[DIR_00M * numberOfBCnodes];
+      q_dirNE  = &QQ[DIR_PP0 * numberOfBCnodes];
+      q_dirSW  = &QQ[DIR_MM0 * numberOfBCnodes];
+      q_dirSE  = &QQ[DIR_PM0 * numberOfBCnodes];
+      q_dirNW  = &QQ[DIR_MP0 * numberOfBCnodes];
+      q_dirTE  = &QQ[DIR_P0P * numberOfBCnodes];
+      q_dirBW  = &QQ[DIR_M0M * numberOfBCnodes];
+      q_dirBE  = &QQ[DIR_P0M * numberOfBCnodes];
+      q_dirTW  = &QQ[DIR_M0P * numberOfBCnodes];
+      q_dirTN  = &QQ[DIR_0PP * numberOfBCnodes];
+      q_dirBS  = &QQ[DIR_0MM * numberOfBCnodes];
+      q_dirBN  = &QQ[DIR_0PM * numberOfBCnodes];
+      q_dirTS  = &QQ[DIR_0MP * numberOfBCnodes];
       q_dirTNE = &QQ[DIR_PPP * numberOfBCnodes];
       q_dirTSW = &QQ[DIR_MMP * numberOfBCnodes];
       q_dirTSE = &QQ[DIR_PMP * numberOfBCnodes];
@@ -2046,33 +2046,33 @@ __global__ void QADVel27(real* DD,
       unsigned int ktne = KQK;
       unsigned int kbsw = neighborZ[ksw];
       ////////////////////////////////////////////////////////////////////////////////
-      real f_W    = (D.f[DIR_P00   ])[ke   ];
-      real f_E    = (D.f[DIR_M00   ])[kw   ];
-      real f_S    = (D.f[DIR_0P0   ])[kn   ];
-      real f_N    = (D.f[DIR_0M0   ])[ks   ];
-      real f_B    = (D.f[DIR_00P   ])[kt   ];
-      real f_T    = (D.f[DIR_00M   ])[kb   ];
-      real f_SW   = (D.f[DIR_PP0  ])[kne  ];
-      real f_NE   = (D.f[DIR_MM0  ])[ksw  ];
-      real f_NW   = (D.f[DIR_PM0  ])[kse  ];
-      real f_SE   = (D.f[DIR_MP0  ])[knw  ];
-      real f_BW   = (D.f[DIR_P0P  ])[kte  ];
-      real f_TE   = (D.f[DIR_M0M  ])[kbw  ];
-      real f_TW   = (D.f[DIR_P0M  ])[kbe  ];
-      real f_BE   = (D.f[DIR_M0P  ])[ktw  ];
-      real f_BS   = (D.f[DIR_0PP  ])[ktn  ];
-      real f_TN   = (D.f[DIR_0MM  ])[kbs  ];
-      real f_TS   = (D.f[DIR_0PM  ])[kbn  ];
-      real f_BN   = (D.f[DIR_0MP  ])[kts  ];
+      real f_W    = (D.f[DIR_P00])[ke   ];
+      real f_E    = (D.f[DIR_M00])[kw   ];
+      real f_S    = (D.f[DIR_0P0])[kn   ];
+      real f_N    = (D.f[DIR_0M0])[ks   ];
+      real f_B    = (D.f[DIR_00P])[kt   ];
+      real f_T    = (D.f[DIR_00M])[kb   ];
+      real f_SW   = (D.f[DIR_PP0])[kne  ];
+      real f_NE   = (D.f[DIR_MM0])[ksw  ];
+      real f_NW   = (D.f[DIR_PM0])[kse  ];
+      real f_SE   = (D.f[DIR_MP0])[knw  ];
+      real f_BW   = (D.f[DIR_P0P])[kte  ];
+      real f_TE   = (D.f[DIR_M0M])[kbw  ];
+      real f_TW   = (D.f[DIR_P0M])[kbe  ];
+      real f_BE   = (D.f[DIR_M0P])[ktw  ];
+      real f_BS   = (D.f[DIR_0PP])[ktn  ];
+      real f_TN   = (D.f[DIR_0MM])[kbs  ];
+      real f_TS   = (D.f[DIR_0PM])[kbn  ];
+      real f_BN   = (D.f[DIR_0MP])[kts  ];
       real f_ZERO = (D.f[DIR_000])[kzero];
-      real f_BSW  = (D.f[DIR_PPP ])[ktne ];
-      real f_BNE  = (D.f[DIR_MMP ])[ktsw ];
-      real f_BNW  = (D.f[DIR_PMP ])[ktse ];
-      real f_BSE  = (D.f[DIR_MPP ])[ktnw ];
-      real f_TSW  = (D.f[DIR_PPM ])[kbne ];
-      real f_TNE  = (D.f[DIR_MMM ])[kbsw ];
-      real f_TNW  = (D.f[DIR_PMM ])[kbse ];
-      real f_TSE  = (D.f[DIR_MPM ])[kbnw ];
+      real f_BSW  = (D.f[DIR_PPP])[ktne ];
+      real f_BNE  = (D.f[DIR_MMP])[ktsw ];
+      real f_BNW  = (D.f[DIR_PMP])[ktse ];
+      real f_BSE  = (D.f[DIR_MPP])[ktnw ];
+      real f_TSW  = (D.f[DIR_PPM])[kbne ];
+      real f_TNE  = (D.f[DIR_MMM])[kbsw ];
+      real f_TNW  = (D.f[DIR_PMM])[kbse ];
+      real f_TSE  = (D.f[DIR_MPM])[kbnw ];
       ////////////////////////////////////////////////////////////////////////////////
       real vx1, vx2, vx3, /*drho, feq,*/ q;
       ////drho   = f_TSE + f_TNW + f_TNE + f_TSW + f_BSE + f_BNW + f_BNE + f_BSW +
@@ -2098,33 +2098,33 @@ __global__ void QADVel27(real* DD,
       vx2     =  OORho*((f_TNE-f_BSW)+(f_BNE-f_TSW)+(f_BNW-f_TSE)+(f_TNW-f_BSE) +(f_NE-f_SW)+(f_NW-f_SE)+(f_TN-f_BS)+(f_BN-f_TS)+(f_N-f_S));
       vx3     =  OORho*((f_TNE-f_BSW)+(f_TSW-f_BNE)+(f_TSE-f_BNW)+(f_TNW-f_BSE) +(f_TE-f_BW)+(f_TW-f_BE)+(f_TN-f_BS)+(f_TS-f_BN)+(f_T-f_B));
       ////////////////////////////////////////////////////////////////////////////////
-      //real f27_W    = (D27.f[DIR_P00   ])[ke   ];
-      //real f27_E    = (D27.f[DIR_M00   ])[kw   ];
-      //real f27_S    = (D27.f[DIR_0P0   ])[kn   ];
-      //real f27_N    = (D27.f[DIR_0M0   ])[ks   ];
-      //real f27_B    = (D27.f[DIR_00P   ])[kt   ];
-      //real f27_T    = (D27.f[DIR_00M   ])[kb   ];
-      //real f27_SW   = (D27.f[DIR_PP0  ])[kne  ];
-      //real f27_NE   = (D27.f[DIR_MM0  ])[ksw  ];
-      //real f27_NW   = (D27.f[DIR_PM0  ])[kse  ];
-      //real f27_SE   = (D27.f[DIR_MP0  ])[knw  ];
-      //real f27_BW   = (D27.f[DIR_P0P  ])[kte  ];
-      //real f27_TE   = (D27.f[DIR_M0M  ])[kbw  ];
-      //real f27_TW   = (D27.f[DIR_P0M  ])[kbe  ];
-      //real f27_BE   = (D27.f[DIR_M0P  ])[ktw  ];
-      //real f27_BS   = (D27.f[DIR_0PP  ])[ktn  ];
-      //real f27_TN   = (D27.f[DIR_0MM  ])[kbs  ];
-      //real f27_TS   = (D27.f[DIR_0PM  ])[kbn  ];
-      //real f27_BN   = (D27.f[DIR_0MP  ])[kts  ];
+      //real f27_W    = (D27.f[DIR_P00])[ke   ];
+      //real f27_E    = (D27.f[DIR_M00])[kw   ];
+      //real f27_S    = (D27.f[DIR_0P0])[kn   ];
+      //real f27_N    = (D27.f[DIR_0M0])[ks   ];
+      //real f27_B    = (D27.f[DIR_00P])[kt   ];
+      //real f27_T    = (D27.f[DIR_00M])[kb   ];
+      //real f27_SW   = (D27.f[DIR_PP0])[kne  ];
+      //real f27_NE   = (D27.f[DIR_MM0])[ksw  ];
+      //real f27_NW   = (D27.f[DIR_PM0])[kse  ];
+      //real f27_SE   = (D27.f[DIR_MP0])[knw  ];
+      //real f27_BW   = (D27.f[DIR_P0P])[kte  ];
+      //real f27_TE   = (D27.f[DIR_M0M])[kbw  ];
+      //real f27_TW   = (D27.f[DIR_P0M])[kbe  ];
+      //real f27_BE   = (D27.f[DIR_M0P])[ktw  ];
+      //real f27_BS   = (D27.f[DIR_0PP])[ktn  ];
+      //real f27_TN   = (D27.f[DIR_0MM])[kbs  ];
+      //real f27_TS   = (D27.f[DIR_0PM])[kbn  ];
+      //real f27_BN   = (D27.f[DIR_0MP])[kts  ];
       //real f27_ZERO = (D27.f[DIR_000])[kzero];
-      //real f27_BSW  = (D27.f[DIR_PPP ])[ktne ];
-      //real f27_BNE  = (D27.f[DIR_MMP ])[ktsw ];
-      //real f27_BNW  = (D27.f[DIR_PMP ])[ktse ];
-      //real f27_BSE  = (D27.f[DIR_MPP ])[ktnw ];
-      //real f27_TSW  = (D27.f[DIR_PPM ])[kbne ];
-      //real f27_TNE  = (D27.f[DIR_MMM ])[kbsw ];
-      //real f27_TNW  = (D27.f[DIR_PMM ])[kbse ];
-      //real f27_TSE  = (D27.f[DIR_MPM ])[kbnw ];
+      //real f27_BSW  = (D27.f[DIR_PPP])[ktne ];
+      //real f27_BNE  = (D27.f[DIR_MMP])[ktsw ];
+      //real f27_BNW  = (D27.f[DIR_PMP])[ktse ];
+      //real f27_BSE  = (D27.f[DIR_MPP])[ktnw ];
+      //real f27_TSW  = (D27.f[DIR_PPM])[kbne ];
+      //real f27_TNE  = (D27.f[DIR_MMM])[kbsw ];
+      //real f27_TNW  = (D27.f[DIR_PMM])[kbse ];
+      //real f27_TSE  = (D27.f[DIR_MPM])[kbnw ];
       ////////////////////////////////////////////////////////////////////////////////
       real cu_sq=c3o2*(vx1*vx1+vx2*vx2+vx3*vx3);
       ////////////////////////////////////////////////////////////////////////////////
@@ -2233,63 +2233,63 @@ __global__ void QADVel27(real* DD,
       //////////////////////////////////////////////////////////////////////////
       if (isEvenTimestep==false)
       {
-         D27.f[DIR_P00   ] = &DD27[DIR_P00   *size_Mat];
-         D27.f[DIR_M00   ] = &DD27[DIR_M00   *size_Mat];
-         D27.f[DIR_0P0   ] = &DD27[DIR_0P0   *size_Mat];
-         D27.f[DIR_0M0   ] = &DD27[DIR_0M0   *size_Mat];
-         D27.f[DIR_00P   ] = &DD27[DIR_00P   *size_Mat];
-         D27.f[DIR_00M   ] = &DD27[DIR_00M   *size_Mat];
-         D27.f[DIR_PP0  ] = &DD27[DIR_PP0  *size_Mat];
-         D27.f[DIR_MM0  ] = &DD27[DIR_MM0  *size_Mat];
-         D27.f[DIR_PM0  ] = &DD27[DIR_PM0  *size_Mat];
-         D27.f[DIR_MP0  ] = &DD27[DIR_MP0  *size_Mat];
-         D27.f[DIR_P0P  ] = &DD27[DIR_P0P  *size_Mat];
-         D27.f[DIR_M0M  ] = &DD27[DIR_M0M  *size_Mat];
-         D27.f[DIR_P0M  ] = &DD27[DIR_P0M  *size_Mat];
-         D27.f[DIR_M0P  ] = &DD27[DIR_M0P  *size_Mat];
-         D27.f[DIR_0PP  ] = &DD27[DIR_0PP  *size_Mat];
-         D27.f[DIR_0MM  ] = &DD27[DIR_0MM  *size_Mat];
-         D27.f[DIR_0PM  ] = &DD27[DIR_0PM  *size_Mat];
-         D27.f[DIR_0MP  ] = &DD27[DIR_0MP  *size_Mat];
-         D27.f[DIR_000] = &DD27[DIR_000*size_Mat];
-         D27.f[DIR_PPP ] = &DD27[DIR_PPP *size_Mat];
-         D27.f[DIR_MMP ] = &DD27[DIR_MMP *size_Mat];
-         D27.f[DIR_PMP ] = &DD27[DIR_PMP *size_Mat];
-         D27.f[DIR_MPP ] = &DD27[DIR_MPP *size_Mat];
-         D27.f[DIR_PPM ] = &DD27[DIR_PPM *size_Mat];
-         D27.f[DIR_MMM ] = &DD27[DIR_MMM *size_Mat];
-         D27.f[DIR_PMM ] = &DD27[DIR_PMM *size_Mat];
-         D27.f[DIR_MPM ] = &DD27[DIR_MPM *size_Mat];
+         D27.f[DIR_P00] = &DD27[DIR_P00 * numberOfLBnodes];
+         D27.f[DIR_M00] = &DD27[DIR_M00 * numberOfLBnodes];
+         D27.f[DIR_0P0] = &DD27[DIR_0P0 * numberOfLBnodes];
+         D27.f[DIR_0M0] = &DD27[DIR_0M0 * numberOfLBnodes];
+         D27.f[DIR_00P] = &DD27[DIR_00P * numberOfLBnodes];
+         D27.f[DIR_00M] = &DD27[DIR_00M * numberOfLBnodes];
+         D27.f[DIR_PP0] = &DD27[DIR_PP0 * numberOfLBnodes];
+         D27.f[DIR_MM0] = &DD27[DIR_MM0 * numberOfLBnodes];
+         D27.f[DIR_PM0] = &DD27[DIR_PM0 * numberOfLBnodes];
+         D27.f[DIR_MP0] = &DD27[DIR_MP0 * numberOfLBnodes];
+         D27.f[DIR_P0P] = &DD27[DIR_P0P * numberOfLBnodes];
+         D27.f[DIR_M0M] = &DD27[DIR_M0M * numberOfLBnodes];
+         D27.f[DIR_P0M] = &DD27[DIR_P0M * numberOfLBnodes];
+         D27.f[DIR_M0P] = &DD27[DIR_M0P * numberOfLBnodes];
+         D27.f[DIR_0PP] = &DD27[DIR_0PP * numberOfLBnodes];
+         D27.f[DIR_0MM] = &DD27[DIR_0MM * numberOfLBnodes];
+         D27.f[DIR_0PM] = &DD27[DIR_0PM * numberOfLBnodes];
+         D27.f[DIR_0MP] = &DD27[DIR_0MP * numberOfLBnodes];
+         D27.f[DIR_000] = &DD27[DIR_000 * numberOfLBnodes];
+         D27.f[DIR_PPP] = &DD27[DIR_PPP * numberOfLBnodes];
+         D27.f[DIR_MMP] = &DD27[DIR_MMP * numberOfLBnodes];
+         D27.f[DIR_PMP] = &DD27[DIR_PMP * numberOfLBnodes];
+         D27.f[DIR_MPP] = &DD27[DIR_MPP * numberOfLBnodes];
+         D27.f[DIR_PPM] = &DD27[DIR_PPM * numberOfLBnodes];
+         D27.f[DIR_MMM] = &DD27[DIR_MMM * numberOfLBnodes];
+         D27.f[DIR_PMM] = &DD27[DIR_PMM * numberOfLBnodes];
+         D27.f[DIR_MPM] = &DD27[DIR_MPM * numberOfLBnodes];
       } 
       else
       {
-         D27.f[DIR_M00   ] = &DD27[DIR_P00   *size_Mat];
-         D27.f[DIR_P00   ] = &DD27[DIR_M00   *size_Mat];
-         D27.f[DIR_0M0   ] = &DD27[DIR_0P0   *size_Mat];
-         D27.f[DIR_0P0   ] = &DD27[DIR_0M0   *size_Mat];
-         D27.f[DIR_00M   ] = &DD27[DIR_00P   *size_Mat];
-         D27.f[DIR_00P   ] = &DD27[DIR_00M   *size_Mat];
-         D27.f[DIR_MM0  ] = &DD27[DIR_PP0  *size_Mat];
-         D27.f[DIR_PP0  ] = &DD27[DIR_MM0  *size_Mat];
-         D27.f[DIR_MP0  ] = &DD27[DIR_PM0  *size_Mat];
-         D27.f[DIR_PM0  ] = &DD27[DIR_MP0  *size_Mat];
-         D27.f[DIR_M0M  ] = &DD27[DIR_P0P  *size_Mat];
-         D27.f[DIR_P0P  ] = &DD27[DIR_M0M  *size_Mat];
-         D27.f[DIR_M0P  ] = &DD27[DIR_P0M  *size_Mat];
-         D27.f[DIR_P0M  ] = &DD27[DIR_M0P  *size_Mat];
-         D27.f[DIR_0MM  ] = &DD27[DIR_0PP  *size_Mat];
-         D27.f[DIR_0PP  ] = &DD27[DIR_0MM  *size_Mat];
-         D27.f[DIR_0MP  ] = &DD27[DIR_0PM  *size_Mat];
-         D27.f[DIR_0PM  ] = &DD27[DIR_0MP  *size_Mat];
-         D27.f[DIR_000] = &DD27[DIR_000*size_Mat];
-         D27.f[DIR_PPP ] = &DD27[DIR_MMM *size_Mat];
-         D27.f[DIR_MMP ] = &DD27[DIR_PPM *size_Mat];
-         D27.f[DIR_PMP ] = &DD27[DIR_MPM *size_Mat];
-         D27.f[DIR_MPP ] = &DD27[DIR_PMM *size_Mat];
-         D27.f[DIR_PPM ] = &DD27[DIR_MMP *size_Mat];
-         D27.f[DIR_MMM ] = &DD27[DIR_PPP *size_Mat];
-         D27.f[DIR_PMM ] = &DD27[DIR_MPP *size_Mat];
-         D27.f[DIR_MPM ] = &DD27[DIR_PMP *size_Mat];
+         D27.f[DIR_M00] = &DD27[DIR_P00 * numberOfLBnodes];
+         D27.f[DIR_P00] = &DD27[DIR_M00 * numberOfLBnodes];
+         D27.f[DIR_0M0] = &DD27[DIR_0P0 * numberOfLBnodes];
+         D27.f[DIR_0P0] = &DD27[DIR_0M0 * numberOfLBnodes];
+         D27.f[DIR_00M] = &DD27[DIR_00P * numberOfLBnodes];
+         D27.f[DIR_00P] = &DD27[DIR_00M * numberOfLBnodes];
+         D27.f[DIR_MM0] = &DD27[DIR_PP0 * numberOfLBnodes];
+         D27.f[DIR_PP0] = &DD27[DIR_MM0 * numberOfLBnodes];
+         D27.f[DIR_MP0] = &DD27[DIR_PM0 * numberOfLBnodes];
+         D27.f[DIR_PM0] = &DD27[DIR_MP0 * numberOfLBnodes];
+         D27.f[DIR_M0M] = &DD27[DIR_P0P * numberOfLBnodes];
+         D27.f[DIR_P0P] = &DD27[DIR_M0M * numberOfLBnodes];
+         D27.f[DIR_M0P] = &DD27[DIR_P0M * numberOfLBnodes];
+         D27.f[DIR_P0M] = &DD27[DIR_M0P * numberOfLBnodes];
+         D27.f[DIR_0MM] = &DD27[DIR_0PP * numberOfLBnodes];
+         D27.f[DIR_0PP] = &DD27[DIR_0MM * numberOfLBnodes];
+         D27.f[DIR_0MP] = &DD27[DIR_0PM * numberOfLBnodes];
+         D27.f[DIR_0PM] = &DD27[DIR_0MP * numberOfLBnodes];
+         D27.f[DIR_000] = &DD27[DIR_000 * numberOfLBnodes];
+         D27.f[DIR_PPP] = &DD27[DIR_MMM * numberOfLBnodes];
+         D27.f[DIR_MMP] = &DD27[DIR_PPM * numberOfLBnodes];
+         D27.f[DIR_PMP] = &DD27[DIR_MPM * numberOfLBnodes];
+         D27.f[DIR_MPP] = &DD27[DIR_PMM * numberOfLBnodes];
+         D27.f[DIR_PPM] = &DD27[DIR_MMP * numberOfLBnodes];
+         D27.f[DIR_MMM] = &DD27[DIR_PPP * numberOfLBnodes];
+         D27.f[DIR_PMM] = &DD27[DIR_MPP * numberOfLBnodes];
+         D27.f[DIR_MPM] = &DD27[DIR_PMP * numberOfLBnodes];
       }
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       //Test
@@ -2299,24 +2299,24 @@ __global__ void QADVel27(real* DD,
       //Test
       //(D.f[DIR_000])[k]=c1o10;
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-      //(D27.f[DIR_M00  ])[kw  ]= four;
-      //(D27.f[DIR_P00  ])[ke  ]= four;
-      //(D27.f[DIR_0M0  ])[ks  ]= four;
-      //(D27.f[DIR_0P0  ])[kn  ]= four;
-      //(D27.f[DIR_00M  ])[kb  ]= four;
-      //(D27.f[DIR_00P  ])[kt  ]= four;
-      //(D27.f[DIR_MM0 ])[ksw ]= four;
-      //(D27.f[DIR_PP0 ])[kne ]= four;
-      //(D27.f[DIR_MP0 ])[knw ]= four;
-      //(D27.f[DIR_PM0 ])[kse ]= four;
-      //(D27.f[DIR_M0M ])[kbw ]= four;
-      //(D27.f[DIR_P0P ])[kte ]= four;
-      //(D27.f[DIR_M0P ])[ktw ]= four;
-      //(D27.f[DIR_P0M ])[kbe ]= four;
-      //(D27.f[DIR_0MM ])[kbs ]= four;
-      //(D27.f[DIR_0PP ])[ktn ]= four;
-      //(D27.f[DIR_0MP ])[kts ]= four;
-      //(D27.f[DIR_0PM ])[kbn ]= four;
+      //(D27.f[DIR_M00])[kw  ]= four;
+      //(D27.f[DIR_P00])[ke  ]= four;
+      //(D27.f[DIR_0M0])[ks  ]= four;
+      //(D27.f[DIR_0P0])[kn  ]= four;
+      //(D27.f[DIR_00M])[kb  ]= four;
+      //(D27.f[DIR_00P])[kt  ]= four;
+      //(D27.f[DIR_MM0])[ksw ]= four;
+      //(D27.f[DIR_PP0])[kne ]= four;
+      //(D27.f[DIR_MP0])[knw ]= four;
+      //(D27.f[DIR_PM0])[kse ]= four;
+      //(D27.f[DIR_M0M])[kbw ]= four;
+      //(D27.f[DIR_P0P])[kte ]= four;
+      //(D27.f[DIR_M0P])[ktw ]= four;
+      //(D27.f[DIR_P0M])[kbe ]= four;
+      //(D27.f[DIR_0MM])[kbs ]= four;
+      //(D27.f[DIR_0PP])[ktn ]= four;
+      //(D27.f[DIR_0MP])[kts ]= four;
+      //(D27.f[DIR_0PM])[kbn ]= four;
       //(D27.f[DIR_MMM])[kbsw]= four;
       //(D27.f[DIR_PPP])[ktne]= four;
       //(D27.f[DIR_MMP])[ktsw]= four;
@@ -2325,24 +2325,24 @@ __global__ void QADVel27(real* DD,
       //(D27.f[DIR_PMP])[ktse]= four;
       //(D27.f[DIR_MPP])[ktnw]= four;
       //(D27.f[DIR_PMM])[kbse]= four;
-      q = q_dirE[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M00  ])[kw  ]= -feqW27_W  + c2o1 * c2o27  * TempD;
-      q = q_dirW[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P00  ])[ke  ]= -feqW27_E  + c2o1 * c2o27  * TempD;
-      q = q_dirN[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0M0  ])[ks  ]= -feqW27_S  + c2o1 * c2o27  * TempD;
-      q = q_dirS[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0P0  ])[kn  ]= -feqW27_N  + c2o1 * c2o27  * TempD;
-      q = q_dirT[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_00M  ])[kb  ]= -feqW27_B  + c2o1 * c2o27  * TempD;
-      q = q_dirB[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_00P  ])[kt  ]= -feqW27_T  + c2o1 * c2o27  * TempD;
-      q = q_dirNE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MM0 ])[ksw ]= -feqW27_SW + c2o1 * c1o54  * TempD;
-      q = q_dirSW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PP0 ])[kne ]= -feqW27_NE + c2o1 * c1o54  * TempD;
-      q = q_dirSE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MP0 ])[knw ]= -feqW27_NW + c2o1 * c1o54  * TempD;
-      q = q_dirNW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PM0 ])[kse ]= -feqW27_SE + c2o1 * c1o54  * TempD;
-      q = q_dirTE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M0M ])[kbw ]= -feqW27_BW + c2o1 * c1o54  * TempD;
-      q = q_dirBW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P0P ])[kte ]= -feqW27_TE + c2o1 * c1o54  * TempD;
-      q = q_dirBE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M0P ])[ktw ]= -feqW27_TW + c2o1 * c1o54  * TempD;
-      q = q_dirTW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P0M ])[kbe ]= -feqW27_BE + c2o1 * c1o54  * TempD;
-      q = q_dirTN[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0MM ])[kbs ]= -feqW27_BS + c2o1 * c1o54  * TempD;
-      q = q_dirBS[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0PP ])[ktn ]= -feqW27_TN + c2o1 * c1o54  * TempD;
-      q = q_dirBN[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0MP ])[kts ]= -feqW27_TS + c2o1 * c1o54  * TempD;
-      q = q_dirTS[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0PM ])[kbn ]= -feqW27_BN + c2o1 * c1o54  * TempD;
+      q = q_dirE[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M00])[kw  ]= -feqW27_W  + c2o1 * c2o27  * TempD;
+      q = q_dirW[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P00])[ke  ]= -feqW27_E  + c2o1 * c2o27  * TempD;
+      q = q_dirN[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0M0])[ks  ]= -feqW27_S  + c2o1 * c2o27  * TempD;
+      q = q_dirS[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0P0])[kn  ]= -feqW27_N  + c2o1 * c2o27  * TempD;
+      q = q_dirT[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_00M])[kb  ]= -feqW27_B  + c2o1 * c2o27  * TempD;
+      q = q_dirB[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_00P])[kt  ]= -feqW27_T  + c2o1 * c2o27  * TempD;
+      q = q_dirNE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MM0])[ksw ]= -feqW27_SW + c2o1 * c1o54  * TempD;
+      q = q_dirSW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PP0])[kne ]= -feqW27_NE + c2o1 * c1o54  * TempD;
+      q = q_dirSE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MP0])[knw ]= -feqW27_NW + c2o1 * c1o54  * TempD;
+      q = q_dirNW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PM0])[kse ]= -feqW27_SE + c2o1 * c1o54  * TempD;
+      q = q_dirTE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M0M])[kbw ]= -feqW27_BW + c2o1 * c1o54  * TempD;
+      q = q_dirBW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P0P])[kte ]= -feqW27_TE + c2o1 * c1o54  * TempD;
+      q = q_dirBE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M0P])[ktw ]= -feqW27_TW + c2o1 * c1o54  * TempD;
+      q = q_dirTW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P0M])[kbe ]= -feqW27_BE + c2o1 * c1o54  * TempD;
+      q = q_dirTN[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0MM])[kbs ]= -feqW27_BS + c2o1 * c1o54  * TempD;
+      q = q_dirBS[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0PP])[ktn ]= -feqW27_TN + c2o1 * c1o54  * TempD;
+      q = q_dirBN[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0MP])[kts ]= -feqW27_TS + c2o1 * c1o54  * TempD;
+      q = q_dirTS[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0PM])[kbn ]= -feqW27_BN + c2o1 * c1o54  * TempD;
       q = q_dirTNE[k]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MMM])[kbsw]= -feqW27_BSW+ c2o1 * c1o216 * TempD;
       q = q_dirBSW[k]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PPP])[ktne]= -feqW27_TNE+ c2o1 * c1o216 * TempD;
       q = q_dirBNE[k]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MMP])[ktsw]= -feqW27_TSW+ c2o1 * c1o216 * TempD;
@@ -2351,24 +2351,24 @@ __global__ void QADVel27(real* DD,
       q = q_dirBNW[k]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PMP])[ktse]= -feqW27_TSE+ c2o1 * c1o216 * TempD;
       q = q_dirBSE[k]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MPP])[ktnw]= -feqW27_TNW+ c2o1 * c1o216 * TempD;
       q = q_dirTNW[k]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PMM])[kbse]= -feqW27_BSE+ c2o1 * c1o216 * TempD;
-      //q = q_dirE[k];   if (q>=zero && q<=one) (D27.f[DIR_M00  ])[kw  ]=(two*feqW27_W  -(f27_E  *(q*omegaD-one)-omegaD*feq27_E  *(q-one))/(omegaD-one)+f27_W  *q)/(q+one);
-      //q = q_dirW[k];   if (q>=zero && q<=one) (D27.f[DIR_P00  ])[ke  ]=(two*feqW27_E  -(f27_W  *(q*omegaD-one)-omegaD*feq27_W  *(q-one))/(omegaD-one)+f27_E  *q)/(q+one);
-      //q = q_dirN[k];   if (q>=zero && q<=one) (D27.f[DIR_0M0  ])[ks  ]=(two*feqW27_S  -(f27_N  *(q*omegaD-one)-omegaD*feq27_N  *(q-one))/(omegaD-one)+f27_S  *q)/(q+one);
-      //q = q_dirS[k];   if (q>=zero && q<=one) (D27.f[DIR_0P0  ])[kn  ]=(two*feqW27_N  -(f27_S  *(q*omegaD-one)-omegaD*feq27_S  *(q-one))/(omegaD-one)+f27_N  *q)/(q+one);
-      //q = q_dirT[k];   if (q>=zero && q<=one) (D27.f[DIR_00M  ])[kb  ]=(two*feqW27_B  -(f27_T  *(q*omegaD-one)-omegaD*feq27_T  *(q-one))/(omegaD-one)+f27_B  *q)/(q+one);
-      //q = q_dirB[k];   if (q>=zero && q<=one) (D27.f[DIR_00P  ])[kt  ]=(two*feqW27_T  -(f27_B  *(q*omegaD-one)-omegaD*feq27_B  *(q-one))/(omegaD-one)+f27_T  *q)/(q+one);
-      //q = q_dirNE[k];  if (q>=zero && q<=one) (D27.f[DIR_MM0 ])[ksw ]=(two*feqW27_SW -(f27_NE *(q*omegaD-one)-omegaD*feq27_NE *(q-one))/(omegaD-one)+f27_SW *q)/(q+one);
-      //q = q_dirSW[k];  if (q>=zero && q<=one) (D27.f[DIR_PP0 ])[kne ]=(two*feqW27_NE -(f27_SW *(q*omegaD-one)-omegaD*feq27_SW *(q-one))/(omegaD-one)+f27_NE *q)/(q+one);
-      //q = q_dirSE[k];  if (q>=zero && q<=one) (D27.f[DIR_MP0 ])[knw ]=(two*feqW27_NW -(f27_SE *(q*omegaD-one)-omegaD*feq27_SE *(q-one))/(omegaD-one)+f27_NW *q)/(q+one);
-      //q = q_dirNW[k];  if (q>=zero && q<=one) (D27.f[DIR_PM0 ])[kse ]=(two*feqW27_SE -(f27_NW *(q*omegaD-one)-omegaD*feq27_NW *(q-one))/(omegaD-one)+f27_SE *q)/(q+one);
-      //q = q_dirTE[k];  if (q>=zero && q<=one) (D27.f[DIR_M0M ])[kbw ]=(two*feqW27_BW -(f27_TE *(q*omegaD-one)-omegaD*feq27_TE *(q-one))/(omegaD-one)+f27_BW *q)/(q+one);
-      //q = q_dirBW[k];  if (q>=zero && q<=one) (D27.f[DIR_P0P ])[kte ]=(two*feqW27_TE -(f27_BW *(q*omegaD-one)-omegaD*feq27_BW *(q-one))/(omegaD-one)+f27_TE *q)/(q+one);
-      //q = q_dirBE[k];  if (q>=zero && q<=one) (D27.f[DIR_M0P ])[ktw ]=(two*feqW27_TW -(f27_BE *(q*omegaD-one)-omegaD*feq27_BE *(q-one))/(omegaD-one)+f27_TW *q)/(q+one);
-      //q = q_dirTW[k];  if (q>=zero && q<=one) (D27.f[DIR_P0M ])[kbe ]=(two*feqW27_BE -(f27_TW *(q*omegaD-one)-omegaD*feq27_TW *(q-one))/(omegaD-one)+f27_BE *q)/(q+one);
-      //q = q_dirTN[k];  if (q>=zero && q<=one) (D27.f[DIR_0MM ])[kbs ]=(two*feqW27_BS -(f27_TN *(q*omegaD-one)-omegaD*feq27_TN *(q-one))/(omegaD-one)+f27_BS *q)/(q+one);
-      //q = q_dirBS[k];  if (q>=zero && q<=one) (D27.f[DIR_0PP ])[ktn ]=(two*feqW27_TN -(f27_BS *(q*omegaD-one)-omegaD*feq27_BS *(q-one))/(omegaD-one)+f27_TN *q)/(q+one);
-      //q = q_dirBN[k];  if (q>=zero && q<=one) (D27.f[DIR_0MP ])[kts ]=(two*feqW27_TS -(f27_BN *(q*omegaD-one)-omegaD*feq27_BN *(q-one))/(omegaD-one)+f27_TS *q)/(q+one);
-      //q = q_dirTS[k];  if (q>=zero && q<=one) (D27.f[DIR_0PM ])[kbn ]=(two*feqW27_BN -(f27_TS *(q*omegaD-one)-omegaD*feq27_TS *(q-one))/(omegaD-one)+f27_BN *q)/(q+one);
+      //q = q_dirE[k];   if (q>=zero && q<=one) (D27.f[DIR_M00])[kw  ]=(two*feqW27_W  -(f27_E  *(q*omegaD-one)-omegaD*feq27_E  *(q-one))/(omegaD-one)+f27_W  *q)/(q+one);
+      //q = q_dirW[k];   if (q>=zero && q<=one) (D27.f[DIR_P00])[ke  ]=(two*feqW27_E  -(f27_W  *(q*omegaD-one)-omegaD*feq27_W  *(q-one))/(omegaD-one)+f27_E  *q)/(q+one);
+      //q = q_dirN[k];   if (q>=zero && q<=one) (D27.f[DIR_0M0])[ks  ]=(two*feqW27_S  -(f27_N  *(q*omegaD-one)-omegaD*feq27_N  *(q-one))/(omegaD-one)+f27_S  *q)/(q+one);
+      //q = q_dirS[k];   if (q>=zero && q<=one) (D27.f[DIR_0P0])[kn  ]=(two*feqW27_N  -(f27_S  *(q*omegaD-one)-omegaD*feq27_S  *(q-one))/(omegaD-one)+f27_N  *q)/(q+one);
+      //q = q_dirT[k];   if (q>=zero && q<=one) (D27.f[DIR_00M])[kb  ]=(two*feqW27_B  -(f27_T  *(q*omegaD-one)-omegaD*feq27_T  *(q-one))/(omegaD-one)+f27_B  *q)/(q+one);
+      //q = q_dirB[k];   if (q>=zero && q<=one) (D27.f[DIR_00P])[kt  ]=(two*feqW27_T  -(f27_B  *(q*omegaD-one)-omegaD*feq27_B  *(q-one))/(omegaD-one)+f27_T  *q)/(q+one);
+      //q = q_dirNE[k];  if (q>=zero && q<=one) (D27.f[DIR_MM0])[ksw ]=(two*feqW27_SW -(f27_NE *(q*omegaD-one)-omegaD*feq27_NE *(q-one))/(omegaD-one)+f27_SW *q)/(q+one);
+      //q = q_dirSW[k];  if (q>=zero && q<=one) (D27.f[DIR_PP0])[kne ]=(two*feqW27_NE -(f27_SW *(q*omegaD-one)-omegaD*feq27_SW *(q-one))/(omegaD-one)+f27_NE *q)/(q+one);
+      //q = q_dirSE[k];  if (q>=zero && q<=one) (D27.f[DIR_MP0])[knw ]=(two*feqW27_NW -(f27_SE *(q*omegaD-one)-omegaD*feq27_SE *(q-one))/(omegaD-one)+f27_NW *q)/(q+one);
+      //q = q_dirNW[k];  if (q>=zero && q<=one) (D27.f[DIR_PM0])[kse ]=(two*feqW27_SE -(f27_NW *(q*omegaD-one)-omegaD*feq27_NW *(q-one))/(omegaD-one)+f27_SE *q)/(q+one);
+      //q = q_dirTE[k];  if (q>=zero && q<=one) (D27.f[DIR_M0M])[kbw ]=(two*feqW27_BW -(f27_TE *(q*omegaD-one)-omegaD*feq27_TE *(q-one))/(omegaD-one)+f27_BW *q)/(q+one);
+      //q = q_dirBW[k];  if (q>=zero && q<=one) (D27.f[DIR_P0P])[kte ]=(two*feqW27_TE -(f27_BW *(q*omegaD-one)-omegaD*feq27_BW *(q-one))/(omegaD-one)+f27_TE *q)/(q+one);
+      //q = q_dirBE[k];  if (q>=zero && q<=one) (D27.f[DIR_M0P])[ktw ]=(two*feqW27_TW -(f27_BE *(q*omegaD-one)-omegaD*feq27_BE *(q-one))/(omegaD-one)+f27_TW *q)/(q+one);
+      //q = q_dirTW[k];  if (q>=zero && q<=one) (D27.f[DIR_P0M])[kbe ]=(two*feqW27_BE -(f27_TW *(q*omegaD-one)-omegaD*feq27_TW *(q-one))/(omegaD-one)+f27_BE *q)/(q+one);
+      //q = q_dirTN[k];  if (q>=zero && q<=one) (D27.f[DIR_0MM])[kbs ]=(two*feqW27_BS -(f27_TN *(q*omegaD-one)-omegaD*feq27_TN *(q-one))/(omegaD-one)+f27_BS *q)/(q+one);
+      //q = q_dirBS[k];  if (q>=zero && q<=one) (D27.f[DIR_0PP])[ktn ]=(two*feqW27_TN -(f27_BS *(q*omegaD-one)-omegaD*feq27_BS *(q-one))/(omegaD-one)+f27_TN *q)/(q+one);
+      //q = q_dirBN[k];  if (q>=zero && q<=one) (D27.f[DIR_0MP])[kts ]=(two*feqW27_TS -(f27_BN *(q*omegaD-one)-omegaD*feq27_BN *(q-one))/(omegaD-one)+f27_TS *q)/(q+one);
+      //q = q_dirTS[k];  if (q>=zero && q<=one) (D27.f[DIR_0PM])[kbn ]=(two*feqW27_BN -(f27_TS *(q*omegaD-one)-omegaD*feq27_TS *(q-one))/(omegaD-one)+f27_BN *q)/(q+one);
       //q = q_dirTNE[k]; if (q>=zero && q<=one) (D27.f[DIR_MMM])[kbsw]=(two*feqW27_BSW-(f27_TNE*(q*omegaD-one)-omegaD*feq27_TNE*(q-one))/(omegaD-one)+f27_BSW*q)/(q+one);
       //q = q_dirBSW[k]; if (q>=zero && q<=one) (D27.f[DIR_PPP])[ktne]=(two*feqW27_TNE-(f27_BSW*(q*omegaD-one)-omegaD*feq27_BSW*(q-one))/(omegaD-one)+f27_TNE*q)/(q+one);
       //q = q_dirBNE[k]; if (q>=zero && q<=one) (D27.f[DIR_MMP])[ktsw]=(two*feqW27_TSW-(f27_BNE*(q*omegaD-one)-omegaD*feq27_BNE*(q-one))/(omegaD-one)+f27_TSW*q)/(q+one);
@@ -2431,91 +2431,91 @@ __global__ void QAD7( real* DD,
                                  unsigned int* neighborX,
                                  unsigned int* neighborY,
                                  unsigned int* neighborZ,
-                                 unsigned int size_Mat, 
+                                 unsigned long long numberOfLBnodes, 
                                  bool isEvenTimestep)
 {
    Distributions27 D;
    if (isEvenTimestep==true)
    {
-      D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+      D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
    } 
    else
    {
-      D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+      D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
    }
 
    Distributions7 D7;
    if (isEvenTimestep==true)
    {
-      D7.f[0] = &DD7[0*size_Mat];
-      D7.f[1] = &DD7[1*size_Mat];
-      D7.f[2] = &DD7[2*size_Mat];
-      D7.f[3] = &DD7[3*size_Mat];
-      D7.f[4] = &DD7[4*size_Mat];
-      D7.f[5] = &DD7[5*size_Mat];
-      D7.f[6] = &DD7[6*size_Mat];
+      D7.f[0] = &DD7[0*numberOfLBnodes];
+      D7.f[1] = &DD7[1*numberOfLBnodes];
+      D7.f[2] = &DD7[2*numberOfLBnodes];
+      D7.f[3] = &DD7[3*numberOfLBnodes];
+      D7.f[4] = &DD7[4*numberOfLBnodes];
+      D7.f[5] = &DD7[5*numberOfLBnodes];
+      D7.f[6] = &DD7[6*numberOfLBnodes];
    }
    else
    {
-      D7.f[0] = &DD7[0*size_Mat];
-      D7.f[2] = &DD7[1*size_Mat];
-      D7.f[1] = &DD7[2*size_Mat];
-      D7.f[4] = &DD7[3*size_Mat];
-      D7.f[3] = &DD7[4*size_Mat];
-      D7.f[6] = &DD7[5*size_Mat];
-      D7.f[5] = &DD7[6*size_Mat];
+      D7.f[0] = &DD7[0*numberOfLBnodes];
+      D7.f[2] = &DD7[1*numberOfLBnodes];
+      D7.f[1] = &DD7[2*numberOfLBnodes];
+      D7.f[4] = &DD7[3*numberOfLBnodes];
+      D7.f[3] = &DD7[4*numberOfLBnodes];
+      D7.f[6] = &DD7[5*numberOfLBnodes];
+      D7.f[5] = &DD7[6*numberOfLBnodes];
    }
 
 
@@ -2539,24 +2539,24 @@ __global__ void QAD7( real* DD,
       //         *q_dirTNE, *q_dirTSW, *q_dirTSE, *q_dirTNW, *q_dirBNE, *q_dirBSW,
       //         *q_dirBSE, *q_dirBNW;
 
-      q_dirE   = &QQ[DIR_P00   * numberOfBCnodes];
-      q_dirW   = &QQ[DIR_M00   * numberOfBCnodes];
-      q_dirN   = &QQ[DIR_0P0   * numberOfBCnodes];
-      q_dirS   = &QQ[DIR_0M0   * numberOfBCnodes];
-      q_dirT   = &QQ[DIR_00P   * numberOfBCnodes];
-      q_dirB   = &QQ[DIR_00M   * numberOfBCnodes];
-      //q_dirNE  = &QQ[DIR_PP0  * numberOfBCnodes];
-      //q_dirSW  = &QQ[DIR_MM0  * numberOfBCnodes];
-      //q_dirSE  = &QQ[DIR_PM0  * numberOfBCnodes];
-      //q_dirNW  = &QQ[DIR_MP0  * numberOfBCnodes];
-      //q_dirTE  = &QQ[DIR_P0P  * numberOfBCnodes];
-      //q_dirBW  = &QQ[DIR_M0M  * numberOfBCnodes];
-      //q_dirBE  = &QQ[DIR_P0M  * numberOfBCnodes];
-      //q_dirTW  = &QQ[DIR_M0P  * numberOfBCnodes];
-      //q_dirTN  = &QQ[DIR_0PP  * numberOfBCnodes];
-      //q_dirBS  = &QQ[DIR_0MM  * numberOfBCnodes];
-      //q_dirBN  = &QQ[DIR_0PM  * numberOfBCnodes];
-      //q_dirTS  = &QQ[DIR_0MP  * numberOfBCnodes];
+      q_dirE   = &QQ[DIR_P00 * numberOfBCnodes];
+      q_dirW   = &QQ[DIR_M00 * numberOfBCnodes];
+      q_dirN   = &QQ[DIR_0P0 * numberOfBCnodes];
+      q_dirS   = &QQ[DIR_0M0 * numberOfBCnodes];
+      q_dirT   = &QQ[DIR_00P * numberOfBCnodes];
+      q_dirB   = &QQ[DIR_00M * numberOfBCnodes];
+      //q_dirNE  = &QQ[DIR_PP0 * numberOfBCnodes];
+      //q_dirSW  = &QQ[DIR_MM0 * numberOfBCnodes];
+      //q_dirSE  = &QQ[DIR_PM0 * numberOfBCnodes];
+      //q_dirNW  = &QQ[DIR_MP0 * numberOfBCnodes];
+      //q_dirTE  = &QQ[DIR_P0P * numberOfBCnodes];
+      //q_dirBW  = &QQ[DIR_M0M * numberOfBCnodes];
+      //q_dirBE  = &QQ[DIR_P0M * numberOfBCnodes];
+      //q_dirTW  = &QQ[DIR_M0P * numberOfBCnodes];
+      //q_dirTN  = &QQ[DIR_0PP * numberOfBCnodes];
+      //q_dirBS  = &QQ[DIR_0MM * numberOfBCnodes];
+      //q_dirBN  = &QQ[DIR_0PM * numberOfBCnodes];
+      //q_dirTS  = &QQ[DIR_0MP * numberOfBCnodes];
       //q_dirTNE = &QQ[DIR_PPP * numberOfBCnodes];
       //q_dirTSW = &QQ[DIR_MMP * numberOfBCnodes];
       //q_dirTSE = &QQ[DIR_PMP * numberOfBCnodes];
@@ -2599,32 +2599,32 @@ __global__ void QAD7( real* DD,
       real f_E,  f_W,  f_N,  f_S,  f_T,  f_B,   f_NE,  f_SW,  f_SE,  f_NW,  f_TE,  f_BW,  f_BE,
          f_TW, f_TN, f_BS, f_BN, f_TS, f_TNE, f_TSW, f_TSE, f_TNW, f_BNE, f_BSW, f_BSE, f_BNW;
 
-      f_W    = (D.f[DIR_P00   ])[ke   ];
-      f_E    = (D.f[DIR_M00   ])[kw   ];
-      f_S    = (D.f[DIR_0P0   ])[kn   ];
-      f_N    = (D.f[DIR_0M0   ])[ks   ];
-      f_B    = (D.f[DIR_00P   ])[kt   ];
-      f_T    = (D.f[DIR_00M   ])[kb   ];
-      f_SW   = (D.f[DIR_PP0  ])[kne  ];
-      f_NE   = (D.f[DIR_MM0  ])[ksw  ];
-      f_NW   = (D.f[DIR_PM0  ])[kse  ];
-      f_SE   = (D.f[DIR_MP0  ])[knw  ];
-      f_BW   = (D.f[DIR_P0P  ])[kte  ];
-      f_TE   = (D.f[DIR_M0M  ])[kbw  ];
-      f_TW   = (D.f[DIR_P0M  ])[kbe  ];
-      f_BE   = (D.f[DIR_M0P  ])[ktw  ];
-      f_BS   = (D.f[DIR_0PP  ])[ktn  ];
-      f_TN   = (D.f[DIR_0MM  ])[kbs  ];
-      f_TS   = (D.f[DIR_0PM  ])[kbn  ];
-      f_BN   = (D.f[DIR_0MP  ])[kts  ];
-      f_BSW  = (D.f[DIR_PPP ])[ktne ];
-      f_BNE  = (D.f[DIR_MMP ])[ktsw ];
-      f_BNW  = (D.f[DIR_PMP ])[ktse ];
-      f_BSE  = (D.f[DIR_MPP ])[ktnw ];
-      f_TSW  = (D.f[DIR_PPM ])[kbne ];
-      f_TNE  = (D.f[DIR_MMM ])[kbsw ];
-      f_TNW  = (D.f[DIR_PMM ])[kbse ];
-      f_TSE  = (D.f[DIR_MPM ])[kbnw ];
+      f_W    = (D.f[DIR_P00])[ke   ];
+      f_E    = (D.f[DIR_M00])[kw   ];
+      f_S    = (D.f[DIR_0P0])[kn   ];
+      f_N    = (D.f[DIR_0M0])[ks   ];
+      f_B    = (D.f[DIR_00P])[kt   ];
+      f_T    = (D.f[DIR_00M])[kb   ];
+      f_SW   = (D.f[DIR_PP0])[kne  ];
+      f_NE   = (D.f[DIR_MM0])[ksw  ];
+      f_NW   = (D.f[DIR_PM0])[kse  ];
+      f_SE   = (D.f[DIR_MP0])[knw  ];
+      f_BW   = (D.f[DIR_P0P])[kte  ];
+      f_TE   = (D.f[DIR_M0M])[kbw  ];
+      f_TW   = (D.f[DIR_P0M])[kbe  ];
+      f_BE   = (D.f[DIR_M0P])[ktw  ];
+      f_BS   = (D.f[DIR_0PP])[ktn  ];
+      f_TN   = (D.f[DIR_0MM])[kbs  ];
+      f_TS   = (D.f[DIR_0PM])[kbn  ];
+      f_BN   = (D.f[DIR_0MP])[kts  ];
+      f_BSW  = (D.f[DIR_PPP])[ktne ];
+      f_BNE  = (D.f[DIR_MMP])[ktsw ];
+      f_BNW  = (D.f[DIR_PMP])[ktse ];
+      f_BSE  = (D.f[DIR_MPP])[ktnw ];
+      f_TSW  = (D.f[DIR_PPM])[kbne ];
+      f_TNE  = (D.f[DIR_MMM])[kbsw ];
+      f_TNW  = (D.f[DIR_PMM])[kbse ];
+      f_TSE  = (D.f[DIR_MPM])[kbnw ];
       ////////////////////////////////////////////////////////////////////////////////
       real vx1, vx2, vx3/*, drho*/;
       //drho   =    f_TSE + f_TNW + f_TNE + f_TSW + f_BSE + f_BNW + f_BNE + f_BSW +
@@ -2696,23 +2696,23 @@ __global__ void QAD7( real* DD,
       //pointertausch
       if (isEvenTimestep==false)
       {
-         D7.f[0] = &DD7[0*size_Mat];
-         D7.f[1] = &DD7[1*size_Mat];
-         D7.f[2] = &DD7[2*size_Mat];
-         D7.f[3] = &DD7[3*size_Mat];
-         D7.f[4] = &DD7[4*size_Mat];
-         D7.f[5] = &DD7[5*size_Mat];
-         D7.f[6] = &DD7[6*size_Mat];
+         D7.f[0] = &DD7[0*numberOfLBnodes];
+         D7.f[1] = &DD7[1*numberOfLBnodes];
+         D7.f[2] = &DD7[2*numberOfLBnodes];
+         D7.f[3] = &DD7[3*numberOfLBnodes];
+         D7.f[4] = &DD7[4*numberOfLBnodes];
+         D7.f[5] = &DD7[5*numberOfLBnodes];
+         D7.f[6] = &DD7[6*numberOfLBnodes];
       }
       else
       {
-         D7.f[0] = &DD7[0*size_Mat];
-         D7.f[2] = &DD7[1*size_Mat];
-         D7.f[1] = &DD7[2*size_Mat];
-         D7.f[4] = &DD7[3*size_Mat];
-         D7.f[3] = &DD7[4*size_Mat];
-         D7.f[6] = &DD7[5*size_Mat];
-         D7.f[5] = &DD7[6*size_Mat];
+         D7.f[0] = &DD7[0*numberOfLBnodes];
+         D7.f[2] = &DD7[1*numberOfLBnodes];
+         D7.f[1] = &DD7[2*numberOfLBnodes];
+         D7.f[4] = &DD7[3*numberOfLBnodes];
+         D7.f[3] = &DD7[4*numberOfLBnodes];
+         D7.f[6] = &DD7[5*numberOfLBnodes];
+         D7.f[5] = &DD7[6*numberOfLBnodes];
       }
 
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -2864,131 +2864,131 @@ __global__ void QADDirichlet27(
 											 unsigned int* neighborX,
 											 unsigned int* neighborY,
 											 unsigned int* neighborZ,
-											 unsigned int size_Mat, 
+											 unsigned long long numberOfLBnodes, 
 											 bool isEvenTimestep)
 {
    Distributions27 D;
    if (isEvenTimestep==true)
    {
-      D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+      D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
    } 
    else
    {
-      D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+      D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
    }
 
    Distributions27 D27;
    if (isEvenTimestep==true)
    {
-      D27.f[DIR_P00   ] = &DD27[DIR_P00   *size_Mat];
-      D27.f[DIR_M00   ] = &DD27[DIR_M00   *size_Mat];
-      D27.f[DIR_0P0   ] = &DD27[DIR_0P0   *size_Mat];
-      D27.f[DIR_0M0   ] = &DD27[DIR_0M0   *size_Mat];
-      D27.f[DIR_00P   ] = &DD27[DIR_00P   *size_Mat];
-      D27.f[DIR_00M   ] = &DD27[DIR_00M   *size_Mat];
-      D27.f[DIR_PP0  ] = &DD27[DIR_PP0  *size_Mat];
-      D27.f[DIR_MM0  ] = &DD27[DIR_MM0  *size_Mat];
-      D27.f[DIR_PM0  ] = &DD27[DIR_PM0  *size_Mat];
-      D27.f[DIR_MP0  ] = &DD27[DIR_MP0  *size_Mat];
-      D27.f[DIR_P0P  ] = &DD27[DIR_P0P  *size_Mat];
-      D27.f[DIR_M0M  ] = &DD27[DIR_M0M  *size_Mat];
-      D27.f[DIR_P0M  ] = &DD27[DIR_P0M  *size_Mat];
-      D27.f[DIR_M0P  ] = &DD27[DIR_M0P  *size_Mat];
-      D27.f[DIR_0PP  ] = &DD27[DIR_0PP  *size_Mat];
-      D27.f[DIR_0MM  ] = &DD27[DIR_0MM  *size_Mat];
-      D27.f[DIR_0PM  ] = &DD27[DIR_0PM  *size_Mat];
-      D27.f[DIR_0MP  ] = &DD27[DIR_0MP  *size_Mat];
-      D27.f[DIR_000] = &DD27[DIR_000*size_Mat];
-      D27.f[DIR_PPP ] = &DD27[DIR_PPP *size_Mat];
-      D27.f[DIR_MMP ] = &DD27[DIR_MMP *size_Mat];
-      D27.f[DIR_PMP ] = &DD27[DIR_PMP *size_Mat];
-      D27.f[DIR_MPP ] = &DD27[DIR_MPP *size_Mat];
-      D27.f[DIR_PPM ] = &DD27[DIR_PPM *size_Mat];
-      D27.f[DIR_MMM ] = &DD27[DIR_MMM *size_Mat];
-      D27.f[DIR_PMM ] = &DD27[DIR_PMM *size_Mat];
-      D27.f[DIR_MPM ] = &DD27[DIR_MPM *size_Mat];
+      D27.f[DIR_P00] = &DD27[DIR_P00 * numberOfLBnodes];
+      D27.f[DIR_M00] = &DD27[DIR_M00 * numberOfLBnodes];
+      D27.f[DIR_0P0] = &DD27[DIR_0P0 * numberOfLBnodes];
+      D27.f[DIR_0M0] = &DD27[DIR_0M0 * numberOfLBnodes];
+      D27.f[DIR_00P] = &DD27[DIR_00P * numberOfLBnodes];
+      D27.f[DIR_00M] = &DD27[DIR_00M * numberOfLBnodes];
+      D27.f[DIR_PP0] = &DD27[DIR_PP0 * numberOfLBnodes];
+      D27.f[DIR_MM0] = &DD27[DIR_MM0 * numberOfLBnodes];
+      D27.f[DIR_PM0] = &DD27[DIR_PM0 * numberOfLBnodes];
+      D27.f[DIR_MP0] = &DD27[DIR_MP0 * numberOfLBnodes];
+      D27.f[DIR_P0P] = &DD27[DIR_P0P * numberOfLBnodes];
+      D27.f[DIR_M0M] = &DD27[DIR_M0M * numberOfLBnodes];
+      D27.f[DIR_P0M] = &DD27[DIR_P0M * numberOfLBnodes];
+      D27.f[DIR_M0P] = &DD27[DIR_M0P * numberOfLBnodes];
+      D27.f[DIR_0PP] = &DD27[DIR_0PP * numberOfLBnodes];
+      D27.f[DIR_0MM] = &DD27[DIR_0MM * numberOfLBnodes];
+      D27.f[DIR_0PM] = &DD27[DIR_0PM * numberOfLBnodes];
+      D27.f[DIR_0MP] = &DD27[DIR_0MP * numberOfLBnodes];
+      D27.f[DIR_000] = &DD27[DIR_000 * numberOfLBnodes];
+      D27.f[DIR_PPP] = &DD27[DIR_PPP * numberOfLBnodes];
+      D27.f[DIR_MMP] = &DD27[DIR_MMP * numberOfLBnodes];
+      D27.f[DIR_PMP] = &DD27[DIR_PMP * numberOfLBnodes];
+      D27.f[DIR_MPP] = &DD27[DIR_MPP * numberOfLBnodes];
+      D27.f[DIR_PPM] = &DD27[DIR_PPM * numberOfLBnodes];
+      D27.f[DIR_MMM] = &DD27[DIR_MMM * numberOfLBnodes];
+      D27.f[DIR_PMM] = &DD27[DIR_PMM * numberOfLBnodes];
+      D27.f[DIR_MPM] = &DD27[DIR_MPM * numberOfLBnodes];
    } 
    else
    {
-      D27.f[DIR_M00   ] = &DD27[DIR_P00   *size_Mat];
-      D27.f[DIR_P00   ] = &DD27[DIR_M00   *size_Mat];
-      D27.f[DIR_0M0   ] = &DD27[DIR_0P0   *size_Mat];
-      D27.f[DIR_0P0   ] = &DD27[DIR_0M0   *size_Mat];
-      D27.f[DIR_00M   ] = &DD27[DIR_00P   *size_Mat];
-      D27.f[DIR_00P   ] = &DD27[DIR_00M   *size_Mat];
-      D27.f[DIR_MM0  ] = &DD27[DIR_PP0  *size_Mat];
-      D27.f[DIR_PP0  ] = &DD27[DIR_MM0  *size_Mat];
-      D27.f[DIR_MP0  ] = &DD27[DIR_PM0  *size_Mat];
-      D27.f[DIR_PM0  ] = &DD27[DIR_MP0  *size_Mat];
-      D27.f[DIR_M0M  ] = &DD27[DIR_P0P  *size_Mat];
-      D27.f[DIR_P0P  ] = &DD27[DIR_M0M  *size_Mat];
-      D27.f[DIR_M0P  ] = &DD27[DIR_P0M  *size_Mat];
-      D27.f[DIR_P0M  ] = &DD27[DIR_M0P  *size_Mat];
-      D27.f[DIR_0MM  ] = &DD27[DIR_0PP  *size_Mat];
-      D27.f[DIR_0PP  ] = &DD27[DIR_0MM  *size_Mat];
-      D27.f[DIR_0MP  ] = &DD27[DIR_0PM  *size_Mat];
-      D27.f[DIR_0PM  ] = &DD27[DIR_0MP  *size_Mat];
-      D27.f[DIR_000] = &DD27[DIR_000*size_Mat];
-      D27.f[DIR_PPP ] = &DD27[DIR_MMM *size_Mat];
-      D27.f[DIR_MMP ] = &DD27[DIR_PPM *size_Mat];
-      D27.f[DIR_PMP ] = &DD27[DIR_MPM *size_Mat];
-      D27.f[DIR_MPP ] = &DD27[DIR_PMM *size_Mat];
-      D27.f[DIR_PPM ] = &DD27[DIR_MMP *size_Mat];
-      D27.f[DIR_MMM ] = &DD27[DIR_PPP *size_Mat];
-      D27.f[DIR_PMM ] = &DD27[DIR_MPP *size_Mat];
-      D27.f[DIR_MPM ] = &DD27[DIR_PMP *size_Mat];
+      D27.f[DIR_M00] = &DD27[DIR_P00 * numberOfLBnodes];
+      D27.f[DIR_P00] = &DD27[DIR_M00 * numberOfLBnodes];
+      D27.f[DIR_0M0] = &DD27[DIR_0P0 * numberOfLBnodes];
+      D27.f[DIR_0P0] = &DD27[DIR_0M0 * numberOfLBnodes];
+      D27.f[DIR_00M] = &DD27[DIR_00P * numberOfLBnodes];
+      D27.f[DIR_00P] = &DD27[DIR_00M * numberOfLBnodes];
+      D27.f[DIR_MM0] = &DD27[DIR_PP0 * numberOfLBnodes];
+      D27.f[DIR_PP0] = &DD27[DIR_MM0 * numberOfLBnodes];
+      D27.f[DIR_MP0] = &DD27[DIR_PM0 * numberOfLBnodes];
+      D27.f[DIR_PM0] = &DD27[DIR_MP0 * numberOfLBnodes];
+      D27.f[DIR_M0M] = &DD27[DIR_P0P * numberOfLBnodes];
+      D27.f[DIR_P0P] = &DD27[DIR_M0M * numberOfLBnodes];
+      D27.f[DIR_M0P] = &DD27[DIR_P0M * numberOfLBnodes];
+      D27.f[DIR_P0M] = &DD27[DIR_M0P * numberOfLBnodes];
+      D27.f[DIR_0MM] = &DD27[DIR_0PP * numberOfLBnodes];
+      D27.f[DIR_0PP] = &DD27[DIR_0MM * numberOfLBnodes];
+      D27.f[DIR_0MP] = &DD27[DIR_0PM * numberOfLBnodes];
+      D27.f[DIR_0PM] = &DD27[DIR_0MP * numberOfLBnodes];
+      D27.f[DIR_000] = &DD27[DIR_000 * numberOfLBnodes];
+      D27.f[DIR_PPP] = &DD27[DIR_MMM * numberOfLBnodes];
+      D27.f[DIR_MMP] = &DD27[DIR_PPM * numberOfLBnodes];
+      D27.f[DIR_PMP] = &DD27[DIR_MPM * numberOfLBnodes];
+      D27.f[DIR_MPP] = &DD27[DIR_PMM * numberOfLBnodes];
+      D27.f[DIR_PPM] = &DD27[DIR_MMP * numberOfLBnodes];
+      D27.f[DIR_MMM] = &DD27[DIR_PPP * numberOfLBnodes];
+      D27.f[DIR_PMM] = &DD27[DIR_MPP * numberOfLBnodes];
+      D27.f[DIR_MPM] = &DD27[DIR_PMP * numberOfLBnodes];
    }
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  x = threadIdx.x;  // Globaler x-Index 
@@ -3009,24 +3009,24 @@ __global__ void QADDirichlet27(
          *q_dirBE,  *q_dirTW,  *q_dirTN,  *q_dirBS,  *q_dirBN,  *q_dirTS,
          *q_dirTNE, *q_dirTSW, *q_dirTSE, *q_dirTNW, *q_dirBNE, *q_dirBSW,
          *q_dirBSE, *q_dirBNW; 
-      q_dirE   = &QQ[DIR_P00   * numberOfBCnodes];
-      q_dirW   = &QQ[DIR_M00   * numberOfBCnodes];
-      q_dirN   = &QQ[DIR_0P0   * numberOfBCnodes];
-      q_dirS   = &QQ[DIR_0M0   * numberOfBCnodes];
-      q_dirT   = &QQ[DIR_00P   * numberOfBCnodes];
-      q_dirB   = &QQ[DIR_00M   * numberOfBCnodes];
-      q_dirNE  = &QQ[DIR_PP0  * numberOfBCnodes];
-      q_dirSW  = &QQ[DIR_MM0  * numberOfBCnodes];
-      q_dirSE  = &QQ[DIR_PM0  * numberOfBCnodes];
-      q_dirNW  = &QQ[DIR_MP0  * numberOfBCnodes];
-      q_dirTE  = &QQ[DIR_P0P  * numberOfBCnodes];
-      q_dirBW  = &QQ[DIR_M0M  * numberOfBCnodes];
-      q_dirBE  = &QQ[DIR_P0M  * numberOfBCnodes];
-      q_dirTW  = &QQ[DIR_M0P  * numberOfBCnodes];
-      q_dirTN  = &QQ[DIR_0PP  * numberOfBCnodes];
-      q_dirBS  = &QQ[DIR_0MM  * numberOfBCnodes];
-      q_dirBN  = &QQ[DIR_0PM  * numberOfBCnodes];
-      q_dirTS  = &QQ[DIR_0MP  * numberOfBCnodes];
+      q_dirE   = &QQ[DIR_P00 * numberOfBCnodes];
+      q_dirW   = &QQ[DIR_M00 * numberOfBCnodes];
+      q_dirN   = &QQ[DIR_0P0 * numberOfBCnodes];
+      q_dirS   = &QQ[DIR_0M0 * numberOfBCnodes];
+      q_dirT   = &QQ[DIR_00P * numberOfBCnodes];
+      q_dirB   = &QQ[DIR_00M * numberOfBCnodes];
+      q_dirNE  = &QQ[DIR_PP0 * numberOfBCnodes];
+      q_dirSW  = &QQ[DIR_MM0 * numberOfBCnodes];
+      q_dirSE  = &QQ[DIR_PM0 * numberOfBCnodes];
+      q_dirNW  = &QQ[DIR_MP0 * numberOfBCnodes];
+      q_dirTE  = &QQ[DIR_P0P * numberOfBCnodes];
+      q_dirBW  = &QQ[DIR_M0M * numberOfBCnodes];
+      q_dirBE  = &QQ[DIR_P0M * numberOfBCnodes];
+      q_dirTW  = &QQ[DIR_M0P * numberOfBCnodes];
+      q_dirTN  = &QQ[DIR_0PP * numberOfBCnodes];
+      q_dirBS  = &QQ[DIR_0MM * numberOfBCnodes];
+      q_dirBN  = &QQ[DIR_0PM * numberOfBCnodes];
+      q_dirTS  = &QQ[DIR_0MP * numberOfBCnodes];
       q_dirTNE = &QQ[DIR_PPP * numberOfBCnodes];
       q_dirTSW = &QQ[DIR_MMP * numberOfBCnodes];
       q_dirTSE = &QQ[DIR_PMP * numberOfBCnodes];
@@ -3066,33 +3066,33 @@ __global__ void QADDirichlet27(
       unsigned int ktne = KQK;
       unsigned int kbsw = neighborZ[ksw];
       ////////////////////////////////////////////////////////////////////////////////
-      real f_W    = (D.f[DIR_P00   ])[ke   ];
-      real f_E    = (D.f[DIR_M00   ])[kw   ];
-      real f_S    = (D.f[DIR_0P0   ])[kn   ];
-      real f_N    = (D.f[DIR_0M0   ])[ks   ];
-      real f_B    = (D.f[DIR_00P   ])[kt   ];
-      real f_T    = (D.f[DIR_00M   ])[kb   ];
-      real f_SW   = (D.f[DIR_PP0  ])[kne  ];
-      real f_NE   = (D.f[DIR_MM0  ])[ksw  ];
-      real f_NW   = (D.f[DIR_PM0  ])[kse  ];
-      real f_SE   = (D.f[DIR_MP0  ])[knw  ];
-      real f_BW   = (D.f[DIR_P0P  ])[kte  ];
-      real f_TE   = (D.f[DIR_M0M  ])[kbw  ];
-      real f_TW   = (D.f[DIR_P0M  ])[kbe  ];
-      real f_BE   = (D.f[DIR_M0P  ])[ktw  ];
-      real f_BS   = (D.f[DIR_0PP  ])[ktn  ];
-      real f_TN   = (D.f[DIR_0MM  ])[kbs  ];
-      real f_TS   = (D.f[DIR_0PM  ])[kbn  ];
-      real f_BN   = (D.f[DIR_0MP  ])[kts  ];
+      real f_W    = (D.f[DIR_P00])[ke   ];
+      real f_E    = (D.f[DIR_M00])[kw   ];
+      real f_S    = (D.f[DIR_0P0])[kn   ];
+      real f_N    = (D.f[DIR_0M0])[ks   ];
+      real f_B    = (D.f[DIR_00P])[kt   ];
+      real f_T    = (D.f[DIR_00M])[kb   ];
+      real f_SW   = (D.f[DIR_PP0])[kne  ];
+      real f_NE   = (D.f[DIR_MM0])[ksw  ];
+      real f_NW   = (D.f[DIR_PM0])[kse  ];
+      real f_SE   = (D.f[DIR_MP0])[knw  ];
+      real f_BW   = (D.f[DIR_P0P])[kte  ];
+      real f_TE   = (D.f[DIR_M0M])[kbw  ];
+      real f_TW   = (D.f[DIR_P0M])[kbe  ];
+      real f_BE   = (D.f[DIR_M0P])[ktw  ];
+      real f_BS   = (D.f[DIR_0PP])[ktn  ];
+      real f_TN   = (D.f[DIR_0MM])[kbs  ];
+      real f_TS   = (D.f[DIR_0PM])[kbn  ];
+      real f_BN   = (D.f[DIR_0MP])[kts  ];
       real f_ZERO = (D.f[DIR_000])[kzero];
-      real f_BSW  = (D.f[DIR_PPP ])[ktne ];
-      real f_BNE  = (D.f[DIR_MMP ])[ktsw ];
-      real f_BNW  = (D.f[DIR_PMP ])[ktse ];
-      real f_BSE  = (D.f[DIR_MPP ])[ktnw ];
-      real f_TSW  = (D.f[DIR_PPM ])[kbne ];
-      real f_TNE  = (D.f[DIR_MMM ])[kbsw ];
-      real f_TNW  = (D.f[DIR_PMM ])[kbse ];
-      real f_TSE  = (D.f[DIR_MPM ])[kbnw ];
+      real f_BSW  = (D.f[DIR_PPP])[ktne ];
+      real f_BNE  = (D.f[DIR_MMP])[ktsw ];
+      real f_BNW  = (D.f[DIR_PMP])[ktse ];
+      real f_BSE  = (D.f[DIR_MPP])[ktnw ];
+      real f_TSW  = (D.f[DIR_PPM])[kbne ];
+      real f_TNE  = (D.f[DIR_MMM])[kbsw ];
+      real f_TNW  = (D.f[DIR_PMM])[kbse ];
+      real f_TSE  = (D.f[DIR_MPM])[kbnw ];
       ////////////////////////////////////////////////////////////////////////////////
       real vx1, vx2, vx3, /*drho, feq,*/ q;
       ////drho   = f_TSE + f_TNW + f_TNE + f_TSW + f_BSE + f_BNW + f_BNE + f_BSW +
@@ -3118,33 +3118,33 @@ __global__ void QADDirichlet27(
       vx2     =  OORho*((f_TNE-f_BSW)+(f_BNE-f_TSW)+(f_BNW-f_TSE)+(f_TNW-f_BSE) +(f_NE-f_SW)+(f_NW-f_SE)+(f_TN-f_BS)+(f_BN-f_TS)+(f_N-f_S));
       vx3     =  OORho*((f_TNE-f_BSW)+(f_TSW-f_BNE)+(f_TSE-f_BNW)+(f_TNW-f_BSE) +(f_TE-f_BW)+(f_TW-f_BE)+(f_TN-f_BS)+(f_TS-f_BN)+(f_T-f_B));
       ////////////////////////////////////////////////////////////////////////////////
-      real f27_W    = (D27.f[DIR_P00   ])[ke   ];
-      real f27_E    = (D27.f[DIR_M00   ])[kw   ];
-      real f27_S    = (D27.f[DIR_0P0   ])[kn   ];
-      real f27_N    = (D27.f[DIR_0M0   ])[ks   ];
-      real f27_B    = (D27.f[DIR_00P   ])[kt   ];
-      real f27_T    = (D27.f[DIR_00M   ])[kb   ];
-      real f27_SW   = (D27.f[DIR_PP0  ])[kne  ];
-      real f27_NE   = (D27.f[DIR_MM0  ])[ksw  ];
-      real f27_NW   = (D27.f[DIR_PM0  ])[kse  ];
-      real f27_SE   = (D27.f[DIR_MP0  ])[knw  ];
-      real f27_BW   = (D27.f[DIR_P0P  ])[kte  ];
-      real f27_TE   = (D27.f[DIR_M0M  ])[kbw  ];
-      real f27_TW   = (D27.f[DIR_P0M  ])[kbe  ];
-      real f27_BE   = (D27.f[DIR_M0P  ])[ktw  ];
-      real f27_BS   = (D27.f[DIR_0PP  ])[ktn  ];
-      real f27_TN   = (D27.f[DIR_0MM  ])[kbs  ];
-      real f27_TS   = (D27.f[DIR_0PM  ])[kbn  ];
-      real f27_BN   = (D27.f[DIR_0MP  ])[kts  ];
+      real f27_W    = (D27.f[DIR_P00])[ke   ];
+      real f27_E    = (D27.f[DIR_M00])[kw   ];
+      real f27_S    = (D27.f[DIR_0P0])[kn   ];
+      real f27_N    = (D27.f[DIR_0M0])[ks   ];
+      real f27_B    = (D27.f[DIR_00P])[kt   ];
+      real f27_T    = (D27.f[DIR_00M])[kb   ];
+      real f27_SW   = (D27.f[DIR_PP0])[kne  ];
+      real f27_NE   = (D27.f[DIR_MM0])[ksw  ];
+      real f27_NW   = (D27.f[DIR_PM0])[kse  ];
+      real f27_SE   = (D27.f[DIR_MP0])[knw  ];
+      real f27_BW   = (D27.f[DIR_P0P])[kte  ];
+      real f27_TE   = (D27.f[DIR_M0M])[kbw  ];
+      real f27_TW   = (D27.f[DIR_P0M])[kbe  ];
+      real f27_BE   = (D27.f[DIR_M0P])[ktw  ];
+      real f27_BS   = (D27.f[DIR_0PP])[ktn  ];
+      real f27_TN   = (D27.f[DIR_0MM])[kbs  ];
+      real f27_TS   = (D27.f[DIR_0PM])[kbn  ];
+      real f27_BN   = (D27.f[DIR_0MP])[kts  ];
       real f27_ZERO = (D27.f[DIR_000])[kzero];
-      real f27_BSW  = (D27.f[DIR_PPP ])[ktne ];
-      real f27_BNE  = (D27.f[DIR_MMP ])[ktsw ];
-      real f27_BNW  = (D27.f[DIR_PMP ])[ktse ];
-      real f27_BSE  = (D27.f[DIR_MPP ])[ktnw ];
-      real f27_TSW  = (D27.f[DIR_PPM ])[kbne ];
-      real f27_TNE  = (D27.f[DIR_MMM ])[kbsw ];
-      real f27_TNW  = (D27.f[DIR_PMM ])[kbse ];
-      real f27_TSE  = (D27.f[DIR_MPM ])[kbnw ];
+      real f27_BSW  = (D27.f[DIR_PPP])[ktne ];
+      real f27_BNE  = (D27.f[DIR_MMP])[ktsw ];
+      real f27_BNW  = (D27.f[DIR_PMP])[ktse ];
+      real f27_BSE  = (D27.f[DIR_MPP])[ktnw ];
+      real f27_TSW  = (D27.f[DIR_PPM])[kbne ];
+      real f27_TNE  = (D27.f[DIR_MMM])[kbsw ];
+      real f27_TNW  = (D27.f[DIR_PMM])[kbse ];
+      real f27_TSE  = (D27.f[DIR_MPM])[kbnw ];
       ////////////////////////////////////////////////////////////////////////////////
       real cu_sq=c3o2*(vx1*vx1+vx2*vx2+vx3*vx3);
       ////////////////////////////////////////////////////////////////////////////////
@@ -3220,86 +3220,86 @@ __global__ void QADDirichlet27(
       //////////////////////////////////////////////////////////////////////////
       if (isEvenTimestep==false)
       {
-         D27.f[DIR_P00   ] = &DD27[DIR_P00   *size_Mat];
-         D27.f[DIR_M00   ] = &DD27[DIR_M00   *size_Mat];
-         D27.f[DIR_0P0   ] = &DD27[DIR_0P0   *size_Mat];
-         D27.f[DIR_0M0   ] = &DD27[DIR_0M0   *size_Mat];
-         D27.f[DIR_00P   ] = &DD27[DIR_00P   *size_Mat];
-         D27.f[DIR_00M   ] = &DD27[DIR_00M   *size_Mat];
-         D27.f[DIR_PP0  ] = &DD27[DIR_PP0  *size_Mat];
-         D27.f[DIR_MM0  ] = &DD27[DIR_MM0  *size_Mat];
-         D27.f[DIR_PM0  ] = &DD27[DIR_PM0  *size_Mat];
-         D27.f[DIR_MP0  ] = &DD27[DIR_MP0  *size_Mat];
-         D27.f[DIR_P0P  ] = &DD27[DIR_P0P  *size_Mat];
-         D27.f[DIR_M0M  ] = &DD27[DIR_M0M  *size_Mat];
-         D27.f[DIR_P0M  ] = &DD27[DIR_P0M  *size_Mat];
-         D27.f[DIR_M0P  ] = &DD27[DIR_M0P  *size_Mat];
-         D27.f[DIR_0PP  ] = &DD27[DIR_0PP  *size_Mat];
-         D27.f[DIR_0MM  ] = &DD27[DIR_0MM  *size_Mat];
-         D27.f[DIR_0PM  ] = &DD27[DIR_0PM  *size_Mat];
-         D27.f[DIR_0MP  ] = &DD27[DIR_0MP  *size_Mat];
-         D27.f[DIR_000] = &DD27[DIR_000*size_Mat];
-         D27.f[DIR_PPP ] = &DD27[DIR_PPP *size_Mat];
-         D27.f[DIR_MMP ] = &DD27[DIR_MMP *size_Mat];
-         D27.f[DIR_PMP ] = &DD27[DIR_PMP *size_Mat];
-         D27.f[DIR_MPP ] = &DD27[DIR_MPP *size_Mat];
-         D27.f[DIR_PPM ] = &DD27[DIR_PPM *size_Mat];
-         D27.f[DIR_MMM ] = &DD27[DIR_MMM *size_Mat];
-         D27.f[DIR_PMM ] = &DD27[DIR_PMM *size_Mat];
-         D27.f[DIR_MPM ] = &DD27[DIR_MPM *size_Mat];
+         D27.f[DIR_P00] = &DD27[DIR_P00 * numberOfLBnodes];
+         D27.f[DIR_M00] = &DD27[DIR_M00 * numberOfLBnodes];
+         D27.f[DIR_0P0] = &DD27[DIR_0P0 * numberOfLBnodes];
+         D27.f[DIR_0M0] = &DD27[DIR_0M0 * numberOfLBnodes];
+         D27.f[DIR_00P] = &DD27[DIR_00P * numberOfLBnodes];
+         D27.f[DIR_00M] = &DD27[DIR_00M * numberOfLBnodes];
+         D27.f[DIR_PP0] = &DD27[DIR_PP0 * numberOfLBnodes];
+         D27.f[DIR_MM0] = &DD27[DIR_MM0 * numberOfLBnodes];
+         D27.f[DIR_PM0] = &DD27[DIR_PM0 * numberOfLBnodes];
+         D27.f[DIR_MP0] = &DD27[DIR_MP0 * numberOfLBnodes];
+         D27.f[DIR_P0P] = &DD27[DIR_P0P * numberOfLBnodes];
+         D27.f[DIR_M0M] = &DD27[DIR_M0M * numberOfLBnodes];
+         D27.f[DIR_P0M] = &DD27[DIR_P0M * numberOfLBnodes];
+         D27.f[DIR_M0P] = &DD27[DIR_M0P * numberOfLBnodes];
+         D27.f[DIR_0PP] = &DD27[DIR_0PP * numberOfLBnodes];
+         D27.f[DIR_0MM] = &DD27[DIR_0MM * numberOfLBnodes];
+         D27.f[DIR_0PM] = &DD27[DIR_0PM * numberOfLBnodes];
+         D27.f[DIR_0MP] = &DD27[DIR_0MP * numberOfLBnodes];
+         D27.f[DIR_000] = &DD27[DIR_000 * numberOfLBnodes];
+         D27.f[DIR_PPP] = &DD27[DIR_PPP * numberOfLBnodes];
+         D27.f[DIR_MMP] = &DD27[DIR_MMP * numberOfLBnodes];
+         D27.f[DIR_PMP] = &DD27[DIR_PMP * numberOfLBnodes];
+         D27.f[DIR_MPP] = &DD27[DIR_MPP * numberOfLBnodes];
+         D27.f[DIR_PPM] = &DD27[DIR_PPM * numberOfLBnodes];
+         D27.f[DIR_MMM] = &DD27[DIR_MMM * numberOfLBnodes];
+         D27.f[DIR_PMM] = &DD27[DIR_PMM * numberOfLBnodes];
+         D27.f[DIR_MPM] = &DD27[DIR_MPM * numberOfLBnodes];
       } 
       else
       {
-         D27.f[DIR_M00   ] = &DD27[DIR_P00   *size_Mat];
-         D27.f[DIR_P00   ] = &DD27[DIR_M00   *size_Mat];
-         D27.f[DIR_0M0   ] = &DD27[DIR_0P0   *size_Mat];
-         D27.f[DIR_0P0   ] = &DD27[DIR_0M0   *size_Mat];
-         D27.f[DIR_00M   ] = &DD27[DIR_00P   *size_Mat];
-         D27.f[DIR_00P   ] = &DD27[DIR_00M   *size_Mat];
-         D27.f[DIR_MM0  ] = &DD27[DIR_PP0  *size_Mat];
-         D27.f[DIR_PP0  ] = &DD27[DIR_MM0  *size_Mat];
-         D27.f[DIR_MP0  ] = &DD27[DIR_PM0  *size_Mat];
-         D27.f[DIR_PM0  ] = &DD27[DIR_MP0  *size_Mat];
-         D27.f[DIR_M0M  ] = &DD27[DIR_P0P  *size_Mat];
-         D27.f[DIR_P0P  ] = &DD27[DIR_M0M  *size_Mat];
-         D27.f[DIR_M0P  ] = &DD27[DIR_P0M  *size_Mat];
-         D27.f[DIR_P0M  ] = &DD27[DIR_M0P  *size_Mat];
-         D27.f[DIR_0MM  ] = &DD27[DIR_0PP  *size_Mat];
-         D27.f[DIR_0PP  ] = &DD27[DIR_0MM  *size_Mat];
-         D27.f[DIR_0MP  ] = &DD27[DIR_0PM  *size_Mat];
-         D27.f[DIR_0PM  ] = &DD27[DIR_0MP  *size_Mat];
-         D27.f[DIR_000] = &DD27[DIR_000*size_Mat];
-         D27.f[DIR_PPP ] = &DD27[DIR_MMM *size_Mat];
-         D27.f[DIR_MMP ] = &DD27[DIR_PPM *size_Mat];
-         D27.f[DIR_PMP ] = &DD27[DIR_MPM *size_Mat];
-         D27.f[DIR_MPP ] = &DD27[DIR_PMM *size_Mat];
-         D27.f[DIR_PPM ] = &DD27[DIR_MMP *size_Mat];
-         D27.f[DIR_MMM ] = &DD27[DIR_PPP *size_Mat];
-         D27.f[DIR_PMM ] = &DD27[DIR_MPP *size_Mat];
-         D27.f[DIR_MPM ] = &DD27[DIR_PMP *size_Mat];
+         D27.f[DIR_M00] = &DD27[DIR_P00 * numberOfLBnodes];
+         D27.f[DIR_P00] = &DD27[DIR_M00 * numberOfLBnodes];
+         D27.f[DIR_0M0] = &DD27[DIR_0P0 * numberOfLBnodes];
+         D27.f[DIR_0P0] = &DD27[DIR_0M0 * numberOfLBnodes];
+         D27.f[DIR_00M] = &DD27[DIR_00P * numberOfLBnodes];
+         D27.f[DIR_00P] = &DD27[DIR_00M * numberOfLBnodes];
+         D27.f[DIR_MM0] = &DD27[DIR_PP0 * numberOfLBnodes];
+         D27.f[DIR_PP0] = &DD27[DIR_MM0 * numberOfLBnodes];
+         D27.f[DIR_MP0] = &DD27[DIR_PM0 * numberOfLBnodes];
+         D27.f[DIR_PM0] = &DD27[DIR_MP0 * numberOfLBnodes];
+         D27.f[DIR_M0M] = &DD27[DIR_P0P * numberOfLBnodes];
+         D27.f[DIR_P0P] = &DD27[DIR_M0M * numberOfLBnodes];
+         D27.f[DIR_M0P] = &DD27[DIR_P0M * numberOfLBnodes];
+         D27.f[DIR_P0M] = &DD27[DIR_M0P * numberOfLBnodes];
+         D27.f[DIR_0MM] = &DD27[DIR_0PP * numberOfLBnodes];
+         D27.f[DIR_0PP] = &DD27[DIR_0MM * numberOfLBnodes];
+         D27.f[DIR_0MP] = &DD27[DIR_0PM * numberOfLBnodes];
+         D27.f[DIR_0PM] = &DD27[DIR_0MP * numberOfLBnodes];
+         D27.f[DIR_000] = &DD27[DIR_000 * numberOfLBnodes];
+         D27.f[DIR_PPP] = &DD27[DIR_MMM * numberOfLBnodes];
+         D27.f[DIR_MMP] = &DD27[DIR_PPM * numberOfLBnodes];
+         D27.f[DIR_PMP] = &DD27[DIR_MPM * numberOfLBnodes];
+         D27.f[DIR_MPP] = &DD27[DIR_PMM * numberOfLBnodes];
+         D27.f[DIR_PPM] = &DD27[DIR_MMP * numberOfLBnodes];
+         D27.f[DIR_MMM] = &DD27[DIR_PPP * numberOfLBnodes];
+         D27.f[DIR_PMM] = &DD27[DIR_MPP * numberOfLBnodes];
+         D27.f[DIR_MPM] = &DD27[DIR_PMP * numberOfLBnodes];
       }
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       //Test
       //(D.f[DIR_000])[k]=0.1f;
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-      q = q_dirE[  ke   ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M00  ])[kw  ]=(c2o1*feqW27_W  -(f27_E  *(q*omegaD-c1o1)-omegaD*feq27_E  *(q-c1o1))/(omegaD-c1o1)+f27_W  *q)/(q+c1o1);
-      q = q_dirW[  kw   ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P00  ])[ke  ]=(c2o1*feqW27_E  -(f27_W  *(q*omegaD-c1o1)-omegaD*feq27_W  *(q-c1o1))/(omegaD-c1o1)+f27_E  *q)/(q+c1o1);
-      q = q_dirN[  kn   ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0M0  ])[ks  ]=(c2o1*feqW27_S  -(f27_N  *(q*omegaD-c1o1)-omegaD*feq27_N  *(q-c1o1))/(omegaD-c1o1)+f27_S  *q)/(q+c1o1);
-      q = q_dirS[  ks   ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0P0  ])[kn  ]=(c2o1*feqW27_N  -(f27_S  *(q*omegaD-c1o1)-omegaD*feq27_S  *(q-c1o1))/(omegaD-c1o1)+f27_N  *q)/(q+c1o1);
-      q = q_dirT[  kt   ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_00M  ])[kb  ]=(c2o1*feqW27_B  -(f27_T  *(q*omegaD-c1o1)-omegaD*feq27_T  *(q-c1o1))/(omegaD-c1o1)+f27_B  *q)/(q+c1o1);
-      q = q_dirB[  kb   ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_00P  ])[kt  ]=(c2o1*feqW27_T  -(f27_B  *(q*omegaD-c1o1)-omegaD*feq27_B  *(q-c1o1))/(omegaD-c1o1)+f27_T  *q)/(q+c1o1);
-      q = q_dirNE[ kne  ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MM0 ])[ksw ]=(c2o1*feqW27_SW -(f27_NE *(q*omegaD-c1o1)-omegaD*feq27_NE *(q-c1o1))/(omegaD-c1o1)+f27_SW *q)/(q+c1o1);
-      q = q_dirSW[ ksw  ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PP0 ])[kne ]=(c2o1*feqW27_NE -(f27_SW *(q*omegaD-c1o1)-omegaD*feq27_SW *(q-c1o1))/(omegaD-c1o1)+f27_NE *q)/(q+c1o1);
-      q = q_dirSE[ kse  ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MP0 ])[knw ]=(c2o1*feqW27_NW -(f27_SE *(q*omegaD-c1o1)-omegaD*feq27_SE *(q-c1o1))/(omegaD-c1o1)+f27_NW *q)/(q+c1o1);
-      q = q_dirNW[ knw  ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PM0 ])[kse ]=(c2o1*feqW27_SE -(f27_NW *(q*omegaD-c1o1)-omegaD*feq27_NW *(q-c1o1))/(omegaD-c1o1)+f27_SE *q)/(q+c1o1);
-      q = q_dirTE[ kte  ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M0M ])[kbw ]=(c2o1*feqW27_BW -(f27_TE *(q*omegaD-c1o1)-omegaD*feq27_TE *(q-c1o1))/(omegaD-c1o1)+f27_BW *q)/(q+c1o1);
-      q = q_dirBW[ kbw  ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P0P ])[kte ]=(c2o1*feqW27_TE -(f27_BW *(q*omegaD-c1o1)-omegaD*feq27_BW *(q-c1o1))/(omegaD-c1o1)+f27_TE *q)/(q+c1o1);
-      q = q_dirBE[ kbe  ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M0P ])[ktw ]=(c2o1*feqW27_TW -(f27_BE *(q*omegaD-c1o1)-omegaD*feq27_BE *(q-c1o1))/(omegaD-c1o1)+f27_TW *q)/(q+c1o1);
-      q = q_dirTW[ ktw  ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P0M ])[kbe ]=(c2o1*feqW27_BE -(f27_TW *(q*omegaD-c1o1)-omegaD*feq27_TW *(q-c1o1))/(omegaD-c1o1)+f27_BE *q)/(q+c1o1);
-      q = q_dirTN[ ktn  ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0MM ])[kbs ]=(c2o1*feqW27_BS -(f27_TN *(q*omegaD-c1o1)-omegaD*feq27_TN *(q-c1o1))/(omegaD-c1o1)+f27_BS *q)/(q+c1o1);
-      q = q_dirBS[ kbs  ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0PP ])[ktn ]=(c2o1*feqW27_TN -(f27_BS *(q*omegaD-c1o1)-omegaD*feq27_BS *(q-c1o1))/(omegaD-c1o1)+f27_TN *q)/(q+c1o1);
-      q = q_dirBN[ kbn  ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0MP ])[kts ]=(c2o1*feqW27_TS -(f27_BN *(q*omegaD-c1o1)-omegaD*feq27_BN *(q-c1o1))/(omegaD-c1o1)+f27_TS *q)/(q+c1o1);
-      q = q_dirTS[ kts  ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0PM ])[kbn ]=(c2o1*feqW27_BN -(f27_TS *(q*omegaD-c1o1)-omegaD*feq27_TS *(q-c1o1))/(omegaD-c1o1)+f27_BN *q)/(q+c1o1);
+      q = q_dirE[  ke   ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M00])[kw  ]=(c2o1*feqW27_W  -(f27_E  *(q*omegaD-c1o1)-omegaD*feq27_E  *(q-c1o1))/(omegaD-c1o1)+f27_W  *q)/(q+c1o1);
+      q = q_dirW[  kw   ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P00])[ke  ]=(c2o1*feqW27_E  -(f27_W  *(q*omegaD-c1o1)-omegaD*feq27_W  *(q-c1o1))/(omegaD-c1o1)+f27_E  *q)/(q+c1o1);
+      q = q_dirN[  kn   ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0M0])[ks  ]=(c2o1*feqW27_S  -(f27_N  *(q*omegaD-c1o1)-omegaD*feq27_N  *(q-c1o1))/(omegaD-c1o1)+f27_S  *q)/(q+c1o1);
+      q = q_dirS[  ks   ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0P0])[kn  ]=(c2o1*feqW27_N  -(f27_S  *(q*omegaD-c1o1)-omegaD*feq27_S  *(q-c1o1))/(omegaD-c1o1)+f27_N  *q)/(q+c1o1);
+      q = q_dirT[  kt   ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_00M])[kb  ]=(c2o1*feqW27_B  -(f27_T  *(q*omegaD-c1o1)-omegaD*feq27_T  *(q-c1o1))/(omegaD-c1o1)+f27_B  *q)/(q+c1o1);
+      q = q_dirB[  kb   ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_00P])[kt  ]=(c2o1*feqW27_T  -(f27_B  *(q*omegaD-c1o1)-omegaD*feq27_B  *(q-c1o1))/(omegaD-c1o1)+f27_T  *q)/(q+c1o1);
+      q = q_dirNE[ kne  ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MM0])[ksw ]=(c2o1*feqW27_SW -(f27_NE *(q*omegaD-c1o1)-omegaD*feq27_NE *(q-c1o1))/(omegaD-c1o1)+f27_SW *q)/(q+c1o1);
+      q = q_dirSW[ ksw  ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PP0])[kne ]=(c2o1*feqW27_NE -(f27_SW *(q*omegaD-c1o1)-omegaD*feq27_SW *(q-c1o1))/(omegaD-c1o1)+f27_NE *q)/(q+c1o1);
+      q = q_dirSE[ kse  ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MP0])[knw ]=(c2o1*feqW27_NW -(f27_SE *(q*omegaD-c1o1)-omegaD*feq27_SE *(q-c1o1))/(omegaD-c1o1)+f27_NW *q)/(q+c1o1);
+      q = q_dirNW[ knw  ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PM0])[kse ]=(c2o1*feqW27_SE -(f27_NW *(q*omegaD-c1o1)-omegaD*feq27_NW *(q-c1o1))/(omegaD-c1o1)+f27_SE *q)/(q+c1o1);
+      q = q_dirTE[ kte  ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M0M])[kbw ]=(c2o1*feqW27_BW -(f27_TE *(q*omegaD-c1o1)-omegaD*feq27_TE *(q-c1o1))/(omegaD-c1o1)+f27_BW *q)/(q+c1o1);
+      q = q_dirBW[ kbw  ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P0P])[kte ]=(c2o1*feqW27_TE -(f27_BW *(q*omegaD-c1o1)-omegaD*feq27_BW *(q-c1o1))/(omegaD-c1o1)+f27_TE *q)/(q+c1o1);
+      q = q_dirBE[ kbe  ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M0P])[ktw ]=(c2o1*feqW27_TW -(f27_BE *(q*omegaD-c1o1)-omegaD*feq27_BE *(q-c1o1))/(omegaD-c1o1)+f27_TW *q)/(q+c1o1);
+      q = q_dirTW[ ktw  ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P0M])[kbe ]=(c2o1*feqW27_BE -(f27_TW *(q*omegaD-c1o1)-omegaD*feq27_TW *(q-c1o1))/(omegaD-c1o1)+f27_BE *q)/(q+c1o1);
+      q = q_dirTN[ ktn  ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0MM])[kbs ]=(c2o1*feqW27_BS -(f27_TN *(q*omegaD-c1o1)-omegaD*feq27_TN *(q-c1o1))/(omegaD-c1o1)+f27_BS *q)/(q+c1o1);
+      q = q_dirBS[ kbs  ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0PP])[ktn ]=(c2o1*feqW27_TN -(f27_BS *(q*omegaD-c1o1)-omegaD*feq27_BS *(q-c1o1))/(omegaD-c1o1)+f27_TN *q)/(q+c1o1);
+      q = q_dirBN[ kbn  ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0MP])[kts ]=(c2o1*feqW27_TS -(f27_BN *(q*omegaD-c1o1)-omegaD*feq27_BN *(q-c1o1))/(omegaD-c1o1)+f27_TS *q)/(q+c1o1);
+      q = q_dirTS[ kts  ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0PM])[kbn ]=(c2o1*feqW27_BN -(f27_TS *(q*omegaD-c1o1)-omegaD*feq27_TS *(q-c1o1))/(omegaD-c1o1)+f27_BN *q)/(q+c1o1);
       q = q_dirTNE[ktne ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MMM])[kbsw]=(c2o1*feqW27_BSW-(f27_TNE*(q*omegaD-c1o1)-omegaD*feq27_TNE*(q-c1o1))/(omegaD-c1o1)+f27_BSW*q)/(q+c1o1);
       q = q_dirBSW[kbsw ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PPP])[ktne]=(c2o1*feqW27_TNE-(f27_BSW*(q*omegaD-c1o1)-omegaD*feq27_BSW*(q-c1o1))/(omegaD-c1o1)+f27_TNE*q)/(q+c1o1);
       q = q_dirBNE[kbne ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MMP])[ktsw]=(c2o1*feqW27_TSW-(f27_BNE*(q*omegaD-c1o1)-omegaD*feq27_BNE*(q-c1o1))/(omegaD-c1o1)+f27_TSW*q)/(q+c1o1);
@@ -3308,24 +3308,24 @@ __global__ void QADDirichlet27(
       q = q_dirBNW[kbnw ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PMP])[ktse]=(c2o1*feqW27_TSE-(f27_BNW*(q*omegaD-c1o1)-omegaD*feq27_BNW*(q-c1o1))/(omegaD-c1o1)+f27_TSE*q)/(q+c1o1);
       q = q_dirBSE[kbse ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MPP])[ktnw]=(c2o1*feqW27_TNW-(f27_BSE*(q*omegaD-c1o1)-omegaD*feq27_BSE*(q-c1o1))/(omegaD-c1o1)+f27_TNW*q)/(q+c1o1);
       q = q_dirTNW[ktnw ]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PMM])[kbse]=(c2o1*feqW27_BSE-(f27_TNW*(q*omegaD-c1o1)-omegaD*feq27_TNW*(q-c1o1))/(omegaD-c1o1)+f27_BSE*q)/(q+c1o1);
-      //q = q_dirE[k];   if (q>=zero && q<=one) (D27.f[DIR_M00  ])[kw  ]=(two*feqW27_W  -(f27_E  *(q*omegaD-one)-omegaD*feq27_E  *(q-one))/(omegaD-one)+f27_W  *q)/(q+one);
-      //q = q_dirW[k];   if (q>=zero && q<=one) (D27.f[DIR_P00  ])[ke  ]=(two*feqW27_E  -(f27_W  *(q*omegaD-one)-omegaD*feq27_W  *(q-one))/(omegaD-one)+f27_E  *q)/(q+one);
-      //q = q_dirN[k];   if (q>=zero && q<=one) (D27.f[DIR_0M0  ])[ks  ]=(two*feqW27_S  -(f27_N  *(q*omegaD-one)-omegaD*feq27_N  *(q-one))/(omegaD-one)+f27_S  *q)/(q+one);
-      //q = q_dirS[k];   if (q>=zero && q<=one) (D27.f[DIR_0P0  ])[kn  ]=(two*feqW27_N  -(f27_S  *(q*omegaD-one)-omegaD*feq27_S  *(q-one))/(omegaD-one)+f27_N  *q)/(q+one);
-      //q = q_dirT[k];   if (q>=zero && q<=one) (D27.f[DIR_00M  ])[kb  ]=(two*feqW27_B  -(f27_T  *(q*omegaD-one)-omegaD*feq27_T  *(q-one))/(omegaD-one)+f27_B  *q)/(q+one);
-      //q = q_dirB[k];   if (q>=zero && q<=one) (D27.f[DIR_00P  ])[kt  ]=(two*feqW27_T  -(f27_B  *(q*omegaD-one)-omegaD*feq27_B  *(q-one))/(omegaD-one)+f27_T  *q)/(q+one);
-      //q = q_dirNE[k];  if (q>=zero && q<=one) (D27.f[DIR_MM0 ])[ksw ]=(two*feqW27_SW -(f27_NE *(q*omegaD-one)-omegaD*feq27_NE *(q-one))/(omegaD-one)+f27_SW *q)/(q+one);
-      //q = q_dirSW[k];  if (q>=zero && q<=one) (D27.f[DIR_PP0 ])[kne ]=(two*feqW27_NE -(f27_SW *(q*omegaD-one)-omegaD*feq27_SW *(q-one))/(omegaD-one)+f27_NE *q)/(q+one);
-      //q = q_dirSE[k];  if (q>=zero && q<=one) (D27.f[DIR_MP0 ])[knw ]=(two*feqW27_NW -(f27_SE *(q*omegaD-one)-omegaD*feq27_SE *(q-one))/(omegaD-one)+f27_NW *q)/(q+one);
-      //q = q_dirNW[k];  if (q>=zero && q<=one) (D27.f[DIR_PM0 ])[kse ]=(two*feqW27_SE -(f27_NW *(q*omegaD-one)-omegaD*feq27_NW *(q-one))/(omegaD-one)+f27_SE *q)/(q+one);
-      //q = q_dirTE[k];  if (q>=zero && q<=one) (D27.f[DIR_M0M ])[kbw ]=(two*feqW27_BW -(f27_TE *(q*omegaD-one)-omegaD*feq27_TE *(q-one))/(omegaD-one)+f27_BW *q)/(q+one);
-      //q = q_dirBW[k];  if (q>=zero && q<=one) (D27.f[DIR_P0P ])[kte ]=(two*feqW27_TE -(f27_BW *(q*omegaD-one)-omegaD*feq27_BW *(q-one))/(omegaD-one)+f27_TE *q)/(q+one);
-      //q = q_dirBE[k];  if (q>=zero && q<=one) (D27.f[DIR_M0P ])[ktw ]=(two*feqW27_TW -(f27_BE *(q*omegaD-one)-omegaD*feq27_BE *(q-one))/(omegaD-one)+f27_TW *q)/(q+one);
-      //q = q_dirTW[k];  if (q>=zero && q<=one) (D27.f[DIR_P0M ])[kbe ]=(two*feqW27_BE -(f27_TW *(q*omegaD-one)-omegaD*feq27_TW *(q-one))/(omegaD-one)+f27_BE *q)/(q+one);
-      //q = q_dirTN[k];  if (q>=zero && q<=one) (D27.f[DIR_0MM ])[kbs ]=(two*feqW27_BS -(f27_TN *(q*omegaD-one)-omegaD*feq27_TN *(q-one))/(omegaD-one)+f27_BS *q)/(q+one);
-      //q = q_dirBS[k];  if (q>=zero && q<=one) (D27.f[DIR_0PP ])[ktn ]=(two*feqW27_TN -(f27_BS *(q*omegaD-one)-omegaD*feq27_BS *(q-one))/(omegaD-one)+f27_TN *q)/(q+one);
-      //q = q_dirBN[k];  if (q>=zero && q<=one) (D27.f[DIR_0MP ])[kts ]=(two*feqW27_TS -(f27_BN *(q*omegaD-one)-omegaD*feq27_BN *(q-one))/(omegaD-one)+f27_TS *q)/(q+one);
-      //q = q_dirTS[k];  if (q>=zero && q<=one) (D27.f[DIR_0PM ])[kbn ]=(two*feqW27_BN -(f27_TS *(q*omegaD-one)-omegaD*feq27_TS *(q-one))/(omegaD-one)+f27_BN *q)/(q+one);
+      //q = q_dirE[k];   if (q>=zero && q<=one) (D27.f[DIR_M00])[kw  ]=(two*feqW27_W  -(f27_E  *(q*omegaD-one)-omegaD*feq27_E  *(q-one))/(omegaD-one)+f27_W  *q)/(q+one);
+      //q = q_dirW[k];   if (q>=zero && q<=one) (D27.f[DIR_P00])[ke  ]=(two*feqW27_E  -(f27_W  *(q*omegaD-one)-omegaD*feq27_W  *(q-one))/(omegaD-one)+f27_E  *q)/(q+one);
+      //q = q_dirN[k];   if (q>=zero && q<=one) (D27.f[DIR_0M0])[ks  ]=(two*feqW27_S  -(f27_N  *(q*omegaD-one)-omegaD*feq27_N  *(q-one))/(omegaD-one)+f27_S  *q)/(q+one);
+      //q = q_dirS[k];   if (q>=zero && q<=one) (D27.f[DIR_0P0])[kn  ]=(two*feqW27_N  -(f27_S  *(q*omegaD-one)-omegaD*feq27_S  *(q-one))/(omegaD-one)+f27_N  *q)/(q+one);
+      //q = q_dirT[k];   if (q>=zero && q<=one) (D27.f[DIR_00M])[kb  ]=(two*feqW27_B  -(f27_T  *(q*omegaD-one)-omegaD*feq27_T  *(q-one))/(omegaD-one)+f27_B  *q)/(q+one);
+      //q = q_dirB[k];   if (q>=zero && q<=one) (D27.f[DIR_00P])[kt  ]=(two*feqW27_T  -(f27_B  *(q*omegaD-one)-omegaD*feq27_B  *(q-one))/(omegaD-one)+f27_T  *q)/(q+one);
+      //q = q_dirNE[k];  if (q>=zero && q<=one) (D27.f[DIR_MM0])[ksw ]=(two*feqW27_SW -(f27_NE *(q*omegaD-one)-omegaD*feq27_NE *(q-one))/(omegaD-one)+f27_SW *q)/(q+one);
+      //q = q_dirSW[k];  if (q>=zero && q<=one) (D27.f[DIR_PP0])[kne ]=(two*feqW27_NE -(f27_SW *(q*omegaD-one)-omegaD*feq27_SW *(q-one))/(omegaD-one)+f27_NE *q)/(q+one);
+      //q = q_dirSE[k];  if (q>=zero && q<=one) (D27.f[DIR_MP0])[knw ]=(two*feqW27_NW -(f27_SE *(q*omegaD-one)-omegaD*feq27_SE *(q-one))/(omegaD-one)+f27_NW *q)/(q+one);
+      //q = q_dirNW[k];  if (q>=zero && q<=one) (D27.f[DIR_PM0])[kse ]=(two*feqW27_SE -(f27_NW *(q*omegaD-one)-omegaD*feq27_NW *(q-one))/(omegaD-one)+f27_SE *q)/(q+one);
+      //q = q_dirTE[k];  if (q>=zero && q<=one) (D27.f[DIR_M0M])[kbw ]=(two*feqW27_BW -(f27_TE *(q*omegaD-one)-omegaD*feq27_TE *(q-one))/(omegaD-one)+f27_BW *q)/(q+one);
+      //q = q_dirBW[k];  if (q>=zero && q<=one) (D27.f[DIR_P0P])[kte ]=(two*feqW27_TE -(f27_BW *(q*omegaD-one)-omegaD*feq27_BW *(q-one))/(omegaD-one)+f27_TE *q)/(q+one);
+      //q = q_dirBE[k];  if (q>=zero && q<=one) (D27.f[DIR_M0P])[ktw ]=(two*feqW27_TW -(f27_BE *(q*omegaD-one)-omegaD*feq27_BE *(q-one))/(omegaD-one)+f27_TW *q)/(q+one);
+      //q = q_dirTW[k];  if (q>=zero && q<=one) (D27.f[DIR_P0M])[kbe ]=(two*feqW27_BE -(f27_TW *(q*omegaD-one)-omegaD*feq27_TW *(q-one))/(omegaD-one)+f27_BE *q)/(q+one);
+      //q = q_dirTN[k];  if (q>=zero && q<=one) (D27.f[DIR_0MM])[kbs ]=(two*feqW27_BS -(f27_TN *(q*omegaD-one)-omegaD*feq27_TN *(q-one))/(omegaD-one)+f27_BS *q)/(q+one);
+      //q = q_dirBS[k];  if (q>=zero && q<=one) (D27.f[DIR_0PP])[ktn ]=(two*feqW27_TN -(f27_BS *(q*omegaD-one)-omegaD*feq27_BS *(q-one))/(omegaD-one)+f27_TN *q)/(q+one);
+      //q = q_dirBN[k];  if (q>=zero && q<=one) (D27.f[DIR_0MP])[kts ]=(two*feqW27_TS -(f27_BN *(q*omegaD-one)-omegaD*feq27_BN *(q-one))/(omegaD-one)+f27_TS *q)/(q+one);
+      //q = q_dirTS[k];  if (q>=zero && q<=one) (D27.f[DIR_0PM])[kbn ]=(two*feqW27_BN -(f27_TS *(q*omegaD-one)-omegaD*feq27_TS *(q-one))/(omegaD-one)+f27_BN *q)/(q+one);
       //q = q_dirTNE[k]; if (q>=zero && q<=one) (D27.f[DIR_MMM])[kbsw]=(two*feqW27_BSW-(f27_TNE*(q*omegaD-one)-omegaD*feq27_TNE*(q-one))/(omegaD-one)+f27_BSW*q)/(q+one);
       //q = q_dirBSW[k]; if (q>=zero && q<=one) (D27.f[DIR_PPP])[ktne]=(two*feqW27_TNE-(f27_BSW*(q*omegaD-one)-omegaD*feq27_BSW*(q-one))/(omegaD-one)+f27_TNE*q)/(q+one);
       //q = q_dirBNE[k]; if (q>=zero && q<=one) (D27.f[DIR_MMP])[ktsw]=(two*feqW27_TSW-(f27_BNE*(q*omegaD-one)-omegaD*feq27_BNE*(q-one))/(omegaD-one)+f27_TSW*q)/(q+one);
@@ -3389,131 +3389,131 @@ __global__ void QADBB27( real* DD,
                                    unsigned int* neighborX,
                                    unsigned int* neighborY,
                                    unsigned int* neighborZ,
-                                   unsigned int size_Mat, 
+                                   unsigned long long numberOfLBnodes, 
                                    bool isEvenTimestep)
 {
    //Distributions27 D;
    //if (isEvenTimestep==true)
    //{
-   //   D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-   //   D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-   //   D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-   //   D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-   //   D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-   //   D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-   //   D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-   //   D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-   //   D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-   //   D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-   //   D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-   //   D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-   //   D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-   //   D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-   //   D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-   //   D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-   //   D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-   //   D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-   //   D.f[DIR_000] = &DD[DIR_000*size_Mat];
-   //   D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-   //   D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-   //   D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-   //   D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-   //   D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-   //   D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-   //   D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-   //   D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+   //   D.f[DIR_P00] = &DD[DIR_P00 * size_Mat];
+   //   D.f[DIR_M00] = &DD[DIR_M00 * size_Mat];
+   //   D.f[DIR_0P0] = &DD[DIR_0P0 * size_Mat];
+   //   D.f[DIR_0M0] = &DD[DIR_0M0 * size_Mat];
+   //   D.f[DIR_00P] = &DD[DIR_00P * size_Mat];
+   //   D.f[DIR_00M] = &DD[DIR_00M * size_Mat];
+   //   D.f[DIR_PP0] = &DD[DIR_PP0 * size_Mat];
+   //   D.f[DIR_MM0] = &DD[DIR_MM0 * size_Mat];
+   //   D.f[DIR_PM0] = &DD[DIR_PM0 * size_Mat];
+   //   D.f[DIR_MP0] = &DD[DIR_MP0 * size_Mat];
+   //   D.f[DIR_P0P] = &DD[DIR_P0P * size_Mat];
+   //   D.f[DIR_M0M] = &DD[DIR_M0M * size_Mat];
+   //   D.f[DIR_P0M] = &DD[DIR_P0M * size_Mat];
+   //   D.f[DIR_M0P] = &DD[DIR_M0P * size_Mat];
+   //   D.f[DIR_0PP] = &DD[DIR_0PP * size_Mat];
+   //   D.f[DIR_0MM] = &DD[DIR_0MM * size_Mat];
+   //   D.f[DIR_0PM] = &DD[DIR_0PM * size_Mat];
+   //   D.f[DIR_0MP] = &DD[DIR_0MP * size_Mat];
+   //   D.f[DIR_000] = &DD[DIR_000 * size_Mat];
+   //   D.f[DIR_PPP] = &DD[DIR_PPP * size_Mat];
+   //   D.f[DIR_MMP] = &DD[DIR_MMP * size_Mat];
+   //   D.f[DIR_PMP] = &DD[DIR_PMP * size_Mat];
+   //   D.f[DIR_MPP] = &DD[DIR_MPP * size_Mat];
+   //   D.f[DIR_PPM] = &DD[DIR_PPM * size_Mat];
+   //   D.f[DIR_MMM] = &DD[DIR_MMM * size_Mat];
+   //   D.f[DIR_PMM] = &DD[DIR_PMM * size_Mat];
+   //   D.f[DIR_MPM] = &DD[DIR_MPM * size_Mat];
    //} 
    //else
    //{
-   //   D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-   //   D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-   //   D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-   //   D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-   //   D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-   //   D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-   //   D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-   //   D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-   //   D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-   //   D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-   //   D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-   //   D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-   //   D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-   //   D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-   //   D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-   //   D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-   //   D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-   //   D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-   //   D.f[DIR_000] = &DD[DIR_000*size_Mat];
-   //   D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-   //   D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-   //   D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-   //   D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-   //   D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-   //   D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-   //   D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-   //   D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+   //   D.f[DIR_M00] = &DD[DIR_P00 * size_Mat];
+   //   D.f[DIR_P00] = &DD[DIR_M00 * size_Mat];
+   //   D.f[DIR_0M0] = &DD[DIR_0P0 * size_Mat];
+   //   D.f[DIR_0P0] = &DD[DIR_0M0 * size_Mat];
+   //   D.f[DIR_00M] = &DD[DIR_00P * size_Mat];
+   //   D.f[DIR_00P] = &DD[DIR_00M * size_Mat];
+   //   D.f[DIR_MM0] = &DD[DIR_PP0 * size_Mat];
+   //   D.f[DIR_PP0] = &DD[DIR_MM0 * size_Mat];
+   //   D.f[DIR_MP0] = &DD[DIR_PM0 * size_Mat];
+   //   D.f[DIR_PM0] = &DD[DIR_MP0 * size_Mat];
+   //   D.f[DIR_M0M] = &DD[DIR_P0P * size_Mat];
+   //   D.f[DIR_P0P] = &DD[DIR_M0M * size_Mat];
+   //   D.f[DIR_M0P] = &DD[DIR_P0M * size_Mat];
+   //   D.f[DIR_P0M] = &DD[DIR_M0P * size_Mat];
+   //   D.f[DIR_0MM] = &DD[DIR_0PP * size_Mat];
+   //   D.f[DIR_0PP] = &DD[DIR_0MM * size_Mat];
+   //   D.f[DIR_0MP] = &DD[DIR_0PM * size_Mat];
+   //   D.f[DIR_0PM] = &DD[DIR_0MP * size_Mat];
+   //   D.f[DIR_000] = &DD[DIR_000 * size_Mat];
+   //   D.f[DIR_PPP] = &DD[DIR_MMM * size_Mat];
+   //   D.f[DIR_MMP] = &DD[DIR_PPM * size_Mat];
+   //   D.f[DIR_PMP] = &DD[DIR_MPM * size_Mat];
+   //   D.f[DIR_MPP] = &DD[DIR_PMM * size_Mat];
+   //   D.f[DIR_PPM] = &DD[DIR_MMP * size_Mat];
+   //   D.f[DIR_MMM] = &DD[DIR_PPP * size_Mat];
+   //   D.f[DIR_PMM] = &DD[DIR_MPP * size_Mat];
+   //   D.f[DIR_MPM] = &DD[DIR_PMP * size_Mat];
    //}
 
    Distributions27 D27;
    if (isEvenTimestep==true)
    {
-      D27.f[DIR_P00   ] = &DD27[DIR_P00   *size_Mat];
-      D27.f[DIR_M00   ] = &DD27[DIR_M00   *size_Mat];
-      D27.f[DIR_0P0   ] = &DD27[DIR_0P0   *size_Mat];
-      D27.f[DIR_0M0   ] = &DD27[DIR_0M0   *size_Mat];
-      D27.f[DIR_00P   ] = &DD27[DIR_00P   *size_Mat];
-      D27.f[DIR_00M   ] = &DD27[DIR_00M   *size_Mat];
-      D27.f[DIR_PP0  ] = &DD27[DIR_PP0  *size_Mat];
-      D27.f[DIR_MM0  ] = &DD27[DIR_MM0  *size_Mat];
-      D27.f[DIR_PM0  ] = &DD27[DIR_PM0  *size_Mat];
-      D27.f[DIR_MP0  ] = &DD27[DIR_MP0  *size_Mat];
-      D27.f[DIR_P0P  ] = &DD27[DIR_P0P  *size_Mat];
-      D27.f[DIR_M0M  ] = &DD27[DIR_M0M  *size_Mat];
-      D27.f[DIR_P0M  ] = &DD27[DIR_P0M  *size_Mat];
-      D27.f[DIR_M0P  ] = &DD27[DIR_M0P  *size_Mat];
-      D27.f[DIR_0PP  ] = &DD27[DIR_0PP  *size_Mat];
-      D27.f[DIR_0MM  ] = &DD27[DIR_0MM  *size_Mat];
-      D27.f[DIR_0PM  ] = &DD27[DIR_0PM  *size_Mat];
-      D27.f[DIR_0MP  ] = &DD27[DIR_0MP  *size_Mat];
-      D27.f[DIR_000] = &DD27[DIR_000*size_Mat];
-      D27.f[DIR_PPP ] = &DD27[DIR_PPP *size_Mat];
-      D27.f[DIR_MMP ] = &DD27[DIR_MMP *size_Mat];
-      D27.f[DIR_PMP ] = &DD27[DIR_PMP *size_Mat];
-      D27.f[DIR_MPP ] = &DD27[DIR_MPP *size_Mat];
-      D27.f[DIR_PPM ] = &DD27[DIR_PPM *size_Mat];
-      D27.f[DIR_MMM ] = &DD27[DIR_MMM *size_Mat];
-      D27.f[DIR_PMM ] = &DD27[DIR_PMM *size_Mat];
-      D27.f[DIR_MPM ] = &DD27[DIR_MPM *size_Mat];
+      D27.f[DIR_P00] = &DD27[DIR_P00 * numberOfLBnodes];
+      D27.f[DIR_M00] = &DD27[DIR_M00 * numberOfLBnodes];
+      D27.f[DIR_0P0] = &DD27[DIR_0P0 * numberOfLBnodes];
+      D27.f[DIR_0M0] = &DD27[DIR_0M0 * numberOfLBnodes];
+      D27.f[DIR_00P] = &DD27[DIR_00P * numberOfLBnodes];
+      D27.f[DIR_00M] = &DD27[DIR_00M * numberOfLBnodes];
+      D27.f[DIR_PP0] = &DD27[DIR_PP0 * numberOfLBnodes];
+      D27.f[DIR_MM0] = &DD27[DIR_MM0 * numberOfLBnodes];
+      D27.f[DIR_PM0] = &DD27[DIR_PM0 * numberOfLBnodes];
+      D27.f[DIR_MP0] = &DD27[DIR_MP0 * numberOfLBnodes];
+      D27.f[DIR_P0P] = &DD27[DIR_P0P * numberOfLBnodes];
+      D27.f[DIR_M0M] = &DD27[DIR_M0M * numberOfLBnodes];
+      D27.f[DIR_P0M] = &DD27[DIR_P0M * numberOfLBnodes];
+      D27.f[DIR_M0P] = &DD27[DIR_M0P * numberOfLBnodes];
+      D27.f[DIR_0PP] = &DD27[DIR_0PP * numberOfLBnodes];
+      D27.f[DIR_0MM] = &DD27[DIR_0MM * numberOfLBnodes];
+      D27.f[DIR_0PM] = &DD27[DIR_0PM * numberOfLBnodes];
+      D27.f[DIR_0MP] = &DD27[DIR_0MP * numberOfLBnodes];
+      D27.f[DIR_000] = &DD27[DIR_000 * numberOfLBnodes];
+      D27.f[DIR_PPP] = &DD27[DIR_PPP * numberOfLBnodes];
+      D27.f[DIR_MMP] = &DD27[DIR_MMP * numberOfLBnodes];
+      D27.f[DIR_PMP] = &DD27[DIR_PMP * numberOfLBnodes];
+      D27.f[DIR_MPP] = &DD27[DIR_MPP * numberOfLBnodes];
+      D27.f[DIR_PPM] = &DD27[DIR_PPM * numberOfLBnodes];
+      D27.f[DIR_MMM] = &DD27[DIR_MMM * numberOfLBnodes];
+      D27.f[DIR_PMM] = &DD27[DIR_PMM * numberOfLBnodes];
+      D27.f[DIR_MPM] = &DD27[DIR_MPM * numberOfLBnodes];
    } 
    else
    {
-      D27.f[DIR_M00   ] = &DD27[DIR_P00   *size_Mat];
-      D27.f[DIR_P00   ] = &DD27[DIR_M00   *size_Mat];
-      D27.f[DIR_0M0   ] = &DD27[DIR_0P0   *size_Mat];
-      D27.f[DIR_0P0   ] = &DD27[DIR_0M0   *size_Mat];
-      D27.f[DIR_00M   ] = &DD27[DIR_00P   *size_Mat];
-      D27.f[DIR_00P   ] = &DD27[DIR_00M   *size_Mat];
-      D27.f[DIR_MM0  ] = &DD27[DIR_PP0  *size_Mat];
-      D27.f[DIR_PP0  ] = &DD27[DIR_MM0  *size_Mat];
-      D27.f[DIR_MP0  ] = &DD27[DIR_PM0  *size_Mat];
-      D27.f[DIR_PM0  ] = &DD27[DIR_MP0  *size_Mat];
-      D27.f[DIR_M0M  ] = &DD27[DIR_P0P  *size_Mat];
-      D27.f[DIR_P0P  ] = &DD27[DIR_M0M  *size_Mat];
-      D27.f[DIR_M0P  ] = &DD27[DIR_P0M  *size_Mat];
-      D27.f[DIR_P0M  ] = &DD27[DIR_M0P  *size_Mat];
-      D27.f[DIR_0MM  ] = &DD27[DIR_0PP  *size_Mat];
-      D27.f[DIR_0PP  ] = &DD27[DIR_0MM  *size_Mat];
-      D27.f[DIR_0MP  ] = &DD27[DIR_0PM  *size_Mat];
-      D27.f[DIR_0PM  ] = &DD27[DIR_0MP  *size_Mat];
-      D27.f[DIR_000] = &DD27[DIR_000*size_Mat];
-      D27.f[DIR_PPP ] = &DD27[DIR_MMM *size_Mat];
-      D27.f[DIR_MMP ] = &DD27[DIR_PPM *size_Mat];
-      D27.f[DIR_PMP ] = &DD27[DIR_MPM *size_Mat];
-      D27.f[DIR_MPP ] = &DD27[DIR_PMM *size_Mat];
-      D27.f[DIR_PPM ] = &DD27[DIR_MMP *size_Mat];
-      D27.f[DIR_MMM ] = &DD27[DIR_PPP *size_Mat];
-      D27.f[DIR_PMM ] = &DD27[DIR_MPP *size_Mat];
-      D27.f[DIR_MPM ] = &DD27[DIR_PMP *size_Mat];
+      D27.f[DIR_M00] = &DD27[DIR_P00 * numberOfLBnodes];
+      D27.f[DIR_P00] = &DD27[DIR_M00 * numberOfLBnodes];
+      D27.f[DIR_0M0] = &DD27[DIR_0P0 * numberOfLBnodes];
+      D27.f[DIR_0P0] = &DD27[DIR_0M0 * numberOfLBnodes];
+      D27.f[DIR_00M] = &DD27[DIR_00P * numberOfLBnodes];
+      D27.f[DIR_00P] = &DD27[DIR_00M * numberOfLBnodes];
+      D27.f[DIR_MM0] = &DD27[DIR_PP0 * numberOfLBnodes];
+      D27.f[DIR_PP0] = &DD27[DIR_MM0 * numberOfLBnodes];
+      D27.f[DIR_MP0] = &DD27[DIR_PM0 * numberOfLBnodes];
+      D27.f[DIR_PM0] = &DD27[DIR_MP0 * numberOfLBnodes];
+      D27.f[DIR_M0M] = &DD27[DIR_P0P * numberOfLBnodes];
+      D27.f[DIR_P0P] = &DD27[DIR_M0M * numberOfLBnodes];
+      D27.f[DIR_M0P] = &DD27[DIR_P0M * numberOfLBnodes];
+      D27.f[DIR_P0M] = &DD27[DIR_M0P * numberOfLBnodes];
+      D27.f[DIR_0MM] = &DD27[DIR_0PP * numberOfLBnodes];
+      D27.f[DIR_0PP] = &DD27[DIR_0MM * numberOfLBnodes];
+      D27.f[DIR_0MP] = &DD27[DIR_0PM * numberOfLBnodes];
+      D27.f[DIR_0PM] = &DD27[DIR_0MP * numberOfLBnodes];
+      D27.f[DIR_000] = &DD27[DIR_000 * numberOfLBnodes];
+      D27.f[DIR_PPP] = &DD27[DIR_MMM * numberOfLBnodes];
+      D27.f[DIR_MMP] = &DD27[DIR_PPM * numberOfLBnodes];
+      D27.f[DIR_PMP] = &DD27[DIR_MPM * numberOfLBnodes];
+      D27.f[DIR_MPP] = &DD27[DIR_PMM * numberOfLBnodes];
+      D27.f[DIR_PPM] = &DD27[DIR_MMP * numberOfLBnodes];
+      D27.f[DIR_MMM] = &DD27[DIR_PPP * numberOfLBnodes];
+      D27.f[DIR_PMM] = &DD27[DIR_MPP * numberOfLBnodes];
+      D27.f[DIR_MPM] = &DD27[DIR_PMP * numberOfLBnodes];
    }
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  x = threadIdx.x;  // Globaler x-Index 
@@ -3534,24 +3534,24 @@ __global__ void QADBB27( real* DD,
          *q_dirBE,  *q_dirTW,  *q_dirTN,  *q_dirBS,  *q_dirBN,  *q_dirTS,
          *q_dirTNE, *q_dirTSW, *q_dirTSE, *q_dirTNW, *q_dirBNE, *q_dirBSW,
          *q_dirBSE, *q_dirBNW; 
-      q_dirE   = &QQ[DIR_P00   * numberOfBCnodes];
-      q_dirW   = &QQ[DIR_M00   * numberOfBCnodes];
-      q_dirN   = &QQ[DIR_0P0   * numberOfBCnodes];
-      q_dirS   = &QQ[DIR_0M0   * numberOfBCnodes];
-      q_dirT   = &QQ[DIR_00P   * numberOfBCnodes];
-      q_dirB   = &QQ[DIR_00M   * numberOfBCnodes];
-      q_dirNE  = &QQ[DIR_PP0  * numberOfBCnodes];
-      q_dirSW  = &QQ[DIR_MM0  * numberOfBCnodes];
-      q_dirSE  = &QQ[DIR_PM0  * numberOfBCnodes];
-      q_dirNW  = &QQ[DIR_MP0  * numberOfBCnodes];
-      q_dirTE  = &QQ[DIR_P0P  * numberOfBCnodes];
-      q_dirBW  = &QQ[DIR_M0M  * numberOfBCnodes];
-      q_dirBE  = &QQ[DIR_P0M  * numberOfBCnodes];
-      q_dirTW  = &QQ[DIR_M0P  * numberOfBCnodes];
-      q_dirTN  = &QQ[DIR_0PP  * numberOfBCnodes];
-      q_dirBS  = &QQ[DIR_0MM  * numberOfBCnodes];
-      q_dirBN  = &QQ[DIR_0PM  * numberOfBCnodes];
-      q_dirTS  = &QQ[DIR_0MP  * numberOfBCnodes];
+      q_dirE   = &QQ[DIR_P00 * numberOfBCnodes];
+      q_dirW   = &QQ[DIR_M00 * numberOfBCnodes];
+      q_dirN   = &QQ[DIR_0P0 * numberOfBCnodes];
+      q_dirS   = &QQ[DIR_0M0 * numberOfBCnodes];
+      q_dirT   = &QQ[DIR_00P * numberOfBCnodes];
+      q_dirB   = &QQ[DIR_00M * numberOfBCnodes];
+      q_dirNE  = &QQ[DIR_PP0 * numberOfBCnodes];
+      q_dirSW  = &QQ[DIR_MM0 * numberOfBCnodes];
+      q_dirSE  = &QQ[DIR_PM0 * numberOfBCnodes];
+      q_dirNW  = &QQ[DIR_MP0 * numberOfBCnodes];
+      q_dirTE  = &QQ[DIR_P0P * numberOfBCnodes];
+      q_dirBW  = &QQ[DIR_M0M * numberOfBCnodes];
+      q_dirBE  = &QQ[DIR_P0M * numberOfBCnodes];
+      q_dirTW  = &QQ[DIR_M0P * numberOfBCnodes];
+      q_dirTN  = &QQ[DIR_0PP * numberOfBCnodes];
+      q_dirBS  = &QQ[DIR_0MM * numberOfBCnodes];
+      q_dirBN  = &QQ[DIR_0PM * numberOfBCnodes];
+      q_dirTS  = &QQ[DIR_0MP * numberOfBCnodes];
       q_dirTNE = &QQ[DIR_PPP * numberOfBCnodes];
       q_dirTSW = &QQ[DIR_MMP * numberOfBCnodes];
       q_dirTSE = &QQ[DIR_PMP * numberOfBCnodes];
@@ -3591,33 +3591,33 @@ __global__ void QADBB27( real* DD,
       unsigned int ktne = KQK;
       unsigned int kbsw = neighborZ[ksw];
       ////////////////////////////////////////////////////////////////////////////////
-      //real f_W    = (D.f[DIR_P00   ])[ke   ];
-      //real f_E    = (D.f[DIR_M00   ])[kw   ];
-      //real f_S    = (D.f[DIR_0P0   ])[kn   ];
-      //real f_N    = (D.f[DIR_0M0   ])[ks   ];
-      //real f_B    = (D.f[DIR_00P   ])[kt   ];
-      //real f_T    = (D.f[DIR_00M   ])[kb   ];
-      //real f_SW   = (D.f[DIR_PP0  ])[kne  ];
-      //real f_NE   = (D.f[DIR_MM0  ])[ksw  ];
-      //real f_NW   = (D.f[DIR_PM0  ])[kse  ];
-      //real f_SE   = (D.f[DIR_MP0  ])[knw  ];
-      //real f_BW   = (D.f[DIR_P0P  ])[kte  ];
-      //real f_TE   = (D.f[DIR_M0M  ])[kbw  ];
-      //real f_TW   = (D.f[DIR_P0M  ])[kbe  ];
-      //real f_BE   = (D.f[DIR_M0P  ])[ktw  ];
-      //real f_BS   = (D.f[DIR_0PP  ])[ktn  ];
-      //real f_TN   = (D.f[DIR_0MM  ])[kbs  ];
-      //real f_TS   = (D.f[DIR_0PM  ])[kbn  ];
-      //real f_BN   = (D.f[DIR_0MP  ])[kts  ];
+      //real f_W    = (D.f[DIR_P00])[ke   ];
+      //real f_E    = (D.f[DIR_M00])[kw   ];
+      //real f_S    = (D.f[DIR_0P0])[kn   ];
+      //real f_N    = (D.f[DIR_0M0])[ks   ];
+      //real f_B    = (D.f[DIR_00P])[kt   ];
+      //real f_T    = (D.f[DIR_00M])[kb   ];
+      //real f_SW   = (D.f[DIR_PP0])[kne  ];
+      //real f_NE   = (D.f[DIR_MM0])[ksw  ];
+      //real f_NW   = (D.f[DIR_PM0])[kse  ];
+      //real f_SE   = (D.f[DIR_MP0])[knw  ];
+      //real f_BW   = (D.f[DIR_P0P])[kte  ];
+      //real f_TE   = (D.f[DIR_M0M])[kbw  ];
+      //real f_TW   = (D.f[DIR_P0M])[kbe  ];
+      //real f_BE   = (D.f[DIR_M0P])[ktw  ];
+      //real f_BS   = (D.f[DIR_0PP])[ktn  ];
+      //real f_TN   = (D.f[DIR_0MM])[kbs  ];
+      //real f_TS   = (D.f[DIR_0PM])[kbn  ];
+      //real f_BN   = (D.f[DIR_0MP])[kts  ];
       //real f_ZERO = (D.f[DIR_000])[kzero];
-      //real f_BSW  = (D.f[DIR_PPP ])[ktne ];
-      //real f_BNE  = (D.f[DIR_MMP ])[ktsw ];
-      //real f_BNW  = (D.f[DIR_PMP ])[ktse ];
-      //real f_BSE  = (D.f[DIR_MPP ])[ktnw ];
-      //real f_TSW  = (D.f[DIR_PPM ])[kbne ];
-      //real f_TNE  = (D.f[DIR_MMM ])[kbsw ];
-      //real f_TNW  = (D.f[DIR_PMM ])[kbse ];
-      //real f_TSE  = (D.f[DIR_MPM ])[kbnw ];
+      //real f_BSW  = (D.f[DIR_PPP])[ktne ];
+      //real f_BNE  = (D.f[DIR_MMP])[ktsw ];
+      //real f_BNW  = (D.f[DIR_PMP])[ktse ];
+      //real f_BSE  = (D.f[DIR_MPP])[ktnw ];
+      //real f_TSW  = (D.f[DIR_PPM])[kbne ];
+      //real f_TNE  = (D.f[DIR_MMM])[kbsw ];
+      //real f_TNW  = (D.f[DIR_PMM])[kbse ];
+      //real f_TSE  = (D.f[DIR_MPM])[kbnw ];
       ////////////////////////////////////////////////////////////////////////////////
       //real vx1, vx2, vx3, /*drho, feq,*/ q;
       real q;
@@ -3644,33 +3644,33 @@ __global__ void QADBB27( real* DD,
       //vx2     =  OORho*((f_TNE-f_BSW)+(f_BNE-f_TSW)+(f_BNW-f_TSE)+(f_TNW-f_BSE) +(f_NE-f_SW)+(f_NW-f_SE)+(f_TN-f_BS)+(f_BN-f_TS)+(f_N-f_S));
       //vx3     =  OORho*((f_TNE-f_BSW)+(f_TSW-f_BNE)+(f_TSE-f_BNW)+(f_TNW-f_BSE) +(f_TE-f_BW)+(f_TW-f_BE)+(f_TN-f_BS)+(f_TS-f_BN)+(f_T-f_B));
       ////////////////////////////////////////////////////////////////////////////////
-      real f27_W    = (D27.f[DIR_P00   ])[ke   ];
-      real f27_E    = (D27.f[DIR_M00   ])[kw   ];
-      real f27_S    = (D27.f[DIR_0P0   ])[kn   ];
-      real f27_N    = (D27.f[DIR_0M0   ])[ks   ];
-      real f27_B    = (D27.f[DIR_00P   ])[kt   ];
-      real f27_T    = (D27.f[DIR_00M   ])[kb   ];
-      real f27_SW   = (D27.f[DIR_PP0  ])[kne  ];
-      real f27_NE   = (D27.f[DIR_MM0  ])[ksw  ];
-      real f27_NW   = (D27.f[DIR_PM0  ])[kse  ];
-      real f27_SE   = (D27.f[DIR_MP0  ])[knw  ];
-      real f27_BW   = (D27.f[DIR_P0P  ])[kte  ];
-      real f27_TE   = (D27.f[DIR_M0M  ])[kbw  ];
-      real f27_TW   = (D27.f[DIR_P0M  ])[kbe  ];
-      real f27_BE   = (D27.f[DIR_M0P  ])[ktw  ];
-      real f27_BS   = (D27.f[DIR_0PP  ])[ktn  ];
-      real f27_TN   = (D27.f[DIR_0MM  ])[kbs  ];
-      real f27_TS   = (D27.f[DIR_0PM  ])[kbn  ];
-      real f27_BN   = (D27.f[DIR_0MP  ])[kts  ];
+      real f27_W    = (D27.f[DIR_P00])[ke   ];
+      real f27_E    = (D27.f[DIR_M00])[kw   ];
+      real f27_S    = (D27.f[DIR_0P0])[kn   ];
+      real f27_N    = (D27.f[DIR_0M0])[ks   ];
+      real f27_B    = (D27.f[DIR_00P])[kt   ];
+      real f27_T    = (D27.f[DIR_00M])[kb   ];
+      real f27_SW   = (D27.f[DIR_PP0])[kne  ];
+      real f27_NE   = (D27.f[DIR_MM0])[ksw  ];
+      real f27_NW   = (D27.f[DIR_PM0])[kse  ];
+      real f27_SE   = (D27.f[DIR_MP0])[knw  ];
+      real f27_BW   = (D27.f[DIR_P0P])[kte  ];
+      real f27_TE   = (D27.f[DIR_M0M])[kbw  ];
+      real f27_TW   = (D27.f[DIR_P0M])[kbe  ];
+      real f27_BE   = (D27.f[DIR_M0P])[ktw  ];
+      real f27_BS   = (D27.f[DIR_0PP])[ktn  ];
+      real f27_TN   = (D27.f[DIR_0MM])[kbs  ];
+      real f27_TS   = (D27.f[DIR_0PM])[kbn  ];
+      real f27_BN   = (D27.f[DIR_0MP])[kts  ];
       //real f27_ZERO = (D27.f[DIR_000])[kzero];
-      real f27_BSW  = (D27.f[DIR_PPP ])[ktne ];
-      real f27_BNE  = (D27.f[DIR_MMP ])[ktsw ];
-      real f27_BNW  = (D27.f[DIR_PMP ])[ktse ];
-      real f27_BSE  = (D27.f[DIR_MPP ])[ktnw ];
-      real f27_TSW  = (D27.f[DIR_PPM ])[kbne ];
-      real f27_TNE  = (D27.f[DIR_MMM ])[kbsw ];
-      real f27_TNW  = (D27.f[DIR_PMM ])[kbse ];
-      real f27_TSE  = (D27.f[DIR_MPM ])[kbnw ];
+      real f27_BSW  = (D27.f[DIR_PPP])[ktne ];
+      real f27_BNE  = (D27.f[DIR_MMP])[ktsw ];
+      real f27_BNW  = (D27.f[DIR_PMP])[ktse ];
+      real f27_BSE  = (D27.f[DIR_MPP])[ktnw ];
+      real f27_TSW  = (D27.f[DIR_PPM])[kbne ];
+      real f27_TNE  = (D27.f[DIR_MMM])[kbsw ];
+      real f27_TNW  = (D27.f[DIR_PMM])[kbse ];
+      real f27_TSE  = (D27.f[DIR_MPM])[kbnw ];
       ////////////////////////////////////////////////////////////////////////////////
       //real cu_sq=c3o2*(vx1*vx1+vx2*vx2+vx3*vx3);
       ////////////////////////////////////////////////////////////////////////////////
@@ -3746,86 +3746,86 @@ __global__ void QADBB27( real* DD,
       //////////////////////////////////////////////////////////////////////////
       if (isEvenTimestep==false)
       {
-         D27.f[DIR_P00   ] = &DD27[DIR_P00   *size_Mat];
-         D27.f[DIR_M00   ] = &DD27[DIR_M00   *size_Mat];
-         D27.f[DIR_0P0   ] = &DD27[DIR_0P0   *size_Mat];
-         D27.f[DIR_0M0   ] = &DD27[DIR_0M0   *size_Mat];
-         D27.f[DIR_00P   ] = &DD27[DIR_00P   *size_Mat];
-         D27.f[DIR_00M   ] = &DD27[DIR_00M   *size_Mat];
-         D27.f[DIR_PP0  ] = &DD27[DIR_PP0  *size_Mat];
-         D27.f[DIR_MM0  ] = &DD27[DIR_MM0  *size_Mat];
-         D27.f[DIR_PM0  ] = &DD27[DIR_PM0  *size_Mat];
-         D27.f[DIR_MP0  ] = &DD27[DIR_MP0  *size_Mat];
-         D27.f[DIR_P0P  ] = &DD27[DIR_P0P  *size_Mat];
-         D27.f[DIR_M0M  ] = &DD27[DIR_M0M  *size_Mat];
-         D27.f[DIR_P0M  ] = &DD27[DIR_P0M  *size_Mat];
-         D27.f[DIR_M0P  ] = &DD27[DIR_M0P  *size_Mat];
-         D27.f[DIR_0PP  ] = &DD27[DIR_0PP  *size_Mat];
-         D27.f[DIR_0MM  ] = &DD27[DIR_0MM  *size_Mat];
-         D27.f[DIR_0PM  ] = &DD27[DIR_0PM  *size_Mat];
-         D27.f[DIR_0MP  ] = &DD27[DIR_0MP  *size_Mat];
-         D27.f[DIR_000] = &DD27[DIR_000*size_Mat];
-         D27.f[DIR_PPP ] = &DD27[DIR_PPP *size_Mat];
-         D27.f[DIR_MMP ] = &DD27[DIR_MMP *size_Mat];
-         D27.f[DIR_PMP ] = &DD27[DIR_PMP *size_Mat];
-         D27.f[DIR_MPP ] = &DD27[DIR_MPP *size_Mat];
-         D27.f[DIR_PPM ] = &DD27[DIR_PPM *size_Mat];
-         D27.f[DIR_MMM ] = &DD27[DIR_MMM *size_Mat];
-         D27.f[DIR_PMM ] = &DD27[DIR_PMM *size_Mat];
-         D27.f[DIR_MPM ] = &DD27[DIR_MPM *size_Mat];
+         D27.f[DIR_P00] = &DD27[DIR_P00 * numberOfLBnodes];
+         D27.f[DIR_M00] = &DD27[DIR_M00 * numberOfLBnodes];
+         D27.f[DIR_0P0] = &DD27[DIR_0P0 * numberOfLBnodes];
+         D27.f[DIR_0M0] = &DD27[DIR_0M0 * numberOfLBnodes];
+         D27.f[DIR_00P] = &DD27[DIR_00P * numberOfLBnodes];
+         D27.f[DIR_00M] = &DD27[DIR_00M * numberOfLBnodes];
+         D27.f[DIR_PP0] = &DD27[DIR_PP0 * numberOfLBnodes];
+         D27.f[DIR_MM0] = &DD27[DIR_MM0 * numberOfLBnodes];
+         D27.f[DIR_PM0] = &DD27[DIR_PM0 * numberOfLBnodes];
+         D27.f[DIR_MP0] = &DD27[DIR_MP0 * numberOfLBnodes];
+         D27.f[DIR_P0P] = &DD27[DIR_P0P * numberOfLBnodes];
+         D27.f[DIR_M0M] = &DD27[DIR_M0M * numberOfLBnodes];
+         D27.f[DIR_P0M] = &DD27[DIR_P0M * numberOfLBnodes];
+         D27.f[DIR_M0P] = &DD27[DIR_M0P * numberOfLBnodes];
+         D27.f[DIR_0PP] = &DD27[DIR_0PP * numberOfLBnodes];
+         D27.f[DIR_0MM] = &DD27[DIR_0MM * numberOfLBnodes];
+         D27.f[DIR_0PM] = &DD27[DIR_0PM * numberOfLBnodes];
+         D27.f[DIR_0MP] = &DD27[DIR_0MP * numberOfLBnodes];
+         D27.f[DIR_000] = &DD27[DIR_000 * numberOfLBnodes];
+         D27.f[DIR_PPP] = &DD27[DIR_PPP * numberOfLBnodes];
+         D27.f[DIR_MMP] = &DD27[DIR_MMP * numberOfLBnodes];
+         D27.f[DIR_PMP] = &DD27[DIR_PMP * numberOfLBnodes];
+         D27.f[DIR_MPP] = &DD27[DIR_MPP * numberOfLBnodes];
+         D27.f[DIR_PPM] = &DD27[DIR_PPM * numberOfLBnodes];
+         D27.f[DIR_MMM] = &DD27[DIR_MMM * numberOfLBnodes];
+         D27.f[DIR_PMM] = &DD27[DIR_PMM * numberOfLBnodes];
+         D27.f[DIR_MPM] = &DD27[DIR_MPM * numberOfLBnodes];
       } 
       else
       {
-         D27.f[DIR_M00   ] = &DD27[DIR_P00   *size_Mat];
-         D27.f[DIR_P00   ] = &DD27[DIR_M00   *size_Mat];
-         D27.f[DIR_0M0   ] = &DD27[DIR_0P0   *size_Mat];
-         D27.f[DIR_0P0   ] = &DD27[DIR_0M0   *size_Mat];
-         D27.f[DIR_00M   ] = &DD27[DIR_00P   *size_Mat];
-         D27.f[DIR_00P   ] = &DD27[DIR_00M   *size_Mat];
-         D27.f[DIR_MM0  ] = &DD27[DIR_PP0  *size_Mat];
-         D27.f[DIR_PP0  ] = &DD27[DIR_MM0  *size_Mat];
-         D27.f[DIR_MP0  ] = &DD27[DIR_PM0  *size_Mat];
-         D27.f[DIR_PM0  ] = &DD27[DIR_MP0  *size_Mat];
-         D27.f[DIR_M0M  ] = &DD27[DIR_P0P  *size_Mat];
-         D27.f[DIR_P0P  ] = &DD27[DIR_M0M  *size_Mat];
-         D27.f[DIR_M0P  ] = &DD27[DIR_P0M  *size_Mat];
-         D27.f[DIR_P0M  ] = &DD27[DIR_M0P  *size_Mat];
-         D27.f[DIR_0MM  ] = &DD27[DIR_0PP  *size_Mat];
-         D27.f[DIR_0PP  ] = &DD27[DIR_0MM  *size_Mat];
-         D27.f[DIR_0MP  ] = &DD27[DIR_0PM  *size_Mat];
-         D27.f[DIR_0PM  ] = &DD27[DIR_0MP  *size_Mat];
-         D27.f[DIR_000] = &DD27[DIR_000*size_Mat];
-         D27.f[DIR_PPP ] = &DD27[DIR_MMM *size_Mat];
-         D27.f[DIR_MMP ] = &DD27[DIR_PPM *size_Mat];
-         D27.f[DIR_PMP ] = &DD27[DIR_MPM *size_Mat];
-         D27.f[DIR_MPP ] = &DD27[DIR_PMM *size_Mat];
-         D27.f[DIR_PPM ] = &DD27[DIR_MMP *size_Mat];
-         D27.f[DIR_MMM ] = &DD27[DIR_PPP *size_Mat];
-         D27.f[DIR_PMM ] = &DD27[DIR_MPP *size_Mat];
-         D27.f[DIR_MPM ] = &DD27[DIR_PMP *size_Mat];
+         D27.f[DIR_M00] = &DD27[DIR_P00 * numberOfLBnodes];
+         D27.f[DIR_P00] = &DD27[DIR_M00 * numberOfLBnodes];
+         D27.f[DIR_0M0] = &DD27[DIR_0P0 * numberOfLBnodes];
+         D27.f[DIR_0P0] = &DD27[DIR_0M0 * numberOfLBnodes];
+         D27.f[DIR_00M] = &DD27[DIR_00P * numberOfLBnodes];
+         D27.f[DIR_00P] = &DD27[DIR_00M * numberOfLBnodes];
+         D27.f[DIR_MM0] = &DD27[DIR_PP0 * numberOfLBnodes];
+         D27.f[DIR_PP0] = &DD27[DIR_MM0 * numberOfLBnodes];
+         D27.f[DIR_MP0] = &DD27[DIR_PM0 * numberOfLBnodes];
+         D27.f[DIR_PM0] = &DD27[DIR_MP0 * numberOfLBnodes];
+         D27.f[DIR_M0M] = &DD27[DIR_P0P * numberOfLBnodes];
+         D27.f[DIR_P0P] = &DD27[DIR_M0M * numberOfLBnodes];
+         D27.f[DIR_M0P] = &DD27[DIR_P0M * numberOfLBnodes];
+         D27.f[DIR_P0M] = &DD27[DIR_M0P * numberOfLBnodes];
+         D27.f[DIR_0MM] = &DD27[DIR_0PP * numberOfLBnodes];
+         D27.f[DIR_0PP] = &DD27[DIR_0MM * numberOfLBnodes];
+         D27.f[DIR_0MP] = &DD27[DIR_0PM * numberOfLBnodes];
+         D27.f[DIR_0PM] = &DD27[DIR_0MP * numberOfLBnodes];
+         D27.f[DIR_000] = &DD27[DIR_000 * numberOfLBnodes];
+         D27.f[DIR_PPP] = &DD27[DIR_MMM * numberOfLBnodes];
+         D27.f[DIR_MMP] = &DD27[DIR_PPM * numberOfLBnodes];
+         D27.f[DIR_PMP] = &DD27[DIR_MPM * numberOfLBnodes];
+         D27.f[DIR_MPP] = &DD27[DIR_PMM * numberOfLBnodes];
+         D27.f[DIR_PPM] = &DD27[DIR_MMP * numberOfLBnodes];
+         D27.f[DIR_MMM] = &DD27[DIR_PPP * numberOfLBnodes];
+         D27.f[DIR_PMM] = &DD27[DIR_MPP * numberOfLBnodes];
+         D27.f[DIR_MPM] = &DD27[DIR_PMP * numberOfLBnodes];
       }
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       //Test
       //(D.f[DIR_000])[k]=0.1f;
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-      q = q_dirE[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M00  ])[kw  ]=f27_E  ;
-      q = q_dirW[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P00  ])[ke  ]=f27_W  ;
-      q = q_dirN[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0M0  ])[ks  ]=f27_N  ;
-      q = q_dirS[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0P0  ])[kn  ]=f27_S  ;
-      q = q_dirT[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_00M  ])[kb  ]=f27_T  ;
-      q = q_dirB[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_00P  ])[kt  ]=f27_B  ;
-      q = q_dirNE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MM0 ])[ksw ]=f27_NE ;
-      q = q_dirSW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PP0 ])[kne ]=f27_SW ;
-      q = q_dirSE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MP0 ])[knw ]=f27_SE ;
-      q = q_dirNW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PM0 ])[kse ]=f27_NW ;
-      q = q_dirTE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M0M ])[kbw ]=f27_TE ;
-      q = q_dirBW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P0P ])[kte ]=f27_BW ;
-      q = q_dirBE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M0P ])[ktw ]=f27_BE ;
-      q = q_dirTW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P0M ])[kbe ]=f27_TW ;
-      q = q_dirTN[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0MM ])[kbs ]=f27_TN ;
-      q = q_dirBS[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0PP ])[ktn ]=f27_BS ;
-      q = q_dirBN[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0MP ])[kts ]=f27_BN ;
-      q = q_dirTS[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0PM ])[kbn ]=f27_TS ;
+      q = q_dirE[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M00])[kw  ]=f27_E  ;
+      q = q_dirW[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P00])[ke  ]=f27_W  ;
+      q = q_dirN[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0M0])[ks  ]=f27_N  ;
+      q = q_dirS[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0P0])[kn  ]=f27_S  ;
+      q = q_dirT[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_00M])[kb  ]=f27_T  ;
+      q = q_dirB[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_00P])[kt  ]=f27_B  ;
+      q = q_dirNE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MM0])[ksw ]=f27_NE ;
+      q = q_dirSW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PP0])[kne ]=f27_SW ;
+      q = q_dirSE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MP0])[knw ]=f27_SE ;
+      q = q_dirNW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PM0])[kse ]=f27_NW ;
+      q = q_dirTE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M0M])[kbw ]=f27_TE ;
+      q = q_dirBW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P0P])[kte ]=f27_BW ;
+      q = q_dirBE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M0P])[ktw ]=f27_BE ;
+      q = q_dirTW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P0M])[kbe ]=f27_TW ;
+      q = q_dirTN[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0MM])[kbs ]=f27_TN ;
+      q = q_dirBS[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0PP])[ktn ]=f27_BS ;
+      q = q_dirBN[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0MP])[kts ]=f27_BN ;
+      q = q_dirTS[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0PM])[kbn ]=f27_TS ;
       q = q_dirTNE[k]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MMM])[kbsw]=f27_TNE;
       q = q_dirBSW[k]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PPP])[ktne]=f27_BSW;
       q = q_dirBNE[k]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MMP])[ktsw]=f27_BNE;
@@ -3905,91 +3905,91 @@ __global__ void QNoSlipADincomp7(
 											 unsigned int* neighborX,
 											 unsigned int* neighborY,
 											 unsigned int* neighborZ,
-											 unsigned int size_Mat, 
+											 unsigned long long numberOfLBnodes, 
 											 bool isEvenTimestep)
 {
    //Distributions27 D;
    //if (isEvenTimestep==true)
    //{
-   //   D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-   //   D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-   //   D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-   //   D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-   //   D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-   //   D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-   //   D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-   //   D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-   //   D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-   //   D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-   //   D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-   //   D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-   //   D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-   //   D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-   //   D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-   //   D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-   //   D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-   //   D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-   //   D.f[DIR_000] = &DD[DIR_000*size_Mat];
-   //   D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-   //   D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-   //   D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-   //   D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-   //   D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-   //   D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-   //   D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-   //   D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+   //   D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+   //   D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+   //   D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+   //   D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+   //   D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+   //   D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+   //   D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+   //   D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+   //   D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+   //   D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+   //   D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+   //   D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+   //   D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+   //   D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+   //   D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+   //   D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+   //   D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+   //   D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+   //   D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+   //   D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+   //   D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+   //   D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+   //   D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+   //   D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+   //   D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+   //   D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+   //   D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
    //} 
    //else
    //{
-   //   D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-   //   D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-   //   D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-   //   D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-   //   D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-   //   D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-   //   D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-   //   D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-   //   D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-   //   D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-   //   D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-   //   D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-   //   D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-   //   D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-   //   D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-   //   D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-   //   D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-   //   D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-   //   D.f[DIR_000] = &DD[DIR_000*size_Mat];
-   //   D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-   //   D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-   //   D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-   //   D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-   //   D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-   //   D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-   //   D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-   //   D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+   //   D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+   //   D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+   //   D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+   //   D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+   //   D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+   //   D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+   //   D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+   //   D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+   //   D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+   //   D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+   //   D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+   //   D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+   //   D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+   //   D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+   //   D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+   //   D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+   //   D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+   //   D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+   //   D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+   //   D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+   //   D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+   //   D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+   //   D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+   //   D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+   //   D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+   //   D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+   //   D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
    //}
 
    Distributions7 D7;
    if (isEvenTimestep==true)
    {
-      D7.f[0] = &DD7[0*size_Mat];
-      D7.f[1] = &DD7[1*size_Mat];
-      D7.f[2] = &DD7[2*size_Mat];
-      D7.f[3] = &DD7[3*size_Mat];
-      D7.f[4] = &DD7[4*size_Mat];
-      D7.f[5] = &DD7[5*size_Mat];
-      D7.f[6] = &DD7[6*size_Mat];
+      D7.f[0] = &DD7[0*numberOfLBnodes];
+      D7.f[1] = &DD7[1*numberOfLBnodes];
+      D7.f[2] = &DD7[2*numberOfLBnodes];
+      D7.f[3] = &DD7[3*numberOfLBnodes];
+      D7.f[4] = &DD7[4*numberOfLBnodes];
+      D7.f[5] = &DD7[5*numberOfLBnodes];
+      D7.f[6] = &DD7[6*numberOfLBnodes];
    }
    else
    {
-      D7.f[0] = &DD7[0*size_Mat];
-      D7.f[2] = &DD7[1*size_Mat];
-      D7.f[1] = &DD7[2*size_Mat];
-      D7.f[4] = &DD7[3*size_Mat];
-      D7.f[3] = &DD7[4*size_Mat];
-      D7.f[6] = &DD7[5*size_Mat];
-      D7.f[5] = &DD7[6*size_Mat];
+      D7.f[0] = &DD7[0*numberOfLBnodes];
+      D7.f[2] = &DD7[1*numberOfLBnodes];
+      D7.f[1] = &DD7[2*numberOfLBnodes];
+      D7.f[4] = &DD7[3*numberOfLBnodes];
+      D7.f[3] = &DD7[4*numberOfLBnodes];
+      D7.f[6] = &DD7[5*numberOfLBnodes];
+      D7.f[5] = &DD7[6*numberOfLBnodes];
    }
 
 
@@ -4009,12 +4009,12 @@ __global__ void QNoSlipADincomp7(
       //////////////////////////////////////////////////////////////////////////////////
       real  *q_dirE,   *q_dirW,   *q_dirN,   *q_dirS,   *q_dirT,   *q_dirB;
 
-      q_dirE   = &QQ[DIR_P00   * numberOfBCnodes];
-      q_dirW   = &QQ[DIR_M00   * numberOfBCnodes];
-      q_dirN   = &QQ[DIR_0P0   * numberOfBCnodes];
-      q_dirS   = &QQ[DIR_0M0   * numberOfBCnodes];
-      q_dirT   = &QQ[DIR_00P   * numberOfBCnodes];
-      q_dirB   = &QQ[DIR_00M   * numberOfBCnodes];
+      q_dirE   = &QQ[DIR_P00 * numberOfBCnodes];
+      q_dirW   = &QQ[DIR_M00 * numberOfBCnodes];
+      q_dirN   = &QQ[DIR_0P0 * numberOfBCnodes];
+      q_dirS   = &QQ[DIR_0M0 * numberOfBCnodes];
+      q_dirT   = &QQ[DIR_00P * numberOfBCnodes];
+      q_dirB   = &QQ[DIR_00M * numberOfBCnodes];
       //////////////////////////////////////////////////////////////////////////////////
       //index
       unsigned int KQK  = k_Q[k];
@@ -4046,32 +4046,32 @@ __global__ void QNoSlipADincomp7(
       //unsigned int ktne = KQK;
       //unsigned int kbsw = neighborZ[ksw];
       ////////////////////////////////////////////////////////////////////////////////
-      //real f_W    = (D.f[DIR_P00   ])[ke   ];
-      //real f_E    = (D.f[DIR_M00   ])[kw   ];
-      //real f_S    = (D.f[DIR_0P0   ])[kn   ];
-      //real f_N    = (D.f[DIR_0M0   ])[ks   ];
-      //real f_B    = (D.f[DIR_00P   ])[kt   ];
-      //real f_T    = (D.f[DIR_00M   ])[kb   ];
-      //real f_SW   = (D.f[DIR_PP0  ])[kne  ];
-      //real f_NE   = (D.f[DIR_MM0  ])[ksw  ];
-      //real f_NW   = (D.f[DIR_PM0  ])[kse  ];
-      //real f_SE   = (D.f[DIR_MP0  ])[knw  ];
-      //real f_BW   = (D.f[DIR_P0P  ])[kte  ];
-      //real f_TE   = (D.f[DIR_M0M  ])[kbw  ];
-      //real f_TW   = (D.f[DIR_P0M  ])[kbe  ];
-      //real f_BE   = (D.f[DIR_M0P  ])[ktw  ];
-      //real f_BS   = (D.f[DIR_0PP  ])[ktn  ];
-      //real f_TN   = (D.f[DIR_0MM  ])[kbs  ];
-      //real f_TS   = (D.f[DIR_0PM  ])[kbn  ];
-      //real f_BN   = (D.f[DIR_0MP  ])[kts  ];
-      //real f_BSW  = (D.f[DIR_PPP ])[ktne ];
-      //real f_BNE  = (D.f[DIR_MMP ])[ktsw ];
-      //real f_BNW  = (D.f[DIR_PMP ])[ktse ];
-      //real f_BSE  = (D.f[DIR_MPP ])[ktnw ];
-      //real f_TSW  = (D.f[DIR_PPM ])[kbne ];
-      //real f_TNE  = (D.f[DIR_MMM ])[kbsw ];
-      //real f_TNW  = (D.f[DIR_PMM ])[kbse ];
-      //real f_TSE  = (D.f[DIR_MPM ])[kbnw ];
+      //real f_W    = (D.f[DIR_P00])[ke   ];
+      //real f_E    = (D.f[DIR_M00])[kw   ];
+      //real f_S    = (D.f[DIR_0P0])[kn   ];
+      //real f_N    = (D.f[DIR_0M0])[ks   ];
+      //real f_B    = (D.f[DIR_00P])[kt   ];
+      //real f_T    = (D.f[DIR_00M])[kb   ];
+      //real f_SW   = (D.f[DIR_PP0])[kne  ];
+      //real f_NE   = (D.f[DIR_MM0])[ksw  ];
+      //real f_NW   = (D.f[DIR_PM0])[kse  ];
+      //real f_SE   = (D.f[DIR_MP0])[knw  ];
+      //real f_BW   = (D.f[DIR_P0P])[kte  ];
+      //real f_TE   = (D.f[DIR_M0M])[kbw  ];
+      //real f_TW   = (D.f[DIR_P0M])[kbe  ];
+      //real f_BE   = (D.f[DIR_M0P])[ktw  ];
+      //real f_BS   = (D.f[DIR_0PP])[ktn  ];
+      //real f_TN   = (D.f[DIR_0MM])[kbs  ];
+      //real f_TS   = (D.f[DIR_0PM])[kbn  ];
+      //real f_BN   = (D.f[DIR_0MP])[kts  ];
+      //real f_BSW  = (D.f[DIR_PPP])[ktne ];
+      //real f_BNE  = (D.f[DIR_MMP])[ktsw ];
+      //real f_BNW  = (D.f[DIR_PMP])[ktse ];
+      //real f_BSE  = (D.f[DIR_MPP])[ktnw ];
+      //real f_TSW  = (D.f[DIR_PPM])[kbne ];
+      //real f_TNE  = (D.f[DIR_MMM])[kbsw ];
+      //real f_TNW  = (D.f[DIR_PMM])[kbse ];
+      //real f_TSE  = (D.f[DIR_MPM])[kbnw ];
       ////////////////////////////////////////////////////////////////////////////////
       //real vx1 =  ((f_TNE-f_BSW)+(f_BNE-f_TSW)+(f_TSE-f_BNW)+(f_BSE-f_TNW) +(f_NE-f_SW)+(f_SE-f_NW)+(f_TE-f_BW)+(f_BE-f_TW)+(f_E-f_W));
       //real vx2 =  ((f_TNE-f_BSW)+(f_BNE-f_TSW)+(f_BNW-f_TSE)+(f_TNW-f_BSE) +(f_NE-f_SW)+(f_NW-f_SE)+(f_TN-f_BS)+(f_BN-f_TS)+(f_N-f_S));
@@ -4131,23 +4131,23 @@ __global__ void QNoSlipADincomp7(
       //pointertausch
       if (isEvenTimestep==false)
       {
-         D7.f[0] = &DD7[0*size_Mat];
-         D7.f[1] = &DD7[1*size_Mat];
-         D7.f[2] = &DD7[2*size_Mat];
-         D7.f[3] = &DD7[3*size_Mat];
-         D7.f[4] = &DD7[4*size_Mat];
-         D7.f[5] = &DD7[5*size_Mat];
-         D7.f[6] = &DD7[6*size_Mat];
+         D7.f[0] = &DD7[0*numberOfLBnodes];
+         D7.f[1] = &DD7[1*numberOfLBnodes];
+         D7.f[2] = &DD7[2*numberOfLBnodes];
+         D7.f[3] = &DD7[3*numberOfLBnodes];
+         D7.f[4] = &DD7[4*numberOfLBnodes];
+         D7.f[5] = &DD7[5*numberOfLBnodes];
+         D7.f[6] = &DD7[6*numberOfLBnodes];
       }
       else
       {
-         D7.f[0] = &DD7[0*size_Mat];
-         D7.f[2] = &DD7[1*size_Mat];
-         D7.f[1] = &DD7[2*size_Mat];
-         D7.f[4] = &DD7[3*size_Mat];
-         D7.f[3] = &DD7[4*size_Mat];
-         D7.f[6] = &DD7[5*size_Mat];
-         D7.f[5] = &DD7[6*size_Mat];
+         D7.f[0] = &DD7[0*numberOfLBnodes];
+         D7.f[2] = &DD7[1*numberOfLBnodes];
+         D7.f[1] = &DD7[2*numberOfLBnodes];
+         D7.f[4] = &DD7[3*numberOfLBnodes];
+         D7.f[3] = &DD7[4*numberOfLBnodes];
+         D7.f[6] = &DD7[5*numberOfLBnodes];
+         D7.f[5] = &DD7[6*numberOfLBnodes];
       }
 
       ////////////////////////////////////////////////////////////////////////////
@@ -4329,131 +4329,131 @@ __global__ void QNoSlipADincomp27(
 											 unsigned int* neighborX,
 											 unsigned int* neighborY,
 											 unsigned int* neighborZ,
-											 unsigned int size_Mat, 
+											 unsigned long long numberOfLBnodes, 
 											 bool isEvenTimestep)
 {
    Distributions27 D;
    if (isEvenTimestep==true)
    {
-      D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+      D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
    } 
    else
    {
-      D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+      D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
    }
 
    Distributions27 D27;
    if (isEvenTimestep==true)
    {
-      D27.f[DIR_P00   ] = &DD27[DIR_P00   *size_Mat];
-      D27.f[DIR_M00   ] = &DD27[DIR_M00   *size_Mat];
-      D27.f[DIR_0P0   ] = &DD27[DIR_0P0   *size_Mat];
-      D27.f[DIR_0M0   ] = &DD27[DIR_0M0   *size_Mat];
-      D27.f[DIR_00P   ] = &DD27[DIR_00P   *size_Mat];
-      D27.f[DIR_00M   ] = &DD27[DIR_00M   *size_Mat];
-      D27.f[DIR_PP0  ] = &DD27[DIR_PP0  *size_Mat];
-      D27.f[DIR_MM0  ] = &DD27[DIR_MM0  *size_Mat];
-      D27.f[DIR_PM0  ] = &DD27[DIR_PM0  *size_Mat];
-      D27.f[DIR_MP0  ] = &DD27[DIR_MP0  *size_Mat];
-      D27.f[DIR_P0P  ] = &DD27[DIR_P0P  *size_Mat];
-      D27.f[DIR_M0M  ] = &DD27[DIR_M0M  *size_Mat];
-      D27.f[DIR_P0M  ] = &DD27[DIR_P0M  *size_Mat];
-      D27.f[DIR_M0P  ] = &DD27[DIR_M0P  *size_Mat];
-      D27.f[DIR_0PP  ] = &DD27[DIR_0PP  *size_Mat];
-      D27.f[DIR_0MM  ] = &DD27[DIR_0MM  *size_Mat];
-      D27.f[DIR_0PM  ] = &DD27[DIR_0PM  *size_Mat];
-      D27.f[DIR_0MP  ] = &DD27[DIR_0MP  *size_Mat];
-      D27.f[DIR_000] = &DD27[DIR_000*size_Mat];
-      D27.f[DIR_PPP ] = &DD27[DIR_PPP *size_Mat];
-      D27.f[DIR_MMP ] = &DD27[DIR_MMP *size_Mat];
-      D27.f[DIR_PMP ] = &DD27[DIR_PMP *size_Mat];
-      D27.f[DIR_MPP ] = &DD27[DIR_MPP *size_Mat];
-      D27.f[DIR_PPM ] = &DD27[DIR_PPM *size_Mat];
-      D27.f[DIR_MMM ] = &DD27[DIR_MMM *size_Mat];
-      D27.f[DIR_PMM ] = &DD27[DIR_PMM *size_Mat];
-      D27.f[DIR_MPM ] = &DD27[DIR_MPM *size_Mat];
+      D27.f[DIR_P00] = &DD27[DIR_P00 * numberOfLBnodes];
+      D27.f[DIR_M00] = &DD27[DIR_M00 * numberOfLBnodes];
+      D27.f[DIR_0P0] = &DD27[DIR_0P0 * numberOfLBnodes];
+      D27.f[DIR_0M0] = &DD27[DIR_0M0 * numberOfLBnodes];
+      D27.f[DIR_00P] = &DD27[DIR_00P * numberOfLBnodes];
+      D27.f[DIR_00M] = &DD27[DIR_00M * numberOfLBnodes];
+      D27.f[DIR_PP0] = &DD27[DIR_PP0 * numberOfLBnodes];
+      D27.f[DIR_MM0] = &DD27[DIR_MM0 * numberOfLBnodes];
+      D27.f[DIR_PM0] = &DD27[DIR_PM0 * numberOfLBnodes];
+      D27.f[DIR_MP0] = &DD27[DIR_MP0 * numberOfLBnodes];
+      D27.f[DIR_P0P] = &DD27[DIR_P0P * numberOfLBnodes];
+      D27.f[DIR_M0M] = &DD27[DIR_M0M * numberOfLBnodes];
+      D27.f[DIR_P0M] = &DD27[DIR_P0M * numberOfLBnodes];
+      D27.f[DIR_M0P] = &DD27[DIR_M0P * numberOfLBnodes];
+      D27.f[DIR_0PP] = &DD27[DIR_0PP * numberOfLBnodes];
+      D27.f[DIR_0MM] = &DD27[DIR_0MM * numberOfLBnodes];
+      D27.f[DIR_0PM] = &DD27[DIR_0PM * numberOfLBnodes];
+      D27.f[DIR_0MP] = &DD27[DIR_0MP * numberOfLBnodes];
+      D27.f[DIR_000] = &DD27[DIR_000 * numberOfLBnodes];
+      D27.f[DIR_PPP] = &DD27[DIR_PPP * numberOfLBnodes];
+      D27.f[DIR_MMP] = &DD27[DIR_MMP * numberOfLBnodes];
+      D27.f[DIR_PMP] = &DD27[DIR_PMP * numberOfLBnodes];
+      D27.f[DIR_MPP] = &DD27[DIR_MPP * numberOfLBnodes];
+      D27.f[DIR_PPM] = &DD27[DIR_PPM * numberOfLBnodes];
+      D27.f[DIR_MMM] = &DD27[DIR_MMM * numberOfLBnodes];
+      D27.f[DIR_PMM] = &DD27[DIR_PMM * numberOfLBnodes];
+      D27.f[DIR_MPM] = &DD27[DIR_MPM * numberOfLBnodes];
    } 
    else
    {
-      D27.f[DIR_M00   ] = &DD27[DIR_P00   *size_Mat];
-      D27.f[DIR_P00   ] = &DD27[DIR_M00   *size_Mat];
-      D27.f[DIR_0M0   ] = &DD27[DIR_0P0   *size_Mat];
-      D27.f[DIR_0P0   ] = &DD27[DIR_0M0   *size_Mat];
-      D27.f[DIR_00M   ] = &DD27[DIR_00P   *size_Mat];
-      D27.f[DIR_00P   ] = &DD27[DIR_00M   *size_Mat];
-      D27.f[DIR_MM0  ] = &DD27[DIR_PP0  *size_Mat];
-      D27.f[DIR_PP0  ] = &DD27[DIR_MM0  *size_Mat];
-      D27.f[DIR_MP0  ] = &DD27[DIR_PM0  *size_Mat];
-      D27.f[DIR_PM0  ] = &DD27[DIR_MP0  *size_Mat];
-      D27.f[DIR_M0M  ] = &DD27[DIR_P0P  *size_Mat];
-      D27.f[DIR_P0P  ] = &DD27[DIR_M0M  *size_Mat];
-      D27.f[DIR_M0P  ] = &DD27[DIR_P0M  *size_Mat];
-      D27.f[DIR_P0M  ] = &DD27[DIR_M0P  *size_Mat];
-      D27.f[DIR_0MM  ] = &DD27[DIR_0PP  *size_Mat];
-      D27.f[DIR_0PP  ] = &DD27[DIR_0MM  *size_Mat];
-      D27.f[DIR_0MP  ] = &DD27[DIR_0PM  *size_Mat];
-      D27.f[DIR_0PM  ] = &DD27[DIR_0MP  *size_Mat];
-      D27.f[DIR_000] = &DD27[DIR_000*size_Mat];
-      D27.f[DIR_PPP ] = &DD27[DIR_MMM *size_Mat];
-      D27.f[DIR_MMP ] = &DD27[DIR_PPM *size_Mat];
-      D27.f[DIR_PMP ] = &DD27[DIR_MPM *size_Mat];
-      D27.f[DIR_MPP ] = &DD27[DIR_PMM *size_Mat];
-      D27.f[DIR_PPM ] = &DD27[DIR_MMP *size_Mat];
-      D27.f[DIR_MMM ] = &DD27[DIR_PPP *size_Mat];
-      D27.f[DIR_PMM ] = &DD27[DIR_MPP *size_Mat];
-      D27.f[DIR_MPM ] = &DD27[DIR_PMP *size_Mat];
+      D27.f[DIR_M00] = &DD27[DIR_P00 * numberOfLBnodes];
+      D27.f[DIR_P00] = &DD27[DIR_M00 * numberOfLBnodes];
+      D27.f[DIR_0M0] = &DD27[DIR_0P0 * numberOfLBnodes];
+      D27.f[DIR_0P0] = &DD27[DIR_0M0 * numberOfLBnodes];
+      D27.f[DIR_00M] = &DD27[DIR_00P * numberOfLBnodes];
+      D27.f[DIR_00P] = &DD27[DIR_00M * numberOfLBnodes];
+      D27.f[DIR_MM0] = &DD27[DIR_PP0 * numberOfLBnodes];
+      D27.f[DIR_PP0] = &DD27[DIR_MM0 * numberOfLBnodes];
+      D27.f[DIR_MP0] = &DD27[DIR_PM0 * numberOfLBnodes];
+      D27.f[DIR_PM0] = &DD27[DIR_MP0 * numberOfLBnodes];
+      D27.f[DIR_M0M] = &DD27[DIR_P0P * numberOfLBnodes];
+      D27.f[DIR_P0P] = &DD27[DIR_M0M * numberOfLBnodes];
+      D27.f[DIR_M0P] = &DD27[DIR_P0M * numberOfLBnodes];
+      D27.f[DIR_P0M] = &DD27[DIR_M0P * numberOfLBnodes];
+      D27.f[DIR_0MM] = &DD27[DIR_0PP * numberOfLBnodes];
+      D27.f[DIR_0PP] = &DD27[DIR_0MM * numberOfLBnodes];
+      D27.f[DIR_0MP] = &DD27[DIR_0PM * numberOfLBnodes];
+      D27.f[DIR_0PM] = &DD27[DIR_0MP * numberOfLBnodes];
+      D27.f[DIR_000] = &DD27[DIR_000 * numberOfLBnodes];
+      D27.f[DIR_PPP] = &DD27[DIR_MMM * numberOfLBnodes];
+      D27.f[DIR_MMP] = &DD27[DIR_PPM * numberOfLBnodes];
+      D27.f[DIR_PMP] = &DD27[DIR_MPM * numberOfLBnodes];
+      D27.f[DIR_MPP] = &DD27[DIR_PMM * numberOfLBnodes];
+      D27.f[DIR_PPM] = &DD27[DIR_MMP * numberOfLBnodes];
+      D27.f[DIR_MMM] = &DD27[DIR_PPP * numberOfLBnodes];
+      D27.f[DIR_PMM] = &DD27[DIR_MPP * numberOfLBnodes];
+      D27.f[DIR_MPM] = &DD27[DIR_PMP * numberOfLBnodes];
    }
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  x = threadIdx.x;  // Globaler x-Index 
@@ -4474,24 +4474,24 @@ __global__ void QNoSlipADincomp27(
          *q_dirBE,  *q_dirTW,  *q_dirTN,  *q_dirBS,  *q_dirBN,  *q_dirTS,
          *q_dirTNE, *q_dirTSW, *q_dirTSE, *q_dirTNW, *q_dirBNE, *q_dirBSW,
          *q_dirBSE, *q_dirBNW; 
-      q_dirE   = &QQ[DIR_P00   * numberOfBCnodes];
-      q_dirW   = &QQ[DIR_M00   * numberOfBCnodes];
-      q_dirN   = &QQ[DIR_0P0   * numberOfBCnodes];
-      q_dirS   = &QQ[DIR_0M0   * numberOfBCnodes];
-      q_dirT   = &QQ[DIR_00P   * numberOfBCnodes];
-      q_dirB   = &QQ[DIR_00M   * numberOfBCnodes];
-      q_dirNE  = &QQ[DIR_PP0  * numberOfBCnodes];
-      q_dirSW  = &QQ[DIR_MM0  * numberOfBCnodes];
-      q_dirSE  = &QQ[DIR_PM0  * numberOfBCnodes];
-      q_dirNW  = &QQ[DIR_MP0  * numberOfBCnodes];
-      q_dirTE  = &QQ[DIR_P0P  * numberOfBCnodes];
-      q_dirBW  = &QQ[DIR_M0M  * numberOfBCnodes];
-      q_dirBE  = &QQ[DIR_P0M  * numberOfBCnodes];
-      q_dirTW  = &QQ[DIR_M0P  * numberOfBCnodes];
-      q_dirTN  = &QQ[DIR_0PP  * numberOfBCnodes];
-      q_dirBS  = &QQ[DIR_0MM  * numberOfBCnodes];
-      q_dirBN  = &QQ[DIR_0PM  * numberOfBCnodes];
-      q_dirTS  = &QQ[DIR_0MP  * numberOfBCnodes];
+      q_dirE   = &QQ[DIR_P00 * numberOfBCnodes];
+      q_dirW   = &QQ[DIR_M00 * numberOfBCnodes];
+      q_dirN   = &QQ[DIR_0P0 * numberOfBCnodes];
+      q_dirS   = &QQ[DIR_0M0 * numberOfBCnodes];
+      q_dirT   = &QQ[DIR_00P * numberOfBCnodes];
+      q_dirB   = &QQ[DIR_00M * numberOfBCnodes];
+      q_dirNE  = &QQ[DIR_PP0 * numberOfBCnodes];
+      q_dirSW  = &QQ[DIR_MM0 * numberOfBCnodes];
+      q_dirSE  = &QQ[DIR_PM0 * numberOfBCnodes];
+      q_dirNW  = &QQ[DIR_MP0 * numberOfBCnodes];
+      q_dirTE  = &QQ[DIR_P0P * numberOfBCnodes];
+      q_dirBW  = &QQ[DIR_M0M * numberOfBCnodes];
+      q_dirBE  = &QQ[DIR_P0M * numberOfBCnodes];
+      q_dirTW  = &QQ[DIR_M0P * numberOfBCnodes];
+      q_dirTN  = &QQ[DIR_0PP * numberOfBCnodes];
+      q_dirBS  = &QQ[DIR_0MM * numberOfBCnodes];
+      q_dirBN  = &QQ[DIR_0PM * numberOfBCnodes];
+      q_dirTS  = &QQ[DIR_0MP * numberOfBCnodes];
       q_dirTNE = &QQ[DIR_PPP * numberOfBCnodes];
       q_dirTSW = &QQ[DIR_MMP * numberOfBCnodes];
       q_dirTSE = &QQ[DIR_PMP * numberOfBCnodes];
@@ -4531,65 +4531,65 @@ __global__ void QNoSlipADincomp27(
       unsigned int ktne = KQK;
       unsigned int kbsw = neighborZ[ksw];
       ////////////////////////////////////////////////////////////////////////////////
-      real f_W    = (D.f[DIR_P00   ])[ke   ];
-      real f_E    = (D.f[DIR_M00   ])[kw   ];
-      real f_S    = (D.f[DIR_0P0   ])[kn   ];
-      real f_N    = (D.f[DIR_0M0   ])[ks   ];
-      real f_B    = (D.f[DIR_00P   ])[kt   ];
-      real f_T    = (D.f[DIR_00M   ])[kb   ];
-      real f_SW   = (D.f[DIR_PP0  ])[kne  ];
-      real f_NE   = (D.f[DIR_MM0  ])[ksw  ];
-      real f_NW   = (D.f[DIR_PM0  ])[kse  ];
-      real f_SE   = (D.f[DIR_MP0  ])[knw  ];
-      real f_BW   = (D.f[DIR_P0P  ])[kte  ];
-      real f_TE   = (D.f[DIR_M0M  ])[kbw  ];
-      real f_TW   = (D.f[DIR_P0M  ])[kbe  ];
-      real f_BE   = (D.f[DIR_M0P  ])[ktw  ];
-      real f_BS   = (D.f[DIR_0PP  ])[ktn  ];
-      real f_TN   = (D.f[DIR_0MM  ])[kbs  ];
-      real f_TS   = (D.f[DIR_0PM  ])[kbn  ];
-      real f_BN   = (D.f[DIR_0MP  ])[kts  ];
+      real f_W    = (D.f[DIR_P00])[ke   ];
+      real f_E    = (D.f[DIR_M00])[kw   ];
+      real f_S    = (D.f[DIR_0P0])[kn   ];
+      real f_N    = (D.f[DIR_0M0])[ks   ];
+      real f_B    = (D.f[DIR_00P])[kt   ];
+      real f_T    = (D.f[DIR_00M])[kb   ];
+      real f_SW   = (D.f[DIR_PP0])[kne  ];
+      real f_NE   = (D.f[DIR_MM0])[ksw  ];
+      real f_NW   = (D.f[DIR_PM0])[kse  ];
+      real f_SE   = (D.f[DIR_MP0])[knw  ];
+      real f_BW   = (D.f[DIR_P0P])[kte  ];
+      real f_TE   = (D.f[DIR_M0M])[kbw  ];
+      real f_TW   = (D.f[DIR_P0M])[kbe  ];
+      real f_BE   = (D.f[DIR_M0P])[ktw  ];
+      real f_BS   = (D.f[DIR_0PP])[ktn  ];
+      real f_TN   = (D.f[DIR_0MM])[kbs  ];
+      real f_TS   = (D.f[DIR_0PM])[kbn  ];
+      real f_BN   = (D.f[DIR_0MP])[kts  ];
       //real f_ZERO = (D.f[DIR_000])[kzero];
-      real f_BSW  = (D.f[DIR_PPP ])[ktne ];
-      real f_BNE  = (D.f[DIR_MMP ])[ktsw ];
-      real f_BNW  = (D.f[DIR_PMP ])[ktse ];
-      real f_BSE  = (D.f[DIR_MPP ])[ktnw ];
-      real f_TSW  = (D.f[DIR_PPM ])[kbne ];
-      real f_TNE  = (D.f[DIR_MMM ])[kbsw ];
-      real f_TNW  = (D.f[DIR_PMM ])[kbse ];
-      real f_TSE  = (D.f[DIR_MPM ])[kbnw ];
+      real f_BSW  = (D.f[DIR_PPP])[ktne ];
+      real f_BNE  = (D.f[DIR_MMP])[ktsw ];
+      real f_BNW  = (D.f[DIR_PMP])[ktse ];
+      real f_BSE  = (D.f[DIR_MPP])[ktnw ];
+      real f_TSW  = (D.f[DIR_PPM])[kbne ];
+      real f_TNE  = (D.f[DIR_MMM])[kbsw ];
+      real f_TNW  = (D.f[DIR_PMM])[kbse ];
+      real f_TSE  = (D.f[DIR_MPM])[kbnw ];
       ////////////////////////////////////////////////////////////////////////////////
       real vx1 =  ((f_TNE-f_BSW)+(f_BNE-f_TSW)+(f_TSE-f_BNW)+(f_BSE-f_TNW) +(f_NE-f_SW)+(f_SE-f_NW)+(f_TE-f_BW)+(f_BE-f_TW)+(f_E-f_W));
       real vx2 =  ((f_TNE-f_BSW)+(f_BNE-f_TSW)+(f_BNW-f_TSE)+(f_TNW-f_BSE) +(f_NE-f_SW)+(f_NW-f_SE)+(f_TN-f_BS)+(f_BN-f_TS)+(f_N-f_S));
       real vx3 =  ((f_TNE-f_BSW)+(f_TSW-f_BNE)+(f_TSE-f_BNW)+(f_TNW-f_BSE) +(f_TE-f_BW)+(f_TW-f_BE)+(f_TN-f_BS)+(f_TS-f_BN)+(f_T-f_B));
       ////////////////////////////////////////////////////////////////////////////////
-      real f27_W    = (D27.f[DIR_P00   ])[ke   ];
-      real f27_E    = (D27.f[DIR_M00   ])[kw   ];
-      real f27_S    = (D27.f[DIR_0P0   ])[kn   ];
-      real f27_N    = (D27.f[DIR_0M0   ])[ks   ];
-      real f27_B    = (D27.f[DIR_00P   ])[kt   ];
-      real f27_T    = (D27.f[DIR_00M   ])[kb   ];
-      real f27_SW   = (D27.f[DIR_PP0  ])[kne  ];
-      real f27_NE   = (D27.f[DIR_MM0  ])[ksw  ];
-      real f27_NW   = (D27.f[DIR_PM0  ])[kse  ];
-      real f27_SE   = (D27.f[DIR_MP0  ])[knw  ];
-      real f27_BW   = (D27.f[DIR_P0P  ])[kte  ];
-      real f27_TE   = (D27.f[DIR_M0M  ])[kbw  ];
-      real f27_TW   = (D27.f[DIR_P0M  ])[kbe  ];
-      real f27_BE   = (D27.f[DIR_M0P  ])[ktw  ];
-      real f27_BS   = (D27.f[DIR_0PP  ])[ktn  ];
-      real f27_TN   = (D27.f[DIR_0MM  ])[kbs  ];
-      real f27_TS   = (D27.f[DIR_0PM  ])[kbn  ];
-      real f27_BN   = (D27.f[DIR_0MP  ])[kts  ];
+      real f27_W    = (D27.f[DIR_P00])[ke   ];
+      real f27_E    = (D27.f[DIR_M00])[kw   ];
+      real f27_S    = (D27.f[DIR_0P0])[kn   ];
+      real f27_N    = (D27.f[DIR_0M0])[ks   ];
+      real f27_B    = (D27.f[DIR_00P])[kt   ];
+      real f27_T    = (D27.f[DIR_00M])[kb   ];
+      real f27_SW   = (D27.f[DIR_PP0])[kne  ];
+      real f27_NE   = (D27.f[DIR_MM0])[ksw  ];
+      real f27_NW   = (D27.f[DIR_PM0])[kse  ];
+      real f27_SE   = (D27.f[DIR_MP0])[knw  ];
+      real f27_BW   = (D27.f[DIR_P0P])[kte  ];
+      real f27_TE   = (D27.f[DIR_M0M])[kbw  ];
+      real f27_TW   = (D27.f[DIR_P0M])[kbe  ];
+      real f27_BE   = (D27.f[DIR_M0P])[ktw  ];
+      real f27_BS   = (D27.f[DIR_0PP])[ktn  ];
+      real f27_TN   = (D27.f[DIR_0MM])[kbs  ];
+      real f27_TS   = (D27.f[DIR_0PM])[kbn  ];
+      real f27_BN   = (D27.f[DIR_0MP])[kts  ];
       real f27_ZERO = (D27.f[DIR_000])[kzero];
-      real f27_BSW  = (D27.f[DIR_PPP ])[ktne ];
-      real f27_BNE  = (D27.f[DIR_MMP ])[ktsw ];
-      real f27_BNW  = (D27.f[DIR_PMP ])[ktse ];
-      real f27_BSE  = (D27.f[DIR_MPP ])[ktnw ];
-      real f27_TSW  = (D27.f[DIR_PPM ])[kbne ];
-      real f27_TNE  = (D27.f[DIR_MMM ])[kbsw ];
-      real f27_TNW  = (D27.f[DIR_PMM ])[kbse ];
-      real f27_TSE  = (D27.f[DIR_MPM ])[kbnw ];
+      real f27_BSW  = (D27.f[DIR_PPP])[ktne ];
+      real f27_BNE  = (D27.f[DIR_MMP])[ktsw ];
+      real f27_BNW  = (D27.f[DIR_PMP])[ktse ];
+      real f27_BSE  = (D27.f[DIR_MPP])[ktnw ];
+      real f27_TSW  = (D27.f[DIR_PPM])[kbne ];
+      real f27_TNE  = (D27.f[DIR_MMM])[kbsw ];
+      real f27_TNW  = (D27.f[DIR_PMM])[kbse ];
+      real f27_TSE  = (D27.f[DIR_MPM])[kbnw ];
       ////////////////////////////////////////////////////////////////////////////////
       real cu_sq=c3o2*(vx1*vx1+vx2*vx2+vx3*vx3);
       ////////////////////////////////////////////////////////////////////////////////
@@ -4665,63 +4665,63 @@ __global__ void QNoSlipADincomp27(
       //////////////////////////////////////////////////////////////////////////
       if (isEvenTimestep==false)
       {
-         D27.f[DIR_P00   ] = &DD27[DIR_P00   *size_Mat];
-         D27.f[DIR_M00   ] = &DD27[DIR_M00   *size_Mat];
-         D27.f[DIR_0P0   ] = &DD27[DIR_0P0   *size_Mat];
-         D27.f[DIR_0M0   ] = &DD27[DIR_0M0   *size_Mat];
-         D27.f[DIR_00P   ] = &DD27[DIR_00P   *size_Mat];
-         D27.f[DIR_00M   ] = &DD27[DIR_00M   *size_Mat];
-         D27.f[DIR_PP0  ] = &DD27[DIR_PP0  *size_Mat];
-         D27.f[DIR_MM0  ] = &DD27[DIR_MM0  *size_Mat];
-         D27.f[DIR_PM0  ] = &DD27[DIR_PM0  *size_Mat];
-         D27.f[DIR_MP0  ] = &DD27[DIR_MP0  *size_Mat];
-         D27.f[DIR_P0P  ] = &DD27[DIR_P0P  *size_Mat];
-         D27.f[DIR_M0M  ] = &DD27[DIR_M0M  *size_Mat];
-         D27.f[DIR_P0M  ] = &DD27[DIR_P0M  *size_Mat];
-         D27.f[DIR_M0P  ] = &DD27[DIR_M0P  *size_Mat];
-         D27.f[DIR_0PP  ] = &DD27[DIR_0PP  *size_Mat];
-         D27.f[DIR_0MM  ] = &DD27[DIR_0MM  *size_Mat];
-         D27.f[DIR_0PM  ] = &DD27[DIR_0PM  *size_Mat];
-         D27.f[DIR_0MP  ] = &DD27[DIR_0MP  *size_Mat];
-         D27.f[DIR_000] = &DD27[DIR_000*size_Mat];
-         D27.f[DIR_PPP ] = &DD27[DIR_PPP *size_Mat];
-         D27.f[DIR_MMP ] = &DD27[DIR_MMP *size_Mat];
-         D27.f[DIR_PMP ] = &DD27[DIR_PMP *size_Mat];
-         D27.f[DIR_MPP ] = &DD27[DIR_MPP *size_Mat];
-         D27.f[DIR_PPM ] = &DD27[DIR_PPM *size_Mat];
-         D27.f[DIR_MMM ] = &DD27[DIR_MMM *size_Mat];
-         D27.f[DIR_PMM ] = &DD27[DIR_PMM *size_Mat];
-         D27.f[DIR_MPM ] = &DD27[DIR_MPM *size_Mat];
+         D27.f[DIR_P00] = &DD27[DIR_P00 * numberOfLBnodes];
+         D27.f[DIR_M00] = &DD27[DIR_M00 * numberOfLBnodes];
+         D27.f[DIR_0P0] = &DD27[DIR_0P0 * numberOfLBnodes];
+         D27.f[DIR_0M0] = &DD27[DIR_0M0 * numberOfLBnodes];
+         D27.f[DIR_00P] = &DD27[DIR_00P * numberOfLBnodes];
+         D27.f[DIR_00M] = &DD27[DIR_00M * numberOfLBnodes];
+         D27.f[DIR_PP0] = &DD27[DIR_PP0 * numberOfLBnodes];
+         D27.f[DIR_MM0] = &DD27[DIR_MM0 * numberOfLBnodes];
+         D27.f[DIR_PM0] = &DD27[DIR_PM0 * numberOfLBnodes];
+         D27.f[DIR_MP0] = &DD27[DIR_MP0 * numberOfLBnodes];
+         D27.f[DIR_P0P] = &DD27[DIR_P0P * numberOfLBnodes];
+         D27.f[DIR_M0M] = &DD27[DIR_M0M * numberOfLBnodes];
+         D27.f[DIR_P0M] = &DD27[DIR_P0M * numberOfLBnodes];
+         D27.f[DIR_M0P] = &DD27[DIR_M0P * numberOfLBnodes];
+         D27.f[DIR_0PP] = &DD27[DIR_0PP * numberOfLBnodes];
+         D27.f[DIR_0MM] = &DD27[DIR_0MM * numberOfLBnodes];
+         D27.f[DIR_0PM] = &DD27[DIR_0PM * numberOfLBnodes];
+         D27.f[DIR_0MP] = &DD27[DIR_0MP * numberOfLBnodes];
+         D27.f[DIR_000] = &DD27[DIR_000 * numberOfLBnodes];
+         D27.f[DIR_PPP] = &DD27[DIR_PPP * numberOfLBnodes];
+         D27.f[DIR_MMP] = &DD27[DIR_MMP * numberOfLBnodes];
+         D27.f[DIR_PMP] = &DD27[DIR_PMP * numberOfLBnodes];
+         D27.f[DIR_MPP] = &DD27[DIR_MPP * numberOfLBnodes];
+         D27.f[DIR_PPM] = &DD27[DIR_PPM * numberOfLBnodes];
+         D27.f[DIR_MMM] = &DD27[DIR_MMM * numberOfLBnodes];
+         D27.f[DIR_PMM] = &DD27[DIR_PMM * numberOfLBnodes];
+         D27.f[DIR_MPM] = &DD27[DIR_MPM * numberOfLBnodes];
       } 
       else
       {
-         D27.f[DIR_M00   ] = &DD27[DIR_P00   *size_Mat];
-         D27.f[DIR_P00   ] = &DD27[DIR_M00   *size_Mat];
-         D27.f[DIR_0M0   ] = &DD27[DIR_0P0   *size_Mat];
-         D27.f[DIR_0P0   ] = &DD27[DIR_0M0   *size_Mat];
-         D27.f[DIR_00M   ] = &DD27[DIR_00P   *size_Mat];
-         D27.f[DIR_00P   ] = &DD27[DIR_00M   *size_Mat];
-         D27.f[DIR_MM0  ] = &DD27[DIR_PP0  *size_Mat];
-         D27.f[DIR_PP0  ] = &DD27[DIR_MM0  *size_Mat];
-         D27.f[DIR_MP0  ] = &DD27[DIR_PM0  *size_Mat];
-         D27.f[DIR_PM0  ] = &DD27[DIR_MP0  *size_Mat];
-         D27.f[DIR_M0M  ] = &DD27[DIR_P0P  *size_Mat];
-         D27.f[DIR_P0P  ] = &DD27[DIR_M0M  *size_Mat];
-         D27.f[DIR_M0P  ] = &DD27[DIR_P0M  *size_Mat];
-         D27.f[DIR_P0M  ] = &DD27[DIR_M0P  *size_Mat];
-         D27.f[DIR_0MM  ] = &DD27[DIR_0PP  *size_Mat];
-         D27.f[DIR_0PP  ] = &DD27[DIR_0MM  *size_Mat];
-         D27.f[DIR_0MP  ] = &DD27[DIR_0PM  *size_Mat];
-         D27.f[DIR_0PM  ] = &DD27[DIR_0MP  *size_Mat];
-         D27.f[DIR_000] = &DD27[DIR_000*size_Mat];
-         D27.f[DIR_PPP ] = &DD27[DIR_MMM *size_Mat];
-         D27.f[DIR_MMP ] = &DD27[DIR_PPM *size_Mat];
-         D27.f[DIR_PMP ] = &DD27[DIR_MPM *size_Mat];
-         D27.f[DIR_MPP ] = &DD27[DIR_PMM *size_Mat];
-         D27.f[DIR_PPM ] = &DD27[DIR_MMP *size_Mat];
-         D27.f[DIR_MMM ] = &DD27[DIR_PPP *size_Mat];
-         D27.f[DIR_PMM ] = &DD27[DIR_MPP *size_Mat];
-         D27.f[DIR_MPM ] = &DD27[DIR_PMP *size_Mat];
+         D27.f[DIR_M00] = &DD27[DIR_P00 * numberOfLBnodes];
+         D27.f[DIR_P00] = &DD27[DIR_M00 * numberOfLBnodes];
+         D27.f[DIR_0M0] = &DD27[DIR_0P0 * numberOfLBnodes];
+         D27.f[DIR_0P0] = &DD27[DIR_0M0 * numberOfLBnodes];
+         D27.f[DIR_00M] = &DD27[DIR_00P * numberOfLBnodes];
+         D27.f[DIR_00P] = &DD27[DIR_00M * numberOfLBnodes];
+         D27.f[DIR_MM0] = &DD27[DIR_PP0 * numberOfLBnodes];
+         D27.f[DIR_PP0] = &DD27[DIR_MM0 * numberOfLBnodes];
+         D27.f[DIR_MP0] = &DD27[DIR_PM0 * numberOfLBnodes];
+         D27.f[DIR_PM0] = &DD27[DIR_MP0 * numberOfLBnodes];
+         D27.f[DIR_M0M] = &DD27[DIR_P0P * numberOfLBnodes];
+         D27.f[DIR_P0P] = &DD27[DIR_M0M * numberOfLBnodes];
+         D27.f[DIR_M0P] = &DD27[DIR_P0M * numberOfLBnodes];
+         D27.f[DIR_P0M] = &DD27[DIR_M0P * numberOfLBnodes];
+         D27.f[DIR_0MM] = &DD27[DIR_0PP * numberOfLBnodes];
+         D27.f[DIR_0PP] = &DD27[DIR_0MM * numberOfLBnodes];
+         D27.f[DIR_0MP] = &DD27[DIR_0PM * numberOfLBnodes];
+         D27.f[DIR_0PM] = &DD27[DIR_0MP * numberOfLBnodes];
+         D27.f[DIR_000] = &DD27[DIR_000 * numberOfLBnodes];
+         D27.f[DIR_PPP] = &DD27[DIR_MMM * numberOfLBnodes];
+         D27.f[DIR_MMP] = &DD27[DIR_PPM * numberOfLBnodes];
+         D27.f[DIR_PMP] = &DD27[DIR_MPM * numberOfLBnodes];
+         D27.f[DIR_MPP] = &DD27[DIR_PMM * numberOfLBnodes];
+         D27.f[DIR_PPM] = &DD27[DIR_MMP * numberOfLBnodes];
+         D27.f[DIR_MMM] = &DD27[DIR_PPP * numberOfLBnodes];
+         D27.f[DIR_PMM] = &DD27[DIR_MPP * numberOfLBnodes];
+         D27.f[DIR_MPM] = &DD27[DIR_PMP * numberOfLBnodes];
       }
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       //Test
@@ -4729,24 +4729,24 @@ __global__ void QNoSlipADincomp27(
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 	  real q;
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-      q = q_dirE[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M00  ])[kw  ]=(c2o1*feqW27_W  -(f27_E  *(q*omegaD-c1o1)-omegaD*feq27_E  *(q-c1o1))/(omegaD-c1o1)+f27_W  *q)/(q+c1o1);
-      q = q_dirW[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P00  ])[ke  ]=(c2o1*feqW27_E  -(f27_W  *(q*omegaD-c1o1)-omegaD*feq27_W  *(q-c1o1))/(omegaD-c1o1)+f27_E  *q)/(q+c1o1);
-      q = q_dirN[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0M0  ])[ks  ]=(c2o1*feqW27_S  -(f27_N  *(q*omegaD-c1o1)-omegaD*feq27_N  *(q-c1o1))/(omegaD-c1o1)+f27_S  *q)/(q+c1o1);
-      q = q_dirS[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0P0  ])[kn  ]=(c2o1*feqW27_N  -(f27_S  *(q*omegaD-c1o1)-omegaD*feq27_S  *(q-c1o1))/(omegaD-c1o1)+f27_N  *q)/(q+c1o1);
-      q = q_dirT[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_00M  ])[kb  ]=(c2o1*feqW27_B  -(f27_T  *(q*omegaD-c1o1)-omegaD*feq27_T  *(q-c1o1))/(omegaD-c1o1)+f27_B  *q)/(q+c1o1);
-      q = q_dirB[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_00P  ])[kt  ]=(c2o1*feqW27_T  -(f27_B  *(q*omegaD-c1o1)-omegaD*feq27_B  *(q-c1o1))/(omegaD-c1o1)+f27_T  *q)/(q+c1o1);
-      q = q_dirNE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MM0 ])[ksw ]=(c2o1*feqW27_SW -(f27_NE *(q*omegaD-c1o1)-omegaD*feq27_NE *(q-c1o1))/(omegaD-c1o1)+f27_SW *q)/(q+c1o1);
-      q = q_dirSW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PP0 ])[kne ]=(c2o1*feqW27_NE -(f27_SW *(q*omegaD-c1o1)-omegaD*feq27_SW *(q-c1o1))/(omegaD-c1o1)+f27_NE *q)/(q+c1o1);
-      q = q_dirSE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MP0 ])[knw ]=(c2o1*feqW27_NW -(f27_SE *(q*omegaD-c1o1)-omegaD*feq27_SE *(q-c1o1))/(omegaD-c1o1)+f27_NW *q)/(q+c1o1);
-      q = q_dirNW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PM0 ])[kse ]=(c2o1*feqW27_SE -(f27_NW *(q*omegaD-c1o1)-omegaD*feq27_NW *(q-c1o1))/(omegaD-c1o1)+f27_SE *q)/(q+c1o1);
-      q = q_dirTE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M0M ])[kbw ]=(c2o1*feqW27_BW -(f27_TE *(q*omegaD-c1o1)-omegaD*feq27_TE *(q-c1o1))/(omegaD-c1o1)+f27_BW *q)/(q+c1o1);
-      q = q_dirBW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P0P ])[kte ]=(c2o1*feqW27_TE -(f27_BW *(q*omegaD-c1o1)-omegaD*feq27_BW *(q-c1o1))/(omegaD-c1o1)+f27_TE *q)/(q+c1o1);
-      q = q_dirBE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M0P ])[ktw ]=(c2o1*feqW27_TW -(f27_BE *(q*omegaD-c1o1)-omegaD*feq27_BE *(q-c1o1))/(omegaD-c1o1)+f27_TW *q)/(q+c1o1);
-      q = q_dirTW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P0M ])[kbe ]=(c2o1*feqW27_BE -(f27_TW *(q*omegaD-c1o1)-omegaD*feq27_TW *(q-c1o1))/(omegaD-c1o1)+f27_BE *q)/(q+c1o1);
-      q = q_dirTN[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0MM ])[kbs ]=(c2o1*feqW27_BS -(f27_TN *(q*omegaD-c1o1)-omegaD*feq27_TN *(q-c1o1))/(omegaD-c1o1)+f27_BS *q)/(q+c1o1);
-      q = q_dirBS[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0PP ])[ktn ]=(c2o1*feqW27_TN -(f27_BS *(q*omegaD-c1o1)-omegaD*feq27_BS *(q-c1o1))/(omegaD-c1o1)+f27_TN *q)/(q+c1o1);
-      q = q_dirBN[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0MP ])[kts ]=(c2o1*feqW27_TS -(f27_BN *(q*omegaD-c1o1)-omegaD*feq27_BN *(q-c1o1))/(omegaD-c1o1)+f27_TS *q)/(q+c1o1);
-      q = q_dirTS[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0PM ])[kbn ]=(c2o1*feqW27_BN -(f27_TS *(q*omegaD-c1o1)-omegaD*feq27_TS *(q-c1o1))/(omegaD-c1o1)+f27_BN *q)/(q+c1o1);
+      q = q_dirE[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M00])[kw  ]=(c2o1*feqW27_W  -(f27_E  *(q*omegaD-c1o1)-omegaD*feq27_E  *(q-c1o1))/(omegaD-c1o1)+f27_W  *q)/(q+c1o1);
+      q = q_dirW[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P00])[ke  ]=(c2o1*feqW27_E  -(f27_W  *(q*omegaD-c1o1)-omegaD*feq27_W  *(q-c1o1))/(omegaD-c1o1)+f27_E  *q)/(q+c1o1);
+      q = q_dirN[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0M0])[ks  ]=(c2o1*feqW27_S  -(f27_N  *(q*omegaD-c1o1)-omegaD*feq27_N  *(q-c1o1))/(omegaD-c1o1)+f27_S  *q)/(q+c1o1);
+      q = q_dirS[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0P0])[kn  ]=(c2o1*feqW27_N  -(f27_S  *(q*omegaD-c1o1)-omegaD*feq27_S  *(q-c1o1))/(omegaD-c1o1)+f27_N  *q)/(q+c1o1);
+      q = q_dirT[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_00M])[kb  ]=(c2o1*feqW27_B  -(f27_T  *(q*omegaD-c1o1)-omegaD*feq27_T  *(q-c1o1))/(omegaD-c1o1)+f27_B  *q)/(q+c1o1);
+      q = q_dirB[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_00P])[kt  ]=(c2o1*feqW27_T  -(f27_B  *(q*omegaD-c1o1)-omegaD*feq27_B  *(q-c1o1))/(omegaD-c1o1)+f27_T  *q)/(q+c1o1);
+      q = q_dirNE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MM0])[ksw ]=(c2o1*feqW27_SW -(f27_NE *(q*omegaD-c1o1)-omegaD*feq27_NE *(q-c1o1))/(omegaD-c1o1)+f27_SW *q)/(q+c1o1);
+      q = q_dirSW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PP0])[kne ]=(c2o1*feqW27_NE -(f27_SW *(q*omegaD-c1o1)-omegaD*feq27_SW *(q-c1o1))/(omegaD-c1o1)+f27_NE *q)/(q+c1o1);
+      q = q_dirSE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MP0])[knw ]=(c2o1*feqW27_NW -(f27_SE *(q*omegaD-c1o1)-omegaD*feq27_SE *(q-c1o1))/(omegaD-c1o1)+f27_NW *q)/(q+c1o1);
+      q = q_dirNW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PM0])[kse ]=(c2o1*feqW27_SE -(f27_NW *(q*omegaD-c1o1)-omegaD*feq27_NW *(q-c1o1))/(omegaD-c1o1)+f27_SE *q)/(q+c1o1);
+      q = q_dirTE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M0M])[kbw ]=(c2o1*feqW27_BW -(f27_TE *(q*omegaD-c1o1)-omegaD*feq27_TE *(q-c1o1))/(omegaD-c1o1)+f27_BW *q)/(q+c1o1);
+      q = q_dirBW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P0P])[kte ]=(c2o1*feqW27_TE -(f27_BW *(q*omegaD-c1o1)-omegaD*feq27_BW *(q-c1o1))/(omegaD-c1o1)+f27_TE *q)/(q+c1o1);
+      q = q_dirBE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M0P])[ktw ]=(c2o1*feqW27_TW -(f27_BE *(q*omegaD-c1o1)-omegaD*feq27_BE *(q-c1o1))/(omegaD-c1o1)+f27_TW *q)/(q+c1o1);
+      q = q_dirTW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P0M])[kbe ]=(c2o1*feqW27_BE -(f27_TW *(q*omegaD-c1o1)-omegaD*feq27_TW *(q-c1o1))/(omegaD-c1o1)+f27_BE *q)/(q+c1o1);
+      q = q_dirTN[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0MM])[kbs ]=(c2o1*feqW27_BS -(f27_TN *(q*omegaD-c1o1)-omegaD*feq27_TN *(q-c1o1))/(omegaD-c1o1)+f27_BS *q)/(q+c1o1);
+      q = q_dirBS[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0PP])[ktn ]=(c2o1*feqW27_TN -(f27_BS *(q*omegaD-c1o1)-omegaD*feq27_BS *(q-c1o1))/(omegaD-c1o1)+f27_TN *q)/(q+c1o1);
+      q = q_dirBN[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0MP])[kts ]=(c2o1*feqW27_TS -(f27_BN *(q*omegaD-c1o1)-omegaD*feq27_BN *(q-c1o1))/(omegaD-c1o1)+f27_TS *q)/(q+c1o1);
+      q = q_dirTS[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0PM])[kbn ]=(c2o1*feqW27_BN -(f27_TS *(q*omegaD-c1o1)-omegaD*feq27_TS *(q-c1o1))/(omegaD-c1o1)+f27_BN *q)/(q+c1o1);
       q = q_dirTNE[k]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MMM])[kbsw]=(c2o1*feqW27_BSW-(f27_TNE*(q*omegaD-c1o1)-omegaD*feq27_TNE*(q-c1o1))/(omegaD-c1o1)+f27_BSW*q)/(q+c1o1);
       q = q_dirBSW[k]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PPP])[ktne]=(c2o1*feqW27_TNE-(f27_BSW*(q*omegaD-c1o1)-omegaD*feq27_BSW*(q-c1o1))/(omegaD-c1o1)+f27_TNE*q)/(q+c1o1);
       q = q_dirBNE[k]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MMP])[ktsw]=(c2o1*feqW27_TSW-(f27_BNE*(q*omegaD-c1o1)-omegaD*feq27_BNE*(q-c1o1))/(omegaD-c1o1)+f27_TSW*q)/(q+c1o1);
@@ -4811,91 +4811,91 @@ __global__ void QADVeloIncomp7(
 											unsigned int* neighborX,
 											unsigned int* neighborY,
 											unsigned int* neighborZ,
-											unsigned int size_Mat, 
+											unsigned long long numberOfLBnodes, 
 											bool isEvenTimestep)
 {
    //Distributions27 D;
    //if (isEvenTimestep==true)
    //{
-   //   D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-   //   D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-   //   D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-   //   D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-   //   D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-   //   D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-   //   D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-   //   D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-   //   D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-   //   D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-   //   D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-   //   D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-   //   D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-   //   D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-   //   D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-   //   D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-   //   D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-   //   D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-   //   D.f[DIR_000] = &DD[DIR_000*size_Mat];
-   //   D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-   //   D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-   //   D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-   //   D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-   //   D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-   //   D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-   //   D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-   //   D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+   //   D.f[DIR_P00] = &DD[DIR_P00 * size_Mat];
+   //   D.f[DIR_M00] = &DD[DIR_M00 * size_Mat];
+   //   D.f[DIR_0P0] = &DD[DIR_0P0 * size_Mat];
+   //   D.f[DIR_0M0] = &DD[DIR_0M0 * size_Mat];
+   //   D.f[DIR_00P] = &DD[DIR_00P * size_Mat];
+   //   D.f[DIR_00M] = &DD[DIR_00M * size_Mat];
+   //   D.f[DIR_PP0] = &DD[DIR_PP0 * size_Mat];
+   //   D.f[DIR_MM0] = &DD[DIR_MM0 * size_Mat];
+   //   D.f[DIR_PM0] = &DD[DIR_PM0 * size_Mat];
+   //   D.f[DIR_MP0] = &DD[DIR_MP0 * size_Mat];
+   //   D.f[DIR_P0P] = &DD[DIR_P0P * size_Mat];
+   //   D.f[DIR_M0M] = &DD[DIR_M0M * size_Mat];
+   //   D.f[DIR_P0M] = &DD[DIR_P0M * size_Mat];
+   //   D.f[DIR_M0P] = &DD[DIR_M0P * size_Mat];
+   //   D.f[DIR_0PP] = &DD[DIR_0PP * size_Mat];
+   //   D.f[DIR_0MM] = &DD[DIR_0MM * size_Mat];
+   //   D.f[DIR_0PM] = &DD[DIR_0PM * size_Mat];
+   //   D.f[DIR_0MP] = &DD[DIR_0MP * size_Mat];
+   //   D.f[DIR_000] = &DD[DIR_000 * size_Mat];
+   //   D.f[DIR_PPP] = &DD[DIR_PPP * size_Mat];
+   //   D.f[DIR_MMP] = &DD[DIR_MMP * size_Mat];
+   //   D.f[DIR_PMP] = &DD[DIR_PMP * size_Mat];
+   //   D.f[DIR_MPP] = &DD[DIR_MPP * size_Mat];
+   //   D.f[DIR_PPM] = &DD[DIR_PPM * size_Mat];
+   //   D.f[DIR_MMM] = &DD[DIR_MMM * size_Mat];
+   //   D.f[DIR_PMM] = &DD[DIR_PMM * size_Mat];
+   //   D.f[DIR_MPM] = &DD[DIR_MPM * size_Mat];
    //} 
    //else
    //{
-   //   D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-   //   D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-   //   D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-   //   D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-   //   D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-   //   D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-   //   D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-   //   D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-   //   D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-   //   D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-   //   D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-   //   D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-   //   D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-   //   D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-   //   D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-   //   D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-   //   D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-   //   D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-   //   D.f[DIR_000] = &DD[DIR_000*size_Mat];
-   //   D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-   //   D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-   //   D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-   //   D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-   //   D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-   //   D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-   //   D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-   //   D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+   //   D.f[DIR_M00] = &DD[DIR_P00 * size_Mat];
+   //   D.f[DIR_P00] = &DD[DIR_M00 * size_Mat];
+   //   D.f[DIR_0M0] = &DD[DIR_0P0 * size_Mat];
+   //   D.f[DIR_0P0] = &DD[DIR_0M0 * size_Mat];
+   //   D.f[DIR_00M] = &DD[DIR_00P * size_Mat];
+   //   D.f[DIR_00P] = &DD[DIR_00M * size_Mat];
+   //   D.f[DIR_MM0] = &DD[DIR_PP0 * size_Mat];
+   //   D.f[DIR_PP0] = &DD[DIR_MM0 * size_Mat];
+   //   D.f[DIR_MP0] = &DD[DIR_PM0 * size_Mat];
+   //   D.f[DIR_PM0] = &DD[DIR_MP0 * size_Mat];
+   //   D.f[DIR_M0M] = &DD[DIR_P0P * size_Mat];
+   //   D.f[DIR_P0P] = &DD[DIR_M0M * size_Mat];
+   //   D.f[DIR_M0P] = &DD[DIR_P0M * size_Mat];
+   //   D.f[DIR_P0M] = &DD[DIR_M0P * size_Mat];
+   //   D.f[DIR_0MM] = &DD[DIR_0PP * size_Mat];
+   //   D.f[DIR_0PP] = &DD[DIR_0MM * size_Mat];
+   //   D.f[DIR_0MP] = &DD[DIR_0PM * size_Mat];
+   //   D.f[DIR_0PM] = &DD[DIR_0MP * size_Mat];
+   //   D.f[DIR_000] = &DD[DIR_000 * size_Mat];
+   //   D.f[DIR_PPP] = &DD[DIR_MMM * size_Mat];
+   //   D.f[DIR_MMP] = &DD[DIR_PPM * size_Mat];
+   //   D.f[DIR_PMP] = &DD[DIR_MPM * size_Mat];
+   //   D.f[DIR_MPP] = &DD[DIR_PMM * size_Mat];
+   //   D.f[DIR_PPM] = &DD[DIR_MMP * size_Mat];
+   //   D.f[DIR_MMM] = &DD[DIR_PPP * size_Mat];
+   //   D.f[DIR_PMM] = &DD[DIR_MPP * size_Mat];
+   //   D.f[DIR_MPM] = &DD[DIR_PMP * size_Mat];
    //}
 
    Distributions7 D7;
    if (isEvenTimestep==true)
    {
-      D7.f[0] = &DD7[0*size_Mat];
-      D7.f[1] = &DD7[1*size_Mat];
-      D7.f[2] = &DD7[2*size_Mat];
-      D7.f[3] = &DD7[3*size_Mat];
-      D7.f[4] = &DD7[4*size_Mat];
-      D7.f[5] = &DD7[5*size_Mat];
-      D7.f[6] = &DD7[6*size_Mat];
+      D7.f[0] = &DD7[0*numberOfLBnodes];
+      D7.f[1] = &DD7[1*numberOfLBnodes];
+      D7.f[2] = &DD7[2*numberOfLBnodes];
+      D7.f[3] = &DD7[3*numberOfLBnodes];
+      D7.f[4] = &DD7[4*numberOfLBnodes];
+      D7.f[5] = &DD7[5*numberOfLBnodes];
+      D7.f[6] = &DD7[6*numberOfLBnodes];
    }
    else
    {
-      D7.f[0] = &DD7[0*size_Mat];
-      D7.f[2] = &DD7[1*size_Mat];
-      D7.f[1] = &DD7[2*size_Mat];
-      D7.f[4] = &DD7[3*size_Mat];
-      D7.f[3] = &DD7[4*size_Mat];
-      D7.f[6] = &DD7[5*size_Mat];
-      D7.f[5] = &DD7[6*size_Mat];
+      D7.f[0] = &DD7[0*numberOfLBnodes];
+      D7.f[2] = &DD7[1*numberOfLBnodes];
+      D7.f[1] = &DD7[2*numberOfLBnodes];
+      D7.f[4] = &DD7[3*numberOfLBnodes];
+      D7.f[3] = &DD7[4*numberOfLBnodes];
+      D7.f[6] = &DD7[5*numberOfLBnodes];
+      D7.f[5] = &DD7[6*numberOfLBnodes];
    }
 
 
@@ -4915,12 +4915,12 @@ __global__ void QADVeloIncomp7(
       //////////////////////////////////////////////////////////////////////////////////
       real  *q_dirE,   *q_dirW,   *q_dirN,   *q_dirS,   *q_dirT,   *q_dirB; 
 
-      q_dirE   = &QQ[DIR_P00   * numberOfBCnodes];
-      q_dirW   = &QQ[DIR_M00   * numberOfBCnodes];
-      q_dirN   = &QQ[DIR_0P0   * numberOfBCnodes];
-      q_dirS   = &QQ[DIR_0M0   * numberOfBCnodes];
-      q_dirT   = &QQ[DIR_00P   * numberOfBCnodes];
-      q_dirB   = &QQ[DIR_00M   * numberOfBCnodes];
+      q_dirE   = &QQ[DIR_P00 * numberOfBCnodes];
+      q_dirW   = &QQ[DIR_M00 * numberOfBCnodes];
+      q_dirN   = &QQ[DIR_0P0 * numberOfBCnodes];
+      q_dirS   = &QQ[DIR_0M0 * numberOfBCnodes];
+      q_dirT   = &QQ[DIR_00P * numberOfBCnodes];
+      q_dirB   = &QQ[DIR_00M * numberOfBCnodes];
       //////////////////////////////////////////////////////////////////////////////////
       //index
       unsigned int KQK  = k_Q[k];
@@ -4952,32 +4952,32 @@ __global__ void QADVeloIncomp7(
       //unsigned int ktne = KQK;
       //unsigned int kbsw = neighborZ[ksw];
       ////////////////////////////////////////////////////////////////////////////////
-      //real f_W    = (D.f[DIR_P00   ])[ke   ];
-      //real f_E    = (D.f[DIR_M00   ])[kw   ];
-      //real f_S    = (D.f[DIR_0P0   ])[kn   ];
-      //real f_N    = (D.f[DIR_0M0   ])[ks   ];
-      //real f_B    = (D.f[DIR_00P   ])[kt   ];
-      //real f_T    = (D.f[DIR_00M   ])[kb   ];
-      //real f_SW   = (D.f[DIR_PP0  ])[kne  ];
-      //real f_NE   = (D.f[DIR_MM0  ])[ksw  ];
-      //real f_NW   = (D.f[DIR_PM0  ])[kse  ];
-      //real f_SE   = (D.f[DIR_MP0  ])[knw  ];
-      //real f_BW   = (D.f[DIR_P0P  ])[kte  ];
-      //real f_TE   = (D.f[DIR_M0M  ])[kbw  ];
-      //real f_TW   = (D.f[DIR_P0M  ])[kbe  ];
-      //real f_BE   = (D.f[DIR_M0P  ])[ktw  ];
-      //real f_BS   = (D.f[DIR_0PP  ])[ktn  ];
-      //real f_TN   = (D.f[DIR_0MM  ])[kbs  ];
-      //real f_TS   = (D.f[DIR_0PM  ])[kbn  ];
-      //real f_BN   = (D.f[DIR_0MP  ])[kts  ];
-      //real f_BSW  = (D.f[DIR_PPP ])[ktne ];
-      //real f_BNE  = (D.f[DIR_MMP ])[ktsw ];
-      //real f_BNW  = (D.f[DIR_PMP ])[ktse ];
-      //real f_BSE  = (D.f[DIR_MPP ])[ktnw ];
-      //real f_TSW  = (D.f[DIR_PPM ])[kbne ];
-      //real f_TNE  = (D.f[DIR_MMM ])[kbsw ];
-      //real f_TNW  = (D.f[DIR_PMM ])[kbse ];
-      //real f_TSE  = (D.f[DIR_MPM ])[kbnw ];
+      //real f_W    = (D.f[DIR_P00])[ke   ];
+      //real f_E    = (D.f[DIR_M00])[kw   ];
+      //real f_S    = (D.f[DIR_0P0])[kn   ];
+      //real f_N    = (D.f[DIR_0M0])[ks   ];
+      //real f_B    = (D.f[DIR_00P])[kt   ];
+      //real f_T    = (D.f[DIR_00M])[kb   ];
+      //real f_SW   = (D.f[DIR_PP0])[kne  ];
+      //real f_NE   = (D.f[DIR_MM0])[ksw  ];
+      //real f_NW   = (D.f[DIR_PM0])[kse  ];
+      //real f_SE   = (D.f[DIR_MP0])[knw  ];
+      //real f_BW   = (D.f[DIR_P0P])[kte  ];
+      //real f_TE   = (D.f[DIR_M0M])[kbw  ];
+      //real f_TW   = (D.f[DIR_P0M])[kbe  ];
+      //real f_BE   = (D.f[DIR_M0P])[ktw  ];
+      //real f_BS   = (D.f[DIR_0PP])[ktn  ];
+      //real f_TN   = (D.f[DIR_0MM])[kbs  ];
+      //real f_TS   = (D.f[DIR_0PM])[kbn  ];
+      //real f_BN   = (D.f[DIR_0MP])[kts  ];
+      //real f_BSW  = (D.f[DIR_PPP])[ktne ];
+      //real f_BNE  = (D.f[DIR_MMP])[ktsw ];
+      //real f_BNW  = (D.f[DIR_PMP])[ktse ];
+      //real f_BSE  = (D.f[DIR_MPP])[ktnw ];
+      //real f_TSW  = (D.f[DIR_PPM])[kbne ];
+      //real f_TNE  = (D.f[DIR_MMM])[kbsw ];
+      //real f_TNW  = (D.f[DIR_PMM])[kbse ];
+      //real f_TSE  = (D.f[DIR_MPM])[kbnw ];
       ////////////////////////////////////////////////////////////////////////////////
       //real vx1_Inflow   = c0o1;
       //real vx2_Inflow   = velo[k];
@@ -5091,23 +5091,23 @@ __global__ void QADVeloIncomp7(
       //pointertausch
       if (isEvenTimestep==false)
       {
-         D7.f[0] = &DD7[0*size_Mat];
-         D7.f[1] = &DD7[1*size_Mat];
-         D7.f[2] = &DD7[2*size_Mat];
-         D7.f[3] = &DD7[3*size_Mat];
-         D7.f[4] = &DD7[4*size_Mat];
-         D7.f[5] = &DD7[5*size_Mat];
-         D7.f[6] = &DD7[6*size_Mat];
+         D7.f[0] = &DD7[0*numberOfLBnodes];
+         D7.f[1] = &DD7[1*numberOfLBnodes];
+         D7.f[2] = &DD7[2*numberOfLBnodes];
+         D7.f[3] = &DD7[3*numberOfLBnodes];
+         D7.f[4] = &DD7[4*numberOfLBnodes];
+         D7.f[5] = &DD7[5*numberOfLBnodes];
+         D7.f[6] = &DD7[6*numberOfLBnodes];
       }
       else
       {
-         D7.f[0] = &DD7[0*size_Mat];
-         D7.f[2] = &DD7[1*size_Mat];
-         D7.f[1] = &DD7[2*size_Mat];
-         D7.f[4] = &DD7[3*size_Mat];
-         D7.f[3] = &DD7[4*size_Mat];
-         D7.f[6] = &DD7[5*size_Mat];
-         D7.f[5] = &DD7[6*size_Mat];
+         D7.f[0] = &DD7[0*numberOfLBnodes];
+         D7.f[2] = &DD7[1*numberOfLBnodes];
+         D7.f[1] = &DD7[2*numberOfLBnodes];
+         D7.f[4] = &DD7[3*numberOfLBnodes];
+         D7.f[3] = &DD7[4*numberOfLBnodes];
+         D7.f[6] = &DD7[5*numberOfLBnodes];
+         D7.f[5] = &DD7[6*numberOfLBnodes];
       }
 
       ////////////////////////////////////////////////////////////////////////////
@@ -5289,131 +5289,131 @@ __global__ void QADVeloIncomp27(
 											unsigned int* neighborX,
 											unsigned int* neighborY,
 											unsigned int* neighborZ,
-											unsigned int size_Mat, 
+											unsigned long long numberOfLBnodes, 
 											bool isEvenTimestep)
 {
    Distributions27 D;
    if (isEvenTimestep==true)
    {
-      D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+      D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
    } 
    else
    {
-      D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+      D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
    }
 
    Distributions27 D27;
    if (isEvenTimestep==true)
    {
-      D27.f[DIR_P00   ] = &DD27[DIR_P00   *size_Mat];
-      D27.f[DIR_M00   ] = &DD27[DIR_M00   *size_Mat];
-      D27.f[DIR_0P0   ] = &DD27[DIR_0P0   *size_Mat];
-      D27.f[DIR_0M0   ] = &DD27[DIR_0M0   *size_Mat];
-      D27.f[DIR_00P   ] = &DD27[DIR_00P   *size_Mat];
-      D27.f[DIR_00M   ] = &DD27[DIR_00M   *size_Mat];
-      D27.f[DIR_PP0  ] = &DD27[DIR_PP0  *size_Mat];
-      D27.f[DIR_MM0  ] = &DD27[DIR_MM0  *size_Mat];
-      D27.f[DIR_PM0  ] = &DD27[DIR_PM0  *size_Mat];
-      D27.f[DIR_MP0  ] = &DD27[DIR_MP0  *size_Mat];
-      D27.f[DIR_P0P  ] = &DD27[DIR_P0P  *size_Mat];
-      D27.f[DIR_M0M  ] = &DD27[DIR_M0M  *size_Mat];
-      D27.f[DIR_P0M  ] = &DD27[DIR_P0M  *size_Mat];
-      D27.f[DIR_M0P  ] = &DD27[DIR_M0P  *size_Mat];
-      D27.f[DIR_0PP  ] = &DD27[DIR_0PP  *size_Mat];
-      D27.f[DIR_0MM  ] = &DD27[DIR_0MM  *size_Mat];
-      D27.f[DIR_0PM  ] = &DD27[DIR_0PM  *size_Mat];
-      D27.f[DIR_0MP  ] = &DD27[DIR_0MP  *size_Mat];
-      D27.f[DIR_000] = &DD27[DIR_000*size_Mat];
-      D27.f[DIR_PPP ] = &DD27[DIR_PPP *size_Mat];
-      D27.f[DIR_MMP ] = &DD27[DIR_MMP *size_Mat];
-      D27.f[DIR_PMP ] = &DD27[DIR_PMP *size_Mat];
-      D27.f[DIR_MPP ] = &DD27[DIR_MPP *size_Mat];
-      D27.f[DIR_PPM ] = &DD27[DIR_PPM *size_Mat];
-      D27.f[DIR_MMM ] = &DD27[DIR_MMM *size_Mat];
-      D27.f[DIR_PMM ] = &DD27[DIR_PMM *size_Mat];
-      D27.f[DIR_MPM ] = &DD27[DIR_MPM *size_Mat];
+      D27.f[DIR_P00] = &DD27[DIR_P00 * numberOfLBnodes];
+      D27.f[DIR_M00] = &DD27[DIR_M00 * numberOfLBnodes];
+      D27.f[DIR_0P0] = &DD27[DIR_0P0 * numberOfLBnodes];
+      D27.f[DIR_0M0] = &DD27[DIR_0M0 * numberOfLBnodes];
+      D27.f[DIR_00P] = &DD27[DIR_00P * numberOfLBnodes];
+      D27.f[DIR_00M] = &DD27[DIR_00M * numberOfLBnodes];
+      D27.f[DIR_PP0] = &DD27[DIR_PP0 * numberOfLBnodes];
+      D27.f[DIR_MM0] = &DD27[DIR_MM0 * numberOfLBnodes];
+      D27.f[DIR_PM0] = &DD27[DIR_PM0 * numberOfLBnodes];
+      D27.f[DIR_MP0] = &DD27[DIR_MP0 * numberOfLBnodes];
+      D27.f[DIR_P0P] = &DD27[DIR_P0P * numberOfLBnodes];
+      D27.f[DIR_M0M] = &DD27[DIR_M0M * numberOfLBnodes];
+      D27.f[DIR_P0M] = &DD27[DIR_P0M * numberOfLBnodes];
+      D27.f[DIR_M0P] = &DD27[DIR_M0P * numberOfLBnodes];
+      D27.f[DIR_0PP] = &DD27[DIR_0PP * numberOfLBnodes];
+      D27.f[DIR_0MM] = &DD27[DIR_0MM * numberOfLBnodes];
+      D27.f[DIR_0PM] = &DD27[DIR_0PM * numberOfLBnodes];
+      D27.f[DIR_0MP] = &DD27[DIR_0MP * numberOfLBnodes];
+      D27.f[DIR_000] = &DD27[DIR_000 * numberOfLBnodes];
+      D27.f[DIR_PPP] = &DD27[DIR_PPP * numberOfLBnodes];
+      D27.f[DIR_MMP] = &DD27[DIR_MMP * numberOfLBnodes];
+      D27.f[DIR_PMP] = &DD27[DIR_PMP * numberOfLBnodes];
+      D27.f[DIR_MPP] = &DD27[DIR_MPP * numberOfLBnodes];
+      D27.f[DIR_PPM] = &DD27[DIR_PPM * numberOfLBnodes];
+      D27.f[DIR_MMM] = &DD27[DIR_MMM * numberOfLBnodes];
+      D27.f[DIR_PMM] = &DD27[DIR_PMM * numberOfLBnodes];
+      D27.f[DIR_MPM] = &DD27[DIR_MPM * numberOfLBnodes];
    } 
    else
    {
-      D27.f[DIR_M00   ] = &DD27[DIR_P00   *size_Mat];
-      D27.f[DIR_P00   ] = &DD27[DIR_M00   *size_Mat];
-      D27.f[DIR_0M0   ] = &DD27[DIR_0P0   *size_Mat];
-      D27.f[DIR_0P0   ] = &DD27[DIR_0M0   *size_Mat];
-      D27.f[DIR_00M   ] = &DD27[DIR_00P   *size_Mat];
-      D27.f[DIR_00P   ] = &DD27[DIR_00M   *size_Mat];
-      D27.f[DIR_MM0  ] = &DD27[DIR_PP0  *size_Mat];
-      D27.f[DIR_PP0  ] = &DD27[DIR_MM0  *size_Mat];
-      D27.f[DIR_MP0  ] = &DD27[DIR_PM0  *size_Mat];
-      D27.f[DIR_PM0  ] = &DD27[DIR_MP0  *size_Mat];
-      D27.f[DIR_M0M  ] = &DD27[DIR_P0P  *size_Mat];
-      D27.f[DIR_P0P  ] = &DD27[DIR_M0M  *size_Mat];
-      D27.f[DIR_M0P  ] = &DD27[DIR_P0M  *size_Mat];
-      D27.f[DIR_P0M  ] = &DD27[DIR_M0P  *size_Mat];
-      D27.f[DIR_0MM  ] = &DD27[DIR_0PP  *size_Mat];
-      D27.f[DIR_0PP  ] = &DD27[DIR_0MM  *size_Mat];
-      D27.f[DIR_0MP  ] = &DD27[DIR_0PM  *size_Mat];
-      D27.f[DIR_0PM  ] = &DD27[DIR_0MP  *size_Mat];
-      D27.f[DIR_000] = &DD27[DIR_000*size_Mat];
-      D27.f[DIR_PPP ] = &DD27[DIR_MMM *size_Mat];
-      D27.f[DIR_MMP ] = &DD27[DIR_PPM *size_Mat];
-      D27.f[DIR_PMP ] = &DD27[DIR_MPM *size_Mat];
-      D27.f[DIR_MPP ] = &DD27[DIR_PMM *size_Mat];
-      D27.f[DIR_PPM ] = &DD27[DIR_MMP *size_Mat];
-      D27.f[DIR_MMM ] = &DD27[DIR_PPP *size_Mat];
-      D27.f[DIR_PMM ] = &DD27[DIR_MPP *size_Mat];
-      D27.f[DIR_MPM ] = &DD27[DIR_PMP *size_Mat];
+      D27.f[DIR_M00] = &DD27[DIR_P00 * numberOfLBnodes];
+      D27.f[DIR_P00] = &DD27[DIR_M00 * numberOfLBnodes];
+      D27.f[DIR_0M0] = &DD27[DIR_0P0 * numberOfLBnodes];
+      D27.f[DIR_0P0] = &DD27[DIR_0M0 * numberOfLBnodes];
+      D27.f[DIR_00M] = &DD27[DIR_00P * numberOfLBnodes];
+      D27.f[DIR_00P] = &DD27[DIR_00M * numberOfLBnodes];
+      D27.f[DIR_MM0] = &DD27[DIR_PP0 * numberOfLBnodes];
+      D27.f[DIR_PP0] = &DD27[DIR_MM0 * numberOfLBnodes];
+      D27.f[DIR_MP0] = &DD27[DIR_PM0 * numberOfLBnodes];
+      D27.f[DIR_PM0] = &DD27[DIR_MP0 * numberOfLBnodes];
+      D27.f[DIR_M0M] = &DD27[DIR_P0P * numberOfLBnodes];
+      D27.f[DIR_P0P] = &DD27[DIR_M0M * numberOfLBnodes];
+      D27.f[DIR_M0P] = &DD27[DIR_P0M * numberOfLBnodes];
+      D27.f[DIR_P0M] = &DD27[DIR_M0P * numberOfLBnodes];
+      D27.f[DIR_0MM] = &DD27[DIR_0PP * numberOfLBnodes];
+      D27.f[DIR_0PP] = &DD27[DIR_0MM * numberOfLBnodes];
+      D27.f[DIR_0MP] = &DD27[DIR_0PM * numberOfLBnodes];
+      D27.f[DIR_0PM] = &DD27[DIR_0MP * numberOfLBnodes];
+      D27.f[DIR_000] = &DD27[DIR_000 * numberOfLBnodes];
+      D27.f[DIR_PPP] = &DD27[DIR_MMM * numberOfLBnodes];
+      D27.f[DIR_MMP] = &DD27[DIR_PPM * numberOfLBnodes];
+      D27.f[DIR_PMP] = &DD27[DIR_MPM * numberOfLBnodes];
+      D27.f[DIR_MPP] = &DD27[DIR_PMM * numberOfLBnodes];
+      D27.f[DIR_PPM] = &DD27[DIR_MMP * numberOfLBnodes];
+      D27.f[DIR_MMM] = &DD27[DIR_PPP * numberOfLBnodes];
+      D27.f[DIR_PMM] = &DD27[DIR_MPP * numberOfLBnodes];
+      D27.f[DIR_MPM] = &DD27[DIR_PMP * numberOfLBnodes];
    }
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  x = threadIdx.x;  // Globaler x-Index 
@@ -5434,24 +5434,24 @@ __global__ void QADVeloIncomp27(
          *q_dirBE,  *q_dirTW,  *q_dirTN,  *q_dirBS,  *q_dirBN,  *q_dirTS,
          *q_dirTNE, *q_dirTSW, *q_dirTSE, *q_dirTNW, *q_dirBNE, *q_dirBSW,
          *q_dirBSE, *q_dirBNW; 
-      q_dirE   = &QQ[DIR_P00   * numberOfBCnodes];
-      q_dirW   = &QQ[DIR_M00   * numberOfBCnodes];
-      q_dirN   = &QQ[DIR_0P0   * numberOfBCnodes];
-      q_dirS   = &QQ[DIR_0M0   * numberOfBCnodes];
-      q_dirT   = &QQ[DIR_00P   * numberOfBCnodes];
-      q_dirB   = &QQ[DIR_00M   * numberOfBCnodes];
-      q_dirNE  = &QQ[DIR_PP0  * numberOfBCnodes];
-      q_dirSW  = &QQ[DIR_MM0  * numberOfBCnodes];
-      q_dirSE  = &QQ[DIR_PM0  * numberOfBCnodes];
-      q_dirNW  = &QQ[DIR_MP0  * numberOfBCnodes];
-      q_dirTE  = &QQ[DIR_P0P  * numberOfBCnodes];
-      q_dirBW  = &QQ[DIR_M0M  * numberOfBCnodes];
-      q_dirBE  = &QQ[DIR_P0M  * numberOfBCnodes];
-      q_dirTW  = &QQ[DIR_M0P  * numberOfBCnodes];
-      q_dirTN  = &QQ[DIR_0PP  * numberOfBCnodes];
-      q_dirBS  = &QQ[DIR_0MM  * numberOfBCnodes];
-      q_dirBN  = &QQ[DIR_0PM  * numberOfBCnodes];
-      q_dirTS  = &QQ[DIR_0MP  * numberOfBCnodes];
+      q_dirE   = &QQ[DIR_P00 * numberOfBCnodes];
+      q_dirW   = &QQ[DIR_M00 * numberOfBCnodes];
+      q_dirN   = &QQ[DIR_0P0 * numberOfBCnodes];
+      q_dirS   = &QQ[DIR_0M0 * numberOfBCnodes];
+      q_dirT   = &QQ[DIR_00P * numberOfBCnodes];
+      q_dirB   = &QQ[DIR_00M * numberOfBCnodes];
+      q_dirNE  = &QQ[DIR_PP0 * numberOfBCnodes];
+      q_dirSW  = &QQ[DIR_MM0 * numberOfBCnodes];
+      q_dirSE  = &QQ[DIR_PM0 * numberOfBCnodes];
+      q_dirNW  = &QQ[DIR_MP0 * numberOfBCnodes];
+      q_dirTE  = &QQ[DIR_P0P * numberOfBCnodes];
+      q_dirBW  = &QQ[DIR_M0M * numberOfBCnodes];
+      q_dirBE  = &QQ[DIR_P0M * numberOfBCnodes];
+      q_dirTW  = &QQ[DIR_M0P * numberOfBCnodes];
+      q_dirTN  = &QQ[DIR_0PP * numberOfBCnodes];
+      q_dirBS  = &QQ[DIR_0MM * numberOfBCnodes];
+      q_dirBN  = &QQ[DIR_0PM * numberOfBCnodes];
+      q_dirTS  = &QQ[DIR_0MP * numberOfBCnodes];
       q_dirTNE = &QQ[DIR_PPP * numberOfBCnodes];
       q_dirTSW = &QQ[DIR_MMP * numberOfBCnodes];
       q_dirTSE = &QQ[DIR_PMP * numberOfBCnodes];
@@ -5491,65 +5491,65 @@ __global__ void QADVeloIncomp27(
       unsigned int ktne = KQK;
       unsigned int kbsw = neighborZ[ksw];
       ////////////////////////////////////////////////////////////////////////////////
-      real f_W    = (D.f[DIR_P00   ])[ke   ];
-      real f_E    = (D.f[DIR_M00   ])[kw   ];
-      real f_S    = (D.f[DIR_0P0   ])[kn   ];
-      real f_N    = (D.f[DIR_0M0   ])[ks   ];
-      real f_B    = (D.f[DIR_00P   ])[kt   ];
-      real f_T    = (D.f[DIR_00M   ])[kb   ];
-      real f_SW   = (D.f[DIR_PP0  ])[kne  ];
-      real f_NE   = (D.f[DIR_MM0  ])[ksw  ];
-      real f_NW   = (D.f[DIR_PM0  ])[kse  ];
-      real f_SE   = (D.f[DIR_MP0  ])[knw  ];
-      real f_BW   = (D.f[DIR_P0P  ])[kte  ];
-      real f_TE   = (D.f[DIR_M0M  ])[kbw  ];
-      real f_TW   = (D.f[DIR_P0M  ])[kbe  ];
-      real f_BE   = (D.f[DIR_M0P  ])[ktw  ];
-      real f_BS   = (D.f[DIR_0PP  ])[ktn  ];
-      real f_TN   = (D.f[DIR_0MM  ])[kbs  ];
-      real f_TS   = (D.f[DIR_0PM  ])[kbn  ];
-      real f_BN   = (D.f[DIR_0MP  ])[kts  ];
+      real f_W    = (D.f[DIR_P00])[ke   ];
+      real f_E    = (D.f[DIR_M00])[kw   ];
+      real f_S    = (D.f[DIR_0P0])[kn   ];
+      real f_N    = (D.f[DIR_0M0])[ks   ];
+      real f_B    = (D.f[DIR_00P])[kt   ];
+      real f_T    = (D.f[DIR_00M])[kb   ];
+      real f_SW   = (D.f[DIR_PP0])[kne  ];
+      real f_NE   = (D.f[DIR_MM0])[ksw  ];
+      real f_NW   = (D.f[DIR_PM0])[kse  ];
+      real f_SE   = (D.f[DIR_MP0])[knw  ];
+      real f_BW   = (D.f[DIR_P0P])[kte  ];
+      real f_TE   = (D.f[DIR_M0M])[kbw  ];
+      real f_TW   = (D.f[DIR_P0M])[kbe  ];
+      real f_BE   = (D.f[DIR_M0P])[ktw  ];
+      real f_BS   = (D.f[DIR_0PP])[ktn  ];
+      real f_TN   = (D.f[DIR_0MM])[kbs  ];
+      real f_TS   = (D.f[DIR_0PM])[kbn  ];
+      real f_BN   = (D.f[DIR_0MP])[kts  ];
       //real f_ZERO = (D.f[DIR_000])[kzero];
-      real f_BSW  = (D.f[DIR_PPP ])[ktne ];
-      real f_BNE  = (D.f[DIR_MMP ])[ktsw ];
-      real f_BNW  = (D.f[DIR_PMP ])[ktse ];
-      real f_BSE  = (D.f[DIR_MPP ])[ktnw ];
-      real f_TSW  = (D.f[DIR_PPM ])[kbne ];
-      real f_TNE  = (D.f[DIR_MMM ])[kbsw ];
-      real f_TNW  = (D.f[DIR_PMM ])[kbse ];
-      real f_TSE  = (D.f[DIR_MPM ])[kbnw ];
+      real f_BSW  = (D.f[DIR_PPP])[ktne ];
+      real f_BNE  = (D.f[DIR_MMP])[ktsw ];
+      real f_BNW  = (D.f[DIR_PMP])[ktse ];
+      real f_BSE  = (D.f[DIR_MPP])[ktnw ];
+      real f_TSW  = (D.f[DIR_PPM])[kbne ];
+      real f_TNE  = (D.f[DIR_MMM])[kbsw ];
+      real f_TNW  = (D.f[DIR_PMM])[kbse ];
+      real f_TSE  = (D.f[DIR_MPM])[kbnw ];
       ////////////////////////////////////////////////////////////////////////////////
       real vx1 = ((f_TNE-f_BSW)+(f_BNE-f_TSW)+(f_TSE-f_BNW)+(f_BSE-f_TNW) +(f_NE-f_SW)+(f_SE-f_NW)+(f_TE-f_BW)+(f_BE-f_TW)+(f_E-f_W));
       real vx2 = ((f_TNE-f_BSW)+(f_BNE-f_TSW)+(f_BNW-f_TSE)+(f_TNW-f_BSE) +(f_NE-f_SW)+(f_NW-f_SE)+(f_TN-f_BS)+(f_BN-f_TS)+(f_N-f_S));
       real vx3 = ((f_TNE-f_BSW)+(f_TSW-f_BNE)+(f_TSE-f_BNW)+(f_TNW-f_BSE) +(f_TE-f_BW)+(f_TW-f_BE)+(f_TN-f_BS)+(f_TS-f_BN)+(f_T-f_B));
       ////////////////////////////////////////////////////////////////////////////////
-      //real f27_W    = (D27.f[DIR_P00   ])[ke   ];
-      //real f27_E    = (D27.f[DIR_M00   ])[kw   ];
-      //real f27_S    = (D27.f[DIR_0P0   ])[kn   ];
-      //real f27_N    = (D27.f[DIR_0M0   ])[ks   ];
-      //real f27_B    = (D27.f[DIR_00P   ])[kt   ];
-      //real f27_T    = (D27.f[DIR_00M   ])[kb   ];
-      //real f27_SW   = (D27.f[DIR_PP0  ])[kne  ];
-      //real f27_NE   = (D27.f[DIR_MM0  ])[ksw  ];
-      //real f27_NW   = (D27.f[DIR_PM0  ])[kse  ];
-      //real f27_SE   = (D27.f[DIR_MP0  ])[knw  ];
-      //real f27_BW   = (D27.f[DIR_P0P  ])[kte  ];
-      //real f27_TE   = (D27.f[DIR_M0M  ])[kbw  ];
-      //real f27_TW   = (D27.f[DIR_P0M  ])[kbe  ];
-      //real f27_BE   = (D27.f[DIR_M0P  ])[ktw  ];
-      //real f27_BS   = (D27.f[DIR_0PP  ])[ktn  ];
-      //real f27_TN   = (D27.f[DIR_0MM  ])[kbs  ];
-      //real f27_TS   = (D27.f[DIR_0PM  ])[kbn  ];
-      //real f27_BN   = (D27.f[DIR_0MP  ])[kts  ];
+      //real f27_W    = (D27.f[DIR_P00])[ke   ];
+      //real f27_E    = (D27.f[DIR_M00])[kw   ];
+      //real f27_S    = (D27.f[DIR_0P0])[kn   ];
+      //real f27_N    = (D27.f[DIR_0M0])[ks   ];
+      //real f27_B    = (D27.f[DIR_00P])[kt   ];
+      //real f27_T    = (D27.f[DIR_00M])[kb   ];
+      //real f27_SW   = (D27.f[DIR_PP0])[kne  ];
+      //real f27_NE   = (D27.f[DIR_MM0])[ksw  ];
+      //real f27_NW   = (D27.f[DIR_PM0])[kse  ];
+      //real f27_SE   = (D27.f[DIR_MP0])[knw  ];
+      //real f27_BW   = (D27.f[DIR_P0P])[kte  ];
+      //real f27_TE   = (D27.f[DIR_M0M])[kbw  ];
+      //real f27_TW   = (D27.f[DIR_P0M])[kbe  ];
+      //real f27_BE   = (D27.f[DIR_M0P])[ktw  ];
+      //real f27_BS   = (D27.f[DIR_0PP])[ktn  ];
+      //real f27_TN   = (D27.f[DIR_0MM])[kbs  ];
+      //real f27_TS   = (D27.f[DIR_0PM])[kbn  ];
+      //real f27_BN   = (D27.f[DIR_0MP])[kts  ];
       //real f27_ZERO = (D27.f[DIR_000])[kzero];
-      //real f27_BSW  = (D27.f[DIR_PPP ])[ktne ];
-      //real f27_BNE  = (D27.f[DIR_MMP ])[ktsw ];
-      //real f27_BNW  = (D27.f[DIR_PMP ])[ktse ];
-      //real f27_BSE  = (D27.f[DIR_MPP ])[ktnw ];
-      //real f27_TSW  = (D27.f[DIR_PPM ])[kbne ];
-      //real f27_TNE  = (D27.f[DIR_MMM ])[kbsw ];
-      //real f27_TNW  = (D27.f[DIR_PMM ])[kbse ];
-      //real f27_TSE  = (D27.f[DIR_MPM ])[kbnw ];
+      //real f27_BSW  = (D27.f[DIR_PPP])[ktne ];
+      //real f27_BNE  = (D27.f[DIR_MMP])[ktsw ];
+      //real f27_BNW  = (D27.f[DIR_PMP])[ktse ];
+      //real f27_BSE  = (D27.f[DIR_MPP])[ktnw ];
+      //real f27_TSW  = (D27.f[DIR_PPM])[kbne ];
+      //real f27_TNE  = (D27.f[DIR_MMM])[kbsw ];
+      //real f27_TNW  = (D27.f[DIR_PMM])[kbse ];
+      //real f27_TSE  = (D27.f[DIR_MPM])[kbnw ];
       ////////////////////////////////////////////////////////////////////////////////
       real cu_sq=c3o2*(vx1*vx1+vx2*vx2+vx3*vx3);
       ////////////////////////////////////////////////////////////////////////////////
@@ -5630,63 +5630,63 @@ __global__ void QADVeloIncomp27(
       //////////////////////////////////////////////////////////////////////////
       if (isEvenTimestep==false)
       {
-         D27.f[DIR_P00   ] = &DD27[DIR_P00   *size_Mat];
-         D27.f[DIR_M00   ] = &DD27[DIR_M00   *size_Mat];
-         D27.f[DIR_0P0   ] = &DD27[DIR_0P0   *size_Mat];
-         D27.f[DIR_0M0   ] = &DD27[DIR_0M0   *size_Mat];
-         D27.f[DIR_00P   ] = &DD27[DIR_00P   *size_Mat];
-         D27.f[DIR_00M   ] = &DD27[DIR_00M   *size_Mat];
-         D27.f[DIR_PP0  ] = &DD27[DIR_PP0  *size_Mat];
-         D27.f[DIR_MM0  ] = &DD27[DIR_MM0  *size_Mat];
-         D27.f[DIR_PM0  ] = &DD27[DIR_PM0  *size_Mat];
-         D27.f[DIR_MP0  ] = &DD27[DIR_MP0  *size_Mat];
-         D27.f[DIR_P0P  ] = &DD27[DIR_P0P  *size_Mat];
-         D27.f[DIR_M0M  ] = &DD27[DIR_M0M  *size_Mat];
-         D27.f[DIR_P0M  ] = &DD27[DIR_P0M  *size_Mat];
-         D27.f[DIR_M0P  ] = &DD27[DIR_M0P  *size_Mat];
-         D27.f[DIR_0PP  ] = &DD27[DIR_0PP  *size_Mat];
-         D27.f[DIR_0MM  ] = &DD27[DIR_0MM  *size_Mat];
-         D27.f[DIR_0PM  ] = &DD27[DIR_0PM  *size_Mat];
-         D27.f[DIR_0MP  ] = &DD27[DIR_0MP  *size_Mat];
-         D27.f[DIR_000] = &DD27[DIR_000*size_Mat];
-         D27.f[DIR_PPP ] = &DD27[DIR_PPP *size_Mat];
-         D27.f[DIR_MMP ] = &DD27[DIR_MMP *size_Mat];
-         D27.f[DIR_PMP ] = &DD27[DIR_PMP *size_Mat];
-         D27.f[DIR_MPP ] = &DD27[DIR_MPP *size_Mat];
-         D27.f[DIR_PPM ] = &DD27[DIR_PPM *size_Mat];
-         D27.f[DIR_MMM ] = &DD27[DIR_MMM *size_Mat];
-         D27.f[DIR_PMM ] = &DD27[DIR_PMM *size_Mat];
-         D27.f[DIR_MPM ] = &DD27[DIR_MPM *size_Mat];
+         D27.f[DIR_P00] = &DD27[DIR_P00 * numberOfLBnodes];
+         D27.f[DIR_M00] = &DD27[DIR_M00 * numberOfLBnodes];
+         D27.f[DIR_0P0] = &DD27[DIR_0P0 * numberOfLBnodes];
+         D27.f[DIR_0M0] = &DD27[DIR_0M0 * numberOfLBnodes];
+         D27.f[DIR_00P] = &DD27[DIR_00P * numberOfLBnodes];
+         D27.f[DIR_00M] = &DD27[DIR_00M * numberOfLBnodes];
+         D27.f[DIR_PP0] = &DD27[DIR_PP0 * numberOfLBnodes];
+         D27.f[DIR_MM0] = &DD27[DIR_MM0 * numberOfLBnodes];
+         D27.f[DIR_PM0] = &DD27[DIR_PM0 * numberOfLBnodes];
+         D27.f[DIR_MP0] = &DD27[DIR_MP0 * numberOfLBnodes];
+         D27.f[DIR_P0P] = &DD27[DIR_P0P * numberOfLBnodes];
+         D27.f[DIR_M0M] = &DD27[DIR_M0M * numberOfLBnodes];
+         D27.f[DIR_P0M] = &DD27[DIR_P0M * numberOfLBnodes];
+         D27.f[DIR_M0P] = &DD27[DIR_M0P * numberOfLBnodes];
+         D27.f[DIR_0PP] = &DD27[DIR_0PP * numberOfLBnodes];
+         D27.f[DIR_0MM] = &DD27[DIR_0MM * numberOfLBnodes];
+         D27.f[DIR_0PM] = &DD27[DIR_0PM * numberOfLBnodes];
+         D27.f[DIR_0MP] = &DD27[DIR_0MP * numberOfLBnodes];
+         D27.f[DIR_000] = &DD27[DIR_000 * numberOfLBnodes];
+         D27.f[DIR_PPP] = &DD27[DIR_PPP * numberOfLBnodes];
+         D27.f[DIR_MMP] = &DD27[DIR_MMP * numberOfLBnodes];
+         D27.f[DIR_PMP] = &DD27[DIR_PMP * numberOfLBnodes];
+         D27.f[DIR_MPP] = &DD27[DIR_MPP * numberOfLBnodes];
+         D27.f[DIR_PPM] = &DD27[DIR_PPM * numberOfLBnodes];
+         D27.f[DIR_MMM] = &DD27[DIR_MMM * numberOfLBnodes];
+         D27.f[DIR_PMM] = &DD27[DIR_PMM * numberOfLBnodes];
+         D27.f[DIR_MPM] = &DD27[DIR_MPM * numberOfLBnodes];
       } 
       else
       {
-         D27.f[DIR_M00   ] = &DD27[DIR_P00   *size_Mat];
-         D27.f[DIR_P00   ] = &DD27[DIR_M00   *size_Mat];
-         D27.f[DIR_0M0   ] = &DD27[DIR_0P0   *size_Mat];
-         D27.f[DIR_0P0   ] = &DD27[DIR_0M0   *size_Mat];
-         D27.f[DIR_00M   ] = &DD27[DIR_00P   *size_Mat];
-         D27.f[DIR_00P   ] = &DD27[DIR_00M   *size_Mat];
-         D27.f[DIR_MM0  ] = &DD27[DIR_PP0  *size_Mat];
-         D27.f[DIR_PP0  ] = &DD27[DIR_MM0  *size_Mat];
-         D27.f[DIR_MP0  ] = &DD27[DIR_PM0  *size_Mat];
-         D27.f[DIR_PM0  ] = &DD27[DIR_MP0  *size_Mat];
-         D27.f[DIR_M0M  ] = &DD27[DIR_P0P  *size_Mat];
-         D27.f[DIR_P0P  ] = &DD27[DIR_M0M  *size_Mat];
-         D27.f[DIR_M0P  ] = &DD27[DIR_P0M  *size_Mat];
-         D27.f[DIR_P0M  ] = &DD27[DIR_M0P  *size_Mat];
-         D27.f[DIR_0MM  ] = &DD27[DIR_0PP  *size_Mat];
-         D27.f[DIR_0PP  ] = &DD27[DIR_0MM  *size_Mat];
-         D27.f[DIR_0MP  ] = &DD27[DIR_0PM  *size_Mat];
-         D27.f[DIR_0PM  ] = &DD27[DIR_0MP  *size_Mat];
-         D27.f[DIR_000] = &DD27[DIR_000*size_Mat];
-         D27.f[DIR_PPP ] = &DD27[DIR_MMM *size_Mat];
-         D27.f[DIR_MMP ] = &DD27[DIR_PPM *size_Mat];
-         D27.f[DIR_PMP ] = &DD27[DIR_MPM *size_Mat];
-         D27.f[DIR_MPP ] = &DD27[DIR_PMM *size_Mat];
-         D27.f[DIR_PPM ] = &DD27[DIR_MMP *size_Mat];
-         D27.f[DIR_MMM ] = &DD27[DIR_PPP *size_Mat];
-         D27.f[DIR_PMM ] = &DD27[DIR_MPP *size_Mat];
-         D27.f[DIR_MPM ] = &DD27[DIR_PMP *size_Mat];
+         D27.f[DIR_M00] = &DD27[DIR_P00 * numberOfLBnodes];
+         D27.f[DIR_P00] = &DD27[DIR_M00 * numberOfLBnodes];
+         D27.f[DIR_0M0] = &DD27[DIR_0P0 * numberOfLBnodes];
+         D27.f[DIR_0P0] = &DD27[DIR_0M0 * numberOfLBnodes];
+         D27.f[DIR_00M] = &DD27[DIR_00P * numberOfLBnodes];
+         D27.f[DIR_00P] = &DD27[DIR_00M * numberOfLBnodes];
+         D27.f[DIR_MM0] = &DD27[DIR_PP0 * numberOfLBnodes];
+         D27.f[DIR_PP0] = &DD27[DIR_MM0 * numberOfLBnodes];
+         D27.f[DIR_MP0] = &DD27[DIR_PM0 * numberOfLBnodes];
+         D27.f[DIR_PM0] = &DD27[DIR_MP0 * numberOfLBnodes];
+         D27.f[DIR_M0M] = &DD27[DIR_P0P * numberOfLBnodes];
+         D27.f[DIR_P0P] = &DD27[DIR_M0M * numberOfLBnodes];
+         D27.f[DIR_M0P] = &DD27[DIR_P0M * numberOfLBnodes];
+         D27.f[DIR_P0M] = &DD27[DIR_M0P * numberOfLBnodes];
+         D27.f[DIR_0MM] = &DD27[DIR_0PP * numberOfLBnodes];
+         D27.f[DIR_0PP] = &DD27[DIR_0MM * numberOfLBnodes];
+         D27.f[DIR_0MP] = &DD27[DIR_0PM * numberOfLBnodes];
+         D27.f[DIR_0PM] = &DD27[DIR_0MP * numberOfLBnodes];
+         D27.f[DIR_000] = &DD27[DIR_000 * numberOfLBnodes];
+         D27.f[DIR_PPP] = &DD27[DIR_MMM * numberOfLBnodes];
+         D27.f[DIR_MMP] = &DD27[DIR_PPM * numberOfLBnodes];
+         D27.f[DIR_PMP] = &DD27[DIR_MPM * numberOfLBnodes];
+         D27.f[DIR_MPP] = &DD27[DIR_PMM * numberOfLBnodes];
+         D27.f[DIR_PPM] = &DD27[DIR_MMP * numberOfLBnodes];
+         D27.f[DIR_MMM] = &DD27[DIR_PPP * numberOfLBnodes];
+         D27.f[DIR_PMM] = &DD27[DIR_MPP * numberOfLBnodes];
+         D27.f[DIR_MPM] = &DD27[DIR_PMP * numberOfLBnodes];
       }
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       //Test
@@ -5694,24 +5694,24 @@ __global__ void QADVeloIncomp27(
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       real q;
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-      q = q_dirE[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M00  ])[kw  ]= -feqW27_W  + c2o1 * c2o27  * TempD;
-      q = q_dirW[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P00  ])[ke  ]= -feqW27_E  + c2o1 * c2o27  * TempD;
-      q = q_dirN[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0M0  ])[ks  ]= -feqW27_S  + c2o1 * c2o27  * TempD;
-      q = q_dirS[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0P0  ])[kn  ]= -feqW27_N  + c2o1 * c2o27  * TempD;
-      q = q_dirT[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_00M  ])[kb  ]= -feqW27_B  + c2o1 * c2o27  * TempD;
-      q = q_dirB[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_00P  ])[kt  ]= -feqW27_T  + c2o1 * c2o27  * TempD;
-      q = q_dirNE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MM0 ])[ksw ]= -feqW27_SW + c2o1 * c1o54  * TempD;
-      q = q_dirSW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PP0 ])[kne ]= -feqW27_NE + c2o1 * c1o54  * TempD;
-      q = q_dirSE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MP0 ])[knw ]= -feqW27_NW + c2o1 * c1o54  * TempD;
-      q = q_dirNW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PM0 ])[kse ]= -feqW27_SE + c2o1 * c1o54  * TempD;
-      q = q_dirTE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M0M ])[kbw ]= -feqW27_BW + c2o1 * c1o54  * TempD;
-      q = q_dirBW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P0P ])[kte ]= -feqW27_TE + c2o1 * c1o54  * TempD;
-      q = q_dirBE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M0P ])[ktw ]= -feqW27_TW + c2o1 * c1o54  * TempD;
-      q = q_dirTW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P0M ])[kbe ]= -feqW27_BE + c2o1 * c1o54  * TempD;
-      q = q_dirTN[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0MM ])[kbs ]= -feqW27_BS + c2o1 * c1o54  * TempD;
-      q = q_dirBS[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0PP ])[ktn ]= -feqW27_TN + c2o1 * c1o54  * TempD;
-      q = q_dirBN[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0MP ])[kts ]= -feqW27_TS + c2o1 * c1o54  * TempD;
-      q = q_dirTS[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0PM ])[kbn ]= -feqW27_BN + c2o1 * c1o54  * TempD;
+      q = q_dirE[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M00])[kw  ]= -feqW27_W  + c2o1 * c2o27  * TempD;
+      q = q_dirW[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P00])[ke  ]= -feqW27_E  + c2o1 * c2o27  * TempD;
+      q = q_dirN[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0M0])[ks  ]= -feqW27_S  + c2o1 * c2o27  * TempD;
+      q = q_dirS[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0P0])[kn  ]= -feqW27_N  + c2o1 * c2o27  * TempD;
+      q = q_dirT[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_00M])[kb  ]= -feqW27_B  + c2o1 * c2o27  * TempD;
+      q = q_dirB[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_00P])[kt  ]= -feqW27_T  + c2o1 * c2o27  * TempD;
+      q = q_dirNE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MM0])[ksw ]= -feqW27_SW + c2o1 * c1o54  * TempD;
+      q = q_dirSW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PP0])[kne ]= -feqW27_NE + c2o1 * c1o54  * TempD;
+      q = q_dirSE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MP0])[knw ]= -feqW27_NW + c2o1 * c1o54  * TempD;
+      q = q_dirNW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PM0])[kse ]= -feqW27_SE + c2o1 * c1o54  * TempD;
+      q = q_dirTE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M0M])[kbw ]= -feqW27_BW + c2o1 * c1o54  * TempD;
+      q = q_dirBW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P0P])[kte ]= -feqW27_TE + c2o1 * c1o54  * TempD;
+      q = q_dirBE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M0P])[ktw ]= -feqW27_TW + c2o1 * c1o54  * TempD;
+      q = q_dirTW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P0M])[kbe ]= -feqW27_BE + c2o1 * c1o54  * TempD;
+      q = q_dirTN[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0MM])[kbs ]= -feqW27_BS + c2o1 * c1o54  * TempD;
+      q = q_dirBS[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0PP])[ktn ]= -feqW27_TN + c2o1 * c1o54  * TempD;
+      q = q_dirBN[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0MP])[kts ]= -feqW27_TS + c2o1 * c1o54  * TempD;
+      q = q_dirTS[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0PM])[kbn ]= -feqW27_BN + c2o1 * c1o54  * TempD;
       q = q_dirTNE[k]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MMM])[kbsw]= -feqW27_BSW+ c2o1 * c1o216 * TempD;
       q = q_dirBSW[k]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PPP])[ktne]= -feqW27_TNE+ c2o1 * c1o216 * TempD;
       q = q_dirBNE[k]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MMP])[ktsw]= -feqW27_TSW+ c2o1 * c1o216 * TempD;
@@ -5720,24 +5720,24 @@ __global__ void QADVeloIncomp27(
       q = q_dirBNW[k]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PMP])[ktse]= -feqW27_TSE+ c2o1 * c1o216 * TempD;
       q = q_dirBSE[k]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MPP])[ktnw]= -feqW27_TNW+ c2o1 * c1o216 * TempD;
       q = q_dirTNW[k]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PMM])[kbse]= -feqW27_BSE+ c2o1 * c1o216 * TempD;
-      //q = q_dirE[k];   if (q>=zero && q<=one) (D27.f[DIR_M00  ])[kw  ]=(two*feqW27_W  -(f27_E  *(q*omegaD-one)-omegaD*feq27_E  *(q-one))/(omegaD-one)+f27_W  *q)/(q+one);
-      //q = q_dirW[k];   if (q>=zero && q<=one) (D27.f[DIR_P00  ])[ke  ]=(two*feqW27_E  -(f27_W  *(q*omegaD-one)-omegaD*feq27_W  *(q-one))/(omegaD-one)+f27_E  *q)/(q+one);
-      //q = q_dirN[k];   if (q>=zero && q<=one) (D27.f[DIR_0M0  ])[ks  ]=(two*feqW27_S  -(f27_N  *(q*omegaD-one)-omegaD*feq27_N  *(q-one))/(omegaD-one)+f27_S  *q)/(q+one);
-      //q = q_dirS[k];   if (q>=zero && q<=one) (D27.f[DIR_0P0  ])[kn  ]=(two*feqW27_N  -(f27_S  *(q*omegaD-one)-omegaD*feq27_S  *(q-one))/(omegaD-one)+f27_N  *q)/(q+one);
-      //q = q_dirT[k];   if (q>=zero && q<=one) (D27.f[DIR_00M  ])[kb  ]=(two*feqW27_B  -(f27_T  *(q*omegaD-one)-omegaD*feq27_T  *(q-one))/(omegaD-one)+f27_B  *q)/(q+one);
-      //q = q_dirB[k];   if (q>=zero && q<=one) (D27.f[DIR_00P  ])[kt  ]=(two*feqW27_T  -(f27_B  *(q*omegaD-one)-omegaD*feq27_B  *(q-one))/(omegaD-one)+f27_T  *q)/(q+one);
-      //q = q_dirNE[k];  if (q>=zero && q<=one) (D27.f[DIR_MM0 ])[ksw ]=(two*feqW27_SW -(f27_NE *(q*omegaD-one)-omegaD*feq27_NE *(q-one))/(omegaD-one)+f27_SW *q)/(q+one);
-      //q = q_dirSW[k];  if (q>=zero && q<=one) (D27.f[DIR_PP0 ])[kne ]=(two*feqW27_NE -(f27_SW *(q*omegaD-one)-omegaD*feq27_SW *(q-one))/(omegaD-one)+f27_NE *q)/(q+one);
-      //q = q_dirSE[k];  if (q>=zero && q<=one) (D27.f[DIR_MP0 ])[knw ]=(two*feqW27_NW -(f27_SE *(q*omegaD-one)-omegaD*feq27_SE *(q-one))/(omegaD-one)+f27_NW *q)/(q+one);
-      //q = q_dirNW[k];  if (q>=zero && q<=one) (D27.f[DIR_PM0 ])[kse ]=(two*feqW27_SE -(f27_NW *(q*omegaD-one)-omegaD*feq27_NW *(q-one))/(omegaD-one)+f27_SE *q)/(q+one);
-      //q = q_dirTE[k];  if (q>=zero && q<=one) (D27.f[DIR_M0M ])[kbw ]=(two*feqW27_BW -(f27_TE *(q*omegaD-one)-omegaD*feq27_TE *(q-one))/(omegaD-one)+f27_BW *q)/(q+one);
-      //q = q_dirBW[k];  if (q>=zero && q<=one) (D27.f[DIR_P0P ])[kte ]=(two*feqW27_TE -(f27_BW *(q*omegaD-one)-omegaD*feq27_BW *(q-one))/(omegaD-one)+f27_TE *q)/(q+one);
-      //q = q_dirBE[k];  if (q>=zero && q<=one) (D27.f[DIR_M0P ])[ktw ]=(two*feqW27_TW -(f27_BE *(q*omegaD-one)-omegaD*feq27_BE *(q-one))/(omegaD-one)+f27_TW *q)/(q+one);
-      //q = q_dirTW[k];  if (q>=zero && q<=one) (D27.f[DIR_P0M ])[kbe ]=(two*feqW27_BE -(f27_TW *(q*omegaD-one)-omegaD*feq27_TW *(q-one))/(omegaD-one)+f27_BE *q)/(q+one);
-      //q = q_dirTN[k];  if (q>=zero && q<=one) (D27.f[DIR_0MM ])[kbs ]=(two*feqW27_BS -(f27_TN *(q*omegaD-one)-omegaD*feq27_TN *(q-one))/(omegaD-one)+f27_BS *q)/(q+one);
-      //q = q_dirBS[k];  if (q>=zero && q<=one) (D27.f[DIR_0PP ])[ktn ]=(two*feqW27_TN -(f27_BS *(q*omegaD-one)-omegaD*feq27_BS *(q-one))/(omegaD-one)+f27_TN *q)/(q+one);
-      //q = q_dirBN[k];  if (q>=zero && q<=one) (D27.f[DIR_0MP ])[kts ]=(two*feqW27_TS -(f27_BN *(q*omegaD-one)-omegaD*feq27_BN *(q-one))/(omegaD-one)+f27_TS *q)/(q+one);
-      //q = q_dirTS[k];  if (q>=zero && q<=one) (D27.f[DIR_0PM ])[kbn ]=(two*feqW27_BN -(f27_TS *(q*omegaD-one)-omegaD*feq27_TS *(q-one))/(omegaD-one)+f27_BN *q)/(q+one);
+      //q = q_dirE[k];   if (q>=zero && q<=one) (D27.f[DIR_M00])[kw  ]=(two*feqW27_W  -(f27_E  *(q*omegaD-one)-omegaD*feq27_E  *(q-one))/(omegaD-one)+f27_W  *q)/(q+one);
+      //q = q_dirW[k];   if (q>=zero && q<=one) (D27.f[DIR_P00])[ke  ]=(two*feqW27_E  -(f27_W  *(q*omegaD-one)-omegaD*feq27_W  *(q-one))/(omegaD-one)+f27_E  *q)/(q+one);
+      //q = q_dirN[k];   if (q>=zero && q<=one) (D27.f[DIR_0M0])[ks  ]=(two*feqW27_S  -(f27_N  *(q*omegaD-one)-omegaD*feq27_N  *(q-one))/(omegaD-one)+f27_S  *q)/(q+one);
+      //q = q_dirS[k];   if (q>=zero && q<=one) (D27.f[DIR_0P0])[kn  ]=(two*feqW27_N  -(f27_S  *(q*omegaD-one)-omegaD*feq27_S  *(q-one))/(omegaD-one)+f27_N  *q)/(q+one);
+      //q = q_dirT[k];   if (q>=zero && q<=one) (D27.f[DIR_00M])[kb  ]=(two*feqW27_B  -(f27_T  *(q*omegaD-one)-omegaD*feq27_T  *(q-one))/(omegaD-one)+f27_B  *q)/(q+one);
+      //q = q_dirB[k];   if (q>=zero && q<=one) (D27.f[DIR_00P])[kt  ]=(two*feqW27_T  -(f27_B  *(q*omegaD-one)-omegaD*feq27_B  *(q-one))/(omegaD-one)+f27_T  *q)/(q+one);
+      //q = q_dirNE[k];  if (q>=zero && q<=one) (D27.f[DIR_MM0])[ksw ]=(two*feqW27_SW -(f27_NE *(q*omegaD-one)-omegaD*feq27_NE *(q-one))/(omegaD-one)+f27_SW *q)/(q+one);
+      //q = q_dirSW[k];  if (q>=zero && q<=one) (D27.f[DIR_PP0])[kne ]=(two*feqW27_NE -(f27_SW *(q*omegaD-one)-omegaD*feq27_SW *(q-one))/(omegaD-one)+f27_NE *q)/(q+one);
+      //q = q_dirSE[k];  if (q>=zero && q<=one) (D27.f[DIR_MP0])[knw ]=(two*feqW27_NW -(f27_SE *(q*omegaD-one)-omegaD*feq27_SE *(q-one))/(omegaD-one)+f27_NW *q)/(q+one);
+      //q = q_dirNW[k];  if (q>=zero && q<=one) (D27.f[DIR_PM0])[kse ]=(two*feqW27_SE -(f27_NW *(q*omegaD-one)-omegaD*feq27_NW *(q-one))/(omegaD-one)+f27_SE *q)/(q+one);
+      //q = q_dirTE[k];  if (q>=zero && q<=one) (D27.f[DIR_M0M])[kbw ]=(two*feqW27_BW -(f27_TE *(q*omegaD-one)-omegaD*feq27_TE *(q-one))/(omegaD-one)+f27_BW *q)/(q+one);
+      //q = q_dirBW[k];  if (q>=zero && q<=one) (D27.f[DIR_P0P])[kte ]=(two*feqW27_TE -(f27_BW *(q*omegaD-one)-omegaD*feq27_BW *(q-one))/(omegaD-one)+f27_TE *q)/(q+one);
+      //q = q_dirBE[k];  if (q>=zero && q<=one) (D27.f[DIR_M0P])[ktw ]=(two*feqW27_TW -(f27_BE *(q*omegaD-one)-omegaD*feq27_BE *(q-one))/(omegaD-one)+f27_TW *q)/(q+one);
+      //q = q_dirTW[k];  if (q>=zero && q<=one) (D27.f[DIR_P0M])[kbe ]=(two*feqW27_BE -(f27_TW *(q*omegaD-one)-omegaD*feq27_TW *(q-one))/(omegaD-one)+f27_BE *q)/(q+one);
+      //q = q_dirTN[k];  if (q>=zero && q<=one) (D27.f[DIR_0MM])[kbs ]=(two*feqW27_BS -(f27_TN *(q*omegaD-one)-omegaD*feq27_TN *(q-one))/(omegaD-one)+f27_BS *q)/(q+one);
+      //q = q_dirBS[k];  if (q>=zero && q<=one) (D27.f[DIR_0PP])[ktn ]=(two*feqW27_TN -(f27_BS *(q*omegaD-one)-omegaD*feq27_BS *(q-one))/(omegaD-one)+f27_TN *q)/(q+one);
+      //q = q_dirBN[k];  if (q>=zero && q<=one) (D27.f[DIR_0MP])[kts ]=(two*feqW27_TS -(f27_BN *(q*omegaD-one)-omegaD*feq27_BN *(q-one))/(omegaD-one)+f27_TS *q)/(q+one);
+      //q = q_dirTS[k];  if (q>=zero && q<=one) (D27.f[DIR_0PM])[kbn ]=(two*feqW27_BN -(f27_TS *(q*omegaD-one)-omegaD*feq27_TS *(q-one))/(omegaD-one)+f27_BN *q)/(q+one);
       //q = q_dirTNE[k]; if (q>=zero && q<=one) (D27.f[DIR_MMM])[kbsw]=(two*feqW27_BSW-(f27_TNE*(q*omegaD-one)-omegaD*feq27_TNE*(q-one))/(omegaD-one)+f27_BSW*q)/(q+one);
       //q = q_dirBSW[k]; if (q>=zero && q<=one) (D27.f[DIR_PPP])[ktne]=(two*feqW27_TNE-(f27_BSW*(q*omegaD-one)-omegaD*feq27_BSW*(q-one))/(omegaD-one)+f27_TNE*q)/(q+one);
       //q = q_dirBNE[k]; if (q>=zero && q<=one) (D27.f[DIR_MMP])[ktsw]=(two*feqW27_TSW-(f27_BNE*(q*omegaD-one)-omegaD*feq27_BNE*(q-one))/(omegaD-one)+f27_TSW*q)/(q+one);
@@ -5801,91 +5801,91 @@ __global__ void QADPressIncomp7( real* DD,
 										   unsigned int* neighborX,
 										   unsigned int* neighborY,
 										   unsigned int* neighborZ,
-										   unsigned int size_Mat, 
+										   unsigned long long numberOfLBnodes, 
 										   bool isEvenTimestep)
 {
   /* Distributions27 D;
    if (isEvenTimestep==true)
    {
-      D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+      D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
    } 
    else
    {
-      D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+      D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
    }*/
 
    Distributions7 D7;
    if (isEvenTimestep==true)
    {
-      D7.f[0] = &DD7[0*size_Mat];
-      D7.f[1] = &DD7[1*size_Mat];
-      D7.f[2] = &DD7[2*size_Mat];
-      D7.f[3] = &DD7[3*size_Mat];
-      D7.f[4] = &DD7[4*size_Mat];
-      D7.f[5] = &DD7[5*size_Mat];
-      D7.f[6] = &DD7[6*size_Mat];
+      D7.f[0] = &DD7[0*numberOfLBnodes];
+      D7.f[1] = &DD7[1*numberOfLBnodes];
+      D7.f[2] = &DD7[2*numberOfLBnodes];
+      D7.f[3] = &DD7[3*numberOfLBnodes];
+      D7.f[4] = &DD7[4*numberOfLBnodes];
+      D7.f[5] = &DD7[5*numberOfLBnodes];
+      D7.f[6] = &DD7[6*numberOfLBnodes];
    }
    else
    {
-      D7.f[0] = &DD7[0*size_Mat];
-      D7.f[2] = &DD7[1*size_Mat];
-      D7.f[1] = &DD7[2*size_Mat];
-      D7.f[4] = &DD7[3*size_Mat];
-      D7.f[3] = &DD7[4*size_Mat];
-      D7.f[6] = &DD7[5*size_Mat];
-      D7.f[5] = &DD7[6*size_Mat];
+      D7.f[0] = &DD7[0*numberOfLBnodes];
+      D7.f[2] = &DD7[1*numberOfLBnodes];
+      D7.f[1] = &DD7[2*numberOfLBnodes];
+      D7.f[4] = &DD7[3*numberOfLBnodes];
+      D7.f[3] = &DD7[4*numberOfLBnodes];
+      D7.f[6] = &DD7[5*numberOfLBnodes];
+      D7.f[5] = &DD7[6*numberOfLBnodes];
    }
 
 
@@ -5905,12 +5905,12 @@ __global__ void QADPressIncomp7( real* DD,
       //////////////////////////////////////////////////////////////////////////////////
       real  *q_dirE,   *q_dirW,   *q_dirN,   *q_dirS,   *q_dirT,   *q_dirB; 
 
-      q_dirE   = &QQ[DIR_P00   * numberOfBCnodes];
-      q_dirW   = &QQ[DIR_M00   * numberOfBCnodes];
-      q_dirN   = &QQ[DIR_0P0   * numberOfBCnodes];
-      q_dirS   = &QQ[DIR_0M0   * numberOfBCnodes];
-      q_dirT   = &QQ[DIR_00P   * numberOfBCnodes];
-      q_dirB   = &QQ[DIR_00M   * numberOfBCnodes];
+      q_dirE   = &QQ[DIR_P00 * numberOfBCnodes];
+      q_dirW   = &QQ[DIR_M00 * numberOfBCnodes];
+      q_dirN   = &QQ[DIR_0P0 * numberOfBCnodes];
+      q_dirS   = &QQ[DIR_0M0 * numberOfBCnodes];
+      q_dirT   = &QQ[DIR_00P * numberOfBCnodes];
+      q_dirB   = &QQ[DIR_00M * numberOfBCnodes];
       //////////////////////////////////////////////////////////////////////////////////
       //index
       unsigned int KQK  = k_Q[k];
@@ -5945,32 +5945,32 @@ __global__ void QADPressIncomp7( real* DD,
     /*  real f_E,  f_W,  f_N,  f_S,  f_T,  f_B,   f_NE,  f_SW,  f_SE,  f_NW,  f_TE,  f_BW,  f_BE,
          f_TW, f_TN, f_BS, f_BN, f_TS, f_TNE, f_TSW, f_TSE, f_TNW, f_BNE, f_BSW, f_BSE, f_BNW;
 
-      f_W    = (D.f[DIR_P00   ])[ke   ];
-      f_E    = (D.f[DIR_M00   ])[kw   ];
-      f_S    = (D.f[DIR_0P0   ])[kn   ];
-      f_N    = (D.f[DIR_0M0   ])[ks   ];
-      f_B    = (D.f[DIR_00P   ])[kt   ];
-      f_T    = (D.f[DIR_00M   ])[kb   ];
-      f_SW   = (D.f[DIR_PP0  ])[kne  ];
-      f_NE   = (D.f[DIR_MM0  ])[ksw  ];
-      f_NW   = (D.f[DIR_PM0  ])[kse  ];
-      f_SE   = (D.f[DIR_MP0  ])[knw  ];
-      f_BW   = (D.f[DIR_P0P  ])[kte  ];
-      f_TE   = (D.f[DIR_M0M  ])[kbw  ];
-      f_TW   = (D.f[DIR_P0M  ])[kbe  ];
-      f_BE   = (D.f[DIR_M0P  ])[ktw  ];
-      f_BS   = (D.f[DIR_0PP  ])[ktn  ];
-      f_TN   = (D.f[DIR_0MM  ])[kbs  ];
-      f_TS   = (D.f[DIR_0PM  ])[kbn  ];
-      f_BN   = (D.f[DIR_0MP  ])[kts  ];
-      f_BSW  = (D.f[DIR_PPP ])[ktne ];
-      f_BNE  = (D.f[DIR_MMP ])[ktsw ];
-      f_BNW  = (D.f[DIR_PMP ])[ktse ];
-      f_BSE  = (D.f[DIR_MPP ])[ktnw ];
-      f_TSW  = (D.f[DIR_PPM ])[kbne ];
-      f_TNE  = (D.f[DIR_MMM ])[kbsw ];
-      f_TNW  = (D.f[DIR_PMM ])[kbse ];
-      f_TSE  = (D.f[DIR_MPM ])[kbnw ];*/
+      f_W    = (D.f[DIR_P00])[ke   ];
+      f_E    = (D.f[DIR_M00])[kw   ];
+      f_S    = (D.f[DIR_0P0])[kn   ];
+      f_N    = (D.f[DIR_0M0])[ks   ];
+      f_B    = (D.f[DIR_00P])[kt   ];
+      f_T    = (D.f[DIR_00M])[kb   ];
+      f_SW   = (D.f[DIR_PP0])[kne  ];
+      f_NE   = (D.f[DIR_MM0])[ksw  ];
+      f_NW   = (D.f[DIR_PM0])[kse  ];
+      f_SE   = (D.f[DIR_MP0])[knw  ];
+      f_BW   = (D.f[DIR_P0P])[kte  ];
+      f_TE   = (D.f[DIR_M0M])[kbw  ];
+      f_TW   = (D.f[DIR_P0M])[kbe  ];
+      f_BE   = (D.f[DIR_M0P])[ktw  ];
+      f_BS   = (D.f[DIR_0PP])[ktn  ];
+      f_TN   = (D.f[DIR_0MM])[kbs  ];
+      f_TS   = (D.f[DIR_0PM])[kbn  ];
+      f_BN   = (D.f[DIR_0MP])[kts  ];
+      f_BSW  = (D.f[DIR_PPP])[ktne ];
+      f_BNE  = (D.f[DIR_MMP])[ktsw ];
+      f_BNW  = (D.f[DIR_PMP])[ktse ];
+      f_BSE  = (D.f[DIR_MPP])[ktnw ];
+      f_TSW  = (D.f[DIR_PPM])[kbne ];
+      f_TNE  = (D.f[DIR_MMM])[kbsw ];
+      f_TNW  = (D.f[DIR_PMM])[kbse ];
+      f_TSE  = (D.f[DIR_MPM])[kbnw ];*/
       ////////////////////////////////////////////////////////////////////////////////
       //real vx1 = ((f_TNE-f_BSW)+(f_BNE-f_TSW)+(f_TSE-f_BNW)+(f_BSE-f_TNW) +(f_NE-f_SW)+(f_SE-f_NW)+(f_TE-f_BW)+(f_BE-f_TW)+(f_E-f_W));
       //real vx2 = ((f_TNE-f_BSW)+(f_BNE-f_TSW)+(f_BNW-f_TSE)+(f_TNW-f_BSE) +(f_NE-f_SW)+(f_NW-f_SE)+(f_TN-f_BS)+(f_BN-f_TS)+(f_N-f_S));
@@ -6035,23 +6035,23 @@ __global__ void QADPressIncomp7( real* DD,
       //pointertausch
       if (isEvenTimestep==false)
       {
-         D7.f[0] = &DD7[0*size_Mat];
-         D7.f[1] = &DD7[1*size_Mat];
-         D7.f[2] = &DD7[2*size_Mat];
-         D7.f[3] = &DD7[3*size_Mat];
-         D7.f[4] = &DD7[4*size_Mat];
-         D7.f[5] = &DD7[5*size_Mat];
-         D7.f[6] = &DD7[6*size_Mat];
+         D7.f[0] = &DD7[0*numberOfLBnodes];
+         D7.f[1] = &DD7[1*numberOfLBnodes];
+         D7.f[2] = &DD7[2*numberOfLBnodes];
+         D7.f[3] = &DD7[3*numberOfLBnodes];
+         D7.f[4] = &DD7[4*numberOfLBnodes];
+         D7.f[5] = &DD7[5*numberOfLBnodes];
+         D7.f[6] = &DD7[6*numberOfLBnodes];
       }
       else
       {
-         D7.f[0] = &DD7[0*size_Mat];
-         D7.f[2] = &DD7[1*size_Mat];
-         D7.f[1] = &DD7[2*size_Mat];
-         D7.f[4] = &DD7[3*size_Mat];
-         D7.f[3] = &DD7[4*size_Mat];
-         D7.f[6] = &DD7[5*size_Mat];
-         D7.f[5] = &DD7[6*size_Mat];
+         D7.f[0] = &DD7[0*numberOfLBnodes];
+         D7.f[2] = &DD7[1*numberOfLBnodes];
+         D7.f[1] = &DD7[2*numberOfLBnodes];
+         D7.f[4] = &DD7[3*numberOfLBnodes];
+         D7.f[3] = &DD7[4*numberOfLBnodes];
+         D7.f[6] = &DD7[5*numberOfLBnodes];
+         D7.f[5] = &DD7[6*numberOfLBnodes];
       }
 
       ////////////////////////////////////////////////////////////////////////////
@@ -6240,131 +6240,131 @@ __global__ void QADPressIncomp27(
 											   unsigned int* neighborX,
 											   unsigned int* neighborY,
 											   unsigned int* neighborZ,
-											   unsigned int size_Mat, 
+											   unsigned long long numberOfLBnodes, 
 											   bool isEvenTimestep)
 {
    Distributions27 D;
    if (isEvenTimestep==true)
    {
-      D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+      D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
    } 
    else
    {
-      D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+      D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
    }
 
    Distributions27 D27;
    if (isEvenTimestep==true)
    {
-      D27.f[DIR_P00   ] = &DD27[DIR_P00   *size_Mat];
-      D27.f[DIR_M00   ] = &DD27[DIR_M00   *size_Mat];
-      D27.f[DIR_0P0   ] = &DD27[DIR_0P0   *size_Mat];
-      D27.f[DIR_0M0   ] = &DD27[DIR_0M0   *size_Mat];
-      D27.f[DIR_00P   ] = &DD27[DIR_00P   *size_Mat];
-      D27.f[DIR_00M   ] = &DD27[DIR_00M   *size_Mat];
-      D27.f[DIR_PP0  ] = &DD27[DIR_PP0  *size_Mat];
-      D27.f[DIR_MM0  ] = &DD27[DIR_MM0  *size_Mat];
-      D27.f[DIR_PM0  ] = &DD27[DIR_PM0  *size_Mat];
-      D27.f[DIR_MP0  ] = &DD27[DIR_MP0  *size_Mat];
-      D27.f[DIR_P0P  ] = &DD27[DIR_P0P  *size_Mat];
-      D27.f[DIR_M0M  ] = &DD27[DIR_M0M  *size_Mat];
-      D27.f[DIR_P0M  ] = &DD27[DIR_P0M  *size_Mat];
-      D27.f[DIR_M0P  ] = &DD27[DIR_M0P  *size_Mat];
-      D27.f[DIR_0PP  ] = &DD27[DIR_0PP  *size_Mat];
-      D27.f[DIR_0MM  ] = &DD27[DIR_0MM  *size_Mat];
-      D27.f[DIR_0PM  ] = &DD27[DIR_0PM  *size_Mat];
-      D27.f[DIR_0MP  ] = &DD27[DIR_0MP  *size_Mat];
-      D27.f[DIR_000] = &DD27[DIR_000*size_Mat];
-      D27.f[DIR_PPP ] = &DD27[DIR_PPP *size_Mat];
-      D27.f[DIR_MMP ] = &DD27[DIR_MMP *size_Mat];
-      D27.f[DIR_PMP ] = &DD27[DIR_PMP *size_Mat];
-      D27.f[DIR_MPP ] = &DD27[DIR_MPP *size_Mat];
-      D27.f[DIR_PPM ] = &DD27[DIR_PPM *size_Mat];
-      D27.f[DIR_MMM ] = &DD27[DIR_MMM *size_Mat];
-      D27.f[DIR_PMM ] = &DD27[DIR_PMM *size_Mat];
-      D27.f[DIR_MPM ] = &DD27[DIR_MPM *size_Mat];
+      D27.f[DIR_P00] = &DD27[DIR_P00 * numberOfLBnodes];
+      D27.f[DIR_M00] = &DD27[DIR_M00 * numberOfLBnodes];
+      D27.f[DIR_0P0] = &DD27[DIR_0P0 * numberOfLBnodes];
+      D27.f[DIR_0M0] = &DD27[DIR_0M0 * numberOfLBnodes];
+      D27.f[DIR_00P] = &DD27[DIR_00P * numberOfLBnodes];
+      D27.f[DIR_00M] = &DD27[DIR_00M * numberOfLBnodes];
+      D27.f[DIR_PP0] = &DD27[DIR_PP0 * numberOfLBnodes];
+      D27.f[DIR_MM0] = &DD27[DIR_MM0 * numberOfLBnodes];
+      D27.f[DIR_PM0] = &DD27[DIR_PM0 * numberOfLBnodes];
+      D27.f[DIR_MP0] = &DD27[DIR_MP0 * numberOfLBnodes];
+      D27.f[DIR_P0P] = &DD27[DIR_P0P * numberOfLBnodes];
+      D27.f[DIR_M0M] = &DD27[DIR_M0M * numberOfLBnodes];
+      D27.f[DIR_P0M] = &DD27[DIR_P0M * numberOfLBnodes];
+      D27.f[DIR_M0P] = &DD27[DIR_M0P * numberOfLBnodes];
+      D27.f[DIR_0PP] = &DD27[DIR_0PP * numberOfLBnodes];
+      D27.f[DIR_0MM] = &DD27[DIR_0MM * numberOfLBnodes];
+      D27.f[DIR_0PM] = &DD27[DIR_0PM * numberOfLBnodes];
+      D27.f[DIR_0MP] = &DD27[DIR_0MP * numberOfLBnodes];
+      D27.f[DIR_000] = &DD27[DIR_000 * numberOfLBnodes];
+      D27.f[DIR_PPP] = &DD27[DIR_PPP * numberOfLBnodes];
+      D27.f[DIR_MMP] = &DD27[DIR_MMP * numberOfLBnodes];
+      D27.f[DIR_PMP] = &DD27[DIR_PMP * numberOfLBnodes];
+      D27.f[DIR_MPP] = &DD27[DIR_MPP * numberOfLBnodes];
+      D27.f[DIR_PPM] = &DD27[DIR_PPM * numberOfLBnodes];
+      D27.f[DIR_MMM] = &DD27[DIR_MMM * numberOfLBnodes];
+      D27.f[DIR_PMM] = &DD27[DIR_PMM * numberOfLBnodes];
+      D27.f[DIR_MPM] = &DD27[DIR_MPM * numberOfLBnodes];
    } 
    else
    {
-      D27.f[DIR_M00   ] = &DD27[DIR_P00   *size_Mat];
-      D27.f[DIR_P00   ] = &DD27[DIR_M00   *size_Mat];
-      D27.f[DIR_0M0   ] = &DD27[DIR_0P0   *size_Mat];
-      D27.f[DIR_0P0   ] = &DD27[DIR_0M0   *size_Mat];
-      D27.f[DIR_00M   ] = &DD27[DIR_00P   *size_Mat];
-      D27.f[DIR_00P   ] = &DD27[DIR_00M   *size_Mat];
-      D27.f[DIR_MM0  ] = &DD27[DIR_PP0  *size_Mat];
-      D27.f[DIR_PP0  ] = &DD27[DIR_MM0  *size_Mat];
-      D27.f[DIR_MP0  ] = &DD27[DIR_PM0  *size_Mat];
-      D27.f[DIR_PM0  ] = &DD27[DIR_MP0  *size_Mat];
-      D27.f[DIR_M0M  ] = &DD27[DIR_P0P  *size_Mat];
-      D27.f[DIR_P0P  ] = &DD27[DIR_M0M  *size_Mat];
-      D27.f[DIR_M0P  ] = &DD27[DIR_P0M  *size_Mat];
-      D27.f[DIR_P0M  ] = &DD27[DIR_M0P  *size_Mat];
-      D27.f[DIR_0MM  ] = &DD27[DIR_0PP  *size_Mat];
-      D27.f[DIR_0PP  ] = &DD27[DIR_0MM  *size_Mat];
-      D27.f[DIR_0MP  ] = &DD27[DIR_0PM  *size_Mat];
-      D27.f[DIR_0PM  ] = &DD27[DIR_0MP  *size_Mat];
-      D27.f[DIR_000] = &DD27[DIR_000*size_Mat];
-      D27.f[DIR_PPP ] = &DD27[DIR_MMM *size_Mat];
-      D27.f[DIR_MMP ] = &DD27[DIR_PPM *size_Mat];
-      D27.f[DIR_PMP ] = &DD27[DIR_MPM *size_Mat];
-      D27.f[DIR_MPP ] = &DD27[DIR_PMM *size_Mat];
-      D27.f[DIR_PPM ] = &DD27[DIR_MMP *size_Mat];
-      D27.f[DIR_MMM ] = &DD27[DIR_PPP *size_Mat];
-      D27.f[DIR_PMM ] = &DD27[DIR_MPP *size_Mat];
-      D27.f[DIR_MPM ] = &DD27[DIR_PMP *size_Mat];
+      D27.f[DIR_M00] = &DD27[DIR_P00 * numberOfLBnodes];
+      D27.f[DIR_P00] = &DD27[DIR_M00 * numberOfLBnodes];
+      D27.f[DIR_0M0] = &DD27[DIR_0P0 * numberOfLBnodes];
+      D27.f[DIR_0P0] = &DD27[DIR_0M0 * numberOfLBnodes];
+      D27.f[DIR_00M] = &DD27[DIR_00P * numberOfLBnodes];
+      D27.f[DIR_00P] = &DD27[DIR_00M * numberOfLBnodes];
+      D27.f[DIR_MM0] = &DD27[DIR_PP0 * numberOfLBnodes];
+      D27.f[DIR_PP0] = &DD27[DIR_MM0 * numberOfLBnodes];
+      D27.f[DIR_MP0] = &DD27[DIR_PM0 * numberOfLBnodes];
+      D27.f[DIR_PM0] = &DD27[DIR_MP0 * numberOfLBnodes];
+      D27.f[DIR_M0M] = &DD27[DIR_P0P * numberOfLBnodes];
+      D27.f[DIR_P0P] = &DD27[DIR_M0M * numberOfLBnodes];
+      D27.f[DIR_M0P] = &DD27[DIR_P0M * numberOfLBnodes];
+      D27.f[DIR_P0M] = &DD27[DIR_M0P * numberOfLBnodes];
+      D27.f[DIR_0MM] = &DD27[DIR_0PP * numberOfLBnodes];
+      D27.f[DIR_0PP] = &DD27[DIR_0MM * numberOfLBnodes];
+      D27.f[DIR_0MP] = &DD27[DIR_0PM * numberOfLBnodes];
+      D27.f[DIR_0PM] = &DD27[DIR_0MP * numberOfLBnodes];
+      D27.f[DIR_000] = &DD27[DIR_000 * numberOfLBnodes];
+      D27.f[DIR_PPP] = &DD27[DIR_MMM * numberOfLBnodes];
+      D27.f[DIR_MMP] = &DD27[DIR_PPM * numberOfLBnodes];
+      D27.f[DIR_PMP] = &DD27[DIR_MPM * numberOfLBnodes];
+      D27.f[DIR_MPP] = &DD27[DIR_PMM * numberOfLBnodes];
+      D27.f[DIR_PPM] = &DD27[DIR_MMP * numberOfLBnodes];
+      D27.f[DIR_MMM] = &DD27[DIR_PPP * numberOfLBnodes];
+      D27.f[DIR_PMM] = &DD27[DIR_MPP * numberOfLBnodes];
+      D27.f[DIR_MPM] = &DD27[DIR_PMP * numberOfLBnodes];
    }
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  x = threadIdx.x;  // Globaler x-Index 
@@ -6385,24 +6385,24 @@ __global__ void QADPressIncomp27(
          *q_dirBE,  *q_dirTW,  *q_dirTN,  *q_dirBS,  *q_dirBN,  *q_dirTS,
          *q_dirTNE, *q_dirTSW, *q_dirTSE, *q_dirTNW, *q_dirBNE, *q_dirBSW,
          *q_dirBSE, *q_dirBNW; 
-      q_dirE   = &QQ[DIR_P00   * numberOfBCnodes];
-      q_dirW   = &QQ[DIR_M00   * numberOfBCnodes];
-      q_dirN   = &QQ[DIR_0P0   * numberOfBCnodes];
-      q_dirS   = &QQ[DIR_0M0   * numberOfBCnodes];
-      q_dirT   = &QQ[DIR_00P   * numberOfBCnodes];
-      q_dirB   = &QQ[DIR_00M   * numberOfBCnodes];
-      q_dirNE  = &QQ[DIR_PP0  * numberOfBCnodes];
-      q_dirSW  = &QQ[DIR_MM0  * numberOfBCnodes];
-      q_dirSE  = &QQ[DIR_PM0  * numberOfBCnodes];
-      q_dirNW  = &QQ[DIR_MP0  * numberOfBCnodes];
-      q_dirTE  = &QQ[DIR_P0P  * numberOfBCnodes];
-      q_dirBW  = &QQ[DIR_M0M  * numberOfBCnodes];
-      q_dirBE  = &QQ[DIR_P0M  * numberOfBCnodes];
-      q_dirTW  = &QQ[DIR_M0P  * numberOfBCnodes];
-      q_dirTN  = &QQ[DIR_0PP  * numberOfBCnodes];
-      q_dirBS  = &QQ[DIR_0MM  * numberOfBCnodes];
-      q_dirBN  = &QQ[DIR_0PM  * numberOfBCnodes];
-      q_dirTS  = &QQ[DIR_0MP  * numberOfBCnodes];
+      q_dirE   = &QQ[DIR_P00 * numberOfBCnodes];
+      q_dirW   = &QQ[DIR_M00 * numberOfBCnodes];
+      q_dirN   = &QQ[DIR_0P0 * numberOfBCnodes];
+      q_dirS   = &QQ[DIR_0M0 * numberOfBCnodes];
+      q_dirT   = &QQ[DIR_00P * numberOfBCnodes];
+      q_dirB   = &QQ[DIR_00M * numberOfBCnodes];
+      q_dirNE  = &QQ[DIR_PP0 * numberOfBCnodes];
+      q_dirSW  = &QQ[DIR_MM0 * numberOfBCnodes];
+      q_dirSE  = &QQ[DIR_PM0 * numberOfBCnodes];
+      q_dirNW  = &QQ[DIR_MP0 * numberOfBCnodes];
+      q_dirTE  = &QQ[DIR_P0P * numberOfBCnodes];
+      q_dirBW  = &QQ[DIR_M0M * numberOfBCnodes];
+      q_dirBE  = &QQ[DIR_P0M * numberOfBCnodes];
+      q_dirTW  = &QQ[DIR_M0P * numberOfBCnodes];
+      q_dirTN  = &QQ[DIR_0PP * numberOfBCnodes];
+      q_dirBS  = &QQ[DIR_0MM * numberOfBCnodes];
+      q_dirBN  = &QQ[DIR_0PM * numberOfBCnodes];
+      q_dirTS  = &QQ[DIR_0MP * numberOfBCnodes];
       q_dirTNE = &QQ[DIR_PPP * numberOfBCnodes];
       q_dirTSW = &QQ[DIR_MMP * numberOfBCnodes];
       q_dirTSE = &QQ[DIR_PMP * numberOfBCnodes];
@@ -6442,65 +6442,65 @@ __global__ void QADPressIncomp27(
       unsigned int ktne = KQK;
       unsigned int kbsw = neighborZ[ksw];
       ////////////////////////////////////////////////////////////////////////////////
-      real f_W    = (D.f[DIR_P00   ])[ke   ];
-      real f_E    = (D.f[DIR_M00   ])[kw   ];
-      real f_S    = (D.f[DIR_0P0   ])[kn   ];
-      real f_N    = (D.f[DIR_0M0   ])[ks   ];
-      real f_B    = (D.f[DIR_00P   ])[kt   ];
-      real f_T    = (D.f[DIR_00M   ])[kb   ];
-      real f_SW   = (D.f[DIR_PP0  ])[kne  ];
-      real f_NE   = (D.f[DIR_MM0  ])[ksw  ];
-      real f_NW   = (D.f[DIR_PM0  ])[kse  ];
-      real f_SE   = (D.f[DIR_MP0  ])[knw  ];
-      real f_BW   = (D.f[DIR_P0P  ])[kte  ];
-      real f_TE   = (D.f[DIR_M0M  ])[kbw  ];
-      real f_TW   = (D.f[DIR_P0M  ])[kbe  ];
-      real f_BE   = (D.f[DIR_M0P  ])[ktw  ];
-      real f_BS   = (D.f[DIR_0PP  ])[ktn  ];
-      real f_TN   = (D.f[DIR_0MM  ])[kbs  ];
-      real f_TS   = (D.f[DIR_0PM  ])[kbn  ];
-      real f_BN   = (D.f[DIR_0MP  ])[kts  ];
+      real f_W    = (D.f[DIR_P00])[ke   ];
+      real f_E    = (D.f[DIR_M00])[kw   ];
+      real f_S    = (D.f[DIR_0P0])[kn   ];
+      real f_N    = (D.f[DIR_0M0])[ks   ];
+      real f_B    = (D.f[DIR_00P])[kt   ];
+      real f_T    = (D.f[DIR_00M])[kb   ];
+      real f_SW   = (D.f[DIR_PP0])[kne  ];
+      real f_NE   = (D.f[DIR_MM0])[ksw  ];
+      real f_NW   = (D.f[DIR_PM0])[kse  ];
+      real f_SE   = (D.f[DIR_MP0])[knw  ];
+      real f_BW   = (D.f[DIR_P0P])[kte  ];
+      real f_TE   = (D.f[DIR_M0M])[kbw  ];
+      real f_TW   = (D.f[DIR_P0M])[kbe  ];
+      real f_BE   = (D.f[DIR_M0P])[ktw  ];
+      real f_BS   = (D.f[DIR_0PP])[ktn  ];
+      real f_TN   = (D.f[DIR_0MM])[kbs  ];
+      real f_TS   = (D.f[DIR_0PM])[kbn  ];
+      real f_BN   = (D.f[DIR_0MP])[kts  ];
       //real f_ZERO = (D.f[DIR_000])[kzero];
-      real f_BSW  = (D.f[DIR_PPP ])[ktne ];
-      real f_BNE  = (D.f[DIR_MMP ])[ktsw ];
-      real f_BNW  = (D.f[DIR_PMP ])[ktse ];
-      real f_BSE  = (D.f[DIR_MPP ])[ktnw ];
-      real f_TSW  = (D.f[DIR_PPM ])[kbne ];
-      real f_TNE  = (D.f[DIR_MMM ])[kbsw ];
-      real f_TNW  = (D.f[DIR_PMM ])[kbse ];
-      real f_TSE  = (D.f[DIR_MPM ])[kbnw ];
+      real f_BSW  = (D.f[DIR_PPP])[ktne ];
+      real f_BNE  = (D.f[DIR_MMP])[ktsw ];
+      real f_BNW  = (D.f[DIR_PMP])[ktse ];
+      real f_BSE  = (D.f[DIR_MPP])[ktnw ];
+      real f_TSW  = (D.f[DIR_PPM])[kbne ];
+      real f_TNE  = (D.f[DIR_MMM])[kbsw ];
+      real f_TNW  = (D.f[DIR_PMM])[kbse ];
+      real f_TSE  = (D.f[DIR_MPM])[kbnw ];
       ////////////////////////////////////////////////////////////////////////////////
       real vx1      = ((f_TNE-f_BSW)+(f_BNE-f_TSW)+(f_TSE-f_BNW)+(f_BSE-f_TNW) +(f_NE-f_SW)+(f_SE-f_NW)+(f_TE-f_BW)+(f_BE-f_TW)+(f_E-f_W));
       real vx2      = ((f_TNE-f_BSW)+(f_BNE-f_TSW)+(f_BNW-f_TSE)+(f_TNW-f_BSE) +(f_NE-f_SW)+(f_NW-f_SE)+(f_TN-f_BS)+(f_BN-f_TS)+(f_N-f_S));
       real vx3      = ((f_TNE-f_BSW)+(f_TSW-f_BNE)+(f_TSE-f_BNW)+(f_TNW-f_BSE) +(f_TE-f_BW)+(f_TW-f_BE)+(f_TN-f_BS)+(f_TS-f_BN)+(f_T-f_B));
       ////////////////////////////////////////////////////////////////////////////////
-      //real f27_W    = (D27.f[DIR_P00   ])[ke   ];
-      //real f27_E    = (D27.f[DIR_M00   ])[kw   ];
-      //real f27_S    = (D27.f[DIR_0P0   ])[kn   ];
-      //real f27_N    = (D27.f[DIR_0M0   ])[ks   ];
-      //real f27_B    = (D27.f[DIR_00P   ])[kt   ];
-      //real f27_T    = (D27.f[DIR_00M   ])[kb   ];
-      //real f27_SW   = (D27.f[DIR_PP0  ])[kne  ];
-      //real f27_NE   = (D27.f[DIR_MM0  ])[ksw  ];
-      //real f27_NW   = (D27.f[DIR_PM0  ])[kse  ];
-      //real f27_SE   = (D27.f[DIR_MP0  ])[knw  ];
-      //real f27_BW   = (D27.f[DIR_P0P  ])[kte  ];
-      //real f27_TE   = (D27.f[DIR_M0M  ])[kbw  ];
-      //real f27_TW   = (D27.f[DIR_P0M  ])[kbe  ];
-      //real f27_BE   = (D27.f[DIR_M0P  ])[ktw  ];
-      //real f27_BS   = (D27.f[DIR_0PP  ])[ktn  ];
-      //real f27_TN   = (D27.f[DIR_0MM  ])[kbs  ];
-      //real f27_TS   = (D27.f[DIR_0PM  ])[kbn  ];
-      //real f27_BN   = (D27.f[DIR_0MP  ])[kts  ];
+      //real f27_W    = (D27.f[DIR_P00])[ke   ];
+      //real f27_E    = (D27.f[DIR_M00])[kw   ];
+      //real f27_S    = (D27.f[DIR_0P0])[kn   ];
+      //real f27_N    = (D27.f[DIR_0M0])[ks   ];
+      //real f27_B    = (D27.f[DIR_00P])[kt   ];
+      //real f27_T    = (D27.f[DIR_00M])[kb   ];
+      //real f27_SW   = (D27.f[DIR_PP0])[kne  ];
+      //real f27_NE   = (D27.f[DIR_MM0])[ksw  ];
+      //real f27_NW   = (D27.f[DIR_PM0])[kse  ];
+      //real f27_SE   = (D27.f[DIR_MP0])[knw  ];
+      //real f27_BW   = (D27.f[DIR_P0P])[kte  ];
+      //real f27_TE   = (D27.f[DIR_M0M])[kbw  ];
+      //real f27_TW   = (D27.f[DIR_P0M])[kbe  ];
+      //real f27_BE   = (D27.f[DIR_M0P])[ktw  ];
+      //real f27_BS   = (D27.f[DIR_0PP])[ktn  ];
+      //real f27_TN   = (D27.f[DIR_0MM])[kbs  ];
+      //real f27_TS   = (D27.f[DIR_0PM])[kbn  ];
+      //real f27_BN   = (D27.f[DIR_0MP])[kts  ];
       //real f27_ZERO = (D27.f[DIR_000])[kzero];
-      //real f27_BSW  = (D27.f[DIR_PPP ])[ktne ];
-      //real f27_BNE  = (D27.f[DIR_MMP ])[ktsw ];
-      //real f27_BNW  = (D27.f[DIR_PMP ])[ktse ];
-      //real f27_BSE  = (D27.f[DIR_MPP ])[ktnw ];
-      //real f27_TSW  = (D27.f[DIR_PPM ])[kbne ];
-      //real f27_TNE  = (D27.f[DIR_MMM ])[kbsw ];
-      //real f27_TNW  = (D27.f[DIR_PMM ])[kbse ];
-      //real f27_TSE  = (D27.f[DIR_MPM ])[kbnw ];
+      //real f27_BSW  = (D27.f[DIR_PPP])[ktne ];
+      //real f27_BNE  = (D27.f[DIR_MMP])[ktsw ];
+      //real f27_BNW  = (D27.f[DIR_PMP])[ktse ];
+      //real f27_BSE  = (D27.f[DIR_MPP])[ktnw ];
+      //real f27_TSW  = (D27.f[DIR_PPM])[kbne ];
+      //real f27_TNE  = (D27.f[DIR_MMM])[kbsw ];
+      //real f27_TNW  = (D27.f[DIR_PMM])[kbse ];
+      //real f27_TSE  = (D27.f[DIR_MPM])[kbnw ];
       ////////////////////////////////////////////////////////////////////////////////
       real cu_sq=c3o2*(vx1*vx1+vx2*vx2+vx3*vx3);
       ////////////////////////////////////////////////////////////////////////////////
@@ -6581,63 +6581,63 @@ __global__ void QADPressIncomp27(
       //////////////////////////////////////////////////////////////////////////
       if (isEvenTimestep==false)
       {
-         D27.f[DIR_P00   ] = &DD27[DIR_P00   *size_Mat];
-         D27.f[DIR_M00   ] = &DD27[DIR_M00   *size_Mat];
-         D27.f[DIR_0P0   ] = &DD27[DIR_0P0   *size_Mat];
-         D27.f[DIR_0M0   ] = &DD27[DIR_0M0   *size_Mat];
-         D27.f[DIR_00P   ] = &DD27[DIR_00P   *size_Mat];
-         D27.f[DIR_00M   ] = &DD27[DIR_00M   *size_Mat];
-         D27.f[DIR_PP0  ] = &DD27[DIR_PP0  *size_Mat];
-         D27.f[DIR_MM0  ] = &DD27[DIR_MM0  *size_Mat];
-         D27.f[DIR_PM0  ] = &DD27[DIR_PM0  *size_Mat];
-         D27.f[DIR_MP0  ] = &DD27[DIR_MP0  *size_Mat];
-         D27.f[DIR_P0P  ] = &DD27[DIR_P0P  *size_Mat];
-         D27.f[DIR_M0M  ] = &DD27[DIR_M0M  *size_Mat];
-         D27.f[DIR_P0M  ] = &DD27[DIR_P0M  *size_Mat];
-         D27.f[DIR_M0P  ] = &DD27[DIR_M0P  *size_Mat];
-         D27.f[DIR_0PP  ] = &DD27[DIR_0PP  *size_Mat];
-         D27.f[DIR_0MM  ] = &DD27[DIR_0MM  *size_Mat];
-         D27.f[DIR_0PM  ] = &DD27[DIR_0PM  *size_Mat];
-         D27.f[DIR_0MP  ] = &DD27[DIR_0MP  *size_Mat];
-         D27.f[DIR_000] = &DD27[DIR_000*size_Mat];
-         D27.f[DIR_PPP ] = &DD27[DIR_PPP *size_Mat];
-         D27.f[DIR_MMP ] = &DD27[DIR_MMP *size_Mat];
-         D27.f[DIR_PMP ] = &DD27[DIR_PMP *size_Mat];
-         D27.f[DIR_MPP ] = &DD27[DIR_MPP *size_Mat];
-         D27.f[DIR_PPM ] = &DD27[DIR_PPM *size_Mat];
-         D27.f[DIR_MMM ] = &DD27[DIR_MMM *size_Mat];
-         D27.f[DIR_PMM ] = &DD27[DIR_PMM *size_Mat];
-         D27.f[DIR_MPM ] = &DD27[DIR_MPM *size_Mat];
+         D27.f[DIR_P00] = &DD27[DIR_P00 * numberOfLBnodes];
+         D27.f[DIR_M00] = &DD27[DIR_M00 * numberOfLBnodes];
+         D27.f[DIR_0P0] = &DD27[DIR_0P0 * numberOfLBnodes];
+         D27.f[DIR_0M0] = &DD27[DIR_0M0 * numberOfLBnodes];
+         D27.f[DIR_00P] = &DD27[DIR_00P * numberOfLBnodes];
+         D27.f[DIR_00M] = &DD27[DIR_00M * numberOfLBnodes];
+         D27.f[DIR_PP0] = &DD27[DIR_PP0 * numberOfLBnodes];
+         D27.f[DIR_MM0] = &DD27[DIR_MM0 * numberOfLBnodes];
+         D27.f[DIR_PM0] = &DD27[DIR_PM0 * numberOfLBnodes];
+         D27.f[DIR_MP0] = &DD27[DIR_MP0 * numberOfLBnodes];
+         D27.f[DIR_P0P] = &DD27[DIR_P0P * numberOfLBnodes];
+         D27.f[DIR_M0M] = &DD27[DIR_M0M * numberOfLBnodes];
+         D27.f[DIR_P0M] = &DD27[DIR_P0M * numberOfLBnodes];
+         D27.f[DIR_M0P] = &DD27[DIR_M0P * numberOfLBnodes];
+         D27.f[DIR_0PP] = &DD27[DIR_0PP * numberOfLBnodes];
+         D27.f[DIR_0MM] = &DD27[DIR_0MM * numberOfLBnodes];
+         D27.f[DIR_0PM] = &DD27[DIR_0PM * numberOfLBnodes];
+         D27.f[DIR_0MP] = &DD27[DIR_0MP * numberOfLBnodes];
+         D27.f[DIR_000] = &DD27[DIR_000 * numberOfLBnodes];
+         D27.f[DIR_PPP] = &DD27[DIR_PPP * numberOfLBnodes];
+         D27.f[DIR_MMP] = &DD27[DIR_MMP * numberOfLBnodes];
+         D27.f[DIR_PMP] = &DD27[DIR_PMP * numberOfLBnodes];
+         D27.f[DIR_MPP] = &DD27[DIR_MPP * numberOfLBnodes];
+         D27.f[DIR_PPM] = &DD27[DIR_PPM * numberOfLBnodes];
+         D27.f[DIR_MMM] = &DD27[DIR_MMM * numberOfLBnodes];
+         D27.f[DIR_PMM] = &DD27[DIR_PMM * numberOfLBnodes];
+         D27.f[DIR_MPM] = &DD27[DIR_MPM * numberOfLBnodes];
       } 
       else
       {
-         D27.f[DIR_M00   ] = &DD27[DIR_P00   *size_Mat];
-         D27.f[DIR_P00   ] = &DD27[DIR_M00   *size_Mat];
-         D27.f[DIR_0M0   ] = &DD27[DIR_0P0   *size_Mat];
-         D27.f[DIR_0P0   ] = &DD27[DIR_0M0   *size_Mat];
-         D27.f[DIR_00M   ] = &DD27[DIR_00P   *size_Mat];
-         D27.f[DIR_00P   ] = &DD27[DIR_00M   *size_Mat];
-         D27.f[DIR_MM0  ] = &DD27[DIR_PP0  *size_Mat];
-         D27.f[DIR_PP0  ] = &DD27[DIR_MM0  *size_Mat];
-         D27.f[DIR_MP0  ] = &DD27[DIR_PM0  *size_Mat];
-         D27.f[DIR_PM0  ] = &DD27[DIR_MP0  *size_Mat];
-         D27.f[DIR_M0M  ] = &DD27[DIR_P0P  *size_Mat];
-         D27.f[DIR_P0P  ] = &DD27[DIR_M0M  *size_Mat];
-         D27.f[DIR_M0P  ] = &DD27[DIR_P0M  *size_Mat];
-         D27.f[DIR_P0M  ] = &DD27[DIR_M0P  *size_Mat];
-         D27.f[DIR_0MM  ] = &DD27[DIR_0PP  *size_Mat];
-         D27.f[DIR_0PP  ] = &DD27[DIR_0MM  *size_Mat];
-         D27.f[DIR_0MP  ] = &DD27[DIR_0PM  *size_Mat];
-         D27.f[DIR_0PM  ] = &DD27[DIR_0MP  *size_Mat];
-         D27.f[DIR_000] = &DD27[DIR_000*size_Mat];
-         D27.f[DIR_PPP ] = &DD27[DIR_MMM *size_Mat];
-         D27.f[DIR_MMP ] = &DD27[DIR_PPM *size_Mat];
-         D27.f[DIR_PMP ] = &DD27[DIR_MPM *size_Mat];
-         D27.f[DIR_MPP ] = &DD27[DIR_PMM *size_Mat];
-         D27.f[DIR_PPM ] = &DD27[DIR_MMP *size_Mat];
-         D27.f[DIR_MMM ] = &DD27[DIR_PPP *size_Mat];
-         D27.f[DIR_PMM ] = &DD27[DIR_MPP *size_Mat];
-         D27.f[DIR_MPM ] = &DD27[DIR_PMP *size_Mat];
+         D27.f[DIR_M00] = &DD27[DIR_P00 * numberOfLBnodes];
+         D27.f[DIR_P00] = &DD27[DIR_M00 * numberOfLBnodes];
+         D27.f[DIR_0M0] = &DD27[DIR_0P0 * numberOfLBnodes];
+         D27.f[DIR_0P0] = &DD27[DIR_0M0 * numberOfLBnodes];
+         D27.f[DIR_00M] = &DD27[DIR_00P * numberOfLBnodes];
+         D27.f[DIR_00P] = &DD27[DIR_00M * numberOfLBnodes];
+         D27.f[DIR_MM0] = &DD27[DIR_PP0 * numberOfLBnodes];
+         D27.f[DIR_PP0] = &DD27[DIR_MM0 * numberOfLBnodes];
+         D27.f[DIR_MP0] = &DD27[DIR_PM0 * numberOfLBnodes];
+         D27.f[DIR_PM0] = &DD27[DIR_MP0 * numberOfLBnodes];
+         D27.f[DIR_M0M] = &DD27[DIR_P0P * numberOfLBnodes];
+         D27.f[DIR_P0P] = &DD27[DIR_M0M * numberOfLBnodes];
+         D27.f[DIR_M0P] = &DD27[DIR_P0M * numberOfLBnodes];
+         D27.f[DIR_P0M] = &DD27[DIR_M0P * numberOfLBnodes];
+         D27.f[DIR_0MM] = &DD27[DIR_0PP * numberOfLBnodes];
+         D27.f[DIR_0PP] = &DD27[DIR_0MM * numberOfLBnodes];
+         D27.f[DIR_0MP] = &DD27[DIR_0PM * numberOfLBnodes];
+         D27.f[DIR_0PM] = &DD27[DIR_0MP * numberOfLBnodes];
+         D27.f[DIR_000] = &DD27[DIR_000 * numberOfLBnodes];
+         D27.f[DIR_PPP] = &DD27[DIR_MMM * numberOfLBnodes];
+         D27.f[DIR_MMP] = &DD27[DIR_PPM * numberOfLBnodes];
+         D27.f[DIR_PMP] = &DD27[DIR_MPM * numberOfLBnodes];
+         D27.f[DIR_MPP] = &DD27[DIR_PMM * numberOfLBnodes];
+         D27.f[DIR_PPM] = &DD27[DIR_MMP * numberOfLBnodes];
+         D27.f[DIR_MMM] = &DD27[DIR_PPP * numberOfLBnodes];
+         D27.f[DIR_PMM] = &DD27[DIR_MPP * numberOfLBnodes];
+         D27.f[DIR_MPM] = &DD27[DIR_PMP * numberOfLBnodes];
       }
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       //Test
@@ -6645,24 +6645,24 @@ __global__ void QADPressIncomp27(
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       real q;
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-      q = q_dirE[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M00  ])[kw  ]= -feqW27_W  + c2o1 * c2o27  * TempD;
-      q = q_dirW[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P00  ])[ke  ]= -feqW27_E  + c2o1 * c2o27  * TempD;
-      q = q_dirN[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0M0  ])[ks  ]= -feqW27_S  + c2o1 * c2o27  * TempD;
-      q = q_dirS[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0P0  ])[kn  ]= -feqW27_N  + c2o1 * c2o27  * TempD;
-      q = q_dirT[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_00M  ])[kb  ]= -feqW27_B  + c2o1 * c2o27  * TempD;
-      q = q_dirB[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_00P  ])[kt  ]= -feqW27_T  + c2o1 * c2o27  * TempD;
-      q = q_dirNE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MM0 ])[ksw ]= -feqW27_SW + c2o1 * c1o54  * TempD;
-      q = q_dirSW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PP0 ])[kne ]= -feqW27_NE + c2o1 * c1o54  * TempD;
-      q = q_dirSE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MP0 ])[knw ]= -feqW27_NW + c2o1 * c1o54  * TempD;
-      q = q_dirNW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PM0 ])[kse ]= -feqW27_SE + c2o1 * c1o54  * TempD;
-      q = q_dirTE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M0M ])[kbw ]= -feqW27_BW + c2o1 * c1o54  * TempD;
-      q = q_dirBW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P0P ])[kte ]= -feqW27_TE + c2o1 * c1o54  * TempD;
-      q = q_dirBE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M0P ])[ktw ]= -feqW27_TW + c2o1 * c1o54  * TempD;
-      q = q_dirTW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P0M ])[kbe ]= -feqW27_BE + c2o1 * c1o54  * TempD;
-      q = q_dirTN[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0MM ])[kbs ]= -feqW27_BS + c2o1 * c1o54  * TempD;
-      q = q_dirBS[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0PP ])[ktn ]= -feqW27_TN + c2o1 * c1o54  * TempD;
-      q = q_dirBN[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0MP ])[kts ]= -feqW27_TS + c2o1 * c1o54  * TempD;
-      q = q_dirTS[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0PM ])[kbn ]= -feqW27_BN + c2o1 * c1o54  * TempD;
+      q = q_dirE[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M00])[kw  ]= -feqW27_W  + c2o1 * c2o27  * TempD;
+      q = q_dirW[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P00])[ke  ]= -feqW27_E  + c2o1 * c2o27  * TempD;
+      q = q_dirN[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0M0])[ks  ]= -feqW27_S  + c2o1 * c2o27  * TempD;
+      q = q_dirS[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0P0])[kn  ]= -feqW27_N  + c2o1 * c2o27  * TempD;
+      q = q_dirT[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_00M])[kb  ]= -feqW27_B  + c2o1 * c2o27  * TempD;
+      q = q_dirB[k];   if (q>=c0o1 && q<=c1o1) (D27.f[DIR_00P])[kt  ]= -feqW27_T  + c2o1 * c2o27  * TempD;
+      q = q_dirNE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MM0])[ksw ]= -feqW27_SW + c2o1 * c1o54  * TempD;
+      q = q_dirSW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PP0])[kne ]= -feqW27_NE + c2o1 * c1o54  * TempD;
+      q = q_dirSE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MP0])[knw ]= -feqW27_NW + c2o1 * c1o54  * TempD;
+      q = q_dirNW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PM0])[kse ]= -feqW27_SE + c2o1 * c1o54  * TempD;
+      q = q_dirTE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M0M])[kbw ]= -feqW27_BW + c2o1 * c1o54  * TempD;
+      q = q_dirBW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P0P])[kte ]= -feqW27_TE + c2o1 * c1o54  * TempD;
+      q = q_dirBE[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_M0P])[ktw ]= -feqW27_TW + c2o1 * c1o54  * TempD;
+      q = q_dirTW[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_P0M])[kbe ]= -feqW27_BE + c2o1 * c1o54  * TempD;
+      q = q_dirTN[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0MM])[kbs ]= -feqW27_BS + c2o1 * c1o54  * TempD;
+      q = q_dirBS[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0PP])[ktn ]= -feqW27_TN + c2o1 * c1o54  * TempD;
+      q = q_dirBN[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0MP])[kts ]= -feqW27_TS + c2o1 * c1o54  * TempD;
+      q = q_dirTS[k];  if (q>=c0o1 && q<=c1o1) (D27.f[DIR_0PM])[kbn ]= -feqW27_BN + c2o1 * c1o54  * TempD;
       q = q_dirTNE[k]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MMM])[kbsw]= -feqW27_BSW+ c2o1 * c1o216 * TempD;
       q = q_dirBSW[k]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PPP])[ktne]= -feqW27_TNE+ c2o1 * c1o216 * TempD;
       q = q_dirBNE[k]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MMP])[ktsw]= -feqW27_TSW+ c2o1 * c1o216 * TempD;
@@ -6671,24 +6671,24 @@ __global__ void QADPressIncomp27(
       q = q_dirBNW[k]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PMP])[ktse]= -feqW27_TSE+ c2o1 * c1o216 * TempD;
       q = q_dirBSE[k]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_MPP])[ktnw]= -feqW27_TNW+ c2o1 * c1o216 * TempD;
       q = q_dirTNW[k]; if (q>=c0o1 && q<=c1o1) (D27.f[DIR_PMM])[kbse]= -feqW27_BSE+ c2o1 * c1o216 * TempD;
-      //q = q_dirE[k];   if (q>=zero && q<=one) (D27.f[DIR_M00  ])[kw  ]=(two*feqW27_W  -(f27_E  *(q*omegaD-one)-omegaD*feq27_E  *(q-one))/(omegaD-one)+f27_W  *q)/(q+one);
-      //q = q_dirW[k];   if (q>=zero && q<=one) (D27.f[DIR_P00  ])[ke  ]=(two*feqW27_E  -(f27_W  *(q*omegaD-one)-omegaD*feq27_W  *(q-one))/(omegaD-one)+f27_E  *q)/(q+one);
-      //q = q_dirN[k];   if (q>=zero && q<=one) (D27.f[DIR_0M0  ])[ks  ]=(two*feqW27_S  -(f27_N  *(q*omegaD-one)-omegaD*feq27_N  *(q-one))/(omegaD-one)+f27_S  *q)/(q+one);
-      //q = q_dirS[k];   if (q>=zero && q<=one) (D27.f[DIR_0P0  ])[kn  ]=(two*feqW27_N  -(f27_S  *(q*omegaD-one)-omegaD*feq27_S  *(q-one))/(omegaD-one)+f27_N  *q)/(q+one);
-      //q = q_dirT[k];   if (q>=zero && q<=one) (D27.f[DIR_00M  ])[kb  ]=(two*feqW27_B  -(f27_T  *(q*omegaD-one)-omegaD*feq27_T  *(q-one))/(omegaD-one)+f27_B  *q)/(q+one);
-      //q = q_dirB[k];   if (q>=zero && q<=one) (D27.f[DIR_00P  ])[kt  ]=(two*feqW27_T  -(f27_B  *(q*omegaD-one)-omegaD*feq27_B  *(q-one))/(omegaD-one)+f27_T  *q)/(q+one);
-      //q = q_dirNE[k];  if (q>=zero && q<=one) (D27.f[DIR_MM0 ])[ksw ]=(two*feqW27_SW -(f27_NE *(q*omegaD-one)-omegaD*feq27_NE *(q-one))/(omegaD-one)+f27_SW *q)/(q+one);
-      //q = q_dirSW[k];  if (q>=zero && q<=one) (D27.f[DIR_PP0 ])[kne ]=(two*feqW27_NE -(f27_SW *(q*omegaD-one)-omegaD*feq27_SW *(q-one))/(omegaD-one)+f27_NE *q)/(q+one);
-      //q = q_dirSE[k];  if (q>=zero && q<=one) (D27.f[DIR_MP0 ])[knw ]=(two*feqW27_NW -(f27_SE *(q*omegaD-one)-omegaD*feq27_SE *(q-one))/(omegaD-one)+f27_NW *q)/(q+one);
-      //q = q_dirNW[k];  if (q>=zero && q<=one) (D27.f[DIR_PM0 ])[kse ]=(two*feqW27_SE -(f27_NW *(q*omegaD-one)-omegaD*feq27_NW *(q-one))/(omegaD-one)+f27_SE *q)/(q+one);
-      //q = q_dirTE[k];  if (q>=zero && q<=one) (D27.f[DIR_M0M ])[kbw ]=(two*feqW27_BW -(f27_TE *(q*omegaD-one)-omegaD*feq27_TE *(q-one))/(omegaD-one)+f27_BW *q)/(q+one);
-      //q = q_dirBW[k];  if (q>=zero && q<=one) (D27.f[DIR_P0P ])[kte ]=(two*feqW27_TE -(f27_BW *(q*omegaD-one)-omegaD*feq27_BW *(q-one))/(omegaD-one)+f27_TE *q)/(q+one);
-      //q = q_dirBE[k];  if (q>=zero && q<=one) (D27.f[DIR_M0P ])[ktw ]=(two*feqW27_TW -(f27_BE *(q*omegaD-one)-omegaD*feq27_BE *(q-one))/(omegaD-one)+f27_TW *q)/(q+one);
-      //q = q_dirTW[k];  if (q>=zero && q<=one) (D27.f[DIR_P0M ])[kbe ]=(two*feqW27_BE -(f27_TW *(q*omegaD-one)-omegaD*feq27_TW *(q-one))/(omegaD-one)+f27_BE *q)/(q+one);
-      //q = q_dirTN[k];  if (q>=zero && q<=one) (D27.f[DIR_0MM ])[kbs ]=(two*feqW27_BS -(f27_TN *(q*omegaD-one)-omegaD*feq27_TN *(q-one))/(omegaD-one)+f27_BS *q)/(q+one);
-      //q = q_dirBS[k];  if (q>=zero && q<=one) (D27.f[DIR_0PP ])[ktn ]=(two*feqW27_TN -(f27_BS *(q*omegaD-one)-omegaD*feq27_BS *(q-one))/(omegaD-one)+f27_TN *q)/(q+one);
-      //q = q_dirBN[k];  if (q>=zero && q<=one) (D27.f[DIR_0MP ])[kts ]=(two*feqW27_TS -(f27_BN *(q*omegaD-one)-omegaD*feq27_BN *(q-one))/(omegaD-one)+f27_TS *q)/(q+one);
-      //q = q_dirTS[k];  if (q>=zero && q<=one) (D27.f[DIR_0PM ])[kbn ]=(two*feqW27_BN -(f27_TS *(q*omegaD-one)-omegaD*feq27_TS *(q-one))/(omegaD-one)+f27_BN *q)/(q+one);
+      //q = q_dirE[k];   if (q>=zero && q<=one) (D27.f[DIR_M00])[kw  ]=(two*feqW27_W  -(f27_E  *(q*omegaD-one)-omegaD*feq27_E  *(q-one))/(omegaD-one)+f27_W  *q)/(q+one);
+      //q = q_dirW[k];   if (q>=zero && q<=one) (D27.f[DIR_P00])[ke  ]=(two*feqW27_E  -(f27_W  *(q*omegaD-one)-omegaD*feq27_W  *(q-one))/(omegaD-one)+f27_E  *q)/(q+one);
+      //q = q_dirN[k];   if (q>=zero && q<=one) (D27.f[DIR_0M0])[ks  ]=(two*feqW27_S  -(f27_N  *(q*omegaD-one)-omegaD*feq27_N  *(q-one))/(omegaD-one)+f27_S  *q)/(q+one);
+      //q = q_dirS[k];   if (q>=zero && q<=one) (D27.f[DIR_0P0])[kn  ]=(two*feqW27_N  -(f27_S  *(q*omegaD-one)-omegaD*feq27_S  *(q-one))/(omegaD-one)+f27_N  *q)/(q+one);
+      //q = q_dirT[k];   if (q>=zero && q<=one) (D27.f[DIR_00M])[kb  ]=(two*feqW27_B  -(f27_T  *(q*omegaD-one)-omegaD*feq27_T  *(q-one))/(omegaD-one)+f27_B  *q)/(q+one);
+      //q = q_dirB[k];   if (q>=zero && q<=one) (D27.f[DIR_00P])[kt  ]=(two*feqW27_T  -(f27_B  *(q*omegaD-one)-omegaD*feq27_B  *(q-one))/(omegaD-one)+f27_T  *q)/(q+one);
+      //q = q_dirNE[k];  if (q>=zero && q<=one) (D27.f[DIR_MM0])[ksw ]=(two*feqW27_SW -(f27_NE *(q*omegaD-one)-omegaD*feq27_NE *(q-one))/(omegaD-one)+f27_SW *q)/(q+one);
+      //q = q_dirSW[k];  if (q>=zero && q<=one) (D27.f[DIR_PP0])[kne ]=(two*feqW27_NE -(f27_SW *(q*omegaD-one)-omegaD*feq27_SW *(q-one))/(omegaD-one)+f27_NE *q)/(q+one);
+      //q = q_dirSE[k];  if (q>=zero && q<=one) (D27.f[DIR_MP0])[knw ]=(two*feqW27_NW -(f27_SE *(q*omegaD-one)-omegaD*feq27_SE *(q-one))/(omegaD-one)+f27_NW *q)/(q+one);
+      //q = q_dirNW[k];  if (q>=zero && q<=one) (D27.f[DIR_PM0])[kse ]=(two*feqW27_SE -(f27_NW *(q*omegaD-one)-omegaD*feq27_NW *(q-one))/(omegaD-one)+f27_SE *q)/(q+one);
+      //q = q_dirTE[k];  if (q>=zero && q<=one) (D27.f[DIR_M0M])[kbw ]=(two*feqW27_BW -(f27_TE *(q*omegaD-one)-omegaD*feq27_TE *(q-one))/(omegaD-one)+f27_BW *q)/(q+one);
+      //q = q_dirBW[k];  if (q>=zero && q<=one) (D27.f[DIR_P0P])[kte ]=(two*feqW27_TE -(f27_BW *(q*omegaD-one)-omegaD*feq27_BW *(q-one))/(omegaD-one)+f27_TE *q)/(q+one);
+      //q = q_dirBE[k];  if (q>=zero && q<=one) (D27.f[DIR_M0P])[ktw ]=(two*feqW27_TW -(f27_BE *(q*omegaD-one)-omegaD*feq27_BE *(q-one))/(omegaD-one)+f27_TW *q)/(q+one);
+      //q = q_dirTW[k];  if (q>=zero && q<=one) (D27.f[DIR_P0M])[kbe ]=(two*feqW27_BE -(f27_TW *(q*omegaD-one)-omegaD*feq27_TW *(q-one))/(omegaD-one)+f27_BE *q)/(q+one);
+      //q = q_dirTN[k];  if (q>=zero && q<=one) (D27.f[DIR_0MM])[kbs ]=(two*feqW27_BS -(f27_TN *(q*omegaD-one)-omegaD*feq27_TN *(q-one))/(omegaD-one)+f27_BS *q)/(q+one);
+      //q = q_dirBS[k];  if (q>=zero && q<=one) (D27.f[DIR_0PP])[ktn ]=(two*feqW27_TN -(f27_BS *(q*omegaD-one)-omegaD*feq27_BS *(q-one))/(omegaD-one)+f27_TN *q)/(q+one);
+      //q = q_dirBN[k];  if (q>=zero && q<=one) (D27.f[DIR_0MP])[kts ]=(two*feqW27_TS -(f27_BN *(q*omegaD-one)-omegaD*feq27_BN *(q-one))/(omegaD-one)+f27_TS *q)/(q+one);
+      //q = q_dirTS[k];  if (q>=zero && q<=one) (D27.f[DIR_0PM])[kbn ]=(two*feqW27_BN -(f27_TS *(q*omegaD-one)-omegaD*feq27_TS *(q-one))/(omegaD-one)+f27_BN *q)/(q+one);
       //q = q_dirTNE[k]; if (q>=zero && q<=one) (D27.f[DIR_MMM])[kbsw]=(two*feqW27_BSW-(f27_TNE*(q*omegaD-one)-omegaD*feq27_TNE*(q-one))/(omegaD-one)+f27_BSW*q)/(q+one);
       //q = q_dirBSW[k]; if (q>=zero && q<=one) (D27.f[DIR_PPP])[ktne]=(two*feqW27_TNE-(f27_BSW*(q*omegaD-one)-omegaD*feq27_BSW*(q-one))/(omegaD-one)+f27_TNE*q)/(q+one);
       //q = q_dirBNE[k]; if (q>=zero && q<=one) (D27.f[DIR_MMP])[ktsw]=(two*feqW27_TSW-(f27_BNE*(q*omegaD-one)-omegaD*feq27_BNE*(q-one))/(omegaD-one)+f27_TSW*q)/(q+one);
@@ -6739,131 +6739,131 @@ __global__ void AD_SlipVelDeviceComp(
     uint* neighborX,
     uint* neighborY,
     uint* neighborZ,
-    uint size_Mat,
+    unsigned long long numberOfLBnodes,
     bool isEvenTimestep)
 {
     Distributions27 D;
     if (isEvenTimestep)
     {
-        D.f[DIR_P00   ] = &distributions[DIR_P00    * size_Mat];
-        D.f[DIR_M00   ] = &distributions[DIR_M00    * size_Mat];
-        D.f[DIR_0P0   ] = &distributions[DIR_0P0    * size_Mat];
-        D.f[DIR_0M0   ] = &distributions[DIR_0M0    * size_Mat];
-        D.f[DIR_00P   ] = &distributions[DIR_00P    * size_Mat];
-        D.f[DIR_00M   ] = &distributions[DIR_00M    * size_Mat];
-        D.f[DIR_PP0  ] = &distributions[DIR_PP0   * size_Mat];
-        D.f[DIR_MM0  ] = &distributions[DIR_MM0   * size_Mat];
-        D.f[DIR_PM0  ] = &distributions[DIR_PM0   * size_Mat];
-        D.f[DIR_MP0  ] = &distributions[DIR_MP0   * size_Mat];
-        D.f[DIR_P0P  ] = &distributions[DIR_P0P   * size_Mat];
-        D.f[DIR_M0M  ] = &distributions[DIR_M0M   * size_Mat];
-        D.f[DIR_P0M  ] = &distributions[DIR_P0M   * size_Mat];
-        D.f[DIR_M0P  ] = &distributions[DIR_M0P   * size_Mat];
-        D.f[DIR_0PP  ] = &distributions[DIR_0PP   * size_Mat];
-        D.f[DIR_0MM  ] = &distributions[DIR_0MM   * size_Mat];
-        D.f[DIR_0PM  ] = &distributions[DIR_0PM   * size_Mat];
-        D.f[DIR_0MP  ] = &distributions[DIR_0MP   * size_Mat];
-        D.f[DIR_000] = &distributions[DIR_000 * size_Mat];
-        D.f[DIR_PPP ] = &distributions[DIR_PPP  * size_Mat];
-        D.f[DIR_MMP ] = &distributions[DIR_MMP  * size_Mat];
-        D.f[DIR_PMP ] = &distributions[DIR_PMP  * size_Mat];
-        D.f[DIR_MPP ] = &distributions[DIR_MPP  * size_Mat];
-        D.f[DIR_PPM ] = &distributions[DIR_PPM  * size_Mat];
-        D.f[DIR_MMM ] = &distributions[DIR_MMM  * size_Mat];
-        D.f[DIR_PMM ] = &distributions[DIR_PMM  * size_Mat];
-        D.f[DIR_MPM ] = &distributions[DIR_MPM  * size_Mat];
+        D.f[DIR_P00] = &distributions[DIR_P00 * numberOfLBnodes];
+        D.f[DIR_M00] = &distributions[DIR_M00 * numberOfLBnodes];
+        D.f[DIR_0P0] = &distributions[DIR_0P0 * numberOfLBnodes];
+        D.f[DIR_0M0] = &distributions[DIR_0M0 * numberOfLBnodes];
+        D.f[DIR_00P] = &distributions[DIR_00P * numberOfLBnodes];
+        D.f[DIR_00M] = &distributions[DIR_00M * numberOfLBnodes];
+        D.f[DIR_PP0] = &distributions[DIR_PP0 * numberOfLBnodes];
+        D.f[DIR_MM0] = &distributions[DIR_MM0 * numberOfLBnodes];
+        D.f[DIR_PM0] = &distributions[DIR_PM0 * numberOfLBnodes];
+        D.f[DIR_MP0] = &distributions[DIR_MP0 * numberOfLBnodes];
+        D.f[DIR_P0P] = &distributions[DIR_P0P * numberOfLBnodes];
+        D.f[DIR_M0M] = &distributions[DIR_M0M * numberOfLBnodes];
+        D.f[DIR_P0M] = &distributions[DIR_P0M * numberOfLBnodes];
+        D.f[DIR_M0P] = &distributions[DIR_M0P * numberOfLBnodes];
+        D.f[DIR_0PP] = &distributions[DIR_0PP * numberOfLBnodes];
+        D.f[DIR_0MM] = &distributions[DIR_0MM * numberOfLBnodes];
+        D.f[DIR_0PM] = &distributions[DIR_0PM * numberOfLBnodes];
+        D.f[DIR_0MP] = &distributions[DIR_0MP * numberOfLBnodes];
+        D.f[DIR_000] = &distributions[DIR_000 * numberOfLBnodes];
+        D.f[DIR_PPP] = &distributions[DIR_PPP * numberOfLBnodes];
+        D.f[DIR_MMP] = &distributions[DIR_MMP * numberOfLBnodes];
+        D.f[DIR_PMP] = &distributions[DIR_PMP * numberOfLBnodes];
+        D.f[DIR_MPP] = &distributions[DIR_MPP * numberOfLBnodes];
+        D.f[DIR_PPM] = &distributions[DIR_PPM * numberOfLBnodes];
+        D.f[DIR_MMM] = &distributions[DIR_MMM * numberOfLBnodes];
+        D.f[DIR_PMM] = &distributions[DIR_PMM * numberOfLBnodes];
+        D.f[DIR_MPM] = &distributions[DIR_MPM * numberOfLBnodes];
     }
     else
     {
-        D.f[DIR_M00   ] = &distributions[DIR_P00    * size_Mat];
-        D.f[DIR_P00   ] = &distributions[DIR_M00    * size_Mat];
-        D.f[DIR_0M0   ] = &distributions[DIR_0P0    * size_Mat];
-        D.f[DIR_0P0   ] = &distributions[DIR_0M0    * size_Mat];
-        D.f[DIR_00M   ] = &distributions[DIR_00P    * size_Mat];
-        D.f[DIR_00P   ] = &distributions[DIR_00M    * size_Mat];
-        D.f[DIR_MM0  ] = &distributions[DIR_PP0   * size_Mat];
-        D.f[DIR_PP0  ] = &distributions[DIR_MM0   * size_Mat];
-        D.f[DIR_MP0  ] = &distributions[DIR_PM0   * size_Mat];
-        D.f[DIR_PM0  ] = &distributions[DIR_MP0   * size_Mat];
-        D.f[DIR_M0M  ] = &distributions[DIR_P0P   * size_Mat];
-        D.f[DIR_P0P  ] = &distributions[DIR_M0M   * size_Mat];
-        D.f[DIR_M0P  ] = &distributions[DIR_P0M   * size_Mat];
-        D.f[DIR_P0M  ] = &distributions[DIR_M0P   * size_Mat];
-        D.f[DIR_0MM  ] = &distributions[DIR_0PP   * size_Mat];
-        D.f[DIR_0PP  ] = &distributions[DIR_0MM   * size_Mat];
-        D.f[DIR_0MP  ] = &distributions[DIR_0PM   * size_Mat];
-        D.f[DIR_0PM  ] = &distributions[DIR_0MP   * size_Mat];
-        D.f[DIR_000] = &distributions[DIR_000 * size_Mat];
-        D.f[DIR_PPP ] = &distributions[DIR_MMM  * size_Mat];
-        D.f[DIR_MMP ] = &distributions[DIR_PPM  * size_Mat];
-        D.f[DIR_PMP ] = &distributions[DIR_MPM  * size_Mat];
-        D.f[DIR_MPP ] = &distributions[DIR_PMM  * size_Mat];
-        D.f[DIR_PPM ] = &distributions[DIR_MMP  * size_Mat];
-        D.f[DIR_MMM ] = &distributions[DIR_PPP  * size_Mat];
-        D.f[DIR_PMM ] = &distributions[DIR_MPP  * size_Mat];
-        D.f[DIR_MPM ] = &distributions[DIR_PMP  * size_Mat];
+        D.f[DIR_M00] = &distributions[DIR_P00 * numberOfLBnodes];
+        D.f[DIR_P00] = &distributions[DIR_M00 * numberOfLBnodes];
+        D.f[DIR_0M0] = &distributions[DIR_0P0 * numberOfLBnodes];
+        D.f[DIR_0P0] = &distributions[DIR_0M0 * numberOfLBnodes];
+        D.f[DIR_00M] = &distributions[DIR_00P * numberOfLBnodes];
+        D.f[DIR_00P] = &distributions[DIR_00M * numberOfLBnodes];
+        D.f[DIR_MM0] = &distributions[DIR_PP0 * numberOfLBnodes];
+        D.f[DIR_PP0] = &distributions[DIR_MM0 * numberOfLBnodes];
+        D.f[DIR_MP0] = &distributions[DIR_PM0 * numberOfLBnodes];
+        D.f[DIR_PM0] = &distributions[DIR_MP0 * numberOfLBnodes];
+        D.f[DIR_M0M] = &distributions[DIR_P0P * numberOfLBnodes];
+        D.f[DIR_P0P] = &distributions[DIR_M0M * numberOfLBnodes];
+        D.f[DIR_M0P] = &distributions[DIR_P0M * numberOfLBnodes];
+        D.f[DIR_P0M] = &distributions[DIR_M0P * numberOfLBnodes];
+        D.f[DIR_0MM] = &distributions[DIR_0PP * numberOfLBnodes];
+        D.f[DIR_0PP] = &distributions[DIR_0MM * numberOfLBnodes];
+        D.f[DIR_0MP] = &distributions[DIR_0PM * numberOfLBnodes];
+        D.f[DIR_0PM] = &distributions[DIR_0MP * numberOfLBnodes];
+        D.f[DIR_000] = &distributions[DIR_000 * numberOfLBnodes];
+        D.f[DIR_PPP] = &distributions[DIR_MMM * numberOfLBnodes];
+        D.f[DIR_MMP] = &distributions[DIR_PPM * numberOfLBnodes];
+        D.f[DIR_PMP] = &distributions[DIR_MPM * numberOfLBnodes];
+        D.f[DIR_MPP] = &distributions[DIR_PMM * numberOfLBnodes];
+        D.f[DIR_PPM] = &distributions[DIR_MMP * numberOfLBnodes];
+        D.f[DIR_MMM] = &distributions[DIR_PPP * numberOfLBnodes];
+        D.f[DIR_PMM] = &distributions[DIR_MPP * numberOfLBnodes];
+        D.f[DIR_MPM] = &distributions[DIR_PMP * numberOfLBnodes];
     }
     ////////////////////////////////////////////////////////////////////////////////
     Distributions27 DAD;
     if (isEvenTimestep)
     {
-        DAD.f[DIR_P00   ] = &distributionsAD[DIR_P00    * size_Mat];
-        DAD.f[DIR_M00   ] = &distributionsAD[DIR_M00    * size_Mat];
-        DAD.f[DIR_0P0   ] = &distributionsAD[DIR_0P0    * size_Mat];
-        DAD.f[DIR_0M0   ] = &distributionsAD[DIR_0M0    * size_Mat];
-        DAD.f[DIR_00P   ] = &distributionsAD[DIR_00P    * size_Mat];
-        DAD.f[DIR_00M   ] = &distributionsAD[DIR_00M    * size_Mat];
-        DAD.f[DIR_PP0  ] = &distributionsAD[DIR_PP0   * size_Mat];
-        DAD.f[DIR_MM0  ] = &distributionsAD[DIR_MM0   * size_Mat];
-        DAD.f[DIR_PM0  ] = &distributionsAD[DIR_PM0   * size_Mat];
-        DAD.f[DIR_MP0  ] = &distributionsAD[DIR_MP0   * size_Mat];
-        DAD.f[DIR_P0P  ] = &distributionsAD[DIR_P0P   * size_Mat];
-        DAD.f[DIR_M0M  ] = &distributionsAD[DIR_M0M   * size_Mat];
-        DAD.f[DIR_P0M  ] = &distributionsAD[DIR_P0M   * size_Mat];
-        DAD.f[DIR_M0P  ] = &distributionsAD[DIR_M0P   * size_Mat];
-        DAD.f[DIR_0PP  ] = &distributionsAD[DIR_0PP   * size_Mat];
-        DAD.f[DIR_0MM  ] = &distributionsAD[DIR_0MM   * size_Mat];
-        DAD.f[DIR_0PM  ] = &distributionsAD[DIR_0PM   * size_Mat];
-        DAD.f[DIR_0MP  ] = &distributionsAD[DIR_0MP   * size_Mat];
-        DAD.f[DIR_000] = &distributionsAD[DIR_000 * size_Mat];
-        DAD.f[DIR_PPP ] = &distributionsAD[DIR_PPP  * size_Mat];
-        DAD.f[DIR_MMP ] = &distributionsAD[DIR_MMP  * size_Mat];
-        DAD.f[DIR_PMP ] = &distributionsAD[DIR_PMP  * size_Mat];
-        DAD.f[DIR_MPP ] = &distributionsAD[DIR_MPP  * size_Mat];
-        DAD.f[DIR_PPM ] = &distributionsAD[DIR_PPM  * size_Mat];
-        DAD.f[DIR_MMM ] = &distributionsAD[DIR_MMM  * size_Mat];
-        DAD.f[DIR_PMM ] = &distributionsAD[DIR_PMM  * size_Mat];
-        DAD.f[DIR_MPM ] = &distributionsAD[DIR_MPM  * size_Mat];
+        DAD.f[DIR_P00] = &distributionsAD[DIR_P00 * numberOfLBnodes];
+        DAD.f[DIR_M00] = &distributionsAD[DIR_M00 * numberOfLBnodes];
+        DAD.f[DIR_0P0] = &distributionsAD[DIR_0P0 * numberOfLBnodes];
+        DAD.f[DIR_0M0] = &distributionsAD[DIR_0M0 * numberOfLBnodes];
+        DAD.f[DIR_00P] = &distributionsAD[DIR_00P * numberOfLBnodes];
+        DAD.f[DIR_00M] = &distributionsAD[DIR_00M * numberOfLBnodes];
+        DAD.f[DIR_PP0] = &distributionsAD[DIR_PP0 * numberOfLBnodes];
+        DAD.f[DIR_MM0] = &distributionsAD[DIR_MM0 * numberOfLBnodes];
+        DAD.f[DIR_PM0] = &distributionsAD[DIR_PM0 * numberOfLBnodes];
+        DAD.f[DIR_MP0] = &distributionsAD[DIR_MP0 * numberOfLBnodes];
+        DAD.f[DIR_P0P] = &distributionsAD[DIR_P0P * numberOfLBnodes];
+        DAD.f[DIR_M0M] = &distributionsAD[DIR_M0M * numberOfLBnodes];
+        DAD.f[DIR_P0M] = &distributionsAD[DIR_P0M * numberOfLBnodes];
+        DAD.f[DIR_M0P] = &distributionsAD[DIR_M0P * numberOfLBnodes];
+        DAD.f[DIR_0PP] = &distributionsAD[DIR_0PP * numberOfLBnodes];
+        DAD.f[DIR_0MM] = &distributionsAD[DIR_0MM * numberOfLBnodes];
+        DAD.f[DIR_0PM] = &distributionsAD[DIR_0PM * numberOfLBnodes];
+        DAD.f[DIR_0MP] = &distributionsAD[DIR_0MP * numberOfLBnodes];
+        DAD.f[DIR_000] = &distributionsAD[DIR_000 * numberOfLBnodes];
+        DAD.f[DIR_PPP] = &distributionsAD[DIR_PPP * numberOfLBnodes];
+        DAD.f[DIR_MMP] = &distributionsAD[DIR_MMP * numberOfLBnodes];
+        DAD.f[DIR_PMP] = &distributionsAD[DIR_PMP * numberOfLBnodes];
+        DAD.f[DIR_MPP] = &distributionsAD[DIR_MPP * numberOfLBnodes];
+        DAD.f[DIR_PPM] = &distributionsAD[DIR_PPM * numberOfLBnodes];
+        DAD.f[DIR_MMM] = &distributionsAD[DIR_MMM * numberOfLBnodes];
+        DAD.f[DIR_PMM] = &distributionsAD[DIR_PMM * numberOfLBnodes];
+        DAD.f[DIR_MPM] = &distributionsAD[DIR_MPM * numberOfLBnodes];
     }
     else
     {
-        DAD.f[DIR_M00   ] = &distributionsAD[DIR_P00    * size_Mat];
-        DAD.f[DIR_P00   ] = &distributionsAD[DIR_M00    * size_Mat];
-        DAD.f[DIR_0M0   ] = &distributionsAD[DIR_0P0    * size_Mat];
-        DAD.f[DIR_0P0   ] = &distributionsAD[DIR_0M0    * size_Mat];
-        DAD.f[DIR_00M   ] = &distributionsAD[DIR_00P    * size_Mat];
-        DAD.f[DIR_00P   ] = &distributionsAD[DIR_00M    * size_Mat];
-        DAD.f[DIR_MM0  ] = &distributionsAD[DIR_PP0   * size_Mat];
-        DAD.f[DIR_PP0  ] = &distributionsAD[DIR_MM0   * size_Mat];
-        DAD.f[DIR_MP0  ] = &distributionsAD[DIR_PM0   * size_Mat];
-        DAD.f[DIR_PM0  ] = &distributionsAD[DIR_MP0   * size_Mat];
-        DAD.f[DIR_M0M  ] = &distributionsAD[DIR_P0P   * size_Mat];
-        DAD.f[DIR_P0P  ] = &distributionsAD[DIR_M0M   * size_Mat];
-        DAD.f[DIR_M0P  ] = &distributionsAD[DIR_P0M   * size_Mat];
-        DAD.f[DIR_P0M  ] = &distributionsAD[DIR_M0P   * size_Mat];
-        DAD.f[DIR_0MM  ] = &distributionsAD[DIR_0PP   * size_Mat];
-        DAD.f[DIR_0PP  ] = &distributionsAD[DIR_0MM   * size_Mat];
-        DAD.f[DIR_0MP  ] = &distributionsAD[DIR_0PM   * size_Mat];
-        DAD.f[DIR_0PM  ] = &distributionsAD[DIR_0MP   * size_Mat];
-        DAD.f[DIR_000] = &distributionsAD[DIR_000 * size_Mat];
-        DAD.f[DIR_PPP ] = &distributionsAD[DIR_MMM  * size_Mat];
-        DAD.f[DIR_MMP ] = &distributionsAD[DIR_PPM  * size_Mat];
-        DAD.f[DIR_PMP ] = &distributionsAD[DIR_MPM  * size_Mat];
-        DAD.f[DIR_MPP ] = &distributionsAD[DIR_PMM  * size_Mat];
-        DAD.f[DIR_PPM ] = &distributionsAD[DIR_MMP  * size_Mat];
-        DAD.f[DIR_MMM ] = &distributionsAD[DIR_PPP  * size_Mat];
-        DAD.f[DIR_PMM ] = &distributionsAD[DIR_MPP  * size_Mat];
-        DAD.f[DIR_MPM ] = &distributionsAD[DIR_PMP  * size_Mat];
+        DAD.f[DIR_M00] = &distributionsAD[DIR_P00 * numberOfLBnodes];
+        DAD.f[DIR_P00] = &distributionsAD[DIR_M00 * numberOfLBnodes];
+        DAD.f[DIR_0M0] = &distributionsAD[DIR_0P0 * numberOfLBnodes];
+        DAD.f[DIR_0P0] = &distributionsAD[DIR_0M0 * numberOfLBnodes];
+        DAD.f[DIR_00M] = &distributionsAD[DIR_00P * numberOfLBnodes];
+        DAD.f[DIR_00P] = &distributionsAD[DIR_00M * numberOfLBnodes];
+        DAD.f[DIR_MM0] = &distributionsAD[DIR_PP0 * numberOfLBnodes];
+        DAD.f[DIR_PP0] = &distributionsAD[DIR_MM0 * numberOfLBnodes];
+        DAD.f[DIR_MP0] = &distributionsAD[DIR_PM0 * numberOfLBnodes];
+        DAD.f[DIR_PM0] = &distributionsAD[DIR_MP0 * numberOfLBnodes];
+        DAD.f[DIR_M0M] = &distributionsAD[DIR_P0P * numberOfLBnodes];
+        DAD.f[DIR_P0P] = &distributionsAD[DIR_M0M * numberOfLBnodes];
+        DAD.f[DIR_M0P] = &distributionsAD[DIR_P0M * numberOfLBnodes];
+        DAD.f[DIR_P0M] = &distributionsAD[DIR_M0P * numberOfLBnodes];
+        DAD.f[DIR_0MM] = &distributionsAD[DIR_0PP * numberOfLBnodes];
+        DAD.f[DIR_0PP] = &distributionsAD[DIR_0MM * numberOfLBnodes];
+        DAD.f[DIR_0MP] = &distributionsAD[DIR_0PM * numberOfLBnodes];
+        DAD.f[DIR_0PM] = &distributionsAD[DIR_0MP * numberOfLBnodes];
+        DAD.f[DIR_000] = &distributionsAD[DIR_000 * numberOfLBnodes];
+        DAD.f[DIR_PPP] = &distributionsAD[DIR_MMM * numberOfLBnodes];
+        DAD.f[DIR_MMP] = &distributionsAD[DIR_PPM * numberOfLBnodes];
+        DAD.f[DIR_PMP] = &distributionsAD[DIR_MPM * numberOfLBnodes];
+        DAD.f[DIR_MPP] = &distributionsAD[DIR_PMM * numberOfLBnodes];
+        DAD.f[DIR_PPM] = &distributionsAD[DIR_MMP * numberOfLBnodes];
+        DAD.f[DIR_MMM] = &distributionsAD[DIR_PPP * numberOfLBnodes];
+        DAD.f[DIR_PMM] = &distributionsAD[DIR_MPP * numberOfLBnodes];
+        DAD.f[DIR_MPM] = &distributionsAD[DIR_PMP * numberOfLBnodes];
     }
     ////////////////////////////////////////////////////////////////////////////////
     const unsigned  x = threadIdx.x;  // Globaler x-Index
@@ -6888,24 +6888,24 @@ __global__ void AD_SlipVelDeviceComp(
             * q_dirBE, * q_dirTW, * q_dirTN, * q_dirBS, * q_dirBN, * q_dirTS,
             * q_dirTNE, * q_dirTSW, * q_dirTSE, * q_dirTNW, * q_dirBNE, * q_dirBSW,
             * q_dirBSE, * q_dirBNW;
-        q_dirE   = &Qarrays[DIR_P00   * numberOfBCnodes];
-        q_dirW   = &Qarrays[DIR_M00   * numberOfBCnodes];
-        q_dirN   = &Qarrays[DIR_0P0   * numberOfBCnodes];
-        q_dirS   = &Qarrays[DIR_0M0   * numberOfBCnodes];
-        q_dirT   = &Qarrays[DIR_00P   * numberOfBCnodes];
-        q_dirB   = &Qarrays[DIR_00M   * numberOfBCnodes];
-        q_dirNE  = &Qarrays[DIR_PP0  * numberOfBCnodes];
-        q_dirSW  = &Qarrays[DIR_MM0  * numberOfBCnodes];
-        q_dirSE  = &Qarrays[DIR_PM0  * numberOfBCnodes];
-        q_dirNW  = &Qarrays[DIR_MP0  * numberOfBCnodes];
-        q_dirTE  = &Qarrays[DIR_P0P  * numberOfBCnodes];
-        q_dirBW  = &Qarrays[DIR_M0M  * numberOfBCnodes];
-        q_dirBE  = &Qarrays[DIR_P0M  * numberOfBCnodes];
-        q_dirTW  = &Qarrays[DIR_M0P  * numberOfBCnodes];
-        q_dirTN  = &Qarrays[DIR_0PP  * numberOfBCnodes];
-        q_dirBS  = &Qarrays[DIR_0MM  * numberOfBCnodes];
-        q_dirBN  = &Qarrays[DIR_0PM  * numberOfBCnodes];
-        q_dirTS  = &Qarrays[DIR_0MP  * numberOfBCnodes];
+        q_dirE   = &Qarrays[DIR_P00 * numberOfBCnodes];
+        q_dirW   = &Qarrays[DIR_M00 * numberOfBCnodes];
+        q_dirN   = &Qarrays[DIR_0P0 * numberOfBCnodes];
+        q_dirS   = &Qarrays[DIR_0M0 * numberOfBCnodes];
+        q_dirT   = &Qarrays[DIR_00P * numberOfBCnodes];
+        q_dirB   = &Qarrays[DIR_00M * numberOfBCnodes];
+        q_dirNE  = &Qarrays[DIR_PP0 * numberOfBCnodes];
+        q_dirSW  = &Qarrays[DIR_MM0 * numberOfBCnodes];
+        q_dirSE  = &Qarrays[DIR_PM0 * numberOfBCnodes];
+        q_dirNW  = &Qarrays[DIR_MP0 * numberOfBCnodes];
+        q_dirTE  = &Qarrays[DIR_P0P * numberOfBCnodes];
+        q_dirBW  = &Qarrays[DIR_M0M * numberOfBCnodes];
+        q_dirBE  = &Qarrays[DIR_P0M * numberOfBCnodes];
+        q_dirTW  = &Qarrays[DIR_M0P * numberOfBCnodes];
+        q_dirTN  = &Qarrays[DIR_0PP * numberOfBCnodes];
+        q_dirBS  = &Qarrays[DIR_0MM * numberOfBCnodes];
+        q_dirBN  = &Qarrays[DIR_0PM * numberOfBCnodes];
+        q_dirTS  = &Qarrays[DIR_0MP * numberOfBCnodes];
         q_dirTNE = &Qarrays[DIR_PPP * numberOfBCnodes];
         q_dirTSW = &Qarrays[DIR_MMP * numberOfBCnodes];
         q_dirTSE = &Qarrays[DIR_PMP * numberOfBCnodes];
@@ -7025,63 +7025,63 @@ __global__ void AD_SlipVelDeviceComp(
         //////////////////////////////////////////////////////////////////////////
         if (!isEvenTimestep)
         {
-            DAD.f[DIR_P00   ] = &distributionsAD[DIR_P00    * size_Mat];
-            DAD.f[DIR_M00   ] = &distributionsAD[DIR_M00    * size_Mat];
-            DAD.f[DIR_0P0   ] = &distributionsAD[DIR_0P0    * size_Mat];
-            DAD.f[DIR_0M0   ] = &distributionsAD[DIR_0M0    * size_Mat];
-            DAD.f[DIR_00P   ] = &distributionsAD[DIR_00P    * size_Mat];
-            DAD.f[DIR_00M   ] = &distributionsAD[DIR_00M    * size_Mat];
-            DAD.f[DIR_PP0  ] = &distributionsAD[DIR_PP0   * size_Mat];
-            DAD.f[DIR_MM0  ] = &distributionsAD[DIR_MM0   * size_Mat];
-            DAD.f[DIR_PM0  ] = &distributionsAD[DIR_PM0   * size_Mat];
-            DAD.f[DIR_MP0  ] = &distributionsAD[DIR_MP0   * size_Mat];
-            DAD.f[DIR_P0P  ] = &distributionsAD[DIR_P0P   * size_Mat];
-            DAD.f[DIR_M0M  ] = &distributionsAD[DIR_M0M   * size_Mat];
-            DAD.f[DIR_P0M  ] = &distributionsAD[DIR_P0M   * size_Mat];
-            DAD.f[DIR_M0P  ] = &distributionsAD[DIR_M0P   * size_Mat];
-            DAD.f[DIR_0PP  ] = &distributionsAD[DIR_0PP   * size_Mat];
-            DAD.f[DIR_0MM  ] = &distributionsAD[DIR_0MM   * size_Mat];
-            DAD.f[DIR_0PM  ] = &distributionsAD[DIR_0PM   * size_Mat];
-            DAD.f[DIR_0MP  ] = &distributionsAD[DIR_0MP   * size_Mat];
-            DAD.f[DIR_000] = &distributionsAD[DIR_000 * size_Mat];
-            DAD.f[DIR_PPP ] = &distributionsAD[DIR_PPP  * size_Mat];
-            DAD.f[DIR_MMP ] = &distributionsAD[DIR_MMP  * size_Mat];
-            DAD.f[DIR_PMP ] = &distributionsAD[DIR_PMP  * size_Mat];
-            DAD.f[DIR_MPP ] = &distributionsAD[DIR_MPP  * size_Mat];
-            DAD.f[DIR_PPM ] = &distributionsAD[DIR_PPM  * size_Mat];
-            DAD.f[DIR_MMM ] = &distributionsAD[DIR_MMM  * size_Mat];
-            DAD.f[DIR_PMM ] = &distributionsAD[DIR_PMM  * size_Mat];
-            DAD.f[DIR_MPM ] = &distributionsAD[DIR_MPM  * size_Mat];
+            DAD.f[DIR_P00] = &distributionsAD[DIR_P00 * numberOfLBnodes];
+            DAD.f[DIR_M00] = &distributionsAD[DIR_M00 * numberOfLBnodes];
+            DAD.f[DIR_0P0] = &distributionsAD[DIR_0P0 * numberOfLBnodes];
+            DAD.f[DIR_0M0] = &distributionsAD[DIR_0M0 * numberOfLBnodes];
+            DAD.f[DIR_00P] = &distributionsAD[DIR_00P * numberOfLBnodes];
+            DAD.f[DIR_00M] = &distributionsAD[DIR_00M * numberOfLBnodes];
+            DAD.f[DIR_PP0] = &distributionsAD[DIR_PP0 * numberOfLBnodes];
+            DAD.f[DIR_MM0] = &distributionsAD[DIR_MM0 * numberOfLBnodes];
+            DAD.f[DIR_PM0] = &distributionsAD[DIR_PM0 * numberOfLBnodes];
+            DAD.f[DIR_MP0] = &distributionsAD[DIR_MP0 * numberOfLBnodes];
+            DAD.f[DIR_P0P] = &distributionsAD[DIR_P0P * numberOfLBnodes];
+            DAD.f[DIR_M0M] = &distributionsAD[DIR_M0M * numberOfLBnodes];
+            DAD.f[DIR_P0M] = &distributionsAD[DIR_P0M * numberOfLBnodes];
+            DAD.f[DIR_M0P] = &distributionsAD[DIR_M0P * numberOfLBnodes];
+            DAD.f[DIR_0PP] = &distributionsAD[DIR_0PP * numberOfLBnodes];
+            DAD.f[DIR_0MM] = &distributionsAD[DIR_0MM * numberOfLBnodes];
+            DAD.f[DIR_0PM] = &distributionsAD[DIR_0PM * numberOfLBnodes];
+            DAD.f[DIR_0MP] = &distributionsAD[DIR_0MP * numberOfLBnodes];
+            DAD.f[DIR_000] = &distributionsAD[DIR_000 * numberOfLBnodes];
+            DAD.f[DIR_PPP] = &distributionsAD[DIR_PPP * numberOfLBnodes];
+            DAD.f[DIR_MMP] = &distributionsAD[DIR_MMP * numberOfLBnodes];
+            DAD.f[DIR_PMP] = &distributionsAD[DIR_PMP * numberOfLBnodes];
+            DAD.f[DIR_MPP] = &distributionsAD[DIR_MPP * numberOfLBnodes];
+            DAD.f[DIR_PPM] = &distributionsAD[DIR_PPM * numberOfLBnodes];
+            DAD.f[DIR_MMM] = &distributionsAD[DIR_MMM * numberOfLBnodes];
+            DAD.f[DIR_PMM] = &distributionsAD[DIR_PMM * numberOfLBnodes];
+            DAD.f[DIR_MPM] = &distributionsAD[DIR_MPM * numberOfLBnodes];
         }
         else
         {
-            DAD.f[DIR_M00   ] = &distributionsAD[DIR_P00    * size_Mat];
-            DAD.f[DIR_P00   ] = &distributionsAD[DIR_M00    * size_Mat];
-            DAD.f[DIR_0M0   ] = &distributionsAD[DIR_0P0    * size_Mat];
-            DAD.f[DIR_0P0   ] = &distributionsAD[DIR_0M0    * size_Mat];
-            DAD.f[DIR_00M   ] = &distributionsAD[DIR_00P    * size_Mat];
-            DAD.f[DIR_00P   ] = &distributionsAD[DIR_00M    * size_Mat];
-            DAD.f[DIR_MM0  ] = &distributionsAD[DIR_PP0   * size_Mat];
-            DAD.f[DIR_PP0  ] = &distributionsAD[DIR_MM0   * size_Mat];
-            DAD.f[DIR_MP0  ] = &distributionsAD[DIR_PM0   * size_Mat];
-            DAD.f[DIR_PM0  ] = &distributionsAD[DIR_MP0   * size_Mat];
-            DAD.f[DIR_M0M  ] = &distributionsAD[DIR_P0P   * size_Mat];
-            DAD.f[DIR_P0P  ] = &distributionsAD[DIR_M0M   * size_Mat];
-            DAD.f[DIR_M0P  ] = &distributionsAD[DIR_P0M   * size_Mat];
-            DAD.f[DIR_P0M  ] = &distributionsAD[DIR_M0P   * size_Mat];
-            DAD.f[DIR_0MM  ] = &distributionsAD[DIR_0PP   * size_Mat];
-            DAD.f[DIR_0PP  ] = &distributionsAD[DIR_0MM   * size_Mat];
-            DAD.f[DIR_0MP  ] = &distributionsAD[DIR_0PM   * size_Mat];
-            DAD.f[DIR_0PM  ] = &distributionsAD[DIR_0MP   * size_Mat];
-            DAD.f[DIR_000] = &distributionsAD[DIR_000 * size_Mat];
-            DAD.f[DIR_PPP ] = &distributionsAD[DIR_MMM  * size_Mat];
-            DAD.f[DIR_MMP ] = &distributionsAD[DIR_PPM  * size_Mat];
-            DAD.f[DIR_PMP ] = &distributionsAD[DIR_MPM  * size_Mat];
-            DAD.f[DIR_MPP ] = &distributionsAD[DIR_PMM  * size_Mat];
-            DAD.f[DIR_PPM ] = &distributionsAD[DIR_MMP  * size_Mat];
-            DAD.f[DIR_MMM ] = &distributionsAD[DIR_PPP  * size_Mat];
-            DAD.f[DIR_PMM ] = &distributionsAD[DIR_MPP  * size_Mat];
-            DAD.f[DIR_MPM ] = &distributionsAD[DIR_PMP  * size_Mat];
+            DAD.f[DIR_M00] = &distributionsAD[DIR_P00 * numberOfLBnodes];
+            DAD.f[DIR_P00] = &distributionsAD[DIR_M00 * numberOfLBnodes];
+            DAD.f[DIR_0M0] = &distributionsAD[DIR_0P0 * numberOfLBnodes];
+            DAD.f[DIR_0P0] = &distributionsAD[DIR_0M0 * numberOfLBnodes];
+            DAD.f[DIR_00M] = &distributionsAD[DIR_00P * numberOfLBnodes];
+            DAD.f[DIR_00P] = &distributionsAD[DIR_00M * numberOfLBnodes];
+            DAD.f[DIR_MM0] = &distributionsAD[DIR_PP0 * numberOfLBnodes];
+            DAD.f[DIR_PP0] = &distributionsAD[DIR_MM0 * numberOfLBnodes];
+            DAD.f[DIR_MP0] = &distributionsAD[DIR_PM0 * numberOfLBnodes];
+            DAD.f[DIR_PM0] = &distributionsAD[DIR_MP0 * numberOfLBnodes];
+            DAD.f[DIR_M0M] = &distributionsAD[DIR_P0P * numberOfLBnodes];
+            DAD.f[DIR_P0P] = &distributionsAD[DIR_M0M * numberOfLBnodes];
+            DAD.f[DIR_M0P] = &distributionsAD[DIR_P0M * numberOfLBnodes];
+            DAD.f[DIR_P0M] = &distributionsAD[DIR_M0P * numberOfLBnodes];
+            DAD.f[DIR_0MM] = &distributionsAD[DIR_0PP * numberOfLBnodes];
+            DAD.f[DIR_0PP] = &distributionsAD[DIR_0MM * numberOfLBnodes];
+            DAD.f[DIR_0MP] = &distributionsAD[DIR_0PM * numberOfLBnodes];
+            DAD.f[DIR_0PM] = &distributionsAD[DIR_0MP * numberOfLBnodes];
+            DAD.f[DIR_000] = &distributionsAD[DIR_000 * numberOfLBnodes];
+            DAD.f[DIR_PPP] = &distributionsAD[DIR_MMM * numberOfLBnodes];
+            DAD.f[DIR_MMP] = &distributionsAD[DIR_PPM * numberOfLBnodes];
+            DAD.f[DIR_PMP] = &distributionsAD[DIR_MPM * numberOfLBnodes];
+            DAD.f[DIR_MPP] = &distributionsAD[DIR_PMM * numberOfLBnodes];
+            DAD.f[DIR_PPM] = &distributionsAD[DIR_MMP * numberOfLBnodes];
+            DAD.f[DIR_MMM] = &distributionsAD[DIR_PPP * numberOfLBnodes];
+            DAD.f[DIR_PMM] = &distributionsAD[DIR_MPP * numberOfLBnodes];
+            DAD.f[DIR_MPM] = &distributionsAD[DIR_PMP * numberOfLBnodes];
         }
         ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
         real concentration =
@@ -7115,24 +7115,24 @@ __global__ void AD_SlipVelDeviceComp(
         real jTan3 = jx3 - NormJ * NormZ;
 
         ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-        q = q_dirE[k];   if (q >= c0o1 && q <= c1o1) { (DAD.f[DIR_M00  ])[kw  ] = calcDistributionBC_AD(q, c2o27,   vx1,         cu_sq, f_E,   f_W,   omegaDiffusivity,        jTan1,       concentration); }
-        q = q_dirW[k];   if (q >= c0o1 && q <= c1o1) { (DAD.f[DIR_P00  ])[ke  ] = calcDistributionBC_AD(q, c2o27,  -vx1,         cu_sq, f_W,   f_E,   omegaDiffusivity,       -jTan1,       concentration); }
-        q = q_dirN[k];   if (q >= c0o1 && q <= c1o1) { (DAD.f[DIR_0M0  ])[ks  ] = calcDistributionBC_AD(q, c2o27,   vx2,         cu_sq, f_N,   f_S,   omegaDiffusivity,        jTan2,       concentration); }
-        q = q_dirS[k];   if (q >= c0o1 && q <= c1o1) { (DAD.f[DIR_0P0  ])[kn  ] = calcDistributionBC_AD(q, c2o27,  -vx2,         cu_sq, f_S,   f_N,   omegaDiffusivity,       -jTan2,       concentration); }
-        q = q_dirT[k];   if (q >= c0o1 && q <= c1o1) { (DAD.f[DIR_00M  ])[kb  ] = calcDistributionBC_AD(q, c2o27,   vx3,         cu_sq, f_T,   f_B,   omegaDiffusivity,        jTan3,       concentration); }
-        q = q_dirB[k];   if (q >= c0o1 && q <= c1o1) { (DAD.f[DIR_00P  ])[kt  ] = calcDistributionBC_AD(q, c2o27,  -vx3,         cu_sq, f_B,   f_T,   omegaDiffusivity,       -jTan3,       concentration); }
-        q = q_dirNE[k];  if (q >= c0o1 && q <= c1o1) { (DAD.f[DIR_MM0 ])[ksw ] = calcDistributionBC_AD(q, c1o54,   vx1+vx2,     cu_sq, f_NE,  f_SW,  omegaDiffusivity,  jTan1+jTan2,       concentration); }
-        q = q_dirSW[k];  if (q >= c0o1 && q <= c1o1) { (DAD.f[DIR_PP0 ])[kne ] = calcDistributionBC_AD(q, c1o54,  -vx1-vx2,     cu_sq, f_SW,  f_NE,  omegaDiffusivity, -jTan1-jTan2,       concentration); }
-        q = q_dirSE[k];  if (q >= c0o1 && q <= c1o1) { (DAD.f[DIR_MP0 ])[knw ] = calcDistributionBC_AD(q, c1o54,   vx1-vx2,     cu_sq, f_SE,  f_NW,  omegaDiffusivity,  jTan1-jTan2,       concentration); }
-        q = q_dirNW[k];  if (q >= c0o1 && q <= c1o1) { (DAD.f[DIR_PM0 ])[kse ] = calcDistributionBC_AD(q, c1o54,  -vx1+vx2,     cu_sq, f_NW,  f_SE,  omegaDiffusivity, -jTan1+jTan2,       concentration); }
-        q = q_dirTE[k];  if (q >= c0o1 && q <= c1o1) { (DAD.f[DIR_M0M ])[kbw ] = calcDistributionBC_AD(q, c1o54,   vx1    +vx3, cu_sq, f_TE,  f_BW,  omegaDiffusivity,  jTan1      +jTan3, concentration); }
-        q = q_dirBW[k];  if (q >= c0o1 && q <= c1o1) { (DAD.f[DIR_P0P ])[kte ] = calcDistributionBC_AD(q, c1o54,  -vx1    -vx3, cu_sq, f_BW,  f_TE,  omegaDiffusivity, -jTan1      -jTan3, concentration); }
-        q = q_dirBE[k];  if (q >= c0o1 && q <= c1o1) { (DAD.f[DIR_M0P ])[ktw ] = calcDistributionBC_AD(q, c1o54,   vx1    -vx3, cu_sq, f_BE,  f_TW,  omegaDiffusivity,  jTan1      -jTan3, concentration); }
-        q = q_dirTW[k];  if (q >= c0o1 && q <= c1o1) { (DAD.f[DIR_P0M ])[kbe ] = calcDistributionBC_AD(q, c1o54,  -vx1    +vx3, cu_sq, f_TW,  f_BE,  omegaDiffusivity, -jTan1      +jTan3, concentration); }
-        q = q_dirTN[k];  if (q >= c0o1 && q <= c1o1) { (DAD.f[DIR_0MM ])[kbs ] = calcDistributionBC_AD(q, c1o54,       vx2+vx3, cu_sq, f_TN,  f_BS,  omegaDiffusivity,        jTan2+jTan3, concentration); }
-        q = q_dirBS[k];  if (q >= c0o1 && q <= c1o1) { (DAD.f[DIR_0PP ])[ktn ] = calcDistributionBC_AD(q, c1o54,      -vx2-vx3, cu_sq, f_BS,  f_TN,  omegaDiffusivity,       -jTan2-jTan3, concentration); }
-        q = q_dirBN[k];  if (q >= c0o1 && q <= c1o1) { (DAD.f[DIR_0MP ])[kts ] = calcDistributionBC_AD(q, c1o54,       vx2-vx3, cu_sq, f_BN,  f_TS,  omegaDiffusivity,        jTan2-jTan3, concentration); }
-        q = q_dirTS[k];  if (q >= c0o1 && q <= c1o1) { (DAD.f[DIR_0PM ])[kbn ] = calcDistributionBC_AD(q, c1o54,      -vx2+vx3, cu_sq, f_TS,  f_BN,  omegaDiffusivity,       -jTan2+jTan3, concentration); }
+        q = q_dirE[k];   if (q >= c0o1 && q <= c1o1) { (DAD.f[DIR_M00])[kw  ] = calcDistributionBC_AD(q, c2o27,   vx1,         cu_sq, f_E,   f_W,   omegaDiffusivity,        jTan1,       concentration); }
+        q = q_dirW[k];   if (q >= c0o1 && q <= c1o1) { (DAD.f[DIR_P00])[ke  ] = calcDistributionBC_AD(q, c2o27,  -vx1,         cu_sq, f_W,   f_E,   omegaDiffusivity,       -jTan1,       concentration); }
+        q = q_dirN[k];   if (q >= c0o1 && q <= c1o1) { (DAD.f[DIR_0M0])[ks  ] = calcDistributionBC_AD(q, c2o27,   vx2,         cu_sq, f_N,   f_S,   omegaDiffusivity,        jTan2,       concentration); }
+        q = q_dirS[k];   if (q >= c0o1 && q <= c1o1) { (DAD.f[DIR_0P0])[kn  ] = calcDistributionBC_AD(q, c2o27,  -vx2,         cu_sq, f_S,   f_N,   omegaDiffusivity,       -jTan2,       concentration); }
+        q = q_dirT[k];   if (q >= c0o1 && q <= c1o1) { (DAD.f[DIR_00M])[kb  ] = calcDistributionBC_AD(q, c2o27,   vx3,         cu_sq, f_T,   f_B,   omegaDiffusivity,        jTan3,       concentration); }
+        q = q_dirB[k];   if (q >= c0o1 && q <= c1o1) { (DAD.f[DIR_00P])[kt  ] = calcDistributionBC_AD(q, c2o27,  -vx3,         cu_sq, f_B,   f_T,   omegaDiffusivity,       -jTan3,       concentration); }
+        q = q_dirNE[k];  if (q >= c0o1 && q <= c1o1) { (DAD.f[DIR_MM0])[ksw ] = calcDistributionBC_AD(q, c1o54,   vx1+vx2,     cu_sq, f_NE,  f_SW,  omegaDiffusivity,  jTan1+jTan2,       concentration); }
+        q = q_dirSW[k];  if (q >= c0o1 && q <= c1o1) { (DAD.f[DIR_PP0])[kne ] = calcDistributionBC_AD(q, c1o54,  -vx1-vx2,     cu_sq, f_SW,  f_NE,  omegaDiffusivity, -jTan1-jTan2,       concentration); }
+        q = q_dirSE[k];  if (q >= c0o1 && q <= c1o1) { (DAD.f[DIR_MP0])[knw ] = calcDistributionBC_AD(q, c1o54,   vx1-vx2,     cu_sq, f_SE,  f_NW,  omegaDiffusivity,  jTan1-jTan2,       concentration); }
+        q = q_dirNW[k];  if (q >= c0o1 && q <= c1o1) { (DAD.f[DIR_PM0])[kse ] = calcDistributionBC_AD(q, c1o54,  -vx1+vx2,     cu_sq, f_NW,  f_SE,  omegaDiffusivity, -jTan1+jTan2,       concentration); }
+        q = q_dirTE[k];  if (q >= c0o1 && q <= c1o1) { (DAD.f[DIR_M0M])[kbw ] = calcDistributionBC_AD(q, c1o54,   vx1    +vx3, cu_sq, f_TE,  f_BW,  omegaDiffusivity,  jTan1      +jTan3, concentration); }
+        q = q_dirBW[k];  if (q >= c0o1 && q <= c1o1) { (DAD.f[DIR_P0P])[kte ] = calcDistributionBC_AD(q, c1o54,  -vx1    -vx3, cu_sq, f_BW,  f_TE,  omegaDiffusivity, -jTan1      -jTan3, concentration); }
+        q = q_dirBE[k];  if (q >= c0o1 && q <= c1o1) { (DAD.f[DIR_M0P])[ktw ] = calcDistributionBC_AD(q, c1o54,   vx1    -vx3, cu_sq, f_BE,  f_TW,  omegaDiffusivity,  jTan1      -jTan3, concentration); }
+        q = q_dirTW[k];  if (q >= c0o1 && q <= c1o1) { (DAD.f[DIR_P0M])[kbe ] = calcDistributionBC_AD(q, c1o54,  -vx1    +vx3, cu_sq, f_TW,  f_BE,  omegaDiffusivity, -jTan1      +jTan3, concentration); }
+        q = q_dirTN[k];  if (q >= c0o1 && q <= c1o1) { (DAD.f[DIR_0MM])[kbs ] = calcDistributionBC_AD(q, c1o54,       vx2+vx3, cu_sq, f_TN,  f_BS,  omegaDiffusivity,        jTan2+jTan3, concentration); }
+        q = q_dirBS[k];  if (q >= c0o1 && q <= c1o1) { (DAD.f[DIR_0PP])[ktn ] = calcDistributionBC_AD(q, c1o54,      -vx2-vx3, cu_sq, f_BS,  f_TN,  omegaDiffusivity,       -jTan2-jTan3, concentration); }
+        q = q_dirBN[k];  if (q >= c0o1 && q <= c1o1) { (DAD.f[DIR_0MP])[kts ] = calcDistributionBC_AD(q, c1o54,       vx2-vx3, cu_sq, f_BN,  f_TS,  omegaDiffusivity,        jTan2-jTan3, concentration); }
+        q = q_dirTS[k];  if (q >= c0o1 && q <= c1o1) { (DAD.f[DIR_0PM])[kbn ] = calcDistributionBC_AD(q, c1o54,      -vx2+vx3, cu_sq, f_TS,  f_BN,  omegaDiffusivity,       -jTan2+jTan3, concentration); }
         q = q_dirTNE[k]; if (q >= c0o1 && q <= c1o1) { (DAD.f[DIR_MMM])[kbsw] = calcDistributionBC_AD(q, c1o216,  vx1+vx2+vx3, cu_sq, f_TNE, f_BSW, omegaDiffusivity,  jTan1+jTan2+jTan3, concentration); }
         q = q_dirBSW[k]; if (q >= c0o1 && q <= c1o1) { (DAD.f[DIR_PPP])[ktne] = calcDistributionBC_AD(q, c1o216, -vx1-vx2-vx3, cu_sq, f_BSW, f_TNE, omegaDiffusivity, -jTan1-jTan2-jTan3, concentration); }
         q = q_dirBNE[k]; if (q >= c0o1 && q <= c1o1) { (DAD.f[DIR_MMP])[ktsw] = calcDistributionBC_AD(q, c1o216,  vx1+vx2-vx3, cu_sq, f_BNE, f_TSW, omegaDiffusivity,  jTan1+jTan2-jTan3, concentration); }
diff --git a/src/gpu/VirtualFluids_GPU/GPU/CP27.cu b/src/gpu/VirtualFluids_GPU/GPU/CP27.cu
index 1ef111330c0d4293c14d66893847689ad8fac77f..8d02f4e1c110fc82b65adda4db67976f29796d07 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/CP27.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/CP27.cu
@@ -14,69 +14,69 @@ __global__ void CalcCP27(real* DD,
 									unsigned int* neighborX,
 									unsigned int* neighborY,
 									unsigned int* neighborZ,
-									unsigned int size_Mat, 
+									unsigned long long numberOfLBnodes, 
 									bool isEvenTimestep)
 {
 	Distributions27 D;
 	if (isEvenTimestep==true)
 	{
-		D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-		D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-		D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-		D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-		D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-		D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-		D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-		D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-		D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-		D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-		D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-		D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-		D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-		D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-		D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-		D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-		D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-		D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-		D.f[DIR_000] = &DD[DIR_000*size_Mat];
-		D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-		D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-		D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-		D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-		D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-		D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-		D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-		D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+		D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+		D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+		D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+		D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+		D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+		D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+		D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+		D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+		D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+		D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+		D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+		D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+		D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+		D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+		D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+		D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+		D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+		D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+		D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+		D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+		D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+		D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+		D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+		D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+		D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+		D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+		D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
 	} 
 	else
 	{
-		D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-		D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-		D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-		D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-		D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-		D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-		D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-		D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-		D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-		D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-		D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-		D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-		D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-		D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-		D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-		D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-		D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-		D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-		D.f[DIR_000] = &DD[DIR_000*size_Mat];
-		D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-		D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-		D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-		D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-		D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-		D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-		D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-		D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+		D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+		D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+		D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+		D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+		D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+		D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+		D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+		D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+		D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+		D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+		D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+		D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+		D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+		D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+		D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+		D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+		D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+		D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+		D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+		D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+		D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+		D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+		D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+		D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+		D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+		D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+		D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
 	}
 	////////////////////////////////////////////////////////////////////////////////
 	const unsigned  x = threadIdx.x;  // Globaler x-Index 
@@ -124,20 +124,20 @@ __global__ void CalcCP27(real* DD,
 		////////////////////////////////////////////////////////////////////////////////
 		double PressCP;
 
-		PressCP  =   (D.f[DIR_P00   ])[ke  ]+ (D.f[DIR_M00   ])[kw  ]+ 
-                     (D.f[DIR_0P0   ])[kn  ]+ (D.f[DIR_0M0   ])[ks  ]+
-                     (D.f[DIR_00P   ])[kt  ]+ (D.f[DIR_00M   ])[kb  ]+
-                     (D.f[DIR_PP0  ])[kne ]+ (D.f[DIR_MM0  ])[ksw ]+
-                     (D.f[DIR_PM0  ])[kse ]+ (D.f[DIR_MP0  ])[knw ]+
-                     (D.f[DIR_P0P  ])[kte ]+ (D.f[DIR_M0M  ])[kbw ]+
-                     (D.f[DIR_P0M  ])[kbe ]+ (D.f[DIR_M0P  ])[ktw ]+
-                     (D.f[DIR_0PP  ])[ktn ]+ (D.f[DIR_0MM  ])[kbs ]+
-                     (D.f[DIR_0PM  ])[kbn ]+ (D.f[DIR_0MP  ])[kts ]+
+		PressCP  =   (D.f[DIR_P00])[ke  ]+ (D.f[DIR_M00])[kw  ]+ 
+                     (D.f[DIR_0P0])[kn  ]+ (D.f[DIR_0M0])[ks  ]+
+                     (D.f[DIR_00P])[kt  ]+ (D.f[DIR_00M])[kb  ]+
+                     (D.f[DIR_PP0])[kne ]+ (D.f[DIR_MM0])[ksw ]+
+                     (D.f[DIR_PM0])[kse ]+ (D.f[DIR_MP0])[knw ]+
+                     (D.f[DIR_P0P])[kte ]+ (D.f[DIR_M0M])[kbw ]+
+                     (D.f[DIR_P0M])[kbe ]+ (D.f[DIR_M0P])[ktw ]+
+                     (D.f[DIR_0PP])[ktn ]+ (D.f[DIR_0MM])[kbs ]+
+                     (D.f[DIR_0PM])[kbn ]+ (D.f[DIR_0MP])[kts ]+
                      (D.f[DIR_000])[kzero]+ 
-                     (D.f[DIR_PPP ])[ktne]+ (D.f[DIR_MMP ])[ktsw]+ 
-                     (D.f[DIR_PMP ])[ktse]+ (D.f[DIR_MPP ])[ktnw]+ 
-                     (D.f[DIR_PPM ])[kbne]+ (D.f[DIR_MMM ])[kbsw]+ 
-                     (D.f[DIR_PMM ])[kbse]+ (D.f[DIR_MPM ])[kbnw];
+                     (D.f[DIR_PPP])[ktne]+ (D.f[DIR_MMP])[ktsw]+ 
+                     (D.f[DIR_PMP])[ktse]+ (D.f[DIR_MPP])[ktnw]+ 
+                     (D.f[DIR_PPM])[kbne]+ (D.f[DIR_MMM])[kbsw]+ 
+                     (D.f[DIR_PMM])[kbse]+ (D.f[DIR_MPM])[kbnw];
 		////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 		cpPress[k] = PressCP;
 		////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/src/gpu/VirtualFluids_GPU/GPU/Calc2ndMoments27.cu b/src/gpu/VirtualFluids_GPU/GPU/Calc2ndMoments27.cu
index ce8fe68cd6a2e8f09f150cb0ccdec502a6278b50..c41751dc1b5cea53983d94d9cc7c3c75c8a84101 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/Calc2ndMoments27.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/Calc2ndMoments27.cu
@@ -16,70 +16,70 @@ __global__ void LBCalc2ndMomentsIncompSP27(  real* kxyFromfcNEQ,
 														unsigned int* neighborX,
 														unsigned int* neighborY,
 														unsigned int* neighborZ,
-														unsigned int size_Mat,
+														unsigned long long numberOfLBnodes,
 														real* DD,
 														bool isEvenTimestep)
 {
    Distributions27 D;
    if (isEvenTimestep==true)
    {
-      D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+      D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
    } 
    else
    {
-      D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+      D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
    }
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  x = threadIdx.x;  // Globaler x-Index 
@@ -92,7 +92,7 @@ __global__ void LBCalc2ndMomentsIncompSP27(  real* kxyFromfcNEQ,
    const unsigned k = nx*(ny*z + y) + x;
    //////////////////////////////////////////////////////////////////////////
 
-   if(k < size_Mat)
+   if(k < numberOfLBnodes)
    {
       //////////////////////////////////////////////////////////////////////////
       //index
@@ -125,33 +125,33 @@ __global__ void LBCalc2ndMomentsIncompSP27(  real* kxyFromfcNEQ,
       unsigned int kbsw = neighborZ[ksw];
       //////////////////////////////////////////////////////////////////////////
       real        f_E,f_W,f_N,f_S,f_T,f_B,f_NE,f_SW,f_SE,f_NW,f_TE,f_BW,f_BE,f_TW,f_TN,f_BS,f_BN,f_TS,/*f_ZERO,*/f_TNE, f_TSW, f_TSE, f_TNW, f_BNE, f_BSW, f_BSE, f_BNW;
-	  f_E    = (D.f[DIR_P00   ])[ke   ];
-	  f_W    = (D.f[DIR_M00   ])[kw   ];
-	  f_N    = (D.f[DIR_0P0   ])[kn   ];
-	  f_S    = (D.f[DIR_0M0   ])[ks   ];
-	  f_T    = (D.f[DIR_00P   ])[kt   ];
-	  f_B    = (D.f[DIR_00M   ])[kb   ];
-	  f_NE   = (D.f[DIR_PP0  ])[kne  ];
-	  f_SW   = (D.f[DIR_MM0  ])[ksw  ];
-	  f_SE   = (D.f[DIR_PM0  ])[kse  ];
-	  f_NW   = (D.f[DIR_MP0  ])[knw  ];
-	  f_TE   = (D.f[DIR_P0P  ])[kte  ];
-	  f_BW   = (D.f[DIR_M0M  ])[kbw  ];
-	  f_BE   = (D.f[DIR_P0M  ])[kbe  ];
-	  f_TW   = (D.f[DIR_M0P  ])[ktw  ];
-	  f_TN   = (D.f[DIR_0PP  ])[ktn  ];
-	  f_BS   = (D.f[DIR_0MM  ])[kbs  ];
-	  f_BN   = (D.f[DIR_0PM  ])[kbn  ];
-	  f_TS   = (D.f[DIR_0MP  ])[kts  ];
+	  f_E    = (D.f[DIR_P00])[ke   ];
+	  f_W    = (D.f[DIR_M00])[kw   ];
+	  f_N    = (D.f[DIR_0P0])[kn   ];
+	  f_S    = (D.f[DIR_0M0])[ks   ];
+	  f_T    = (D.f[DIR_00P])[kt   ];
+	  f_B    = (D.f[DIR_00M])[kb   ];
+	  f_NE   = (D.f[DIR_PP0])[kne  ];
+	  f_SW   = (D.f[DIR_MM0])[ksw  ];
+	  f_SE   = (D.f[DIR_PM0])[kse  ];
+	  f_NW   = (D.f[DIR_MP0])[knw  ];
+	  f_TE   = (D.f[DIR_P0P])[kte  ];
+	  f_BW   = (D.f[DIR_M0M])[kbw  ];
+	  f_BE   = (D.f[DIR_P0M])[kbe  ];
+	  f_TW   = (D.f[DIR_M0P])[ktw  ];
+	  f_TN   = (D.f[DIR_0PP])[ktn  ];
+	  f_BS   = (D.f[DIR_0MM])[kbs  ];
+	  f_BN   = (D.f[DIR_0PM])[kbn  ];
+	  f_TS   = (D.f[DIR_0MP])[kts  ];
 	  //f_ZERO = (D.f[DIR_000])[kzero];
-	  f_TNE  = (D.f[DIR_PPP ])[ktne ];
-	  f_TSW  = (D.f[DIR_MMP ])[ktsw ];
-	  f_TSE  = (D.f[DIR_PMP ])[ktse ];
-	  f_TNW  = (D.f[DIR_MPP ])[ktnw ];
-	  f_BNE  = (D.f[DIR_PPM ])[kbne ];
-	  f_BSW  = (D.f[DIR_MMM ])[kbsw ];
-	  f_BSE  = (D.f[DIR_PMM ])[kbse ];
-	  f_BNW  = (D.f[DIR_MPM ])[kbnw ];
+	  f_TNE  = (D.f[DIR_PPP])[ktne ];
+	  f_TSW  = (D.f[DIR_MMP])[ktsw ];
+	  f_TSE  = (D.f[DIR_PMP])[ktse ];
+	  f_TNW  = (D.f[DIR_MPP])[ktnw ];
+	  f_BNE  = (D.f[DIR_PPM])[kbne ];
+	  f_BSW  = (D.f[DIR_MMM])[kbsw ];
+	  f_BSE  = (D.f[DIR_PMM])[kbse ];
+	  f_BNW  = (D.f[DIR_MPM])[kbnw ];
       //////////////////////////////////////////////////////////////////////////
 	  real vx1, vx2, vx3;
       kxyFromfcNEQ[k]       = c0o1;
@@ -215,70 +215,70 @@ __global__ void LBCalc2ndMomentsCompSP27(real* kxyFromfcNEQ,
 													unsigned int* neighborX,
 													unsigned int* neighborY,
 													unsigned int* neighborZ,
-													unsigned int size_Mat,
+													unsigned long long numberOfLBnodes,
 													real* DD,
 													bool isEvenTimestep)
 {
    Distributions27 D;
    if (isEvenTimestep==true)
    {
-      D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+      D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
    } 
    else
    {
-      D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+      D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
    }
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  x = threadIdx.x;  // Globaler x-Index 
@@ -291,7 +291,7 @@ __global__ void LBCalc2ndMomentsCompSP27(real* kxyFromfcNEQ,
    const unsigned k = nx*(ny*z + y) + x;
    //////////////////////////////////////////////////////////////////////////
 
-   if(k < size_Mat)
+   if(k < numberOfLBnodes)
    {
       //////////////////////////////////////////////////////////////////////////
       //index
@@ -325,33 +325,33 @@ __global__ void LBCalc2ndMomentsCompSP27(real* kxyFromfcNEQ,
       //////////////////////////////////////////////////////////////////////////
       real f_ZERO;
       real        f_E,f_W,f_N,f_S,f_T,f_B,f_NE,f_SW,f_SE,f_NW,f_TE,f_BW,f_BE,f_TW,f_TN,f_BS,f_BN,f_TS,f_TNE, f_TSW, f_TSE, f_TNW, f_BNE, f_BSW, f_BSE, f_BNW;
-	  f_E    = (D.f[DIR_P00   ])[ke   ];
-	  f_W    = (D.f[DIR_M00   ])[kw   ];
-	  f_N    = (D.f[DIR_0P0   ])[kn   ];
-	  f_S    = (D.f[DIR_0M0   ])[ks   ];
-	  f_T    = (D.f[DIR_00P   ])[kt   ];
-	  f_B    = (D.f[DIR_00M   ])[kb   ];
-	  f_NE   = (D.f[DIR_PP0  ])[kne  ];
-	  f_SW   = (D.f[DIR_MM0  ])[ksw  ];
-	  f_SE   = (D.f[DIR_PM0  ])[kse  ];
-	  f_NW   = (D.f[DIR_MP0  ])[knw  ];
-	  f_TE   = (D.f[DIR_P0P  ])[kte  ];
-	  f_BW   = (D.f[DIR_M0M  ])[kbw  ];
-	  f_BE   = (D.f[DIR_P0M  ])[kbe  ];
-	  f_TW   = (D.f[DIR_M0P  ])[ktw  ];
-	  f_TN   = (D.f[DIR_0PP  ])[ktn  ];
-	  f_BS   = (D.f[DIR_0MM  ])[kbs  ];
-	  f_BN   = (D.f[DIR_0PM  ])[kbn  ];
-	  f_TS   = (D.f[DIR_0MP  ])[kts  ];
+	  f_E    = (D.f[DIR_P00])[ke   ];
+	  f_W    = (D.f[DIR_M00])[kw   ];
+	  f_N    = (D.f[DIR_0P0])[kn   ];
+	  f_S    = (D.f[DIR_0M0])[ks   ];
+	  f_T    = (D.f[DIR_00P])[kt   ];
+	  f_B    = (D.f[DIR_00M])[kb   ];
+	  f_NE   = (D.f[DIR_PP0])[kne  ];
+	  f_SW   = (D.f[DIR_MM0])[ksw  ];
+	  f_SE   = (D.f[DIR_PM0])[kse  ];
+	  f_NW   = (D.f[DIR_MP0])[knw  ];
+	  f_TE   = (D.f[DIR_P0P])[kte  ];
+	  f_BW   = (D.f[DIR_M0M])[kbw  ];
+	  f_BE   = (D.f[DIR_P0M])[kbe  ];
+	  f_TW   = (D.f[DIR_M0P])[ktw  ];
+	  f_TN   = (D.f[DIR_0PP])[ktn  ];
+	  f_BS   = (D.f[DIR_0MM])[kbs  ];
+	  f_BN   = (D.f[DIR_0PM])[kbn  ];
+	  f_TS   = (D.f[DIR_0MP])[kts  ];
 	  f_ZERO = (D.f[DIR_000])[kzero];
-	  f_TNE  = (D.f[DIR_PPP ])[ktne ];
-	  f_TSW  = (D.f[DIR_MMP ])[ktsw ];
-	  f_TSE  = (D.f[DIR_PMP ])[ktse ];
-	  f_TNW  = (D.f[DIR_MPP ])[ktnw ];
-	  f_BNE  = (D.f[DIR_PPM ])[kbne ];
-	  f_BSW  = (D.f[DIR_MMM ])[kbsw ];
-	  f_BSE  = (D.f[DIR_PMM ])[kbse ];
-	  f_BNW  = (D.f[DIR_MPM ])[kbnw ];
+	  f_TNE  = (D.f[DIR_PPP])[ktne ];
+	  f_TSW  = (D.f[DIR_MMP])[ktsw ];
+	  f_TSE  = (D.f[DIR_PMP])[ktse ];
+	  f_TNW  = (D.f[DIR_MPP])[ktnw ];
+	  f_BNE  = (D.f[DIR_PPM])[kbne ];
+	  f_BSW  = (D.f[DIR_MMM])[kbsw ];
+	  f_BSE  = (D.f[DIR_PMM])[kbse ];
+	  f_BNW  = (D.f[DIR_MPM])[kbnw ];
       //////////////////////////////////////////////////////////////////////////
 	  real drho;
 	  real vx1, vx2, vx3, rho;
@@ -423,7 +423,7 @@ __global__ void LBCalc3rdMomentsIncompSP27(  real* CUMbbb,
 														unsigned int* neighborY,
 														unsigned int* neighborZ,
 														real* DDStart,
-														int size_Mat,
+														unsigned long long numberOfLBnodes,
 														bool EvenOrOdd)
 {
 	////////////////////////////////////////////////////////////////////////////////
@@ -437,7 +437,7 @@ __global__ void LBCalc3rdMomentsIncompSP27(  real* CUMbbb,
 	const unsigned k = nx*(ny*z + y) + x;
 	//////////////////////////////////////////////////////////////////////////
 
-	if(k<size_Mat)
+	if(k<numberOfLBnodes)
 	{
 		////////////////////////////////////////////////////////////////////////////////
 		unsigned int BC;
@@ -448,63 +448,63 @@ __global__ void LBCalc3rdMomentsIncompSP27(  real* CUMbbb,
 			Distributions27 D;
 			if (EvenOrOdd==true)
 			{
-				D.f[DIR_P00   ] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_M00   ] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0P0   ] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0M0   ] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00P   ] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00M   ] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_PP0  ] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_MM0  ] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_PM0  ] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_MP0  ] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_P0P  ] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_M0M  ] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_P0M  ] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_M0P  ] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0PP  ] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0MM  ] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0PM  ] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0MP  ] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_PPP ] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_MMP ] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_PMP ] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_MPP ] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_PPM ] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_MMM ] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_PMM ] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_MPM ] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_P00 * numberOfLBnodes];
+				D.f[DIR_M00] = &DDStart[DIR_M00 * numberOfLBnodes];
+				D.f[DIR_0P0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+				D.f[DIR_0M0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+				D.f[DIR_00P] = &DDStart[DIR_00P * numberOfLBnodes];
+				D.f[DIR_00M] = &DDStart[DIR_00M * numberOfLBnodes];
+				D.f[DIR_PP0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+				D.f[DIR_MM0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+				D.f[DIR_PM0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+				D.f[DIR_MP0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+				D.f[DIR_P0P] = &DDStart[DIR_P0P * numberOfLBnodes];
+				D.f[DIR_M0M] = &DDStart[DIR_M0M * numberOfLBnodes];
+				D.f[DIR_P0M] = &DDStart[DIR_P0M * numberOfLBnodes];
+				D.f[DIR_M0P] = &DDStart[DIR_M0P * numberOfLBnodes];
+				D.f[DIR_0PP] = &DDStart[DIR_0PP * numberOfLBnodes];
+				D.f[DIR_0MM] = &DDStart[DIR_0MM * numberOfLBnodes];
+				D.f[DIR_0PM] = &DDStart[DIR_0PM * numberOfLBnodes];
+				D.f[DIR_0MP] = &DDStart[DIR_0MP * numberOfLBnodes];
+				D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+				D.f[DIR_PPP] = &DDStart[DIR_PPP * numberOfLBnodes];
+				D.f[DIR_MMP] = &DDStart[DIR_MMP * numberOfLBnodes];
+				D.f[DIR_PMP] = &DDStart[DIR_PMP * numberOfLBnodes];
+				D.f[DIR_MPP] = &DDStart[DIR_MPP * numberOfLBnodes];
+				D.f[DIR_PPM] = &DDStart[DIR_PPM * numberOfLBnodes];
+				D.f[DIR_MMM] = &DDStart[DIR_MMM * numberOfLBnodes];
+				D.f[DIR_PMM] = &DDStart[DIR_PMM * numberOfLBnodes];
+				D.f[DIR_MPM] = &DDStart[DIR_MPM * numberOfLBnodes];
 			}
 			else
 			{
-				D.f[DIR_M00   ] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_P00   ] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0M0   ] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0P0   ] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00M   ] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00P   ] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_MM0  ] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_PP0  ] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_MP0  ] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_PM0  ] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_M0M  ] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_P0P  ] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_M0P  ] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_P0M  ] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0MM  ] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0PP  ] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0MP  ] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0PM  ] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_MMM ] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_PPM ] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_MPM ] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_PMM ] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_MMP ] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_PPP ] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_MPP ] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_PMP ] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_P00 * numberOfLBnodes];
+				D.f[DIR_P00] = &DDStart[DIR_M00 * numberOfLBnodes];
+				D.f[DIR_0M0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+				D.f[DIR_0P0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+				D.f[DIR_00M] = &DDStart[DIR_00P * numberOfLBnodes];
+				D.f[DIR_00P] = &DDStart[DIR_00M * numberOfLBnodes];
+				D.f[DIR_MM0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+				D.f[DIR_PP0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+				D.f[DIR_MP0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+				D.f[DIR_PM0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+				D.f[DIR_M0M] = &DDStart[DIR_P0P * numberOfLBnodes];
+				D.f[DIR_P0P] = &DDStart[DIR_M0M * numberOfLBnodes];
+				D.f[DIR_M0P] = &DDStart[DIR_P0M * numberOfLBnodes];
+				D.f[DIR_P0M] = &DDStart[DIR_M0P * numberOfLBnodes];
+				D.f[DIR_0MM] = &DDStart[DIR_0PP * numberOfLBnodes];
+				D.f[DIR_0PP] = &DDStart[DIR_0MM * numberOfLBnodes];
+				D.f[DIR_0MP] = &DDStart[DIR_0PM * numberOfLBnodes];
+				D.f[DIR_0PM] = &DDStart[DIR_0MP * numberOfLBnodes];
+				D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+				D.f[DIR_MMM] = &DDStart[DIR_PPP * numberOfLBnodes];
+				D.f[DIR_PPM] = &DDStart[DIR_MMP * numberOfLBnodes];
+				D.f[DIR_MPM] = &DDStart[DIR_PMP * numberOfLBnodes];
+				D.f[DIR_PMM] = &DDStart[DIR_MPP * numberOfLBnodes];
+				D.f[DIR_MMP] = &DDStart[DIR_PPM * numberOfLBnodes];
+				D.f[DIR_PPP] = &DDStart[DIR_MMM * numberOfLBnodes];
+				D.f[DIR_MPP] = &DDStart[DIR_PMM * numberOfLBnodes];
+				D.f[DIR_PMP] = &DDStart[DIR_MPM * numberOfLBnodes];
 			}
 
 			////////////////////////////////////////////////////////////////////////////////
@@ -517,33 +517,33 @@ __global__ void LBCalc3rdMomentsIncompSP27(  real* CUMbbb,
 			unsigned int kbs  = neighborZ[ks];
 			unsigned int kbsw = neighborZ[ksw];
 			//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-			real mfcbb = (D.f[DIR_P00   ])[k  ];
-			real mfabb = (D.f[DIR_M00   ])[kw ];
-			real mfbcb = (D.f[DIR_0P0   ])[k  ];
-			real mfbab = (D.f[DIR_0M0   ])[ks ];
-			real mfbbc = (D.f[DIR_00P   ])[k  ];
-			real mfbba = (D.f[DIR_00M   ])[kb ];
-			real mfccb = (D.f[DIR_PP0  ])[k  ];
-			real mfaab = (D.f[DIR_MM0  ])[ksw];
-			real mfcab = (D.f[DIR_PM0  ])[ks ];
-			real mfacb = (D.f[DIR_MP0  ])[kw ];
-			real mfcbc = (D.f[DIR_P0P  ])[k  ];
-			real mfaba = (D.f[DIR_M0M  ])[kbw];
-			real mfcba = (D.f[DIR_P0M  ])[kb ];
-			real mfabc = (D.f[DIR_M0P  ])[kw ];
-			real mfbcc = (D.f[DIR_0PP  ])[k  ];
-			real mfbaa = (D.f[DIR_0MM  ])[kbs];
-			real mfbca = (D.f[DIR_0PM  ])[kb ];
-			real mfbac = (D.f[DIR_0MP  ])[ks ];
+			real mfcbb = (D.f[DIR_P00])[k  ];
+			real mfabb = (D.f[DIR_M00])[kw ];
+			real mfbcb = (D.f[DIR_0P0])[k  ];
+			real mfbab = (D.f[DIR_0M0])[ks ];
+			real mfbbc = (D.f[DIR_00P])[k  ];
+			real mfbba = (D.f[DIR_00M])[kb ];
+			real mfccb = (D.f[DIR_PP0])[k  ];
+			real mfaab = (D.f[DIR_MM0])[ksw];
+			real mfcab = (D.f[DIR_PM0])[ks ];
+			real mfacb = (D.f[DIR_MP0])[kw ];
+			real mfcbc = (D.f[DIR_P0P])[k  ];
+			real mfaba = (D.f[DIR_M0M])[kbw];
+			real mfcba = (D.f[DIR_P0M])[kb ];
+			real mfabc = (D.f[DIR_M0P])[kw ];
+			real mfbcc = (D.f[DIR_0PP])[k  ];
+			real mfbaa = (D.f[DIR_0MM])[kbs];
+			real mfbca = (D.f[DIR_0PM])[kb ];
+			real mfbac = (D.f[DIR_0MP])[ks ];
 			real mfbbb = (D.f[DIR_000])[k  ];
-			real mfccc = (D.f[DIR_PPP ])[k  ];
-			real mfaac = (D.f[DIR_MMP ])[ksw];
-			real mfcac = (D.f[DIR_PMP ])[ks ];
-			real mfacc = (D.f[DIR_MPP ])[kw ];
-			real mfcca = (D.f[DIR_PPM ])[kb ];
-			real mfaaa = (D.f[DIR_MMM ])[kbsw];
-			real mfcaa = (D.f[DIR_PMM ])[kbs];
-			real mfaca = (D.f[DIR_MPM ])[kbw];
+			real mfccc = (D.f[DIR_PPP])[k  ];
+			real mfaac = (D.f[DIR_MMP])[ksw];
+			real mfcac = (D.f[DIR_PMP])[ks ];
+			real mfacc = (D.f[DIR_MPP])[kw ];
+			real mfcca = (D.f[DIR_PPM])[kb ];
+			real mfaaa = (D.f[DIR_MMM])[kbsw];
+			real mfcaa = (D.f[DIR_PMM])[kbs];
+			real mfaca = (D.f[DIR_MPM])[kbw];
 			////////////////////////////////////////////////////////////////////////////////////
 			real vvx    =((((mfccc-mfaaa) + (mfcac-mfaca)) + ((mfcaa-mfacc) + (mfcca-mfaac))) + 
 						     (((mfcba-mfabc) + (mfcbc-mfaba)) + ((mfcab-mfacb) + (mfccb-mfaab))) +
@@ -857,7 +857,7 @@ __global__ void LBCalc3rdMomentsCompSP27(real* CUMbbb,
 													unsigned int* neighborY,
 													unsigned int* neighborZ,
 													real* DDStart,
-													int size_Mat,
+													unsigned long long numberOfLBnodes,
 													bool EvenOrOdd)
 {
 	////////////////////////////////////////////////////////////////////////////////
@@ -871,7 +871,7 @@ __global__ void LBCalc3rdMomentsCompSP27(real* CUMbbb,
 	const unsigned k = nx*(ny*z + y) + x;
 	//////////////////////////////////////////////////////////////////////////
 
-	if(k<size_Mat)
+	if(k<numberOfLBnodes)
 	{
 		////////////////////////////////////////////////////////////////////////////////
 		unsigned int BC;
@@ -882,63 +882,63 @@ __global__ void LBCalc3rdMomentsCompSP27(real* CUMbbb,
 			Distributions27 D;
 			if (EvenOrOdd==true)
 			{
-				D.f[DIR_P00   ] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_M00   ] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0P0   ] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0M0   ] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00P   ] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00M   ] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_PP0  ] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_MM0  ] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_PM0  ] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_MP0  ] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_P0P  ] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_M0M  ] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_P0M  ] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_M0P  ] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0PP  ] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0MM  ] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0PM  ] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0MP  ] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_PPP ] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_MMP ] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_PMP ] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_MPP ] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_PPM ] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_MMM ] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_PMM ] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_MPM ] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_P00 * numberOfLBnodes];
+				D.f[DIR_M00] = &DDStart[DIR_M00 * numberOfLBnodes];
+				D.f[DIR_0P0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+				D.f[DIR_0M0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+				D.f[DIR_00P] = &DDStart[DIR_00P * numberOfLBnodes];
+				D.f[DIR_00M] = &DDStart[DIR_00M * numberOfLBnodes];
+				D.f[DIR_PP0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+				D.f[DIR_MM0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+				D.f[DIR_PM0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+				D.f[DIR_MP0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+				D.f[DIR_P0P] = &DDStart[DIR_P0P * numberOfLBnodes];
+				D.f[DIR_M0M] = &DDStart[DIR_M0M * numberOfLBnodes];
+				D.f[DIR_P0M] = &DDStart[DIR_P0M * numberOfLBnodes];
+				D.f[DIR_M0P] = &DDStart[DIR_M0P * numberOfLBnodes];
+				D.f[DIR_0PP] = &DDStart[DIR_0PP * numberOfLBnodes];
+				D.f[DIR_0MM] = &DDStart[DIR_0MM * numberOfLBnodes];
+				D.f[DIR_0PM] = &DDStart[DIR_0PM * numberOfLBnodes];
+				D.f[DIR_0MP] = &DDStart[DIR_0MP * numberOfLBnodes];
+				D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+				D.f[DIR_PPP] = &DDStart[DIR_PPP * numberOfLBnodes];
+				D.f[DIR_MMP] = &DDStart[DIR_MMP * numberOfLBnodes];
+				D.f[DIR_PMP] = &DDStart[DIR_PMP * numberOfLBnodes];
+				D.f[DIR_MPP] = &DDStart[DIR_MPP * numberOfLBnodes];
+				D.f[DIR_PPM] = &DDStart[DIR_PPM * numberOfLBnodes];
+				D.f[DIR_MMM] = &DDStart[DIR_MMM * numberOfLBnodes];
+				D.f[DIR_PMM] = &DDStart[DIR_PMM * numberOfLBnodes];
+				D.f[DIR_MPM] = &DDStart[DIR_MPM * numberOfLBnodes];
 			}
 			else
 			{
-				D.f[DIR_M00   ] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_P00   ] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0M0   ] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0P0   ] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00M   ] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00P   ] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_MM0  ] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_PP0  ] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_MP0  ] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_PM0  ] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_M0M  ] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_P0P  ] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_M0P  ] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_P0M  ] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0MM  ] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0PP  ] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0MP  ] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0PM  ] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_MMM ] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_PPM ] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_MPM ] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_PMM ] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_MMP ] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_PPP ] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_MPP ] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_PMP ] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_P00 * numberOfLBnodes];
+				D.f[DIR_P00] = &DDStart[DIR_M00 * numberOfLBnodes];
+				D.f[DIR_0M0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+				D.f[DIR_0P0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+				D.f[DIR_00M] = &DDStart[DIR_00P * numberOfLBnodes];
+				D.f[DIR_00P] = &DDStart[DIR_00M * numberOfLBnodes];
+				D.f[DIR_MM0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+				D.f[DIR_PP0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+				D.f[DIR_MP0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+				D.f[DIR_PM0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+				D.f[DIR_M0M] = &DDStart[DIR_P0P * numberOfLBnodes];
+				D.f[DIR_P0P] = &DDStart[DIR_M0M * numberOfLBnodes];
+				D.f[DIR_M0P] = &DDStart[DIR_P0M * numberOfLBnodes];
+				D.f[DIR_P0M] = &DDStart[DIR_M0P * numberOfLBnodes];
+				D.f[DIR_0MM] = &DDStart[DIR_0PP * numberOfLBnodes];
+				D.f[DIR_0PP] = &DDStart[DIR_0MM * numberOfLBnodes];
+				D.f[DIR_0MP] = &DDStart[DIR_0PM * numberOfLBnodes];
+				D.f[DIR_0PM] = &DDStart[DIR_0MP * numberOfLBnodes];
+				D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+				D.f[DIR_MMM] = &DDStart[DIR_PPP * numberOfLBnodes];
+				D.f[DIR_PPM] = &DDStart[DIR_MMP * numberOfLBnodes];
+				D.f[DIR_MPM] = &DDStart[DIR_PMP * numberOfLBnodes];
+				D.f[DIR_PMM] = &DDStart[DIR_MPP * numberOfLBnodes];
+				D.f[DIR_MMP] = &DDStart[DIR_PPM * numberOfLBnodes];
+				D.f[DIR_PPP] = &DDStart[DIR_MMM * numberOfLBnodes];
+				D.f[DIR_MPP] = &DDStart[DIR_PMM * numberOfLBnodes];
+				D.f[DIR_PMP] = &DDStart[DIR_MPM * numberOfLBnodes];
 			}
 
 			////////////////////////////////////////////////////////////////////////////////
@@ -951,33 +951,33 @@ __global__ void LBCalc3rdMomentsCompSP27(real* CUMbbb,
 			unsigned int kbs  = neighborZ[ks];
 			unsigned int kbsw = neighborZ[ksw];
 			//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-			real mfcbb = (D.f[DIR_P00   ])[k  ];
-			real mfabb = (D.f[DIR_M00   ])[kw ];
-			real mfbcb = (D.f[DIR_0P0   ])[k  ];
-			real mfbab = (D.f[DIR_0M0   ])[ks ];
-			real mfbbc = (D.f[DIR_00P   ])[k  ];
-			real mfbba = (D.f[DIR_00M   ])[kb ];
-			real mfccb = (D.f[DIR_PP0  ])[k  ];
-			real mfaab = (D.f[DIR_MM0  ])[ksw];
-			real mfcab = (D.f[DIR_PM0  ])[ks ];
-			real mfacb = (D.f[DIR_MP0  ])[kw ];
-			real mfcbc = (D.f[DIR_P0P  ])[k  ];
-			real mfaba = (D.f[DIR_M0M  ])[kbw];
-			real mfcba = (D.f[DIR_P0M  ])[kb ];
-			real mfabc = (D.f[DIR_M0P  ])[kw ];
-			real mfbcc = (D.f[DIR_0PP  ])[k  ];
-			real mfbaa = (D.f[DIR_0MM  ])[kbs];
-			real mfbca = (D.f[DIR_0PM  ])[kb ];
-			real mfbac = (D.f[DIR_0MP  ])[ks ];
+			real mfcbb = (D.f[DIR_P00])[k  ];
+			real mfabb = (D.f[DIR_M00])[kw ];
+			real mfbcb = (D.f[DIR_0P0])[k  ];
+			real mfbab = (D.f[DIR_0M0])[ks ];
+			real mfbbc = (D.f[DIR_00P])[k  ];
+			real mfbba = (D.f[DIR_00M])[kb ];
+			real mfccb = (D.f[DIR_PP0])[k  ];
+			real mfaab = (D.f[DIR_MM0])[ksw];
+			real mfcab = (D.f[DIR_PM0])[ks ];
+			real mfacb = (D.f[DIR_MP0])[kw ];
+			real mfcbc = (D.f[DIR_P0P])[k  ];
+			real mfaba = (D.f[DIR_M0M])[kbw];
+			real mfcba = (D.f[DIR_P0M])[kb ];
+			real mfabc = (D.f[DIR_M0P])[kw ];
+			real mfbcc = (D.f[DIR_0PP])[k  ];
+			real mfbaa = (D.f[DIR_0MM])[kbs];
+			real mfbca = (D.f[DIR_0PM])[kb ];
+			real mfbac = (D.f[DIR_0MP])[ks ];
 			real mfbbb = (D.f[DIR_000])[k  ];
-			real mfccc = (D.f[DIR_PPP ])[k  ];
-			real mfaac = (D.f[DIR_MMP ])[ksw];
-			real mfcac = (D.f[DIR_PMP ])[ks ];
-			real mfacc = (D.f[DIR_MPP ])[kw ];
-			real mfcca = (D.f[DIR_PPM ])[kb ];
-			real mfaaa = (D.f[DIR_MMM ])[kbsw];
-			real mfcaa = (D.f[DIR_PMM ])[kbs];
-			real mfaca = (D.f[DIR_MPM ])[kbw];
+			real mfccc = (D.f[DIR_PPP])[k  ];
+			real mfaac = (D.f[DIR_MMP])[ksw];
+			real mfcac = (D.f[DIR_PMP])[ks ];
+			real mfacc = (D.f[DIR_MPP])[kw ];
+			real mfcca = (D.f[DIR_PPM])[kb ];
+			real mfaaa = (D.f[DIR_MMM])[kbsw];
+			real mfcaa = (D.f[DIR_PMM])[kbs];
+			real mfaca = (D.f[DIR_MPM])[kbw];
 			////////////////////////////////////////////////////////////////////////////////////
 			real drho = ((((mfccc+mfaaa) + (mfaca+mfcac)) + ((mfacc+mfcaa) + (mfaac+mfcca))) + 
 							(((mfbac+mfbca) + (mfbaa+mfbcc)) + ((mfabc+mfcba) + (mfaba+mfcbc)) + ((mfacb+mfcab) + (mfaab+mfccb))) +
@@ -1298,7 +1298,7 @@ __global__ void LBCalcHigherMomentsIncompSP27(   real* CUMcbb,
 															unsigned int* neighborY,
 															unsigned int* neighborZ,
 															real* DDStart,
-															int size_Mat,
+															unsigned long long numberOfLBnodes,
 															bool EvenOrOdd)
 {
 	////////////////////////////////////////////////////////////////////////////////
@@ -1312,7 +1312,7 @@ __global__ void LBCalcHigherMomentsIncompSP27(   real* CUMcbb,
 	const unsigned k = nx*(ny*z + y) + x;
 	//////////////////////////////////////////////////////////////////////////
 
-	if(k<size_Mat)
+	if(k<numberOfLBnodes)
 	{
 		////////////////////////////////////////////////////////////////////////////////
 		unsigned int BC;
@@ -1323,63 +1323,63 @@ __global__ void LBCalcHigherMomentsIncompSP27(   real* CUMcbb,
 			Distributions27 D;
 			if (EvenOrOdd==true)
 			{
-				D.f[DIR_P00   ] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_M00   ] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0P0   ] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0M0   ] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00P   ] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00M   ] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_PP0  ] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_MM0  ] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_PM0  ] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_MP0  ] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_P0P  ] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_M0M  ] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_P0M  ] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_M0P  ] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0PP  ] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0MM  ] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0PM  ] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0MP  ] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_PPP ] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_MMP ] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_PMP ] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_MPP ] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_PPM ] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_MMM ] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_PMM ] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_MPM ] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_P00 * numberOfLBnodes];
+				D.f[DIR_M00] = &DDStart[DIR_M00 * numberOfLBnodes];
+				D.f[DIR_0P0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+				D.f[DIR_0M0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+				D.f[DIR_00P] = &DDStart[DIR_00P * numberOfLBnodes];
+				D.f[DIR_00M] = &DDStart[DIR_00M * numberOfLBnodes];
+				D.f[DIR_PP0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+				D.f[DIR_MM0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+				D.f[DIR_PM0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+				D.f[DIR_MP0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+				D.f[DIR_P0P] = &DDStart[DIR_P0P * numberOfLBnodes];
+				D.f[DIR_M0M] = &DDStart[DIR_M0M * numberOfLBnodes];
+				D.f[DIR_P0M] = &DDStart[DIR_P0M * numberOfLBnodes];
+				D.f[DIR_M0P] = &DDStart[DIR_M0P * numberOfLBnodes];
+				D.f[DIR_0PP] = &DDStart[DIR_0PP * numberOfLBnodes];
+				D.f[DIR_0MM] = &DDStart[DIR_0MM * numberOfLBnodes];
+				D.f[DIR_0PM] = &DDStart[DIR_0PM * numberOfLBnodes];
+				D.f[DIR_0MP] = &DDStart[DIR_0MP * numberOfLBnodes];
+				D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+				D.f[DIR_PPP] = &DDStart[DIR_PPP * numberOfLBnodes];
+				D.f[DIR_MMP] = &DDStart[DIR_MMP * numberOfLBnodes];
+				D.f[DIR_PMP] = &DDStart[DIR_PMP * numberOfLBnodes];
+				D.f[DIR_MPP] = &DDStart[DIR_MPP * numberOfLBnodes];
+				D.f[DIR_PPM] = &DDStart[DIR_PPM * numberOfLBnodes];
+				D.f[DIR_MMM] = &DDStart[DIR_MMM * numberOfLBnodes];
+				D.f[DIR_PMM] = &DDStart[DIR_PMM * numberOfLBnodes];
+				D.f[DIR_MPM] = &DDStart[DIR_MPM * numberOfLBnodes];
 			}
 			else
 			{
-				D.f[DIR_M00   ] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_P00   ] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0M0   ] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0P0   ] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00M   ] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00P   ] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_MM0  ] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_PP0  ] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_MP0  ] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_PM0  ] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_M0M  ] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_P0P  ] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_M0P  ] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_P0M  ] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0MM  ] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0PP  ] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0MP  ] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0PM  ] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_MMM ] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_PPM ] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_MPM ] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_PMM ] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_MMP ] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_PPP ] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_MPP ] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_PMP ] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_P00 * numberOfLBnodes];
+				D.f[DIR_P00] = &DDStart[DIR_M00 * numberOfLBnodes];
+				D.f[DIR_0M0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+				D.f[DIR_0P0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+				D.f[DIR_00M] = &DDStart[DIR_00P * numberOfLBnodes];
+				D.f[DIR_00P] = &DDStart[DIR_00M * numberOfLBnodes];
+				D.f[DIR_MM0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+				D.f[DIR_PP0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+				D.f[DIR_MP0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+				D.f[DIR_PM0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+				D.f[DIR_M0M] = &DDStart[DIR_P0P * numberOfLBnodes];
+				D.f[DIR_P0P] = &DDStart[DIR_M0M * numberOfLBnodes];
+				D.f[DIR_M0P] = &DDStart[DIR_P0M * numberOfLBnodes];
+				D.f[DIR_P0M] = &DDStart[DIR_M0P * numberOfLBnodes];
+				D.f[DIR_0MM] = &DDStart[DIR_0PP * numberOfLBnodes];
+				D.f[DIR_0PP] = &DDStart[DIR_0MM * numberOfLBnodes];
+				D.f[DIR_0MP] = &DDStart[DIR_0PM * numberOfLBnodes];
+				D.f[DIR_0PM] = &DDStart[DIR_0MP * numberOfLBnodes];
+				D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+				D.f[DIR_MMM] = &DDStart[DIR_PPP * numberOfLBnodes];
+				D.f[DIR_PPM] = &DDStart[DIR_MMP * numberOfLBnodes];
+				D.f[DIR_MPM] = &DDStart[DIR_PMP * numberOfLBnodes];
+				D.f[DIR_PMM] = &DDStart[DIR_MPP * numberOfLBnodes];
+				D.f[DIR_MMP] = &DDStart[DIR_PPM * numberOfLBnodes];
+				D.f[DIR_PPP] = &DDStart[DIR_MMM * numberOfLBnodes];
+				D.f[DIR_MPP] = &DDStart[DIR_PMM * numberOfLBnodes];
+				D.f[DIR_PMP] = &DDStart[DIR_MPM * numberOfLBnodes];
 			}
 
 			////////////////////////////////////////////////////////////////////////////////
@@ -1392,33 +1392,33 @@ __global__ void LBCalcHigherMomentsIncompSP27(   real* CUMcbb,
 			unsigned int kbs  = neighborZ[ks];
 			unsigned int kbsw = neighborZ[ksw];
 			//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-			real mfcbb = (D.f[DIR_P00   ])[k  ];
-			real mfabb = (D.f[DIR_M00   ])[kw ];
-			real mfbcb = (D.f[DIR_0P0   ])[k  ];
-			real mfbab = (D.f[DIR_0M0   ])[ks ];
-			real mfbbc = (D.f[DIR_00P   ])[k  ];
-			real mfbba = (D.f[DIR_00M   ])[kb ];
-			real mfccb = (D.f[DIR_PP0  ])[k  ];
-			real mfaab = (D.f[DIR_MM0  ])[ksw];
-			real mfcab = (D.f[DIR_PM0  ])[ks ];
-			real mfacb = (D.f[DIR_MP0  ])[kw ];
-			real mfcbc = (D.f[DIR_P0P  ])[k  ];
-			real mfaba = (D.f[DIR_M0M  ])[kbw];
-			real mfcba = (D.f[DIR_P0M  ])[kb ];
-			real mfabc = (D.f[DIR_M0P  ])[kw ];
-			real mfbcc = (D.f[DIR_0PP  ])[k  ];
-			real mfbaa = (D.f[DIR_0MM  ])[kbs];
-			real mfbca = (D.f[DIR_0PM  ])[kb ];
-			real mfbac = (D.f[DIR_0MP  ])[ks ];
+			real mfcbb = (D.f[DIR_P00])[k  ];
+			real mfabb = (D.f[DIR_M00])[kw ];
+			real mfbcb = (D.f[DIR_0P0])[k  ];
+			real mfbab = (D.f[DIR_0M0])[ks ];
+			real mfbbc = (D.f[DIR_00P])[k  ];
+			real mfbba = (D.f[DIR_00M])[kb ];
+			real mfccb = (D.f[DIR_PP0])[k  ];
+			real mfaab = (D.f[DIR_MM0])[ksw];
+			real mfcab = (D.f[DIR_PM0])[ks ];
+			real mfacb = (D.f[DIR_MP0])[kw ];
+			real mfcbc = (D.f[DIR_P0P])[k  ];
+			real mfaba = (D.f[DIR_M0M])[kbw];
+			real mfcba = (D.f[DIR_P0M])[kb ];
+			real mfabc = (D.f[DIR_M0P])[kw ];
+			real mfbcc = (D.f[DIR_0PP])[k  ];
+			real mfbaa = (D.f[DIR_0MM])[kbs];
+			real mfbca = (D.f[DIR_0PM])[kb ];
+			real mfbac = (D.f[DIR_0MP])[ks ];
 			real mfbbb = (D.f[DIR_000])[k  ];
-			real mfccc = (D.f[DIR_PPP ])[k  ];
-			real mfaac = (D.f[DIR_MMP ])[ksw];
-			real mfcac = (D.f[DIR_PMP ])[ks ];
-			real mfacc = (D.f[DIR_MPP ])[kw ];
-			real mfcca = (D.f[DIR_PPM ])[kb ];
-			real mfaaa = (D.f[DIR_MMM ])[kbsw];
-			real mfcaa = (D.f[DIR_PMM ])[kbs];
-			real mfaca = (D.f[DIR_MPM ])[kbw];
+			real mfccc = (D.f[DIR_PPP])[k  ];
+			real mfaac = (D.f[DIR_MMP])[ksw];
+			real mfcac = (D.f[DIR_PMP])[ks ];
+			real mfacc = (D.f[DIR_MPP])[kw ];
+			real mfcca = (D.f[DIR_PPM])[kb ];
+			real mfaaa = (D.f[DIR_MMM])[kbsw];
+			real mfcaa = (D.f[DIR_PMM])[kbs];
+			real mfaca = (D.f[DIR_MPM])[kbw];
 			////////////////////////////////////////////////////////////////////////////////////
 			real vvx    =((((mfccc-mfaaa) + (mfcac-mfaca)) + ((mfcaa-mfacc) + (mfcca-mfaac))) + 
 						     (((mfcba-mfabc) + (mfcbc-mfaba)) + ((mfcab-mfacb) + (mfccb-mfaab))) +
@@ -1752,7 +1752,7 @@ __global__ void LBCalcHigherMomentsCompSP27( real* CUMcbb,
 														unsigned int* neighborY,
 														unsigned int* neighborZ,
 														real* DDStart,
-														int size_Mat,
+														unsigned long long numberOfLBnodes,
 														bool EvenOrOdd)
 {
 	////////////////////////////////////////////////////////////////////////////////
@@ -1766,7 +1766,7 @@ __global__ void LBCalcHigherMomentsCompSP27( real* CUMcbb,
 	const unsigned k = nx*(ny*z + y) + x;
 	//////////////////////////////////////////////////////////////////////////
 
-	if(k<size_Mat)
+	if(k<numberOfLBnodes)
 	{
 		////////////////////////////////////////////////////////////////////////////////
 		unsigned int BC;
@@ -1777,63 +1777,63 @@ __global__ void LBCalcHigherMomentsCompSP27( real* CUMcbb,
 			Distributions27 D;
 			if (EvenOrOdd==true)
 			{
-				D.f[DIR_P00   ] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_M00   ] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0P0   ] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0M0   ] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00P   ] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00M   ] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_PP0  ] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_MM0  ] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_PM0  ] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_MP0  ] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_P0P  ] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_M0M  ] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_P0M  ] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_M0P  ] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0PP  ] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0MM  ] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0PM  ] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0MP  ] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_PPP ] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_MMP ] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_PMP ] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_MPP ] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_PPM ] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_MMM ] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_PMM ] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_MPM ] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_P00 * numberOfLBnodes];
+				D.f[DIR_M00] = &DDStart[DIR_M00 * numberOfLBnodes];
+				D.f[DIR_0P0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+				D.f[DIR_0M0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+				D.f[DIR_00P] = &DDStart[DIR_00P * numberOfLBnodes];
+				D.f[DIR_00M] = &DDStart[DIR_00M * numberOfLBnodes];
+				D.f[DIR_PP0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+				D.f[DIR_MM0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+				D.f[DIR_PM0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+				D.f[DIR_MP0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+				D.f[DIR_P0P] = &DDStart[DIR_P0P * numberOfLBnodes];
+				D.f[DIR_M0M] = &DDStart[DIR_M0M * numberOfLBnodes];
+				D.f[DIR_P0M] = &DDStart[DIR_P0M * numberOfLBnodes];
+				D.f[DIR_M0P] = &DDStart[DIR_M0P * numberOfLBnodes];
+				D.f[DIR_0PP] = &DDStart[DIR_0PP * numberOfLBnodes];
+				D.f[DIR_0MM] = &DDStart[DIR_0MM * numberOfLBnodes];
+				D.f[DIR_0PM] = &DDStart[DIR_0PM * numberOfLBnodes];
+				D.f[DIR_0MP] = &DDStart[DIR_0MP * numberOfLBnodes];
+				D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+				D.f[DIR_PPP] = &DDStart[DIR_PPP * numberOfLBnodes];
+				D.f[DIR_MMP] = &DDStart[DIR_MMP * numberOfLBnodes];
+				D.f[DIR_PMP] = &DDStart[DIR_PMP * numberOfLBnodes];
+				D.f[DIR_MPP] = &DDStart[DIR_MPP * numberOfLBnodes];
+				D.f[DIR_PPM] = &DDStart[DIR_PPM * numberOfLBnodes];
+				D.f[DIR_MMM] = &DDStart[DIR_MMM * numberOfLBnodes];
+				D.f[DIR_PMM] = &DDStart[DIR_PMM * numberOfLBnodes];
+				D.f[DIR_MPM] = &DDStart[DIR_MPM * numberOfLBnodes];
 			}
 			else
 			{
-				D.f[DIR_M00   ] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_P00   ] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0M0   ] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0P0   ] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00M   ] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00P   ] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_MM0  ] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_PP0  ] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_MP0  ] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_PM0  ] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_M0M  ] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_P0P  ] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_M0P  ] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_P0M  ] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0MM  ] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0PP  ] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0MP  ] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0PM  ] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_MMM ] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_PPM ] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_MPM ] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_PMM ] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_MMP ] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_PPP ] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_MPP ] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_PMP ] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_P00 * numberOfLBnodes];
+				D.f[DIR_P00] = &DDStart[DIR_M00 * numberOfLBnodes];
+				D.f[DIR_0M0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+				D.f[DIR_0P0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+				D.f[DIR_00M] = &DDStart[DIR_00P * numberOfLBnodes];
+				D.f[DIR_00P] = &DDStart[DIR_00M * numberOfLBnodes];
+				D.f[DIR_MM0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+				D.f[DIR_PP0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+				D.f[DIR_MP0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+				D.f[DIR_PM0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+				D.f[DIR_M0M] = &DDStart[DIR_P0P * numberOfLBnodes];
+				D.f[DIR_P0P] = &DDStart[DIR_M0M * numberOfLBnodes];
+				D.f[DIR_M0P] = &DDStart[DIR_P0M * numberOfLBnodes];
+				D.f[DIR_P0M] = &DDStart[DIR_M0P * numberOfLBnodes];
+				D.f[DIR_0MM] = &DDStart[DIR_0PP * numberOfLBnodes];
+				D.f[DIR_0PP] = &DDStart[DIR_0MM * numberOfLBnodes];
+				D.f[DIR_0MP] = &DDStart[DIR_0PM * numberOfLBnodes];
+				D.f[DIR_0PM] = &DDStart[DIR_0MP * numberOfLBnodes];
+				D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+				D.f[DIR_MMM] = &DDStart[DIR_PPP * numberOfLBnodes];
+				D.f[DIR_PPM] = &DDStart[DIR_MMP * numberOfLBnodes];
+				D.f[DIR_MPM] = &DDStart[DIR_PMP * numberOfLBnodes];
+				D.f[DIR_PMM] = &DDStart[DIR_MPP * numberOfLBnodes];
+				D.f[DIR_MMP] = &DDStart[DIR_PPM * numberOfLBnodes];
+				D.f[DIR_PPP] = &DDStart[DIR_MMM * numberOfLBnodes];
+				D.f[DIR_MPP] = &DDStart[DIR_PMM * numberOfLBnodes];
+				D.f[DIR_PMP] = &DDStart[DIR_MPM * numberOfLBnodes];
 			}
 
 			////////////////////////////////////////////////////////////////////////////////
@@ -1846,33 +1846,33 @@ __global__ void LBCalcHigherMomentsCompSP27( real* CUMcbb,
 			unsigned int kbs  = neighborZ[ks];
 			unsigned int kbsw = neighborZ[ksw];
 			//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-			real mfcbb = (D.f[DIR_P00   ])[k  ];
-			real mfabb = (D.f[DIR_M00   ])[kw ];
-			real mfbcb = (D.f[DIR_0P0   ])[k  ];
-			real mfbab = (D.f[DIR_0M0   ])[ks ];
-			real mfbbc = (D.f[DIR_00P   ])[k  ];
-			real mfbba = (D.f[DIR_00M   ])[kb ];
-			real mfccb = (D.f[DIR_PP0  ])[k  ];
-			real mfaab = (D.f[DIR_MM0  ])[ksw];
-			real mfcab = (D.f[DIR_PM0  ])[ks ];
-			real mfacb = (D.f[DIR_MP0  ])[kw ];
-			real mfcbc = (D.f[DIR_P0P  ])[k  ];
-			real mfaba = (D.f[DIR_M0M  ])[kbw];
-			real mfcba = (D.f[DIR_P0M  ])[kb ];
-			real mfabc = (D.f[DIR_M0P  ])[kw ];
-			real mfbcc = (D.f[DIR_0PP  ])[k  ];
-			real mfbaa = (D.f[DIR_0MM  ])[kbs];
-			real mfbca = (D.f[DIR_0PM  ])[kb ];
-			real mfbac = (D.f[DIR_0MP  ])[ks ];
+			real mfcbb = (D.f[DIR_P00])[k  ];
+			real mfabb = (D.f[DIR_M00])[kw ];
+			real mfbcb = (D.f[DIR_0P0])[k  ];
+			real mfbab = (D.f[DIR_0M0])[ks ];
+			real mfbbc = (D.f[DIR_00P])[k  ];
+			real mfbba = (D.f[DIR_00M])[kb ];
+			real mfccb = (D.f[DIR_PP0])[k  ];
+			real mfaab = (D.f[DIR_MM0])[ksw];
+			real mfcab = (D.f[DIR_PM0])[ks ];
+			real mfacb = (D.f[DIR_MP0])[kw ];
+			real mfcbc = (D.f[DIR_P0P])[k  ];
+			real mfaba = (D.f[DIR_M0M])[kbw];
+			real mfcba = (D.f[DIR_P0M])[kb ];
+			real mfabc = (D.f[DIR_M0P])[kw ];
+			real mfbcc = (D.f[DIR_0PP])[k  ];
+			real mfbaa = (D.f[DIR_0MM])[kbs];
+			real mfbca = (D.f[DIR_0PM])[kb ];
+			real mfbac = (D.f[DIR_0MP])[ks ];
 			real mfbbb = (D.f[DIR_000])[k  ];
-			real mfccc = (D.f[DIR_PPP ])[k  ];
-			real mfaac = (D.f[DIR_MMP ])[ksw];
-			real mfcac = (D.f[DIR_PMP ])[ks ];
-			real mfacc = (D.f[DIR_MPP ])[kw ];
-			real mfcca = (D.f[DIR_PPM ])[kb ];
-			real mfaaa = (D.f[DIR_MMM ])[kbsw];
-			real mfcaa = (D.f[DIR_PMM ])[kbs];
-			real mfaca = (D.f[DIR_MPM ])[kbw];
+			real mfccc = (D.f[DIR_PPP])[k  ];
+			real mfaac = (D.f[DIR_MMP])[ksw];
+			real mfcac = (D.f[DIR_PMP])[ks ];
+			real mfacc = (D.f[DIR_MPP])[kw ];
+			real mfcca = (D.f[DIR_PPM])[kb ];
+			real mfaaa = (D.f[DIR_MMM])[kbsw];
+			real mfcaa = (D.f[DIR_PMM])[kbs];
+			real mfaca = (D.f[DIR_MPM])[kbw];
 			////////////////////////////////////////////////////////////////////////////////////
 			real drho = ((((mfccc+mfaaa) + (mfaca+mfcac)) + ((mfacc+mfcaa) + (mfaac+mfcca))) + 
 							(((mfbac+mfbca) + (mfbaa+mfbcc)) + ((mfabc+mfcba) + (mfaba+mfcbc)) + ((mfacb+mfcab) + (mfaab+mfccb))) +
diff --git a/src/gpu/VirtualFluids_GPU/GPU/CalcConc27.cu b/src/gpu/VirtualFluids_GPU/GPU/CalcConc27.cu
index d246f39a030b6df0b249aee17f37b7d5258ff00d..ad5a05b12a1b3ae2541e36ccffae4635fccfe62a 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/CalcConc27.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/CalcConc27.cu
@@ -45,7 +45,7 @@ __global__ void CalcConc27(
 	uint* neighborX,
 	uint* neighborY,
 	uint* neighborZ,
-	uint size_Mat,
+	unsigned long long numberOfLBnodes,
 	real* distributionsAD,
 	bool isEvenTimestep)
 {
@@ -67,7 +67,7 @@ __global__ void CalcConc27(
 
    //////////////////////////////////////////////////////////////////////////
    // run for all indices in size_Mat and fluid nodes
-   if ((k < size_Mat) && (typeOfGridNode[k] == GEO_FLUID))
+   if ((k < numberOfLBnodes) && (typeOfGridNode[k] == GEO_FLUID))
    {
       //////////////////////////////////////////////////////////////////////////
       //! - Read distributions: style of reading and writing the distributions from/to stored arrays dependent on timestep is based on the esoteric twist algorithm \ref
@@ -76,63 +76,63 @@ __global__ void CalcConc27(
       Distributions27 distAD;
       if (isEvenTimestep)
       {
-         distAD.f[DIR_P00   ] = &distributionsAD[DIR_P00   *size_Mat];
-         distAD.f[DIR_M00   ] = &distributionsAD[DIR_M00   *size_Mat];
-         distAD.f[DIR_0P0   ] = &distributionsAD[DIR_0P0   *size_Mat];
-         distAD.f[DIR_0M0   ] = &distributionsAD[DIR_0M0   *size_Mat];
-         distAD.f[DIR_00P   ] = &distributionsAD[DIR_00P   *size_Mat];
-         distAD.f[DIR_00M   ] = &distributionsAD[DIR_00M   *size_Mat];
-         distAD.f[DIR_PP0  ] = &distributionsAD[DIR_PP0  *size_Mat];
-         distAD.f[DIR_MM0  ] = &distributionsAD[DIR_MM0  *size_Mat];
-         distAD.f[DIR_PM0  ] = &distributionsAD[DIR_PM0  *size_Mat];
-         distAD.f[DIR_MP0  ] = &distributionsAD[DIR_MP0  *size_Mat];
-         distAD.f[DIR_P0P  ] = &distributionsAD[DIR_P0P  *size_Mat];
-         distAD.f[DIR_M0M  ] = &distributionsAD[DIR_M0M  *size_Mat];
-         distAD.f[DIR_P0M  ] = &distributionsAD[DIR_P0M  *size_Mat];
-         distAD.f[DIR_M0P  ] = &distributionsAD[DIR_M0P  *size_Mat];
-         distAD.f[DIR_0PP  ] = &distributionsAD[DIR_0PP  *size_Mat];
-         distAD.f[DIR_0MM  ] = &distributionsAD[DIR_0MM  *size_Mat];
-         distAD.f[DIR_0PM  ] = &distributionsAD[DIR_0PM  *size_Mat];
-         distAD.f[DIR_0MP  ] = &distributionsAD[DIR_0MP  *size_Mat];
-         distAD.f[DIR_000] = &distributionsAD[DIR_000*size_Mat];
-         distAD.f[DIR_PPP ] = &distributionsAD[DIR_PPP *size_Mat];
-         distAD.f[DIR_MMP ] = &distributionsAD[DIR_MMP *size_Mat];
-         distAD.f[DIR_PMP ] = &distributionsAD[DIR_PMP *size_Mat];
-         distAD.f[DIR_MPP ] = &distributionsAD[DIR_MPP *size_Mat];
-         distAD.f[DIR_PPM ] = &distributionsAD[DIR_PPM *size_Mat];
-         distAD.f[DIR_MMM ] = &distributionsAD[DIR_MMM *size_Mat];
-         distAD.f[DIR_PMM ] = &distributionsAD[DIR_PMM *size_Mat];
-         distAD.f[DIR_MPM ] = &distributionsAD[DIR_MPM *size_Mat];
+         distAD.f[DIR_P00] = &distributionsAD[DIR_P00 * numberOfLBnodes];
+         distAD.f[DIR_M00] = &distributionsAD[DIR_M00 * numberOfLBnodes];
+         distAD.f[DIR_0P0] = &distributionsAD[DIR_0P0 * numberOfLBnodes];
+         distAD.f[DIR_0M0] = &distributionsAD[DIR_0M0 * numberOfLBnodes];
+         distAD.f[DIR_00P] = &distributionsAD[DIR_00P * numberOfLBnodes];
+         distAD.f[DIR_00M] = &distributionsAD[DIR_00M * numberOfLBnodes];
+         distAD.f[DIR_PP0] = &distributionsAD[DIR_PP0 * numberOfLBnodes];
+         distAD.f[DIR_MM0] = &distributionsAD[DIR_MM0 * numberOfLBnodes];
+         distAD.f[DIR_PM0] = &distributionsAD[DIR_PM0 * numberOfLBnodes];
+         distAD.f[DIR_MP0] = &distributionsAD[DIR_MP0 * numberOfLBnodes];
+         distAD.f[DIR_P0P] = &distributionsAD[DIR_P0P * numberOfLBnodes];
+         distAD.f[DIR_M0M] = &distributionsAD[DIR_M0M * numberOfLBnodes];
+         distAD.f[DIR_P0M] = &distributionsAD[DIR_P0M * numberOfLBnodes];
+         distAD.f[DIR_M0P] = &distributionsAD[DIR_M0P * numberOfLBnodes];
+         distAD.f[DIR_0PP] = &distributionsAD[DIR_0PP * numberOfLBnodes];
+         distAD.f[DIR_0MM] = &distributionsAD[DIR_0MM * numberOfLBnodes];
+         distAD.f[DIR_0PM] = &distributionsAD[DIR_0PM * numberOfLBnodes];
+         distAD.f[DIR_0MP] = &distributionsAD[DIR_0MP * numberOfLBnodes];
+         distAD.f[DIR_000] = &distributionsAD[DIR_000 * numberOfLBnodes];
+         distAD.f[DIR_PPP] = &distributionsAD[DIR_PPP * numberOfLBnodes];
+         distAD.f[DIR_MMP] = &distributionsAD[DIR_MMP * numberOfLBnodes];
+         distAD.f[DIR_PMP] = &distributionsAD[DIR_PMP * numberOfLBnodes];
+         distAD.f[DIR_MPP] = &distributionsAD[DIR_MPP * numberOfLBnodes];
+         distAD.f[DIR_PPM] = &distributionsAD[DIR_PPM * numberOfLBnodes];
+         distAD.f[DIR_MMM] = &distributionsAD[DIR_MMM * numberOfLBnodes];
+         distAD.f[DIR_PMM] = &distributionsAD[DIR_PMM * numberOfLBnodes];
+         distAD.f[DIR_MPM] = &distributionsAD[DIR_MPM * numberOfLBnodes];
       }
       else
       {
-         distAD.f[DIR_M00   ] = &distributionsAD[DIR_P00   *size_Mat];
-         distAD.f[DIR_P00   ] = &distributionsAD[DIR_M00   *size_Mat];
-         distAD.f[DIR_0M0   ] = &distributionsAD[DIR_0P0   *size_Mat];
-         distAD.f[DIR_0P0   ] = &distributionsAD[DIR_0M0   *size_Mat];
-         distAD.f[DIR_00M   ] = &distributionsAD[DIR_00P   *size_Mat];
-         distAD.f[DIR_00P   ] = &distributionsAD[DIR_00M   *size_Mat];
-         distAD.f[DIR_MM0  ] = &distributionsAD[DIR_PP0  *size_Mat];
-         distAD.f[DIR_PP0  ] = &distributionsAD[DIR_MM0  *size_Mat];
-         distAD.f[DIR_MP0  ] = &distributionsAD[DIR_PM0  *size_Mat];
-         distAD.f[DIR_PM0  ] = &distributionsAD[DIR_MP0  *size_Mat];
-         distAD.f[DIR_M0M  ] = &distributionsAD[DIR_P0P  *size_Mat];
-         distAD.f[DIR_P0P  ] = &distributionsAD[DIR_M0M  *size_Mat];
-         distAD.f[DIR_M0P  ] = &distributionsAD[DIR_P0M  *size_Mat];
-         distAD.f[DIR_P0M  ] = &distributionsAD[DIR_M0P  *size_Mat];
-         distAD.f[DIR_0MM  ] = &distributionsAD[DIR_0PP  *size_Mat];
-         distAD.f[DIR_0PP  ] = &distributionsAD[DIR_0MM  *size_Mat];
-         distAD.f[DIR_0MP  ] = &distributionsAD[DIR_0PM  *size_Mat];
-         distAD.f[DIR_0PM  ] = &distributionsAD[DIR_0MP  *size_Mat];
-         distAD.f[DIR_000] = &distributionsAD[DIR_000*size_Mat];
-         distAD.f[DIR_PPP ] = &distributionsAD[DIR_MMM *size_Mat];
-         distAD.f[DIR_MMP ] = &distributionsAD[DIR_PPM *size_Mat];
-         distAD.f[DIR_PMP ] = &distributionsAD[DIR_MPM *size_Mat];
-         distAD.f[DIR_MPP ] = &distributionsAD[DIR_PMM *size_Mat];
-         distAD.f[DIR_PPM ] = &distributionsAD[DIR_MMP *size_Mat];
-         distAD.f[DIR_MMM ] = &distributionsAD[DIR_PPP *size_Mat];
-         distAD.f[DIR_PMM ] = &distributionsAD[DIR_MPP *size_Mat];
-         distAD.f[DIR_MPM ] = &distributionsAD[DIR_PMP *size_Mat];
+         distAD.f[DIR_M00] = &distributionsAD[DIR_P00 * numberOfLBnodes];
+         distAD.f[DIR_P00] = &distributionsAD[DIR_M00 * numberOfLBnodes];
+         distAD.f[DIR_0M0] = &distributionsAD[DIR_0P0 * numberOfLBnodes];
+         distAD.f[DIR_0P0] = &distributionsAD[DIR_0M0 * numberOfLBnodes];
+         distAD.f[DIR_00M] = &distributionsAD[DIR_00P * numberOfLBnodes];
+         distAD.f[DIR_00P] = &distributionsAD[DIR_00M * numberOfLBnodes];
+         distAD.f[DIR_MM0] = &distributionsAD[DIR_PP0 * numberOfLBnodes];
+         distAD.f[DIR_PP0] = &distributionsAD[DIR_MM0 * numberOfLBnodes];
+         distAD.f[DIR_MP0] = &distributionsAD[DIR_PM0 * numberOfLBnodes];
+         distAD.f[DIR_PM0] = &distributionsAD[DIR_MP0 * numberOfLBnodes];
+         distAD.f[DIR_M0M] = &distributionsAD[DIR_P0P * numberOfLBnodes];
+         distAD.f[DIR_P0P] = &distributionsAD[DIR_M0M * numberOfLBnodes];
+         distAD.f[DIR_M0P] = &distributionsAD[DIR_P0M * numberOfLBnodes];
+         distAD.f[DIR_P0M] = &distributionsAD[DIR_M0P * numberOfLBnodes];
+         distAD.f[DIR_0MM] = &distributionsAD[DIR_0PP * numberOfLBnodes];
+         distAD.f[DIR_0PP] = &distributionsAD[DIR_0MM * numberOfLBnodes];
+         distAD.f[DIR_0MP] = &distributionsAD[DIR_0PM * numberOfLBnodes];
+         distAD.f[DIR_0PM] = &distributionsAD[DIR_0MP * numberOfLBnodes];
+         distAD.f[DIR_000] = &distributionsAD[DIR_000 * numberOfLBnodes];
+         distAD.f[DIR_PPP] = &distributionsAD[DIR_MMM * numberOfLBnodes];
+         distAD.f[DIR_MMP] = &distributionsAD[DIR_PPM * numberOfLBnodes];
+         distAD.f[DIR_PMP] = &distributionsAD[DIR_MPM * numberOfLBnodes];
+         distAD.f[DIR_MPP] = &distributionsAD[DIR_PMM * numberOfLBnodes];
+         distAD.f[DIR_PPM] = &distributionsAD[DIR_MMP * numberOfLBnodes];
+         distAD.f[DIR_MMM] = &distributionsAD[DIR_PPP * numberOfLBnodes];
+         distAD.f[DIR_PMM] = &distributionsAD[DIR_MPP * numberOfLBnodes];
+         distAD.f[DIR_MPM] = &distributionsAD[DIR_PMP * numberOfLBnodes];
       }
 	  ////////////////////////////////////////////////////////////////////////////////
 	  //! - Set neighbor indices (necessary for indirect addressing)
@@ -166,33 +166,33 @@ __global__ void CalcConc27(
 	  ////////////////////////////////////////////////////////////////////////////////
 	  //! - Set local distributions
 	  //!
-	  real mfcbb = (distAD.f[DIR_P00   ])[ke  ];
-	  real mfabb = (distAD.f[DIR_M00   ])[kw  ];
-	  real mfbcb = (distAD.f[DIR_0P0   ])[kn  ];
-	  real mfbab = (distAD.f[DIR_0M0   ])[ks  ];
-	  real mfbbc = (distAD.f[DIR_00P   ])[kt  ];
-	  real mfbba = (distAD.f[DIR_00M   ])[kb  ];
-	  real mfccb = (distAD.f[DIR_PP0  ])[kne ];
-	  real mfaab = (distAD.f[DIR_MM0  ])[ksw ];
-	  real mfcab = (distAD.f[DIR_PM0  ])[kse ];
-	  real mfacb = (distAD.f[DIR_MP0  ])[knw ];
-	  real mfcbc = (distAD.f[DIR_P0P  ])[kte ];
-	  real mfaba = (distAD.f[DIR_M0M  ])[kbw ];
-	  real mfcba = (distAD.f[DIR_P0M  ])[kbe ];
-	  real mfabc = (distAD.f[DIR_M0P  ])[ktw ];
-	  real mfbcc = (distAD.f[DIR_0PP  ])[ktn ];
-	  real mfbaa = (distAD.f[DIR_0MM  ])[kbs ];
-	  real mfbca = (distAD.f[DIR_0PM  ])[kbn ];
-	  real mfbac = (distAD.f[DIR_0MP  ])[kts ];
+	  real mfcbb = (distAD.f[DIR_P00])[ke  ];
+	  real mfabb = (distAD.f[DIR_M00])[kw  ];
+	  real mfbcb = (distAD.f[DIR_0P0])[kn  ];
+	  real mfbab = (distAD.f[DIR_0M0])[ks  ];
+	  real mfbbc = (distAD.f[DIR_00P])[kt  ];
+	  real mfbba = (distAD.f[DIR_00M])[kb  ];
+	  real mfccb = (distAD.f[DIR_PP0])[kne ];
+	  real mfaab = (distAD.f[DIR_MM0])[ksw ];
+	  real mfcab = (distAD.f[DIR_PM0])[kse ];
+	  real mfacb = (distAD.f[DIR_MP0])[knw ];
+	  real mfcbc = (distAD.f[DIR_P0P])[kte ];
+	  real mfaba = (distAD.f[DIR_M0M])[kbw ];
+	  real mfcba = (distAD.f[DIR_P0M])[kbe ];
+	  real mfabc = (distAD.f[DIR_M0P])[ktw ];
+	  real mfbcc = (distAD.f[DIR_0PP])[ktn ];
+	  real mfbaa = (distAD.f[DIR_0MM])[kbs ];
+	  real mfbca = (distAD.f[DIR_0PM])[kbn ];
+	  real mfbac = (distAD.f[DIR_0MP])[kts ];
 	  real mfbbb = (distAD.f[DIR_000])[k   ];
-	  real mfccc = (distAD.f[DIR_PPP ])[ktne];
-	  real mfaac = (distAD.f[DIR_MMP ])[ktsw];
-	  real mfcac = (distAD.f[DIR_PMP ])[ktse];
-	  real mfacc = (distAD.f[DIR_MPP ])[ktnw];
-	  real mfcca = (distAD.f[DIR_PPM ])[kbne];
-	  real mfaaa = (distAD.f[DIR_MMM ])[kbsw];
-	  real mfcaa = (distAD.f[DIR_PMM ])[kbse];
-	  real mfaca = (distAD.f[DIR_MPM ])[kbnw];
+	  real mfccc = (distAD.f[DIR_PPP])[ktne];
+	  real mfaac = (distAD.f[DIR_MMP])[ktsw];
+	  real mfcac = (distAD.f[DIR_PMP])[ktse];
+	  real mfacc = (distAD.f[DIR_MPP])[ktnw];
+	  real mfcca = (distAD.f[DIR_PPM])[kbne];
+	  real mfaaa = (distAD.f[DIR_MMM])[kbsw];
+	  real mfcaa = (distAD.f[DIR_PMM])[kbse];
+	  real mfaca = (distAD.f[DIR_MPM])[kbnw];
       //////////////////////////////////////////////////////////////////////////
 	  //! - Calculate concentration using pyramid summation for low round-off errors as in Eq. (J1)-(J3) \ref
 	  //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015), DOI:10.1016/j.camwa.2015.05.001 ]</b></a>
@@ -229,30 +229,30 @@ __global__ void CalcConc7( real* Conc,
                                           unsigned int* neighborX,
                                           unsigned int* neighborY,
                                           unsigned int* neighborZ,
-                                          unsigned int size_Mat,
+                                          unsigned long long numberOfLBnodes,
                                           real* DD7,
                                           bool isEvenTimestep)
 {
    Distributions7 D7;
    if (isEvenTimestep==true)
    {
-      D7.f[0] = &DD7[0*size_Mat];
-      D7.f[1] = &DD7[1*size_Mat];
-      D7.f[2] = &DD7[2*size_Mat];
-      D7.f[3] = &DD7[3*size_Mat];
-      D7.f[4] = &DD7[4*size_Mat];
-      D7.f[5] = &DD7[5*size_Mat];
-      D7.f[6] = &DD7[6*size_Mat];
+      D7.f[0] = &DD7[0*numberOfLBnodes];
+      D7.f[1] = &DD7[1*numberOfLBnodes];
+      D7.f[2] = &DD7[2*numberOfLBnodes];
+      D7.f[3] = &DD7[3*numberOfLBnodes];
+      D7.f[4] = &DD7[4*numberOfLBnodes];
+      D7.f[5] = &DD7[5*numberOfLBnodes];
+      D7.f[6] = &DD7[6*numberOfLBnodes];
    } 
    else
    {
-      D7.f[0] = &DD7[0*size_Mat];
-      D7.f[2] = &DD7[1*size_Mat];
-      D7.f[1] = &DD7[2*size_Mat];
-      D7.f[4] = &DD7[3*size_Mat];
-      D7.f[3] = &DD7[4*size_Mat];
-      D7.f[6] = &DD7[5*size_Mat];
-      D7.f[5] = &DD7[6*size_Mat];
+      D7.f[0] = &DD7[0*numberOfLBnodes];
+      D7.f[2] = &DD7[1*numberOfLBnodes];
+      D7.f[1] = &DD7[2*numberOfLBnodes];
+      D7.f[4] = &DD7[3*numberOfLBnodes];
+      D7.f[3] = &DD7[4*numberOfLBnodes];
+      D7.f[6] = &DD7[5*numberOfLBnodes];
+      D7.f[5] = &DD7[6*numberOfLBnodes];
    }
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  x = threadIdx.x;  // Globaler x-Index 
@@ -265,7 +265,7 @@ __global__ void CalcConc7( real* Conc,
    const unsigned k = nx*(ny*z + y) + x;
    //////////////////////////////////////////////////////////////////////////
 
-   if(k<size_Mat)
+   if(k<numberOfLBnodes)
    {
       //////////////////////////////////////////////////////////////////////////
       //index
@@ -327,63 +327,63 @@ __global__ void CalcConc7( real* Conc,
 //    Distributions27 D27;
 //    if (isEvenTimestep==true)
 //    {
-//       D27.f[DIR_P00   ] = &DD27[DIR_P00   *size_Mat];
-//       D27.f[DIR_M00   ] = &DD27[DIR_M00   *size_Mat];
-//       D27.f[DIR_0P0   ] = &DD27[DIR_0P0   *size_Mat];
-//       D27.f[DIR_0M0   ] = &DD27[DIR_0M0   *size_Mat];
-//       D27.f[DIR_00P   ] = &DD27[DIR_00P   *size_Mat];
-//       D27.f[DIR_00M   ] = &DD27[DIR_00M   *size_Mat];
-//       D27.f[DIR_PP0  ] = &DD27[DIR_PP0  *size_Mat];
-//       D27.f[DIR_MM0  ] = &DD27[DIR_MM0  *size_Mat];
-//       D27.f[DIR_PM0  ] = &DD27[DIR_PM0  *size_Mat];
-//       D27.f[DIR_MP0  ] = &DD27[DIR_MP0  *size_Mat];
-//       D27.f[DIR_P0P  ] = &DD27[DIR_P0P  *size_Mat];
-//       D27.f[DIR_M0M  ] = &DD27[DIR_M0M  *size_Mat];
-//       D27.f[DIR_P0M  ] = &DD27[DIR_P0M  *size_Mat];
-//       D27.f[DIR_M0P  ] = &DD27[DIR_M0P  *size_Mat];
-//       D27.f[DIR_0PP  ] = &DD27[DIR_0PP  *size_Mat];
-//       D27.f[DIR_0MM  ] = &DD27[DIR_0MM  *size_Mat];
-//       D27.f[DIR_0PM  ] = &DD27[DIR_0PM  *size_Mat];
-//       D27.f[DIR_0MP  ] = &DD27[DIR_0MP  *size_Mat];
-//       D27.f[DIR_000] = &DD27[DIR_000*size_Mat];
-//       D27.f[DIR_PPP ] = &DD27[DIR_PPP *size_Mat];
-//       D27.f[DIR_MMP ] = &DD27[DIR_MMP *size_Mat];
-//       D27.f[DIR_PMP ] = &DD27[DIR_PMP *size_Mat];
-//       D27.f[DIR_MPP ] = &DD27[DIR_MPP *size_Mat];
-//       D27.f[DIR_PPM ] = &DD27[DIR_PPM *size_Mat];
-//       D27.f[DIR_MMM ] = &DD27[DIR_MMM *size_Mat];
-//       D27.f[DIR_PMM ] = &DD27[DIR_PMM *size_Mat];
-//       D27.f[DIR_MPM ] = &DD27[DIR_MPM *size_Mat];
+//       D27.f[DIR_P00] = &DD27[DIR_P00 * size_Mat];
+//       D27.f[DIR_M00] = &DD27[DIR_M00 * size_Mat];
+//       D27.f[DIR_0P0] = &DD27[DIR_0P0 * size_Mat];
+//       D27.f[DIR_0M0] = &DD27[DIR_0M0 * size_Mat];
+//       D27.f[DIR_00P] = &DD27[DIR_00P * size_Mat];
+//       D27.f[DIR_00M] = &DD27[DIR_00M * size_Mat];
+//       D27.f[DIR_PP0] = &DD27[DIR_PP0 * size_Mat];
+//       D27.f[DIR_MM0] = &DD27[DIR_MM0 * size_Mat];
+//       D27.f[DIR_PM0] = &DD27[DIR_PM0 * size_Mat];
+//       D27.f[DIR_MP0] = &DD27[DIR_MP0 * size_Mat];
+//       D27.f[DIR_P0P] = &DD27[DIR_P0P * size_Mat];
+//       D27.f[DIR_M0M] = &DD27[DIR_M0M * size_Mat];
+//       D27.f[DIR_P0M] = &DD27[DIR_P0M * size_Mat];
+//       D27.f[DIR_M0P] = &DD27[DIR_M0P * size_Mat];
+//       D27.f[DIR_0PP] = &DD27[DIR_0PP * size_Mat];
+//       D27.f[DIR_0MM] = &DD27[DIR_0MM * size_Mat];
+//       D27.f[DIR_0PM] = &DD27[DIR_0PM * size_Mat];
+//       D27.f[DIR_0MP] = &DD27[DIR_0MP * size_Mat];
+//       D27.f[DIR_000] = &DD27[DIR_000 * size_Mat];
+//       D27.f[DIR_PPP] = &DD27[DIR_PPP * size_Mat];
+//       D27.f[DIR_MMP] = &DD27[DIR_MMP * size_Mat];
+//       D27.f[DIR_PMP] = &DD27[DIR_PMP * size_Mat];
+//       D27.f[DIR_MPP] = &DD27[DIR_MPP * size_Mat];
+//       D27.f[DIR_PPM] = &DD27[DIR_PPM * size_Mat];
+//       D27.f[DIR_MMM] = &DD27[DIR_MMM * size_Mat];
+//       D27.f[DIR_PMM] = &DD27[DIR_PMM * size_Mat];
+//       D27.f[DIR_MPM] = &DD27[DIR_MPM * size_Mat];
 //    }
 //    else
 //    {
-//       D27.f[DIR_M00   ] = &DD27[DIR_P00   *size_Mat];
-//       D27.f[DIR_P00   ] = &DD27[DIR_M00   *size_Mat];
-//       D27.f[DIR_0M0   ] = &DD27[DIR_0P0   *size_Mat];
-//       D27.f[DIR_0P0   ] = &DD27[DIR_0M0   *size_Mat];
-//       D27.f[DIR_00M   ] = &DD27[DIR_00P   *size_Mat];
-//       D27.f[DIR_00P   ] = &DD27[DIR_00M   *size_Mat];
-//       D27.f[DIR_MM0  ] = &DD27[DIR_PP0  *size_Mat];
-//       D27.f[DIR_PP0  ] = &DD27[DIR_MM0  *size_Mat];
-//       D27.f[DIR_MP0  ] = &DD27[DIR_PM0  *size_Mat];
-//       D27.f[DIR_PM0  ] = &DD27[DIR_MP0  *size_Mat];
-//       D27.f[DIR_M0M  ] = &DD27[DIR_P0P  *size_Mat];
-//       D27.f[DIR_P0P  ] = &DD27[DIR_M0M  *size_Mat];
-//       D27.f[DIR_M0P  ] = &DD27[DIR_P0M  *size_Mat];
-//       D27.f[DIR_P0M  ] = &DD27[DIR_M0P  *size_Mat];
-//       D27.f[DIR_0MM  ] = &DD27[DIR_0PP  *size_Mat];
-//       D27.f[DIR_0PP  ] = &DD27[DIR_0MM  *size_Mat];
-//       D27.f[DIR_0MP  ] = &DD27[DIR_0PM  *size_Mat];
-//       D27.f[DIR_0PM  ] = &DD27[DIR_0MP  *size_Mat];
-//       D27.f[DIR_000] = &DD27[DIR_000*size_Mat];
-//       D27.f[DIR_MMM ] = &DD27[DIR_PPP *size_Mat];
-//       D27.f[DIR_PPM ] = &DD27[DIR_MMP *size_Mat];
-//       D27.f[DIR_MPM ] = &DD27[DIR_PMP *size_Mat];
-//       D27.f[DIR_PMM ] = &DD27[DIR_MPP *size_Mat];
-//       D27.f[DIR_MMP ] = &DD27[DIR_PPM *size_Mat];
-//       D27.f[DIR_PPP ] = &DD27[DIR_MMM *size_Mat];
-//       D27.f[DIR_MPP ] = &DD27[DIR_PMM *size_Mat];
-//       D27.f[DIR_PMP ] = &DD27[DIR_MPM *size_Mat];
+//       D27.f[DIR_M00] = &DD27[DIR_P00 * size_Mat];
+//       D27.f[DIR_P00] = &DD27[DIR_M00 * size_Mat];
+//       D27.f[DIR_0M0] = &DD27[DIR_0P0 * size_Mat];
+//       D27.f[DIR_0P0] = &DD27[DIR_0M0 * size_Mat];
+//       D27.f[DIR_00M] = &DD27[DIR_00P * size_Mat];
+//       D27.f[DIR_00P] = &DD27[DIR_00M * size_Mat];
+//       D27.f[DIR_MM0] = &DD27[DIR_PP0 * size_Mat];
+//       D27.f[DIR_PP0] = &DD27[DIR_MM0 * size_Mat];
+//       D27.f[DIR_MP0] = &DD27[DIR_PM0 * size_Mat];
+//       D27.f[DIR_PM0] = &DD27[DIR_MP0 * size_Mat];
+//       D27.f[DIR_M0M] = &DD27[DIR_P0P * size_Mat];
+//       D27.f[DIR_P0P] = &DD27[DIR_M0M * size_Mat];
+//       D27.f[DIR_M0P] = &DD27[DIR_P0M * size_Mat];
+//       D27.f[DIR_P0M] = &DD27[DIR_M0P * size_Mat];
+//       D27.f[DIR_0MM] = &DD27[DIR_0PP * size_Mat];
+//       D27.f[DIR_0PP] = &DD27[DIR_0MM * size_Mat];
+//       D27.f[DIR_0MP] = &DD27[DIR_0PM * size_Mat];
+//       D27.f[DIR_0PM] = &DD27[DIR_0MP * size_Mat];
+//       D27.f[DIR_000] = &DD27[DIR_000 * size_Mat];
+//       D27.f[DIR_MMM] = &DD27[DIR_PPP * size_Mat];
+//       D27.f[DIR_PPM] = &DD27[DIR_MMP * size_Mat];
+//       D27.f[DIR_MPM] = &DD27[DIR_PMP * size_Mat];
+//       D27.f[DIR_PMM] = &DD27[DIR_MPP * size_Mat];
+//       D27.f[DIR_MMP] = &DD27[DIR_PPM * size_Mat];
+//       D27.f[DIR_PPP] = &DD27[DIR_MMM * size_Mat];
+//       D27.f[DIR_MPP] = &DD27[DIR_PMM * size_Mat];
+//       D27.f[DIR_PMP] = &DD27[DIR_MPM * size_Mat];
 //    }
 //    ////////////////////////////////////////////////////////////////////////////////
 //    const unsigned  x = threadIdx.x;  // Globaler x-Index 
@@ -432,20 +432,20 @@ __global__ void CalcConc7( real* Conc,
 
 //       if(geoD[k] == GEO_FLUID)
 //       {
-//          Conc[k]    =   (D27.f[DIR_P00   ])[ke  ]+ (D27.f[DIR_M00   ])[kw  ]+ 
-//                         (D27.f[DIR_0P0   ])[kn  ]+ (D27.f[DIR_0M0   ])[ks  ]+
-//                         (D27.f[DIR_00P   ])[kt  ]+ (D27.f[DIR_00M   ])[kb  ]+
-//                         (D27.f[DIR_PP0  ])[kne ]+ (D27.f[DIR_MM0  ])[ksw ]+
-//                         (D27.f[DIR_PM0  ])[kse ]+ (D27.f[DIR_MP0  ])[knw ]+
-//                         (D27.f[DIR_P0P  ])[kte ]+ (D27.f[DIR_M0M  ])[kbw ]+
-//                         (D27.f[DIR_P0M  ])[kbe ]+ (D27.f[DIR_M0P  ])[ktw ]+
-//                         (D27.f[DIR_0PP  ])[ktn ]+ (D27.f[DIR_0MM  ])[kbs ]+
-//                         (D27.f[DIR_0PM  ])[kbn ]+ (D27.f[DIR_0MP  ])[kts ]+
+//          Conc[k]    =   (D27.f[DIR_P00])[ke  ]+ (D27.f[DIR_M00])[kw  ]+ 
+//                         (D27.f[DIR_0P0])[kn  ]+ (D27.f[DIR_0M0])[ks  ]+
+//                         (D27.f[DIR_00P])[kt  ]+ (D27.f[DIR_00M])[kb  ]+
+//                         (D27.f[DIR_PP0])[kne ]+ (D27.f[DIR_MM0])[ksw ]+
+//                         (D27.f[DIR_PM0])[kse ]+ (D27.f[DIR_MP0])[knw ]+
+//                         (D27.f[DIR_P0P])[kte ]+ (D27.f[DIR_M0M])[kbw ]+
+//                         (D27.f[DIR_P0M])[kbe ]+ (D27.f[DIR_M0P])[ktw ]+
+//                         (D27.f[DIR_0PP])[ktn ]+ (D27.f[DIR_0MM])[kbs ]+
+//                         (D27.f[DIR_0PM])[kbn ]+ (D27.f[DIR_0MP])[kts ]+
 //                         (D27.f[DIR_000])[kzero]+ 
-//                         (D27.f[DIR_PPP ])[ktne]+ (D27.f[DIR_MMP ])[ktsw]+
-//                         (D27.f[DIR_PMP ])[ktse]+ (D27.f[DIR_MPP ])[ktnw]+
-//                         (D27.f[DIR_PPM ])[kbne]+ (D27.f[DIR_MMM ])[kbsw]+
-//                         (D27.f[DIR_PMM ])[kbse]+ (D27.f[DIR_MPM ])[kbnw];
+//                         (D27.f[DIR_PPP])[ktne]+ (D27.f[DIR_MMP])[ktsw]+
+//                         (D27.f[DIR_PMP])[ktse]+ (D27.f[DIR_MPP])[ktnw]+
+//                         (D27.f[DIR_PPM])[kbne]+ (D27.f[DIR_MMM])[kbsw]+
+//                         (D27.f[DIR_PMM])[kbse]+ (D27.f[DIR_MPM])[kbnw];
 //       }
 //    }   
 // }
@@ -476,30 +476,30 @@ __global__ void GetPlaneConc7(real* Conc,
 											unsigned int* neighborX,
 											unsigned int* neighborY,
 											unsigned int* neighborZ,
-											unsigned int size_Mat,
+											unsigned long long numberOfLBnodes,
 											real* DD7,
 											bool isEvenTimestep)
 {
    Distributions7 D7;
    if (isEvenTimestep==true)
    {
-      D7.f[0] = &DD7[0*size_Mat];
-      D7.f[1] = &DD7[1*size_Mat];
-      D7.f[2] = &DD7[2*size_Mat];
-      D7.f[3] = &DD7[3*size_Mat];
-      D7.f[4] = &DD7[4*size_Mat];
-      D7.f[5] = &DD7[5*size_Mat];
-      D7.f[6] = &DD7[6*size_Mat];
+      D7.f[0] = &DD7[0*numberOfLBnodes];
+      D7.f[1] = &DD7[1*numberOfLBnodes];
+      D7.f[2] = &DD7[2*numberOfLBnodes];
+      D7.f[3] = &DD7[3*numberOfLBnodes];
+      D7.f[4] = &DD7[4*numberOfLBnodes];
+      D7.f[5] = &DD7[5*numberOfLBnodes];
+      D7.f[6] = &DD7[6*numberOfLBnodes];
    } 
    else
    {
-      D7.f[0] = &DD7[0*size_Mat];
-      D7.f[2] = &DD7[1*size_Mat];
-      D7.f[1] = &DD7[2*size_Mat];
-      D7.f[4] = &DD7[3*size_Mat];
-      D7.f[3] = &DD7[4*size_Mat];
-      D7.f[6] = &DD7[5*size_Mat];
-      D7.f[5] = &DD7[6*size_Mat];
+      D7.f[0] = &DD7[0*numberOfLBnodes];
+      D7.f[2] = &DD7[1*numberOfLBnodes];
+      D7.f[1] = &DD7[2*numberOfLBnodes];
+      D7.f[4] = &DD7[3*numberOfLBnodes];
+      D7.f[3] = &DD7[4*numberOfLBnodes];
+      D7.f[6] = &DD7[5*numberOfLBnodes];
+      D7.f[5] = &DD7[6*numberOfLBnodes];
    }
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  x = threadIdx.x;  // Globaler x-Index 
@@ -581,70 +581,70 @@ __global__ void GetPlaneConc27(real* Conc,
 											 unsigned int* neighborX,
 											 unsigned int* neighborY,
 											 unsigned int* neighborZ,
-											 unsigned int size_Mat,
+											 unsigned long long numberOfLBnodes,
 											 real* DD27,
 											 bool isEvenTimestep)
 {
    Distributions27 D27;
    if (isEvenTimestep==true)
    {
-      D27.f[DIR_P00   ] = &DD27[DIR_P00   *size_Mat];
-      D27.f[DIR_M00   ] = &DD27[DIR_M00   *size_Mat];
-      D27.f[DIR_0P0   ] = &DD27[DIR_0P0   *size_Mat];
-      D27.f[DIR_0M0   ] = &DD27[DIR_0M0   *size_Mat];
-      D27.f[DIR_00P   ] = &DD27[DIR_00P   *size_Mat];
-      D27.f[DIR_00M   ] = &DD27[DIR_00M   *size_Mat];
-      D27.f[DIR_PP0  ] = &DD27[DIR_PP0  *size_Mat];
-      D27.f[DIR_MM0  ] = &DD27[DIR_MM0  *size_Mat];
-      D27.f[DIR_PM0  ] = &DD27[DIR_PM0  *size_Mat];
-      D27.f[DIR_MP0  ] = &DD27[DIR_MP0  *size_Mat];
-      D27.f[DIR_P0P  ] = &DD27[DIR_P0P  *size_Mat];
-      D27.f[DIR_M0M  ] = &DD27[DIR_M0M  *size_Mat];
-      D27.f[DIR_P0M  ] = &DD27[DIR_P0M  *size_Mat];
-      D27.f[DIR_M0P  ] = &DD27[DIR_M0P  *size_Mat];
-      D27.f[DIR_0PP  ] = &DD27[DIR_0PP  *size_Mat];
-      D27.f[DIR_0MM  ] = &DD27[DIR_0MM  *size_Mat];
-      D27.f[DIR_0PM  ] = &DD27[DIR_0PM  *size_Mat];
-      D27.f[DIR_0MP  ] = &DD27[DIR_0MP  *size_Mat];
-      D27.f[DIR_000] = &DD27[DIR_000*size_Mat];
-      D27.f[DIR_PPP ] = &DD27[DIR_PPP *size_Mat];
-      D27.f[DIR_MMP ] = &DD27[DIR_MMP *size_Mat];
-      D27.f[DIR_PMP ] = &DD27[DIR_PMP *size_Mat];
-      D27.f[DIR_MPP ] = &DD27[DIR_MPP *size_Mat];
-      D27.f[DIR_PPM ] = &DD27[DIR_PPM *size_Mat];
-      D27.f[DIR_MMM ] = &DD27[DIR_MMM *size_Mat];
-      D27.f[DIR_PMM ] = &DD27[DIR_PMM *size_Mat];
-      D27.f[DIR_MPM ] = &DD27[DIR_MPM *size_Mat];
+      D27.f[DIR_P00] = &DD27[DIR_P00 * numberOfLBnodes];
+      D27.f[DIR_M00] = &DD27[DIR_M00 * numberOfLBnodes];
+      D27.f[DIR_0P0] = &DD27[DIR_0P0 * numberOfLBnodes];
+      D27.f[DIR_0M0] = &DD27[DIR_0M0 * numberOfLBnodes];
+      D27.f[DIR_00P] = &DD27[DIR_00P * numberOfLBnodes];
+      D27.f[DIR_00M] = &DD27[DIR_00M * numberOfLBnodes];
+      D27.f[DIR_PP0] = &DD27[DIR_PP0 * numberOfLBnodes];
+      D27.f[DIR_MM0] = &DD27[DIR_MM0 * numberOfLBnodes];
+      D27.f[DIR_PM0] = &DD27[DIR_PM0 * numberOfLBnodes];
+      D27.f[DIR_MP0] = &DD27[DIR_MP0 * numberOfLBnodes];
+      D27.f[DIR_P0P] = &DD27[DIR_P0P * numberOfLBnodes];
+      D27.f[DIR_M0M] = &DD27[DIR_M0M * numberOfLBnodes];
+      D27.f[DIR_P0M] = &DD27[DIR_P0M * numberOfLBnodes];
+      D27.f[DIR_M0P] = &DD27[DIR_M0P * numberOfLBnodes];
+      D27.f[DIR_0PP] = &DD27[DIR_0PP * numberOfLBnodes];
+      D27.f[DIR_0MM] = &DD27[DIR_0MM * numberOfLBnodes];
+      D27.f[DIR_0PM] = &DD27[DIR_0PM * numberOfLBnodes];
+      D27.f[DIR_0MP] = &DD27[DIR_0MP * numberOfLBnodes];
+      D27.f[DIR_000] = &DD27[DIR_000 * numberOfLBnodes];
+      D27.f[DIR_PPP] = &DD27[DIR_PPP * numberOfLBnodes];
+      D27.f[DIR_MMP] = &DD27[DIR_MMP * numberOfLBnodes];
+      D27.f[DIR_PMP] = &DD27[DIR_PMP * numberOfLBnodes];
+      D27.f[DIR_MPP] = &DD27[DIR_MPP * numberOfLBnodes];
+      D27.f[DIR_PPM] = &DD27[DIR_PPM * numberOfLBnodes];
+      D27.f[DIR_MMM] = &DD27[DIR_MMM * numberOfLBnodes];
+      D27.f[DIR_PMM] = &DD27[DIR_PMM * numberOfLBnodes];
+      D27.f[DIR_MPM] = &DD27[DIR_MPM * numberOfLBnodes];
    }
    else
    {
-      D27.f[DIR_M00   ] = &DD27[DIR_P00   *size_Mat];
-      D27.f[DIR_P00   ] = &DD27[DIR_M00   *size_Mat];
-      D27.f[DIR_0M0   ] = &DD27[DIR_0P0   *size_Mat];
-      D27.f[DIR_0P0   ] = &DD27[DIR_0M0   *size_Mat];
-      D27.f[DIR_00M   ] = &DD27[DIR_00P   *size_Mat];
-      D27.f[DIR_00P   ] = &DD27[DIR_00M   *size_Mat];
-      D27.f[DIR_MM0  ] = &DD27[DIR_PP0  *size_Mat];
-      D27.f[DIR_PP0  ] = &DD27[DIR_MM0  *size_Mat];
-      D27.f[DIR_MP0  ] = &DD27[DIR_PM0  *size_Mat];
-      D27.f[DIR_PM0  ] = &DD27[DIR_MP0  *size_Mat];
-      D27.f[DIR_M0M  ] = &DD27[DIR_P0P  *size_Mat];
-      D27.f[DIR_P0P  ] = &DD27[DIR_M0M  *size_Mat];
-      D27.f[DIR_M0P  ] = &DD27[DIR_P0M  *size_Mat];
-      D27.f[DIR_P0M  ] = &DD27[DIR_M0P  *size_Mat];
-      D27.f[DIR_0MM  ] = &DD27[DIR_0PP  *size_Mat];
-      D27.f[DIR_0PP  ] = &DD27[DIR_0MM  *size_Mat];
-      D27.f[DIR_0MP  ] = &DD27[DIR_0PM  *size_Mat];
-      D27.f[DIR_0PM  ] = &DD27[DIR_0MP  *size_Mat];
-      D27.f[DIR_000] = &DD27[DIR_000*size_Mat];
-      D27.f[DIR_MMM ] = &DD27[DIR_PPP *size_Mat];
-      D27.f[DIR_PPM ] = &DD27[DIR_MMP *size_Mat];
-      D27.f[DIR_MPM ] = &DD27[DIR_PMP *size_Mat];
-      D27.f[DIR_PMM ] = &DD27[DIR_MPP *size_Mat];
-      D27.f[DIR_MMP ] = &DD27[DIR_PPM *size_Mat];
-      D27.f[DIR_PPP ] = &DD27[DIR_MMM *size_Mat];
-      D27.f[DIR_MPP ] = &DD27[DIR_PMM *size_Mat];
-      D27.f[DIR_PMP ] = &DD27[DIR_MPM *size_Mat];
+      D27.f[DIR_M00] = &DD27[DIR_P00 * numberOfLBnodes];
+      D27.f[DIR_P00] = &DD27[DIR_M00 * numberOfLBnodes];
+      D27.f[DIR_0M0] = &DD27[DIR_0P0 * numberOfLBnodes];
+      D27.f[DIR_0P0] = &DD27[DIR_0M0 * numberOfLBnodes];
+      D27.f[DIR_00M] = &DD27[DIR_00P * numberOfLBnodes];
+      D27.f[DIR_00P] = &DD27[DIR_00M * numberOfLBnodes];
+      D27.f[DIR_MM0] = &DD27[DIR_PP0 * numberOfLBnodes];
+      D27.f[DIR_PP0] = &DD27[DIR_MM0 * numberOfLBnodes];
+      D27.f[DIR_MP0] = &DD27[DIR_PM0 * numberOfLBnodes];
+      D27.f[DIR_PM0] = &DD27[DIR_MP0 * numberOfLBnodes];
+      D27.f[DIR_M0M] = &DD27[DIR_P0P * numberOfLBnodes];
+      D27.f[DIR_P0P] = &DD27[DIR_M0M * numberOfLBnodes];
+      D27.f[DIR_M0P] = &DD27[DIR_P0M * numberOfLBnodes];
+      D27.f[DIR_P0M] = &DD27[DIR_M0P * numberOfLBnodes];
+      D27.f[DIR_0MM] = &DD27[DIR_0PP * numberOfLBnodes];
+      D27.f[DIR_0PP] = &DD27[DIR_0MM * numberOfLBnodes];
+      D27.f[DIR_0MP] = &DD27[DIR_0PM * numberOfLBnodes];
+      D27.f[DIR_0PM] = &DD27[DIR_0MP * numberOfLBnodes];
+      D27.f[DIR_000] = &DD27[DIR_000 * numberOfLBnodes];
+      D27.f[DIR_MMM] = &DD27[DIR_PPP * numberOfLBnodes];
+      D27.f[DIR_PPM] = &DD27[DIR_MMP * numberOfLBnodes];
+      D27.f[DIR_MPM] = &DD27[DIR_PMP * numberOfLBnodes];
+      D27.f[DIR_PMM] = &DD27[DIR_MPP * numberOfLBnodes];
+      D27.f[DIR_MMP] = &DD27[DIR_PPM * numberOfLBnodes];
+      D27.f[DIR_PPP] = &DD27[DIR_MMM * numberOfLBnodes];
+      D27.f[DIR_MPP] = &DD27[DIR_PMM * numberOfLBnodes];
+      D27.f[DIR_PMP] = &DD27[DIR_MPM * numberOfLBnodes];
    }
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  x = threadIdx.x;  // Globaler x-Index 
@@ -693,20 +693,20 @@ __global__ void GetPlaneConc27(real* Conc,
 
       if(geoD[k] == GEO_FLUID)
       {
-         Conc[k]    =   (D27.f[DIR_P00   ])[ke  ]+ (D27.f[DIR_M00   ])[kw  ]+ 
-                        (D27.f[DIR_0P0   ])[kn  ]+ (D27.f[DIR_0M0   ])[ks  ]+
-                        (D27.f[DIR_00P   ])[kt  ]+ (D27.f[DIR_00M   ])[kb  ]+
-                        (D27.f[DIR_PP0  ])[kne ]+ (D27.f[DIR_MM0  ])[ksw ]+
-                        (D27.f[DIR_PM0  ])[kse ]+ (D27.f[DIR_MP0  ])[knw ]+
-                        (D27.f[DIR_P0P  ])[kte ]+ (D27.f[DIR_M0M  ])[kbw ]+
-                        (D27.f[DIR_P0M  ])[kbe ]+ (D27.f[DIR_M0P  ])[ktw ]+
-                        (D27.f[DIR_0PP  ])[ktn ]+ (D27.f[DIR_0MM  ])[kbs ]+
-                        (D27.f[DIR_0PM  ])[kbn ]+ (D27.f[DIR_0MP  ])[kts ]+
+         Conc[k]    =   (D27.f[DIR_P00])[ke  ]+ (D27.f[DIR_M00])[kw  ]+ 
+                        (D27.f[DIR_0P0])[kn  ]+ (D27.f[DIR_0M0])[ks  ]+
+                        (D27.f[DIR_00P])[kt  ]+ (D27.f[DIR_00M])[kb  ]+
+                        (D27.f[DIR_PP0])[kne ]+ (D27.f[DIR_MM0])[ksw ]+
+                        (D27.f[DIR_PM0])[kse ]+ (D27.f[DIR_MP0])[knw ]+
+                        (D27.f[DIR_P0P])[kte ]+ (D27.f[DIR_M0M])[kbw ]+
+                        (D27.f[DIR_P0M])[kbe ]+ (D27.f[DIR_M0P])[ktw ]+
+                        (D27.f[DIR_0PP])[ktn ]+ (D27.f[DIR_0MM])[kbs ]+
+                        (D27.f[DIR_0PM])[kbn ]+ (D27.f[DIR_0MP])[kts ]+
                         (D27.f[DIR_000])[kzero]+ 
-                        (D27.f[DIR_PPP ])[ktne]+ (D27.f[DIR_MMP ])[ktsw]+
-                        (D27.f[DIR_PMP ])[ktse]+ (D27.f[DIR_MPP ])[ktnw]+
-                        (D27.f[DIR_PPM ])[kbne]+ (D27.f[DIR_MMM ])[kbsw]+
-                        (D27.f[DIR_PMM ])[kbse]+ (D27.f[DIR_MPM ])[kbnw];
+                        (D27.f[DIR_PPP])[ktne]+ (D27.f[DIR_MMP])[ktsw]+
+                        (D27.f[DIR_PMP])[ktse]+ (D27.f[DIR_MPP])[ktnw]+
+                        (D27.f[DIR_PPM])[kbne]+ (D27.f[DIR_MMM])[kbsw]+
+                        (D27.f[DIR_PMM])[kbse]+ (D27.f[DIR_MPM])[kbnw];
       }
    }   
 }
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/GPU/CalcMac27.cu b/src/gpu/VirtualFluids_GPU/GPU/CalcMac27.cu
index 4792b8846b2612383c07a97419e0473b21ebd187..f7bb09f816f45973fd4e2319a1bfa35cf9172caa 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/CalcMac27.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/CalcMac27.cu
@@ -1,306 +1,310 @@
-//  _    ___      __              __________      _     __        ______________   __
-// | |  / (_)____/ /___  ______ _/ / ____/ /_  __(_)___/ /____   /  ___/ __  / /  / /
-// | | / / / ___/ __/ / / / __ `/ / /_  / / / / / / __  / ___/  / /___/ /_/ / /  / /
-// | |/ / / /  / /_/ /_/ / /_/ / / __/ / / /_/ / / /_/ (__  )  / /_) / ____/ /__/ / 
-// |___/_/_/   \__/\__,_/\__,_/_/_/   /_/\__,_/_/\__,_/____/   \____/_/    \_____/
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
 //
-//////////////////////////////////////////////////////////////////////////
-/* Device code */
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file CalcMac27.cu
+//! \ingroup GPU
+//! \author Martin Schoenherr, Soeren Peters
+//======================================================================================
 #include "LBM/LB.h" 
 #include "lbm/constants/D3Q27.h"
 #include "lbm/constants/NumericConstants.h"
+#include "lbm/MacroscopicQuantities.h"
+
+#include "Kernel/Utilities/DistributionHelper.cuh"
 
 using namespace vf::lbm::constant;
 using namespace vf::lbm::dir;
+using namespace vf::gpu;
+
+////////////////////////////////////////////////////////////////////////////////
+__global__ void LBCalcMac27(
+    real* vxD,
+    real* vyD,
+    real* vzD,
+    real* rhoD,
+    unsigned int* geoD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    real* distributions,
+    bool isEvenTimestep)
+{   // Writes density and incompressible velocity of node k into rhoD, vxD, vyD and vzD.
+    const unsigned int tx = threadIdx.x;    // Thread index = local i index
+    const unsigned int by = blockIdx.x;     // Block index x
+    const unsigned int bz = blockIdx.y;     // Block index y
+    const unsigned int x = tx + STARTOFFX;  // Global x index
+    const unsigned int y = by + STARTOFFY;  // Global y index
+    const unsigned int z = bz + STARTOFFZ;  // Global z index
+    // Extents used for linearization: block/grid size plus STARTOFFX/STARTOFFY added twice.
+    const unsigned nx = blockDim.x + 2 * STARTOFFX;
+    const unsigned ny = gridDim.x + 2 * STARTOFFY;
+ 
+    const unsigned int k = nx*(ny*z + y) + x; // Linear index used to access the device arrays
+ 
+    // Nothing to do for threads whose index lies past the last node.
+    if(k >= numberOfLBnodes)
+        return;
+    // Nodes rejected by isValidFluidNode() return before any output is written.
+    if(!isValidFluidNode(geoD[k]))
+       return;
+    // Reset the outputs to c0o1; each of them is assigned again further down.
+    rhoD[k] = c0o1;
+    vxD[k]  = c0o1;
+    vyD[k]  = c0o1;
+    vzD[k]  = c0o1;
+    // distr_wrapper.distribution.f is the input consumed by the vf::lbm getters below.
+    DistributionWrapper distr_wrapper(distributions, numberOfLBnodes, isEvenTimestep, k, neighborX, neighborY, neighborZ);
+    const auto& distribution = distr_wrapper.distribution;
+    // Density first, then the three incompressible velocity components.
+    rhoD[k] = vf::lbm::getDensity(distribution.f);
+    vxD[k] = vf::lbm::getIncompressibleVelocityX1(distribution.f);
+    vyD[k] = vf::lbm::getIncompressibleVelocityX2(distribution.f);
+    vzD[k] = vf::lbm::getIncompressibleVelocityX3(distribution.f);
+}
+
 
-#include "lbm/MacroscopicQuantities.h"
 
-#include "../Kernel/Utilities/DistributionHelper.cuh"
 
 
 ////////////////////////////////////////////////////////////////////////////////
-__global__ void LBCalcMac27( real* vxD,
-                                        real* vyD,
-                                        real* vzD,
-                                        real* rhoD,
-                                        unsigned int* geoD,
-                                        unsigned int* neighborX,
-                                        unsigned int* neighborY,
-                                        unsigned int* neighborZ,
-                                        unsigned int size_Mat,
-                                        real* distributions,
-                                        bool isEvenTimestep)
+__global__ void LBCalcMacSP27(
+    real* vxD,
+    real* vyD,
+    real* vzD,
+    real* rhoD,
+    real* pressD,
+    unsigned int* geoD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    real* distributions,
+    bool isEvenTimestep)
 {
-   const unsigned int tx = threadIdx.x;    // Thread index = lokaler i index
-   const unsigned int by = blockIdx.x;     // Block index x
-   const unsigned int bz = blockIdx.y;     // Block index y
-   const unsigned int x = tx + STARTOFFX;  // Globaler x-Index 
-   const unsigned int y = by + STARTOFFY;  // Globaler y-Index 
-   const unsigned int z = bz + STARTOFFZ;  // Globaler z-Index 
+    ////////////////////////////////////////////////////////////////////////////////
+    //! - Get node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
+    //!
+    const unsigned nodeIndex = getNodeIndex();
+   
+    //////////////////////////////////////////////////////////////////////////
+    if(nodeIndex<numberOfLBnodes)
+    {
+        //////////////////////////////////////////////////////////////////////////
+        //! - Read distributions: style of reading and writing the distributions from/to stored arrays dependent on
+        //! timestep is based on the esoteric twist algorithm \ref <a
+        //! href="https://doi.org/10.3390/computation5020019"><b>[ M. Geier et al. (2017),
+        //! DOI:10.3390/computation5020019 ]</b></a>
+        //!
+        Distributions27 dist;
+        getPointersToDistributions(dist, distributions, numberOfLBnodes, isEvenTimestep);
+       
+        //////////////////////////////////////////////////////////////////////////
+        //index
+        unsigned int kzero= nodeIndex;
+        unsigned int ke   = nodeIndex;
+        unsigned int kw   = neighborX[nodeIndex];
+        unsigned int kn   = nodeIndex;
+        unsigned int ks   = neighborY[nodeIndex];
+        unsigned int kt   = nodeIndex;
+        unsigned int kb   = neighborZ[nodeIndex];
+        unsigned int ksw  = neighborY[kw];
+        unsigned int kne  = nodeIndex;
+        unsigned int kse  = ks;
+        unsigned int knw  = kw;
+        unsigned int kbw  = neighborZ[kw];
+        unsigned int kte  = nodeIndex;
+        unsigned int kbe  = kb;
+        unsigned int ktw  = kw;
+        unsigned int kbs  = neighborZ[ks];
+        unsigned int ktn  = nodeIndex;
+        unsigned int kbn  = kb;
+        unsigned int kts  = ks;
+        unsigned int ktse = ks;
+        unsigned int kbnw = kbw;
+        unsigned int ktnw = kw;
+        unsigned int kbse = kbs;
+        unsigned int ktsw = ksw;
+        unsigned int kbne = kb;
+        unsigned int ktne = nodeIndex;
+        unsigned int kbsw = neighborZ[ksw];
+        //////////////////////////////////////////////////////////////////////////
+        pressD[nodeIndex] = c0o1;
+        rhoD[nodeIndex]   = c0o1;
+        vxD[nodeIndex]    = c0o1;
+        vyD[nodeIndex]    = c0o1;
+        vzD[nodeIndex]    = c0o1;
+       
+        if(geoD[nodeIndex] == GEO_FLUID)
+        {
+            rhoD[nodeIndex] = 
+                (dist.f[DIR_P00])[ke  ]+ (dist.f[DIR_M00])[kw  ]+ 
+                (dist.f[DIR_0P0])[kn  ]+ (dist.f[DIR_0M0])[ks  ]+
+                (dist.f[DIR_00P])[kt  ]+ (dist.f[DIR_00M])[kb  ]+
+                (dist.f[DIR_PP0])[kne ]+ (dist.f[DIR_MM0])[ksw ]+
+                (dist.f[DIR_PM0])[kse ]+ (dist.f[DIR_MP0])[knw ]+
+                (dist.f[DIR_P0P])[kte ]+ (dist.f[DIR_M0M])[kbw ]+
+                (dist.f[DIR_P0M])[kbe ]+ (dist.f[DIR_M0P])[ktw ]+
+                (dist.f[DIR_0PP])[ktn ]+ (dist.f[DIR_0MM])[kbs ]+
+                (dist.f[DIR_0PM])[kbn ]+ (dist.f[DIR_0MP])[kts ]+
+                (dist.f[DIR_000])[kzero]+ 
+                (dist.f[DIR_PPP])[ktne]+ (dist.f[DIR_MMP])[ktsw]+ 
+                (dist.f[DIR_PMP])[ktse]+ (dist.f[DIR_MPP])[ktnw]+ 
+                (dist.f[DIR_PPM])[kbne]+ (dist.f[DIR_MMM])[kbsw]+ 
+                (dist.f[DIR_PMM])[kbse]+ (dist.f[DIR_MPM])[kbnw];
+           
+            vxD[nodeIndex] =
+                (dist.f[DIR_P00])[ke  ]- (dist.f[DIR_M00])[kw  ]+ 
+                (dist.f[DIR_PP0])[kne ]- (dist.f[DIR_MM0])[ksw ]+
+                (dist.f[DIR_PM0])[kse ]- (dist.f[DIR_MP0])[knw ]+
+                (dist.f[DIR_P0P])[kte ]- (dist.f[DIR_M0M])[kbw ]+
+                (dist.f[DIR_P0M])[kbe ]- (dist.f[DIR_M0P])[ktw ]+
+                (dist.f[DIR_PPP])[ktne]- (dist.f[DIR_MMP])[ktsw]+ 
+                (dist.f[DIR_PMP])[ktse]- (dist.f[DIR_MPP])[ktnw]+ 
+                (dist.f[DIR_PPM])[kbne]- (dist.f[DIR_MMM])[kbsw]+ 
+                (dist.f[DIR_PMM])[kbse]- (dist.f[DIR_MPM])[kbnw];
+           
+            vyD[nodeIndex] =
+                (dist.f[DIR_0P0])[kn  ]- (dist.f[DIR_0M0])[ks  ]+
+                (dist.f[DIR_PP0])[kne ]- (dist.f[DIR_MM0])[ksw ]-
+                (dist.f[DIR_PM0])[kse ]+ (dist.f[DIR_MP0])[knw ]+
+                (dist.f[DIR_0PP])[ktn ]- (dist.f[DIR_0MM])[kbs ]+
+                (dist.f[DIR_0PM])[kbn ]- (dist.f[DIR_0MP])[kts ]+
+                (dist.f[DIR_PPP])[ktne]- (dist.f[DIR_MMP])[ktsw]- 
+                (dist.f[DIR_PMP])[ktse]+ (dist.f[DIR_MPP])[ktnw]+ 
+                (dist.f[DIR_PPM])[kbne]- (dist.f[DIR_MMM])[kbsw]- 
+                (dist.f[DIR_PMM])[kbse]+ (dist.f[DIR_MPM])[kbnw];
+           
+            vzD[nodeIndex] =
+                (dist.f[DIR_00P])[kt  ]- (dist.f[DIR_00M])[kb  ]+
+                (dist.f[DIR_P0P])[kte ]- (dist.f[DIR_M0M])[kbw ]-
+                (dist.f[DIR_P0M])[kbe ]+ (dist.f[DIR_M0P])[ktw ]+
+                (dist.f[DIR_0PP])[ktn ]- (dist.f[DIR_0MM])[kbs ]-
+                (dist.f[DIR_0PM])[kbn ]+ (dist.f[DIR_0MP])[kts ]+
+                (dist.f[DIR_PPP])[ktne]+ (dist.f[DIR_MMP])[ktsw]+ 
+                (dist.f[DIR_PMP])[ktse]+ (dist.f[DIR_MPP])[ktnw]- 
+                (dist.f[DIR_PPM])[kbne]- (dist.f[DIR_MMM])[kbsw]- 
+                (dist.f[DIR_PMM])[kbse]- (dist.f[DIR_MPM])[kbnw];
+           
+            pressD[nodeIndex] =
+                ((dist.f[DIR_P00])[ke  ]+ (dist.f[DIR_M00])[kw  ]+ 
+                (dist.f[DIR_0P0])[kn  ]+ (dist.f[DIR_0M0])[ks  ]+
+                (dist.f[DIR_00P])[kt  ]+ (dist.f[DIR_00M])[kb  ]+
+                2.f*(
+                (dist.f[DIR_PP0])[kne ]+ (dist.f[DIR_MM0])[ksw ]+
+                (dist.f[DIR_PM0])[kse ]+ (dist.f[DIR_MP0])[knw ]+
+                (dist.f[DIR_P0P])[kte ]+ (dist.f[DIR_M0M])[kbw ]+
+                (dist.f[DIR_P0M])[kbe ]+ (dist.f[DIR_M0P])[ktw ]+
+                (dist.f[DIR_0PP])[ktn ]+ (dist.f[DIR_0MM])[kbs ]+
+                (dist.f[DIR_0PM])[kbn ]+ (dist.f[DIR_0MP])[kts ])+
+                3.f*(
+                (dist.f[DIR_PPP])[ktne]+ (dist.f[DIR_MMP])[ktsw]+ 
+                (dist.f[DIR_PMP])[ktse]+ (dist.f[DIR_MPP])[ktnw]+ 
+                (dist.f[DIR_PPM])[kbne]+ (dist.f[DIR_MMM])[kbsw]+ 
+                (dist.f[DIR_PMM])[kbse]+ (dist.f[DIR_MPM])[kbnw])-
+                rhoD[nodeIndex]-(vxD[nodeIndex] * vxD[nodeIndex] + vyD[nodeIndex] * vyD[nodeIndex] + vzD[nodeIndex] * vzD[nodeIndex]) * (c1o1+c0o1*rhoD[nodeIndex])) * c1o2+rhoD[nodeIndex]; // times zero for incompressible case   
+            // Caution: op is hard-coded (assumption: op = 1), so that (1.0/op-0.5)=0.5 -- presumably the c1o2 factor above.
+       }
+    }
+}
+////////////////////////////////////////////////////////////////////////////////
+
+
+
+
+
+
+
+
+
+
+
+
+
 
-   const unsigned nx = blockDim.x + 2 * STARTOFFX;
-   const unsigned ny = gridDim.x + 2 * STARTOFFY;
 
-   const unsigned int k = nx*(ny*z + y) + x; // Zugriff auf arrays im device
 
 
-   if(k >= size_Mat)
-      return;
 
-   if(!vf::gpu::isValidFluidNode(geoD[k]))
-      return;
 
-   rhoD[k] = c0o1;
-   vxD[k]  = c0o1;
-   vyD[k]  = c0o1;
-   vzD[k]  = c0o1;
 
-   vf::gpu::DistributionWrapper distr_wrapper(distributions, size_Mat, isEvenTimestep, k, neighborX, neighborY, neighborZ);
-   const auto& distribution = distr_wrapper.distribution;
 
-   rhoD[k] = vf::lbm::getDensity(distribution.f);
-   vxD[k] = vf::lbm::getIncompressibleVelocityX1(distribution.f);
-   vyD[k] = vf::lbm::getIncompressibleVelocityX2(distribution.f);
-   vzD[k] = vf::lbm::getIncompressibleVelocityX3(distribution.f);
 
-}
 
 
 
 
 
-////////////////////////////////////////////////////////////////////////////////
-__global__ void LBCalcMacSP27( real* vxD,
-                                          real* vyD,
-                                          real* vzD,
-                                          real* rhoD,
-                                          real* pressD,
-                                          unsigned int* geoD,
-                                          unsigned int* neighborX,
-                                          unsigned int* neighborY,
-                                          unsigned int* neighborZ,
-                                          unsigned int size_Mat,
-                                          real* DD,
-                                          bool isEvenTimestep)
-{
-   Distributions27 D;
-   if (isEvenTimestep==true)
-   {
-      D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
-   } 
-   else
-   {
-      D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
-   }
-   ////////////////////////////////////////////////////////////////////////////////
-   const unsigned  x = threadIdx.x;  // Globaler x-Index 
-   const unsigned  y = blockIdx.x;   // Globaler y-Index 
-   const unsigned  z = blockIdx.y;   // Globaler z-Index 
 
-   const unsigned nx = blockDim.x;
-   const unsigned ny = gridDim.x;
 
-   const unsigned k = nx*(ny*z + y) + x;
-   //////////////////////////////////////////////////////////////////////////
 
-   if(k<size_Mat)
-   {
-      //////////////////////////////////////////////////////////////////////////
-      //index
-      unsigned int kzero= k;
-      unsigned int ke   = k;
-      unsigned int kw   = neighborX[k];
-      unsigned int kn   = k;
-      unsigned int ks   = neighborY[k];
-      unsigned int kt   = k;
-      unsigned int kb   = neighborZ[k];
-      unsigned int ksw  = neighborY[kw];
-      unsigned int kne  = k;
-      unsigned int kse  = ks;
-      unsigned int knw  = kw;
-      unsigned int kbw  = neighborZ[kw];
-      unsigned int kte  = k;
-      unsigned int kbe  = kb;
-      unsigned int ktw  = kw;
-      unsigned int kbs  = neighborZ[ks];
-      unsigned int ktn  = k;
-      unsigned int kbn  = kb;
-      unsigned int kts  = ks;
-      unsigned int ktse = ks;
-      unsigned int kbnw = kbw;
-      unsigned int ktnw = kw;
-      unsigned int kbse = kbs;
-      unsigned int ktsw = ksw;
-      unsigned int kbne = kb;
-      unsigned int ktne = k;
-      unsigned int kbsw = neighborZ[ksw];
-      //////////////////////////////////////////////////////////////////////////
-      pressD[k] = c0o1;
-	  rhoD[k]   = c0o1;
-	  vxD[k]    = c0o1;
-	  vyD[k]    = c0o1;
-	  vzD[k]    = c0o1;
-
-      if(geoD[k] == GEO_FLUID)
-      {
-         rhoD[k]    =   (D.f[DIR_P00   ])[ke  ]+ (D.f[DIR_M00   ])[kw  ]+ 
-                        (D.f[DIR_0P0   ])[kn  ]+ (D.f[DIR_0M0   ])[ks  ]+
-                        (D.f[DIR_00P   ])[kt  ]+ (D.f[DIR_00M   ])[kb  ]+
-                        (D.f[DIR_PP0  ])[kne ]+ (D.f[DIR_MM0  ])[ksw ]+
-                        (D.f[DIR_PM0  ])[kse ]+ (D.f[DIR_MP0  ])[knw ]+
-                        (D.f[DIR_P0P  ])[kte ]+ (D.f[DIR_M0M  ])[kbw ]+
-                        (D.f[DIR_P0M  ])[kbe ]+ (D.f[DIR_M0P  ])[ktw ]+
-                        (D.f[DIR_0PP  ])[ktn ]+ (D.f[DIR_0MM  ])[kbs ]+
-                        (D.f[DIR_0PM  ])[kbn ]+ (D.f[DIR_0MP  ])[kts ]+
-                        (D.f[DIR_000])[kzero]+ 
-                        (D.f[DIR_PPP ])[ktne]+ (D.f[DIR_MMP ])[ktsw]+ 
-                        (D.f[DIR_PMP ])[ktse]+ (D.f[DIR_MPP ])[ktnw]+ 
-                        (D.f[DIR_PPM ])[kbne]+ (D.f[DIR_MMM ])[kbsw]+ 
-                        (D.f[DIR_PMM ])[kbse]+ (D.f[DIR_MPM ])[kbnw];
-
-         vxD[k]     =   (D.f[DIR_P00   ])[ke  ]- (D.f[DIR_M00   ])[kw  ]+ 
-                        (D.f[DIR_PP0  ])[kne ]- (D.f[DIR_MM0  ])[ksw ]+
-                        (D.f[DIR_PM0  ])[kse ]- (D.f[DIR_MP0  ])[knw ]+
-                        (D.f[DIR_P0P  ])[kte ]- (D.f[DIR_M0M  ])[kbw ]+
-                        (D.f[DIR_P0M  ])[kbe ]- (D.f[DIR_M0P  ])[ktw ]+
-                        (D.f[DIR_PPP ])[ktne]- (D.f[DIR_MMP ])[ktsw]+ 
-                        (D.f[DIR_PMP ])[ktse]- (D.f[DIR_MPP ])[ktnw]+ 
-                        (D.f[DIR_PPM ])[kbne]- (D.f[DIR_MMM ])[kbsw]+ 
-                        (D.f[DIR_PMM ])[kbse]- (D.f[DIR_MPM ])[kbnw];
-
-         vyD[k]     =   (D.f[DIR_0P0   ])[kn  ]- (D.f[DIR_0M0   ])[ks  ]+
-                        (D.f[DIR_PP0  ])[kne ]- (D.f[DIR_MM0  ])[ksw ]-
-                        (D.f[DIR_PM0  ])[kse ]+ (D.f[DIR_MP0  ])[knw ]+
-                        (D.f[DIR_0PP  ])[ktn ]- (D.f[DIR_0MM  ])[kbs ]+
-                        (D.f[DIR_0PM  ])[kbn ]- (D.f[DIR_0MP  ])[kts ]+
-                        (D.f[DIR_PPP ])[ktne]- (D.f[DIR_MMP ])[ktsw]- 
-                        (D.f[DIR_PMP ])[ktse]+ (D.f[DIR_MPP ])[ktnw]+ 
-                        (D.f[DIR_PPM ])[kbne]- (D.f[DIR_MMM ])[kbsw]- 
-                        (D.f[DIR_PMM ])[kbse]+ (D.f[DIR_MPM ])[kbnw];
-
-         vzD[k]     =   (D.f[DIR_00P   ])[kt  ]- (D.f[DIR_00M   ])[kb  ]+
-                        (D.f[DIR_P0P  ])[kte ]- (D.f[DIR_M0M  ])[kbw ]-
-                        (D.f[DIR_P0M  ])[kbe ]+ (D.f[DIR_M0P  ])[ktw ]+
-                        (D.f[DIR_0PP  ])[ktn ]- (D.f[DIR_0MM  ])[kbs ]-
-                        (D.f[DIR_0PM  ])[kbn ]+ (D.f[DIR_0MP  ])[kts ]+
-                        (D.f[DIR_PPP ])[ktne]+ (D.f[DIR_MMP ])[ktsw]+ 
-                        (D.f[DIR_PMP ])[ktse]+ (D.f[DIR_MPP ])[ktnw]- 
-                        (D.f[DIR_PPM ])[kbne]- (D.f[DIR_MMM ])[kbsw]- 
-                        (D.f[DIR_PMM ])[kbse]- (D.f[DIR_MPM ])[kbnw];
-
-         pressD[k]  =  ((D.f[DIR_P00   ])[ke  ]+ (D.f[DIR_M00   ])[kw  ]+ 
-                        (D.f[DIR_0P0   ])[kn  ]+ (D.f[DIR_0M0   ])[ks  ]+
-                        (D.f[DIR_00P   ])[kt  ]+ (D.f[DIR_00M   ])[kb  ]+
-                        2.f*(
-                        (D.f[DIR_PP0  ])[kne ]+ (D.f[DIR_MM0  ])[ksw ]+
-                        (D.f[DIR_PM0  ])[kse ]+ (D.f[DIR_MP0  ])[knw ]+
-                        (D.f[DIR_P0P  ])[kte ]+ (D.f[DIR_M0M  ])[kbw ]+
-                        (D.f[DIR_P0M  ])[kbe ]+ (D.f[DIR_M0P  ])[ktw ]+
-                        (D.f[DIR_0PP  ])[ktn ]+ (D.f[DIR_0MM  ])[kbs ]+
-                        (D.f[DIR_0PM  ])[kbn ]+ (D.f[DIR_0MP  ])[kts ])+
-                        3.f*(
-                        (D.f[DIR_PPP ])[ktne]+ (D.f[DIR_MMP ])[ktsw]+ 
-                        (D.f[DIR_PMP ])[ktse]+ (D.f[DIR_MPP ])[ktnw]+ 
-                        (D.f[DIR_PPM ])[kbne]+ (D.f[DIR_MMM ])[kbsw]+ 
-                        (D.f[DIR_PMM ])[kbse]+ (D.f[DIR_MPM ])[kbnw])-
-                        rhoD[k]-(vxD[k] * vxD[k] + vyD[k] * vyD[k] + vzD[k] * vzD[k]) * (c1o1+c0o1*rhoD[k])) * c1o2+rhoD[k]; // times zero for incompressible case   
-         //achtung op hart gesetzt Annahme op = 1 ;                                                    ^^^^(1.0/op-0.5)=0.5
-
-      }
-   }
-}
 
 
 ////////////////////////////////////////////////////////////////////////////////
 __global__ void LBCalcMacCompSP27(
-   real *vxD,
-   real *vyD,
-   real *vzD,
-   real *rhoD,
-   real *pressD,
-   unsigned int *geoD,
-   unsigned int *neighborX,
-   unsigned int *neighborY,
-   unsigned int *neighborZ,
-   unsigned int size_Mat,
-   real *distributions,
-   bool isEvenTimestep)
+    real *vxD,
+    real *vyD,
+    real *vzD,
+    real *rhoD,
+    real *pressD,
+    unsigned int *geoD,
+    unsigned int *neighborX,
+    unsigned int *neighborY,
+    unsigned int *neighborZ,
+    unsigned long long numberOfLBnodes,
+    real *distributions,
+    bool isEvenTimestep)
 {
-    const unsigned k = vf::gpu::getNodeIndex();
+    ////////////////////////////////////////////////////////////////////////////////
+    //! - Get node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
+    //!
+    const unsigned nodeIndex = getNodeIndex();
 
-    if(k >= size_Mat)
+    if(nodeIndex >= numberOfLBnodes)
         return;
 
-    pressD[k] = c0o1;
-    rhoD[k]   = c0o1;
-    vxD[k]    = c0o1;
-    vyD[k]    = c0o1;
-    vzD[k]    = c0o1;
+    pressD[nodeIndex] = c0o1;
+    rhoD[nodeIndex]   = c0o1;
+    vxD[nodeIndex]    = c0o1;
+    vyD[nodeIndex]    = c0o1;
+    vzD[nodeIndex]    = c0o1;
 
-    if (!vf::gpu::isValidFluidNode(geoD[k]))
+    if (!isValidFluidNode(geoD[nodeIndex]))
         return;
 
-    vf::gpu::DistributionWrapper distr_wrapper(distributions, size_Mat, isEvenTimestep, k, neighborX, neighborY,
-                                               neighborZ);
+    DistributionWrapper distr_wrapper(distributions, numberOfLBnodes, isEvenTimestep, nodeIndex, neighborX, neighborY, neighborZ);
     const auto &distribution = distr_wrapper.distribution;
 
-    rhoD[k]   = vf::lbm::getDensity(distribution.f);
-    vxD[k]    = vf::lbm::getCompressibleVelocityX1(distribution.f, rhoD[k]);
-    vyD[k]    = vf::lbm::getCompressibleVelocityX2(distribution.f, rhoD[k]);
-    vzD[k]    = vf::lbm::getCompressibleVelocityX3(distribution.f, rhoD[k]);
-    pressD[k] = vf::lbm::getPressure(distribution.f, rhoD[k], vxD[k], vyD[k], vzD[k]); 
+    rhoD[nodeIndex]   = vf::lbm::getDensity(distribution.f);
+    vxD[nodeIndex]    = vf::lbm::getCompressibleVelocityX1(distribution.f, rhoD[nodeIndex]);
+    vyD[nodeIndex]    = vf::lbm::getCompressibleVelocityX2(distribution.f, rhoD[nodeIndex]);
+    vzD[nodeIndex]    = vf::lbm::getCompressibleVelocityX3(distribution.f, rhoD[nodeIndex]);
+    pressD[nodeIndex] = vf::lbm::getPressure(distribution.f, rhoD[nodeIndex], vxD[nodeIndex], vyD[nodeIndex], vzD[nodeIndex]); 
 }
 
 
@@ -339,206 +343,155 @@ __global__ void LBCalcMacCompSP27(
 
 
 ////////////////////////////////////////////////////////////////////////////////
-__global__ void LBCalcMedSP27( real* vxD,
-                                          real* vyD,
-                                          real* vzD,
-                                          real* rhoD,
-                                          real* pressD,
-                                          unsigned int* geoD,
-                                          unsigned int* neighborX,
-                                          unsigned int* neighborY,
-                                          unsigned int* neighborZ,
-                                          unsigned int size_Mat,
-                                          real* DD,
-                                          bool isEvenTimestep)
+__global__ void LBCalcMedSP27(
+    real* vxD,
+    real* vyD,
+    real* vzD,
+    real* rhoD,
+    real* pressD,
+    unsigned int* geoD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    real* distributions,
+    bool isEvenTimestep)
 {
-   Distributions27 D;
-   if (isEvenTimestep==true)
-   {
-      D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
-   } 
-   else
-   {
-      D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
-   }
-   ////////////////////////////////////////////////////////////////////////////////
-   const unsigned  x = threadIdx.x;  // Globaler x-Index 
-   const unsigned  y = blockIdx.x;   // Globaler y-Index 
-   const unsigned  z = blockIdx.y;   // Globaler z-Index 
-
-   const unsigned nx = blockDim.x;
-   const unsigned ny = gridDim.x;
-
-   const unsigned k = nx*(ny*z + y) + x;
-   //////////////////////////////////////////////////////////////////////////
-
-   if(k<size_Mat)
-   {
-      //////////////////////////////////////////////////////////////////////////
-      //index
-      unsigned int kzero= k;
-      unsigned int ke   = k;
-      unsigned int kw   = neighborX[k];
-      unsigned int kn   = k;
-      unsigned int ks   = neighborY[k];
-      unsigned int kt   = k;
-      unsigned int kb   = neighborZ[k];
-      unsigned int ksw  = neighborY[kw];
-      unsigned int kne  = k;
-      unsigned int kse  = ks;
-      unsigned int knw  = kw;
-      unsigned int kbw  = neighborZ[kw];
-      unsigned int kte  = k;
-      unsigned int kbe  = kb;
-      unsigned int ktw  = kw;
-      unsigned int kbs  = neighborZ[ks];
-      unsigned int ktn  = k;
-      unsigned int kbn  = kb;
-      unsigned int kts  = ks;
-      unsigned int ktse = ks;
-      unsigned int kbnw = kbw;
-      unsigned int ktnw = kw;
-      unsigned int kbse = kbs;
-      unsigned int ktsw = ksw;
-      unsigned int kbne = kb;
-      unsigned int ktne = k;
-      unsigned int kbsw = neighborZ[ksw];
-      //////////////////////////////////////////////////////////////////////////
-      real PRESS = pressD[k];
-      real RHO   = rhoD[k];
-      real VX    = vxD[k];
-      real VY    = vyD[k];
-      real VZ    = vzD[k];
-      //////////////////////////////////////////////////////////////////////////
-      pressD[k] = c0o1;
-	  rhoD[k]   = c0o1;
-	  vxD[k]    = c0o1;
-	  vyD[k]    = c0o1;
-	  vzD[k]    = c0o1;
-
-      if(geoD[k] == GEO_FLUID)
-      {
-         rhoD[k]    =   (D.f[DIR_P00   ])[ke  ]+ (D.f[DIR_M00   ])[kw  ]+ 
-                        (D.f[DIR_0P0   ])[kn  ]+ (D.f[DIR_0M0   ])[ks  ]+
-                        (D.f[DIR_00P   ])[kt  ]+ (D.f[DIR_00M   ])[kb  ]+
-                        (D.f[DIR_PP0  ])[kne ]+ (D.f[DIR_MM0  ])[ksw ]+
-                        (D.f[DIR_PM0  ])[kse ]+ (D.f[DIR_MP0  ])[knw ]+
-                        (D.f[DIR_P0P  ])[kte ]+ (D.f[DIR_M0M  ])[kbw ]+
-                        (D.f[DIR_P0M  ])[kbe ]+ (D.f[DIR_M0P  ])[ktw ]+
-                        (D.f[DIR_0PP  ])[ktn ]+ (D.f[DIR_0MM  ])[kbs ]+
-                        (D.f[DIR_0PM  ])[kbn ]+ (D.f[DIR_0MP  ])[kts ]+
-                        (D.f[DIR_000])[kzero]+ 
-                        (D.f[DIR_PPP ])[ktne]+ (D.f[DIR_MMP ])[ktsw]+ 
-                        (D.f[DIR_PMP ])[ktse]+ (D.f[DIR_MPP ])[ktnw]+ 
-                        (D.f[DIR_PPM ])[kbne]+ (D.f[DIR_MMM ])[kbsw]+ 
-                        (D.f[DIR_PMM ])[kbse]+ (D.f[DIR_MPM ])[kbnw]+
-                        RHO;
-
-         vxD[k]     =   (D.f[DIR_P00   ])[ke  ]- (D.f[DIR_M00   ])[kw  ]+ 
-                        (D.f[DIR_PP0  ])[kne ]- (D.f[DIR_MM0  ])[ksw ]+
-                        (D.f[DIR_PM0  ])[kse ]- (D.f[DIR_MP0  ])[knw ]+
-                        (D.f[DIR_P0P  ])[kte ]- (D.f[DIR_M0M  ])[kbw ]+
-                        (D.f[DIR_P0M  ])[kbe ]- (D.f[DIR_M0P  ])[ktw ]+
-                        (D.f[DIR_PPP ])[ktne]- (D.f[DIR_MMP ])[ktsw]+ 
-                        (D.f[DIR_PMP ])[ktse]- (D.f[DIR_MPP ])[ktnw]+ 
-                        (D.f[DIR_PPM ])[kbne]- (D.f[DIR_MMM ])[kbsw]+ 
-                        (D.f[DIR_PMM ])[kbse]- (D.f[DIR_MPM ])[kbnw]+
-                        VX;
-
-         vyD[k]     =   (D.f[DIR_0P0   ])[kn  ]- (D.f[DIR_0M0   ])[ks  ]+
-                        (D.f[DIR_PP0  ])[kne ]- (D.f[DIR_MM0  ])[ksw ]-
-                        (D.f[DIR_PM0  ])[kse ]+ (D.f[DIR_MP0  ])[knw ]+
-                        (D.f[DIR_0PP  ])[ktn ]- (D.f[DIR_0MM  ])[kbs ]+
-                        (D.f[DIR_0PM  ])[kbn ]- (D.f[DIR_0MP  ])[kts ]+
-                        (D.f[DIR_PPP ])[ktne]- (D.f[DIR_MMP ])[ktsw]- 
-                        (D.f[DIR_PMP ])[ktse]+ (D.f[DIR_MPP ])[ktnw]+ 
-                        (D.f[DIR_PPM ])[kbne]- (D.f[DIR_MMM ])[kbsw]- 
-                        (D.f[DIR_PMM ])[kbse]+ (D.f[DIR_MPM ])[kbnw]+
-                        VY;
-
-         vzD[k]     =   (D.f[DIR_00P   ])[kt  ]- (D.f[DIR_00M   ])[kb  ]+
-                        (D.f[DIR_P0P  ])[kte ]- (D.f[DIR_M0M  ])[kbw ]-
-                        (D.f[DIR_P0M  ])[kbe ]+ (D.f[DIR_M0P  ])[ktw ]+
-                        (D.f[DIR_0PP  ])[ktn ]- (D.f[DIR_0MM  ])[kbs ]-
-                        (D.f[DIR_0PM  ])[kbn ]+ (D.f[DIR_0MP  ])[kts ]+
-                        (D.f[DIR_PPP ])[ktne]+ (D.f[DIR_MMP ])[ktsw]+ 
-                        (D.f[DIR_PMP ])[ktse]+ (D.f[DIR_MPP ])[ktnw]- 
-                        (D.f[DIR_PPM ])[kbne]- (D.f[DIR_MMM ])[kbsw]- 
-                        (D.f[DIR_PMM ])[kbse]- (D.f[DIR_MPM ])[kbnw]+
-                        VZ;
-
-         pressD[k]  =   ((D.f[DIR_P00   ])[ke  ]+ (D.f[DIR_M00   ])[kw  ]+ 
-                        (D.f[DIR_0P0   ])[kn  ]+ (D.f[DIR_0M0   ])[ks  ]+
-                        (D.f[DIR_00P   ])[kt  ]+ (D.f[DIR_00M   ])[kb  ]+
-                        c2o1*(
-                        (D.f[DIR_PP0  ])[kne ]+ (D.f[DIR_MM0  ])[ksw ]+
-                        (D.f[DIR_PM0  ])[kse ]+ (D.f[DIR_MP0  ])[knw ]+
-                        (D.f[DIR_P0P  ])[kte ]+ (D.f[DIR_M0M  ])[kbw ]+
-                        (D.f[DIR_P0M  ])[kbe ]+ (D.f[DIR_M0P  ])[ktw ]+
-                        (D.f[DIR_0PP  ])[ktn ]+ (D.f[DIR_0MM  ])[kbs ]+
-                        (D.f[DIR_0PM  ])[kbn ]+ (D.f[DIR_0MP  ])[kts ])+
-                        c3o1*(
-                        (D.f[DIR_PPP ])[ktne]+ (D.f[DIR_MMP ])[ktsw]+ 
-                        (D.f[DIR_PMP ])[ktse]+ (D.f[DIR_MPP ])[ktnw]+ 
-                        (D.f[DIR_PPM ])[kbne]+ (D.f[DIR_MMM ])[kbsw]+ 
-                        (D.f[DIR_PMM ])[kbse]+ (D.f[DIR_MPM ])[kbnw])-
-                        rhoD[k]-(vxD[k] * vxD[k] + vyD[k] * vyD[k] + vzD[k] * vzD[k]) * (c1o1+rhoD[k])) * c1o2+rhoD[k]+
-                        PRESS;    
-         //achtung op hart gesetzt Annahme op = 1 ;                                                    ^^^^(1.0/op-0.5)=0.5
-      }
-   }
+    ////////////////////////////////////////////////////////////////////////////////
+    //! - Get node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
+    //!
+    const unsigned nodeIndex = getNodeIndex();
+
+    //////////////////////////////////////////////////////////////////////////
+    if( nodeIndex < numberOfLBnodes )
+    {
+        //////////////////////////////////////////////////////////////////////////
+        //! - Read distributions: style of reading and writing the distributions from/to stored arrays dependent on
+        //! timestep is based on the esoteric twist algorithm \ref <a
+        //! href="https://doi.org/10.3390/computation5020019"><b>[ M. Geier et al. (2017),
+        //! DOI:10.3390/computation5020019 ]</b></a>
+        //!
+        Distributions27 dist;
+        getPointersToDistributions(dist, distributions, numberOfLBnodes, isEvenTimestep);
+        
+        //////////////////////////////////////////////////////////////////////////
+        //index
+        unsigned int kzero= nodeIndex;
+        unsigned int ke   = nodeIndex;
+        unsigned int kw   = neighborX[nodeIndex];
+        unsigned int kn   = nodeIndex;
+        unsigned int ks   = neighborY[nodeIndex];
+        unsigned int kt   = nodeIndex;
+        unsigned int kb   = neighborZ[nodeIndex];
+        unsigned int ksw  = neighborY[kw];
+        unsigned int kne  = nodeIndex;
+        unsigned int kse  = ks;
+        unsigned int knw  = kw;
+        unsigned int kbw  = neighborZ[kw];
+        unsigned int kte  = nodeIndex;
+        unsigned int kbe  = kb;
+        unsigned int ktw  = kw;
+        unsigned int kbs  = neighborZ[ks];
+        unsigned int ktn  = nodeIndex;
+        unsigned int kbn  = kb;
+        unsigned int kts  = ks;
+        unsigned int ktse = ks;
+        unsigned int kbnw = kbw;
+        unsigned int ktnw = kw;
+        unsigned int kbse = kbs;
+        unsigned int ktsw = ksw;
+        unsigned int kbne = kb;
+        unsigned int ktne = nodeIndex;
+        unsigned int kbsw = neighborZ[ksw];
+        //////////////////////////////////////////////////////////////////////////
+        real PRESS = pressD[nodeIndex];
+        real RHO   = rhoD[nodeIndex];
+        real VX    = vxD[nodeIndex];
+        real VY    = vyD[nodeIndex];
+        real VZ    = vzD[nodeIndex];
+        //////////////////////////////////////////////////////////////////////////
+        pressD[nodeIndex] = c0o1;
+        rhoD[nodeIndex]   = c0o1;
+        vxD[nodeIndex]    = c0o1;
+        vyD[nodeIndex]    = c0o1;
+        vzD[nodeIndex]    = c0o1;
+        
+        if(geoD[nodeIndex] == GEO_FLUID)
+        {
+            rhoD[nodeIndex] =
+                (dist.f[DIR_P00])[ke  ]+ (dist.f[DIR_M00])[kw  ]+ 
+                (dist.f[DIR_0P0])[kn  ]+ (dist.f[DIR_0M0])[ks  ]+
+                (dist.f[DIR_00P])[kt  ]+ (dist.f[DIR_00M])[kb  ]+
+                (dist.f[DIR_PP0])[kne ]+ (dist.f[DIR_MM0])[ksw ]+
+                (dist.f[DIR_PM0])[kse ]+ (dist.f[DIR_MP0])[knw ]+
+                (dist.f[DIR_P0P])[kte ]+ (dist.f[DIR_M0M])[kbw ]+
+                (dist.f[DIR_P0M])[kbe ]+ (dist.f[DIR_M0P])[ktw ]+
+                (dist.f[DIR_0PP])[ktn ]+ (dist.f[DIR_0MM])[kbs ]+
+                (dist.f[DIR_0PM])[kbn ]+ (dist.f[DIR_0MP])[kts ]+
+                (dist.f[DIR_000])[kzero]+ 
+                (dist.f[DIR_PPP])[ktne]+ (dist.f[DIR_MMP])[ktsw]+ 
+                (dist.f[DIR_PMP])[ktse]+ (dist.f[DIR_MPP])[ktnw]+ 
+                (dist.f[DIR_PPM])[kbne]+ (dist.f[DIR_MMM])[kbsw]+ 
+                (dist.f[DIR_PMM])[kbse]+ (dist.f[DIR_MPM])[kbnw]+
+                RHO;
+            
+            vxD[nodeIndex] =
+                (dist.f[DIR_P00])[ke  ]- (dist.f[DIR_M00])[kw  ]+ 
+                (dist.f[DIR_PP0])[kne ]- (dist.f[DIR_MM0])[ksw ]+
+                (dist.f[DIR_PM0])[kse ]- (dist.f[DIR_MP0])[knw ]+
+                (dist.f[DIR_P0P])[kte ]- (dist.f[DIR_M0M])[kbw ]+
+                (dist.f[DIR_P0M])[kbe ]- (dist.f[DIR_M0P])[ktw ]+
+                (dist.f[DIR_PPP])[ktne]- (dist.f[DIR_MMP])[ktsw]+ 
+                (dist.f[DIR_PMP])[ktse]- (dist.f[DIR_MPP])[ktnw]+ 
+                (dist.f[DIR_PPM])[kbne]- (dist.f[DIR_MMM])[kbsw]+ 
+                (dist.f[DIR_PMM])[kbse]- (dist.f[DIR_MPM])[kbnw]+
+                VX;
+            
+            vyD[nodeIndex] =
+                (dist.f[DIR_0P0])[kn  ]- (dist.f[DIR_0M0])[ks  ]+
+                (dist.f[DIR_PP0])[kne ]- (dist.f[DIR_MM0])[ksw ]-
+                (dist.f[DIR_PM0])[kse ]+ (dist.f[DIR_MP0])[knw ]+
+                (dist.f[DIR_0PP])[ktn ]- (dist.f[DIR_0MM])[kbs ]+
+                (dist.f[DIR_0PM])[kbn ]- (dist.f[DIR_0MP])[kts ]+
+                (dist.f[DIR_PPP])[ktne]- (dist.f[DIR_MMP])[ktsw]- 
+                (dist.f[DIR_PMP])[ktse]+ (dist.f[DIR_MPP])[ktnw]+ 
+                (dist.f[DIR_PPM])[kbne]- (dist.f[DIR_MMM])[kbsw]- 
+                (dist.f[DIR_PMM])[kbse]+ (dist.f[DIR_MPM])[kbnw]+
+                VY;
+            
+            vzD[nodeIndex] =
+                (dist.f[DIR_00P])[kt  ]- (dist.f[DIR_00M])[kb  ]+
+                (dist.f[DIR_P0P])[kte ]- (dist.f[DIR_M0M])[kbw ]-
+                (dist.f[DIR_P0M])[kbe ]+ (dist.f[DIR_M0P])[ktw ]+
+                (dist.f[DIR_0PP])[ktn ]- (dist.f[DIR_0MM])[kbs ]-
+                (dist.f[DIR_0PM])[kbn ]+ (dist.f[DIR_0MP])[kts ]+
+                (dist.f[DIR_PPP])[ktne]+ (dist.f[DIR_MMP])[ktsw]+ 
+                (dist.f[DIR_PMP])[ktse]+ (dist.f[DIR_MPP])[ktnw]- 
+                (dist.f[DIR_PPM])[kbne]- (dist.f[DIR_MMM])[kbsw]- 
+                (dist.f[DIR_PMM])[kbse]- (dist.f[DIR_MPM])[kbnw]+
+                VZ;
+            
+            pressD[nodeIndex] =
+                ((dist.f[DIR_P00])[ke  ]+ (dist.f[DIR_M00])[kw  ]+ 
+                (dist.f[DIR_0P0])[kn  ]+ (dist.f[DIR_0M0])[ks  ]+
+                (dist.f[DIR_00P])[kt  ]+ (dist.f[DIR_00M])[kb  ]+
+                c2o1*(
+                (dist.f[DIR_PP0])[kne ]+ (dist.f[DIR_MM0])[ksw ]+
+                (dist.f[DIR_PM0])[kse ]+ (dist.f[DIR_MP0])[knw ]+
+                (dist.f[DIR_P0P])[kte ]+ (dist.f[DIR_M0M])[kbw ]+
+                (dist.f[DIR_P0M])[kbe ]+ (dist.f[DIR_M0P])[ktw ]+
+                (dist.f[DIR_0PP])[ktn ]+ (dist.f[DIR_0MM])[kbs ]+
+                (dist.f[DIR_0PM])[kbn ]+ (dist.f[DIR_0MP])[kts ])+
+                c3o1*(
+                (dist.f[DIR_PPP])[ktne]+ (dist.f[DIR_MMP])[ktsw]+ 
+                (dist.f[DIR_PMP])[ktse]+ (dist.f[DIR_MPP])[ktnw]+ 
+                (dist.f[DIR_PPM])[kbne]+ (dist.f[DIR_MMM])[kbsw]+ 
+                (dist.f[DIR_PMM])[kbse]+ (dist.f[DIR_MPM])[kbnw])-
+                rhoD[nodeIndex]-(vxD[nodeIndex] * vxD[nodeIndex] + vyD[nodeIndex] * vyD[nodeIndex] + vzD[nodeIndex] * vzD[nodeIndex]) * (c1o1+rhoD[nodeIndex])) * c1o2+rhoD[nodeIndex]+
+                PRESS;    
+            //achtung op hart gesetzt Annahme op = 1 ;                                                    ^^^^(1.0/op-0.5)=0.5
+        }
+    }
 }
 ////////////////////////////////////////////////////////////////////////////////
 
@@ -563,259 +516,152 @@ __global__ void LBCalcMedSP27( real* vxD,
 
 
 ////////////////////////////////////////////////////////////////////////////////
-__global__ void LBCalcMedCompSP27( real* vxD,
-											  real* vyD,
-											  real* vzD,
-											  real* rhoD,
-											  real* pressD,
-											  unsigned int* geoD,
-											  unsigned int* neighborX,
-											  unsigned int* neighborY,
-											  unsigned int* neighborZ,
-											  unsigned int size_Mat,
-											  real* DD,
-											  bool isEvenTimestep)
+__global__ void LBCalcMedCompSP27(
+    real* vxD,
+    real* vyD,
+    real* vzD,
+    real* rhoD,
+    real* pressD,
+    unsigned int* geoD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    real* distributions,
+    bool isEvenTimestep)
 {
-   Distributions27 D;
-   if (isEvenTimestep==true)
-   {
-      D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
-   } 
-   else
-   {
-      D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
-   }
-   ////////////////////////////////////////////////////////////////////////////////
-   const unsigned  x = threadIdx.x;  // Globaler x-Index 
-   const unsigned  y = blockIdx.x;   // Globaler y-Index 
-   const unsigned  z = blockIdx.y;   // Globaler z-Index 
-
-   const unsigned nx = blockDim.x;
-   const unsigned ny = gridDim.x;
-
-   const unsigned k = nx*(ny*z + y) + x;
-   //////////////////////////////////////////////////////////////////////////
-
-   if(k<size_Mat)
-   {
-      //////////////////////////////////////////////////////////////////////////
-      //index
-      //unsigned int kzero= k;
-      unsigned int ke   = k;
-      unsigned int kw   = neighborX[k];
-      unsigned int kn   = k;
-      unsigned int ks   = neighborY[k];
-      unsigned int kt   = k;
-      unsigned int kb   = neighborZ[k];
-      unsigned int ksw  = neighborY[kw];
-      unsigned int kne  = k;
-      unsigned int kse  = ks;
-      unsigned int knw  = kw;
-      unsigned int kbw  = neighborZ[kw];
-      unsigned int kte  = k;
-      unsigned int kbe  = kb;
-      unsigned int ktw  = kw;
-      unsigned int kbs  = neighborZ[ks];
-      unsigned int ktn  = k;
-      unsigned int kbn  = kb;
-      unsigned int kts  = ks;
-      unsigned int ktse = ks;
-      unsigned int kbnw = kbw;
-      unsigned int ktnw = kw;
-      unsigned int kbse = kbs;
-      unsigned int ktsw = ksw;
-      unsigned int kbne = kb;
-      unsigned int ktne = k;
-      unsigned int kbsw = neighborZ[ksw];
-      //////////////////////////////////////////////////////////////////////////
-      real PRESS = pressD[k];
-      real RHO   = rhoD[k];
-      real VX    = vxD[k];
-      real VY    = vyD[k];
-      real VZ    = vzD[k];
-      //////////////////////////////////////////////////////////////////////////
-      pressD[k] = c0o1;
-	  rhoD[k]   = c0o1;
-	  vxD[k]    = c0o1;
-	  vyD[k]    = c0o1;
-	  vzD[k]    = c0o1;
-
-      if(geoD[k] == GEO_FLUID)
-      {
-		  real mfcbb = (D.f[DIR_P00])[k];//[ke   ];
-		  real mfabb = (D.f[DIR_M00])[kw];//[kw   ];  
-		  real mfbcb = (D.f[DIR_0P0])[k];//[kn   ];
-		  real mfbab = (D.f[DIR_0M0])[ks];//[ks   ];  
-		  real mfbbc = (D.f[DIR_00P])[k];//[kt   ];
-		  real mfbba = (D.f[DIR_00M])[kb];//[kb   ];  
-		  real mfccb = (D.f[DIR_PP0])[k];//[kne  ];  
-		  real mfaab = (D.f[DIR_MM0])[ksw];//[ksw  ];
-		  real mfcab = (D.f[DIR_PM0])[ks];//[kse  ]; 
-		  real mfacb = (D.f[DIR_MP0])[kw];//[knw  ]; 
-		  real mfcbc = (D.f[DIR_P0P])[k];//[kte  ];  
-		  real mfaba = (D.f[DIR_M0M])[kbw];//[kbw  ];
-		  real mfcba = (D.f[DIR_P0M])[kb];//[kbe  ]; 
-		  real mfabc = (D.f[DIR_M0P])[kw];//[ktw  ]; 
-		  real mfbcc = (D.f[DIR_0PP])[k];//[ktn  ];  
-		  real mfbaa = (D.f[DIR_0MM])[kbs];//[kbs  ];
-		  real mfbca = (D.f[DIR_0PM])[kb];//[kbn  ]; 
-		  real mfbac = (D.f[DIR_0MP])[ks];//[kts  ]; 
-		  real mfbbb = (D.f[DIR_000])[k];//[kzero];
-		  real mfccc = (D.f[DIR_PPP])[k];//[ktne ]; 
-		  real mfaac = (D.f[DIR_MMP])[ksw];//[ktsw ]; 
-		  real mfcac = (D.f[DIR_PMP])[ks];//[ktse ];
-		  real mfacc = (D.f[DIR_MPP])[kw];//[ktnw ];
-		  real mfcca = (D.f[DIR_PPM])[kb];//[kbne ];
-		  real mfaaa = (D.f[DIR_MMM])[kbsw];//[kbsw ];
-		  real mfcaa = (D.f[DIR_PMM])[kbs];//[kbse ]; 
-		  real mfaca = (D.f[DIR_MPM])[kbw];//[kbnw ]; 
-		  ////////////////////////////////////////////////////////////////////////////////////
-		  real drho = 
-			  ((((mfccc + mfaaa) + (mfaca + mfcac)) + ((mfacc + mfcaa) + (mfaac + mfcca))) +
-			  (((mfbac + mfbca) + (mfbaa + mfbcc)) + ((mfabc + mfcba) + (mfaba + mfcbc)) + ((mfacb + mfcab) + (mfaab + mfccb))) +
-			  ((mfabb + mfcbb) + (mfbab + mfbcb) + (mfbba + mfbbc))) + mfbbb;
-
-		  real rho = c1o1 + drho;
-		  
-		  rhoD[k] = drho + RHO;
-
-		  vxD[k] = 
-			  (((((mfccc - mfaaa) + (mfcac - mfaca)) + ((mfcaa - mfacc) + (mfcca - mfaac))) +
-			  (((mfcba - mfabc) + (mfcbc - mfaba)) + ((mfcab - mfacb) + (mfccb - mfaab))) +
-			  (mfcbb - mfabb)) / rho) + VX;
-		  vyD[k] = 
-			  (((((mfccc - mfaaa) + (mfaca - mfcac)) + ((mfacc - mfcaa) + (mfcca - mfaac))) +
-			  (((mfbca - mfbac) + (mfbcc - mfbaa)) + ((mfacb - mfcab) + (mfccb - mfaab))) +
-			  (mfbcb - mfbab)) / rho) + VY;
-		  vzD[k] = 
-			  (((((mfccc - mfaaa) + (mfcac - mfaca)) + ((mfacc - mfcaa) + (mfaac - mfcca))) +
-			  (((mfbac - mfbca) + (mfbcc - mfbaa)) + ((mfabc - mfcba) + (mfcbc - mfaba))) +
-			  (mfbbc - mfbba)) / rho) + VZ;
-
-		  //rhoD[k] =
-			 // (D.f[DIR_P00])[ke] + (D.f[DIR_M00])[kw] +
-			 // (D.f[DIR_0P0])[kn] + (D.f[DIR_0M0])[ks] +
-			 // (D.f[DIR_00P])[kt] + (D.f[DIR_00M])[kb] +
-			 // (D.f[DIR_PP0])[kne] + (D.f[DIR_MM0])[ksw] +
-			 // (D.f[DIR_PM0])[kse] + (D.f[DIR_MP0])[knw] +
-			 // (D.f[DIR_P0P])[kte] + (D.f[DIR_M0M])[kbw] +
-			 // (D.f[DIR_P0M])[kbe] + (D.f[DIR_M0P])[ktw] +
-			 // (D.f[DIR_0PP])[ktn] + (D.f[DIR_0MM])[kbs] +
-			 // (D.f[DIR_0PM])[kbn] + (D.f[DIR_0MP])[kts] +
-			 // (D.f[DIR_000])[kzero] +
-			 // (D.f[DIR_PPP])[ktne] + (D.f[DIR_MMP])[ktsw] +
-			 // (D.f[DIR_PMP])[ktse] + (D.f[DIR_MPP])[ktnw] +
-			 // (D.f[DIR_PPM])[kbne] + (D.f[DIR_MMM])[kbsw] +
-			 // (D.f[DIR_PMM])[kbse] + (D.f[DIR_MPM])[kbnw];// +RHO;
-
-    //     vxD[k] =  
-			 //((D.f[DIR_P00  ])[ke  ]- (D.f[DIR_M00   ])[kw  ]+ 
-    //         (D.f[DIR_PP0  ])[kne ]- (D.f[DIR_MM0  ])[ksw ]+
-    //         (D.f[DIR_PM0  ])[kse ]- (D.f[DIR_MP0  ])[knw ]+
-    //         (D.f[DIR_P0P  ])[kte ]- (D.f[DIR_M0M  ])[kbw ]+
-    //         (D.f[DIR_P0M  ])[kbe ]- (D.f[DIR_M0P  ])[ktw ]+
-    //         (D.f[DIR_PPP ])[ktne]- (D.f[DIR_MMP ])[ktsw]+ 
-    //         (D.f[DIR_PMP ])[ktse]- (D.f[DIR_MPP ])[ktnw]+ 
-    //         (D.f[DIR_PPM ])[kbne]- (D.f[DIR_MMM ])[kbsw]+ 
-    //         (D.f[DIR_PMM ])[kbse]- (D.f[DIR_MPM ])[kbnw]) / (one + rhoD[k])+
-    //         VX;
-
-    //     vyD[k] =  
-			 //((D.f[DIR_0P0  ])[kn  ]- (D.f[DIR_0M0   ])[ks  ]+
-    //         (D.f[DIR_PP0  ])[kne ]- (D.f[DIR_MM0  ])[ksw ]-
-    //         (D.f[DIR_PM0  ])[kse ]+ (D.f[DIR_MP0  ])[knw ]+
-    //         (D.f[DIR_0PP  ])[ktn ]- (D.f[DIR_0MM  ])[kbs ]+
-    //         (D.f[DIR_0PM  ])[kbn ]- (D.f[DIR_0MP  ])[kts ]+
-    //         (D.f[DIR_PPP ])[ktne]- (D.f[DIR_MMP ])[ktsw]- 
-    //         (D.f[DIR_PMP ])[ktse]+ (D.f[DIR_MPP ])[ktnw]+ 
-    //         (D.f[DIR_PPM ])[kbne]- (D.f[DIR_MMM ])[kbsw]- 
-    //         (D.f[DIR_PMM ])[kbse]+ (D.f[DIR_MPM ])[kbnw]) / (one + rhoD[k])+
-    //         VY;
-
-    //     vzD[k] =  
-			 //((D.f[DIR_00P  ])[kt  ]- (D.f[DIR_00M   ])[kb  ]+
-    //         (D.f[DIR_P0P  ])[kte ]- (D.f[DIR_M0M  ])[kbw ]-
-    //         (D.f[DIR_P0M  ])[kbe ]+ (D.f[DIR_M0P  ])[ktw ]+
-    //         (D.f[DIR_0PP  ])[ktn ]- (D.f[DIR_0MM  ])[kbs ]-
-    //         (D.f[DIR_0PM  ])[kbn ]+ (D.f[DIR_0MP  ])[kts ]+
-    //         (D.f[DIR_PPP ])[ktne]+ (D.f[DIR_MMP ])[ktsw]+ 
-    //         (D.f[DIR_PMP ])[ktse]+ (D.f[DIR_MPP ])[ktnw]- 
-    //         (D.f[DIR_PPM ])[kbne]- (D.f[DIR_MMM ])[kbsw]- 
-    //         (D.f[DIR_PMM ])[kbse]- (D.f[DIR_MPM ])[kbnw]) / (one + rhoD[k])+
-    //         VZ;
-
-         pressD[k]  =  ((D.f[DIR_P00   ])[ke  ]+ (D.f[DIR_M00   ])[kw  ]+ 
-                        (D.f[DIR_0P0   ])[kn  ]+ (D.f[DIR_0M0   ])[ks  ]+
-                        (D.f[DIR_00P   ])[kt  ]+ (D.f[DIR_00M   ])[kb  ]+
-                        c2o1*(
-                        (D.f[DIR_PP0  ])[kne ]+ (D.f[DIR_MM0  ])[ksw ]+
-                        (D.f[DIR_PM0  ])[kse ]+ (D.f[DIR_MP0  ])[knw ]+
-                        (D.f[DIR_P0P  ])[kte ]+ (D.f[DIR_M0M  ])[kbw ]+
-                        (D.f[DIR_P0M  ])[kbe ]+ (D.f[DIR_M0P  ])[ktw ]+
-                        (D.f[DIR_0PP  ])[ktn ]+ (D.f[DIR_0MM  ])[kbs ]+
-                        (D.f[DIR_0PM  ])[kbn ]+ (D.f[DIR_0MP  ])[kts ])+
-                        c3o1*(
-                        (D.f[DIR_PPP ])[ktne]+ (D.f[DIR_MMP ])[ktsw]+ 
-                        (D.f[DIR_PMP ])[ktse]+ (D.f[DIR_MPP ])[ktnw]+ 
-                        (D.f[DIR_PPM ])[kbne]+ (D.f[DIR_MMM ])[kbsw]+ 
-                        (D.f[DIR_PMM ])[kbse]+ (D.f[DIR_MPM ])[kbnw])-
-                        rhoD[k]-(vxD[k] * vxD[k] + vyD[k] * vyD[k] + vzD[k] * vzD[k]) * (c1o1+rhoD[k])) * c1o2+rhoD[k]+
-                        PRESS;    
-         //achtung op hart gesetzt Annahme op = 1 ;                                                    ^^^^(1.0/op-0.5)=0.5
-      }
-   }
+    //! Adds the values computed from the distributions of each GEO_FLUID node to those already stored in rhoD, vxD, vyD, vzD and pressD.
+    //! - Get node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
+    //!
+    const unsigned nodeIndex = getNodeIndex();
+
+    //////////////////////////////////////////////////////////////////////////
+    if( nodeIndex < numberOfLBnodes ) // threads whose index lies beyond the last node do nothing
+    {
+        //////////////////////////////////////////////////////////////////////////
+        //! - Read distributions: style of reading and writing the distributions from/to stored arrays dependent on
+        //! timestep is based on the esoteric twist algorithm \ref <a
+        //! href="https://doi.org/10.3390/computation5020019"><b>[ M. Geier et al. (2017),
+        //! DOI:10.3390/computation5020019 ]</b></a>
+        //!
+        Distributions27 dist;
+        getPointersToDistributions(dist, distributions, numberOfLBnodes, isEvenTimestep);
+        
+        //////////////////////////////////////////////////////////////////////////
+        //indices of this node and of the neighbors reached through neighborX/Y/Z
+        //kzero is not declared here: DIR_000 is read at nodeIndex directly
+        unsigned int ke   = nodeIndex;
+        unsigned int kw   = neighborX[nodeIndex];
+        unsigned int kn   = nodeIndex;
+        unsigned int ks   = neighborY[nodeIndex];
+        unsigned int kt   = nodeIndex;
+        unsigned int kb   = neighborZ[nodeIndex];
+        unsigned int ksw  = neighborY[kw];
+        unsigned int kne  = nodeIndex;
+        unsigned int kse  = ks;
+        unsigned int knw  = kw;
+        unsigned int kbw  = neighborZ[kw];
+        unsigned int kte  = nodeIndex;
+        unsigned int kbe  = kb;
+        unsigned int ktw  = kw;
+        unsigned int kbs  = neighborZ[ks];
+        unsigned int ktn  = nodeIndex;
+        unsigned int kbn  = kb;
+        unsigned int kts  = ks;
+        unsigned int ktse = ks;
+        unsigned int kbnw = kbw;
+        unsigned int ktnw = kw;
+        unsigned int kbse = kbs;
+        unsigned int ktsw = ksw;
+        unsigned int kbne = kb;
+        unsigned int ktne = nodeIndex;
+        unsigned int kbsw = neighborZ[ksw];
+        //////////////////////////////////////////////////////////////////////////
+        real PRESS = pressD[nodeIndex]; // values stored before this call; each is added back onto its new result below
+        real RHO   = rhoD[nodeIndex];
+        real VX    = vxD[nodeIndex];
+        real VY    = vyD[nodeIndex];
+        real VZ    = vzD[nodeIndex];
+        //////////////////////////////////////////////////////////////////////////
+        pressD[nodeIndex] = c0o1; // reset all five outputs; nodes that are not GEO_FLUID keep c0o1
+        rhoD[nodeIndex]   = c0o1;
+        vxD[nodeIndex]    = c0o1;
+        vyD[nodeIndex]    = c0o1;
+        vzD[nodeIndex]    = c0o1;
+        
+        if(geoD[nodeIndex] == GEO_FLUID)
+        {
+            real mfcbb = (dist.f[DIR_P00])[nodeIndex];//[ke   ];
+            real mfabb = (dist.f[DIR_M00])[kw];//[kw   ];  
+            real mfbcb = (dist.f[DIR_0P0])[nodeIndex];//[kn   ];
+            real mfbab = (dist.f[DIR_0M0])[ks];//[ks   ];  
+            real mfbbc = (dist.f[DIR_00P])[nodeIndex];//[kt   ];
+            real mfbba = (dist.f[DIR_00M])[kb];//[kb   ];  
+            real mfccb = (dist.f[DIR_PP0])[nodeIndex];//[kne  ];  
+            real mfaab = (dist.f[DIR_MM0])[ksw];//[ksw  ];
+            real mfcab = (dist.f[DIR_PM0])[ks];//[kse  ]; 
+            real mfacb = (dist.f[DIR_MP0])[kw];//[knw  ]; 
+            real mfcbc = (dist.f[DIR_P0P])[nodeIndex];//[kte  ];  
+            real mfaba = (dist.f[DIR_M0M])[kbw];//[kbw  ];
+            real mfcba = (dist.f[DIR_P0M])[kb];//[kbe  ]; 
+            real mfabc = (dist.f[DIR_M0P])[kw];//[ktw  ]; 
+            real mfbcc = (dist.f[DIR_0PP])[nodeIndex];//[ktn  ];  
+            real mfbaa = (dist.f[DIR_0MM])[kbs];//[kbs  ];
+            real mfbca = (dist.f[DIR_0PM])[kb];//[kbn  ]; 
+            real mfbac = (dist.f[DIR_0MP])[ks];//[kts  ]; 
+            real mfbbb = (dist.f[DIR_000])[nodeIndex];//[kzero];
+            real mfccc = (dist.f[DIR_PPP])[nodeIndex];//[ktne ]; 
+            real mfaac = (dist.f[DIR_MMP])[ksw];//[ktsw ]; 
+            real mfcac = (dist.f[DIR_PMP])[ks];//[ktse ];
+            real mfacc = (dist.f[DIR_MPP])[kw];//[ktnw ];
+            real mfcca = (dist.f[DIR_PPM])[kb];//[kbne ];
+            real mfaaa = (dist.f[DIR_MMM])[kbsw];//[kbsw ];
+            real mfcaa = (dist.f[DIR_PMM])[kbs];//[kbse ]; 
+            real mfaca = (dist.f[DIR_MPM])[kbw];//[kbnw ]; 
+            ////////////////////////////////////////////////////////////////////////////////////
+            real drho = 
+                ((((mfccc + mfaaa) + (mfaca + mfcac)) + ((mfacc + mfcaa) + (mfaac + mfcca))) +
+                (((mfbac + mfbca) + (mfbaa + mfbcc)) + ((mfabc + mfcba) + (mfaba + mfcbc)) + ((mfacb + mfcab) + (mfaab + mfccb))) +
+                ((mfabb + mfcbb) + (mfbab + mfbcb) + (mfbba + mfbbc))) + mfbbb;
+
+            real rho = c1o1 + drho; // divisor of the three directional sums below
+
+            rhoD[nodeIndex] = drho + RHO;
+
+            vxD[nodeIndex] = 
+                (((((mfccc - mfaaa) + (mfcac - mfaca)) + ((mfcaa - mfacc) + (mfcca - mfaac))) +
+                (((mfcba - mfabc) + (mfcbc - mfaba)) + ((mfcab - mfacb) + (mfccb - mfaab))) +
+                (mfcbb - mfabb)) / rho) + VX;
+            vyD[nodeIndex] = 
+                (((((mfccc - mfaaa) + (mfaca - mfcac)) + ((mfacc - mfcaa) + (mfcca - mfaac))) +
+                (((mfbca - mfbac) + (mfbcc - mfbaa)) + ((mfacb - mfcab) + (mfccb - mfaab))) +
+                (mfbcb - mfbab)) / rho) + VY;
+            vzD[nodeIndex] = 
+                (((((mfccc - mfaaa) + (mfcac - mfaca)) + ((mfacc - mfcaa) + (mfaac - mfcca))) +
+                (((mfbac - mfbca) + (mfbcc - mfbaa)) + ((mfabc - mfcba) + (mfcbc - mfaba))) +
+                (mfbbc - mfbba)) / rho) + VZ;
+
+            pressD[nodeIndex]  = // NOTE: rhoD/vxD/vyD/vzD read below already include RHO/VX/VY/VZ from the assignments above
+                ((dist.f[DIR_P00])[ke  ]+ (dist.f[DIR_M00])[kw  ]+ 
+                (dist.f[DIR_0P0])[kn  ]+ (dist.f[DIR_0M0])[ks  ]+
+                (dist.f[DIR_00P])[kt  ]+ (dist.f[DIR_00M])[kb  ]+
+                c2o1*(
+                (dist.f[DIR_PP0])[kne ]+ (dist.f[DIR_MM0])[ksw ]+
+                (dist.f[DIR_PM0])[kse ]+ (dist.f[DIR_MP0])[knw ]+
+                (dist.f[DIR_P0P])[kte ]+ (dist.f[DIR_M0M])[kbw ]+
+                (dist.f[DIR_P0M])[kbe ]+ (dist.f[DIR_M0P])[ktw ]+
+                (dist.f[DIR_0PP])[ktn ]+ (dist.f[DIR_0MM])[kbs ]+
+                (dist.f[DIR_0PM])[kbn ]+ (dist.f[DIR_0MP])[kts ])+
+                c3o1*(
+                (dist.f[DIR_PPP])[ktne]+ (dist.f[DIR_MMP])[ktsw]+ 
+                (dist.f[DIR_PMP])[ktse]+ (dist.f[DIR_MPP])[ktnw]+ 
+                (dist.f[DIR_PPM])[kbne]+ (dist.f[DIR_MMM])[kbsw]+ 
+                (dist.f[DIR_PMM])[kbse]+ (dist.f[DIR_MPM])[kbnw])-
+                rhoD[nodeIndex]-(vxD[nodeIndex] * vxD[nodeIndex] + vyD[nodeIndex] * vyD[nodeIndex] + vzD[nodeIndex] * vzD[nodeIndex]) * (c1o1+rhoD[nodeIndex])) * c1o2+rhoD[nodeIndex]+
+                PRESS;    
+            //caution: op is hard-coded, assuming op = 1; hence the factor c1o2 in the line above, i.e. (1.0/op-0.5)=0.5
+        }
+    }
 }
 ////////////////////////////////////////////////////////////////////////////////
 
@@ -841,309 +687,191 @@ __global__ void LBCalcMedCompSP27( real* vxD,
 
 ////////////////////////////////////////////////////////////////////////////////
 __global__ void LBCalcMedCompAD27(
-	real* vxD,
-	real* vyD,
-	real* vzD,
-	real* rhoD,
-	real* pressD,
-	real* concD,
-	unsigned int* geoD,
-	unsigned int* neighborX,
-	unsigned int* neighborY,
-	unsigned int* neighborZ,
-	unsigned int size_Mat,
-	real* DD,
-	real* DD_AD,
-	bool isEvenTimestep)
+    real* vxD,
+    real* vyD,
+    real* vzD,
+    real* rhoD,
+    real* pressD,
+    real* concD,
+    unsigned int* geoD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    real* distributions,
+    real* distributionsAD,
+    bool isEvenTimestep)
 {
-	Distributions27 D;
-	if (isEvenTimestep == true)
-	{
-		D.f[DIR_P00] = &DD[DIR_P00   *size_Mat];
-		D.f[DIR_M00] = &DD[DIR_M00   *size_Mat];
-		D.f[DIR_0P0] = &DD[DIR_0P0   *size_Mat];
-		D.f[DIR_0M0] = &DD[DIR_0M0   *size_Mat];
-		D.f[DIR_00P] = &DD[DIR_00P   *size_Mat];
-		D.f[DIR_00M] = &DD[DIR_00M   *size_Mat];
-		D.f[DIR_PP0] = &DD[DIR_PP0  *size_Mat];
-		D.f[DIR_MM0] = &DD[DIR_MM0  *size_Mat];
-		D.f[DIR_PM0] = &DD[DIR_PM0  *size_Mat];
-		D.f[DIR_MP0] = &DD[DIR_MP0  *size_Mat];
-		D.f[DIR_P0P] = &DD[DIR_P0P  *size_Mat];
-		D.f[DIR_M0M] = &DD[DIR_M0M  *size_Mat];
-		D.f[DIR_P0M] = &DD[DIR_P0M  *size_Mat];
-		D.f[DIR_M0P] = &DD[DIR_M0P  *size_Mat];
-		D.f[DIR_0PP] = &DD[DIR_0PP  *size_Mat];
-		D.f[DIR_0MM] = &DD[DIR_0MM  *size_Mat];
-		D.f[DIR_0PM] = &DD[DIR_0PM  *size_Mat];
-		D.f[DIR_0MP] = &DD[DIR_0MP  *size_Mat];
-		D.f[DIR_000] = &DD[DIR_000*size_Mat];
-		D.f[DIR_PPP] = &DD[DIR_PPP *size_Mat];
-		D.f[DIR_MMP] = &DD[DIR_MMP *size_Mat];
-		D.f[DIR_PMP] = &DD[DIR_PMP *size_Mat];
-		D.f[DIR_MPP] = &DD[DIR_MPP *size_Mat];
-		D.f[DIR_PPM] = &DD[DIR_PPM *size_Mat];
-		D.f[DIR_MMM] = &DD[DIR_MMM *size_Mat];
-		D.f[DIR_PMM] = &DD[DIR_PMM *size_Mat];
-		D.f[DIR_MPM] = &DD[DIR_MPM *size_Mat];
-	}
-	else
-	{
-		D.f[DIR_M00] = &DD[DIR_P00   *size_Mat];
-		D.f[DIR_P00] = &DD[DIR_M00   *size_Mat];
-		D.f[DIR_0M0] = &DD[DIR_0P0   *size_Mat];
-		D.f[DIR_0P0] = &DD[DIR_0M0   *size_Mat];
-		D.f[DIR_00M] = &DD[DIR_00P   *size_Mat];
-		D.f[DIR_00P] = &DD[DIR_00M   *size_Mat];
-		D.f[DIR_MM0] = &DD[DIR_PP0  *size_Mat];
-		D.f[DIR_PP0] = &DD[DIR_MM0  *size_Mat];
-		D.f[DIR_MP0] = &DD[DIR_PM0  *size_Mat];
-		D.f[DIR_PM0] = &DD[DIR_MP0  *size_Mat];
-		D.f[DIR_M0M] = &DD[DIR_P0P  *size_Mat];
-		D.f[DIR_P0P] = &DD[DIR_M0M  *size_Mat];
-		D.f[DIR_M0P] = &DD[DIR_P0M  *size_Mat];
-		D.f[DIR_P0M] = &DD[DIR_M0P  *size_Mat];
-		D.f[DIR_0MM] = &DD[DIR_0PP  *size_Mat];
-		D.f[DIR_0PP] = &DD[DIR_0MM  *size_Mat];
-		D.f[DIR_0MP] = &DD[DIR_0PM  *size_Mat];
-		D.f[DIR_0PM] = &DD[DIR_0MP  *size_Mat];
-		D.f[DIR_000] = &DD[DIR_000*size_Mat];
-		D.f[DIR_PPP] = &DD[DIR_MMM *size_Mat];
-		D.f[DIR_MMP] = &DD[DIR_PPM *size_Mat];
-		D.f[DIR_PMP] = &DD[DIR_MPM *size_Mat];
-		D.f[DIR_MPP] = &DD[DIR_PMM *size_Mat];
-		D.f[DIR_PPM] = &DD[DIR_MMP *size_Mat];
-		D.f[DIR_MMM] = &DD[DIR_PPP *size_Mat];
-		D.f[DIR_PMM] = &DD[DIR_MPP *size_Mat];
-		D.f[DIR_MPM] = &DD[DIR_PMP *size_Mat];
-	}
-	////////////////////////////////////////////////////////////////////////////////
-	Distributions27 Dad;
-	if (isEvenTimestep == true)
-	{
-		Dad.f[DIR_P00]    = &DD_AD[DIR_P00   *size_Mat];
-		Dad.f[DIR_M00]    = &DD_AD[DIR_M00   *size_Mat];
-		Dad.f[DIR_0P0]    = &DD_AD[DIR_0P0   *size_Mat];
-		Dad.f[DIR_0M0]    = &DD_AD[DIR_0M0   *size_Mat];
-		Dad.f[DIR_00P]    = &DD_AD[DIR_00P   *size_Mat];
-		Dad.f[DIR_00M]    = &DD_AD[DIR_00M   *size_Mat];
-		Dad.f[DIR_PP0]   = &DD_AD[DIR_PP0  *size_Mat];
-		Dad.f[DIR_MM0]   = &DD_AD[DIR_MM0  *size_Mat];
-		Dad.f[DIR_PM0]   = &DD_AD[DIR_PM0  *size_Mat];
-		Dad.f[DIR_MP0]   = &DD_AD[DIR_MP0  *size_Mat];
-		Dad.f[DIR_P0P]   = &DD_AD[DIR_P0P  *size_Mat];
-		Dad.f[DIR_M0M]   = &DD_AD[DIR_M0M  *size_Mat];
-		Dad.f[DIR_P0M]   = &DD_AD[DIR_P0M  *size_Mat];
-		Dad.f[DIR_M0P]   = &DD_AD[DIR_M0P  *size_Mat];
-		Dad.f[DIR_0PP]   = &DD_AD[DIR_0PP  *size_Mat];
-		Dad.f[DIR_0MM]   = &DD_AD[DIR_0MM  *size_Mat];
-		Dad.f[DIR_0PM]   = &DD_AD[DIR_0PM  *size_Mat];
-		Dad.f[DIR_0MP]   = &DD_AD[DIR_0MP  *size_Mat];
-		Dad.f[DIR_000] = &DD_AD[DIR_000*size_Mat];
-		Dad.f[DIR_PPP]  = &DD_AD[DIR_PPP *size_Mat];
-		Dad.f[DIR_MMP]  = &DD_AD[DIR_MMP *size_Mat];
-		Dad.f[DIR_PMP]  = &DD_AD[DIR_PMP *size_Mat];
-		Dad.f[DIR_MPP]  = &DD_AD[DIR_MPP *size_Mat];
-		Dad.f[DIR_PPM]  = &DD_AD[DIR_PPM *size_Mat];
-		Dad.f[DIR_MMM]  = &DD_AD[DIR_MMM *size_Mat];
-		Dad.f[DIR_PMM]  = &DD_AD[DIR_PMM *size_Mat];
-		Dad.f[DIR_MPM]  = &DD_AD[DIR_MPM *size_Mat];
-	}						
-	else					
-	{						
-		Dad.f[DIR_M00]    = &DD_AD[DIR_P00   *size_Mat];
-		Dad.f[DIR_P00]    = &DD_AD[DIR_M00   *size_Mat];
-		Dad.f[DIR_0M0]    = &DD_AD[DIR_0P0   *size_Mat];
-		Dad.f[DIR_0P0]    = &DD_AD[DIR_0M0   *size_Mat];
-		Dad.f[DIR_00M]    = &DD_AD[DIR_00P   *size_Mat];
-		Dad.f[DIR_00P]    = &DD_AD[DIR_00M   *size_Mat];
-		Dad.f[DIR_MM0]   = &DD_AD[DIR_PP0  *size_Mat];
-		Dad.f[DIR_PP0]   = &DD_AD[DIR_MM0  *size_Mat];
-		Dad.f[DIR_MP0]   = &DD_AD[DIR_PM0  *size_Mat];
-		Dad.f[DIR_PM0]   = &DD_AD[DIR_MP0  *size_Mat];
-		Dad.f[DIR_M0M]   = &DD_AD[DIR_P0P  *size_Mat];
-		Dad.f[DIR_P0P]   = &DD_AD[DIR_M0M  *size_Mat];
-		Dad.f[DIR_M0P]   = &DD_AD[DIR_P0M  *size_Mat];
-		Dad.f[DIR_P0M]   = &DD_AD[DIR_M0P  *size_Mat];
-		Dad.f[DIR_0MM]   = &DD_AD[DIR_0PP  *size_Mat];
-		Dad.f[DIR_0PP]   = &DD_AD[DIR_0MM  *size_Mat];
-		Dad.f[DIR_0MP]   = &DD_AD[DIR_0PM  *size_Mat];
-		Dad.f[DIR_0PM]   = &DD_AD[DIR_0MP  *size_Mat];
-		Dad.f[DIR_000] = &DD_AD[DIR_000*size_Mat];
-		Dad.f[DIR_PPP]  = &DD_AD[DIR_MMM *size_Mat];
-		Dad.f[DIR_MMP]  = &DD_AD[DIR_PPM *size_Mat];
-		Dad.f[DIR_PMP]  = &DD_AD[DIR_MPM *size_Mat];
-		Dad.f[DIR_MPP]  = &DD_AD[DIR_PMM *size_Mat];
-		Dad.f[DIR_PPM]  = &DD_AD[DIR_MMP *size_Mat];
-		Dad.f[DIR_MMM]  = &DD_AD[DIR_PPP *size_Mat];
-		Dad.f[DIR_PMM]  = &DD_AD[DIR_MPP *size_Mat];
-		Dad.f[DIR_MPM]  = &DD_AD[DIR_PMP *size_Mat];
-	}
-	////////////////////////////////////////////////////////////////////////////////
-	const unsigned  x = threadIdx.x;  // Globaler x-Index 
-	const unsigned  y = blockIdx.x;   // Globaler y-Index 
-	const unsigned  z = blockIdx.y;   // Globaler z-Index 
-
-	const unsigned nx = blockDim.x;
-	const unsigned ny = gridDim.x;
-
-	const unsigned k = nx*(ny*z + y) + x;
-	//////////////////////////////////////////////////////////////////////////
-
-	if (k < size_Mat)
-	{
-		//////////////////////////////////////////////////////////////////////////
-		//index
-		//unsigned int kzero = k;
-		unsigned int ke = k;
-		unsigned int kw = neighborX[k];
-		unsigned int kn = k;
-		unsigned int ks = neighborY[k];
-		unsigned int kt = k;
-		unsigned int kb = neighborZ[k];
-		unsigned int ksw = neighborY[kw];
-		unsigned int kne = k;
-		unsigned int kse = ks;
-		unsigned int knw = kw;
-		unsigned int kbw = neighborZ[kw];
-		unsigned int kte = k;
-		unsigned int kbe = kb;
-		unsigned int ktw = kw;
-		unsigned int kbs = neighborZ[ks];
-		unsigned int ktn = k;
-		unsigned int kbn = kb;
-		unsigned int kts = ks;
-		unsigned int ktse = ks;
-		unsigned int kbnw = kbw;
-		unsigned int ktnw = kw;
-		unsigned int kbse = kbs;
-		unsigned int ktsw = ksw;
-		unsigned int kbne = kb;
-		unsigned int ktne = k;
-		unsigned int kbsw = neighborZ[ksw];
-		//////////////////////////////////////////////////////////////////////////
-		real CONC  = concD[k];
-		real PRESS = pressD[k];
-		real RHO   = rhoD[k];
-		real VX    = vxD[k];
-		real VY    = vyD[k];
-		real VZ    = vzD[k];
-		//////////////////////////////////////////////////////////////////////////
-		concD[k] = c0o1;
-		pressD[k] = c0o1;
-		rhoD[k] = c0o1;
-		vxD[k] = c0o1;
-		vyD[k] = c0o1;
-		vzD[k] = c0o1;
-
-		if (geoD[k] == GEO_FLUID)
-		{
-			real mfcbb = (D.f[DIR_P00])[k];//[ke   ];
-			real mfabb = (D.f[DIR_M00])[kw];//[kw   ];  
-			real mfbcb = (D.f[DIR_0P0])[k];//[kn   ];
-			real mfbab = (D.f[DIR_0M0])[ks];//[ks   ];  
-			real mfbbc = (D.f[DIR_00P])[k];//[kt   ];
-			real mfbba = (D.f[DIR_00M])[kb];//[kb   ];  
-			real mfccb = (D.f[DIR_PP0])[k];//[kne  ];  
-			real mfaab = (D.f[DIR_MM0])[ksw];//[ksw  ];
-			real mfcab = (D.f[DIR_PM0])[ks];//[kse  ]; 
-			real mfacb = (D.f[DIR_MP0])[kw];//[knw  ]; 
-			real mfcbc = (D.f[DIR_P0P])[k];//[kte  ];  
-			real mfaba = (D.f[DIR_M0M])[kbw];//[kbw  ];
-			real mfcba = (D.f[DIR_P0M])[kb];//[kbe  ]; 
-			real mfabc = (D.f[DIR_M0P])[kw];//[ktw  ]; 
-			real mfbcc = (D.f[DIR_0PP])[k];//[ktn  ];  
-			real mfbaa = (D.f[DIR_0MM])[kbs];//[kbs  ];
-			real mfbca = (D.f[DIR_0PM])[kb];//[kbn  ]; 
-			real mfbac = (D.f[DIR_0MP])[ks];//[kts  ]; 
-			real mfbbb = (D.f[DIR_000])[k];//[kzero];
-			real mfccc = (D.f[DIR_PPP])[k];//[ktne ]; 
-			real mfaac = (D.f[DIR_MMP])[ksw];//[ktsw ]; 
-			real mfcac = (D.f[DIR_PMP])[ks];//[ktse ];
-			real mfacc = (D.f[DIR_MPP])[kw];//[ktnw ];
-			real mfcca = (D.f[DIR_PPM])[kb];//[kbne ];
-			real mfaaa = (D.f[DIR_MMM])[kbsw];//[kbsw ];
-			real mfcaa = (D.f[DIR_PMM])[kbs];//[kbse ]; 
-			real mfaca = (D.f[DIR_MPM])[kbw];//[kbnw ]; 
-			////////////////////////////////////////////////////////////////////////////////////
-			real drho =
-				((((mfccc + mfaaa) + (mfaca + mfcac)) + ((mfacc + mfcaa) + (mfaac + mfcca))) +
-				 (((mfbac + mfbca) + (mfbaa + mfbcc)) + ((mfabc + mfcba) + (mfaba + mfcbc)) + ((mfacb + mfcab) + (mfaab + mfccb))) +
-				  ((mfabb + mfcbb) + (mfbab + mfbcb)  +  (mfbba + mfbbc))) + mfbbb;
-			real rho = c1o1 + drho;
-			////////////////////////////////////////////////////////////////////////////////////
-
-			rhoD[k] = drho + RHO;
-
-			vxD[k] =
-				(((((mfccc - mfaaa) + (mfcac - mfaca)) + ((mfcaa - mfacc) + (mfcca - mfaac))) +
-				(((mfcba - mfabc) + (mfcbc - mfaba)) + ((mfcab - mfacb) + (mfccb - mfaab))) +
-					(mfcbb - mfabb)) / rho) + VX;
-			
-			vyD[k] =
-				(((((mfccc - mfaaa) + (mfaca - mfcac)) + ((mfacc - mfcaa) + (mfcca - mfaac))) +
-				(((mfbca - mfbac) + (mfbcc - mfbaa)) + ((mfacb - mfcab) + (mfccb - mfaab))) +
-					(mfbcb - mfbab)) / rho) + VY;
-			
-			vzD[k] =
-				(((((mfccc - mfaaa) + (mfcac - mfaca)) + ((mfacc - mfcaa) + (mfaac - mfcca))) +
-				(((mfbac - mfbca) + (mfbcc - mfbaa)) + ((mfabc - mfcba) + (mfcbc - mfaba))) +
-					(mfbbc - mfbba)) / rho) + VZ;
-
-			pressD[k] = 
-				((D.f[DIR_P00])[ke] + (D.f[DIR_M00])[kw] +
-				 (D.f[DIR_0P0])[kn] + (D.f[DIR_0M0])[ks] +
-				 (D.f[DIR_00P])[kt] + (D.f[DIR_00M])[kb] +
-				 c2o1*(
-				 (D.f[DIR_PP0])[kne] + (D.f[DIR_MM0])[ksw] +
-				 (D.f[DIR_PM0])[kse] + (D.f[DIR_MP0])[knw] +
-				 (D.f[DIR_P0P])[kte] + (D.f[DIR_M0M])[kbw] +
-				 (D.f[DIR_P0M])[kbe] + (D.f[DIR_M0P])[ktw] +
-				 (D.f[DIR_0PP])[ktn] + (D.f[DIR_0MM])[kbs] +
-				 (D.f[DIR_0PM])[kbn] + (D.f[DIR_0MP])[kts]) +
-				 c3o1*(
-				 (D.f[DIR_PPP])[ktne] + (D.f[DIR_MMP])[ktsw] +
-				 (D.f[DIR_PMP])[ktse] + (D.f[DIR_MPP])[ktnw] +
-				 (D.f[DIR_PPM])[kbne] + (D.f[DIR_MMM])[kbsw] +
-				 (D.f[DIR_PMM])[kbse] + (D.f[DIR_MPM])[kbnw]) -
-				 rhoD[k] - (vxD[k] * vxD[k] + vyD[k] * vyD[k] + vzD[k] * vzD[k]) * (c1o1 + rhoD[k])) * c1o2 + rhoD[k] +
-				 PRESS;
-				 //achtung op hart gesetzt Annahme op = 1 ;                                                    ^^^^(1.0/op-0.5)=0.5
-			//////////////////////////////////////////////////////////////////////////
-			mfcbb = (Dad.f[DIR_P00   ])[k   ];
-			mfabb = (Dad.f[DIR_M00   ])[kw  ];
-			mfbcb = (Dad.f[DIR_0P0   ])[k   ];
-			mfbab = (Dad.f[DIR_0M0   ])[ks  ];
-			mfbbc = (Dad.f[DIR_00P   ])[k   ];
-			mfbba = (Dad.f[DIR_00M   ])[kb  ];
-			mfccb = (Dad.f[DIR_PP0  ])[k   ];
-			mfaab = (Dad.f[DIR_MM0  ])[ksw ];
-			mfcab = (Dad.f[DIR_PM0  ])[ks  ];
-			mfacb = (Dad.f[DIR_MP0  ])[kw  ];
-			mfcbc = (Dad.f[DIR_P0P  ])[k   ];
-			mfaba = (Dad.f[DIR_M0M  ])[kbw ];
-			mfcba = (Dad.f[DIR_P0M  ])[kb  ];
-			mfabc = (Dad.f[DIR_M0P  ])[kw  ];
-			mfbcc = (Dad.f[DIR_0PP  ])[k   ];
-			mfbaa = (Dad.f[DIR_0MM  ])[kbs ];
-			mfbca = (Dad.f[DIR_0PM  ])[kb  ];
-			mfbac = (Dad.f[DIR_0MP  ])[ks  ];
-			mfbbb = (Dad.f[DIR_000])[k   ];
-			mfccc = (Dad.f[DIR_PPP ])[k   ];
-			mfaac = (Dad.f[DIR_MMP ])[ksw ];
-			mfcac = (Dad.f[DIR_PMP ])[ks  ];
-			mfacc = (Dad.f[DIR_MPP ])[kw  ];
-			mfcca = (Dad.f[DIR_PPM ])[kb  ];
-			mfaaa = (Dad.f[DIR_MMM ])[kbsw];
-			mfcaa = (Dad.f[DIR_PMM ])[kbs ];
-			mfaca = (Dad.f[DIR_MPM ])[kbw ];
-			//////////////////////////////////////////////////////////////////////////
-			concD[k] = 
-				((((mfccc + mfaaa) + (mfaca + mfcac)) + ((mfacc + mfcaa)   + (mfaac + mfcca))) +
-				 (((mfbac + mfbca) + (mfbaa + mfbcc)) + ((mfabc + mfcba)   + (mfaba + mfcbc)) + ((mfacb + mfcab) + (mfaab + mfccb))) +
-				  ((mfabb + mfcbb) + (mfbab + mfbcb)  +  (mfbba + mfbbc))) +  mfbbb + CONC;
-		}
-	}
+    ////////////////////////////////////////////////////////////////////////////////
+    //! - Get node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
+    //!
+    const unsigned nodeIndex = getNodeIndex();
+
+    //////////////////////////////////////////////////////////////////////////
+    if ( nodeIndex < numberOfLBnodes )
+    {
+        //////////////////////////////////////////////////////////////////////////
+        //! - Read distributions: style of reading and writing the distributions from/to stored arrays dependent on
+        //! timestep is based on the esoteric twist algorithm \ref <a
+        //! href="https://doi.org/10.3390/computation5020019"><b>[ M. Geier et al. (2017),
+        //! DOI:10.3390/computation5020019 ]</b></a>
+        //!
+        Distributions27 dist, distAD;
+        getPointersToDistributions(dist,   distributions,   numberOfLBnodes, isEvenTimestep);
+        getPointersToDistributions(distAD, distributionsAD, numberOfLBnodes, isEvenTimestep);
+
+        //////////////////////////////////////////////////////////////////////////
+        //index
+        //unsigned int kzero = nodeIndex;
+        unsigned int ke = nodeIndex;
+        unsigned int kw = neighborX[nodeIndex];
+        unsigned int kn = nodeIndex;
+        unsigned int ks = neighborY[nodeIndex];
+        unsigned int kt = nodeIndex;
+        unsigned int kb = neighborZ[nodeIndex];
+        unsigned int ksw = neighborY[kw];
+        unsigned int kne = nodeIndex;
+        unsigned int kse = ks;
+        unsigned int knw = kw;
+        unsigned int kbw = neighborZ[kw];
+        unsigned int kte = nodeIndex;
+        unsigned int kbe = kb;
+        unsigned int ktw = kw;
+        unsigned int kbs = neighborZ[ks];
+        unsigned int ktn = nodeIndex;
+        unsigned int kbn = kb;
+        unsigned int kts = ks;
+        unsigned int ktse = ks;
+        unsigned int kbnw = kbw;
+        unsigned int ktnw = kw;
+        unsigned int kbse = kbs;
+        unsigned int ktsw = ksw;
+        unsigned int kbne = kb;
+        unsigned int ktne = nodeIndex;
+        unsigned int kbsw = neighborZ[ksw];
+        //////////////////////////////////////////////////////////////////////////
+        real CONC  = concD[nodeIndex];
+        real PRESS = pressD[nodeIndex];
+        real RHO   = rhoD[nodeIndex];
+        real VX    = vxD[nodeIndex];
+        real VY    = vyD[nodeIndex];
+        real VZ    = vzD[nodeIndex];
+        //////////////////////////////////////////////////////////////////////////
+        concD[nodeIndex]  = c0o1;
+        pressD[nodeIndex] = c0o1;
+        rhoD[nodeIndex]   = c0o1;
+        vxD[nodeIndex]    = c0o1;
+        vyD[nodeIndex]    = c0o1;
+        vzD[nodeIndex]    = c0o1;
+        
+        if (geoD[nodeIndex] == GEO_FLUID)
+        {
+            real mfcbb = (dist.f[DIR_P00])[nodeIndex];//[ke   ];
+            real mfabb = (dist.f[DIR_M00])[kw];//[kw   ];  
+            real mfbcb = (dist.f[DIR_0P0])[nodeIndex];//[kn   ];
+            real mfbab = (dist.f[DIR_0M0])[ks];//[ks   ];  
+            real mfbbc = (dist.f[DIR_00P])[nodeIndex];//[kt   ];
+            real mfbba = (dist.f[DIR_00M])[kb];//[kb   ];  
+            real mfccb = (dist.f[DIR_PP0])[nodeIndex];//[kne  ];  
+            real mfaab = (dist.f[DIR_MM0])[ksw];//[ksw  ];
+            real mfcab = (dist.f[DIR_PM0])[ks];//[kse  ]; 
+            real mfacb = (dist.f[DIR_MP0])[kw];//[knw  ]; 
+            real mfcbc = (dist.f[DIR_P0P])[nodeIndex];//[kte  ];  
+            real mfaba = (dist.f[DIR_M0M])[kbw];//[kbw  ];
+            real mfcba = (dist.f[DIR_P0M])[kb];//[kbe  ]; 
+            real mfabc = (dist.f[DIR_M0P])[kw];//[ktw  ]; 
+            real mfbcc = (dist.f[DIR_0PP])[nodeIndex];//[ktn  ];  
+            real mfbaa = (dist.f[DIR_0MM])[kbs];//[kbs  ];
+            real mfbca = (dist.f[DIR_0PM])[kb];//[kbn  ]; 
+            real mfbac = (dist.f[DIR_0MP])[ks];//[kts  ]; 
+            real mfbbb = (dist.f[DIR_000])[nodeIndex];//[kzero];
+            real mfccc = (dist.f[DIR_PPP])[nodeIndex];//[ktne ]; 
+            real mfaac = (dist.f[DIR_MMP])[ksw];//[ktsw ]; 
+            real mfcac = (dist.f[DIR_PMP])[ks];//[ktse ];
+            real mfacc = (dist.f[DIR_MPP])[kw];//[ktnw ];
+            real mfcca = (dist.f[DIR_PPM])[kb];//[kbne ];
+            real mfaaa = (dist.f[DIR_MMM])[kbsw];//[kbsw ];
+            real mfcaa = (dist.f[DIR_PMM])[kbs];//[kbse ]; 
+            real mfaca = (dist.f[DIR_MPM])[kbw];//[kbnw ]; 
+            ////////////////////////////////////////////////////////////////////////////////////
+            real drho =
+                ((((mfccc + mfaaa) + (mfaca + mfcac)) + ((mfacc + mfcaa) + (mfaac + mfcca))) +
+                 (((mfbac + mfbca) + (mfbaa + mfbcc)) + ((mfabc + mfcba) + (mfaba + mfcbc)) + ((mfacb + mfcab) + (mfaab + mfccb))) +
+                  ((mfabb + mfcbb) + (mfbab + mfbcb)  +  (mfbba + mfbbc))) + mfbbb;
+            real rho = c1o1 + drho;
+            ////////////////////////////////////////////////////////////////////////////////////
+            
+            rhoD[nodeIndex] = drho + RHO;
+            
+            vxD[nodeIndex] =
+                (((((mfccc - mfaaa) + (mfcac - mfaca)) + ((mfcaa - mfacc) + (mfcca - mfaac))) +
+                (((mfcba - mfabc) + (mfcbc - mfaba)) + ((mfcab - mfacb) + (mfccb - mfaab))) +
+                    (mfcbb - mfabb)) / rho) + VX;
+            
+            vyD[nodeIndex] =
+                (((((mfccc - mfaaa) + (mfaca - mfcac)) + ((mfacc - mfcaa) + (mfcca - mfaac))) +
+                (((mfbca - mfbac) + (mfbcc - mfbaa)) + ((mfacb - mfcab) + (mfccb - mfaab))) +
+                    (mfbcb - mfbab)) / rho) + VY;
+            
+            vzD[nodeIndex] =
+                (((((mfccc - mfaaa) + (mfcac - mfaca)) + ((mfacc - mfcaa) + (mfaac - mfcca))) +
+                (((mfbac - mfbca) + (mfbcc - mfbaa)) + ((mfabc - mfcba) + (mfcbc - mfaba))) +
+                    (mfbbc - mfbba)) / rho) + VZ;
+            
+            pressD[nodeIndex] = 
+                ((dist.f[DIR_P00])[ke] + (dist.f[DIR_M00])[kw] +
+                 (dist.f[DIR_0P0])[kn] + (dist.f[DIR_0M0])[ks] +
+                 (dist.f[DIR_00P])[kt] + (dist.f[DIR_00M])[kb] +
+                 c2o1*(
+                 (dist.f[DIR_PP0])[kne] + (dist.f[DIR_MM0])[ksw] +
+                 (dist.f[DIR_PM0])[kse] + (dist.f[DIR_MP0])[knw] +
+                 (dist.f[DIR_P0P])[kte] + (dist.f[DIR_M0M])[kbw] +
+                 (dist.f[DIR_P0M])[kbe] + (dist.f[DIR_M0P])[ktw] +
+                 (dist.f[DIR_0PP])[ktn] + (dist.f[DIR_0MM])[kbs] +
+                 (dist.f[DIR_0PM])[kbn] + (dist.f[DIR_0MP])[kts]) +
+                 c3o1*(
+                 (dist.f[DIR_PPP])[ktne] + (dist.f[DIR_MMP])[ktsw] +
+                 (dist.f[DIR_PMP])[ktse] + (dist.f[DIR_MPP])[ktnw] +
+                 (dist.f[DIR_PPM])[kbne] + (dist.f[DIR_MMM])[kbsw] +
+                 (dist.f[DIR_PMM])[kbse] + (dist.f[DIR_MPM])[kbnw]) -
+                 rhoD[nodeIndex] - (vxD[nodeIndex] * vxD[nodeIndex] + vyD[nodeIndex] * vyD[nodeIndex] + vzD[nodeIndex] * vzD[nodeIndex]) * (c1o1 + rhoD[nodeIndex])) * c1o2 + rhoD[nodeIndex] +
+                 PRESS;
+                 //caution: op is hard-coded, assuming op = 1 ;                                                ^^^^(1.0/op-0.5)=0.5
+            //////////////////////////////////////////////////////////////////////////
+            mfcbb = (distAD.f[DIR_P00])[nodeIndex   ];
+            mfabb = (distAD.f[DIR_M00])[kw  ];
+            mfbcb = (distAD.f[DIR_0P0])[nodeIndex   ];
+            mfbab = (distAD.f[DIR_0M0])[ks  ];
+            mfbbc = (distAD.f[DIR_00P])[nodeIndex   ];
+            mfbba = (distAD.f[DIR_00M])[kb  ];
+            mfccb = (distAD.f[DIR_PP0])[nodeIndex   ];
+            mfaab = (distAD.f[DIR_MM0])[ksw ];
+            mfcab = (distAD.f[DIR_PM0])[ks  ];
+            mfacb = (distAD.f[DIR_MP0])[kw  ];
+            mfcbc = (distAD.f[DIR_P0P])[nodeIndex   ];
+            mfaba = (distAD.f[DIR_M0M])[kbw ];
+            mfcba = (distAD.f[DIR_P0M])[kb  ];
+            mfabc = (distAD.f[DIR_M0P])[kw  ];
+            mfbcc = (distAD.f[DIR_0PP])[nodeIndex   ];
+            mfbaa = (distAD.f[DIR_0MM])[kbs ];
+            mfbca = (distAD.f[DIR_0PM])[kb  ];
+            mfbac = (distAD.f[DIR_0MP])[ks  ];
+            mfbbb = (distAD.f[DIR_000])[nodeIndex   ];
+            mfccc = (distAD.f[DIR_PPP])[nodeIndex   ];
+            mfaac = (distAD.f[DIR_MMP])[ksw ];
+            mfcac = (distAD.f[DIR_PMP])[ks  ];
+            mfacc = (distAD.f[DIR_MPP])[kw  ];
+            mfcca = (distAD.f[DIR_PPM])[kb  ];
+            mfaaa = (distAD.f[DIR_MMM])[kbsw];
+            mfcaa = (distAD.f[DIR_PMM])[kbs ];
+            mfaca = (distAD.f[DIR_MPM])[kbw ];
+            //////////////////////////////////////////////////////////////////////////
+            concD[nodeIndex] = 
+                ((((mfccc + mfaaa) + (mfaca + mfcac)) + ((mfacc + mfcaa)   + (mfaac + mfcca))) +
+                 (((mfbac + mfbca) + (mfbaa + mfbcc)) + ((mfabc + mfcba)   + (mfaba + mfcbc)) + ((mfacb + mfcab) + (mfaab + mfccb))) +
+                  ((mfabb + mfcbb) + (mfbab + mfbcb)  +  (mfbba + mfbbc))) +  mfbbb + CONC;
+        }
+    }
 }
 ////////////////////////////////////////////////////////////////////////////////
 
@@ -1168,54 +896,50 @@ __global__ void LBCalcMedCompAD27(
 
 
 ////////////////////////////////////////////////////////////////////////////////
-__global__ void LBCalcMacMedSP27( real* vxD,
-                                             real* vyD,
-                                             real* vzD,
-                                             real* rhoD,
-                                             real* pressD,
-                                             unsigned int* geoD,
-                                             unsigned int* neighborX,
-                                             unsigned int* neighborY,
-                                             unsigned int* neighborZ,
-                                             unsigned int tdiff,
-                                             unsigned int size_Mat,
-                                             bool isEvenTimestep)
+__global__ void LBCalcMacMedSP27(
+    real* vxD,
+    real* vyD,
+    real* vzD,
+    real* rhoD,
+    real* pressD,
+    unsigned int* geoD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned int tdiff,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
 {
-   ////////////////////////////////////////////////////////////////////////////////
-   const unsigned  x = threadIdx.x;  // Globaler x-Index 
-   const unsigned  y = blockIdx.x;   // Globaler y-Index 
-   const unsigned  z = blockIdx.y;   // Globaler z-Index 
-
-   const unsigned nx = blockDim.x;
-   const unsigned ny = gridDim.x;
-
-   const unsigned k = nx*(ny*z + y) + x;
-   //////////////////////////////////////////////////////////////////////////
-
-   if(k<size_Mat)
-   {
-      //////////////////////////////////////////////////////////////////////////
-      real PRESS = pressD[k];
-      real RHO   = rhoD[k];
-      real VX    = vxD[k];
-      real VY    = vyD[k];
-      real VZ    = vzD[k];
-      //////////////////////////////////////////////////////////////////////////
-      pressD[k] = c0o1;
-      rhoD[k]   = c0o1;
-      vxD[k]    = c0o1;
-      vyD[k]    = c0o1;
-      vzD[k]    = c0o1;
-
-      if(geoD[k] == GEO_FLUID)
-      {
-         rhoD[k]    =   RHO   / tdiff;
-         vxD[k]     =   VX    / tdiff;
-         vyD[k]     =   VY    / tdiff;
-         vzD[k]     =   VZ    / tdiff;
-         pressD[k]  =   PRESS / tdiff;    
-      }
-   }
+    //! Per node: replace vxD, vyD, vzD, rhoD, pressD by (stored value / tdiff) on GEO_FLUID nodes, by c0o1 elsewhere.
+    //! - Get node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
+    //!   NOTE(review): neighborX, neighborY, neighborZ and isEvenTimestep are not read in this kernel body.
+    const unsigned nodeIndex = getNodeIndex();
+
+    //! - Only indices below numberOfLBnodes are processed; all other threads do nothing.
+    if(nodeIndex<numberOfLBnodes)
+    {
+        //! - Read the values currently stored for this node into locals before the arrays are overwritten.
+        real PRESS = pressD[nodeIndex];
+        real RHO   = rhoD[nodeIndex];
+        real VX    = vxD[nodeIndex];
+        real VY    = vyD[nodeIndex];
+        real VZ    = vzD[nodeIndex];
+        //! - Set all five outputs to c0o1 (presumably zero - TODO confirm); nodes that are not GEO_FLUID keep it.
+        pressD[nodeIndex] = c0o1;
+        rhoD[nodeIndex]   = c0o1;
+        vxD[nodeIndex]    = c0o1;
+        vyD[nodeIndex]    = c0o1;
+        vzD[nodeIndex]    = c0o1;
+        //! - GEO_FLUID nodes: store each saved value divided by tdiff. NOTE(review): tdiff == 0 is not guarded here.
+        if(geoD[nodeIndex] == GEO_FLUID)
+        {
+            rhoD[nodeIndex]    =   RHO   / tdiff;
+            vxD[nodeIndex]     =   VX    / tdiff;
+            vyD[nodeIndex]     =   VY    / tdiff;
+            vzD[nodeIndex]     =   VZ    / tdiff;
+            pressD[nodeIndex]  =   PRESS / tdiff;    
+        }
+    }
 }
 ////////////////////////////////////////////////////////////////////////////////
 
@@ -1241,34 +965,29 @@ __global__ void LBCalcMacMedSP27( real* vxD,
 
 ////////////////////////////////////////////////////////////////////////////////
 __global__ void LBResetMedianValuesSP27(
-	real* vxD,
-	real* vyD,
-	real* vzD,
-	real* rhoD,
-	real* pressD,
-	unsigned int size_Mat,
-	bool isEvenTimestep)
+    real* vxD,
+    real* vyD,
+    real* vzD,
+    real* rhoD,
+    real* pressD,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
 {
-	////////////////////////////////////////////////////////////////////////////////
-	const unsigned  x = threadIdx.x;  // Globaler x-Index 
-	const unsigned  y = blockIdx.x;   // Globaler y-Index 
-	const unsigned  z = blockIdx.y;   // Globaler z-Index 
-
-	const unsigned nx = blockDim.x;
-	const unsigned ny = gridDim.x;
-
-	const unsigned k = nx*(ny*z + y) + x;
-	//////////////////////////////////////////////////////////////////////////
-
-	if (k<size_Mat)
-	{
-		//////////////////////////////////////////////////////////////////////////
-		pressD[k] = c0o1;
-		rhoD[k] = c0o1;
-		vxD[k] = c0o1;
-		vyD[k] = c0o1;
-		vzD[k] = c0o1;
-	}
+    //! Per node: overwrite vxD, vyD, vzD, rhoD and pressD with c0o1 (presumably zero - TODO confirm).
+    //! - Get node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
+    //!   NOTE(review): isEvenTimestep is not read in this kernel body.
+    const unsigned nodeIndex = getNodeIndex();
+
+    //! - Only indices below numberOfLBnodes are processed; all other threads do nothing.
+    if ( nodeIndex < numberOfLBnodes )
+    {
+        //! - Reset the five per-node values; no geometry check is made, so every node in range is reset.
+        pressD[nodeIndex] = c0o1;
+        rhoD[nodeIndex] = c0o1;
+        vxD[nodeIndex] = c0o1;
+        vyD[nodeIndex] = c0o1;
+        vzD[nodeIndex] = c0o1;
+    }
 }
 ////////////////////////////////////////////////////////////////////////////////
 
@@ -1294,36 +1013,30 @@ __global__ void LBResetMedianValuesSP27(
 
 ////////////////////////////////////////////////////////////////////////////////
 __global__ void LBResetMedianValuesAD27(
-	real* vxD,
-	real* vyD,
-	real* vzD,
-	real* rhoD,
-	real* pressD,
-	real* concD,
-	unsigned int size_Mat,
-	bool isEvenTimestep)
+    real* vxD,
+    real* vyD,
+    real* vzD,
+    real* rhoD,
+    real* pressD,
+    real* concD,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
 {
-	////////////////////////////////////////////////////////////////////////////////
-	const unsigned  x = threadIdx.x;  // Globaler x-Index 
-	const unsigned  y = blockIdx.x;   // Globaler y-Index 
-	const unsigned  z = blockIdx.y;   // Globaler z-Index 
-
-	const unsigned nx = blockDim.x;
-	const unsigned ny = gridDim.x;
-
-	const unsigned k = nx*(ny*z + y) + x;
-	//////////////////////////////////////////////////////////////////////////
-
-	if (k < size_Mat)
-	{
-		//////////////////////////////////////////////////////////////////////////
-		concD[k]  = c0o1;
-		pressD[k] = c0o1;
-		rhoD[k]   = c0o1;
-		vxD[k]    = c0o1;
-		vyD[k]    = c0o1;
-		vzD[k]    = c0o1;
-	}
+    //! Per node: overwrite concD, pressD, rhoD, vxD, vyD and vzD with c0o1 (presumably zero - TODO confirm).
+    //! - Get node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
+    //!   NOTE(review): isEvenTimestep is not read in this kernel body.
+    const unsigned nodeIndex = getNodeIndex();
+
+    //! - Only indices below numberOfLBnodes are processed; no geometry check is made inside the range.
+    if (nodeIndex < numberOfLBnodes)
+    {
+        concD[nodeIndex]  = c0o1;
+        pressD[nodeIndex] = c0o1;
+        rhoD[nodeIndex]   = c0o1;
+        vxD[nodeIndex]    = c0o1;
+        vyD[nodeIndex]    = c0o1;
+        vzD[nodeIndex]    = c0o1;
+    }
 }
 ////////////////////////////////////////////////////////////////////////////////
 
@@ -1348,177 +1061,121 @@ __global__ void LBResetMedianValuesAD27(
 
 
 ////////////////////////////////////////////////////////////////////////////////
-__global__ void LBCalcMeasurePoints( real* vxMP,
-												real* vyMP,
-												real* vzMP,
-												real* rhoMP,
-												unsigned int* kMP,
-												unsigned int numberOfPointskMP,
-												unsigned int MPClockCycle,
-												unsigned int t,
-												unsigned int* geoD,
-												unsigned int* neighborX,
-												unsigned int* neighborY,
-												unsigned int* neighborZ,
-												unsigned int size_Mat,
-												real* DD,
-												bool isEvenTimestep)
+__global__ void LBCalcMeasurePoints(
+    real* vxMP,
+    real* vyMP,
+    real* vzMP,
+    real* rhoMP,
+    unsigned int* kMP,
+    unsigned int numberOfPointskMP,
+    unsigned int MPClockCycle,
+    unsigned int t,
+    unsigned int* geoD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    real* distributions,
+    bool isEvenTimestep)
 {
-	Distributions27 D;
-	if (isEvenTimestep==true)
-	{
-		D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-		D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-		D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-		D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-		D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-		D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-		D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-		D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-		D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-		D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-		D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-		D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-		D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-		D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-		D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-		D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-		D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-		D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-		D.f[DIR_000] = &DD[DIR_000*size_Mat];
-		D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-		D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-		D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-		D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-		D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-		D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-		D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-		D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
-	} 
-	else
-	{
-		D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-		D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-		D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-		D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-		D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-		D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-		D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-		D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-		D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-		D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-		D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-		D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-		D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-		D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-		D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-		D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-		D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-		D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-		D.f[DIR_000] = &DD[DIR_000*size_Mat];
-		D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-		D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-		D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-		D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-		D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-		D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-		D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-		D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
-	}
-	////////////////////////////////////////////////////////////////////////////////
-	const unsigned  x = threadIdx.x;  // Globaler x-Index 
-	const unsigned  y = blockIdx.x;   // Globaler y-Index 
-	const unsigned  z = blockIdx.y;   // Globaler z-Index 
-
-	const unsigned nx = blockDim.x;
-	const unsigned ny = gridDim.x;
-
-	const unsigned k = nx*(ny*z + y) + x;
-	//////////////////////////////////////////////////////////////////////////
-
-	if(k<numberOfPointskMP)
-	{
-      //////////////////////////////////////////////////////////////////////////
-      //index
-      unsigned int kzero= kMP[k];//k;
-      unsigned int ke   = kzero;
-      unsigned int kw   = neighborX[kzero];
-      unsigned int kn   = kzero;
-      unsigned int ks   = neighborY[kzero];
-      unsigned int kt   = kzero;
-      unsigned int kb   = neighborZ[kzero];
-      unsigned int ksw  = neighborY[kw];
-      unsigned int kne  = kzero;
-      unsigned int kse  = ks;
-      unsigned int knw  = kw;
-      unsigned int kbw  = neighborZ[kw];
-      unsigned int kte  = kzero;
-      unsigned int kbe  = kb;
-      unsigned int ktw  = kw;
-      unsigned int kbs  = neighborZ[ks];
-      unsigned int ktn  = kzero;
-      unsigned int kbn  = kb;
-      unsigned int kts  = ks;
-      unsigned int ktse = ks;
-      unsigned int kbnw = kbw;
-      unsigned int ktnw = kw;
-      unsigned int kbse = kbs;
-      unsigned int ktsw = ksw;
-      unsigned int kbne = kb;
-      unsigned int ktne = kzero;
-      unsigned int kbsw = neighborZ[ksw];
-      //////////////////////////////////////////////////////////////////////////
-	  unsigned int kMac = k*MPClockCycle + t;
-	  //////////////////////////////////////////////////////////////////////////
-
-      if(geoD[kzero] == GEO_FLUID)
-      {
-         rhoMP[kMac]=   (D.f[DIR_P00   ])[ke  ]+ (D.f[DIR_M00   ])[kw  ]+ 
-                        (D.f[DIR_0P0   ])[kn  ]+ (D.f[DIR_0M0   ])[ks  ]+
-                        (D.f[DIR_00P   ])[kt  ]+ (D.f[DIR_00M   ])[kb  ]+
-                        (D.f[DIR_PP0  ])[kne ]+ (D.f[DIR_MM0  ])[ksw ]+
-                        (D.f[DIR_PM0  ])[kse ]+ (D.f[DIR_MP0  ])[knw ]+
-                        (D.f[DIR_P0P  ])[kte ]+ (D.f[DIR_M0M  ])[kbw ]+
-                        (D.f[DIR_P0M  ])[kbe ]+ (D.f[DIR_M0P  ])[ktw ]+
-                        (D.f[DIR_0PP  ])[ktn ]+ (D.f[DIR_0MM  ])[kbs ]+
-                        (D.f[DIR_0PM  ])[kbn ]+ (D.f[DIR_0MP  ])[kts ]+
-                        (D.f[DIR_000])[kzero]+ 
-                        (D.f[DIR_PPP ])[ktne]+ (D.f[DIR_MMP ])[ktsw]+ 
-                        (D.f[DIR_PMP ])[ktse]+ (D.f[DIR_MPP ])[ktnw]+ 
-                        (D.f[DIR_PPM ])[kbne]+ (D.f[DIR_MMM ])[kbsw]+ 
-                        (D.f[DIR_PMM ])[kbse]+ (D.f[DIR_MPM ])[kbnw];
-
-         vxMP[kMac] =   (D.f[DIR_P00   ])[ke  ]- (D.f[DIR_M00   ])[kw  ]+ 
-                        (D.f[DIR_PP0  ])[kne ]- (D.f[DIR_MM0  ])[ksw ]+
-                        (D.f[DIR_PM0  ])[kse ]- (D.f[DIR_MP0  ])[knw ]+
-                        (D.f[DIR_P0P  ])[kte ]- (D.f[DIR_M0M  ])[kbw ]+
-                        (D.f[DIR_P0M  ])[kbe ]- (D.f[DIR_M0P  ])[ktw ]+
-                        (D.f[DIR_PPP ])[ktne]- (D.f[DIR_MMP ])[ktsw]+ 
-                        (D.f[DIR_PMP ])[ktse]- (D.f[DIR_MPP ])[ktnw]+ 
-                        (D.f[DIR_PPM ])[kbne]- (D.f[DIR_MMM ])[kbsw]+ 
-                        (D.f[DIR_PMM ])[kbse]- (D.f[DIR_MPM ])[kbnw];
-
-         vyMP[kMac] =   (D.f[DIR_0P0   ])[kn  ]- (D.f[DIR_0M0   ])[ks  ]+
-                        (D.f[DIR_PP0  ])[kne ]- (D.f[DIR_MM0  ])[ksw ]-
-                        (D.f[DIR_PM0  ])[kse ]+ (D.f[DIR_MP0  ])[knw ]+
-                        (D.f[DIR_0PP  ])[ktn ]- (D.f[DIR_0MM  ])[kbs ]+
-                        (D.f[DIR_0PM  ])[kbn ]- (D.f[DIR_0MP  ])[kts ]+
-                        (D.f[DIR_PPP ])[ktne]- (D.f[DIR_MMP ])[ktsw]- 
-                        (D.f[DIR_PMP ])[ktse]+ (D.f[DIR_MPP ])[ktnw]+ 
-                        (D.f[DIR_PPM ])[kbne]- (D.f[DIR_MMM ])[kbsw]- 
-                        (D.f[DIR_PMM ])[kbse]+ (D.f[DIR_MPM ])[kbnw];
-
-         vzMP[kMac] =   (D.f[DIR_00P   ])[kt  ]- (D.f[DIR_00M   ])[kb  ]+
-                        (D.f[DIR_P0P  ])[kte ]- (D.f[DIR_M0M  ])[kbw ]-
-                        (D.f[DIR_P0M  ])[kbe ]+ (D.f[DIR_M0P  ])[ktw ]+
-                        (D.f[DIR_0PP  ])[ktn ]- (D.f[DIR_0MM  ])[kbs ]-
-                        (D.f[DIR_0PM  ])[kbn ]+ (D.f[DIR_0MP  ])[kts ]+
-                        (D.f[DIR_PPP ])[ktne]+ (D.f[DIR_MMP ])[ktsw]+ 
-                        (D.f[DIR_PMP ])[ktse]+ (D.f[DIR_MPP ])[ktnw]- 
-                        (D.f[DIR_PPM ])[kbne]- (D.f[DIR_MMM ])[kbsw]- 
-                        (D.f[DIR_PMM ])[kbse]- (D.f[DIR_MPM ])[kbnw];
-      }
-   }
+    ////////////////////////////////////////////////////////////////////////////////
+    //! - Get node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
+    //!   Here the index enumerates measure points (it is compared against numberOfPointskMP), not lattice nodes.
+    const unsigned nodeIndex = getNodeIndex();
+
+    //////////////////////////////////////////////////////////////////////////
+    if( nodeIndex < numberOfPointskMP )
+    {
+        //////////////////////////////////////////////////////////////////////////
+        //! - Read distributions: style of reading and writing the distributions from/to stored arrays dependent on
+        //! timestep is based on the esoteric twist algorithm \ref <a
+        //! href="https://doi.org/10.3390/computation5020019"><b>[ M. Geier et al. (2017),
+        //! DOI:10.3390/computation5020019 ]</b></a>
+        //!
+        Distributions27 dist;
+        getPointersToDistributions(dist, distributions, numberOfLBnodes, isEvenTimestep);
+
+        //////////////////////////////////////////////////////////////////////////
+        //! - Indices of the measure point's node and of the neighbor nodes from which its distributions are read
+        unsigned int kzero= kMP[nodeIndex]; // lattice node of this measure point
+        unsigned int ke   = kzero;
+        unsigned int kw   = neighborX[kzero];
+        unsigned int kn   = kzero;
+        unsigned int ks   = neighborY[kzero];
+        unsigned int kt   = kzero;
+        unsigned int kb   = neighborZ[kzero];
+        unsigned int ksw  = neighborY[kw];
+        unsigned int kne  = kzero;
+        unsigned int kse  = ks;
+        unsigned int knw  = kw;
+        unsigned int kbw  = neighborZ[kw];
+        unsigned int kte  = kzero;
+        unsigned int kbe  = kb;
+        unsigned int ktw  = kw;
+        unsigned int kbs  = neighborZ[ks];
+        unsigned int ktn  = kzero;
+        unsigned int kbn  = kb;
+        unsigned int kts  = ks;
+        unsigned int ktse = ks;
+        unsigned int kbnw = kbw;
+        unsigned int ktnw = kw;
+        unsigned int kbse = kbs;
+        unsigned int ktsw = ksw;
+        unsigned int kbne = kb;
+        unsigned int ktne = kzero;
+        unsigned int kbsw = neighborZ[ksw];
+        //! - Output slot: entry t within the run of MPClockCycle entries that belongs to this measure point
+        unsigned int kMac = nodeIndex*MPClockCycle + t;
+        //!   NOTE(review): assumes t < MPClockCycle, otherwise slots of different points overlap - confirm at call site
+        //! - Only nodes flagged GEO_FLUID are evaluated; for any other node type nothing is written.
+        if(geoD[kzero] == GEO_FLUID)
+        {
+            rhoMP[kMac]= (dist.f[DIR_P00])[ke  ]+ (dist.f[DIR_M00])[kw  ]+
+                         (dist.f[DIR_0P0])[kn  ]+ (dist.f[DIR_0M0])[ks  ]+
+                         (dist.f[DIR_00P])[kt  ]+ (dist.f[DIR_00M])[kb  ]+
+                         (dist.f[DIR_PP0])[kne ]+ (dist.f[DIR_MM0])[ksw ]+
+                         (dist.f[DIR_PM0])[kse ]+ (dist.f[DIR_MP0])[knw ]+
+                         (dist.f[DIR_P0P])[kte ]+ (dist.f[DIR_M0M])[kbw ]+
+                         (dist.f[DIR_P0M])[kbe ]+ (dist.f[DIR_M0P])[ktw ]+
+                         (dist.f[DIR_0PP])[ktn ]+ (dist.f[DIR_0MM])[kbs ]+
+                         (dist.f[DIR_0PM])[kbn ]+ (dist.f[DIR_0MP])[kts ]+
+                         (dist.f[DIR_000])[kzero]+
+                         (dist.f[DIR_PPP])[ktne]+ (dist.f[DIR_MMP])[ktsw]+
+                         (dist.f[DIR_PMP])[ktse]+ (dist.f[DIR_MPP])[ktnw]+
+                         (dist.f[DIR_PPM])[kbne]+ (dist.f[DIR_MMM])[kbsw]+
+                         (dist.f[DIR_PMM])[kbse]+ (dist.f[DIR_MPM])[kbnw]; // plain sum of all 27 distributions
+            //! - vxMP: signed sum over the 18 directions DIR_P** (added) and DIR_M** (subtracted); no division by rho
+            vxMP[kMac] = (dist.f[DIR_P00])[ke  ]- (dist.f[DIR_M00])[kw  ]+
+                         (dist.f[DIR_PP0])[kne ]- (dist.f[DIR_MM0])[ksw ]+
+                         (dist.f[DIR_PM0])[kse ]- (dist.f[DIR_MP0])[knw ]+
+                         (dist.f[DIR_P0P])[kte ]- (dist.f[DIR_M0M])[kbw ]+
+                         (dist.f[DIR_P0M])[kbe ]- (dist.f[DIR_M0P])[ktw ]+
+                         (dist.f[DIR_PPP])[ktne]- (dist.f[DIR_MMP])[ktsw]+
+                         (dist.f[DIR_PMP])[ktse]- (dist.f[DIR_MPP])[ktnw]+
+                         (dist.f[DIR_PPM])[kbne]- (dist.f[DIR_MMM])[kbsw]+
+                         (dist.f[DIR_PMM])[kbse]- (dist.f[DIR_MPM])[kbnw];
+            //! - vyMP: signed sum over the 18 directions DIR_*P* (added) and DIR_*M* (subtracted); no division by rho
+            vyMP[kMac] = (dist.f[DIR_0P0])[kn  ]- (dist.f[DIR_0M0])[ks  ]+
+                         (dist.f[DIR_PP0])[kne ]- (dist.f[DIR_MM0])[ksw ]-
+                         (dist.f[DIR_PM0])[kse ]+ (dist.f[DIR_MP0])[knw ]+
+                         (dist.f[DIR_0PP])[ktn ]- (dist.f[DIR_0MM])[kbs ]+
+                         (dist.f[DIR_0PM])[kbn ]- (dist.f[DIR_0MP])[kts ]+
+                         (dist.f[DIR_PPP])[ktne]- (dist.f[DIR_MMP])[ktsw]-
+                         (dist.f[DIR_PMP])[ktse]+ (dist.f[DIR_MPP])[ktnw]+
+                         (dist.f[DIR_PPM])[kbne]- (dist.f[DIR_MMM])[kbsw]-
+                         (dist.f[DIR_PMM])[kbse]+ (dist.f[DIR_MPM])[kbnw];
+            //! - vzMP: signed sum over the 18 directions DIR_**P (added) and DIR_**M (subtracted); no division by rho
+            vzMP[kMac] = (dist.f[DIR_00P])[kt  ]- (dist.f[DIR_00M])[kb  ]+
+                         (dist.f[DIR_P0P])[kte ]- (dist.f[DIR_M0M])[kbw ]-
+                         (dist.f[DIR_P0M])[kbe ]+ (dist.f[DIR_M0P])[ktw ]+
+                         (dist.f[DIR_0PP])[ktn ]- (dist.f[DIR_0MM])[kbs ]-
+                         (dist.f[DIR_0PM])[kbn ]+ (dist.f[DIR_0MP])[kts ]+
+                         (dist.f[DIR_PPP])[ktne]+ (dist.f[DIR_MMP])[ktsw]+
+                         (dist.f[DIR_PMP])[ktse]+ (dist.f[DIR_MPP])[ktnw]-
+                         (dist.f[DIR_PPM])[kbne]- (dist.f[DIR_MMM])[kbsw]-
+                         (dist.f[DIR_PMM])[kbse]- (dist.f[DIR_MPM])[kbnw];
+        }
+    }
 }
 ////////////////////////////////////////////////////////////////////////////////
 
@@ -1559,40 +1216,36 @@ __global__ void LBCalcMeasurePoints( real* vxMP,
 
 
 ////////////////////////////////////////////////////////////////////////////////
-__global__ void LBSetOutputWallVelocitySP27( real* vxD,
-														real* vyD,
-														real* vzD,
-														real* vxWall,
-														real* vyWall,
-														real* vzWall,
-														int numberOfWallNodes, 
-														int* kWallNodes, 
-														real* rhoD,
-														real* pressD,
-														unsigned int* geoD,
-														unsigned int* neighborX,
-														unsigned int* neighborY,
-														unsigned int* neighborZ,
-														unsigned int size_Mat,
-														real* DD,
-														bool isEvenTimestep)
+__global__ void LBSetOutputWallVelocitySP27(
+    real* vxD,
+    real* vyD,
+    real* vzD,
+    real* vxWall,
+    real* vyWall,
+    real* vzWall,
+    int numberOfWallNodes, 
+    int* kWallNodes, 
+    real* rhoD,
+    real* pressD,
+    unsigned int* geoD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    real* DD,
+    bool isEvenTimestep)
 {
    ////////////////////////////////////////////////////////////////////////////////
-   const unsigned  x = threadIdx.x;  // Globaler x-Index 
-   const unsigned  y = blockIdx.x;   // Globaler y-Index 
-   const unsigned  z = blockIdx.y;   // Globaler z-Index 
+   //! - Get node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
+   //!
+   const unsigned nodeIndex = getNodeIndex();
 
-   const unsigned nx = blockDim.x;
-   const unsigned ny = gridDim.x;
-
-   const unsigned k = nx*(ny*z + y) + x;
    //////////////////////////////////////////////////////////////////////////
-
-   if(k<numberOfWallNodes)
+   if(nodeIndex<numberOfWallNodes)
    {
       //////////////////////////////////////////////////////////////////////////
       //index
-      unsigned int KWN  = kWallNodes[k];
+      unsigned int KWN  = kWallNodes[nodeIndex];
       //////////////////////////////////////////////////////////////////////////
       vxD[KWN] = 0.0;//vxWall[k];
       vyD[KWN] = 0.0;//vyWall[k];
diff --git a/src/gpu/VirtualFluids_GPU/GPU/Cascade27.cu b/src/gpu/VirtualFluids_GPU/GPU/Cascade27.cu
index a79588421a624cae62ec32127739efb47bb7b2ef..457623d4ee62b624248306b6b900fcff3f026286 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/Cascade27.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/Cascade27.cu
@@ -15,7 +15,7 @@ __global__ void LB_Kernel_Cascade_SP_27(     real omega,
 														unsigned int* neighborY,
 														unsigned int* neighborZ,
 														real* DDStart,
-														int size_Mat,
+														unsigned long long numberOfLBnodes,
 														bool EvenOrOdd)
 {
 	////////////////////////////////////////////////////////////////////////////////
@@ -29,7 +29,7 @@ __global__ void LB_Kernel_Cascade_SP_27(     real omega,
 	const unsigned k = nx*(ny*z + y) + x;
 	//////////////////////////////////////////////////////////////////////////
 
-	if(k<size_Mat)
+	if(k<numberOfLBnodes)
 	{
 		////////////////////////////////////////////////////////////////////////////////
 		unsigned int BC;
@@ -40,63 +40,63 @@ __global__ void LB_Kernel_Cascade_SP_27(     real omega,
 			Distributions27 D;
 			if (EvenOrOdd==true)
 			{
-				D.f[DIR_P00   ] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_M00   ] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0P0   ] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0M0   ] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00P   ] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00M   ] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_PP0  ] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_MM0  ] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_PM0  ] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_MP0  ] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_P0P  ] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_M0M  ] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_P0M  ] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_M0P  ] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0PP  ] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0MM  ] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0PM  ] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0MP  ] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_PPP ] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_MMP ] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_PMP ] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_MPP ] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_PPM ] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_MMM ] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_PMM ] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_MPM ] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_P00 * numberOfLBnodes];
+				D.f[DIR_M00] = &DDStart[DIR_M00 * numberOfLBnodes];
+				D.f[DIR_0P0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+				D.f[DIR_0M0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+				D.f[DIR_00P] = &DDStart[DIR_00P * numberOfLBnodes];
+				D.f[DIR_00M] = &DDStart[DIR_00M * numberOfLBnodes];
+				D.f[DIR_PP0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+				D.f[DIR_MM0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+				D.f[DIR_PM0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+				D.f[DIR_MP0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+				D.f[DIR_P0P] = &DDStart[DIR_P0P * numberOfLBnodes];
+				D.f[DIR_M0M] = &DDStart[DIR_M0M * numberOfLBnodes];
+				D.f[DIR_P0M] = &DDStart[DIR_P0M * numberOfLBnodes];
+				D.f[DIR_M0P] = &DDStart[DIR_M0P * numberOfLBnodes];
+				D.f[DIR_0PP] = &DDStart[DIR_0PP * numberOfLBnodes];
+				D.f[DIR_0MM] = &DDStart[DIR_0MM * numberOfLBnodes];
+				D.f[DIR_0PM] = &DDStart[DIR_0PM * numberOfLBnodes];
+				D.f[DIR_0MP] = &DDStart[DIR_0MP * numberOfLBnodes];
+				D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+				D.f[DIR_PPP] = &DDStart[DIR_PPP * numberOfLBnodes];
+				D.f[DIR_MMP] = &DDStart[DIR_MMP * numberOfLBnodes];
+				D.f[DIR_PMP] = &DDStart[DIR_PMP * numberOfLBnodes];
+				D.f[DIR_MPP] = &DDStart[DIR_MPP * numberOfLBnodes];
+				D.f[DIR_PPM] = &DDStart[DIR_PPM * numberOfLBnodes];
+				D.f[DIR_MMM] = &DDStart[DIR_MMM * numberOfLBnodes];
+				D.f[DIR_PMM] = &DDStart[DIR_PMM * numberOfLBnodes];
+				D.f[DIR_MPM] = &DDStart[DIR_MPM * numberOfLBnodes];
 			}
 			else
 			{
-				D.f[DIR_M00   ] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_P00   ] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0M0   ] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0P0   ] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00M   ] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00P   ] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_MM0  ] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_PP0  ] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_MP0  ] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_PM0  ] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_M0M  ] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_P0P  ] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_M0P  ] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_P0M  ] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0MM  ] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0PP  ] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0MP  ] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0PM  ] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_MMM ] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_PPM ] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_MPM ] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_PMM ] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_MMP ] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_PPP ] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_MPP ] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_PMP ] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_P00 * numberOfLBnodes];
+				D.f[DIR_P00] = &DDStart[DIR_M00 * numberOfLBnodes];
+				D.f[DIR_0M0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+				D.f[DIR_0P0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+				D.f[DIR_00M] = &DDStart[DIR_00P * numberOfLBnodes];
+				D.f[DIR_00P] = &DDStart[DIR_00M * numberOfLBnodes];
+				D.f[DIR_MM0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+				D.f[DIR_PP0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+				D.f[DIR_MP0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+				D.f[DIR_PM0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+				D.f[DIR_M0M] = &DDStart[DIR_P0P * numberOfLBnodes];
+				D.f[DIR_P0P] = &DDStart[DIR_M0M * numberOfLBnodes];
+				D.f[DIR_M0P] = &DDStart[DIR_P0M * numberOfLBnodes];
+				D.f[DIR_P0M] = &DDStart[DIR_M0P * numberOfLBnodes];
+				D.f[DIR_0MM] = &DDStart[DIR_0PP * numberOfLBnodes];
+				D.f[DIR_0PP] = &DDStart[DIR_0MM * numberOfLBnodes];
+				D.f[DIR_0MP] = &DDStart[DIR_0PM * numberOfLBnodes];
+				D.f[DIR_0PM] = &DDStart[DIR_0MP * numberOfLBnodes];
+				D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+				D.f[DIR_MMM] = &DDStart[DIR_PPP * numberOfLBnodes];
+				D.f[DIR_PPM] = &DDStart[DIR_MMP * numberOfLBnodes];
+				D.f[DIR_MPM] = &DDStart[DIR_PMP * numberOfLBnodes];
+				D.f[DIR_PMM] = &DDStart[DIR_MPP * numberOfLBnodes];
+				D.f[DIR_MMP] = &DDStart[DIR_PPM * numberOfLBnodes];
+				D.f[DIR_PPP] = &DDStart[DIR_MMM * numberOfLBnodes];
+				D.f[DIR_MPP] = &DDStart[DIR_PMM * numberOfLBnodes];
+				D.f[DIR_PMP] = &DDStart[DIR_MPM * numberOfLBnodes];
 			}
 
 			////////////////////////////////////////////////////////////////////////////////
@@ -129,33 +129,33 @@ __global__ void LB_Kernel_Cascade_SP_27(     real omega,
 			//unsigned int ktne = k;
 			unsigned int kbsw = neighborZ[ksw];
 			//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-			real mfcbb = (D.f[DIR_P00   ])[k  ];//[ke   ];// +  c2over27 ;(D.f[DIR_P00   ])[k  ];//ke
-			real mfabb = (D.f[DIR_M00   ])[kw ];//[kw   ];// +  c2over27 ;(D.f[DIR_M00   ])[kw ];
-			real mfbcb = (D.f[DIR_0P0   ])[k  ];//[kn   ];// +  c2over27 ;(D.f[DIR_0P0   ])[k  ];//kn
-			real mfbab = (D.f[DIR_0M0   ])[ks ];//[ks   ];// +  c2over27 ;(D.f[DIR_0M0   ])[ks ];
-			real mfbbc = (D.f[DIR_00P   ])[k  ];//[kt   ];// +  c2over27 ;(D.f[DIR_00P   ])[k  ];//kt
-			real mfbba = (D.f[DIR_00M   ])[kb ];//[kb   ];// +  c2over27 ;(D.f[DIR_00M   ])[kb ];
-			real mfccb = (D.f[DIR_PP0  ])[k  ];//[kne  ];// +  c1over54 ;(D.f[DIR_PP0  ])[k  ];//kne
-			real mfaab = (D.f[DIR_MM0  ])[ksw];//[ksw  ];// +  c1over54 ;(D.f[DIR_MM0  ])[ksw];
-			real mfcab = (D.f[DIR_PM0  ])[ks ];//[kse  ];// +  c1over54 ;(D.f[DIR_PM0  ])[ks ];//kse
-			real mfacb = (D.f[DIR_MP0  ])[kw ];//[knw  ];// +  c1over54 ;(D.f[DIR_MP0  ])[kw ];//knw
-			real mfcbc = (D.f[DIR_P0P  ])[k  ];//[kte  ];// +  c1over54 ;(D.f[DIR_P0P  ])[k  ];//kte
-			real mfaba = (D.f[DIR_M0M  ])[kbw];//[kbw  ];// +  c1over54 ;(D.f[DIR_M0M  ])[kbw];
-			real mfcba = (D.f[DIR_P0M  ])[kb ];//[kbe  ];// +  c1over54 ;(D.f[DIR_P0M  ])[kb ];//kbe
-			real mfabc = (D.f[DIR_M0P  ])[kw ];//[ktw  ];// +  c1over54 ;(D.f[DIR_M0P  ])[kw ];//ktw
-			real mfbcc = (D.f[DIR_0PP  ])[k  ];//[ktn  ];// +  c1over54 ;(D.f[DIR_0PP  ])[k  ];//ktn
-			real mfbaa = (D.f[DIR_0MM  ])[kbs];//[kbs  ];// +  c1over54 ;(D.f[DIR_0MM  ])[kbs];
-			real mfbca = (D.f[DIR_0PM  ])[kb ];//[kbn  ];// +  c1over54 ;(D.f[DIR_0PM  ])[kb ];//kbn
-			real mfbac = (D.f[DIR_0MP  ])[ks ];//[kts  ];// +  c1over54 ;(D.f[DIR_0MP  ])[ks ];//kts
+			real mfcbb = (D.f[DIR_P00])[k  ];//[ke   ];// +  c2over27 ;(D.f[DIR_P00])[k  ];//ke
+			real mfabb = (D.f[DIR_M00])[kw ];//[kw   ];// +  c2over27 ;(D.f[DIR_M00])[kw ];
+			real mfbcb = (D.f[DIR_0P0])[k  ];//[kn   ];// +  c2over27 ;(D.f[DIR_0P0])[k  ];//kn
+			real mfbab = (D.f[DIR_0M0])[ks ];//[ks   ];// +  c2over27 ;(D.f[DIR_0M0])[ks ];
+			real mfbbc = (D.f[DIR_00P])[k  ];//[kt   ];// +  c2over27 ;(D.f[DIR_00P])[k  ];//kt
+			real mfbba = (D.f[DIR_00M])[kb ];//[kb   ];// +  c2over27 ;(D.f[DIR_00M])[kb ];
+			real mfccb = (D.f[DIR_PP0])[k  ];//[kne  ];// +  c1over54 ;(D.f[DIR_PP0])[k  ];//kne
+			real mfaab = (D.f[DIR_MM0])[ksw];//[ksw  ];// +  c1over54 ;(D.f[DIR_MM0])[ksw];
+			real mfcab = (D.f[DIR_PM0])[ks ];//[kse  ];// +  c1over54 ;(D.f[DIR_PM0])[ks ];//kse
+			real mfacb = (D.f[DIR_MP0])[kw ];//[knw  ];// +  c1over54 ;(D.f[DIR_MP0])[kw ];//knw
+			real mfcbc = (D.f[DIR_P0P])[k  ];//[kte  ];// +  c1over54 ;(D.f[DIR_P0P])[k  ];//kte
+			real mfaba = (D.f[DIR_M0M])[kbw];//[kbw  ];// +  c1over54 ;(D.f[DIR_M0M])[kbw];
+			real mfcba = (D.f[DIR_P0M])[kb ];//[kbe  ];// +  c1over54 ;(D.f[DIR_P0M])[kb ];//kbe
+			real mfabc = (D.f[DIR_M0P])[kw ];//[ktw  ];// +  c1over54 ;(D.f[DIR_M0P])[kw ];//ktw
+			real mfbcc = (D.f[DIR_0PP])[k  ];//[ktn  ];// +  c1over54 ;(D.f[DIR_0PP])[k  ];//ktn
+			real mfbaa = (D.f[DIR_0MM])[kbs];//[kbs  ];// +  c1over54 ;(D.f[DIR_0MM])[kbs];
+			real mfbca = (D.f[DIR_0PM])[kb ];//[kbn  ];// +  c1over54 ;(D.f[DIR_0PM])[kb ];//kbn
+			real mfbac = (D.f[DIR_0MP])[ks ];//[kts  ];// +  c1over54 ;(D.f[DIR_0MP])[ks ];//kts
 			real mfbbb = (D.f[DIR_000])[k  ];//[kzero];// +  c8over27 ;(D.f[DIR_000])[k  ];//kzero
-			real mfccc = (D.f[DIR_PPP ])[k  ];//[ktne ];// +  c1over216;(D.f[DIR_PPP ])[k  ];//ktne
-			real mfaac = (D.f[DIR_MMP ])[ksw];//[ktsw ];// +  c1over216;(D.f[DIR_MMP ])[ksw];//ktsw
-			real mfcac = (D.f[DIR_PMP ])[ks ];//[ktse ];// +  c1over216;(D.f[DIR_PMP ])[ks ];//ktse
-			real mfacc = (D.f[DIR_MPP ])[kw ];//[ktnw ];// +  c1over216;(D.f[DIR_MPP ])[kw ];//ktnw
-			real mfcca = (D.f[DIR_PPM ])[kb ];//[kbne ];// +  c1over216;(D.f[DIR_PPM ])[kb ];//kbne
-			real mfaaa = (D.f[DIR_MMM ])[kbsw];//[kbsw ];// +  c1over216;(D.f[DIR_MMM ])[kbsw];
-			real mfcaa = (D.f[DIR_PMM ])[kbs];//[kbse ];// +  c1over216;(D.f[DIR_PMM ])[kbs];//kbse
-			real mfaca = (D.f[DIR_MPM ])[kbw];//[kbnw ];// +  c1over216;(D.f[DIR_MPM ])[kbw];//kbnw
+			real mfccc = (D.f[DIR_PPP])[k  ];//[ktne ];// +  c1over216;(D.f[DIR_PPP])[k  ];//ktne
+			real mfaac = (D.f[DIR_MMP])[ksw];//[ktsw ];// +  c1over216;(D.f[DIR_MMP])[ksw];//ktsw
+			real mfcac = (D.f[DIR_PMP])[ks ];//[ktse ];// +  c1over216;(D.f[DIR_PMP])[ks ];//ktse
+			real mfacc = (D.f[DIR_MPP])[kw ];//[ktnw ];// +  c1over216;(D.f[DIR_MPP])[kw ];//ktnw
+			real mfcca = (D.f[DIR_PPM])[kb ];//[kbne ];// +  c1over216;(D.f[DIR_PPM])[kb ];//kbne
+			real mfaaa = (D.f[DIR_MMM])[kbsw];//[kbsw ];// +  c1over216;(D.f[DIR_MMM])[kbsw];
+			real mfcaa = (D.f[DIR_PMM])[kbs];//[kbse ];// +  c1over216;(D.f[DIR_PMM])[kbs];//kbse
+			real mfaca = (D.f[DIR_MPM])[kbw];//[kbnw ];// +  c1over216;(D.f[DIR_MPM])[kbw];//kbnw
 			////////////////////////////////////////////////////////////////////////////////////
 			//slow
 			//real oMdrho = one - ((((mfccc+mfaaa) + (mfaca+mfcac)) + ((mfacc+mfcaa) + (mfaac+mfcca))) + 
@@ -842,7 +842,7 @@ __global__ void LB_Kernel_Casc_Comp_SP_27(      real omega,
 														   unsigned int* neighborY,
 														   unsigned int* neighborZ,
 														   real* DDStart,
-														   int size_Mat,
+														   unsigned long long numberOfLBnodes,
 														   bool EvenOrOdd)
 {
    ////////////////////////////////////////////////////////////////////////////////
@@ -856,7 +856,7 @@ __global__ void LB_Kernel_Casc_Comp_SP_27(      real omega,
    const unsigned k = nx*(ny*z + y) + x;
    //////////////////////////////////////////////////////////////////////////
 
-   if(k<size_Mat)
+   if(k<numberOfLBnodes)
    {
       ////////////////////////////////////////////////////////////////////////////////
       unsigned int BC;
@@ -867,63 +867,63 @@ __global__ void LB_Kernel_Casc_Comp_SP_27(      real omega,
          Distributions27 D;
          if (EvenOrOdd==true)
          {
-            D.f[DIR_P00   ] = &DDStart[DIR_P00   *size_Mat];
-            D.f[DIR_M00   ] = &DDStart[DIR_M00   *size_Mat];
-            D.f[DIR_0P0   ] = &DDStart[DIR_0P0   *size_Mat];
-            D.f[DIR_0M0   ] = &DDStart[DIR_0M0   *size_Mat];
-            D.f[DIR_00P   ] = &DDStart[DIR_00P   *size_Mat];
-            D.f[DIR_00M   ] = &DDStart[DIR_00M   *size_Mat];
-            D.f[DIR_PP0  ] = &DDStart[DIR_PP0  *size_Mat];
-            D.f[DIR_MM0  ] = &DDStart[DIR_MM0  *size_Mat];
-            D.f[DIR_PM0  ] = &DDStart[DIR_PM0  *size_Mat];
-            D.f[DIR_MP0  ] = &DDStart[DIR_MP0  *size_Mat];
-            D.f[DIR_P0P  ] = &DDStart[DIR_P0P  *size_Mat];
-            D.f[DIR_M0M  ] = &DDStart[DIR_M0M  *size_Mat];
-            D.f[DIR_P0M  ] = &DDStart[DIR_P0M  *size_Mat];
-            D.f[DIR_M0P  ] = &DDStart[DIR_M0P  *size_Mat];
-            D.f[DIR_0PP  ] = &DDStart[DIR_0PP  *size_Mat];
-            D.f[DIR_0MM  ] = &DDStart[DIR_0MM  *size_Mat];
-            D.f[DIR_0PM  ] = &DDStart[DIR_0PM  *size_Mat];
-            D.f[DIR_0MP  ] = &DDStart[DIR_0MP  *size_Mat];
-            D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-            D.f[DIR_PPP ] = &DDStart[DIR_PPP *size_Mat];
-            D.f[DIR_MMP ] = &DDStart[DIR_MMP *size_Mat];
-            D.f[DIR_PMP ] = &DDStart[DIR_PMP *size_Mat];
-            D.f[DIR_MPP ] = &DDStart[DIR_MPP *size_Mat];
-            D.f[DIR_PPM ] = &DDStart[DIR_PPM *size_Mat];
-            D.f[DIR_MMM ] = &DDStart[DIR_MMM *size_Mat];
-            D.f[DIR_PMM ] = &DDStart[DIR_PMM *size_Mat];
-            D.f[DIR_MPM ] = &DDStart[DIR_MPM *size_Mat];
+            D.f[DIR_P00] = &DDStart[DIR_P00 * numberOfLBnodes];
+            D.f[DIR_M00] = &DDStart[DIR_M00 * numberOfLBnodes];
+            D.f[DIR_0P0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+            D.f[DIR_0M0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+            D.f[DIR_00P] = &DDStart[DIR_00P * numberOfLBnodes];
+            D.f[DIR_00M] = &DDStart[DIR_00M * numberOfLBnodes];
+            D.f[DIR_PP0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+            D.f[DIR_MM0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+            D.f[DIR_PM0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+            D.f[DIR_MP0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+            D.f[DIR_P0P] = &DDStart[DIR_P0P * numberOfLBnodes];
+            D.f[DIR_M0M] = &DDStart[DIR_M0M * numberOfLBnodes];
+            D.f[DIR_P0M] = &DDStart[DIR_P0M * numberOfLBnodes];
+            D.f[DIR_M0P] = &DDStart[DIR_M0P * numberOfLBnodes];
+            D.f[DIR_0PP] = &DDStart[DIR_0PP * numberOfLBnodes];
+            D.f[DIR_0MM] = &DDStart[DIR_0MM * numberOfLBnodes];
+            D.f[DIR_0PM] = &DDStart[DIR_0PM * numberOfLBnodes];
+            D.f[DIR_0MP] = &DDStart[DIR_0MP * numberOfLBnodes];
+            D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+            D.f[DIR_PPP] = &DDStart[DIR_PPP * numberOfLBnodes];
+            D.f[DIR_MMP] = &DDStart[DIR_MMP * numberOfLBnodes];
+            D.f[DIR_PMP] = &DDStart[DIR_PMP * numberOfLBnodes];
+            D.f[DIR_MPP] = &DDStart[DIR_MPP * numberOfLBnodes];
+            D.f[DIR_PPM] = &DDStart[DIR_PPM * numberOfLBnodes];
+            D.f[DIR_MMM] = &DDStart[DIR_MMM * numberOfLBnodes];
+            D.f[DIR_PMM] = &DDStart[DIR_PMM * numberOfLBnodes];
+            D.f[DIR_MPM] = &DDStart[DIR_MPM * numberOfLBnodes];
          }
          else
          {
-            D.f[DIR_M00   ] = &DDStart[DIR_P00   *size_Mat];
-            D.f[DIR_P00   ] = &DDStart[DIR_M00   *size_Mat];
-            D.f[DIR_0M0   ] = &DDStart[DIR_0P0   *size_Mat];
-            D.f[DIR_0P0   ] = &DDStart[DIR_0M0   *size_Mat];
-            D.f[DIR_00M   ] = &DDStart[DIR_00P   *size_Mat];
-            D.f[DIR_00P   ] = &DDStart[DIR_00M   *size_Mat];
-            D.f[DIR_MM0  ] = &DDStart[DIR_PP0  *size_Mat];
-            D.f[DIR_PP0  ] = &DDStart[DIR_MM0  *size_Mat];
-            D.f[DIR_MP0  ] = &DDStart[DIR_PM0  *size_Mat];
-            D.f[DIR_PM0  ] = &DDStart[DIR_MP0  *size_Mat];
-            D.f[DIR_M0M  ] = &DDStart[DIR_P0P  *size_Mat];
-            D.f[DIR_P0P  ] = &DDStart[DIR_M0M  *size_Mat];
-            D.f[DIR_M0P  ] = &DDStart[DIR_P0M  *size_Mat];
-            D.f[DIR_P0M  ] = &DDStart[DIR_M0P  *size_Mat];
-            D.f[DIR_0MM  ] = &DDStart[DIR_0PP  *size_Mat];
-            D.f[DIR_0PP  ] = &DDStart[DIR_0MM  *size_Mat];
-            D.f[DIR_0MP  ] = &DDStart[DIR_0PM  *size_Mat];
-            D.f[DIR_0PM  ] = &DDStart[DIR_0MP  *size_Mat];
-            D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-            D.f[DIR_MMM ] = &DDStart[DIR_PPP *size_Mat];
-            D.f[DIR_PPM ] = &DDStart[DIR_MMP *size_Mat];
-            D.f[DIR_MPM ] = &DDStart[DIR_PMP *size_Mat];
-            D.f[DIR_PMM ] = &DDStart[DIR_MPP *size_Mat];
-            D.f[DIR_MMP ] = &DDStart[DIR_PPM *size_Mat];
-            D.f[DIR_PPP ] = &DDStart[DIR_MMM *size_Mat];
-            D.f[DIR_MPP ] = &DDStart[DIR_PMM *size_Mat];
-            D.f[DIR_PMP ] = &DDStart[DIR_MPM *size_Mat];
+            D.f[DIR_M00] = &DDStart[DIR_P00 * numberOfLBnodes];
+            D.f[DIR_P00] = &DDStart[DIR_M00 * numberOfLBnodes];
+            D.f[DIR_0M0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+            D.f[DIR_0P0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+            D.f[DIR_00M] = &DDStart[DIR_00P * numberOfLBnodes];
+            D.f[DIR_00P] = &DDStart[DIR_00M * numberOfLBnodes];
+            D.f[DIR_MM0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+            D.f[DIR_PP0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+            D.f[DIR_MP0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+            D.f[DIR_PM0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+            D.f[DIR_M0M] = &DDStart[DIR_P0P * numberOfLBnodes];
+            D.f[DIR_P0P] = &DDStart[DIR_M0M * numberOfLBnodes];
+            D.f[DIR_M0P] = &DDStart[DIR_P0M * numberOfLBnodes];
+            D.f[DIR_P0M] = &DDStart[DIR_M0P * numberOfLBnodes];
+            D.f[DIR_0MM] = &DDStart[DIR_0PP * numberOfLBnodes];
+            D.f[DIR_0PP] = &DDStart[DIR_0MM * numberOfLBnodes];
+            D.f[DIR_0MP] = &DDStart[DIR_0PM * numberOfLBnodes];
+            D.f[DIR_0PM] = &DDStart[DIR_0MP * numberOfLBnodes];
+            D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+            D.f[DIR_MMM] = &DDStart[DIR_PPP * numberOfLBnodes];
+            D.f[DIR_PPM] = &DDStart[DIR_MMP * numberOfLBnodes];
+            D.f[DIR_MPM] = &DDStart[DIR_PMP * numberOfLBnodes];
+            D.f[DIR_PMM] = &DDStart[DIR_MPP * numberOfLBnodes];
+            D.f[DIR_MMP] = &DDStart[DIR_PPM * numberOfLBnodes];
+            D.f[DIR_PPP] = &DDStart[DIR_MMM * numberOfLBnodes];
+            D.f[DIR_MPP] = &DDStart[DIR_PMM * numberOfLBnodes];
+            D.f[DIR_PMP] = &DDStart[DIR_MPM * numberOfLBnodes];
          }
 
          ////////////////////////////////////////////////////////////////////////////////
@@ -956,33 +956,33 @@ __global__ void LB_Kernel_Casc_Comp_SP_27(      real omega,
          unsigned int ktne = k;
          unsigned int kbsw = neighborZ[ksw];
          //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-         real f_E     = (D.f[DIR_P00   ])[ke   ];// +  c2over27 ;
-         real f_W     = (D.f[DIR_M00   ])[kw   ];// +  c2over27 ;
-         real f_N     = (D.f[DIR_0P0   ])[kn   ];// +  c2over27 ;
-         real f_S     = (D.f[DIR_0M0   ])[ks   ];// +  c2over27 ;
-         real f_F     = (D.f[DIR_00P   ])[kt   ];// +  c2over27 ;
-         real f_B     = (D.f[DIR_00M   ])[kb   ];// +  c2over27 ;
-         real f_NE    = (D.f[DIR_PP0  ])[kne  ];// +  c1over54 ;
-         real f_SW    = (D.f[DIR_MM0  ])[ksw  ];// +  c1over54 ;
-         real f_SE    = (D.f[DIR_PM0  ])[kse  ];// +  c1over54 ;
-         real f_NW    = (D.f[DIR_MP0  ])[knw  ];// +  c1over54 ;
-         real f_Ef    = (D.f[DIR_P0P  ])[kte  ];// +  c1over54 ;
-         real f_Wb    = (D.f[DIR_M0M  ])[kbw  ];// +  c1over54 ;
-         real f_Eb    = (D.f[DIR_P0M  ])[kbe  ];// +  c1over54 ;
-         real f_Wf    = (D.f[DIR_M0P  ])[ktw  ];// +  c1over54 ;
-         real f_Nf    = (D.f[DIR_0PP  ])[ktn  ];// +  c1over54 ;
-         real f_Sb    = (D.f[DIR_0MM  ])[kbs  ];// +  c1over54 ;
-         real f_Nb    = (D.f[DIR_0PM  ])[kbn  ];// +  c1over54 ;
-         real f_Sf    = (D.f[DIR_0MP  ])[kts  ];// +  c1over54 ;
+         real f_E     = (D.f[DIR_P00])[ke   ];// +  c2over27 ;
+         real f_W     = (D.f[DIR_M00])[kw   ];// +  c2over27 ;
+         real f_N     = (D.f[DIR_0P0])[kn   ];// +  c2over27 ;
+         real f_S     = (D.f[DIR_0M0])[ks   ];// +  c2over27 ;
+         real f_F     = (D.f[DIR_00P])[kt   ];// +  c2over27 ;
+         real f_B     = (D.f[DIR_00M])[kb   ];// +  c2over27 ;
+         real f_NE    = (D.f[DIR_PP0])[kne  ];// +  c1over54 ;
+         real f_SW    = (D.f[DIR_MM0])[ksw  ];// +  c1over54 ;
+         real f_SE    = (D.f[DIR_PM0])[kse  ];// +  c1over54 ;
+         real f_NW    = (D.f[DIR_MP0])[knw  ];// +  c1over54 ;
+         real f_Ef    = (D.f[DIR_P0P])[kte  ];// +  c1over54 ;
+         real f_Wb    = (D.f[DIR_M0M])[kbw  ];// +  c1over54 ;
+         real f_Eb    = (D.f[DIR_P0M])[kbe  ];// +  c1over54 ;
+         real f_Wf    = (D.f[DIR_M0P])[ktw  ];// +  c1over54 ;
+         real f_Nf    = (D.f[DIR_0PP])[ktn  ];// +  c1over54 ;
+         real f_Sb    = (D.f[DIR_0MM])[kbs  ];// +  c1over54 ;
+         real f_Nb    = (D.f[DIR_0PM])[kbn  ];// +  c1over54 ;
+         real f_Sf    = (D.f[DIR_0MP])[kts  ];// +  c1over54 ;
          real f_R     = (D.f[DIR_000])[kzero];// +  c8over27 ;
-         real f_Nef   = (D.f[DIR_PPP ])[ktne ];// +  c1over216;
-         real f_Swf   = (D.f[DIR_MMP ])[ktsw ];// +  c1over216;
-         real f_Sef   = (D.f[DIR_PMP ])[ktse ];// +  c1over216;
-         real f_Nwf   = (D.f[DIR_MPP ])[ktnw ];// +  c1over216;
-         real f_Neb   = (D.f[DIR_PPM ])[kbne ];// +  c1over216;
-         real f_Swb   = (D.f[DIR_MMM ])[kbsw ];// +  c1over216;
-         real f_Seb   = (D.f[DIR_PMM ])[kbse ];// +  c1over216;
-         real f_Nwb   = (D.f[DIR_MPM ])[kbnw ];// +  c1over216;
+         real f_Nef   = (D.f[DIR_PPP])[ktne ];// +  c1over216;
+         real f_Swf   = (D.f[DIR_MMP])[ktsw ];// +  c1over216;
+         real f_Sef   = (D.f[DIR_PMP])[ktse ];// +  c1over216;
+         real f_Nwf   = (D.f[DIR_MPP])[ktnw ];// +  c1over216;
+         real f_Neb   = (D.f[DIR_PPM])[kbne ];// +  c1over216;
+         real f_Swb   = (D.f[DIR_MMM])[kbsw ];// +  c1over216;
+         real f_Seb   = (D.f[DIR_PMM])[kbse ];// +  c1over216;
+         real f_Nwb   = (D.f[DIR_MPM])[kbnw ];// +  c1over216;
          ////////////////////////////////////////////////////////////////////////////////////
 		 real rho=f_NW+f_W+f_SW+f_S+f_SE+f_E+f_NE+f_N+f_R+f_Nf+f_Nb+f_Sf+f_Sb+f_Ef+f_Eb+f_Wf+f_Wb+f_Nwf+f_Nwb+f_Nef+f_Neb+f_Swf+f_Swb+f_Sef+f_Seb+f_F+f_B+c1o1;// ACHTUNG ne EINS !!!!!!!!
 		 real pix=(f_NE+f_E+f_SE+f_Ef+f_Eb-f_NW-f_W-f_SW-f_Wf-f_Wb+f_Nef+f_Neb+f_Sef+f_Seb-f_Nwf-f_Nwb-f_Swf-f_Swb);
@@ -1689,7 +1689,7 @@ __global__ void LB_Kernel_Casc_SP_MS_OHM_27(  real omega,
                                                          unsigned int* neighborY,
                                                          unsigned int* neighborZ,
                                                          real* DDStart,
-                                                         int size_Mat,
+                                                         unsigned long long numberOfLBnodes,
                                                          bool EvenOrOdd)
 {
    ////////////////////////////////////////////////////////////////////////////////
@@ -1703,7 +1703,7 @@ __global__ void LB_Kernel_Casc_SP_MS_OHM_27(  real omega,
    const unsigned k = nx*(ny*z + y) + x;
    //////////////////////////////////////////////////////////////////////////
 
-   if(k<size_Mat)
+   if(k<numberOfLBnodes)
    {
       ////////////////////////////////////////////////////////////////////////////////
       unsigned int BC;
@@ -1714,63 +1714,63 @@ __global__ void LB_Kernel_Casc_SP_MS_OHM_27(  real omega,
          Distributions27 D;
          if (EvenOrOdd==true)
          {
-            D.f[DIR_P00   ] = &DDStart[DIR_P00   *size_Mat];
-            D.f[DIR_M00   ] = &DDStart[DIR_M00   *size_Mat];
-            D.f[DIR_0P0   ] = &DDStart[DIR_0P0   *size_Mat];
-            D.f[DIR_0M0   ] = &DDStart[DIR_0M0   *size_Mat];
-            D.f[DIR_00P   ] = &DDStart[DIR_00P   *size_Mat];
-            D.f[DIR_00M   ] = &DDStart[DIR_00M   *size_Mat];
-            D.f[DIR_PP0  ] = &DDStart[DIR_PP0  *size_Mat];
-            D.f[DIR_MM0  ] = &DDStart[DIR_MM0  *size_Mat];
-            D.f[DIR_PM0  ] = &DDStart[DIR_PM0  *size_Mat];
-            D.f[DIR_MP0  ] = &DDStart[DIR_MP0  *size_Mat];
-            D.f[DIR_P0P  ] = &DDStart[DIR_P0P  *size_Mat];
-            D.f[DIR_M0M  ] = &DDStart[DIR_M0M  *size_Mat];
-            D.f[DIR_P0M  ] = &DDStart[DIR_P0M  *size_Mat];
-            D.f[DIR_M0P  ] = &DDStart[DIR_M0P  *size_Mat];
-            D.f[DIR_0PP  ] = &DDStart[DIR_0PP  *size_Mat];
-            D.f[DIR_0MM  ] = &DDStart[DIR_0MM  *size_Mat];
-            D.f[DIR_0PM  ] = &DDStart[DIR_0PM  *size_Mat];
-            D.f[DIR_0MP  ] = &DDStart[DIR_0MP  *size_Mat];
-            D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-            D.f[DIR_PPP ] = &DDStart[DIR_PPP *size_Mat];
-            D.f[DIR_MMP ] = &DDStart[DIR_MMP *size_Mat];
-            D.f[DIR_PMP ] = &DDStart[DIR_PMP *size_Mat];
-            D.f[DIR_MPP ] = &DDStart[DIR_MPP *size_Mat];
-            D.f[DIR_PPM ] = &DDStart[DIR_PPM *size_Mat];
-            D.f[DIR_MMM ] = &DDStart[DIR_MMM *size_Mat];
-            D.f[DIR_PMM ] = &DDStart[DIR_PMM *size_Mat];
-            D.f[DIR_MPM ] = &DDStart[DIR_MPM *size_Mat];
+            D.f[DIR_P00] = &DDStart[DIR_P00 * numberOfLBnodes];
+            D.f[DIR_M00] = &DDStart[DIR_M00 * numberOfLBnodes];
+            D.f[DIR_0P0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+            D.f[DIR_0M0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+            D.f[DIR_00P] = &DDStart[DIR_00P * numberOfLBnodes];
+            D.f[DIR_00M] = &DDStart[DIR_00M * numberOfLBnodes];
+            D.f[DIR_PP0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+            D.f[DIR_MM0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+            D.f[DIR_PM0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+            D.f[DIR_MP0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+            D.f[DIR_P0P] = &DDStart[DIR_P0P * numberOfLBnodes];
+            D.f[DIR_M0M] = &DDStart[DIR_M0M * numberOfLBnodes];
+            D.f[DIR_P0M] = &DDStart[DIR_P0M * numberOfLBnodes];
+            D.f[DIR_M0P] = &DDStart[DIR_M0P * numberOfLBnodes];
+            D.f[DIR_0PP] = &DDStart[DIR_0PP * numberOfLBnodes];
+            D.f[DIR_0MM] = &DDStart[DIR_0MM * numberOfLBnodes];
+            D.f[DIR_0PM] = &DDStart[DIR_0PM * numberOfLBnodes];
+            D.f[DIR_0MP] = &DDStart[DIR_0MP * numberOfLBnodes];
+            D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+            D.f[DIR_PPP] = &DDStart[DIR_PPP * numberOfLBnodes];
+            D.f[DIR_MMP] = &DDStart[DIR_MMP * numberOfLBnodes];
+            D.f[DIR_PMP] = &DDStart[DIR_PMP * numberOfLBnodes];
+            D.f[DIR_MPP] = &DDStart[DIR_MPP * numberOfLBnodes];
+            D.f[DIR_PPM] = &DDStart[DIR_PPM * numberOfLBnodes];
+            D.f[DIR_MMM] = &DDStart[DIR_MMM * numberOfLBnodes];
+            D.f[DIR_PMM] = &DDStart[DIR_PMM * numberOfLBnodes];
+            D.f[DIR_MPM] = &DDStart[DIR_MPM * numberOfLBnodes];
          }
          else
          {
-            D.f[DIR_M00   ] = &DDStart[DIR_P00   *size_Mat];
-            D.f[DIR_P00   ] = &DDStart[DIR_M00   *size_Mat];
-            D.f[DIR_0M0   ] = &DDStart[DIR_0P0   *size_Mat];
-            D.f[DIR_0P0   ] = &DDStart[DIR_0M0   *size_Mat];
-            D.f[DIR_00M   ] = &DDStart[DIR_00P   *size_Mat];
-            D.f[DIR_00P   ] = &DDStart[DIR_00M   *size_Mat];
-            D.f[DIR_MM0  ] = &DDStart[DIR_PP0  *size_Mat];
-            D.f[DIR_PP0  ] = &DDStart[DIR_MM0  *size_Mat];
-            D.f[DIR_MP0  ] = &DDStart[DIR_PM0  *size_Mat];
-            D.f[DIR_PM0  ] = &DDStart[DIR_MP0  *size_Mat];
-            D.f[DIR_M0M  ] = &DDStart[DIR_P0P  *size_Mat];
-            D.f[DIR_P0P  ] = &DDStart[DIR_M0M  *size_Mat];
-            D.f[DIR_M0P  ] = &DDStart[DIR_P0M  *size_Mat];
-            D.f[DIR_P0M  ] = &DDStart[DIR_M0P  *size_Mat];
-            D.f[DIR_0MM  ] = &DDStart[DIR_0PP  *size_Mat];
-            D.f[DIR_0PP  ] = &DDStart[DIR_0MM  *size_Mat];
-            D.f[DIR_0MP  ] = &DDStart[DIR_0PM  *size_Mat];
-            D.f[DIR_0PM  ] = &DDStart[DIR_0MP  *size_Mat];
-            D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-            D.f[DIR_MMM ] = &DDStart[DIR_PPP *size_Mat];
-            D.f[DIR_PPM ] = &DDStart[DIR_MMP *size_Mat];
-            D.f[DIR_MPM ] = &DDStart[DIR_PMP *size_Mat];
-            D.f[DIR_PMM ] = &DDStart[DIR_MPP *size_Mat];
-            D.f[DIR_MMP ] = &DDStart[DIR_PPM *size_Mat];
-            D.f[DIR_PPP ] = &DDStart[DIR_MMM *size_Mat];
-            D.f[DIR_MPP ] = &DDStart[DIR_PMM *size_Mat];
-            D.f[DIR_PMP ] = &DDStart[DIR_MPM *size_Mat];
+            D.f[DIR_M00] = &DDStart[DIR_P00 * numberOfLBnodes];
+            D.f[DIR_P00] = &DDStart[DIR_M00 * numberOfLBnodes];
+            D.f[DIR_0M0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+            D.f[DIR_0P0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+            D.f[DIR_00M] = &DDStart[DIR_00P * numberOfLBnodes];
+            D.f[DIR_00P] = &DDStart[DIR_00M * numberOfLBnodes];
+            D.f[DIR_MM0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+            D.f[DIR_PP0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+            D.f[DIR_MP0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+            D.f[DIR_PM0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+            D.f[DIR_M0M] = &DDStart[DIR_P0P * numberOfLBnodes];
+            D.f[DIR_P0P] = &DDStart[DIR_M0M * numberOfLBnodes];
+            D.f[DIR_M0P] = &DDStart[DIR_P0M * numberOfLBnodes];
+            D.f[DIR_P0M] = &DDStart[DIR_M0P * numberOfLBnodes];
+            D.f[DIR_0MM] = &DDStart[DIR_0PP * numberOfLBnodes];
+            D.f[DIR_0PP] = &DDStart[DIR_0MM * numberOfLBnodes];
+            D.f[DIR_0MP] = &DDStart[DIR_0PM * numberOfLBnodes];
+            D.f[DIR_0PM] = &DDStart[DIR_0MP * numberOfLBnodes];
+            D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+            D.f[DIR_MMM] = &DDStart[DIR_PPP * numberOfLBnodes];
+            D.f[DIR_PPM] = &DDStart[DIR_MMP * numberOfLBnodes];
+            D.f[DIR_MPM] = &DDStart[DIR_PMP * numberOfLBnodes];
+            D.f[DIR_PMM] = &DDStart[DIR_MPP * numberOfLBnodes];
+            D.f[DIR_MMP] = &DDStart[DIR_PPM * numberOfLBnodes];
+            D.f[DIR_PPP] = &DDStart[DIR_MMM * numberOfLBnodes];
+            D.f[DIR_MPP] = &DDStart[DIR_PMM * numberOfLBnodes];
+            D.f[DIR_PMP] = &DDStart[DIR_MPM * numberOfLBnodes];
          }
 
          ////////////////////////////////////////////////////////////////////////////////
@@ -1803,33 +1803,33 @@ __global__ void LB_Kernel_Casc_SP_MS_OHM_27(  real omega,
          //unsigned int ktne = k;
          unsigned int kbsw = neighborZ[ksw];
          //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-         real fE    =  (D.f[DIR_P00   ])[k  ];//ke
-         real fW    =  (D.f[DIR_M00   ])[kw ];
-         real fN    =  (D.f[DIR_0P0   ])[k  ];//kn
-         real fS    =  (D.f[DIR_0M0   ])[ks ];
-         real fT    =  (D.f[DIR_00P   ])[k  ];//kt
-         real fB    =  (D.f[DIR_00M   ])[kb ];
-         real fNE   =  (D.f[DIR_PP0  ])[k  ];//kne
-         real fSW   =  (D.f[DIR_MM0  ])[ksw];
-         real fSE   =  (D.f[DIR_PM0  ])[ks ];//kse
-         real fNW   =  (D.f[DIR_MP0  ])[kw ];//knw
-         real fTE   =  (D.f[DIR_P0P  ])[k  ];//kte
-         real fBW   =  (D.f[DIR_M0M  ])[kbw];
-         real fBE   =  (D.f[DIR_P0M  ])[kb ];//kbe
-         real fTW   =  (D.f[DIR_M0P  ])[kw ];//ktw
-         real fTN   =  (D.f[DIR_0PP  ])[k  ];//ktn
-         real fBS   =  (D.f[DIR_0MM  ])[kbs];
-         real fBN   =  (D.f[DIR_0PM  ])[kb ];//kbn
-         real fTS   =  (D.f[DIR_0MP  ])[ks ];//kts
+         real fE    =  (D.f[DIR_P00])[k  ];//ke
+         real fW    =  (D.f[DIR_M00])[kw ];
+         real fN    =  (D.f[DIR_0P0])[k  ];//kn
+         real fS    =  (D.f[DIR_0M0])[ks ];
+         real fT    =  (D.f[DIR_00P])[k  ];//kt
+         real fB    =  (D.f[DIR_00M])[kb ];
+         real fNE   =  (D.f[DIR_PP0])[k  ];//kne
+         real fSW   =  (D.f[DIR_MM0])[ksw];
+         real fSE   =  (D.f[DIR_PM0])[ks ];//kse
+         real fNW   =  (D.f[DIR_MP0])[kw ];//knw
+         real fTE   =  (D.f[DIR_P0P])[k  ];//kte
+         real fBW   =  (D.f[DIR_M0M])[kbw];
+         real fBE   =  (D.f[DIR_P0M])[kb ];//kbe
+         real fTW   =  (D.f[DIR_M0P])[kw ];//ktw
+         real fTN   =  (D.f[DIR_0PP])[k  ];//ktn
+         real fBS   =  (D.f[DIR_0MM])[kbs];
+         real fBN   =  (D.f[DIR_0PM])[kb ];//kbn
+         real fTS   =  (D.f[DIR_0MP])[ks ];//kts
          real fZERO =  (D.f[DIR_000])[k  ];//kzero
-         real fTNE   = (D.f[DIR_PPP ])[k  ];//ktne
-         real fTSW   = (D.f[DIR_MMP ])[ksw];//ktsw
-         real fTSE   = (D.f[DIR_PMP ])[ks ];//ktse
-         real fTNW   = (D.f[DIR_MPP ])[kw ];//ktnw
-         real fBNE   = (D.f[DIR_PPM ])[kb ];//kbne
-         real fBSW   = (D.f[DIR_MMM ])[kbsw];
-         real fBSE   = (D.f[DIR_PMM ])[kbs];//kbse
-         real fBNW   = (D.f[DIR_MPM ])[kbw];//kbnw
+         real fTNE   = (D.f[DIR_PPP])[k  ];//ktne
+         real fTSW   = (D.f[DIR_MMP])[ksw];//ktsw
+         real fTSE   = (D.f[DIR_PMP])[ks ];//ktse
+         real fTNW   = (D.f[DIR_MPP])[kw ];//ktnw
+         real fBNE   = (D.f[DIR_PPM])[kb ];//kbne
+         real fBSW   = (D.f[DIR_MMM])[kbsw];
+         real fBSE   = (D.f[DIR_PMM])[kbs];//kbse
+         real fBNW   = (D.f[DIR_MPM])[kbw];//kbnw
          ////////////////////////////////////////////////////////////////////////////////
          real rho0   =  (fTNE+fBSW)+(fTSW+fBNE)+(fTSE+fBNW)+(fTNW+fBSE)+(fNE+fSW)+(fNW+fSE)+(fTE+fBW)+(fBE+fTW)+(fTN+fBS)+(fBN+fTS)+(fE+fW)+(fN+fS)+(fT+fB)+fZERO;
          real rho    =  rho0 + c1o1;
@@ -2321,7 +2321,7 @@ __global__ void LB_Kernel_Casc_SP_MS_27(   real omega,
                                                       unsigned int* neighborY,
                                                       unsigned int* neighborZ,
                                                       real* DDStart,
-                                                      int size_Mat,
+                                                      unsigned long long numberOfLBnodes,
                                                       bool EvenOrOdd)
 {
    ////////////////////////////////////////////////////////////////////////////////
@@ -2335,7 +2335,7 @@ __global__ void LB_Kernel_Casc_SP_MS_27(   real omega,
    const unsigned k = nx*(ny*z + y) + x;
    //////////////////////////////////////////////////////////////////////////
 
-   if(k<size_Mat)
+   if(k<numberOfLBnodes)
    {
       ////////////////////////////////////////////////////////////////////////////////
       unsigned int BC;
@@ -2346,63 +2346,63 @@ __global__ void LB_Kernel_Casc_SP_MS_27(   real omega,
          Distributions27 D;
          if (EvenOrOdd==true)
          {
-            D.f[DIR_P00   ] = &DDStart[DIR_P00   *size_Mat];
-            D.f[DIR_M00   ] = &DDStart[DIR_M00   *size_Mat];
-            D.f[DIR_0P0   ] = &DDStart[DIR_0P0   *size_Mat];
-            D.f[DIR_0M0   ] = &DDStart[DIR_0M0   *size_Mat];
-            D.f[DIR_00P   ] = &DDStart[DIR_00P   *size_Mat];
-            D.f[DIR_00M   ] = &DDStart[DIR_00M   *size_Mat];
-            D.f[DIR_PP0  ] = &DDStart[DIR_PP0  *size_Mat];
-            D.f[DIR_MM0  ] = &DDStart[DIR_MM0  *size_Mat];
-            D.f[DIR_PM0  ] = &DDStart[DIR_PM0  *size_Mat];
-            D.f[DIR_MP0  ] = &DDStart[DIR_MP0  *size_Mat];
-            D.f[DIR_P0P  ] = &DDStart[DIR_P0P  *size_Mat];
-            D.f[DIR_M0M  ] = &DDStart[DIR_M0M  *size_Mat];
-            D.f[DIR_P0M  ] = &DDStart[DIR_P0M  *size_Mat];
-            D.f[DIR_M0P  ] = &DDStart[DIR_M0P  *size_Mat];
-            D.f[DIR_0PP  ] = &DDStart[DIR_0PP  *size_Mat];
-            D.f[DIR_0MM  ] = &DDStart[DIR_0MM  *size_Mat];
-            D.f[DIR_0PM  ] = &DDStart[DIR_0PM  *size_Mat];
-            D.f[DIR_0MP  ] = &DDStart[DIR_0MP  *size_Mat];
-            D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-            D.f[DIR_PPP ] = &DDStart[DIR_PPP *size_Mat];
-            D.f[DIR_MMP ] = &DDStart[DIR_MMP *size_Mat];
-            D.f[DIR_PMP ] = &DDStart[DIR_PMP *size_Mat];
-            D.f[DIR_MPP ] = &DDStart[DIR_MPP *size_Mat];
-            D.f[DIR_PPM ] = &DDStart[DIR_PPM *size_Mat];
-            D.f[DIR_MMM ] = &DDStart[DIR_MMM *size_Mat];
-            D.f[DIR_PMM ] = &DDStart[DIR_PMM *size_Mat];
-            D.f[DIR_MPM ] = &DDStart[DIR_MPM *size_Mat];
+            D.f[DIR_P00] = &DDStart[DIR_P00 * numberOfLBnodes];
+            D.f[DIR_M00] = &DDStart[DIR_M00 * numberOfLBnodes];
+            D.f[DIR_0P0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+            D.f[DIR_0M0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+            D.f[DIR_00P] = &DDStart[DIR_00P * numberOfLBnodes];
+            D.f[DIR_00M] = &DDStart[DIR_00M * numberOfLBnodes];
+            D.f[DIR_PP0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+            D.f[DIR_MM0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+            D.f[DIR_PM0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+            D.f[DIR_MP0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+            D.f[DIR_P0P] = &DDStart[DIR_P0P * numberOfLBnodes];
+            D.f[DIR_M0M] = &DDStart[DIR_M0M * numberOfLBnodes];
+            D.f[DIR_P0M] = &DDStart[DIR_P0M * numberOfLBnodes];
+            D.f[DIR_M0P] = &DDStart[DIR_M0P * numberOfLBnodes];
+            D.f[DIR_0PP] = &DDStart[DIR_0PP * numberOfLBnodes];
+            D.f[DIR_0MM] = &DDStart[DIR_0MM * numberOfLBnodes];
+            D.f[DIR_0PM] = &DDStart[DIR_0PM * numberOfLBnodes];
+            D.f[DIR_0MP] = &DDStart[DIR_0MP * numberOfLBnodes];
+            D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+            D.f[DIR_PPP] = &DDStart[DIR_PPP * numberOfLBnodes];
+            D.f[DIR_MMP] = &DDStart[DIR_MMP * numberOfLBnodes];
+            D.f[DIR_PMP] = &DDStart[DIR_PMP * numberOfLBnodes];
+            D.f[DIR_MPP] = &DDStart[DIR_MPP * numberOfLBnodes];
+            D.f[DIR_PPM] = &DDStart[DIR_PPM * numberOfLBnodes];
+            D.f[DIR_MMM] = &DDStart[DIR_MMM * numberOfLBnodes];
+            D.f[DIR_PMM] = &DDStart[DIR_PMM * numberOfLBnodes];
+            D.f[DIR_MPM] = &DDStart[DIR_MPM * numberOfLBnodes];
          }
          else
          {
-            D.f[DIR_M00   ] = &DDStart[DIR_P00   *size_Mat];
-            D.f[DIR_P00   ] = &DDStart[DIR_M00   *size_Mat];
-            D.f[DIR_0M0   ] = &DDStart[DIR_0P0   *size_Mat];
-            D.f[DIR_0P0   ] = &DDStart[DIR_0M0   *size_Mat];
-            D.f[DIR_00M   ] = &DDStart[DIR_00P   *size_Mat];
-            D.f[DIR_00P   ] = &DDStart[DIR_00M   *size_Mat];
-            D.f[DIR_MM0  ] = &DDStart[DIR_PP0  *size_Mat];
-            D.f[DIR_PP0  ] = &DDStart[DIR_MM0  *size_Mat];
-            D.f[DIR_MP0  ] = &DDStart[DIR_PM0  *size_Mat];
-            D.f[DIR_PM0  ] = &DDStart[DIR_MP0  *size_Mat];
-            D.f[DIR_M0M  ] = &DDStart[DIR_P0P  *size_Mat];
-            D.f[DIR_P0P  ] = &DDStart[DIR_M0M  *size_Mat];
-            D.f[DIR_M0P  ] = &DDStart[DIR_P0M  *size_Mat];
-            D.f[DIR_P0M  ] = &DDStart[DIR_M0P  *size_Mat];
-            D.f[DIR_0MM  ] = &DDStart[DIR_0PP  *size_Mat];
-            D.f[DIR_0PP  ] = &DDStart[DIR_0MM  *size_Mat];
-            D.f[DIR_0MP  ] = &DDStart[DIR_0PM  *size_Mat];
-            D.f[DIR_0PM  ] = &DDStart[DIR_0MP  *size_Mat];
-            D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-            D.f[DIR_MMM ] = &DDStart[DIR_PPP *size_Mat];
-            D.f[DIR_PPM ] = &DDStart[DIR_MMP *size_Mat];
-            D.f[DIR_MPM ] = &DDStart[DIR_PMP *size_Mat];
-            D.f[DIR_PMM ] = &DDStart[DIR_MPP *size_Mat];
-            D.f[DIR_MMP ] = &DDStart[DIR_PPM *size_Mat];
-            D.f[DIR_PPP ] = &DDStart[DIR_MMM *size_Mat];
-            D.f[DIR_MPP ] = &DDStart[DIR_PMM *size_Mat];
-            D.f[DIR_PMP ] = &DDStart[DIR_MPM *size_Mat];
+            D.f[DIR_M00] = &DDStart[DIR_P00 * numberOfLBnodes];
+            D.f[DIR_P00] = &DDStart[DIR_M00 * numberOfLBnodes];
+            D.f[DIR_0M0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+            D.f[DIR_0P0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+            D.f[DIR_00M] = &DDStart[DIR_00P * numberOfLBnodes];
+            D.f[DIR_00P] = &DDStart[DIR_00M * numberOfLBnodes];
+            D.f[DIR_MM0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+            D.f[DIR_PP0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+            D.f[DIR_MP0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+            D.f[DIR_PM0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+            D.f[DIR_M0M] = &DDStart[DIR_P0P * numberOfLBnodes];
+            D.f[DIR_P0P] = &DDStart[DIR_M0M * numberOfLBnodes];
+            D.f[DIR_M0P] = &DDStart[DIR_P0M * numberOfLBnodes];
+            D.f[DIR_P0M] = &DDStart[DIR_M0P * numberOfLBnodes];
+            D.f[DIR_0MM] = &DDStart[DIR_0PP * numberOfLBnodes];
+            D.f[DIR_0PP] = &DDStart[DIR_0MM * numberOfLBnodes];
+            D.f[DIR_0MP] = &DDStart[DIR_0PM * numberOfLBnodes];
+            D.f[DIR_0PM] = &DDStart[DIR_0MP * numberOfLBnodes];
+            D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+            D.f[DIR_MMM] = &DDStart[DIR_PPP * numberOfLBnodes];
+            D.f[DIR_PPM] = &DDStart[DIR_MMP * numberOfLBnodes];
+            D.f[DIR_MPM] = &DDStart[DIR_PMP * numberOfLBnodes];
+            D.f[DIR_PMM] = &DDStart[DIR_MPP * numberOfLBnodes];
+            D.f[DIR_MMP] = &DDStart[DIR_PPM * numberOfLBnodes];
+            D.f[DIR_PPP] = &DDStart[DIR_MMM * numberOfLBnodes];
+            D.f[DIR_MPP] = &DDStart[DIR_PMM * numberOfLBnodes];
+            D.f[DIR_PMP] = &DDStart[DIR_MPM * numberOfLBnodes];
          }
 
          ////////////////////////////////////////////////////////////////////////////////
@@ -2435,33 +2435,33 @@ __global__ void LB_Kernel_Casc_SP_MS_27(   real omega,
          //unsigned int ktne = k;
          unsigned int kbsw = neighborZ[ksw];
          //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-         real fE    =  (D.f[DIR_P00   ])[k  ];//ke
-         real fW    =  (D.f[DIR_M00   ])[kw ];
-         real fN    =  (D.f[DIR_0P0   ])[k  ];//kn
-         real fS    =  (D.f[DIR_0M0   ])[ks ];
-         real fT    =  (D.f[DIR_00P   ])[k  ];//kt
-         real fB    =  (D.f[DIR_00M   ])[kb ];
-         real fNE   =  (D.f[DIR_PP0  ])[k  ];//kne
-         real fSW   =  (D.f[DIR_MM0  ])[ksw];
-         real fSE   =  (D.f[DIR_PM0  ])[ks ];//kse
-         real fNW   =  (D.f[DIR_MP0  ])[kw ];//knw
-         real fTE   =  (D.f[DIR_P0P  ])[k  ];//kte
-         real fBW   =  (D.f[DIR_M0M  ])[kbw];
-         real fBE   =  (D.f[DIR_P0M  ])[kb ];//kbe
-         real fTW   =  (D.f[DIR_M0P  ])[kw ];//ktw
-         real fTN   =  (D.f[DIR_0PP  ])[k  ];//ktn
-         real fBS   =  (D.f[DIR_0MM  ])[kbs];
-         real fBN   =  (D.f[DIR_0PM  ])[kb ];//kbn
-         real fTS   =  (D.f[DIR_0MP  ])[ks ];//kts
+         real fE    =  (D.f[DIR_P00])[k  ];//ke
+         real fW    =  (D.f[DIR_M00])[kw ];
+         real fN    =  (D.f[DIR_0P0])[k  ];//kn
+         real fS    =  (D.f[DIR_0M0])[ks ];
+         real fT    =  (D.f[DIR_00P])[k  ];//kt
+         real fB    =  (D.f[DIR_00M])[kb ];
+         real fNE   =  (D.f[DIR_PP0])[k  ];//kne
+         real fSW   =  (D.f[DIR_MM0])[ksw];
+         real fSE   =  (D.f[DIR_PM0])[ks ];//kse
+         real fNW   =  (D.f[DIR_MP0])[kw ];//knw
+         real fTE   =  (D.f[DIR_P0P])[k  ];//kte
+         real fBW   =  (D.f[DIR_M0M])[kbw];
+         real fBE   =  (D.f[DIR_P0M])[kb ];//kbe
+         real fTW   =  (D.f[DIR_M0P])[kw ];//ktw
+         real fTN   =  (D.f[DIR_0PP])[k  ];//ktn
+         real fBS   =  (D.f[DIR_0MM])[kbs];
+         real fBN   =  (D.f[DIR_0PM])[kb ];//kbn
+         real fTS   =  (D.f[DIR_0MP])[ks ];//kts
          real fZERO =  (D.f[DIR_000])[k  ];//kzero
-         real fTNE   = (D.f[DIR_PPP ])[k  ];//ktne
-         real fTSW   = (D.f[DIR_MMP ])[ksw];//ktsw
-         real fTSE   = (D.f[DIR_PMP ])[ks ];//ktse
-         real fTNW   = (D.f[DIR_MPP ])[kw ];//ktnw
-         real fBNE   = (D.f[DIR_PPM ])[kb ];//kbne
-         real fBSW   = (D.f[DIR_MMM ])[kbsw];
-         real fBSE   = (D.f[DIR_PMM ])[kbs];//kbse
-         real fBNW   = (D.f[DIR_MPM ])[kbw];//kbnw
+         real fTNE   = (D.f[DIR_PPP])[k  ];//ktne
+         real fTSW   = (D.f[DIR_MMP])[ksw];//ktsw
+         real fTSE   = (D.f[DIR_PMP])[ks ];//ktse
+         real fTNW   = (D.f[DIR_MPP])[kw ];//ktnw
+         real fBNE   = (D.f[DIR_PPM])[kb ];//kbne
+         real fBSW   = (D.f[DIR_MMM])[kbsw];
+         real fBSE   = (D.f[DIR_PMM])[kbs];//kbse
+         real fBNW   = (D.f[DIR_MPM])[kbw];//kbnw
          ////////////////////////////////////////////////////////////////////////////////
          real rho0   =  fZERO+fE+fW+fN+fS+fT+fB+fNE+fSW+fSE+fNW+fTE+fBW+fBE+fTW+fTN+fBS+fBN+fTS+fTNE+fTSW+fTSE+fTNW+fBNE+fBSW+fBSE+fBNW;
          real rho    =  rho0 + c1o1;
@@ -2846,7 +2846,7 @@ __global__ void LB_Kernel_Casc_SP_MS_Diff_27(real omega,
                                                         unsigned int* neighborY,
                                                         unsigned int* neighborZ,
                                                         real* DDStart,
-                                                        int size_Mat,
+                                                        unsigned long long numberOfLBnodes,
                                                         bool EvenOrOdd)
 {
    ////////////////////////////////////////////////////////////////////////////////
@@ -2860,7 +2860,7 @@ __global__ void LB_Kernel_Casc_SP_MS_Diff_27(real omega,
    const unsigned k = nx*(ny*z + y) + x;
    //////////////////////////////////////////////////////////////////////////
 
-   if(k<size_Mat)
+   if(k<numberOfLBnodes)
    {
       ////////////////////////////////////////////////////////////////////////////////
       unsigned int BC;
@@ -2871,63 +2871,63 @@ __global__ void LB_Kernel_Casc_SP_MS_Diff_27(real omega,
          Distributions27 D;
          if (EvenOrOdd==true)
          {
-            D.f[DIR_P00   ] = &DDStart[DIR_P00   *size_Mat];
-            D.f[DIR_M00   ] = &DDStart[DIR_M00   *size_Mat];
-            D.f[DIR_0P0   ] = &DDStart[DIR_0P0   *size_Mat];
-            D.f[DIR_0M0   ] = &DDStart[DIR_0M0   *size_Mat];
-            D.f[DIR_00P   ] = &DDStart[DIR_00P   *size_Mat];
-            D.f[DIR_00M   ] = &DDStart[DIR_00M   *size_Mat];
-            D.f[DIR_PP0  ] = &DDStart[DIR_PP0  *size_Mat];
-            D.f[DIR_MM0  ] = &DDStart[DIR_MM0  *size_Mat];
-            D.f[DIR_PM0  ] = &DDStart[DIR_PM0  *size_Mat];
-            D.f[DIR_MP0  ] = &DDStart[DIR_MP0  *size_Mat];
-            D.f[DIR_P0P  ] = &DDStart[DIR_P0P  *size_Mat];
-            D.f[DIR_M0M  ] = &DDStart[DIR_M0M  *size_Mat];
-            D.f[DIR_P0M  ] = &DDStart[DIR_P0M  *size_Mat];
-            D.f[DIR_M0P  ] = &DDStart[DIR_M0P  *size_Mat];
-            D.f[DIR_0PP  ] = &DDStart[DIR_0PP  *size_Mat];
-            D.f[DIR_0MM  ] = &DDStart[DIR_0MM  *size_Mat];
-            D.f[DIR_0PM  ] = &DDStart[DIR_0PM  *size_Mat];
-            D.f[DIR_0MP  ] = &DDStart[DIR_0MP  *size_Mat];
-            D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-            D.f[DIR_PPP ] = &DDStart[DIR_PPP *size_Mat];
-            D.f[DIR_MMP ] = &DDStart[DIR_MMP *size_Mat];
-            D.f[DIR_PMP ] = &DDStart[DIR_PMP *size_Mat];
-            D.f[DIR_MPP ] = &DDStart[DIR_MPP *size_Mat];
-            D.f[DIR_PPM ] = &DDStart[DIR_PPM *size_Mat];
-            D.f[DIR_MMM ] = &DDStart[DIR_MMM *size_Mat];
-            D.f[DIR_PMM ] = &DDStart[DIR_PMM *size_Mat];
-            D.f[DIR_MPM ] = &DDStart[DIR_MPM *size_Mat];
+            D.f[DIR_P00] = &DDStart[DIR_P00 * numberOfLBnodes];
+            D.f[DIR_M00] = &DDStart[DIR_M00 * numberOfLBnodes];
+            D.f[DIR_0P0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+            D.f[DIR_0M0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+            D.f[DIR_00P] = &DDStart[DIR_00P * numberOfLBnodes];
+            D.f[DIR_00M] = &DDStart[DIR_00M * numberOfLBnodes];
+            D.f[DIR_PP0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+            D.f[DIR_MM0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+            D.f[DIR_PM0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+            D.f[DIR_MP0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+            D.f[DIR_P0P] = &DDStart[DIR_P0P * numberOfLBnodes];
+            D.f[DIR_M0M] = &DDStart[DIR_M0M * numberOfLBnodes];
+            D.f[DIR_P0M] = &DDStart[DIR_P0M * numberOfLBnodes];
+            D.f[DIR_M0P] = &DDStart[DIR_M0P * numberOfLBnodes];
+            D.f[DIR_0PP] = &DDStart[DIR_0PP * numberOfLBnodes];
+            D.f[DIR_0MM] = &DDStart[DIR_0MM * numberOfLBnodes];
+            D.f[DIR_0PM] = &DDStart[DIR_0PM * numberOfLBnodes];
+            D.f[DIR_0MP] = &DDStart[DIR_0MP * numberOfLBnodes];
+            D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+            D.f[DIR_PPP] = &DDStart[DIR_PPP * numberOfLBnodes];
+            D.f[DIR_MMP] = &DDStart[DIR_MMP * numberOfLBnodes];
+            D.f[DIR_PMP] = &DDStart[DIR_PMP * numberOfLBnodes];
+            D.f[DIR_MPP] = &DDStart[DIR_MPP * numberOfLBnodes];
+            D.f[DIR_PPM] = &DDStart[DIR_PPM * numberOfLBnodes];
+            D.f[DIR_MMM] = &DDStart[DIR_MMM * numberOfLBnodes];
+            D.f[DIR_PMM] = &DDStart[DIR_PMM * numberOfLBnodes];
+            D.f[DIR_MPM] = &DDStart[DIR_MPM * numberOfLBnodes];
          }
          else
          {
-            D.f[DIR_M00   ] = &DDStart[DIR_P00   *size_Mat];
-            D.f[DIR_P00   ] = &DDStart[DIR_M00   *size_Mat];
-            D.f[DIR_0M0   ] = &DDStart[DIR_0P0   *size_Mat];
-            D.f[DIR_0P0   ] = &DDStart[DIR_0M0   *size_Mat];
-            D.f[DIR_00M   ] = &DDStart[DIR_00P   *size_Mat];
-            D.f[DIR_00P   ] = &DDStart[DIR_00M   *size_Mat];
-            D.f[DIR_MM0  ] = &DDStart[DIR_PP0  *size_Mat];
-            D.f[DIR_PP0  ] = &DDStart[DIR_MM0  *size_Mat];
-            D.f[DIR_MP0  ] = &DDStart[DIR_PM0  *size_Mat];
-            D.f[DIR_PM0  ] = &DDStart[DIR_MP0  *size_Mat];
-            D.f[DIR_M0M  ] = &DDStart[DIR_P0P  *size_Mat];
-            D.f[DIR_P0P  ] = &DDStart[DIR_M0M  *size_Mat];
-            D.f[DIR_M0P  ] = &DDStart[DIR_P0M  *size_Mat];
-            D.f[DIR_P0M  ] = &DDStart[DIR_M0P  *size_Mat];
-            D.f[DIR_0MM  ] = &DDStart[DIR_0PP  *size_Mat];
-            D.f[DIR_0PP  ] = &DDStart[DIR_0MM  *size_Mat];
-            D.f[DIR_0MP  ] = &DDStart[DIR_0PM  *size_Mat];
-            D.f[DIR_0PM  ] = &DDStart[DIR_0MP  *size_Mat];
-            D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-            D.f[DIR_MMM ] = &DDStart[DIR_PPP *size_Mat];
-            D.f[DIR_PPM ] = &DDStart[DIR_MMP *size_Mat];
-            D.f[DIR_MPM ] = &DDStart[DIR_PMP *size_Mat];
-            D.f[DIR_PMM ] = &DDStart[DIR_MPP *size_Mat];
-            D.f[DIR_MMP ] = &DDStart[DIR_PPM *size_Mat];
-            D.f[DIR_PPP ] = &DDStart[DIR_MMM *size_Mat];
-            D.f[DIR_MPP ] = &DDStart[DIR_PMM *size_Mat];
-            D.f[DIR_PMP ] = &DDStart[DIR_MPM *size_Mat];
+            D.f[DIR_M00] = &DDStart[DIR_P00 * numberOfLBnodes];
+            D.f[DIR_P00] = &DDStart[DIR_M00 * numberOfLBnodes];
+            D.f[DIR_0M0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+            D.f[DIR_0P0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+            D.f[DIR_00M] = &DDStart[DIR_00P * numberOfLBnodes];
+            D.f[DIR_00P] = &DDStart[DIR_00M * numberOfLBnodes];
+            D.f[DIR_MM0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+            D.f[DIR_PP0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+            D.f[DIR_MP0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+            D.f[DIR_PM0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+            D.f[DIR_M0M] = &DDStart[DIR_P0P * numberOfLBnodes];
+            D.f[DIR_P0P] = &DDStart[DIR_M0M * numberOfLBnodes];
+            D.f[DIR_M0P] = &DDStart[DIR_P0M * numberOfLBnodes];
+            D.f[DIR_P0M] = &DDStart[DIR_M0P * numberOfLBnodes];
+            D.f[DIR_0MM] = &DDStart[DIR_0PP * numberOfLBnodes];
+            D.f[DIR_0PP] = &DDStart[DIR_0MM * numberOfLBnodes];
+            D.f[DIR_0MP] = &DDStart[DIR_0PM * numberOfLBnodes];
+            D.f[DIR_0PM] = &DDStart[DIR_0MP * numberOfLBnodes];
+            D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+            D.f[DIR_MMM] = &DDStart[DIR_PPP * numberOfLBnodes];
+            D.f[DIR_PPM] = &DDStart[DIR_MMP * numberOfLBnodes];
+            D.f[DIR_MPM] = &DDStart[DIR_PMP * numberOfLBnodes];
+            D.f[DIR_PMM] = &DDStart[DIR_MPP * numberOfLBnodes];
+            D.f[DIR_MMP] = &DDStart[DIR_PPM * numberOfLBnodes];
+            D.f[DIR_PPP] = &DDStart[DIR_MMM * numberOfLBnodes];
+            D.f[DIR_MPP] = &DDStart[DIR_PMM * numberOfLBnodes];
+            D.f[DIR_PMP] = &DDStart[DIR_MPM * numberOfLBnodes];
          }
 
          ////////////////////////////////////////////////////////////////////////////////
@@ -2960,33 +2960,33 @@ __global__ void LB_Kernel_Casc_SP_MS_Diff_27(real omega,
          //unsigned int ktne = k;
          unsigned int kbsw = neighborZ[ksw];
          //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-         real fE    =  (D.f[DIR_P00   ])[k  ];//ke
-         real fW    =  (D.f[DIR_M00   ])[kw ];
-         real fN    =  (D.f[DIR_0P0   ])[k  ];//kn
-         real fS    =  (D.f[DIR_0M0   ])[ks ];
-         real fT    =  (D.f[DIR_00P   ])[k  ];//kt
-         real fB    =  (D.f[DIR_00M   ])[kb ];
-         real fNE   =  (D.f[DIR_PP0  ])[k  ];//kne
-         real fSW   =  (D.f[DIR_MM0  ])[ksw];
-         real fSE   =  (D.f[DIR_PM0  ])[ks ];//kse
-         real fNW   =  (D.f[DIR_MP0  ])[kw ];//knw
-         real fTE   =  (D.f[DIR_P0P  ])[k  ];//kte
-         real fBW   =  (D.f[DIR_M0M  ])[kbw];
-         real fBE   =  (D.f[DIR_P0M  ])[kb ];//kbe
-         real fTW   =  (D.f[DIR_M0P  ])[kw ];//ktw
-         real fTN   =  (D.f[DIR_0PP  ])[k  ];//ktn
-         real fBS   =  (D.f[DIR_0MM  ])[kbs];
-         real fBN   =  (D.f[DIR_0PM  ])[kb ];//kbn
-         real fTS   =  (D.f[DIR_0MP  ])[ks ];//kts
+         real fE    =  (D.f[DIR_P00])[k  ];//ke
+         real fW    =  (D.f[DIR_M00])[kw ];
+         real fN    =  (D.f[DIR_0P0])[k  ];//kn
+         real fS    =  (D.f[DIR_0M0])[ks ];
+         real fT    =  (D.f[DIR_00P])[k  ];//kt
+         real fB    =  (D.f[DIR_00M])[kb ];
+         real fNE   =  (D.f[DIR_PP0])[k  ];//kne
+         real fSW   =  (D.f[DIR_MM0])[ksw];
+         real fSE   =  (D.f[DIR_PM0])[ks ];//kse
+         real fNW   =  (D.f[DIR_MP0])[kw ];//knw
+         real fTE   =  (D.f[DIR_P0P])[k  ];//kte
+         real fBW   =  (D.f[DIR_M0M])[kbw];
+         real fBE   =  (D.f[DIR_P0M])[kb ];//kbe
+         real fTW   =  (D.f[DIR_M0P])[kw ];//ktw
+         real fTN   =  (D.f[DIR_0PP])[k  ];//ktn
+         real fBS   =  (D.f[DIR_0MM])[kbs];
+         real fBN   =  (D.f[DIR_0PM])[kb ];//kbn
+         real fTS   =  (D.f[DIR_0MP])[ks ];//kts
          real fZERO =  (D.f[DIR_000])[k  ];//kzero
-         real fTNE   = (D.f[DIR_PPP ])[k  ];//ktne
-         real fTSW   = (D.f[DIR_MMP ])[ksw];//ktsw
-         real fTSE   = (D.f[DIR_PMP ])[ks ];//ktse
-         real fTNW   = (D.f[DIR_MPP ])[kw ];//ktnw
-         real fBNE   = (D.f[DIR_PPM ])[kb ];//kbne
-         real fBSW   = (D.f[DIR_MMM ])[kbsw];
-         real fBSE   = (D.f[DIR_PMM ])[kbs];//kbse
-         real fBNW   = (D.f[DIR_MPM ])[kbw];//kbnw
+         real fTNE   = (D.f[DIR_PPP])[k  ];//ktne
+         real fTSW   = (D.f[DIR_MMP])[ksw];//ktsw
+         real fTSE   = (D.f[DIR_PMP])[ks ];//ktse
+         real fTNW   = (D.f[DIR_MPP])[kw ];//ktnw
+         real fBNE   = (D.f[DIR_PPM])[kb ];//kbne
+         real fBSW   = (D.f[DIR_MMM])[kbsw];
+         real fBSE   = (D.f[DIR_PMM])[kbs];//kbse
+         real fBNW   = (D.f[DIR_MPM])[kbw];//kbnw
          ////////////////////////////////////////////////////////////////////////////////
          real rho0   =  fZERO+fE+fW+fN+fS+fT+fB+fNE+fSW+fSE+fNW+fTE+fBW+fBE+fTW+fTN+fBS+fBN+fTS+fTNE+fTSW+fTSE+fTNW+fBNE+fBSW+fBSE+fBNW;
          real rho    =  rho0 + c1o1;
@@ -3368,7 +3368,7 @@ __global__ void LB_Kernel_Casc_SP_27(  real omega,
                                                   unsigned int* neighborY,
                                                   unsigned int* neighborZ,
                                                   real* DDStart,
-                                                  int size_Mat,
+                                                  unsigned long long numberOfLBnodes,
                                                   bool EvenOrOdd)
 {
    ////////////////////////////////////////////////////////////////////////////////
@@ -3382,7 +3382,7 @@ __global__ void LB_Kernel_Casc_SP_27(  real omega,
    const unsigned k = nx*(ny*z + y) + x;
    //////////////////////////////////////////////////////////////////////////
 
-   if(k<size_Mat)
+   if(k<numberOfLBnodes)
    {
       ////////////////////////////////////////////////////////////////////////////////
       unsigned int BC;
@@ -3393,63 +3393,63 @@ __global__ void LB_Kernel_Casc_SP_27(  real omega,
        Distributions27 D;
        if (EvenOrOdd==true)
        {
-          D.f[DIR_P00   ] = &DDStart[DIR_P00   *size_Mat];
-          D.f[DIR_M00   ] = &DDStart[DIR_M00   *size_Mat];
-          D.f[DIR_0P0   ] = &DDStart[DIR_0P0   *size_Mat];
-          D.f[DIR_0M0   ] = &DDStart[DIR_0M0   *size_Mat];
-          D.f[DIR_00P   ] = &DDStart[DIR_00P   *size_Mat];
-          D.f[DIR_00M   ] = &DDStart[DIR_00M   *size_Mat];
-          D.f[DIR_PP0  ] = &DDStart[DIR_PP0  *size_Mat];
-          D.f[DIR_MM0  ] = &DDStart[DIR_MM0  *size_Mat];
-          D.f[DIR_PM0  ] = &DDStart[DIR_PM0  *size_Mat];
-          D.f[DIR_MP0  ] = &DDStart[DIR_MP0  *size_Mat];
-          D.f[DIR_P0P  ] = &DDStart[DIR_P0P  *size_Mat];
-          D.f[DIR_M0M  ] = &DDStart[DIR_M0M  *size_Mat];
-          D.f[DIR_P0M  ] = &DDStart[DIR_P0M  *size_Mat];
-          D.f[DIR_M0P  ] = &DDStart[DIR_M0P  *size_Mat];
-          D.f[DIR_0PP  ] = &DDStart[DIR_0PP  *size_Mat];
-          D.f[DIR_0MM  ] = &DDStart[DIR_0MM  *size_Mat];
-          D.f[DIR_0PM  ] = &DDStart[DIR_0PM  *size_Mat];
-          D.f[DIR_0MP  ] = &DDStart[DIR_0MP  *size_Mat];
-          D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-          D.f[DIR_PPP ] = &DDStart[DIR_PPP *size_Mat];
-          D.f[DIR_MMP ] = &DDStart[DIR_MMP *size_Mat];
-          D.f[DIR_PMP ] = &DDStart[DIR_PMP *size_Mat];
-          D.f[DIR_MPP ] = &DDStart[DIR_MPP *size_Mat];
-          D.f[DIR_PPM ] = &DDStart[DIR_PPM *size_Mat];
-          D.f[DIR_MMM ] = &DDStart[DIR_MMM *size_Mat];
-          D.f[DIR_PMM ] = &DDStart[DIR_PMM *size_Mat];
-          D.f[DIR_MPM ] = &DDStart[DIR_MPM *size_Mat];
+          D.f[DIR_P00] = &DDStart[DIR_P00 * numberOfLBnodes];
+          D.f[DIR_M00] = &DDStart[DIR_M00 * numberOfLBnodes];
+          D.f[DIR_0P0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+          D.f[DIR_0M0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+          D.f[DIR_00P] = &DDStart[DIR_00P * numberOfLBnodes];
+          D.f[DIR_00M] = &DDStart[DIR_00M * numberOfLBnodes];
+          D.f[DIR_PP0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+          D.f[DIR_MM0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+          D.f[DIR_PM0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+          D.f[DIR_MP0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+          D.f[DIR_P0P] = &DDStart[DIR_P0P * numberOfLBnodes];
+          D.f[DIR_M0M] = &DDStart[DIR_M0M * numberOfLBnodes];
+          D.f[DIR_P0M] = &DDStart[DIR_P0M * numberOfLBnodes];
+          D.f[DIR_M0P] = &DDStart[DIR_M0P * numberOfLBnodes];
+          D.f[DIR_0PP] = &DDStart[DIR_0PP * numberOfLBnodes];
+          D.f[DIR_0MM] = &DDStart[DIR_0MM * numberOfLBnodes];
+          D.f[DIR_0PM] = &DDStart[DIR_0PM * numberOfLBnodes];
+          D.f[DIR_0MP] = &DDStart[DIR_0MP * numberOfLBnodes];
+          D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+          D.f[DIR_PPP] = &DDStart[DIR_PPP * numberOfLBnodes];
+          D.f[DIR_MMP] = &DDStart[DIR_MMP * numberOfLBnodes];
+          D.f[DIR_PMP] = &DDStart[DIR_PMP * numberOfLBnodes];
+          D.f[DIR_MPP] = &DDStart[DIR_MPP * numberOfLBnodes];
+          D.f[DIR_PPM] = &DDStart[DIR_PPM * numberOfLBnodes];
+          D.f[DIR_MMM] = &DDStart[DIR_MMM * numberOfLBnodes];
+          D.f[DIR_PMM] = &DDStart[DIR_PMM * numberOfLBnodes];
+          D.f[DIR_MPM] = &DDStart[DIR_MPM * numberOfLBnodes];
        }
        else
        {
-          D.f[DIR_M00   ] = &DDStart[DIR_P00   *size_Mat];
-          D.f[DIR_P00   ] = &DDStart[DIR_M00   *size_Mat];
-          D.f[DIR_0M0   ] = &DDStart[DIR_0P0   *size_Mat];
-          D.f[DIR_0P0   ] = &DDStart[DIR_0M0   *size_Mat];
-          D.f[DIR_00M   ] = &DDStart[DIR_00P   *size_Mat];
-          D.f[DIR_00P   ] = &DDStart[DIR_00M   *size_Mat];
-          D.f[DIR_MM0  ] = &DDStart[DIR_PP0  *size_Mat];
-          D.f[DIR_PP0  ] = &DDStart[DIR_MM0  *size_Mat];
-          D.f[DIR_MP0  ] = &DDStart[DIR_PM0  *size_Mat];
-          D.f[DIR_PM0  ] = &DDStart[DIR_MP0  *size_Mat];
-          D.f[DIR_M0M  ] = &DDStart[DIR_P0P  *size_Mat];
-          D.f[DIR_P0P  ] = &DDStart[DIR_M0M  *size_Mat];
-          D.f[DIR_M0P  ] = &DDStart[DIR_P0M  *size_Mat];
-          D.f[DIR_P0M  ] = &DDStart[DIR_M0P  *size_Mat];
-          D.f[DIR_0MM  ] = &DDStart[DIR_0PP  *size_Mat];
-          D.f[DIR_0PP  ] = &DDStart[DIR_0MM  *size_Mat];
-          D.f[DIR_0MP  ] = &DDStart[DIR_0PM  *size_Mat];
-          D.f[DIR_0PM  ] = &DDStart[DIR_0MP  *size_Mat];
-          D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-          D.f[DIR_MMM ] = &DDStart[DIR_PPP *size_Mat];
-          D.f[DIR_PPM ] = &DDStart[DIR_MMP *size_Mat];
-          D.f[DIR_MPM ] = &DDStart[DIR_PMP *size_Mat];
-          D.f[DIR_PMM ] = &DDStart[DIR_MPP *size_Mat];
-          D.f[DIR_MMP ] = &DDStart[DIR_PPM *size_Mat];
-          D.f[DIR_PPP ] = &DDStart[DIR_MMM *size_Mat];
-          D.f[DIR_MPP ] = &DDStart[DIR_PMM *size_Mat];
-          D.f[DIR_PMP ] = &DDStart[DIR_MPM *size_Mat];
+          D.f[DIR_M00] = &DDStart[DIR_P00 * numberOfLBnodes];
+          D.f[DIR_P00] = &DDStart[DIR_M00 * numberOfLBnodes];
+          D.f[DIR_0M0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+          D.f[DIR_0P0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+          D.f[DIR_00M] = &DDStart[DIR_00P * numberOfLBnodes];
+          D.f[DIR_00P] = &DDStart[DIR_00M * numberOfLBnodes];
+          D.f[DIR_MM0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+          D.f[DIR_PP0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+          D.f[DIR_MP0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+          D.f[DIR_PM0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+          D.f[DIR_M0M] = &DDStart[DIR_P0P * numberOfLBnodes];
+          D.f[DIR_P0P] = &DDStart[DIR_M0M * numberOfLBnodes];
+          D.f[DIR_M0P] = &DDStart[DIR_P0M * numberOfLBnodes];
+          D.f[DIR_P0M] = &DDStart[DIR_M0P * numberOfLBnodes];
+          D.f[DIR_0MM] = &DDStart[DIR_0PP * numberOfLBnodes];
+          D.f[DIR_0PP] = &DDStart[DIR_0MM * numberOfLBnodes];
+          D.f[DIR_0MP] = &DDStart[DIR_0PM * numberOfLBnodes];
+          D.f[DIR_0PM] = &DDStart[DIR_0MP * numberOfLBnodes];
+          D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+          D.f[DIR_MMM] = &DDStart[DIR_PPP * numberOfLBnodes];
+          D.f[DIR_PPM] = &DDStart[DIR_MMP * numberOfLBnodes];
+          D.f[DIR_MPM] = &DDStart[DIR_PMP * numberOfLBnodes];
+          D.f[DIR_PMM] = &DDStart[DIR_MPP * numberOfLBnodes];
+          D.f[DIR_MMP] = &DDStart[DIR_PPM * numberOfLBnodes];
+          D.f[DIR_PPP] = &DDStart[DIR_MMM * numberOfLBnodes];
+          D.f[DIR_MPP] = &DDStart[DIR_PMM * numberOfLBnodes];
+          D.f[DIR_PMP] = &DDStart[DIR_MPM * numberOfLBnodes];
        }
 
        ////////////////////////////////////////////////////////////////////////////////
@@ -3512,33 +3512,33 @@ __global__ void LB_Kernel_Casc_SP_27(  real omega,
        //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
        real f_E,f_W,f_N,f_S,f_T,f_B,f_NE,f_SW,f_SE,f_NW,f_TE,f_BW,f_BE,f_TW,f_TN,f_BS,f_BN,f_TS,f_ZERO, f_TNE,f_TNW,f_TSE,f_TSW, f_BNE,f_BNW,f_BSE,f_BSW;
        //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-       f_E    =  (D.f[DIR_P00   ])[ke]+c2o27;
-       f_W    =  (D.f[DIR_M00   ])[kw]+c2o27;
-       f_N    =  (D.f[DIR_0P0   ])[kn]+c2o27;
-       f_S    =  (D.f[DIR_0M0   ])[ks]+c2o27;
-       f_T    =  (D.f[DIR_00P   ])[kt]+c2o27;
-       f_B    =  (D.f[DIR_00M   ])[kb]+c2o27;
-       f_NE   =  (D.f[DIR_PP0  ])[kne]+c1o54;
-       f_SW   =  (D.f[DIR_MM0  ])[ksw]+c1o54;
-       f_SE   =  (D.f[DIR_PM0  ])[kse]+c1o54;
-       f_NW   =  (D.f[DIR_MP0  ])[knw]+c1o54;
-       f_TE   =  (D.f[DIR_P0P  ])[kte]+c1o54;
-       f_BW   =  (D.f[DIR_M0M  ])[kbw]+c1o54;
-       f_BE   =  (D.f[DIR_P0M  ])[kbe]+c1o54;
-       f_TW   =  (D.f[DIR_M0P  ])[ktw]+c1o54;
-       f_TN   =  (D.f[DIR_0PP  ])[ktn]+c1o54;
-       f_BS   =  (D.f[DIR_0MM  ])[kbs]+c1o54;
-       f_BN   =  (D.f[DIR_0PM  ])[kbn]+c1o54;
-       f_TS   =  (D.f[DIR_0MP  ])[kts]+c1o54;
+       f_E    =  (D.f[DIR_P00])[ke]+c2o27;
+       f_W    =  (D.f[DIR_M00])[kw]+c2o27;
+       f_N    =  (D.f[DIR_0P0])[kn]+c2o27;
+       f_S    =  (D.f[DIR_0M0])[ks]+c2o27;
+       f_T    =  (D.f[DIR_00P])[kt]+c2o27;
+       f_B    =  (D.f[DIR_00M])[kb]+c2o27;
+       f_NE   =  (D.f[DIR_PP0])[kne]+c1o54;
+       f_SW   =  (D.f[DIR_MM0])[ksw]+c1o54;
+       f_SE   =  (D.f[DIR_PM0])[kse]+c1o54;
+       f_NW   =  (D.f[DIR_MP0])[knw]+c1o54;
+       f_TE   =  (D.f[DIR_P0P])[kte]+c1o54;
+       f_BW   =  (D.f[DIR_M0M])[kbw]+c1o54;
+       f_BE   =  (D.f[DIR_P0M])[kbe]+c1o54;
+       f_TW   =  (D.f[DIR_M0P])[ktw]+c1o54;
+       f_TN   =  (D.f[DIR_0PP])[ktn]+c1o54;
+       f_BS   =  (D.f[DIR_0MM])[kbs]+c1o54;
+       f_BN   =  (D.f[DIR_0PM])[kbn]+c1o54;
+       f_TS   =  (D.f[DIR_0MP])[kts]+c1o54;
        f_ZERO =  (D.f[DIR_000])[kzero]+c8o27;
-       f_TNE   = (D.f[DIR_PPP ])[ktne]+c1o216;
-       f_TSW   = (D.f[DIR_MMP ])[ktsw]+c1o216;
-       f_TSE   = (D.f[DIR_PMP ])[ktse]+c1o216;
-       f_TNW   = (D.f[DIR_MPP ])[ktnw]+c1o216;
-       f_BNE   = (D.f[DIR_PPM ])[kbne]+c1o216;
-       f_BSW   = (D.f[DIR_MMM ])[kbsw]+c1o216;
-       f_BSE   = (D.f[DIR_PMM ])[kbse]+c1o216;
-       f_BNW   = (D.f[DIR_MPM ])[kbnw]+c1o216;
+       f_TNE   = (D.f[DIR_PPP])[ktne]+c1o216;
+       f_TSW   = (D.f[DIR_MMP])[ktsw]+c1o216;
+       f_TSE   = (D.f[DIR_PMP])[ktse]+c1o216;
+       f_TNW   = (D.f[DIR_MPP])[ktnw]+c1o216;
+       f_BNE   = (D.f[DIR_PPM])[kbne]+c1o216;
+       f_BSW   = (D.f[DIR_MMM])[kbsw]+c1o216;
+       f_BSE   = (D.f[DIR_PMM])[kbse]+c1o216;
+       f_BNW   = (D.f[DIR_MPM])[kbnw]+c1o216;
        ////////////////////////////////////////////////////////////////////////////////
 
        if( BC == GEO_FLUID || BC == GEO_VELO)
@@ -4060,7 +4060,7 @@ __global__ void LB_Kernel_Casc27(real omega,
                                             unsigned int* neighborY,
                                             unsigned int* neighborZ,
                                             real* DDStart,
-                                            int size_Mat,
+                                            unsigned long long numberOfLBnodes,
                                             bool EvenOrOdd)
 {
    ////////////////////////////////////////////////////////////////////////////////
@@ -4089,63 +4089,63 @@ __global__ void LB_Kernel_Casc27(real omega,
       Distributions27 D;
       if (EvenOrOdd==true)
       {
-         D.f[DIR_P00   ] = &DDStart[DIR_P00   *size_Mat];
-         D.f[DIR_M00   ] = &DDStart[DIR_M00   *size_Mat];
-         D.f[DIR_0P0   ] = &DDStart[DIR_0P0   *size_Mat];
-         D.f[DIR_0M0   ] = &DDStart[DIR_0M0   *size_Mat];
-         D.f[DIR_00P   ] = &DDStart[DIR_00P   *size_Mat];
-         D.f[DIR_00M   ] = &DDStart[DIR_00M   *size_Mat];
-         D.f[DIR_PP0  ] = &DDStart[DIR_PP0  *size_Mat];
-         D.f[DIR_MM0  ] = &DDStart[DIR_MM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DDStart[DIR_PM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DDStart[DIR_MP0  *size_Mat];
-         D.f[DIR_P0P  ] = &DDStart[DIR_P0P  *size_Mat];
-         D.f[DIR_M0M  ] = &DDStart[DIR_M0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DDStart[DIR_P0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DDStart[DIR_M0P  *size_Mat];
-         D.f[DIR_0PP  ] = &DDStart[DIR_0PP  *size_Mat];
-         D.f[DIR_0MM  ] = &DDStart[DIR_0MM  *size_Mat];
-         D.f[DIR_0PM  ] = &DDStart[DIR_0PM  *size_Mat];
-         D.f[DIR_0MP  ] = &DDStart[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DDStart[DIR_PPP *size_Mat];
-         D.f[DIR_MMP ] = &DDStart[DIR_MMP *size_Mat];
-         D.f[DIR_PMP ] = &DDStart[DIR_PMP *size_Mat];
-         D.f[DIR_MPP ] = &DDStart[DIR_MPP *size_Mat];
-         D.f[DIR_PPM ] = &DDStart[DIR_PPM *size_Mat];
-         D.f[DIR_MMM ] = &DDStart[DIR_MMM *size_Mat];
-         D.f[DIR_PMM ] = &DDStart[DIR_PMM *size_Mat];
-         D.f[DIR_MPM ] = &DDStart[DIR_MPM *size_Mat];
+         D.f[DIR_P00] = &DDStart[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_M00] = &DDStart[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00P] = &DDStart[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00M] = &DDStart[DIR_00M * numberOfLBnodes];
+         D.f[DIR_PP0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_MM0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_P0P] = &DDStart[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_M0M] = &DDStart[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DDStart[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DDStart[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0PP] = &DDStart[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0MM] = &DDStart[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DDStart[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DDStart[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DDStart[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_MMP] = &DDStart[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_PMP] = &DDStart[DIR_PMP * numberOfLBnodes];
+         D.f[DIR_MPP] = &DDStart[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_PPM] = &DDStart[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_MMM] = &DDStart[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_PMM] = &DDStart[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_MPM] = &DDStart[DIR_MPM * numberOfLBnodes];
       }
       else
       {
-         D.f[DIR_M00   ] = &DDStart[DIR_P00   *size_Mat];
-         D.f[DIR_P00   ] = &DDStart[DIR_M00   *size_Mat];
-         D.f[DIR_0M0   ] = &DDStart[DIR_0P0   *size_Mat];
-         D.f[DIR_0P0   ] = &DDStart[DIR_0M0   *size_Mat];
-         D.f[DIR_00M   ] = &DDStart[DIR_00P   *size_Mat];
-         D.f[DIR_00P   ] = &DDStart[DIR_00M   *size_Mat];
-         D.f[DIR_MM0  ] = &DDStart[DIR_PP0  *size_Mat];
-         D.f[DIR_PP0  ] = &DDStart[DIR_MM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DDStart[DIR_PM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DDStart[DIR_MP0  *size_Mat];
-         D.f[DIR_M0M  ] = &DDStart[DIR_P0P  *size_Mat];
-         D.f[DIR_P0P  ] = &DDStart[DIR_M0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DDStart[DIR_P0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DDStart[DIR_M0P  *size_Mat];
-         D.f[DIR_0MM  ] = &DDStart[DIR_0PP  *size_Mat];
-         D.f[DIR_0PP  ] = &DDStart[DIR_0MM  *size_Mat];
-         D.f[DIR_0MP  ] = &DDStart[DIR_0PM  *size_Mat];
-         D.f[DIR_0PM  ] = &DDStart[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-         D.f[DIR_MMM ] = &DDStart[DIR_PPP *size_Mat];
-         D.f[DIR_PPM ] = &DDStart[DIR_MMP *size_Mat];
-         D.f[DIR_MPM ] = &DDStart[DIR_PMP *size_Mat];
-         D.f[DIR_PMM ] = &DDStart[DIR_MPP *size_Mat];
-         D.f[DIR_MMP ] = &DDStart[DIR_PPM *size_Mat];
-         D.f[DIR_PPP ] = &DDStart[DIR_MMM *size_Mat];
-         D.f[DIR_MPP ] = &DDStart[DIR_PMM *size_Mat];
-         D.f[DIR_PMP ] = &DDStart[DIR_MPM *size_Mat];
+         D.f[DIR_M00] = &DDStart[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_P00] = &DDStart[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00M] = &DDStart[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00P] = &DDStart[DIR_00M * numberOfLBnodes];
+         D.f[DIR_MM0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_PP0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_M0M] = &DDStart[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_P0P] = &DDStart[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DDStart[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DDStart[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0MM] = &DDStart[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0PP] = &DDStart[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DDStart[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DDStart[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+         D.f[DIR_MMM] = &DDStart[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_PPM] = &DDStart[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_MPM] = &DDStart[DIR_PMP * numberOfLBnodes];
+         D.f[DIR_PMM] = &DDStart[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_MMP] = &DDStart[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_PPP] = &DDStart[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_MPP] = &DDStart[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_PMP] = &DDStart[DIR_MPM * numberOfLBnodes];
       }
 
       ////////////////////////////////////////////////////////////////////////////////
@@ -4208,33 +4208,33 @@ __global__ void LB_Kernel_Casc27(real omega,
       //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       real f_E,f_W,f_N,f_S,f_T,f_B,f_NE,f_SW,f_SE,f_NW,f_TE,f_BW,f_BE,f_TW,f_TN,f_BS,f_BN,f_TS,f_ZERO, f_TNE,f_TNW,f_TSE,f_TSW, f_BNE,f_BNW,f_BSE,f_BSW;
       //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-      f_E    =  (D.f[DIR_P00   ])[ke]+c2o27;
-      f_W    =  (D.f[DIR_M00   ])[kw]+c2o27;
-      f_N    =  (D.f[DIR_0P0   ])[kn]+c2o27;
-      f_S    =  (D.f[DIR_0M0   ])[ks]+c2o27;
-      f_T    =  (D.f[DIR_00P   ])[kt]+c2o27;
-      f_B    =  (D.f[DIR_00M   ])[kb]+c2o27;
-      f_NE   =  (D.f[DIR_PP0  ])[kne]+c1o54;
-      f_SW   =  (D.f[DIR_MM0  ])[ksw]+c1o54;
-      f_SE   =  (D.f[DIR_PM0  ])[kse]+c1o54;
-      f_NW   =  (D.f[DIR_MP0  ])[knw]+c1o54;
-      f_TE   =  (D.f[DIR_P0P  ])[kte]+c1o54;
-      f_BW   =  (D.f[DIR_M0M  ])[kbw]+c1o54;
-      f_BE   =  (D.f[DIR_P0M  ])[kbe]+c1o54;
-      f_TW   =  (D.f[DIR_M0P  ])[ktw]+c1o54;
-      f_TN   =  (D.f[DIR_0PP  ])[ktn]+c1o54;
-      f_BS   =  (D.f[DIR_0MM  ])[kbs]+c1o54;
-      f_BN   =  (D.f[DIR_0PM  ])[kbn]+c1o54;
-      f_TS   =  (D.f[DIR_0MP  ])[kts]+c1o54;
+      f_E    =  (D.f[DIR_P00])[ke]+c2o27;
+      f_W    =  (D.f[DIR_M00])[kw]+c2o27;
+      f_N    =  (D.f[DIR_0P0])[kn]+c2o27;
+      f_S    =  (D.f[DIR_0M0])[ks]+c2o27;
+      f_T    =  (D.f[DIR_00P])[kt]+c2o27;
+      f_B    =  (D.f[DIR_00M])[kb]+c2o27;
+      f_NE   =  (D.f[DIR_PP0])[kne]+c1o54;
+      f_SW   =  (D.f[DIR_MM0])[ksw]+c1o54;
+      f_SE   =  (D.f[DIR_PM0])[kse]+c1o54;
+      f_NW   =  (D.f[DIR_MP0])[knw]+c1o54;
+      f_TE   =  (D.f[DIR_P0P])[kte]+c1o54;
+      f_BW   =  (D.f[DIR_M0M])[kbw]+c1o54;
+      f_BE   =  (D.f[DIR_P0M])[kbe]+c1o54;
+      f_TW   =  (D.f[DIR_M0P])[ktw]+c1o54;
+      f_TN   =  (D.f[DIR_0PP])[ktn]+c1o54;
+      f_BS   =  (D.f[DIR_0MM])[kbs]+c1o54;
+      f_BN   =  (D.f[DIR_0PM])[kbn]+c1o54;
+      f_TS   =  (D.f[DIR_0MP])[kts]+c1o54;
       f_ZERO =  (D.f[DIR_000])[kzero]+c8o27;
-      f_TNE   = (D.f[DIR_PPP ])[ktne]+c1o216;
-      f_TSW   = (D.f[DIR_MMP ])[ktsw]+c1o216;
-      f_TSE   = (D.f[DIR_PMP ])[ktse]+c1o216;
-      f_TNW   = (D.f[DIR_MPP ])[ktnw]+c1o216;
-      f_BNE   = (D.f[DIR_PPM ])[kbne]+c1o216;
-      f_BSW   = (D.f[DIR_MMM ])[kbsw]+c1o216;
-      f_BSE   = (D.f[DIR_PMM ])[kbse]+c1o216;
-      f_BNW   = (D.f[DIR_MPM ])[kbnw]+c1o216;
+      f_TNE   = (D.f[DIR_PPP])[ktne]+c1o216;
+      f_TSW   = (D.f[DIR_MMP])[ktsw]+c1o216;
+      f_TSE   = (D.f[DIR_PMP])[ktse]+c1o216;
+      f_TNW   = (D.f[DIR_MPP])[ktnw]+c1o216;
+      f_BNE   = (D.f[DIR_PPM])[kbne]+c1o216;
+      f_BSW   = (D.f[DIR_MMM])[kbsw]+c1o216;
+      f_BSE   = (D.f[DIR_PMM])[kbse]+c1o216;
+      f_BNW   = (D.f[DIR_MPM])[kbnw]+c1o216;
       ////////////////////////////////////////////////////////////////////////////////
 
       if( BC == GEO_FLUID || BC == GEO_VELO)
diff --git a/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.cpp b/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.cpp
index 22192216927f91c33fafc23c54c3fae334abdd34..9fd2a6b2f5c5c10a36856852db47f989ace714ce 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.cpp
+++ b/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.cpp
@@ -5,72 +5,65 @@
 #include <math.h>
 
 #include <Parameter/Parameter.h>
+
 #include "Parameter/CudaStreamManager.h"
-#include "PreCollisionInteractor/ActuatorLine.h"
+#include "PreCollisionInteractor/ActuatorFarm.h"
 #include "PreCollisionInteractor/Probes/Probe.h"
+#include <PreCollisionInteractor/PrecursorWriter.h>
 
 #include "Calculation/PorousMedia.h"
 
 #include "lbm/constants/NumericConstants.h"
 
-void CudaMemoryManager::cudaAllocFull(int lev)
-{
-    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->geo      ), parameter->getParH(lev)->mem_size_int  ));
-    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->k        ), parameter->getParH(lev)->mem_size_int  ));
-}
-void CudaMemoryManager::cudaFreeFull(int lev)
-{
-    checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->geo   ));
-    checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->k     ));
-}
+
 void CudaMemoryManager::cudaCopyPrint(int lev)
 {
-    checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->velocityX   , parameter->getParD(lev)->velocityX   , parameter->getParH(lev)->mem_size_real_SP , cudaMemcpyDeviceToHost));
-    checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->velocityY   , parameter->getParD(lev)->velocityY   , parameter->getParH(lev)->mem_size_real_SP , cudaMemcpyDeviceToHost));
-    checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->velocityZ   , parameter->getParD(lev)->velocityZ   , parameter->getParH(lev)->mem_size_real_SP , cudaMemcpyDeviceToHost));
-    checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->rho  , parameter->getParD(lev)->rho  , parameter->getParH(lev)->mem_size_real_SP , cudaMemcpyDeviceToHost));
-    checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->pressure, parameter->getParD(lev)->pressure, parameter->getParH(lev)->mem_size_real_SP , cudaMemcpyDeviceToHost));
+    checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->velocityX   , parameter->getParD(lev)->velocityX   , parameter->getParH(lev)->memSizeRealLBnodes , cudaMemcpyDeviceToHost));
+    checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->velocityY   , parameter->getParD(lev)->velocityY   , parameter->getParH(lev)->memSizeRealLBnodes , cudaMemcpyDeviceToHost));
+    checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->velocityZ   , parameter->getParD(lev)->velocityZ   , parameter->getParH(lev)->memSizeRealLBnodes , cudaMemcpyDeviceToHost));
+    checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->rho         , parameter->getParD(lev)->rho         , parameter->getParH(lev)->memSizeRealLBnodes , cudaMemcpyDeviceToHost));
+    checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->pressure    , parameter->getParD(lev)->pressure    , parameter->getParH(lev)->memSizeRealLBnodes , cudaMemcpyDeviceToHost));
 
     if(parameter->getIsBodyForce())
     {
-        checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->forceX_SP   , parameter->getParD(lev)->forceX_SP   , parameter->getParH(lev)->mem_size_real_SP , cudaMemcpyDeviceToHost));
-        checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->forceY_SP   , parameter->getParD(lev)->forceY_SP   , parameter->getParH(lev)->mem_size_real_SP , cudaMemcpyDeviceToHost));
-        checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->forceZ_SP   , parameter->getParD(lev)->forceZ_SP   , parameter->getParH(lev)->mem_size_real_SP , cudaMemcpyDeviceToHost));
+        checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->forceX_SP   , parameter->getParD(lev)->forceX_SP   , parameter->getParH(lev)->memSizeRealLBnodes , cudaMemcpyDeviceToHost));
+        checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->forceY_SP   , parameter->getParD(lev)->forceY_SP   , parameter->getParH(lev)->memSizeRealLBnodes , cudaMemcpyDeviceToHost));
+        checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->forceZ_SP   , parameter->getParD(lev)->forceZ_SP   , parameter->getParH(lev)->memSizeRealLBnodes , cudaMemcpyDeviceToHost));
     }
 
     if(parameter->getUseTurbulentViscosity())
     {
-        checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->turbViscosity   , parameter->getParD(lev)->turbViscosity   , parameter->getParH(lev)->mem_size_real_SP , cudaMemcpyDeviceToHost));
+        checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->turbViscosity   , parameter->getParD(lev)->turbViscosity   , parameter->getParH(lev)->memSizeRealLBnodes , cudaMemcpyDeviceToHost));
     }
 }
 void CudaMemoryManager::cudaCopyMedianPrint(int lev)
 {
-    checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->vx_SP_Med   , parameter->getParD(lev)->vx_SP_Med   , parameter->getParH(lev)->mem_size_real_SP , cudaMemcpyDeviceToHost));
-    checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->vy_SP_Med   , parameter->getParD(lev)->vy_SP_Med   , parameter->getParH(lev)->mem_size_real_SP , cudaMemcpyDeviceToHost));
-    checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->vz_SP_Med   , parameter->getParD(lev)->vz_SP_Med   , parameter->getParH(lev)->mem_size_real_SP , cudaMemcpyDeviceToHost));
-    checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->rho_SP_Med  , parameter->getParD(lev)->rho_SP_Med  , parameter->getParH(lev)->mem_size_real_SP , cudaMemcpyDeviceToHost));
-    checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->press_SP_Med, parameter->getParD(lev)->press_SP_Med, parameter->getParH(lev)->mem_size_real_SP , cudaMemcpyDeviceToHost));
+    checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->vx_SP_Med   , parameter->getParD(lev)->vx_SP_Med   , parameter->getParH(lev)->memSizeRealLBnodes , cudaMemcpyDeviceToHost));
+    checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->vy_SP_Med   , parameter->getParD(lev)->vy_SP_Med   , parameter->getParH(lev)->memSizeRealLBnodes , cudaMemcpyDeviceToHost));
+    checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->vz_SP_Med   , parameter->getParD(lev)->vz_SP_Med   , parameter->getParH(lev)->memSizeRealLBnodes , cudaMemcpyDeviceToHost));
+    checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->rho_SP_Med  , parameter->getParD(lev)->rho_SP_Med  , parameter->getParH(lev)->memSizeRealLBnodes , cudaMemcpyDeviceToHost));
+    checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->press_SP_Med, parameter->getParD(lev)->press_SP_Med, parameter->getParH(lev)->memSizeRealLBnodes , cudaMemcpyDeviceToHost));
 }
 void CudaMemoryManager::cudaAllocCoord(int lev)
 {
 	//Host
-	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->coordinateX      ), parameter->getParH(lev)->mem_size_real_SP  ));
-	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->coordinateY      ), parameter->getParH(lev)->mem_size_real_SP  ));
-	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->coordinateZ      ), parameter->getParH(lev)->mem_size_real_SP  ));
+	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->coordinateX      ), parameter->getParH(lev)->memSizeRealLBnodes  ));
+	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->coordinateY      ), parameter->getParH(lev)->memSizeRealLBnodes  ));
+	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->coordinateZ      ), parameter->getParH(lev)->memSizeRealLBnodes  ));
 	//Device (spinning ship + uppsala)
-	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->coordinateX      ), parameter->getParH(lev)->mem_size_real_SP  ));
-	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->coordinateY      ), parameter->getParH(lev)->mem_size_real_SP  ));
-	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->coordinateZ      ), parameter->getParH(lev)->mem_size_real_SP  ));
+	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->coordinateX      ), parameter->getParH(lev)->memSizeRealLBnodes  ));
+	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->coordinateY      ), parameter->getParH(lev)->memSizeRealLBnodes  ));
+	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->coordinateZ      ), parameter->getParH(lev)->memSizeRealLBnodes  ));
 	//////////////////////////////////////////////////////////////////////////
-	double tmp = 3. * (double)parameter->getParH(lev)->mem_size_real_SP;
+	double tmp = 3. * (double)parameter->getParH(lev)->memSizeRealLBnodes;
 	setMemsizeGPU(tmp, false);
 }
 void CudaMemoryManager::cudaCopyCoord(int lev)
 {
 	//copy host to device
-	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->coordinateX,  parameter->getParH(lev)->coordinateX,  parameter->getParH(lev)->mem_size_real_SP     , cudaMemcpyHostToDevice));
-	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->coordinateY,  parameter->getParH(lev)->coordinateY,  parameter->getParH(lev)->mem_size_real_SP     , cudaMemcpyHostToDevice));
-	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->coordinateZ,  parameter->getParH(lev)->coordinateZ,  parameter->getParH(lev)->mem_size_real_SP     , cudaMemcpyHostToDevice));
+	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->coordinateX,  parameter->getParH(lev)->coordinateX,  parameter->getParH(lev)->memSizeRealLBnodes     , cudaMemcpyHostToDevice));
+	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->coordinateY,  parameter->getParH(lev)->coordinateY,  parameter->getParH(lev)->memSizeRealLBnodes     , cudaMemcpyHostToDevice));
+	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->coordinateZ,  parameter->getParH(lev)->coordinateZ,  parameter->getParH(lev)->memSizeRealLBnodes     , cudaMemcpyHostToDevice));
 }
 void CudaMemoryManager::cudaFreeCoord(int lev)
 {
@@ -81,24 +74,24 @@ void CudaMemoryManager::cudaFreeCoord(int lev)
 void CudaMemoryManager::cudaAllocBodyForce(int lev)
 {
     //Host
-	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->forceX_SP      ), parameter->getParH(lev)->mem_size_real_SP  ));
-	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->forceY_SP      ), parameter->getParH(lev)->mem_size_real_SP  ));
-	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->forceZ_SP      ), parameter->getParH(lev)->mem_size_real_SP  ));
+	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->forceX_SP      ), parameter->getParH(lev)->memSizeRealLBnodes  ));
+	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->forceY_SP      ), parameter->getParH(lev)->memSizeRealLBnodes  ));
+	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->forceZ_SP      ), parameter->getParH(lev)->memSizeRealLBnodes  ));
 	//Device
-	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->forceX_SP      ), parameter->getParH(lev)->mem_size_real_SP  ));
-	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->forceY_SP      ), parameter->getParH(lev)->mem_size_real_SP  ));
-	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->forceZ_SP      ), parameter->getParH(lev)->mem_size_real_SP  ));
+	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->forceX_SP      ), parameter->getParH(lev)->memSizeRealLBnodes  ));
+	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->forceY_SP      ), parameter->getParH(lev)->memSizeRealLBnodes  ));
+	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->forceZ_SP      ), parameter->getParH(lev)->memSizeRealLBnodes  ));
 	//////////////////////////////////////////////////////////////////////////
-	double tmp = 3. * (double)parameter->getParH(lev)->mem_size_real_SP;
+	double tmp = 3. * (double)parameter->getParH(lev)->memSizeRealLBnodes;
 	setMemsizeGPU(tmp, false);
 
 }
 void CudaMemoryManager::cudaCopyBodyForce(int lev)
 {
    	//copy host to device
-	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->forceX_SP,  parameter->getParH(lev)->forceX_SP,  parameter->getParH(lev)->mem_size_real_SP     , cudaMemcpyHostToDevice));
-	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->forceY_SP,  parameter->getParH(lev)->forceY_SP,  parameter->getParH(lev)->mem_size_real_SP     , cudaMemcpyHostToDevice));
-	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->forceZ_SP,  parameter->getParH(lev)->forceZ_SP,  parameter->getParH(lev)->mem_size_real_SP     , cudaMemcpyHostToDevice));
+	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->forceX_SP,  parameter->getParH(lev)->forceX_SP,  parameter->getParH(lev)->memSizeRealLBnodes     , cudaMemcpyHostToDevice));
+	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->forceY_SP,  parameter->getParH(lev)->forceY_SP,  parameter->getParH(lev)->memSizeRealLBnodes     , cudaMemcpyHostToDevice));
+	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->forceZ_SP,  parameter->getParH(lev)->forceZ_SP,  parameter->getParH(lev)->memSizeRealLBnodes     , cudaMemcpyHostToDevice));
 
 }
 void CudaMemoryManager::cudaFreeBodyForce(int lev)
@@ -111,71 +104,71 @@ void CudaMemoryManager::cudaFreeBodyForce(int lev)
 //print
 void CudaMemoryManager::cudaCopyDataToHost(int lev)
 {
-	checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->velocityX   , parameter->getParD(lev)->velocityX   , parameter->getParH(lev)->mem_size_real_SP , cudaMemcpyDeviceToHost));
-	checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->velocityY   , parameter->getParD(lev)->velocityY   , parameter->getParH(lev)->mem_size_real_SP , cudaMemcpyDeviceToHost));
-	checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->velocityZ   , parameter->getParD(lev)->velocityZ   , parameter->getParH(lev)->mem_size_real_SP , cudaMemcpyDeviceToHost));
-	checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->rho  , parameter->getParD(lev)->rho  , parameter->getParH(lev)->mem_size_real_SP , cudaMemcpyDeviceToHost));
-	checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->pressure, parameter->getParD(lev)->pressure, parameter->getParH(lev)->mem_size_real_SP , cudaMemcpyDeviceToHost));
+	checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->velocityX   , parameter->getParD(lev)->velocityX   , parameter->getParH(lev)->memSizeRealLBnodes , cudaMemcpyDeviceToHost));
+	checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->velocityY   , parameter->getParD(lev)->velocityY   , parameter->getParH(lev)->memSizeRealLBnodes , cudaMemcpyDeviceToHost));
+	checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->velocityZ   , parameter->getParD(lev)->velocityZ   , parameter->getParH(lev)->memSizeRealLBnodes , cudaMemcpyDeviceToHost));
+	checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->rho         , parameter->getParD(lev)->rho         , parameter->getParH(lev)->memSizeRealLBnodes , cudaMemcpyDeviceToHost));
+	checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->pressure    , parameter->getParD(lev)->pressure    , parameter->getParH(lev)->memSizeRealLBnodes , cudaMemcpyDeviceToHost));
 }
 //sparse
 void CudaMemoryManager::cudaAllocSP(int lev)
 {
 	//Host
-	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->typeOfGridNode           ), parameter->getParH(lev)->mem_size_int_SP    ));
-	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->neighborX    ), parameter->getParH(lev)->mem_size_int_SP    ));
-	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->neighborY    ), parameter->getParH(lev)->mem_size_int_SP    ));
-	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->neighborZ    ), parameter->getParH(lev)->mem_size_int_SP    ));
-	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->rho          ), parameter->getParH(lev)->mem_size_real_SP));
-	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->velocityX           ), parameter->getParH(lev)->mem_size_real_SP));
-	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->velocityY           ), parameter->getParH(lev)->mem_size_real_SP));
-	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->velocityZ           ), parameter->getParH(lev)->mem_size_real_SP));
-	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->pressure        ), parameter->getParH(lev)->mem_size_real_SP));
+	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->typeOfGridNode), parameter->getParH(lev)->memSizeLonglongLBnodes));
+	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->neighborX     ), parameter->getParH(lev)->memSizeLonglongLBnodes));
+	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->neighborY     ), parameter->getParH(lev)->memSizeLonglongLBnodes));
+	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->neighborZ     ), parameter->getParH(lev)->memSizeLonglongLBnodes));
+	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->rho           ), parameter->getParH(lev)->memSizeRealLBnodes    ));
+	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->velocityX     ), parameter->getParH(lev)->memSizeRealLBnodes    ));
+	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->velocityY     ), parameter->getParH(lev)->memSizeRealLBnodes    ));
+	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->velocityZ     ), parameter->getParH(lev)->memSizeRealLBnodes    ));
+	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->pressure      ), parameter->getParH(lev)->memSizeRealLBnodes    ));
 	//Device
-	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->typeOfGridNode               ), parameter->getParD(lev)->mem_size_int_SP    ));
-	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->neighborX        ), parameter->getParD(lev)->mem_size_int_SP    ));
-	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->neighborY        ), parameter->getParD(lev)->mem_size_int_SP    ));
-	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->neighborZ        ), parameter->getParD(lev)->mem_size_int_SP    ));
-	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->rho              ), parameter->getParD(lev)->mem_size_real_SP));
-	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->velocityX               ), parameter->getParD(lev)->mem_size_real_SP));
-	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->velocityY               ), parameter->getParD(lev)->mem_size_real_SP));
-	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->velocityZ               ), parameter->getParD(lev)->mem_size_real_SP));
-	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->pressure            ), parameter->getParD(lev)->mem_size_real_SP));
-	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->distributions.f[0]           ), (unsigned long long)parameter->getD3Qxx()*(unsigned long long)parameter->getParD(lev)->mem_size_real_SP));
+	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->typeOfGridNode    ), parameter->getParD(lev)->memSizeLonglongLBnodes));
+	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->neighborX         ), parameter->getParD(lev)->memSizeLonglongLBnodes));
+	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->neighborY         ), parameter->getParD(lev)->memSizeLonglongLBnodes));
+	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->neighborZ         ), parameter->getParD(lev)->memSizeLonglongLBnodes));
+	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->rho               ), parameter->getParD(lev)->memSizeRealLBnodes    ));
+	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->velocityX         ), parameter->getParD(lev)->memSizeRealLBnodes    ));
+	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->velocityY         ), parameter->getParD(lev)->memSizeRealLBnodes    ));
+	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->velocityZ         ), parameter->getParD(lev)->memSizeRealLBnodes    ));
+	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->pressure          ), parameter->getParD(lev)->memSizeRealLBnodes    ));
+	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->distributions.f[0]), (unsigned long long)parameter->getD3Qxx()*(unsigned long long)parameter->getParD(lev)->memSizeRealLBnodes));
 	//////////////////////////////////////////////////////////////////////////
-	double tmp = 4. * (double)parameter->getParH(lev)->mem_size_int_SP + 5. * (double)parameter->getParH(lev)->mem_size_real_SP + (double)parameter->getD3Qxx() * (double)parameter->getParH(lev)->mem_size_real_SP;
+	double tmp = 4. * (double)parameter->getParH(lev)->memSizeLonglongLBnodes + 5. * (double)parameter->getParH(lev)->memSizeRealLBnodes + (double)parameter->getD3Qxx() * (double)parameter->getParH(lev)->memSizeRealLBnodes;
 	setMemsizeGPU(tmp, false);
 }
 void CudaMemoryManager::cudaCopySP(int lev)
 {
 	//copy host to device
-	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->typeOfGridNode       ,  parameter->getParH(lev)->typeOfGridNode       ,  parameter->getParH(lev)->mem_size_int_SP     , cudaMemcpyHostToDevice));
-	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->neighborX,  parameter->getParH(lev)->neighborX,  parameter->getParH(lev)->mem_size_int_SP     , cudaMemcpyHostToDevice));
-	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->neighborY,  parameter->getParH(lev)->neighborY,  parameter->getParH(lev)->mem_size_int_SP     , cudaMemcpyHostToDevice));
-	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->neighborZ,  parameter->getParH(lev)->neighborZ,  parameter->getParH(lev)->mem_size_int_SP     , cudaMemcpyHostToDevice));
-	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->rho      ,  parameter->getParH(lev)->rho      ,  parameter->getParH(lev)->mem_size_real_SP , cudaMemcpyHostToDevice));
-	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->velocityX       ,  parameter->getParH(lev)->velocityX       ,  parameter->getParH(lev)->mem_size_real_SP , cudaMemcpyHostToDevice));
-	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->velocityY       ,  parameter->getParH(lev)->velocityY       ,  parameter->getParH(lev)->mem_size_real_SP , cudaMemcpyHostToDevice));
-	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->velocityZ       ,  parameter->getParH(lev)->velocityZ       ,  parameter->getParH(lev)->mem_size_real_SP , cudaMemcpyHostToDevice));
-	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->pressure    ,  parameter->getParH(lev)->pressure    ,  parameter->getParH(lev)->mem_size_real_SP , cudaMemcpyHostToDevice));
+	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->typeOfGridNode, parameter->getParH(lev)->typeOfGridNode,  parameter->getParH(lev)->memSizeLonglongLBnodes , cudaMemcpyHostToDevice));
+	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->neighborX     , parameter->getParH(lev)->neighborX     ,  parameter->getParH(lev)->memSizeLonglongLBnodes , cudaMemcpyHostToDevice));
+	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->neighborY     , parameter->getParH(lev)->neighborY     ,  parameter->getParH(lev)->memSizeLonglongLBnodes , cudaMemcpyHostToDevice));
+	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->neighborZ     , parameter->getParH(lev)->neighborZ     ,  parameter->getParH(lev)->memSizeLonglongLBnodes , cudaMemcpyHostToDevice));
+	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->rho           , parameter->getParH(lev)->rho           ,  parameter->getParH(lev)->memSizeRealLBnodes     , cudaMemcpyHostToDevice));
+	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->velocityX     , parameter->getParH(lev)->velocityX     ,  parameter->getParH(lev)->memSizeRealLBnodes     , cudaMemcpyHostToDevice));
+	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->velocityY     , parameter->getParH(lev)->velocityY     ,  parameter->getParH(lev)->memSizeRealLBnodes     , cudaMemcpyHostToDevice));
+	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->velocityZ     , parameter->getParH(lev)->velocityZ     ,  parameter->getParH(lev)->memSizeRealLBnodes     , cudaMemcpyHostToDevice));
+	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->pressure      , parameter->getParH(lev)->pressure      ,  parameter->getParH(lev)->memSizeRealLBnodes     , cudaMemcpyHostToDevice));
 }
 void CudaMemoryManager::cudaFreeSP(int lev)
 {
-	checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->typeOfGridNode       ));
-	checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->velocityX       ));
-	checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->velocityY       ));
-	checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->velocityZ       ));
-	checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->rho      ));
-	checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->pressure    ));
-	checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->neighborX));
-	checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->neighborY));
-	checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->neighborZ));
+	checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->typeOfGridNode ));
+	checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->velocityX      ));
+	checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->velocityY      ));
+	checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->velocityZ      ));
+	checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->rho            ));
+	checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->pressure       ));
+	checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->neighborX      ));
+	checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->neighborY      ));
+	checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->neighborZ      ));
 }
 void CudaMemoryManager::cudaAllocF3SP(int lev)
 {
     //Device
-    checkCudaErrors(cudaMalloc((void**) &(parameter->getParD(lev)->g6.g[0]), (unsigned long long)6*(unsigned long long)parameter->getParD(lev)->mem_size_real_SP));
+    checkCudaErrors(cudaMalloc((void**) &(parameter->getParD(lev)->g6.g[0]), (unsigned long long)6*(unsigned long long)parameter->getParD(lev)->memSizeRealLBnodes));
     //////////////////////////////////////////////////////////////////////////
-    double tmp = (double)6 * (double)parameter->getParH(lev)->mem_size_real_SP;
+    double tmp = (double)6 * (double)parameter->getParH(lev)->memSizeRealLBnodes;
     setMemsizeGPU(tmp, false);
 }
 
@@ -207,20 +200,20 @@ void CudaMemoryManager::cudaAllocVeloBC(int lev)
 
 	//Host
 	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->velocityBC.q27[0]),  parameter->getD3Qxx()*mem_size_inflow_Q_q ));
-	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->velocityBC.k),                  mem_size_inflow_Q_k ));
-	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->velocityBC.Vx),                 mem_size_inflow_Q_q ));
-	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->velocityBC.Vy),                 mem_size_inflow_Q_q ));
-	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->velocityBC.Vz),                 mem_size_inflow_Q_q ));
-	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->velocityBC.deltaVz),            mem_size_inflow_Q_q ));
-	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->velocityBC.RhoBC),              mem_size_inflow_Q_q ));
+	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->velocityBC.k),                             mem_size_inflow_Q_k ));
+	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->velocityBC.Vx),                            mem_size_inflow_Q_q ));
+	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->velocityBC.Vy),                            mem_size_inflow_Q_q ));
+	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->velocityBC.Vz),                            mem_size_inflow_Q_q ));
+	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->velocityBC.deltaVz),                       mem_size_inflow_Q_q ));
+	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->velocityBC.RhoBC),                         mem_size_inflow_Q_q ));
 
 	//Device
 	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->velocityBC.q27[0]),      parameter->getD3Qxx()*mem_size_inflow_Q_q ));
-	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->velocityBC.k),                      mem_size_inflow_Q_k ));
-	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->velocityBC.Vx),                     mem_size_inflow_Q_q ));
-	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->velocityBC.Vy),                     mem_size_inflow_Q_q ));
-	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->velocityBC.Vz),                     mem_size_inflow_Q_q ));
-	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->velocityBC.deltaVz),                mem_size_inflow_Q_q ));
+	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->velocityBC.k),                                 mem_size_inflow_Q_k ));
+	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->velocityBC.Vx),                                mem_size_inflow_Q_q ));
+	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->velocityBC.Vy),                                mem_size_inflow_Q_q ));
+	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->velocityBC.Vz),                                mem_size_inflow_Q_q ));
+	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->velocityBC.deltaVz),                           mem_size_inflow_Q_q ));
 
 	//////////////////////////////////////////////////////////////////////////
 	double tmp = (double)mem_size_inflow_Q_k + 4. * (double)mem_size_inflow_Q_q + (double)parameter->getD3Qxx() * (double)mem_size_inflow_Q_q;
@@ -232,13 +225,14 @@ void CudaMemoryManager::cudaCopyVeloBC(int lev)
 	unsigned int mem_size_inflow_Q_q = sizeof(real)*parameter->getParH(lev)->velocityBC.numberOfBCnodes;
 
 	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->velocityBC.q27[0],  parameter->getParH(lev)->velocityBC.q27[0], parameter->getD3Qxx()* mem_size_inflow_Q_q,  cudaMemcpyHostToDevice));
-	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->velocityBC.k,       parameter->getParH(lev)->velocityBC.k,                  mem_size_inflow_Q_k,  cudaMemcpyHostToDevice));
-	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->velocityBC.Vx,      parameter->getParH(lev)->velocityBC.Vx,                 mem_size_inflow_Q_q,  cudaMemcpyHostToDevice));
-	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->velocityBC.Vy,      parameter->getParH(lev)->velocityBC.Vy,                 mem_size_inflow_Q_q,  cudaMemcpyHostToDevice));
-	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->velocityBC.Vz,      parameter->getParH(lev)->velocityBC.Vz,                 mem_size_inflow_Q_q,  cudaMemcpyHostToDevice));
-	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->velocityBC.deltaVz, parameter->getParH(lev)->velocityBC.deltaVz,            mem_size_inflow_Q_q,  cudaMemcpyHostToDevice));
+	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->velocityBC.k,       parameter->getParH(lev)->velocityBC.k,                             mem_size_inflow_Q_k,  cudaMemcpyHostToDevice));
+	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->velocityBC.Vx,      parameter->getParH(lev)->velocityBC.Vx,                            mem_size_inflow_Q_q,  cudaMemcpyHostToDevice));
+	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->velocityBC.Vy,      parameter->getParH(lev)->velocityBC.Vy,                            mem_size_inflow_Q_q,  cudaMemcpyHostToDevice));
+	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->velocityBC.Vz,      parameter->getParH(lev)->velocityBC.Vz,                            mem_size_inflow_Q_q,  cudaMemcpyHostToDevice));
+	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->velocityBC.deltaVz, parameter->getParH(lev)->velocityBC.deltaVz,                       mem_size_inflow_Q_q,  cudaMemcpyHostToDevice));
 
 }
+
 void CudaMemoryManager::cudaFreeVeloBC(int lev)
 {
 	checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->velocityBC.q27[0] ));
@@ -256,15 +250,15 @@ void CudaMemoryManager::cudaAllocOutflowBC(int lev)
 
 	//Host
 	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->outflowBC.q27[0]), parameter->getD3Qxx()*mem_size_outflow_Q_q ));
-	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->outflowBC.k),                 mem_size_outflow_Q_k ));
-	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->outflowBC.kN),                mem_size_outflow_Q_k ));
-	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->outflowBC.RhoBC),             mem_size_outflow_Q_q ));
+	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->outflowBC.k),                            mem_size_outflow_Q_k ));
+	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->outflowBC.kN),                           mem_size_outflow_Q_k ));
+	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->outflowBC.RhoBC),                        mem_size_outflow_Q_q ));
 
 	//Device
 	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->outflowBC.q27[0]),     parameter->getD3Qxx()* mem_size_outflow_Q_q ));
-	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->outflowBC.k),                      mem_size_outflow_Q_k ));
-	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->outflowBC.kN),                     mem_size_outflow_Q_k ));
-	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->outflowBC.RhoBC),                  mem_size_outflow_Q_q ));
+	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->outflowBC.k),                                 mem_size_outflow_Q_k ));
+	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->outflowBC.kN),                                mem_size_outflow_Q_k ));
+	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->outflowBC.RhoBC),                             mem_size_outflow_Q_q ));
 
 	//////////////////////////////////////////////////////////////////////////
 	double tmp = (double)mem_size_outflow_Q_q + 2. * (double)mem_size_outflow_Q_k + (double)parameter->getD3Qxx()*(double)mem_size_outflow_Q_q;
@@ -276,9 +270,9 @@ void CudaMemoryManager::cudaCopyOutflowBC(int lev)
 	unsigned int mem_size_outflow_Q_q = sizeof(real)*parameter->getParH(lev)->outflowBC.numberOfBCnodes;
 
 	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->outflowBC.q27[0],  parameter->getParH(lev)->outflowBC.q27[0], parameter->getD3Qxx()* mem_size_outflow_Q_q,  cudaMemcpyHostToDevice));
-	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->outflowBC.k,       parameter->getParH(lev)->outflowBC.k,                  mem_size_outflow_Q_k,  cudaMemcpyHostToDevice));
-	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->outflowBC.kN,      parameter->getParH(lev)->outflowBC.kN,                 mem_size_outflow_Q_k,  cudaMemcpyHostToDevice));
-	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->outflowBC.RhoBC,   parameter->getParH(lev)->outflowBC.RhoBC,              mem_size_outflow_Q_q,  cudaMemcpyHostToDevice));
+	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->outflowBC.k,       parameter->getParH(lev)->outflowBC.k,                             mem_size_outflow_Q_k,  cudaMemcpyHostToDevice));
+	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->outflowBC.kN,      parameter->getParH(lev)->outflowBC.kN,                            mem_size_outflow_Q_k,  cudaMemcpyHostToDevice));
+	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->outflowBC.RhoBC,   parameter->getParH(lev)->outflowBC.RhoBC,                         mem_size_outflow_Q_q,  cudaMemcpyHostToDevice));
 }
 void CudaMemoryManager::cudaFreeOutflowBC(int lev)
 {
@@ -297,13 +291,13 @@ void CudaMemoryManager::cudaAllocNoSlipBC(int lev)
 
 	//Host
 	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->noSlipBC.q27[0]), parameter->getD3Qxx()*mem_size_Q_q      ));
-	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->noSlipBC.k),                 mem_size_Q_k      ));
-	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->noSlipBC.qread),             mem_size_Q_q_read ));//Geller
-	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->noSlipBC.valueQ),            mem_size_Q_value  ));//Geller
+	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->noSlipBC.k),                            mem_size_Q_k      ));
+	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->noSlipBC.qread),                        mem_size_Q_q_read ));//Geller
+	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->noSlipBC.valueQ),                       mem_size_Q_value  ));//Geller
 
 	//Device
 	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->noSlipBC.q27[0]),     parameter->getD3Qxx()* mem_size_Q_q     ));
-	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->noSlipBC.k),                      mem_size_Q_k     ));
+	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->noSlipBC.k),                                 mem_size_Q_k     ));
 
 	//////////////////////////////////////////////////////////////////////////
 	double tmp = (double)mem_size_Q_k + (double)parameter->getD3Qxx()*(double)mem_size_Q_q;
@@ -315,7 +309,7 @@ void CudaMemoryManager::cudaCopyNoSlipBC(int lev)
 	unsigned int mem_size_Q_q = sizeof(real)*parameter->getParH(lev)->noSlipBC.numberOfBCnodes;
 
 	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->noSlipBC.q27[0], parameter->getParH(lev)->noSlipBC.q27[0], parameter->getD3Qxx()* mem_size_Q_q,       cudaMemcpyHostToDevice));
-	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->noSlipBC.k,      parameter->getParH(lev)->noSlipBC.k,                  mem_size_Q_k,       cudaMemcpyHostToDevice));
+	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->noSlipBC.k,      parameter->getParH(lev)->noSlipBC.k,                             mem_size_Q_k,       cudaMemcpyHostToDevice));
 }
 void CudaMemoryManager::cudaFreeNoSlipBC(int lev)
 {
@@ -332,11 +326,11 @@ void CudaMemoryManager::cudaAllocGeomBC(int lev)
 
 	//Host
 	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->geometryBC.q27[0]), parameter->getD3Qxx()*mem_size_Q_q      ));
-	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->geometryBC.k),                 mem_size_Q_k      ));
+	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->geometryBC.k),                            mem_size_Q_k      ));
 
 	//Device
 	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->geometryBC.q27[0]),     parameter->getD3Qxx()* mem_size_Q_q     ));
-	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->geometryBC.k),                      mem_size_Q_k     ));
+	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->geometryBC.k),                                 mem_size_Q_k     ));
 
 	//////////////////////////////////////////////////////////////////////////
 	double tmp = (double)mem_size_Q_k + (double)parameter->getD3Qxx()*(double)mem_size_Q_q;
@@ -348,7 +342,7 @@ void CudaMemoryManager::cudaCopyGeomBC(int lev)
 	unsigned int mem_size_Q_q = sizeof(real)*parameter->getParH(lev)->geometryBC.numberOfBCnodes;
 
 	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->geometryBC.q27[0], parameter->getParH(lev)->geometryBC.q27[0], parameter->getD3Qxx()* mem_size_Q_q,       cudaMemcpyHostToDevice));
-	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->geometryBC.k,      parameter->getParH(lev)->geometryBC.k,                  mem_size_Q_k,       cudaMemcpyHostToDevice));
+	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->geometryBC.k,      parameter->getParH(lev)->geometryBC.k,                             mem_size_Q_k,       cudaMemcpyHostToDevice));
 }
 void CudaMemoryManager::cudaFreeGeomBC(int lev)
 {
@@ -363,15 +357,15 @@ void CudaMemoryManager::cudaAllocPress(int lev)
 
 	//Host
 	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->pressureBC.q27[0]), parameter->getD3Qxx()*mem_size_Q_q      ));
-	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->pressureBC.k),                 mem_size_Q_k      ));
-	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->pressureBC.kN),                mem_size_Q_k      ));
-	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->pressureBC.RhoBC),             mem_size_Q_q      ));
+	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->pressureBC.k),                            mem_size_Q_k      ));
+	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->pressureBC.kN),                           mem_size_Q_k      ));
+	checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->pressureBC.RhoBC),                        mem_size_Q_q      ));
 
 	//Device
 	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->pressureBC.q27[0]),     parameter->getD3Qxx()* mem_size_Q_q     ));
-	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->pressureBC.k),                      mem_size_Q_k     ));
-	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->pressureBC.kN),                     mem_size_Q_k     ));
-	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->pressureBC.RhoBC),                  mem_size_Q_q     ));
+	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->pressureBC.k),                                 mem_size_Q_k     ));
+	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->pressureBC.kN),                                mem_size_Q_k     ));
+	checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->pressureBC.RhoBC),                             mem_size_Q_q     ));
 
 	//////////////////////////////////////////////////////////////////////////
 	double tmp = 2. * (double)mem_size_Q_k + (double)mem_size_Q_q + (double)parameter->getD3Qxx()*(double)mem_size_Q_q;
@@ -524,24 +518,24 @@ void CudaMemoryManager::cudaCopyProcessNeighborXIndex(int lev, unsigned int proc
 								cudaMemcpyHostToDevice));
 }
 void CudaMemoryManager::cudaCopyProcessNeighborXFsHD(int lev, unsigned int processNeighbor,
-                                                     const unsigned int &memsizeFsRecv, int streamIndex)
+                                                     const unsigned int &memsizeFsRecv)
 {
-    if (streamIndex == -1)
+    if (!parameter->getStreamManager()->streamIsRegistered(CudaStreamIndex::SubDomainBorder))
         checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->recvProcessNeighborX[processNeighbor].f[0],
 						 parameter->getParH(lev)->recvProcessNeighborX[processNeighbor].f[0],
 						 parameter->getD3Qxx() * memsizeFsRecv,
 						 cudaMemcpyHostToDevice));
     else
         checkCudaErrors( cudaMemcpyAsync(parameter->getParD(lev)->recvProcessNeighborX[processNeighbor].f[0],
-                         parameter->getParH(lev)->recvProcessNeighborX[processNeighbor].f[0],
-                         parameter->getD3Qxx() * memsizeFsRecv,
-                         cudaMemcpyHostToDevice,
-                         parameter->getStreamManager()->getStream(streamIndex)));
+                                         parameter->getParH(lev)->recvProcessNeighborX[processNeighbor].f[0],
+                                         parameter->getD3Qxx() * memsizeFsRecv,
+                                         cudaMemcpyHostToDevice,
+                                         parameter->getStreamManager()->getStream(CudaStreamIndex::SubDomainBorder)));
 }
 void CudaMemoryManager::cudaCopyProcessNeighborXFsDH(int lev, unsigned int processNeighbor,
-                                                     const unsigned int &memsizeFsSend, int streamIndex)
-{
-    if (streamIndex == -1)
+                                                     const unsigned int &memsizeFsSend)
+{  
+    if (!parameter->getStreamManager()->streamIsRegistered(CudaStreamIndex::SubDomainBorder))
     	checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->sendProcessNeighborX[processNeighbor].f[0],
     								parameter->getParD(lev)->sendProcessNeighborX[processNeighbor].f[0],
     								parameter->getD3Qxx() * memsizeFsSend,
@@ -551,7 +545,7 @@ void CudaMemoryManager::cudaCopyProcessNeighborXFsDH(int lev, unsigned int proce
     								     parameter->getParD(lev)->sendProcessNeighborX[processNeighbor].f[0],
     								     parameter->getD3Qxx() * memsizeFsSend,
     								     cudaMemcpyDeviceToHost,
-                                         parameter->getStreamManager()->getStream(streamIndex)));
+                                         parameter->getStreamManager()->getStream(CudaStreamIndex::SubDomainBorder)));
 }
 void CudaMemoryManager::cudaFreeProcessNeighborX(int lev, unsigned int processNeighbor)
 {
@@ -594,35 +588,33 @@ void CudaMemoryManager::cudaCopyProcessNeighborYIndex(int lev, unsigned int proc
 								parameter->getParH(lev)->recvProcessNeighborY[processNeighbor].memsizeIndex,
 								cudaMemcpyHostToDevice));
 }
-void CudaMemoryManager::cudaCopyProcessNeighborYFsHD(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsRecv,
-                                                     int streamIndex)
+void CudaMemoryManager::cudaCopyProcessNeighborYFsHD(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsRecv)
 {
-    if (streamIndex == -1)
+    if (!parameter->getStreamManager()->streamIsRegistered(CudaStreamIndex::SubDomainBorder))
 	    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->recvProcessNeighborY[processNeighbor].f[0],
 								    parameter->getParH(lev)->recvProcessNeighborY[processNeighbor].f[0],
 								    parameter->getD3Qxx() * memsizeFsRecv,
 								    cudaMemcpyHostToDevice));
     else
-        checkCudaErrors(cudaMemcpyAsync(parameter->getParD(lev)->recvProcessNeighborY[processNeighbor].f[0],
-                        parameter->getParH(lev)->recvProcessNeighborY[processNeighbor].f[0],
-                        parameter->getD3Qxx() * memsizeFsRecv,
-                        cudaMemcpyHostToDevice,
-                        parameter->getStreamManager()->getStream(streamIndex)));
+        checkCudaErrors( cudaMemcpyAsync(parameter->getParD(lev)->recvProcessNeighborY[processNeighbor].f[0],
+                                         parameter->getParH(lev)->recvProcessNeighborY[processNeighbor].f[0],
+                                         parameter->getD3Qxx() * memsizeFsRecv,
+                                         cudaMemcpyHostToDevice,
+                                         parameter->getStreamManager()->getStream(CudaStreamIndex::SubDomainBorder)));
 }
-void CudaMemoryManager::cudaCopyProcessNeighborYFsDH(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsSend,
-                                                     int streamIndex)
+void CudaMemoryManager::cudaCopyProcessNeighborYFsDH(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsSend)
 {
-    if (streamIndex == -1)
+    if (!parameter->getStreamManager()->streamIsRegistered(CudaStreamIndex::SubDomainBorder))
 	    checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->sendProcessNeighborY[processNeighbor].f[0],
 	    							parameter->getParD(lev)->sendProcessNeighborY[processNeighbor].f[0],
 	    							parameter->getD3Qxx() * memsizeFsSend,
 	    							cudaMemcpyDeviceToHost));
     else
-        checkCudaErrors(
-            cudaMemcpyAsync(parameter->getParH(lev)->sendProcessNeighborY[processNeighbor].f[0],
-                            parameter->getParD(lev)->sendProcessNeighborY[processNeighbor].f[0],
-                            parameter->getD3Qxx() * memsizeFsSend,
-                            cudaMemcpyDeviceToHost, parameter->getStreamManager()->getStream(streamIndex)));
+        checkCudaErrors( cudaMemcpyAsync(parameter->getParH(lev)->sendProcessNeighborY[processNeighbor].f[0],
+                                         parameter->getParD(lev)->sendProcessNeighborY[processNeighbor].f[0],
+                                         parameter->getD3Qxx() * memsizeFsSend,
+                                         cudaMemcpyDeviceToHost, 
+                                         parameter->getStreamManager()->getStream(CudaStreamIndex::SubDomainBorder)));
 }
 void CudaMemoryManager::cudaFreeProcessNeighborY(int lev, unsigned int processNeighbor)
 {
@@ -666,9 +658,9 @@ void CudaMemoryManager::cudaCopyProcessNeighborZIndex(int lev, unsigned int proc
 								cudaMemcpyHostToDevice));
 }
 void CudaMemoryManager::cudaCopyProcessNeighborZFsHD(int lev, unsigned int processNeighbor,
-                                                     const unsigned int &memsizeFsRecv, int streamIndex)
+                                                     const unsigned int &memsizeFsRecv)
 {
-    if (streamIndex == -1)
+    if (!parameter->getStreamManager()->streamIsRegistered(CudaStreamIndex::SubDomainBorder))
 	    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->recvProcessNeighborZ[processNeighbor].f[0],
 	    							parameter->getParH(lev)->recvProcessNeighborZ[processNeighbor].f[0],
 	    							parameter->getD3Qxx() * memsizeFsRecv,
@@ -678,12 +670,12 @@ void CudaMemoryManager::cudaCopyProcessNeighborZFsHD(int lev, unsigned int proce
 	    				                 parameter->getParH(lev)->recvProcessNeighborZ[processNeighbor].f[0],
 	    				                 parameter->getD3Qxx() * memsizeFsRecv,
 	    				                 cudaMemcpyHostToDevice,
-                                         parameter->getStreamManager()->getStream(streamIndex)));
+                                         parameter->getStreamManager()->getStream(CudaStreamIndex::SubDomainBorder)));
 }
 void CudaMemoryManager::cudaCopyProcessNeighborZFsDH(int lev, unsigned int processNeighbor,
-                                                     const unsigned int &memsizeFsSend, int streamIndex)
+                                                     const unsigned int &memsizeFsSend)
 {
-    if (streamIndex == -1)
+    if (!parameter->getStreamManager()->streamIsRegistered(CudaStreamIndex::SubDomainBorder))
         checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->sendProcessNeighborZ[processNeighbor].f[0],
 	        					    parameter->getParD(lev)->sendProcessNeighborZ[processNeighbor].f[0],
 	        					    parameter->getD3Qxx() * memsizeFsSend,
@@ -693,7 +685,7 @@ void CudaMemoryManager::cudaCopyProcessNeighborZFsDH(int lev, unsigned int proce
 	        						     parameter->getParD(lev)->sendProcessNeighborZ[processNeighbor].f[0],
 	        						     parameter->getD3Qxx() * memsizeFsSend,
 	        						     cudaMemcpyDeviceToHost,
-                                         parameter->getStreamManager()->getStream(streamIndex)));
+                                         parameter->getStreamManager()->getStream(CudaStreamIndex::SubDomainBorder)));
 }
 void CudaMemoryManager::cudaFreeProcessNeighborZ(int lev, unsigned int processNeighbor)
 {
@@ -887,17 +879,17 @@ void CudaMemoryManager::cudaFreeProcessNeighborF3Z(int lev, unsigned int process
 void CudaMemoryManager::cudaAllocNeighborWSB(int lev)
 {
     //Host
-    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->neighborInverse    ), parameter->getParH(lev)->mem_size_int_SP    ));
+    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->neighborInverse    ), parameter->getParH(lev)->memSizeLonglongLBnodes    ));
     //Device
-    checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->neighborInverse        ), parameter->getParD(lev)->mem_size_int_SP    ));
+    checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->neighborInverse        ), parameter->getParD(lev)->memSizeLonglongLBnodes    ));
     //////////////////////////////////////////////////////////////////////////
-    double tmp = (double)parameter->getParH(lev)->mem_size_int_SP;
+    double tmp = (double)parameter->getParH(lev)->memSizeLonglongLBnodes;
     setMemsizeGPU(tmp, false);
 }
 void CudaMemoryManager::cudaCopyNeighborWSB(int lev)
 {
     //copy host to device
-    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->neighborInverse,  parameter->getParH(lev)->neighborInverse,  parameter->getParH(lev)->mem_size_int_SP     , cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->neighborInverse,  parameter->getParH(lev)->neighborInverse,  parameter->getParH(lev)->memSizeLonglongLBnodes     , cudaMemcpyHostToDevice));
 }
 void CudaMemoryManager::cudaFreeNeighborWSB(int lev)
 {
@@ -907,7 +899,7 @@ void CudaMemoryManager::cudaFreeNeighborWSB(int lev)
 void CudaMemoryManager::cudaAllocTurbulentViscosity(int lev)
 {
     //Host
-    checkCudaErrors(cudaMallocHost((void**) &(parameter->getParH(lev)->turbViscosity), parameter->getParH(lev)->mem_size_real_SP));
+    checkCudaErrors(cudaMallocHost((void**) &(parameter->getParH(lev)->turbViscosity), parameter->getParH(lev)->memSizeRealLBnodes));
     //Debug
     // checkCudaErrors(cudaMallocHost((void**) &(parameter->getParH(lev)->gSij ), parameter->getParH(lev)->mem_size_real_SP));
     // checkCudaErrors(cudaMallocHost((void**) &(parameter->getParH(lev)->gSDij), parameter->getParH(lev)->mem_size_real_SP));
@@ -922,7 +914,7 @@ void CudaMemoryManager::cudaAllocTurbulentViscosity(int lev)
     // checkCudaErrors(cudaMallocHost((void**) &(parameter->getParH(lev)->gDzvz), parameter->getParH(lev)->mem_size_real_SP));
 
     //Device
-    checkCudaErrors(cudaMalloc((void**) &(parameter->getParD(lev)->turbViscosity), parameter->getParD(lev)->mem_size_real_SP));
+    checkCudaErrors(cudaMalloc((void**) &(parameter->getParD(lev)->turbViscosity), parameter->getParD(lev)->memSizeRealLBnodes));
     //Debug
     // checkCudaErrors(cudaMalloc((void**) &(parameter->getParD(lev)->gSij ), parameter->getParD(lev)->mem_size_real_SP));
     // checkCudaErrors(cudaMalloc((void**) &(parameter->getParD(lev)->gSDij), parameter->getParD(lev)->mem_size_real_SP));
@@ -937,13 +929,13 @@ void CudaMemoryManager::cudaAllocTurbulentViscosity(int lev)
     // checkCudaErrors(cudaMalloc((void**) &(parameter->getParD(lev)->gDzvz), parameter->getParD(lev)->mem_size_real_SP));
     // //////////////////////////////////////////////////////////////////////////
     // double tmp = (double)parameter->getParH(lev)->mem_size_real_SP * 12.0;
-    double tmp = (double)parameter->getParH(lev)->mem_size_real_SP;
+    double tmp = (double)parameter->getParH(lev)->memSizeRealLBnodes;
     setMemsizeGPU(tmp, false);
 }
 void CudaMemoryManager::cudaCopyTurbulentViscosityHD(int lev)
 {
     //copy host to device
-    checkCudaErrors(cudaMemcpy(parameter->getParD(lev)->turbViscosity, parameter->getParH(lev)->turbViscosity, parameter->getParH(lev)->mem_size_real_SP, cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(parameter->getParD(lev)->turbViscosity, parameter->getParH(lev)->turbViscosity, parameter->getParH(lev)->memSizeRealLBnodes, cudaMemcpyHostToDevice));
     //Debug
     // checkCudaErrors(cudaMemcpy(parameter->getParD(lev)->gSij , parameter->getParH(lev)->gSij , parameter->getParH(lev)->mem_size_real_SP, cudaMemcpyHostToDevice));
     // checkCudaErrors(cudaMemcpy(parameter->getParD(lev)->gSDij, parameter->getParH(lev)->gSDij, parameter->getParH(lev)->mem_size_real_SP, cudaMemcpyHostToDevice));
@@ -960,7 +952,7 @@ void CudaMemoryManager::cudaCopyTurbulentViscosityHD(int lev)
 void CudaMemoryManager::cudaCopyTurbulentViscosityDH(int lev)
 {
     //copy device to host
-    checkCudaErrors(cudaMemcpy(parameter->getParH(lev)->turbViscosity, parameter->getParD(lev)->turbViscosity, parameter->getParH(lev)->mem_size_real_SP, cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(parameter->getParH(lev)->turbViscosity, parameter->getParD(lev)->turbViscosity, parameter->getParH(lev)->memSizeRealLBnodes, cudaMemcpyDeviceToHost));
     //Debug
     // checkCudaErrors(cudaMemcpy(parameter->getParH(lev)->gSij , parameter->getParD(lev)->gSij , parameter->getParH(lev)->mem_size_real_SP, cudaMemcpyDeviceToHost));
     // checkCudaErrors(cudaMemcpy(parameter->getParH(lev)->gSDij, parameter->getParD(lev)->gSDij, parameter->getParH(lev)->mem_size_real_SP, cudaMemcpyDeviceToHost));
@@ -1062,29 +1054,29 @@ void CudaMemoryManager::cudaFreeTurbulenceIntensity(int lev)
 void CudaMemoryManager::cudaAllocMedianSP(int lev)
 {
     //Host
-    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->rho_SP_Med      ), parameter->getParH(lev)->mem_size_real_SP));
-    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->vx_SP_Med       ), parameter->getParH(lev)->mem_size_real_SP));
-    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->vy_SP_Med       ), parameter->getParH(lev)->mem_size_real_SP));
-    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->vz_SP_Med       ), parameter->getParH(lev)->mem_size_real_SP));
-    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->press_SP_Med    ), parameter->getParH(lev)->mem_size_real_SP));
+    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->rho_SP_Med      ), parameter->getParH(lev)->memSizeRealLBnodes));
+    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->vx_SP_Med       ), parameter->getParH(lev)->memSizeRealLBnodes));
+    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->vy_SP_Med       ), parameter->getParH(lev)->memSizeRealLBnodes));
+    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->vz_SP_Med       ), parameter->getParH(lev)->memSizeRealLBnodes));
+    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->press_SP_Med    ), parameter->getParH(lev)->memSizeRealLBnodes));
     //Device
-    checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->rho_SP_Med          ), parameter->getParD(lev)->mem_size_real_SP));
-    checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->vx_SP_Med           ), parameter->getParD(lev)->mem_size_real_SP));
-    checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->vy_SP_Med           ), parameter->getParD(lev)->mem_size_real_SP));
-    checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->vz_SP_Med           ), parameter->getParD(lev)->mem_size_real_SP));
-    checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->press_SP_Med        ), parameter->getParD(lev)->mem_size_real_SP));
+    checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->rho_SP_Med          ), parameter->getParD(lev)->memSizeRealLBnodes));
+    checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->vx_SP_Med           ), parameter->getParD(lev)->memSizeRealLBnodes));
+    checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->vy_SP_Med           ), parameter->getParD(lev)->memSizeRealLBnodes));
+    checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->vz_SP_Med           ), parameter->getParD(lev)->memSizeRealLBnodes));
+    checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->press_SP_Med        ), parameter->getParD(lev)->memSizeRealLBnodes));
     //////////////////////////////////////////////////////////////////////////
-    double tmp = 5. * (double)parameter->getParH(lev)->mem_size_real_SP;
+    double tmp = 5. * (double)parameter->getParH(lev)->memSizeRealLBnodes;
     setMemsizeGPU(tmp, false);
 }
 void CudaMemoryManager::cudaCopyMedianSP(int lev)
 {
     //copy host to device
-    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->rho_SP_Med  ,  parameter->getParH(lev)->rho_SP_Med  ,  parameter->getParH(lev)->mem_size_real_SP , cudaMemcpyHostToDevice));
-    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->vx_SP_Med   ,  parameter->getParH(lev)->vx_SP_Med   ,  parameter->getParH(lev)->mem_size_real_SP , cudaMemcpyHostToDevice));
-    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->vy_SP_Med   ,  parameter->getParH(lev)->vy_SP_Med   ,  parameter->getParH(lev)->mem_size_real_SP , cudaMemcpyHostToDevice));
-    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->vz_SP_Med   ,  parameter->getParH(lev)->vz_SP_Med   ,  parameter->getParH(lev)->mem_size_real_SP , cudaMemcpyHostToDevice));
-    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->press_SP_Med,  parameter->getParH(lev)->press_SP_Med,  parameter->getParH(lev)->mem_size_real_SP , cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->rho_SP_Med  ,  parameter->getParH(lev)->rho_SP_Med  ,  parameter->getParH(lev)->memSizeRealLBnodes , cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->vx_SP_Med   ,  parameter->getParH(lev)->vx_SP_Med   ,  parameter->getParH(lev)->memSizeRealLBnodes , cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->vy_SP_Med   ,  parameter->getParH(lev)->vy_SP_Med   ,  parameter->getParH(lev)->memSizeRealLBnodes , cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->vz_SP_Med   ,  parameter->getParH(lev)->vz_SP_Med   ,  parameter->getParH(lev)->memSizeRealLBnodes , cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->press_SP_Med,  parameter->getParH(lev)->press_SP_Med,  parameter->getParH(lev)->memSizeRealLBnodes , cudaMemcpyHostToDevice));
 }
 void CudaMemoryManager::cudaFreeMedianSP(int lev)
 {
@@ -1097,11 +1089,11 @@ void CudaMemoryManager::cudaFreeMedianSP(int lev)
 void CudaMemoryManager::cudaAllocMedianOut(int lev)
 {
     //Host
-    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->rho_SP_Med_Out      ), parameter->getParH(lev)->mem_size_real_SP));
-    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->vx_SP_Med_Out       ), parameter->getParH(lev)->mem_size_real_SP));
-    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->vy_SP_Med_Out       ), parameter->getParH(lev)->mem_size_real_SP));
-    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->vz_SP_Med_Out       ), parameter->getParH(lev)->mem_size_real_SP));
-    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->press_SP_Med_Out    ), parameter->getParH(lev)->mem_size_real_SP));
+    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->rho_SP_Med_Out      ), parameter->getParH(lev)->memSizeRealLBnodes));
+    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->vx_SP_Med_Out       ), parameter->getParH(lev)->memSizeRealLBnodes));
+    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->vy_SP_Med_Out       ), parameter->getParH(lev)->memSizeRealLBnodes));
+    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->vz_SP_Med_Out       ), parameter->getParH(lev)->memSizeRealLBnodes));
+    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->press_SP_Med_Out    ), parameter->getParH(lev)->memSizeRealLBnodes));
 }
 void CudaMemoryManager::cudaFreeMedianOut(int lev)
 {
@@ -1655,6 +1647,133 @@ void CudaMemoryManager::cudaFreeWallModel(int lev, bool hasWallModelMonitor)
     }
 }
 
+
+// Precursor BC: allocates the page-locked host buffers and the device buffers of precursorBC on grid level lev.
+void CudaMemoryManager::cudaAllocPrecursorBC(int lev)
+{
+    // Byte sizes are held in size_t so that the getD3Qxx() multiples below cannot wrap around at 32 bits.
+    const size_t memSizeQInt = parameter->getParH(lev)->precursorBC.numberOfBCnodes*sizeof(int);
+    const size_t memSizeQUint = parameter->getParH(lev)->precursorBC.numberOfBCnodes*sizeof(uint);
+    const size_t memSizeQReal = parameter->getParH(lev)->precursorBC.numberOfBCnodes*sizeof(real);
+
+    // Host side (cudaMallocHost): q27[0] receives one allocation of getD3Qxx()*memSizeQReal bytes.
+    checkCudaErrors( cudaMallocHost((void**) &parameter->getParH(lev)->precursorBC.k, memSizeQInt));
+    checkCudaErrors( cudaMallocHost((void**) &parameter->getParH(lev)->precursorBC.q27[0], parameter->getD3Qxx()*memSizeQReal));
+
+    checkCudaErrors( cudaMallocHost((void**) &parameter->getParH(lev)->precursorBC.planeNeighbor0PP, memSizeQUint));
+    checkCudaErrors( cudaMallocHost((void**) &parameter->getParH(lev)->precursorBC.planeNeighbor0PM, memSizeQUint));
+    checkCudaErrors( cudaMallocHost((void**) &parameter->getParH(lev)->precursorBC.planeNeighbor0MP, memSizeQUint));
+    checkCudaErrors( cudaMallocHost((void**) &parameter->getParH(lev)->precursorBC.planeNeighbor0MM, memSizeQUint));
+    checkCudaErrors( cudaMallocHost((void**) &parameter->getParH(lev)->precursorBC.weights0PP, memSizeQReal));
+    checkCudaErrors( cudaMallocHost((void**) &parameter->getParH(lev)->precursorBC.weights0PM, memSizeQReal));
+    checkCudaErrors( cudaMallocHost((void**) &parameter->getParH(lev)->precursorBC.weights0MP, memSizeQReal));
+    checkCudaErrors( cudaMallocHost((void**) &parameter->getParH(lev)->precursorBC.weights0MM, memSizeQReal));
+
+    // Device side (cudaMalloc): the same set of buffers with the same byte sizes.
+    checkCudaErrors( cudaMalloc((void**) &parameter->getParD(lev)->precursorBC.k, memSizeQInt));
+    checkCudaErrors( cudaMalloc((void**) &parameter->getParD(lev)->precursorBC.q27[0], parameter->getD3Qxx()*memSizeQReal));
+
+    checkCudaErrors( cudaMalloc((void**) &parameter->getParD(lev)->precursorBC.planeNeighbor0PP, memSizeQUint));
+    checkCudaErrors( cudaMalloc((void**) &parameter->getParD(lev)->precursorBC.planeNeighbor0PM, memSizeQUint));
+    checkCudaErrors( cudaMalloc((void**) &parameter->getParD(lev)->precursorBC.planeNeighbor0MP, memSizeQUint));
+    checkCudaErrors( cudaMalloc((void**) &parameter->getParD(lev)->precursorBC.planeNeighbor0MM, memSizeQUint));
+    checkCudaErrors( cudaMalloc((void**) &parameter->getParD(lev)->precursorBC.weights0PP, memSizeQReal));
+    checkCudaErrors( cudaMalloc((void**) &parameter->getParD(lev)->precursorBC.weights0PM, memSizeQReal));
+    checkCudaErrors( cudaMalloc((void**) &parameter->getParD(lev)->precursorBC.weights0MP, memSizeQReal));
+    checkCudaErrors( cudaMalloc((void**) &parameter->getParD(lev)->precursorBC.weights0MM, memSizeQReal));
+
+    // Byte total of one buffer set (k, 4 neighbor arrays, 4 weight arrays, q27); double keeps it exact where a single-precision real would round.
+    const double memSize = (double)(memSizeQInt+4*memSizeQUint+(4+parameter->getD3Qxx())*memSizeQReal);
+    setMemsizeGPU(memSize, false);
+}
+
+// Allocates the last/current/next buffers of precursorBC on level lev, once on the host and once on the device.
+void CudaMemoryManager::cudaAllocPrecursorData(int lev)
+{
+    size_t size = parameter->getParH(lev)->precursorBC.numberOfPrecursorNodes*sizeof(real)*parameter->getParH(lev)->precursorBC.numberOfQuantities;
+    // Host buffers (cudaMallocHost), each `size` bytes: numberOfPrecursorNodes * numberOfQuantities reals.
+    checkCudaErrors( cudaMallocHost((void**) &parameter->getParH(lev)->precursorBC.last, size));
+    checkCudaErrors( cudaMallocHost((void**) &parameter->getParH(lev)->precursorBC.current, size));
+    checkCudaErrors( cudaMallocHost((void**) &parameter->getParH(lev)->precursorBC.next, size));
+    // Device buffers (cudaMalloc) of the same size.
+    checkCudaErrors( cudaMalloc((void**) &parameter->getParD(lev)->precursorBC.last, size));
+    checkCudaErrors( cudaMalloc((void**) &parameter->getParD(lev)->precursorBC.current, size));
+    checkCudaErrors( cudaMalloc((void**) &parameter->getParD(lev)->precursorBC.next, size));
+    setMemsizeGPU(3*size, false); // three device buffers of `size` bytes each
+}
+
+// Copies every precursorBC buffer allocated by cudaAllocPrecursorBC from host to device for level lev.
+void CudaMemoryManager::cudaCopyPrecursorBC(int lev)
+{
+    // Byte sizes are held in size_t so that memSizeQReal*getD3Qxx() cannot wrap around at 32 bits.
+    const size_t memSizeQInt = parameter->getParH(lev)->precursorBC.numberOfBCnodes*sizeof(int);
+    const size_t memSizeQUint = parameter->getParH(lev)->precursorBC.numberOfBCnodes*sizeof(uint);
+    const size_t memSizeQReal = parameter->getParH(lev)->precursorBC.numberOfBCnodes*sizeof(real);
+
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->precursorBC.k, parameter->getParH(lev)->precursorBC.k, memSizeQInt, cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->precursorBC.q27[0], parameter->getParH(lev)->precursorBC.q27[0], memSizeQReal*parameter->getD3Qxx(), cudaMemcpyHostToDevice));
+
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->precursorBC.planeNeighbor0PP, parameter->getParH(lev)->precursorBC.planeNeighbor0PP, memSizeQUint, cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->precursorBC.planeNeighbor0PM, parameter->getParH(lev)->precursorBC.planeNeighbor0PM, memSizeQUint, cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->precursorBC.planeNeighbor0MP, parameter->getParH(lev)->precursorBC.planeNeighbor0MP, memSizeQUint, cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->precursorBC.planeNeighbor0MM, parameter->getParH(lev)->precursorBC.planeNeighbor0MM, memSizeQUint, cudaMemcpyHostToDevice));
+
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->precursorBC.weights0PP, parameter->getParH(lev)->precursorBC.weights0PP, memSizeQReal, cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->precursorBC.weights0PM, parameter->getParH(lev)->precursorBC.weights0PM, memSizeQReal, cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->precursorBC.weights0MP, parameter->getParH(lev)->precursorBC.weights0MP, memSizeQReal, cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->precursorBC.weights0MM, parameter->getParH(lev)->precursorBC.weights0MM, memSizeQReal, cudaMemcpyHostToDevice));
+}
+void CudaMemoryManager::cudaCopyPrecursorData(int lev) // uploads the host precursorBC.next buffer of level lev to the device on the Precursor stream
+{
+    auto prec = &parameter->getParH(lev)->precursorBC;
+    auto precStream = parameter->getStreamManager()->getStream(CudaStreamIndex::Precursor);
+    size_t memSize = prec->numberOfPrecursorNodes*sizeof(real)*prec->numberOfQuantities;
+    checkCudaErrors( cudaStreamSynchronize(precStream) ); // wait until work already queued on the Precursor stream has finished
+    checkCudaErrors( cudaMemcpyAsync(parameter->getParD(lev)->precursorBC.next, prec->next, memSize, cudaMemcpyHostToDevice, precStream) ); // no synchronize follows here; NOTE(review): presumably the caller syncs before reusing prec->next - confirm
+}
+
+// Releases the host and device precursorBC buffers of level lev: k, q27[0], the four planeNeighbor arrays and the four weights arrays.
+void CudaMemoryManager::cudaFreePrecursorBC(int lev)
+{
+    checkCudaErrors( cudaFreeHost( parameter->getParH(lev)->precursorBC.k)); // host buffers (getParH) are released with cudaFreeHost
+
+    checkCudaErrors( cudaFreeHost( parameter->getParH(lev)->precursorBC.q27[0]));
+
+    checkCudaErrors( cudaFreeHost( parameter->getParH(lev)->precursorBC.planeNeighbor0PP));
+    checkCudaErrors( cudaFreeHost( parameter->getParH(lev)->precursorBC.planeNeighbor0PM));
+    checkCudaErrors( cudaFreeHost( parameter->getParH(lev)->precursorBC.planeNeighbor0MP));
+    checkCudaErrors( cudaFreeHost( parameter->getParH(lev)->precursorBC.planeNeighbor0MM));
+
+    checkCudaErrors( cudaFreeHost( parameter->getParH(lev)->precursorBC.weights0PP));
+    checkCudaErrors( cudaFreeHost( parameter->getParH(lev)->precursorBC.weights0PM));
+    checkCudaErrors( cudaFreeHost( parameter->getParH(lev)->precursorBC.weights0MP));
+    checkCudaErrors( cudaFreeHost( parameter->getParH(lev)->precursorBC.weights0MM));
+
+    checkCudaErrors( cudaFree( parameter->getParD(lev)->precursorBC.k)); // device buffers (getParD) are released with cudaFree
+
+    checkCudaErrors( cudaFree( parameter->getParD(lev)->precursorBC.q27[0]));
+
+    checkCudaErrors( cudaFree( parameter->getParD(lev)->precursorBC.planeNeighbor0PP));
+    checkCudaErrors( cudaFree( parameter->getParD(lev)->precursorBC.planeNeighbor0PM));
+    checkCudaErrors( cudaFree( parameter->getParD(lev)->precursorBC.planeNeighbor0MP));
+    checkCudaErrors( cudaFree( parameter->getParD(lev)->precursorBC.planeNeighbor0MM));
+
+    checkCudaErrors( cudaFree( parameter->getParD(lev)->precursorBC.weights0PP));
+    checkCudaErrors( cudaFree( parameter->getParD(lev)->precursorBC.weights0PM));
+    checkCudaErrors( cudaFree( parameter->getParD(lev)->precursorBC.weights0MP));
+    checkCudaErrors( cudaFree( parameter->getParD(lev)->precursorBC.weights0MM));
+}
+// Releases the last/current/next precursorBC buffers of level lev on the host (cudaFreeHost) and on the device (cudaFree).
+void CudaMemoryManager::cudaFreePrecursorData(int lev)
+{
+    checkCudaErrors( cudaFreeHost( parameter->getParH(lev)->precursorBC.last));
+    checkCudaErrors( cudaFreeHost( parameter->getParH(lev)->precursorBC.current));
+    checkCudaErrors( cudaFreeHost( parameter->getParH(lev)->precursorBC.next));
+
+    checkCudaErrors( cudaFree( parameter->getParD(lev)->precursorBC.last));
+    checkCudaErrors( cudaFree( parameter->getParD(lev)->precursorBC.current));
+    checkCudaErrors( cudaFree( parameter->getParD(lev)->precursorBC.next));
+}
 //Test roundoff error
 void CudaMemoryManager::cudaAllocTestRE(int lev, unsigned int size)
 {
@@ -1913,15 +2032,15 @@ void CudaMemoryManager::cudaFreeMeasurePointsIndex(int lev)
 }
 void CudaMemoryManager::cudaAllocFsForCheckPointAndRestart(int lev)
 {
-    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->distributions.f[0] ),           (unsigned long long)parameter->getD3Qxx()*(unsigned long long)parameter->getParH(lev)->mem_size_real_SP));
+    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->distributions.f[0] ),           (unsigned long long)parameter->getD3Qxx()*(unsigned long long)parameter->getParH(lev)->memSizeRealLBnodes));
 }
 void CudaMemoryManager::cudaCopyFsForRestart(int lev)
 {
-    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->distributions.f[0],  parameter->getParH(lev)->distributions.f[0],     (unsigned long long)parameter->getD3Qxx()*(unsigned long long)parameter->getParH(lev)->mem_size_real_SP , cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->distributions.f[0],  parameter->getParH(lev)->distributions.f[0],     (unsigned long long)parameter->getD3Qxx()*(unsigned long long)parameter->getParH(lev)->memSizeRealLBnodes , cudaMemcpyHostToDevice));
 }
 void CudaMemoryManager::cudaCopyFsForCheckPoint(int lev)
 {
-    checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->distributions.f[0],  parameter->getParD(lev)->distributions.f[0],     (unsigned long long)parameter->getD3Qxx()*(unsigned long long)parameter->getParH(lev)->mem_size_real_SP , cudaMemcpyDeviceToHost));
+    checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->distributions.f[0],  parameter->getParD(lev)->distributions.f[0],     (unsigned long long)parameter->getD3Qxx()*(unsigned long long)parameter->getParH(lev)->memSizeRealLBnodes , cudaMemcpyDeviceToHost));
 }
 void CudaMemoryManager::cudaFreeFsForCheckPointAndRestart(int lev)
 {
@@ -2400,20 +2519,20 @@ void CudaMemoryManager::cudaFreePorousMedia(PorousMedia* pm, int lev)
 void CudaMemoryManager::cudaAllocConcentration(int lev)
 {
     //Host
-    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->Conc), parameter->getParH(lev)->mem_size_real_SP));
+    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->Conc), parameter->getParH(lev)->memSizeRealLBnodes));
     //Device
-    checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->Conc), parameter->getParD(lev)->mem_size_real_SP));
+    checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->Conc), parameter->getParD(lev)->memSizeRealLBnodes));
     //////////////////////////////////////////////////////////////////////////
-    double tmp = (double)parameter->getParH(lev)->mem_size_real_SP;
+    double tmp = (double)parameter->getParH(lev)->memSizeRealLBnodes;
     setMemsizeGPU(tmp, false);
 }
 void CudaMemoryManager::cudaCopyConcentrationDeviceToHost(int lev)
 {
-    checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->Conc, parameter->getParD(lev)->Conc,  parameter->getParH(lev)->mem_size_real_SP , cudaMemcpyDeviceToHost));
+    checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->Conc, parameter->getParD(lev)->Conc,  parameter->getParH(lev)->memSizeRealLBnodes , cudaMemcpyDeviceToHost));
 }
 void CudaMemoryManager::cudaCopyConcentrationHostToDevice(int lev)
 {
-    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->Conc, parameter->getParH(lev)->Conc, parameter->getParH(lev)->mem_size_real_SP, cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->Conc, parameter->getParH(lev)->Conc, parameter->getParH(lev)->memSizeRealLBnodes, cudaMemcpyHostToDevice));
 }
 void CudaMemoryManager::cudaFreeConcentration(int lev)
 {
@@ -2425,14 +2544,14 @@ void CudaMemoryManager::cudaAllocTempFs(int lev)
     //Device
     if (parameter->getDiffMod() == 7)
     {
-        checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->distributionsAD7.f[0]), parameter->getDiffMod()*parameter->getParH(lev)->mem_size_real_SP));
+        checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->distributionsAD7.f[0]), parameter->getDiffMod()*parameter->getParH(lev)->memSizeRealLBnodes));
     }
     else if (parameter->getDiffMod() == 27)
     {
-        checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->distributionsAD27.f[0]), parameter->getDiffMod()*parameter->getParH(lev)->mem_size_real_SP));
+        checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->distributionsAD27.f[0]), parameter->getDiffMod()*parameter->getParH(lev)->memSizeRealLBnodes));
     }
     //////////////////////////////////////////////////////////////////////////
-    double tmp = (double)(parameter->getDiffMod() * parameter->getParH(lev)->mem_size_real_SP);
+    double tmp = (double)(parameter->getDiffMod() * parameter->getParH(lev)->memSizeRealLBnodes);
     setMemsizeGPU(tmp, false);
 }
 //////////////////////////////////////////////////////////////////////////
@@ -2627,12 +2746,12 @@ void CudaMemoryManager::cudaFreeConcFile(int lev)
 void CudaMemoryManager::cudaAllocMedianOutAD(int lev)
 {
 	//Host
-	checkCudaErrors(cudaMallocHost((void**) &(parameter->getParH(lev)->rho_SP_Med_Out),   parameter->getParH(lev)->mem_size_real_SP));
-	checkCudaErrors(cudaMallocHost((void**) &(parameter->getParH(lev)->vx_SP_Med_Out),    parameter->getParH(lev)->mem_size_real_SP));
-	checkCudaErrors(cudaMallocHost((void**) &(parameter->getParH(lev)->vy_SP_Med_Out),    parameter->getParH(lev)->mem_size_real_SP));
-	checkCudaErrors(cudaMallocHost((void**) &(parameter->getParH(lev)->vz_SP_Med_Out),    parameter->getParH(lev)->mem_size_real_SP));
-	checkCudaErrors(cudaMallocHost((void**) &(parameter->getParH(lev)->press_SP_Med_Out), parameter->getParH(lev)->mem_size_real_SP));
-	checkCudaErrors(cudaMallocHost((void**) &(parameter->getParH(lev)->Conc_Med_Out),     parameter->getParH(lev)->mem_size_real_SP));
+	checkCudaErrors(cudaMallocHost((void**) &(parameter->getParH(lev)->rho_SP_Med_Out),   parameter->getParH(lev)->memSizeRealLBnodes));
+	checkCudaErrors(cudaMallocHost((void**) &(parameter->getParH(lev)->vx_SP_Med_Out),    parameter->getParH(lev)->memSizeRealLBnodes));
+	checkCudaErrors(cudaMallocHost((void**) &(parameter->getParH(lev)->vy_SP_Med_Out),    parameter->getParH(lev)->memSizeRealLBnodes));
+	checkCudaErrors(cudaMallocHost((void**) &(parameter->getParH(lev)->vz_SP_Med_Out),    parameter->getParH(lev)->memSizeRealLBnodes));
+	checkCudaErrors(cudaMallocHost((void**) &(parameter->getParH(lev)->press_SP_Med_Out), parameter->getParH(lev)->memSizeRealLBnodes));
+	checkCudaErrors(cudaMallocHost((void**) &(parameter->getParH(lev)->Conc_Med_Out),     parameter->getParH(lev)->memSizeRealLBnodes));
 }
 void CudaMemoryManager::cudaFreeMedianOutAD(int lev)
 {
@@ -2886,31 +3005,31 @@ void CudaMemoryManager::cudaFreeProcessNeighborADZ(int lev, unsigned int process
 void CudaMemoryManager::cudaAlloc2ndOrderDerivitivesIsoTest(int lev)
 {
     //Host
-    checkCudaErrors(cudaMallocHost((void**) &(parameter->getParH(lev)->dxxUx), parameter->getParH(lev)->mem_size_real_SP));
-    checkCudaErrors(cudaMallocHost((void**) &(parameter->getParH(lev)->dyyUy), parameter->getParH(lev)->mem_size_real_SP));
-    checkCudaErrors(cudaMallocHost((void**) &(parameter->getParH(lev)->dzzUz), parameter->getParH(lev)->mem_size_real_SP));
+    checkCudaErrors(cudaMallocHost((void**) &(parameter->getParH(lev)->dxxUx), parameter->getParH(lev)->memSizeRealLBnodes));
+    checkCudaErrors(cudaMallocHost((void**) &(parameter->getParH(lev)->dyyUy), parameter->getParH(lev)->memSizeRealLBnodes));
+    checkCudaErrors(cudaMallocHost((void**) &(parameter->getParH(lev)->dzzUz), parameter->getParH(lev)->memSizeRealLBnodes));
     //Device (spinning ship)
-    checkCudaErrors(cudaMalloc((void**) &(parameter->getParD(lev)->dxxUx), parameter->getParH(lev)->mem_size_real_SP));
-    checkCudaErrors(cudaMalloc((void**) &(parameter->getParD(lev)->dyyUy), parameter->getParH(lev)->mem_size_real_SP));
-    checkCudaErrors(cudaMalloc((void**) &(parameter->getParD(lev)->dzzUz), parameter->getParH(lev)->mem_size_real_SP));
+    checkCudaErrors(cudaMalloc((void**) &(parameter->getParD(lev)->dxxUx), parameter->getParH(lev)->memSizeRealLBnodes));
+    checkCudaErrors(cudaMalloc((void**) &(parameter->getParD(lev)->dyyUy), parameter->getParH(lev)->memSizeRealLBnodes));
+    checkCudaErrors(cudaMalloc((void**) &(parameter->getParD(lev)->dzzUz), parameter->getParH(lev)->memSizeRealLBnodes));
     //////////////////////////////////////////////////////////////////////////
-    double tmp = 3. * (double)parameter->getParH(lev)->mem_size_real_SP;
+    double tmp = 3. * (double)parameter->getParH(lev)->memSizeRealLBnodes;
     setMemsizeGPU(tmp, false);
     //printf("Coord = %f MB",tmp/1000000.);
 }
 void CudaMemoryManager::cudaCopy2ndOrderDerivitivesIsoTestDH(int lev)
 {
     //copy device to host
-    checkCudaErrors(cudaMemcpy(parameter->getParH(lev)->dxxUx, parameter->getParD(lev)->dxxUx, parameter->getParH(lev)->mem_size_real_SP, cudaMemcpyDeviceToHost));
-    checkCudaErrors(cudaMemcpy(parameter->getParH(lev)->dyyUy, parameter->getParD(lev)->dyyUy, parameter->getParH(lev)->mem_size_real_SP, cudaMemcpyDeviceToHost));
-    checkCudaErrors(cudaMemcpy(parameter->getParH(lev)->dzzUz, parameter->getParD(lev)->dzzUz, parameter->getParH(lev)->mem_size_real_SP, cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(parameter->getParH(lev)->dxxUx, parameter->getParD(lev)->dxxUx, parameter->getParH(lev)->memSizeRealLBnodes, cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(parameter->getParH(lev)->dyyUy, parameter->getParD(lev)->dyyUy, parameter->getParH(lev)->memSizeRealLBnodes, cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(parameter->getParH(lev)->dzzUz, parameter->getParD(lev)->dzzUz, parameter->getParH(lev)->memSizeRealLBnodes, cudaMemcpyDeviceToHost));
 }
 void CudaMemoryManager::cudaCopy2ndOrderDerivitivesIsoTestHD(int lev)
 {
     //copy host to device
-    checkCudaErrors(cudaMemcpy(parameter->getParD(lev)->dxxUx, parameter->getParH(lev)->dxxUx, parameter->getParH(lev)->mem_size_real_SP, cudaMemcpyHostToDevice));
-    checkCudaErrors(cudaMemcpy(parameter->getParD(lev)->dyyUy, parameter->getParH(lev)->dyyUy, parameter->getParH(lev)->mem_size_real_SP, cudaMemcpyHostToDevice));
-    checkCudaErrors(cudaMemcpy(parameter->getParD(lev)->dzzUz, parameter->getParH(lev)->dzzUz, parameter->getParH(lev)->mem_size_real_SP, cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(parameter->getParD(lev)->dxxUx, parameter->getParH(lev)->dxxUx, parameter->getParH(lev)->memSizeRealLBnodes, cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(parameter->getParD(lev)->dyyUy, parameter->getParH(lev)->dyyUy, parameter->getParH(lev)->memSizeRealLBnodes, cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(parameter->getParD(lev)->dzzUz, parameter->getParH(lev)->dzzUz, parameter->getParH(lev)->memSizeRealLBnodes, cudaMemcpyHostToDevice));
 
 }
 void CudaMemoryManager::cudaFree2ndOrderDerivitivesIsoTest(int lev)
@@ -2921,231 +3040,297 @@ void CudaMemoryManager::cudaFree2ndOrderDerivitivesIsoTest(int lev)
 
 }
 
-void CudaMemoryManager::cudaAllocFluidNodeIndices(int lev) {
-    uint mem_size_geo_fluid_nodes = sizeof(uint) * parameter->getParH(lev)->numberOfFluidNodes;
+void CudaMemoryManager::cudaAllocTaggedFluidNodeIndices(CollisionTemplate tag, int lev) {
+    uint mem_size_tagged_fluid_nodes = sizeof(uint) * parameter->getParH(lev)->numberOfTaggedFluidNodes[tag];
     // Host
-    checkCudaErrors(cudaMallocHost((void **)&(parameter->getParH(lev)->fluidNodeIndices), mem_size_geo_fluid_nodes));
+    checkCudaErrors(cudaMallocHost((void **)&(parameter->getParH(lev)->taggedFluidNodeIndices[tag]), mem_size_tagged_fluid_nodes));
     // Device
-    checkCudaErrors(cudaMalloc((void **)&(parameter->getParD(lev)->fluidNodeIndices), mem_size_geo_fluid_nodes));
+    checkCudaErrors(cudaMalloc((void **)&(parameter->getParD(lev)->taggedFluidNodeIndices[tag]), mem_size_tagged_fluid_nodes));
     //////////////////////////////////////////////////////////////////////////
-    setMemsizeGPU((double)mem_size_geo_fluid_nodes, false);
+    setMemsizeGPU((double)mem_size_tagged_fluid_nodes, false);
 }
 
-void CudaMemoryManager::cudaCopyFluidNodeIndices(int lev) {
-    uint mem_size_geo_fluid_nodes = sizeof(uint) * parameter->getParH(lev)->numberOfFluidNodes;
-    checkCudaErrors(cudaMemcpy(parameter->getParD(lev)->fluidNodeIndices,
-                               parameter->getParH(lev)->fluidNodeIndices,
-                               mem_size_geo_fluid_nodes, cudaMemcpyHostToDevice));
+void CudaMemoryManager::cudaCopyTaggedFluidNodeIndices(CollisionTemplate tag, int lev) {
+    uint mem_size_tagged_fluid_nodes = sizeof(uint) * parameter->getParH(lev)->numberOfTaggedFluidNodes[tag];
+    checkCudaErrors(cudaMemcpy(parameter->getParD(lev)->taggedFluidNodeIndices[tag],
+                               parameter->getParH(lev)->taggedFluidNodeIndices[tag],
+                               mem_size_tagged_fluid_nodes, cudaMemcpyHostToDevice));
 }
 
-void CudaMemoryManager::cudaFreeFluidNodeIndices(int lev) {
-    checkCudaErrors(cudaFreeHost(parameter->getParH(lev)->fluidNodeIndices));
+void CudaMemoryManager::cudaFreeTaggedFluidNodeIndices(CollisionTemplate tag, int lev) {
+    checkCudaErrors(cudaFreeHost(parameter->getParH(lev)->taggedFluidNodeIndices[tag]));
 }
 
-void CudaMemoryManager::cudaAllocFluidNodeIndicesBorder(int lev) {
-    uint mem_size_fluid_nodes_border = sizeof(uint) * parameter->getParH(lev)->numberOfFluidNodesBorder;
-    // Host
-    checkCudaErrors(
-        cudaMallocHost((void **)&(parameter->getParH(lev)->fluidNodeIndicesBorder), mem_size_fluid_nodes_border));
-    // Device
-    checkCudaErrors(
-        cudaMalloc((void **)&(parameter->getParD(lev)->fluidNodeIndicesBorder), mem_size_fluid_nodes_border));
-    //////////////////////////////////////////////////////////////////////////
-    setMemsizeGPU((double)mem_size_fluid_nodes_border, false);
-}
+////////////////////////////////////////////////////////////////////////////////////
+//  ActuatorFarm
+///////////////////////////////////////////////////////////////////////////////
+void CudaMemoryManager::cudaAllocBladeGeometries(ActuatorFarm* actuatorFarm)
+{
+    uint sizeRealTurbine = sizeof(real)*actuatorFarm->getNumberOfTurbines();
+    checkCudaErrors( cudaMallocHost((void**) &actuatorFarm->bladeRadiiH, sizeRealTurbine*actuatorFarm->getNumberOfNodesPerBlade()) );
+    checkCudaErrors( cudaMallocHost((void**) &actuatorFarm->diametersH, sizeRealTurbine) );
+    checkCudaErrors( cudaMallocHost((void**) &actuatorFarm->turbinePosXH, sizeRealTurbine) );
+    checkCudaErrors( cudaMallocHost((void**) &actuatorFarm->turbinePosYH, sizeRealTurbine) );
+    checkCudaErrors( cudaMallocHost((void**) &actuatorFarm->turbinePosZH, sizeRealTurbine) );
+
+    checkCudaErrors( cudaMalloc((void**) &actuatorFarm->bladeRadiiD, sizeRealTurbine*actuatorFarm->getNumberOfNodesPerBlade()) );
+    checkCudaErrors( cudaMalloc((void**) &actuatorFarm->diametersD, sizeRealTurbine) );
+    checkCudaErrors( cudaMalloc((void**) &actuatorFarm->turbinePosXD, sizeRealTurbine) );
+    checkCudaErrors( cudaMalloc((void**) &actuatorFarm->turbinePosYD, sizeRealTurbine) );
+    checkCudaErrors( cudaMalloc((void**) &actuatorFarm->turbinePosZD, sizeRealTurbine) );
+    setMemsizeGPU(sizeof(real)*(actuatorFarm->getNumberOfNodesPerBlade()+4)*actuatorFarm->getNumberOfTurbines(), false);
 
-void CudaMemoryManager::cudaCopyFluidNodeIndicesBorder(int lev) {
-    uint mem_size_fluid_nodes_border = sizeof(uint) * parameter->getParH(lev)->numberOfFluidNodesBorder;
-    checkCudaErrors(cudaMemcpy(parameter->getParD(lev)->fluidNodeIndicesBorder,
-                               parameter->getParH(lev)->fluidNodeIndicesBorder,
-                               mem_size_fluid_nodes_border, cudaMemcpyHostToDevice));
 }
+void CudaMemoryManager::cudaCopyBladeGeometriesHtoD(ActuatorFarm* actuatorFarm)
+{
+    uint sizeRealTurbine = sizeof(real)*actuatorFarm->getNumberOfTurbines();
+    checkCudaErrors( cudaMemcpy(actuatorFarm->bladeRadiiD, actuatorFarm->bladeRadiiH, sizeRealTurbine*actuatorFarm->getNumberOfNodesPerBlade(), cudaMemcpyHostToDevice) );
+    checkCudaErrors( cudaMemcpy(actuatorFarm->diametersD, actuatorFarm->diametersH, sizeRealTurbine, cudaMemcpyHostToDevice) );
+    checkCudaErrors( cudaMemcpy(actuatorFarm->turbinePosXD, actuatorFarm->turbinePosXH, sizeRealTurbine, cudaMemcpyHostToDevice) );
+    checkCudaErrors( cudaMemcpy(actuatorFarm->turbinePosYD, actuatorFarm->turbinePosYH, sizeRealTurbine, cudaMemcpyHostToDevice) );
+    checkCudaErrors( cudaMemcpy(actuatorFarm->turbinePosZD, actuatorFarm->turbinePosZH, sizeRealTurbine, cudaMemcpyHostToDevice) );
 
-void CudaMemoryManager::cudaFreeFluidNodeIndicesBorder(int lev) {
-    checkCudaErrors(cudaFreeHost(parameter->getParH(lev)->fluidNodeIndicesBorder));
 }
+void CudaMemoryManager::cudaCopyBladeGeometriesDtoH(ActuatorFarm* actuatorFarm)
+{
+    uint sizeRealTurbine = sizeof(real)*actuatorFarm->getNumberOfTurbines();
+    checkCudaErrors( cudaMemcpy(actuatorFarm->bladeRadiiH, actuatorFarm->bladeRadiiD, sizeRealTurbine*actuatorFarm->getNumberOfNodesPerBlade(), cudaMemcpyDeviceToHost) );
+    checkCudaErrors( cudaMemcpy(actuatorFarm->diametersH, actuatorFarm->diametersD, sizeRealTurbine, cudaMemcpyDeviceToHost) );
+    checkCudaErrors( cudaMemcpy(actuatorFarm->turbinePosXH, actuatorFarm->turbinePosXD, sizeRealTurbine, cudaMemcpyDeviceToHost) );
+    checkCudaErrors( cudaMemcpy(actuatorFarm->turbinePosYH, actuatorFarm->turbinePosYD, sizeRealTurbine, cudaMemcpyDeviceToHost) );
+    checkCudaErrors( cudaMemcpy(actuatorFarm->turbinePosZH, actuatorFarm->turbinePosZD, sizeRealTurbine, cudaMemcpyDeviceToHost) );
 
-////////////////////////////////////////////////////////////////////////////////////
-//  ActuatorLine
-///////////////////////////////////////////////////////////////////////////////
+}
+void CudaMemoryManager::cudaFreeBladeGeometries(ActuatorFarm* actuatorFarm)
+{
+    checkCudaErrors( cudaFree(actuatorFarm->bladeRadiiD) );
+    checkCudaErrors( cudaFree(actuatorFarm->diametersD) );
+    checkCudaErrors( cudaFree(actuatorFarm->turbinePosXD) );
+    checkCudaErrors( cudaFree(actuatorFarm->turbinePosYD) );
+    checkCudaErrors( cudaFree(actuatorFarm->turbinePosZD) );    
+    
+    checkCudaErrors( cudaFreeHost(actuatorFarm->bladeRadiiH) );
+    checkCudaErrors( cudaFreeHost(actuatorFarm->diametersH) );
+    checkCudaErrors( cudaFreeHost(actuatorFarm->turbinePosXH) );
+    checkCudaErrors( cudaFreeHost(actuatorFarm->turbinePosYH) );
+    checkCudaErrors( cudaFreeHost(actuatorFarm->turbinePosZH) );
+}
 
-void CudaMemoryManager::cudaAllocBladeRadii(ActuatorLine* actuatorLine)
+void CudaMemoryManager::cudaAllocBladeOrientations(ActuatorFarm* actuatorFarm)
 {
-    checkCudaErrors( cudaMallocHost((void**) &actuatorLine->bladeRadiiH, sizeof(real)*actuatorLine->getNBladeNodes()) );
+    uint sizeRealTurbine = sizeof(real)*actuatorFarm->getNumberOfTurbines();
+    checkCudaErrors( cudaMallocHost((void**) &actuatorFarm->omegasH, sizeRealTurbine) );
+    checkCudaErrors( cudaMallocHost((void**) &actuatorFarm->azimuthsH, sizeRealTurbine) );
+    checkCudaErrors( cudaMallocHost((void**) &actuatorFarm->yawsH, sizeRealTurbine) );
 
-    checkCudaErrors( cudaMalloc((void**) &actuatorLine->bladeRadiiD, sizeof(real)*actuatorLine->getNBladeNodes()) );
+    checkCudaErrors( cudaMalloc((void**) &actuatorFarm->omegasD, sizeRealTurbine) );
+    checkCudaErrors( cudaMalloc((void**) &actuatorFarm->azimuthsD, sizeRealTurbine) );
+    checkCudaErrors( cudaMalloc((void**) &actuatorFarm->yawsD, sizeRealTurbine) );
 
-    setMemsizeGPU(sizeof(real)*actuatorLine->getNBladeNodes(), false);
-}
+    setMemsizeGPU(3*sizeRealTurbine, false);
 
-void CudaMemoryManager::cudaCopyBladeRadiiHtoD(ActuatorLine* actuatorLine)
-{
-    checkCudaErrors( cudaMemcpy(actuatorLine->bladeRadiiD, actuatorLine->bladeRadiiH, sizeof(real)*actuatorLine->getNBladeNodes(), cudaMemcpyHostToDevice) );
 }
+void CudaMemoryManager::cudaCopyBladeOrientationsHtoD(ActuatorFarm* actuatorFarm)
+{
+    uint sizeRealTurbine = sizeof(real)*actuatorFarm->getNumberOfTurbines();
+    checkCudaErrors( cudaMemcpy(actuatorFarm->omegasD, actuatorFarm->omegasH, sizeRealTurbine, cudaMemcpyHostToDevice) );
+    checkCudaErrors( cudaMemcpy(actuatorFarm->azimuthsD, actuatorFarm->azimuthsH, sizeRealTurbine, cudaMemcpyHostToDevice) );
+    checkCudaErrors( cudaMemcpy(actuatorFarm->yawsD, actuatorFarm->yawsH, sizeRealTurbine, cudaMemcpyHostToDevice) );
 
-void CudaMemoryManager::cudaCopyBladeRadiiDtoH(ActuatorLine* actuatorLine)
+}
+void CudaMemoryManager::cudaCopyBladeOrientationsDtoH(ActuatorFarm* actuatorFarm)
 {
-    checkCudaErrors( cudaMemcpy(actuatorLine->bladeRadiiH, actuatorLine->bladeRadiiD, sizeof(real)*actuatorLine->getNBladeNodes(), cudaMemcpyDeviceToHost) );
+    uint sizeRealTurbine = sizeof(real)*actuatorFarm->getNumberOfTurbines();
+    checkCudaErrors( cudaMemcpy(actuatorFarm->omegasH, actuatorFarm->omegasD, sizeRealTurbine, cudaMemcpyDeviceToHost) );
+    checkCudaErrors( cudaMemcpy(actuatorFarm->azimuthsH, actuatorFarm->azimuthsD, sizeRealTurbine, cudaMemcpyDeviceToHost) );
+    checkCudaErrors( cudaMemcpy(actuatorFarm->yawsH, actuatorFarm->yawsD, sizeRealTurbine, cudaMemcpyDeviceToHost) );
 }
-
-void CudaMemoryManager::cudaFreeBladeRadii(ActuatorLine* actuatorLine)
+void CudaMemoryManager::cudaFreeBladeOrientations(ActuatorFarm* actuatorFarm) // releases the omegas/azimuths/yaws buffers of actuatorFarm, device copies first, then host copies
 {
-    checkCudaErrors( cudaFree(actuatorLine->bladeRadiiD) );
+    checkCudaErrors( cudaFree(actuatorFarm->omegasD) ); // pass the pointer value obtained from cudaMalloc, not the address of the member that stores it
+    checkCudaErrors( cudaFree(actuatorFarm->azimuthsD) );
+    checkCudaErrors( cudaFree(actuatorFarm->yawsD) );
 
-    checkCudaErrors( cudaFreeHost(actuatorLine->bladeRadiiH) );
+    checkCudaErrors( cudaFreeHost(actuatorFarm->omegasH) ); // likewise the pointer value obtained from cudaMallocHost
+    checkCudaErrors( cudaFreeHost(actuatorFarm->azimuthsH) );
+    checkCudaErrors( cudaFreeHost(actuatorFarm->yawsH) );
 }
 
-void CudaMemoryManager::cudaAllocBladeCoords(ActuatorLine* actuatorLine)
+void CudaMemoryManager::cudaAllocBladeCoords(ActuatorFarm* actuatorFarm) // Allocates host blade coordinates plus current- and previous-timestep device arrays.
 {
-    checkCudaErrors( cudaMallocHost((void**) &actuatorLine->bladeCoordsXH, sizeof(real)*actuatorLine->getNNodes()) );
-    checkCudaErrors( cudaMallocHost((void**) &actuatorLine->bladeCoordsYH, sizeof(real)*actuatorLine->getNNodes()) );
-    checkCudaErrors( cudaMallocHost((void**) &actuatorLine->bladeCoordsZH, sizeof(real)*actuatorLine->getNNodes()) );
+    checkCudaErrors( cudaMallocHost((void**) &actuatorFarm->bladeCoordsXH, sizeof(real)*actuatorFarm->getNumberOfNodes()) );
+    checkCudaErrors( cudaMallocHost((void**) &actuatorFarm->bladeCoordsYH, sizeof(real)*actuatorFarm->getNumberOfNodes()) );
+    checkCudaErrors( cudaMallocHost((void**) &actuatorFarm->bladeCoordsZH, sizeof(real)*actuatorFarm->getNumberOfNodes()) );
 
-    checkCudaErrors( cudaMalloc((void**) &actuatorLine->bladeCoordsXD, sizeof(real)*actuatorLine->getNNodes()) );
-    checkCudaErrors( cudaMalloc((void**) &actuatorLine->bladeCoordsYD, sizeof(real)*actuatorLine->getNNodes()) );
-    checkCudaErrors( cudaMalloc((void**) &actuatorLine->bladeCoordsZD, sizeof(real)*actuatorLine->getNNodes()) );
+    checkCudaErrors( cudaMalloc((void**) &actuatorFarm->bladeCoordsXDCurrentTimestep, sizeof(real)*actuatorFarm->getNumberOfNodes()) );
+    checkCudaErrors( cudaMalloc((void**) &actuatorFarm->bladeCoordsYDCurrentTimestep, sizeof(real)*actuatorFarm->getNumberOfNodes()) );
+    checkCudaErrors( cudaMalloc((void**) &actuatorFarm->bladeCoordsZDCurrentTimestep, sizeof(real)*actuatorFarm->getNumberOfNodes()) );
+
+    checkCudaErrors( cudaMalloc((void**) &actuatorFarm->bladeCoordsXDPreviousTimestep, sizeof(real)*actuatorFarm->getNumberOfNodes()) );
+    checkCudaErrors( cudaMalloc((void**) &actuatorFarm->bladeCoordsYDPreviousTimestep, sizeof(real)*actuatorFarm->getNumberOfNodes()) );
+    checkCudaErrors( cudaMalloc((void**) &actuatorFarm->bladeCoordsZDPreviousTimestep, sizeof(real)*actuatorFarm->getNumberOfNodes()) );
 
-    setMemsizeGPU(3.f*actuatorLine->getNNodes(), false);
+    setMemsizeGPU(6.*sizeof(real)*actuatorFarm->getNumberOfNodes(), false); // six device arrays, counted in bytes (sizeof(real) was missing)
 }
 
-void CudaMemoryManager::cudaCopyBladeCoordsHtoD(ActuatorLine* actuatorLine)
+void CudaMemoryManager::cudaCopyBladeCoordsHtoD(ActuatorFarm* actuatorFarm)
 {
-    checkCudaErrors( cudaMemcpy(actuatorLine->bladeCoordsXD, actuatorLine->bladeCoordsXH, sizeof(real)*actuatorLine->getNNodes(), cudaMemcpyHostToDevice) );
-    checkCudaErrors( cudaMemcpy(actuatorLine->bladeCoordsYD, actuatorLine->bladeCoordsYH, sizeof(real)*actuatorLine->getNNodes(), cudaMemcpyHostToDevice) );
-    checkCudaErrors( cudaMemcpy(actuatorLine->bladeCoordsZD, actuatorLine->bladeCoordsZH, sizeof(real)*actuatorLine->getNNodes(), cudaMemcpyHostToDevice) );
+    checkCudaErrors( cudaMemcpy(actuatorFarm->bladeCoordsXDCurrentTimestep, actuatorFarm->bladeCoordsXH, sizeof(real)*actuatorFarm->getNumberOfNodes(), cudaMemcpyHostToDevice) );
+    checkCudaErrors( cudaMemcpy(actuatorFarm->bladeCoordsYDCurrentTimestep, actuatorFarm->bladeCoordsYH, sizeof(real)*actuatorFarm->getNumberOfNodes(), cudaMemcpyHostToDevice) );
+    checkCudaErrors( cudaMemcpy(actuatorFarm->bladeCoordsZDCurrentTimestep, actuatorFarm->bladeCoordsZH, sizeof(real)*actuatorFarm->getNumberOfNodes(), cudaMemcpyHostToDevice) );
 }
 
-void CudaMemoryManager::cudaCopyBladeCoordsDtoH(ActuatorLine* actuatorLine)
+void CudaMemoryManager::cudaCopyBladeCoordsDtoH(ActuatorFarm* actuatorFarm)
 {
-    checkCudaErrors( cudaMemcpy(actuatorLine->bladeCoordsXH, actuatorLine->bladeCoordsXD, sizeof(real)*actuatorLine->getNNodes(), cudaMemcpyDeviceToHost) );
-    checkCudaErrors( cudaMemcpy(actuatorLine->bladeCoordsYH, actuatorLine->bladeCoordsYD, sizeof(real)*actuatorLine->getNNodes(), cudaMemcpyDeviceToHost) );
-    checkCudaErrors( cudaMemcpy(actuatorLine->bladeCoordsZH, actuatorLine->bladeCoordsZD, sizeof(real)*actuatorLine->getNNodes(), cudaMemcpyDeviceToHost) );
+    checkCudaErrors( cudaMemcpy(actuatorFarm->bladeCoordsXH, actuatorFarm->bladeCoordsXDCurrentTimestep, sizeof(real)*actuatorFarm->getNumberOfNodes(), cudaMemcpyDeviceToHost) );
+    checkCudaErrors( cudaMemcpy(actuatorFarm->bladeCoordsYH, actuatorFarm->bladeCoordsYDCurrentTimestep, sizeof(real)*actuatorFarm->getNumberOfNodes(), cudaMemcpyDeviceToHost) );
+    checkCudaErrors( cudaMemcpy(actuatorFarm->bladeCoordsZH, actuatorFarm->bladeCoordsZDCurrentTimestep, sizeof(real)*actuatorFarm->getNumberOfNodes(), cudaMemcpyDeviceToHost) );
 }
 
-void CudaMemoryManager::cudaFreeBladeCoords(ActuatorLine* actuatorLine)
+void CudaMemoryManager::cudaFreeBladeCoords(ActuatorFarm* actuatorFarm)
 {
-    checkCudaErrors( cudaFree(actuatorLine->bladeCoordsXD) );
-    checkCudaErrors( cudaFree(actuatorLine->bladeCoordsYD) );
-    checkCudaErrors( cudaFree(actuatorLine->bladeCoordsZD) );
+    checkCudaErrors( cudaFree(actuatorFarm->bladeCoordsXDCurrentTimestep) );
+    checkCudaErrors( cudaFree(actuatorFarm->bladeCoordsYDCurrentTimestep) );
+    checkCudaErrors( cudaFree(actuatorFarm->bladeCoordsZDCurrentTimestep) );
 
-    checkCudaErrors( cudaFreeHost(actuatorLine->bladeCoordsXH) );
-    checkCudaErrors( cudaFreeHost(actuatorLine->bladeCoordsYH) );
-    checkCudaErrors( cudaFreeHost(actuatorLine->bladeCoordsZH) );
+    checkCudaErrors( cudaFree(actuatorFarm->bladeCoordsXDPreviousTimestep) );
+    checkCudaErrors( cudaFree(actuatorFarm->bladeCoordsYDPreviousTimestep) );
+    checkCudaErrors( cudaFree(actuatorFarm->bladeCoordsZDPreviousTimestep) );
+
+    checkCudaErrors( cudaFreeHost(actuatorFarm->bladeCoordsXH) );
+    checkCudaErrors( cudaFreeHost(actuatorFarm->bladeCoordsYH) );
+    checkCudaErrors( cudaFreeHost(actuatorFarm->bladeCoordsZH) );
 }
 
-void CudaMemoryManager::cudaAllocBladeIndices(ActuatorLine* actuatorLine)
+void CudaMemoryManager::cudaAllocBladeIndices(ActuatorFarm* actuatorFarm)
 {
-    checkCudaErrors( cudaMallocHost((void**) &actuatorLine->bladeIndicesH, sizeof(uint)*actuatorLine->getNNodes()) );
+    checkCudaErrors( cudaMallocHost((void**) &actuatorFarm->bladeIndicesH, sizeof(uint)*actuatorFarm->getNumberOfNodes()) );
 
-    checkCudaErrors( cudaMalloc((void**) &actuatorLine->bladeIndicesD, sizeof(uint)*actuatorLine->getNNodes()) );
+    checkCudaErrors( cudaMalloc((void**) &actuatorFarm->bladeIndicesD, sizeof(uint)*actuatorFarm->getNumberOfNodes()) );
 
-    setMemsizeGPU(sizeof(uint)*actuatorLine->getNNodes(), false);
+    setMemsizeGPU(sizeof(uint)*actuatorFarm->getNumberOfNodes(), false);
 }
 
-void CudaMemoryManager::cudaCopyBladeIndicesHtoD(ActuatorLine* actuatorLine)
+void CudaMemoryManager::cudaCopyBladeIndicesHtoD(ActuatorFarm* actuatorFarm)
 {
-    checkCudaErrors( cudaMemcpy(actuatorLine->bladeIndicesD, actuatorLine->bladeIndicesH, sizeof(uint)*actuatorLine->getNNodes(), cudaMemcpyHostToDevice) );
+    checkCudaErrors( cudaMemcpy(actuatorFarm->bladeIndicesD, actuatorFarm->bladeIndicesH, sizeof(uint)*actuatorFarm->getNumberOfNodes(), cudaMemcpyHostToDevice) );
 }
 
-void CudaMemoryManager::cudaFreeBladeIndices(ActuatorLine* actuatorLine)
+void CudaMemoryManager::cudaFreeBladeIndices(ActuatorFarm* actuatorFarm)
 {
-    checkCudaErrors( cudaFree(actuatorLine->bladeIndicesD) );
+    checkCudaErrors( cudaFree(actuatorFarm->bladeIndicesD) );
 
-    checkCudaErrors( cudaFreeHost(actuatorLine->bladeIndicesH) );
+    checkCudaErrors( cudaFreeHost(actuatorFarm->bladeIndicesH) );
 }
 
-void CudaMemoryManager::cudaAllocBladeVelocities(ActuatorLine* actuatorLine)
+void CudaMemoryManager::cudaAllocBladeVelocities(ActuatorFarm* actuatorFarm) // Allocates host blade velocities plus current- and previous-timestep device arrays.
 {
-    checkCudaErrors( cudaMallocHost((void**) &actuatorLine->bladeVelocitiesXH, sizeof(real)*actuatorLine->getNNodes()) );
-    checkCudaErrors( cudaMallocHost((void**) &actuatorLine->bladeVelocitiesYH, sizeof(real)*actuatorLine->getNNodes()) );
-    checkCudaErrors( cudaMallocHost((void**) &actuatorLine->bladeVelocitiesZH, sizeof(real)*actuatorLine->getNNodes()) );
+    checkCudaErrors( cudaMallocHost((void**) &actuatorFarm->bladeVelocitiesXH, sizeof(real)*actuatorFarm->getNumberOfNodes()) );
+    checkCudaErrors( cudaMallocHost((void**) &actuatorFarm->bladeVelocitiesYH, sizeof(real)*actuatorFarm->getNumberOfNodes()) );
+    checkCudaErrors( cudaMallocHost((void**) &actuatorFarm->bladeVelocitiesZH, sizeof(real)*actuatorFarm->getNumberOfNodes()) );
+
+    checkCudaErrors( cudaMalloc((void**) &actuatorFarm->bladeVelocitiesXDCurrentTimestep, sizeof(real)*actuatorFarm->getNumberOfNodes()) );
+    checkCudaErrors( cudaMalloc((void**) &actuatorFarm->bladeVelocitiesYDCurrentTimestep, sizeof(real)*actuatorFarm->getNumberOfNodes()) );
+    checkCudaErrors( cudaMalloc((void**) &actuatorFarm->bladeVelocitiesZDCurrentTimestep, sizeof(real)*actuatorFarm->getNumberOfNodes()) );
 
-    checkCudaErrors( cudaMalloc((void**) &actuatorLine->bladeVelocitiesXD, sizeof(real)*actuatorLine->getNNodes()) );
-    checkCudaErrors( cudaMalloc((void**) &actuatorLine->bladeVelocitiesYD, sizeof(real)*actuatorLine->getNNodes()) );
-    checkCudaErrors( cudaMalloc((void**) &actuatorLine->bladeVelocitiesZD, sizeof(real)*actuatorLine->getNNodes()) );
+    checkCudaErrors( cudaMalloc((void**) &actuatorFarm->bladeVelocitiesXDPreviousTimestep, sizeof(real)*actuatorFarm->getNumberOfNodes()) );
+    checkCudaErrors( cudaMalloc((void**) &actuatorFarm->bladeVelocitiesYDPreviousTimestep, sizeof(real)*actuatorFarm->getNumberOfNodes()) );
+    checkCudaErrors( cudaMalloc((void**) &actuatorFarm->bladeVelocitiesZDPreviousTimestep, sizeof(real)*actuatorFarm->getNumberOfNodes()) );
 
-    setMemsizeGPU(3.*sizeof(real)*actuatorLine->getNNodes(), false);
+    setMemsizeGPU(6.*sizeof(real)*actuatorFarm->getNumberOfNodes(), false); // six device arrays are allocated above (current + previous timestep), not three
 }
 
-void CudaMemoryManager::cudaCopyBladeVelocitiesHtoD(ActuatorLine* actuatorLine)
+void CudaMemoryManager::cudaCopyBladeVelocitiesHtoD(ActuatorFarm* actuatorFarm)
 {
-    checkCudaErrors( cudaMemcpy(actuatorLine->bladeVelocitiesXD, actuatorLine->bladeVelocitiesXH, sizeof(real)*actuatorLine->getNNodes(), cudaMemcpyHostToDevice) );
-    checkCudaErrors( cudaMemcpy(actuatorLine->bladeVelocitiesYD, actuatorLine->bladeVelocitiesYH, sizeof(real)*actuatorLine->getNNodes(), cudaMemcpyHostToDevice) );
-    checkCudaErrors( cudaMemcpy(actuatorLine->bladeVelocitiesZD, actuatorLine->bladeVelocitiesZH, sizeof(real)*actuatorLine->getNNodes(), cudaMemcpyHostToDevice) );
+    checkCudaErrors( cudaMemcpy(actuatorFarm->bladeVelocitiesXDCurrentTimestep, actuatorFarm->bladeVelocitiesXH, sizeof(real)*actuatorFarm->getNumberOfNodes(), cudaMemcpyHostToDevice) );
+    checkCudaErrors( cudaMemcpy(actuatorFarm->bladeVelocitiesYDCurrentTimestep, actuatorFarm->bladeVelocitiesYH, sizeof(real)*actuatorFarm->getNumberOfNodes(), cudaMemcpyHostToDevice) );
+    checkCudaErrors( cudaMemcpy(actuatorFarm->bladeVelocitiesZDCurrentTimestep, actuatorFarm->bladeVelocitiesZH, sizeof(real)*actuatorFarm->getNumberOfNodes(), cudaMemcpyHostToDevice) );
 }
 
-void CudaMemoryManager::cudaCopyBladeVelocitiesDtoH(ActuatorLine* actuatorLine)
+void CudaMemoryManager::cudaCopyBladeVelocitiesDtoH(ActuatorFarm* actuatorFarm)
 {
-    checkCudaErrors( cudaMemcpy(actuatorLine->bladeVelocitiesXH, actuatorLine->bladeVelocitiesXD, sizeof(real)*actuatorLine->getNNodes(), cudaMemcpyDeviceToHost) );
-    checkCudaErrors( cudaMemcpy(actuatorLine->bladeVelocitiesYH, actuatorLine->bladeVelocitiesYD, sizeof(real)*actuatorLine->getNNodes(), cudaMemcpyDeviceToHost) );
-    checkCudaErrors( cudaMemcpy(actuatorLine->bladeVelocitiesZH, actuatorLine->bladeVelocitiesZD, sizeof(real)*actuatorLine->getNNodes(), cudaMemcpyDeviceToHost) );
+    checkCudaErrors( cudaMemcpy(actuatorFarm->bladeVelocitiesXH, actuatorFarm->bladeVelocitiesXDCurrentTimestep, sizeof(real)*actuatorFarm->getNumberOfNodes(), cudaMemcpyDeviceToHost) );
+    checkCudaErrors( cudaMemcpy(actuatorFarm->bladeVelocitiesYH, actuatorFarm->bladeVelocitiesYDCurrentTimestep, sizeof(real)*actuatorFarm->getNumberOfNodes(), cudaMemcpyDeviceToHost) );
+    checkCudaErrors( cudaMemcpy(actuatorFarm->bladeVelocitiesZH, actuatorFarm->bladeVelocitiesZDCurrentTimestep, sizeof(real)*actuatorFarm->getNumberOfNodes(), cudaMemcpyDeviceToHost) );
 }
 
-void CudaMemoryManager::cudaFreeBladeVelocities(ActuatorLine* actuatorLine)
+void CudaMemoryManager::cudaFreeBladeVelocities(ActuatorFarm* actuatorFarm)
 {
-    checkCudaErrors( cudaFree(actuatorLine->bladeVelocitiesXD) );
-    checkCudaErrors( cudaFree(actuatorLine->bladeVelocitiesYD) );
-    checkCudaErrors( cudaFree(actuatorLine->bladeVelocitiesZD) );
+    checkCudaErrors( cudaFree(actuatorFarm->bladeVelocitiesXDCurrentTimestep) );
+    checkCudaErrors( cudaFree(actuatorFarm->bladeVelocitiesYDCurrentTimestep) );
+    checkCudaErrors( cudaFree(actuatorFarm->bladeVelocitiesZDCurrentTimestep) );    
+    
+    checkCudaErrors( cudaFree(actuatorFarm->bladeVelocitiesXDPreviousTimestep) );
+    checkCudaErrors( cudaFree(actuatorFarm->bladeVelocitiesYDPreviousTimestep) );
+    checkCudaErrors( cudaFree(actuatorFarm->bladeVelocitiesZDPreviousTimestep) );
 
-    checkCudaErrors( cudaFreeHost(actuatorLine->bladeVelocitiesXH) );
-    checkCudaErrors( cudaFreeHost(actuatorLine->bladeVelocitiesYH) );
-    checkCudaErrors( cudaFreeHost(actuatorLine->bladeVelocitiesZH) );
+    checkCudaErrors( cudaFreeHost(actuatorFarm->bladeVelocitiesXH) );
+    checkCudaErrors( cudaFreeHost(actuatorFarm->bladeVelocitiesYH) );
+    checkCudaErrors( cudaFreeHost(actuatorFarm->bladeVelocitiesZH) );
 }
 
-void CudaMemoryManager::cudaAllocBladeForces(ActuatorLine* actuatorLine)
+void CudaMemoryManager::cudaAllocBladeForces(ActuatorFarm* actuatorFarm) // Allocates host blade forces plus current- and previous-timestep device arrays.
 {
-    checkCudaErrors( cudaMallocHost((void**) &actuatorLine->bladeForcesXH, sizeof(real)*actuatorLine->getNNodes()) );
-    checkCudaErrors( cudaMallocHost((void**) &actuatorLine->bladeForcesYH, sizeof(real)*actuatorLine->getNNodes()) );
-    checkCudaErrors( cudaMallocHost((void**) &actuatorLine->bladeForcesZH, sizeof(real)*actuatorLine->getNNodes()) );
+    checkCudaErrors( cudaMallocHost((void**) &actuatorFarm->bladeForcesXH, sizeof(real)*actuatorFarm->getNumberOfNodes()) );
+    checkCudaErrors( cudaMallocHost((void**) &actuatorFarm->bladeForcesYH, sizeof(real)*actuatorFarm->getNumberOfNodes()) );
+    checkCudaErrors( cudaMallocHost((void**) &actuatorFarm->bladeForcesZH, sizeof(real)*actuatorFarm->getNumberOfNodes()) );
+
+    checkCudaErrors( cudaMalloc((void**) &actuatorFarm->bladeForcesXDCurrentTimestep, sizeof(real)*actuatorFarm->getNumberOfNodes()) );
+    checkCudaErrors( cudaMalloc((void**) &actuatorFarm->bladeForcesYDCurrentTimestep, sizeof(real)*actuatorFarm->getNumberOfNodes()) );
+    checkCudaErrors( cudaMalloc((void**) &actuatorFarm->bladeForcesZDCurrentTimestep, sizeof(real)*actuatorFarm->getNumberOfNodes()) );
 
-    checkCudaErrors( cudaMalloc((void**) &actuatorLine->bladeForcesXD, sizeof(real)*actuatorLine->getNNodes()) );
-    checkCudaErrors( cudaMalloc((void**) &actuatorLine->bladeForcesYD, sizeof(real)*actuatorLine->getNNodes()) );
-    checkCudaErrors( cudaMalloc((void**) &actuatorLine->bladeForcesZD, sizeof(real)*actuatorLine->getNNodes()) );
+    checkCudaErrors( cudaMalloc((void**) &actuatorFarm->bladeForcesXDPreviousTimestep, sizeof(real)*actuatorFarm->getNumberOfNodes()) );
+    checkCudaErrors( cudaMalloc((void**) &actuatorFarm->bladeForcesYDPreviousTimestep, sizeof(real)*actuatorFarm->getNumberOfNodes()) );
+    checkCudaErrors( cudaMalloc((void**) &actuatorFarm->bladeForcesZDPreviousTimestep, sizeof(real)*actuatorFarm->getNumberOfNodes()) );
 
-    setMemsizeGPU(3.*sizeof(real)*actuatorLine->getNNodes(), false);
+    setMemsizeGPU(6.*sizeof(real)*actuatorFarm->getNumberOfNodes(), false); // six device arrays are allocated above (current + previous timestep), not three
 }
 
-void CudaMemoryManager::cudaCopyBladeForcesHtoD(ActuatorLine* actuatorLine)
+void CudaMemoryManager::cudaCopyBladeForcesHtoD(ActuatorFarm* actuatorFarm)
 {
-    checkCudaErrors( cudaMemcpy(actuatorLine->bladeForcesXD, actuatorLine->bladeForcesXH, sizeof(real)*actuatorLine->getNNodes(), cudaMemcpyHostToDevice) );
-    checkCudaErrors( cudaMemcpy(actuatorLine->bladeForcesYD, actuatorLine->bladeForcesYH, sizeof(real)*actuatorLine->getNNodes(), cudaMemcpyHostToDevice) );
-    checkCudaErrors( cudaMemcpy(actuatorLine->bladeForcesZD, actuatorLine->bladeForcesZH, sizeof(real)*actuatorLine->getNNodes(), cudaMemcpyHostToDevice) );
+    checkCudaErrors( cudaMemcpy(actuatorFarm->bladeForcesXDCurrentTimestep, actuatorFarm->bladeForcesXH, sizeof(real)*actuatorFarm->getNumberOfNodes(), cudaMemcpyHostToDevice) );
+    checkCudaErrors( cudaMemcpy(actuatorFarm->bladeForcesYDCurrentTimestep, actuatorFarm->bladeForcesYH, sizeof(real)*actuatorFarm->getNumberOfNodes(), cudaMemcpyHostToDevice) );
+    checkCudaErrors( cudaMemcpy(actuatorFarm->bladeForcesZDCurrentTimestep, actuatorFarm->bladeForcesZH, sizeof(real)*actuatorFarm->getNumberOfNodes(), cudaMemcpyHostToDevice) );
 }
 
-void CudaMemoryManager::cudaCopyBladeForcesDtoH(ActuatorLine* actuatorLine)
+void CudaMemoryManager::cudaCopyBladeForcesDtoH(ActuatorFarm* actuatorFarm)
 {
-    checkCudaErrors( cudaMemcpy(actuatorLine->bladeForcesXH, actuatorLine->bladeForcesXD, sizeof(real)*actuatorLine->getNNodes(), cudaMemcpyDeviceToHost) );
-    checkCudaErrors( cudaMemcpy(actuatorLine->bladeForcesYH, actuatorLine->bladeForcesYD, sizeof(real)*actuatorLine->getNNodes(), cudaMemcpyDeviceToHost) );
-    checkCudaErrors( cudaMemcpy(actuatorLine->bladeForcesZH, actuatorLine->bladeForcesZD, sizeof(real)*actuatorLine->getNNodes(), cudaMemcpyDeviceToHost) );
+    checkCudaErrors( cudaMemcpy(actuatorFarm->bladeForcesXH, actuatorFarm->bladeForcesXDCurrentTimestep, sizeof(real)*actuatorFarm->getNumberOfNodes(), cudaMemcpyDeviceToHost) );
+    checkCudaErrors( cudaMemcpy(actuatorFarm->bladeForcesYH, actuatorFarm->bladeForcesYDCurrentTimestep, sizeof(real)*actuatorFarm->getNumberOfNodes(), cudaMemcpyDeviceToHost) );
+    checkCudaErrors( cudaMemcpy(actuatorFarm->bladeForcesZH, actuatorFarm->bladeForcesZDCurrentTimestep, sizeof(real)*actuatorFarm->getNumberOfNodes(), cudaMemcpyDeviceToHost) );
 }
 
-void CudaMemoryManager::cudaFreeBladeForces(ActuatorLine* actuatorLine)
+void CudaMemoryManager::cudaFreeBladeForces(ActuatorFarm* actuatorFarm)
 {
-    checkCudaErrors( cudaFree(actuatorLine->bladeForcesXD) );
-    checkCudaErrors( cudaFree(actuatorLine->bladeForcesYD) );
-    checkCudaErrors( cudaFree(actuatorLine->bladeForcesZD) );
+    checkCudaErrors( cudaFree(actuatorFarm->bladeForcesXDCurrentTimestep) );
+    checkCudaErrors( cudaFree(actuatorFarm->bladeForcesYDCurrentTimestep) );
+    checkCudaErrors( cudaFree(actuatorFarm->bladeForcesZDCurrentTimestep) );
 
-    checkCudaErrors( cudaFreeHost(actuatorLine->bladeForcesXH) );
-    checkCudaErrors( cudaFreeHost(actuatorLine->bladeForcesYH) );
-    checkCudaErrors( cudaFreeHost(actuatorLine->bladeForcesZH) );
+    checkCudaErrors( cudaFree(actuatorFarm->bladeForcesXDPreviousTimestep) );
+    checkCudaErrors( cudaFree(actuatorFarm->bladeForcesYDPreviousTimestep) );
+    checkCudaErrors( cudaFree(actuatorFarm->bladeForcesZDPreviousTimestep) );
+
+    checkCudaErrors( cudaFreeHost(actuatorFarm->bladeForcesXH) );
+    checkCudaErrors( cudaFreeHost(actuatorFarm->bladeForcesYH) );
+    checkCudaErrors( cudaFreeHost(actuatorFarm->bladeForcesZH) );
 }
 
-void CudaMemoryManager::cudaAllocSphereIndices(ActuatorLine* actuatorLine)
+void CudaMemoryManager::cudaAllocSphereIndices(ActuatorFarm* actuatorFarm)
 {
-    checkCudaErrors( cudaMallocHost((void**) &(actuatorLine->boundingSphereIndicesH), sizeof(int)*actuatorLine->getNIndices()));
-    checkCudaErrors( cudaMalloc((void**) &(actuatorLine->boundingSphereIndicesD), sizeof(int)*actuatorLine->getNIndices()));
-    setMemsizeGPU(sizeof(int)*actuatorLine->getNIndices(), false);
+    checkCudaErrors( cudaMallocHost((void**) &(actuatorFarm->boundingSphereIndicesH), sizeof(int)*actuatorFarm->getNumberOfIndices()));
+    checkCudaErrors( cudaMalloc((void**) &(actuatorFarm->boundingSphereIndicesD), sizeof(int)*actuatorFarm->getNumberOfIndices()));
+    setMemsizeGPU(sizeof(int)*actuatorFarm->getNumberOfIndices(), false);
 }
 
-void CudaMemoryManager::cudaCopySphereIndicesHtoD(ActuatorLine* actuatorLine)
+void CudaMemoryManager::cudaCopySphereIndicesHtoD(ActuatorFarm* actuatorFarm)
 {
-    checkCudaErrors( cudaMemcpy(actuatorLine->boundingSphereIndicesD, actuatorLine->boundingSphereIndicesH, sizeof(int)*actuatorLine->getNIndices(), cudaMemcpyHostToDevice) );
+    checkCudaErrors( cudaMemcpy(actuatorFarm->boundingSphereIndicesD, actuatorFarm->boundingSphereIndicesH, sizeof(int)*actuatorFarm->getNumberOfIndices(), cudaMemcpyHostToDevice) );
 }
 
-void CudaMemoryManager::cudaFreeSphereIndices(ActuatorLine* actuatorLine)
+void CudaMemoryManager::cudaFreeSphereIndices(ActuatorFarm* actuatorFarm)
 {
-    checkCudaErrors( cudaFreeHost(actuatorLine->boundingSphereIndicesH) );
-    checkCudaErrors( cudaFree(actuatorLine->boundingSphereIndicesD) );
+    checkCudaErrors( cudaFreeHost(actuatorFarm->boundingSphereIndicesH) );
+    checkCudaErrors( cudaFree(actuatorFarm->boundingSphereIndicesD) );
 }
 
 ////////////////////////////////////////////////////////////////////////////////////
@@ -3231,8 +3416,11 @@ void CudaMemoryManager::cudaCopyProbeQuantityArrayHtoD(Probe* probe, int level)
 }
 void CudaMemoryManager::cudaCopyProbeQuantityArrayDtoH(Probe* probe, int level)
 {
-    checkCudaErrors( cudaMemcpy(probe->getProbeStruct(level)->quantitiesArrayH, probe->getProbeStruct(level)->quantitiesArrayD, probe->getProbeStruct(level)->nArrays*sizeof(real)*probe->getProbeStruct(level)->nPoints, cudaMemcpyDeviceToHost) );
+    auto probeStruct = probe->getProbeStruct(level); // looked up once and reused for every field access below
+
+    checkCudaErrors( cudaMemcpy(probeStruct->quantitiesArrayH, probeStruct->quantitiesArrayD, probeStruct->nArrays*sizeof(real)*probeStruct->nPoints, cudaMemcpyDeviceToHost) ); // device -> host, nArrays*nPoints values of type real
 }
+
 void CudaMemoryManager::cudaFreeProbeQuantityArray(Probe* probe, int level)
 {
     checkCudaErrors( cudaFreeHost(probe->getProbeStruct(level)->quantitiesArrayH) );
@@ -3262,6 +3450,7 @@ void CudaMemoryManager::cudaCopyProbeQuantitiesAndOffsetsDtoH(Probe* probe, int
     checkCudaErrors( cudaMemcpy(probe->getProbeStruct(level)->quantitiesH, probe->getProbeStruct(level)->quantitiesD, int(Statistic::LAST)*sizeof(bool), cudaMemcpyDeviceToHost) );
     checkCudaErrors( cudaMemcpy(probe->getProbeStruct(level)->arrayOffsetsH, probe->getProbeStruct(level)->arrayOffsetsD, int(Statistic::LAST)*sizeof(int), cudaMemcpyDeviceToHost) );
 }
+
 void CudaMemoryManager::cudaFreeProbeQuantitiesAndOffsets(Probe* probe, int level)
 {
     checkCudaErrors( cudaFreeHost(probe->getProbeStruct(level)->quantitiesH) );
@@ -3270,23 +3459,51 @@ void CudaMemoryManager::cudaFreeProbeQuantitiesAndOffsets(Probe* probe, int leve
     checkCudaErrors( cudaFree    (probe->getProbeStruct(level)->arrayOffsetsD) );
 }
 
+void CudaMemoryManager::cudaAllocPrecursorWriter(PrecursorWriter* writer, int level) // Creates the writer's CUDA stream and allocates its host/device index and data buffers for this level.
+{
+    auto prec =  writer->getPrecursorStruct(level);
+    size_t indSize = prec->numberOfPointsInBC*sizeof(uint); // bytes for one uint index per BC point
 
+    checkCudaErrors( cudaStreamCreate(&prec->stream) ); // stream later used by cudaCopyPrecursorWriterOutputVariablesDtoH
 
+    checkCudaErrors( cudaMallocHost((void**) &prec->indicesH, indSize));
+    checkCudaErrors( cudaMalloc((void**) &prec->indicesD, indSize));
 
+    size_t dataSize  = prec->numberOfPointsInBC*sizeof(real)*prec->numberOfQuantities; // bytes of a single timestep (device-side buffers)
+    size_t dataSizeH = dataSize * prec->numberOfTimestepsPerFile; // host-side buffers hold numberOfTimestepsPerFile timesteps
+    
+    checkCudaErrors( cudaMallocHost((void**) &prec->dataH, dataSizeH));
+    checkCudaErrors( cudaMallocHost((void**) &prec->bufferH, dataSizeH));
+    checkCudaErrors( cudaMalloc((void**) &prec->dataD, dataSize));
+    checkCudaErrors( cudaMalloc((void**) &prec->bufferD, dataSize));
 
+    setMemsizeGPU(indSize+2*dataSize, false); // device allocations only: indicesD + dataD + bufferD
+}
 
+void CudaMemoryManager::cudaCopyPrecursorWriterIndicesHtoD(PrecursorWriter* writer, int level) // Copies the writer's index array for this level from host to device.
+{
+    checkCudaErrors( cudaMemcpy(writer->getPrecursorStruct(level)->indicesD, writer->getPrecursorStruct(level)->indicesH, writer->getPrecursorStruct(level)->numberOfPointsInBC*sizeof(uint), cudaMemcpyHostToDevice) ); // numberOfPointsInBC uint entries, blocking copy
+}
 
+void CudaMemoryManager::cudaCopyPrecursorWriterOutputVariablesDtoH(PrecursorWriter* writer, int level) // Queues a copy of one timestep from bufferD into the next free slot of bufferH.
+{
+    auto prec =  writer->getPrecursorStruct(level);
+    int sizeTimestep = prec->numberOfPointsInBC*prec->numberOfQuantities; // number of values (not bytes) per timestep; NOTE(review): int product - confirm it cannot overflow for large boundaries
 
+    checkCudaErrors( cudaStreamSynchronize(prec->stream) ); // wait for work previously queued on this stream before issuing the next copy
+    checkCudaErrors( cudaMemcpyAsync( &prec->bufferH[prec->numberOfTimestepsBuffered*sizeTimestep], prec->bufferD, sizeof(real)*sizeTimestep, cudaMemcpyDeviceToHost, prec->stream)); // queued on prec->stream; not synchronized here, so the caller must sync before reading bufferH
+}
 
+void CudaMemoryManager::cudaFreePrecursorWriter(PrecursorWriter* writer, int level) // Frees the writer's host and device index/data buffers for this level.
+{
+    checkCudaErrors( cudaFreeHost(writer->getPrecursorStruct(level)->indicesH));
+    checkCudaErrors( cudaFree(writer->getPrecursorStruct(level)->indicesD));
 
-
-
-
-
-
-
-
-
+    checkCudaErrors( cudaFreeHost(writer->getPrecursorStruct(level)->dataH));
+    checkCudaErrors( cudaFreeHost(writer->getPrecursorStruct(level)->bufferH));
+    checkCudaErrors( cudaFree(writer->getPrecursorStruct(level)->dataD));
+    checkCudaErrors( cudaFree(writer->getPrecursorStruct(level)->bufferD)); // NOTE(review): the stream created via cudaStreamCreate in cudaAllocPrecursorWriter is not destroyed here - confirm it is released elsewhere
+}
 
 
 CudaMemoryManager::CudaMemoryManager(std::shared_ptr<Parameter> parameter) : parameter(parameter)
diff --git a/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h b/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h
index d410340d2de7797cf23a781a64d11f592d62a6fb..e2f2e8658b6ef7a9453546454dd8e1f643574e17 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h
+++ b/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h
@@ -18,8 +18,10 @@
 
 class Parameter;
 class PorousMedia;
-class ActuatorLine;
+class ActuatorFarm;
 class Probe;
+class VelocitySetter;
+class PrecursorWriter;
 
 class VIRTUALFLUIDS_GPU_EXPORT CudaMemoryManager
 {
@@ -30,8 +32,8 @@ public:
     void setMemsizeGPU(double admem, bool reset);
     double getMemsizeGPU();
 
-    void cudaAllocFull(int lev);
-    void cudaFreeFull(int lev);
+    //void cudaAllocFull(int lev); //DEPRECATED: related to full matrix
+    //void cudaFreeFull(int lev);  //DEPRECATED: related to full matrix
 
     void cudaCopyPrint(int lev);
     void cudaCopyMedianPrint(int lev);
@@ -92,26 +94,20 @@ public:
     //////////////////////////////////////////////////////////////////////////
     //3D domain decomposition
     virtual void cudaAllocProcessNeighborX(int lev, unsigned int processNeighbor);
-    void cudaCopyProcessNeighborXFsHD(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsRecv,
-                                      int streamIndex);
-    void cudaCopyProcessNeighborXFsDH(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsSend,
-                                      int streamIndex);
+    void cudaCopyProcessNeighborXFsHD(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsRecv);
+    void cudaCopyProcessNeighborXFsDH(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsSend);
     virtual void cudaCopyProcessNeighborXIndex(int lev, unsigned int processNeighbor);
     void cudaFreeProcessNeighborX(int lev, unsigned int processNeighbor);
     //
     virtual void cudaAllocProcessNeighborY(int lev, unsigned int processNeighbor);
-    void cudaCopyProcessNeighborYFsHD(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsRecv,
-                                      int streamIndex);
-    void cudaCopyProcessNeighborYFsDH(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsSend,
-                                      int streamIndex);
+    void cudaCopyProcessNeighborYFsHD(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsRecv);
+    void cudaCopyProcessNeighborYFsDH(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsSend);
     virtual void cudaCopyProcessNeighborYIndex(int lev, unsigned int processNeighbor);
     void cudaFreeProcessNeighborY(int lev, unsigned int processNeighbor);
     //
     virtual void cudaAllocProcessNeighborZ(int lev, unsigned int processNeighbor);
-    void cudaCopyProcessNeighborZFsHD(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsRecv,
-                                      int streamIndex);
-    void cudaCopyProcessNeighborZFsDH(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsSend,
-                                      int streamIndex);
+    void cudaCopyProcessNeighborZFsHD(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsRecv);
+    void cudaCopyProcessNeighborZFsDH(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsSend);
     virtual void cudaCopyProcessNeighborZIndex(int lev, unsigned int processNeighbor);
     void cudaFreeProcessNeighborZ(int lev, unsigned int processNeighbor);
 
@@ -183,6 +179,13 @@ public:
     void cudaCopyStressBC(int lev);
     void cudaFreeStressBC(int lev);
 
+    void cudaAllocPrecursorBC(int lev);
+    void cudaAllocPrecursorData(int lev);
+    void cudaCopyPrecursorBC(int lev);
+    void cudaCopyPrecursorData(int lev);
+    void cudaFreePrecursorBC(int lev);
+    void cudaFreePrecursorData(int lev);
+
     void cudaAllocWallModel(int lev, bool hasWallModelMonitor);
     void cudaCopyWallModel(int lev,  bool hasWallModelMonitor);
     void cudaFreeWallModel(int lev,  bool hasWallModelMonitor);
@@ -346,42 +349,44 @@ public:
     void cudaCopyProcessNeighborADZIndex(int lev, unsigned int processNeighbor);
     void cudaFreeProcessNeighborADZ(int lev, unsigned int processNeighbor);
 
-    void cudaAllocFluidNodeIndices(int lev);
-    void cudaCopyFluidNodeIndices(int lev);
-    void cudaFreeFluidNodeIndices(int lev);
-    void cudaAllocFluidNodeIndicesBorder(int lev);
-    void cudaCopyFluidNodeIndicesBorder(int lev);
-    void cudaFreeFluidNodeIndicesBorder(int lev);
-
-    // Actuator Line
-    void cudaAllocBladeRadii(ActuatorLine* actuatorLine);
-    void cudaCopyBladeRadiiHtoD(ActuatorLine* actuatorLine);
-    void cudaCopyBladeRadiiDtoH(ActuatorLine* actuatorLine);
-    void cudaFreeBladeRadii(ActuatorLine* actuatorLine);
-
-    void cudaAllocBladeCoords(ActuatorLine* actuatorLine);
-    void cudaCopyBladeCoordsHtoD(ActuatorLine* actuatorLine);
-    void cudaCopyBladeCoordsDtoH(ActuatorLine* actuatorLine);
-    void cudaFreeBladeCoords(ActuatorLine* actuatorLine);
-
-    void cudaAllocBladeIndices(ActuatorLine* actuatorLine);
-    void cudaCopyBladeIndicesHtoD(ActuatorLine* actuatorLine);
-    void cudaFreeBladeIndices(ActuatorLine* actuatorLine);
-
-    void cudaAllocBladeVelocities(ActuatorLine* actuatorLine);
-    void cudaCopyBladeVelocitiesHtoD(ActuatorLine* actuatorLine);
-    void cudaCopyBladeVelocitiesDtoH(ActuatorLine* actuatorLine);
-    void cudaFreeBladeVelocities(ActuatorLine* actuatorLine);
-
-    void cudaAllocBladeForces(ActuatorLine* actuatorLine);
-    void cudaCopyBladeForcesHtoD(ActuatorLine* actuatorLine);
-    void cudaCopyBladeForcesDtoH(ActuatorLine* actuatorLine);
-    void cudaFreeBladeForces(ActuatorLine* actuatorLine);
-
-    void cudaAllocSphereIndices(ActuatorLine* actuatorLine);
-    void cudaCopySphereIndicesHtoD(ActuatorLine* actuatorLine);
-    void cudaFreeSphereIndices(ActuatorLine* actuatorLine);
-
+    void cudaAllocTaggedFluidNodeIndices(CollisionTemplate tag, int lev);
+    void cudaCopyTaggedFluidNodeIndices(CollisionTemplate tag, int lev);
+    void cudaFreeTaggedFluidNodeIndices(CollisionTemplate tag, int lev);
+
+    // ActuatorFarm
+    void cudaAllocBladeGeometries(ActuatorFarm* actuatorFarm);
+    void cudaCopyBladeGeometriesHtoD(ActuatorFarm* actuatorFarm);
+    void cudaCopyBladeGeometriesDtoH(ActuatorFarm* actuatorFarm);
+    void cudaFreeBladeGeometries(ActuatorFarm* actuatorFarm);
+
+    void cudaAllocBladeOrientations(ActuatorFarm* actuatorFarm);
+    void cudaCopyBladeOrientationsHtoD(ActuatorFarm* actuatorFarm);
+    void cudaCopyBladeOrientationsDtoH(ActuatorFarm* actuatorFarm);
+    void cudaFreeBladeOrientations(ActuatorFarm* actuatorFarm);
+
+    void cudaAllocBladeCoords(ActuatorFarm* actuatorFarm);
+    void cudaCopyBladeCoordsHtoD(ActuatorFarm* actuatorFarm);
+    void cudaCopyBladeCoordsDtoH(ActuatorFarm* actuatorFarm);
+    void cudaFreeBladeCoords(ActuatorFarm* actuatorFarm);
+
+    void cudaAllocBladeIndices(ActuatorFarm* actuatorFarm);
+    void cudaCopyBladeIndicesHtoD(ActuatorFarm* actuatorFarm);
+    void cudaFreeBladeIndices(ActuatorFarm* actuatorFarm);
+
+    void cudaAllocBladeVelocities(ActuatorFarm* actuatorFarm);
+    void cudaCopyBladeVelocitiesHtoD(ActuatorFarm* actuatorFarm);
+    void cudaCopyBladeVelocitiesDtoH(ActuatorFarm* actuatorFarm);
+    void cudaFreeBladeVelocities(ActuatorFarm* actuatorFarm);
+
+    void cudaAllocBladeForces(ActuatorFarm* actuatorFarm);
+    void cudaCopyBladeForcesHtoD(ActuatorFarm* actuatorFarm);
+    void cudaCopyBladeForcesDtoH(ActuatorFarm* actuatorFarm);
+    void cudaFreeBladeForces(ActuatorFarm* actuatorFarm);
+
+    void cudaAllocSphereIndices(ActuatorFarm* actuatorFarm);
+    void cudaCopySphereIndicesHtoD(ActuatorFarm* actuatorFarm);
+    void cudaFreeSphereIndices(ActuatorFarm* actuatorFarm);
+    // Probes
     void cudaAllocProbeDistances(Probe* probe, int level);
     void cudaCopyProbeDistancesHtoD(Probe* probe, int level);
     void cudaCopyProbeDistancesDtoH(Probe* probe, int level);
@@ -402,6 +407,12 @@ public:
     void cudaCopyProbeQuantitiesAndOffsetsDtoH(Probe* probe, int level);
     void cudaFreeProbeQuantitiesAndOffsets(Probe* probe, int level);
 
+    // PrecursorWriter
+    void cudaAllocPrecursorWriter(PrecursorWriter* writer, int level);
+    void cudaCopyPrecursorWriterIndicesHtoD(PrecursorWriter* writer, int level);
+    void cudaCopyPrecursorWriterOutputVariablesDtoH(PrecursorWriter* writer, int level);
+    void cudaFreePrecursorWriter(PrecursorWriter* writer, int level);
+
 private:
     std::shared_ptr<Parameter> parameter;
     double memsizeGPU = 0.0;
diff --git a/src/gpu/VirtualFluids_GPU/GPU/Cumulant27.cu b/src/gpu/VirtualFluids_GPU/GPU/Cumulant27.cu
index bbce8181d814fc8b9dbb086764becb73a86c0eda..553e1f34f7993a42682605b66d53407ede9292fd 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/Cumulant27.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/Cumulant27.cu
@@ -21,7 +21,7 @@ __global__ void LB_Kernel_Kum_AA2016_Comp_Bulk_SP_27(real omega,
 																unsigned int* neighborY,
 																unsigned int* neighborZ,
 																real* DDStart,
-																int size_Mat,
+																unsigned long long numberOfLBnodes,
 																int level,
 																real* forces,
 																bool EvenOrOdd)
@@ -37,7 +37,7 @@ __global__ void LB_Kernel_Kum_AA2016_Comp_Bulk_SP_27(real omega,
 	const unsigned k = nx*(ny*z + y) + x;
 	//////////////////////////////////////////////////////////////////////////
 
-	if(k<size_Mat)
+	if(k<numberOfLBnodes)
 	{
 		////////////////////////////////////////////////////////////////////////////////
 		unsigned int BC;
@@ -48,63 +48,63 @@ __global__ void LB_Kernel_Kum_AA2016_Comp_Bulk_SP_27(real omega,
 			Distributions27 D;
 			if (EvenOrOdd==true)
 			{
-				D.f[DIR_P00   ] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_M00   ] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0P0   ] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0M0   ] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00P   ] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00M   ] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_PP0  ] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_MM0  ] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_PM0  ] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_MP0  ] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_P0P  ] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_M0M  ] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_P0M  ] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_M0P  ] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0PP  ] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0MM  ] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0PM  ] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0MP  ] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_PPP ] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_MMP ] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_PMP ] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_MPP ] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_PPM ] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_MMM ] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_PMM ] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_MPM ] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_P00 * numberOfLBnodes];
+				D.f[DIR_M00] = &DDStart[DIR_M00 * numberOfLBnodes];
+				D.f[DIR_0P0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+				D.f[DIR_0M0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+				D.f[DIR_00P] = &DDStart[DIR_00P * numberOfLBnodes];
+				D.f[DIR_00M] = &DDStart[DIR_00M * numberOfLBnodes];
+				D.f[DIR_PP0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+				D.f[DIR_MM0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+				D.f[DIR_PM0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+				D.f[DIR_MP0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+				D.f[DIR_P0P] = &DDStart[DIR_P0P * numberOfLBnodes];
+				D.f[DIR_M0M] = &DDStart[DIR_M0M * numberOfLBnodes];
+				D.f[DIR_P0M] = &DDStart[DIR_P0M * numberOfLBnodes];
+				D.f[DIR_M0P] = &DDStart[DIR_M0P * numberOfLBnodes];
+				D.f[DIR_0PP] = &DDStart[DIR_0PP * numberOfLBnodes];
+				D.f[DIR_0MM] = &DDStart[DIR_0MM * numberOfLBnodes];
+				D.f[DIR_0PM] = &DDStart[DIR_0PM * numberOfLBnodes];
+				D.f[DIR_0MP] = &DDStart[DIR_0MP * numberOfLBnodes];
+				D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+				D.f[DIR_PPP] = &DDStart[DIR_PPP * numberOfLBnodes];
+				D.f[DIR_MMP] = &DDStart[DIR_MMP * numberOfLBnodes];
+				D.f[DIR_PMP] = &DDStart[DIR_PMP * numberOfLBnodes];
+				D.f[DIR_MPP] = &DDStart[DIR_MPP * numberOfLBnodes];
+				D.f[DIR_PPM] = &DDStart[DIR_PPM * numberOfLBnodes];
+				D.f[DIR_MMM] = &DDStart[DIR_MMM * numberOfLBnodes];
+				D.f[DIR_PMM] = &DDStart[DIR_PMM * numberOfLBnodes];
+				D.f[DIR_MPM] = &DDStart[DIR_MPM * numberOfLBnodes];
 			}
 			else
 			{
-				D.f[DIR_M00   ] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_P00   ] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0M0   ] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0P0   ] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00M   ] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00P   ] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_MM0  ] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_PP0  ] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_MP0  ] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_PM0  ] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_M0M  ] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_P0P  ] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_M0P  ] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_P0M  ] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0MM  ] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0PP  ] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0MP  ] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0PM  ] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_MMM ] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_PPM ] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_MPM ] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_PMM ] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_MMP ] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_PPP ] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_MPP ] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_PMP ] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_P00 * numberOfLBnodes];
+				D.f[DIR_P00] = &DDStart[DIR_M00 * numberOfLBnodes];
+				D.f[DIR_0M0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+				D.f[DIR_0P0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+				D.f[DIR_00M] = &DDStart[DIR_00P * numberOfLBnodes];
+				D.f[DIR_00P] = &DDStart[DIR_00M * numberOfLBnodes];
+				D.f[DIR_MM0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+				D.f[DIR_PP0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+				D.f[DIR_MP0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+				D.f[DIR_PM0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+				D.f[DIR_M0M] = &DDStart[DIR_P0P * numberOfLBnodes];
+				D.f[DIR_P0P] = &DDStart[DIR_M0M * numberOfLBnodes];
+				D.f[DIR_M0P] = &DDStart[DIR_P0M * numberOfLBnodes];
+				D.f[DIR_P0M] = &DDStart[DIR_M0P * numberOfLBnodes];
+				D.f[DIR_0MM] = &DDStart[DIR_0PP * numberOfLBnodes];
+				D.f[DIR_0PP] = &DDStart[DIR_0MM * numberOfLBnodes];
+				D.f[DIR_0MP] = &DDStart[DIR_0PM * numberOfLBnodes];
+				D.f[DIR_0PM] = &DDStart[DIR_0MP * numberOfLBnodes];
+				D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+				D.f[DIR_MMM] = &DDStart[DIR_PPP * numberOfLBnodes];
+				D.f[DIR_PPM] = &DDStart[DIR_MMP * numberOfLBnodes];
+				D.f[DIR_MPM] = &DDStart[DIR_PMP * numberOfLBnodes];
+				D.f[DIR_PMM] = &DDStart[DIR_MPP * numberOfLBnodes];
+				D.f[DIR_MMP] = &DDStart[DIR_PPM * numberOfLBnodes];
+				D.f[DIR_PPP] = &DDStart[DIR_MMM * numberOfLBnodes];
+				D.f[DIR_MPP] = &DDStart[DIR_PMM * numberOfLBnodes];
+				D.f[DIR_PMP] = &DDStart[DIR_MPM * numberOfLBnodes];
 			}
 
 			////////////////////////////////////////////////////////////////////////////////
@@ -137,33 +137,33 @@ __global__ void LB_Kernel_Kum_AA2016_Comp_Bulk_SP_27(real omega,
 			//unsigned int ktne = k;
 			unsigned int kbsw = neighborZ[ksw];
 			//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-			real mfcbb = (D.f[DIR_P00   ])[k  ];//[ke   ];// +  c2over27 ;(D.f[DIR_P00   ])[k  ];//ke
-			real mfabb = (D.f[DIR_M00   ])[kw ];//[kw   ];// +  c2over27 ;(D.f[DIR_M00   ])[kw ];
-			real mfbcb = (D.f[DIR_0P0   ])[k  ];//[kn   ];// +  c2over27 ;(D.f[DIR_0P0   ])[k  ];//kn
-			real mfbab = (D.f[DIR_0M0   ])[ks ];//[ks   ];// +  c2over27 ;(D.f[DIR_0M0   ])[ks ];
-			real mfbbc = (D.f[DIR_00P   ])[k  ];//[kt   ];// +  c2over27 ;(D.f[DIR_00P   ])[k  ];//kt
-			real mfbba = (D.f[DIR_00M   ])[kb ];//[kb   ];// +  c2over27 ;(D.f[DIR_00M   ])[kb ];
-			real mfccb = (D.f[DIR_PP0  ])[k  ];//[kne  ];// +  c1over54 ;(D.f[DIR_PP0  ])[k  ];//kne
-			real mfaab = (D.f[DIR_MM0  ])[ksw];//[ksw  ];// +  c1over54 ;(D.f[DIR_MM0  ])[ksw];
-			real mfcab = (D.f[DIR_PM0  ])[ks ];//[kse  ];// +  c1over54 ;(D.f[DIR_PM0  ])[ks ];//kse
-			real mfacb = (D.f[DIR_MP0  ])[kw ];//[knw  ];// +  c1over54 ;(D.f[DIR_MP0  ])[kw ];//knw
-			real mfcbc = (D.f[DIR_P0P  ])[k  ];//[kte  ];// +  c1over54 ;(D.f[DIR_P0P  ])[k  ];//kte
-			real mfaba = (D.f[DIR_M0M  ])[kbw];//[kbw  ];// +  c1over54 ;(D.f[DIR_M0M  ])[kbw];
-			real mfcba = (D.f[DIR_P0M  ])[kb ];//[kbe  ];// +  c1over54 ;(D.f[DIR_P0M  ])[kb ];//kbe
-			real mfabc = (D.f[DIR_M0P  ])[kw ];//[ktw  ];// +  c1over54 ;(D.f[DIR_M0P  ])[kw ];//ktw
-			real mfbcc = (D.f[DIR_0PP  ])[k  ];//[ktn  ];// +  c1over54 ;(D.f[DIR_0PP  ])[k  ];//ktn
-			real mfbaa = (D.f[DIR_0MM  ])[kbs];//[kbs  ];// +  c1over54 ;(D.f[DIR_0MM  ])[kbs];
-			real mfbca = (D.f[DIR_0PM  ])[kb ];//[kbn  ];// +  c1over54 ;(D.f[DIR_0PM  ])[kb ];//kbn
-			real mfbac = (D.f[DIR_0MP  ])[ks ];//[kts  ];// +  c1over54 ;(D.f[DIR_0MP  ])[ks ];//kts
+			real mfcbb = (D.f[DIR_P00])[k  ];//[ke   ];// +  c2over27 ;(D.f[DIR_P00])[k  ];//ke
+			real mfabb = (D.f[DIR_M00])[kw ];//[kw   ];// +  c2over27 ;(D.f[DIR_M00])[kw ];
+			real mfbcb = (D.f[DIR_0P0])[k  ];//[kn   ];// +  c2over27 ;(D.f[DIR_0P0])[k  ];//kn
+			real mfbab = (D.f[DIR_0M0])[ks ];//[ks   ];// +  c2over27 ;(D.f[DIR_0M0])[ks ];
+			real mfbbc = (D.f[DIR_00P])[k  ];//[kt   ];// +  c2over27 ;(D.f[DIR_00P])[k  ];//kt
+			real mfbba = (D.f[DIR_00M])[kb ];//[kb   ];// +  c2over27 ;(D.f[DIR_00M])[kb ];
+			real mfccb = (D.f[DIR_PP0])[k  ];//[kne  ];// +  c1over54 ;(D.f[DIR_PP0])[k  ];//kne
+			real mfaab = (D.f[DIR_MM0])[ksw];//[ksw  ];// +  c1over54 ;(D.f[DIR_MM0])[ksw];
+			real mfcab = (D.f[DIR_PM0])[ks ];//[kse  ];// +  c1over54 ;(D.f[DIR_PM0])[ks ];//kse
+			real mfacb = (D.f[DIR_MP0])[kw ];//[knw  ];// +  c1over54 ;(D.f[DIR_MP0])[kw ];//knw
+			real mfcbc = (D.f[DIR_P0P])[k  ];//[kte  ];// +  c1over54 ;(D.f[DIR_P0P])[k  ];//kte
+			real mfaba = (D.f[DIR_M0M])[kbw];//[kbw  ];// +  c1over54 ;(D.f[DIR_M0M])[kbw];
+			real mfcba = (D.f[DIR_P0M])[kb ];//[kbe  ];// +  c1over54 ;(D.f[DIR_P0M])[kb ];//kbe
+			real mfabc = (D.f[DIR_M0P])[kw ];//[ktw  ];// +  c1over54 ;(D.f[DIR_M0P])[kw ];//ktw
+			real mfbcc = (D.f[DIR_0PP])[k  ];//[ktn  ];// +  c1over54 ;(D.f[DIR_0PP])[k  ];//ktn
+			real mfbaa = (D.f[DIR_0MM])[kbs];//[kbs  ];// +  c1over54 ;(D.f[DIR_0MM])[kbs];
+			real mfbca = (D.f[DIR_0PM])[kb ];//[kbn  ];// +  c1over54 ;(D.f[DIR_0PM])[kb ];//kbn
+			real mfbac = (D.f[DIR_0MP])[ks ];//[kts  ];// +  c1over54 ;(D.f[DIR_0MP])[ks ];//kts
 			real mfbbb = (D.f[DIR_000])[k  ];//[kzero];// +  c8over27 ;(D.f[DIR_000])[k  ];//kzero
-			real mfccc = (D.f[DIR_PPP ])[k  ];//[ktne ];// +  c1over216;(D.f[DIR_PPP ])[k  ];//ktne
-			real mfaac = (D.f[DIR_MMP ])[ksw];//[ktsw ];// +  c1over216;(D.f[DIR_MMP ])[ksw];//ktsw
-			real mfcac = (D.f[DIR_PMP ])[ks ];//[ktse ];// +  c1over216;(D.f[DIR_PMP ])[ks ];//ktse
-			real mfacc = (D.f[DIR_MPP ])[kw ];//[ktnw ];// +  c1over216;(D.f[DIR_MPP ])[kw ];//ktnw
-			real mfcca = (D.f[DIR_PPM ])[kb ];//[kbne ];// +  c1over216;(D.f[DIR_PPM ])[kb ];//kbne
-			real mfaaa = (D.f[DIR_MMM ])[kbsw];//[kbsw ];// +  c1over216;(D.f[DIR_MMM ])[kbsw];
-			real mfcaa = (D.f[DIR_PMM ])[kbs];//[kbse ];// +  c1over216;(D.f[DIR_PMM ])[kbs];//kbse
-			real mfaca = (D.f[DIR_MPM ])[kbw];//[kbnw ];// +  c1over216;(D.f[DIR_MPM ])[kbw];//kbnw
+			real mfccc = (D.f[DIR_PPP])[k  ];//[ktne ];// +  c1over216;(D.f[DIR_PPP])[k  ];//ktne
+			real mfaac = (D.f[DIR_MMP])[ksw];//[ktsw ];// +  c1over216;(D.f[DIR_MMP])[ksw];//ktsw
+			real mfcac = (D.f[DIR_PMP])[ks ];//[ktse ];// +  c1over216;(D.f[DIR_PMP])[ks ];//ktse
+			real mfacc = (D.f[DIR_MPP])[kw ];//[ktnw ];// +  c1over216;(D.f[DIR_MPP])[kw ];//ktnw
+			real mfcca = (D.f[DIR_PPM])[kb ];//[kbne ];// +  c1over216;(D.f[DIR_PPM])[kb ];//kbne
+			real mfaaa = (D.f[DIR_MMM])[kbsw];//[kbsw ];// +  c1over216;(D.f[DIR_MMM])[kbsw];
+			real mfcaa = (D.f[DIR_PMM])[kbs];//[kbse ];// +  c1over216;(D.f[DIR_PMM])[kbs];//kbse
+			real mfaca = (D.f[DIR_MPM])[kbw];//[kbnw ];// +  c1over216;(D.f[DIR_MPM])[kbw];//kbnw
 			////////////////////////////////////////////////////////////////////////////////////
 			real drho = ((((mfccc+mfaaa) + (mfaca+mfcac)) + ((mfacc+mfcaa) + (mfaac+mfcca))) + 
 							(((mfbac+mfbca) + (mfbaa+mfbcc)) + ((mfabc+mfcba) + (mfaba+mfcbc)) + ((mfacb+mfcab) + (mfaab+mfccb))) +
@@ -975,7 +975,7 @@ __global__ void LB_Kernel_Kum_IsoTest_SP_27( real omega,
 														real* dxxUx,
 														real* dyyUy,
 														real* dzzUz,
-														int size_Mat,
+														unsigned long long numberOfLBnodes,
 														bool EvenOrOdd)
 {
 	////////////////////////////////////////////////////////////////////////////////
@@ -989,7 +989,7 @@ __global__ void LB_Kernel_Kum_IsoTest_SP_27( real omega,
 	const unsigned k = nx*(ny*z + y) + x;
 	//////////////////////////////////////////////////////////////////////////
 
-	if(k<size_Mat)
+	if(k<numberOfLBnodes)
 	{
 		////////////////////////////////////////////////////////////////////////////////
 		unsigned int BC;
@@ -1000,63 +1000,63 @@ __global__ void LB_Kernel_Kum_IsoTest_SP_27( real omega,
 			Distributions27 D;
 			if (EvenOrOdd==true)
 			{
-				D.f[DIR_P00   ] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_M00   ] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0P0   ] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0M0   ] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00P   ] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00M   ] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_PP0  ] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_MM0  ] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_PM0  ] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_MP0  ] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_P0P  ] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_M0M  ] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_P0M  ] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_M0P  ] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0PP  ] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0MM  ] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0PM  ] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0MP  ] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_PPP ] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_MMP ] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_PMP ] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_MPP ] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_PPM ] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_MMM ] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_PMM ] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_MPM ] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_P00 * numberOfLBnodes];
+				D.f[DIR_M00] = &DDStart[DIR_M00 * numberOfLBnodes];
+				D.f[DIR_0P0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+				D.f[DIR_0M0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+				D.f[DIR_00P] = &DDStart[DIR_00P * numberOfLBnodes];
+				D.f[DIR_00M] = &DDStart[DIR_00M * numberOfLBnodes];
+				D.f[DIR_PP0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+				D.f[DIR_MM0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+				D.f[DIR_PM0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+				D.f[DIR_MP0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+				D.f[DIR_P0P] = &DDStart[DIR_P0P * numberOfLBnodes];
+				D.f[DIR_M0M] = &DDStart[DIR_M0M * numberOfLBnodes];
+				D.f[DIR_P0M] = &DDStart[DIR_P0M * numberOfLBnodes];
+				D.f[DIR_M0P] = &DDStart[DIR_M0P * numberOfLBnodes];
+				D.f[DIR_0PP] = &DDStart[DIR_0PP * numberOfLBnodes];
+				D.f[DIR_0MM] = &DDStart[DIR_0MM * numberOfLBnodes];
+				D.f[DIR_0PM] = &DDStart[DIR_0PM * numberOfLBnodes];
+				D.f[DIR_0MP] = &DDStart[DIR_0MP * numberOfLBnodes];
+				D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+				D.f[DIR_PPP] = &DDStart[DIR_PPP * numberOfLBnodes];
+				D.f[DIR_MMP] = &DDStart[DIR_MMP * numberOfLBnodes];
+				D.f[DIR_PMP] = &DDStart[DIR_PMP * numberOfLBnodes];
+				D.f[DIR_MPP] = &DDStart[DIR_MPP * numberOfLBnodes];
+				D.f[DIR_PPM] = &DDStart[DIR_PPM * numberOfLBnodes];
+				D.f[DIR_MMM] = &DDStart[DIR_MMM * numberOfLBnodes];
+				D.f[DIR_PMM] = &DDStart[DIR_PMM * numberOfLBnodes];
+				D.f[DIR_MPM] = &DDStart[DIR_MPM * numberOfLBnodes];
 			}
 			else
 			{
-				D.f[DIR_M00   ] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_P00   ] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0M0   ] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0P0   ] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00M   ] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00P   ] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_MM0  ] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_PP0  ] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_MP0  ] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_PM0  ] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_M0M  ] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_P0P  ] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_M0P  ] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_P0M  ] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0MM  ] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0PP  ] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0MP  ] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0PM  ] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_MMM ] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_PPM ] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_MPM ] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_PMM ] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_MMP ] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_PPP ] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_MPP ] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_PMP ] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_P00 * numberOfLBnodes];
+				D.f[DIR_P00] = &DDStart[DIR_M00 * numberOfLBnodes];
+				D.f[DIR_0M0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+				D.f[DIR_0P0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+				D.f[DIR_00M] = &DDStart[DIR_00P * numberOfLBnodes];
+				D.f[DIR_00P] = &DDStart[DIR_00M * numberOfLBnodes];
+				D.f[DIR_MM0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+				D.f[DIR_PP0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+				D.f[DIR_MP0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+				D.f[DIR_PM0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+				D.f[DIR_M0M] = &DDStart[DIR_P0P * numberOfLBnodes];
+				D.f[DIR_P0P] = &DDStart[DIR_M0M * numberOfLBnodes];
+				D.f[DIR_M0P] = &DDStart[DIR_P0M * numberOfLBnodes];
+				D.f[DIR_P0M] = &DDStart[DIR_M0P * numberOfLBnodes];
+				D.f[DIR_0MM] = &DDStart[DIR_0PP * numberOfLBnodes];
+				D.f[DIR_0PP] = &DDStart[DIR_0MM * numberOfLBnodes];
+				D.f[DIR_0MP] = &DDStart[DIR_0PM * numberOfLBnodes];
+				D.f[DIR_0PM] = &DDStart[DIR_0MP * numberOfLBnodes];
+				D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+				D.f[DIR_MMM] = &DDStart[DIR_PPP * numberOfLBnodes];
+				D.f[DIR_PPM] = &DDStart[DIR_MMP * numberOfLBnodes];
+				D.f[DIR_MPM] = &DDStart[DIR_PMP * numberOfLBnodes];
+				D.f[DIR_PMM] = &DDStart[DIR_MPP * numberOfLBnodes];
+				D.f[DIR_MMP] = &DDStart[DIR_PPM * numberOfLBnodes];
+				D.f[DIR_PPP] = &DDStart[DIR_MMM * numberOfLBnodes];
+				D.f[DIR_MPP] = &DDStart[DIR_PMM * numberOfLBnodes];
+				D.f[DIR_PMP] = &DDStart[DIR_MPM * numberOfLBnodes];
 			}
 
 			////////////////////////////////////////////////////////////////////////////////
@@ -1089,33 +1089,33 @@ __global__ void LB_Kernel_Kum_IsoTest_SP_27( real omega,
 			//unsigned int ktne = k;
 			unsigned int kbsw = neighborZ[ksw];
 			//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-			real mfcbb = (D.f[DIR_P00   ])[k  ];//[ke   ];// +  c2over27 ;(D.f[DIR_P00   ])[k  ];//ke
-			real mfabb = (D.f[DIR_M00   ])[kw ];//[kw   ];// +  c2over27 ;(D.f[DIR_M00   ])[kw ];
-			real mfbcb = (D.f[DIR_0P0   ])[k  ];//[kn   ];// +  c2over27 ;(D.f[DIR_0P0   ])[k  ];//kn
-			real mfbab = (D.f[DIR_0M0   ])[ks ];//[ks   ];// +  c2over27 ;(D.f[DIR_0M0   ])[ks ];
-			real mfbbc = (D.f[DIR_00P   ])[k  ];//[kt   ];// +  c2over27 ;(D.f[DIR_00P   ])[k  ];//kt
-			real mfbba = (D.f[DIR_00M   ])[kb ];//[kb   ];// +  c2over27 ;(D.f[DIR_00M   ])[kb ];
-			real mfccb = (D.f[DIR_PP0  ])[k  ];//[kne  ];// +  c1over54 ;(D.f[DIR_PP0  ])[k  ];//kne
-			real mfaab = (D.f[DIR_MM0  ])[ksw];//[ksw  ];// +  c1over54 ;(D.f[DIR_MM0  ])[ksw];
-			real mfcab = (D.f[DIR_PM0  ])[ks ];//[kse  ];// +  c1over54 ;(D.f[DIR_PM0  ])[ks ];//kse
-			real mfacb = (D.f[DIR_MP0  ])[kw ];//[knw  ];// +  c1over54 ;(D.f[DIR_MP0  ])[kw ];//knw
-			real mfcbc = (D.f[DIR_P0P  ])[k  ];//[kte  ];// +  c1over54 ;(D.f[DIR_P0P  ])[k  ];//kte
-			real mfaba = (D.f[DIR_M0M  ])[kbw];//[kbw  ];// +  c1over54 ;(D.f[DIR_M0M  ])[kbw];
-			real mfcba = (D.f[DIR_P0M  ])[kb ];//[kbe  ];// +  c1over54 ;(D.f[DIR_P0M  ])[kb ];//kbe
-			real mfabc = (D.f[DIR_M0P  ])[kw ];//[ktw  ];// +  c1over54 ;(D.f[DIR_M0P  ])[kw ];//ktw
-			real mfbcc = (D.f[DIR_0PP  ])[k  ];//[ktn  ];// +  c1over54 ;(D.f[DIR_0PP  ])[k  ];//ktn
-			real mfbaa = (D.f[DIR_0MM  ])[kbs];//[kbs  ];// +  c1over54 ;(D.f[DIR_0MM  ])[kbs];
-			real mfbca = (D.f[DIR_0PM  ])[kb ];//[kbn  ];// +  c1over54 ;(D.f[DIR_0PM  ])[kb ];//kbn
-			real mfbac = (D.f[DIR_0MP  ])[ks ];//[kts  ];// +  c1over54 ;(D.f[DIR_0MP  ])[ks ];//kts
+			real mfcbb = (D.f[DIR_P00])[k  ];//[ke   ];// +  c2over27 ;(D.f[DIR_P00])[k  ];//ke
+			real mfabb = (D.f[DIR_M00])[kw ];//[kw   ];// +  c2over27 ;(D.f[DIR_M00])[kw ];
+			real mfbcb = (D.f[DIR_0P0])[k  ];//[kn   ];// +  c2over27 ;(D.f[DIR_0P0])[k  ];//kn
+			real mfbab = (D.f[DIR_0M0])[ks ];//[ks   ];// +  c2over27 ;(D.f[DIR_0M0])[ks ];
+			real mfbbc = (D.f[DIR_00P])[k  ];//[kt   ];// +  c2over27 ;(D.f[DIR_00P])[k  ];//kt
+			real mfbba = (D.f[DIR_00M])[kb ];//[kb   ];// +  c2over27 ;(D.f[DIR_00M])[kb ];
+			real mfccb = (D.f[DIR_PP0])[k  ];//[kne  ];// +  c1over54 ;(D.f[DIR_PP0])[k  ];//kne
+			real mfaab = (D.f[DIR_MM0])[ksw];//[ksw  ];// +  c1over54 ;(D.f[DIR_MM0])[ksw];
+			real mfcab = (D.f[DIR_PM0])[ks ];//[kse  ];// +  c1over54 ;(D.f[DIR_PM0])[ks ];//kse
+			real mfacb = (D.f[DIR_MP0])[kw ];//[knw  ];// +  c1over54 ;(D.f[DIR_MP0])[kw ];//knw
+			real mfcbc = (D.f[DIR_P0P])[k  ];//[kte  ];// +  c1over54 ;(D.f[DIR_P0P])[k  ];//kte
+			real mfaba = (D.f[DIR_M0M])[kbw];//[kbw  ];// +  c1over54 ;(D.f[DIR_M0M])[kbw];
+			real mfcba = (D.f[DIR_P0M])[kb ];//[kbe  ];// +  c1over54 ;(D.f[DIR_P0M])[kb ];//kbe
+			real mfabc = (D.f[DIR_M0P])[kw ];//[ktw  ];// +  c1over54 ;(D.f[DIR_M0P])[kw ];//ktw
+			real mfbcc = (D.f[DIR_0PP])[k  ];//[ktn  ];// +  c1over54 ;(D.f[DIR_0PP])[k  ];//ktn
+			real mfbaa = (D.f[DIR_0MM])[kbs];//[kbs  ];// +  c1over54 ;(D.f[DIR_0MM])[kbs];
+			real mfbca = (D.f[DIR_0PM])[kb ];//[kbn  ];// +  c1over54 ;(D.f[DIR_0PM])[kb ];//kbn
+			real mfbac = (D.f[DIR_0MP])[ks ];//[kts  ];// +  c1over54 ;(D.f[DIR_0MP])[ks ];//kts
 			real mfbbb = (D.f[DIR_000])[k  ];//[kzero];// +  c8over27 ;(D.f[DIR_000])[k  ];//kzero
-			real mfccc = (D.f[DIR_PPP ])[k  ];//[ktne ];// +  c1over216;(D.f[DIR_PPP ])[k  ];//ktne
-			real mfaac = (D.f[DIR_MMP ])[ksw];//[ktsw ];// +  c1over216;(D.f[DIR_MMP ])[ksw];//ktsw
-			real mfcac = (D.f[DIR_PMP ])[ks ];//[ktse ];// +  c1over216;(D.f[DIR_PMP ])[ks ];//ktse
-			real mfacc = (D.f[DIR_MPP ])[kw ];//[ktnw ];// +  c1over216;(D.f[DIR_MPP ])[kw ];//ktnw
-			real mfcca = (D.f[DIR_PPM ])[kb ];//[kbne ];// +  c1over216;(D.f[DIR_PPM ])[kb ];//kbne
-			real mfaaa = (D.f[DIR_MMM ])[kbsw];//[kbsw ];// +  c1over216;(D.f[DIR_MMM ])[kbsw];
-			real mfcaa = (D.f[DIR_PMM ])[kbs];//[kbse ];// +  c1over216;(D.f[DIR_PMM ])[kbs];//kbse
-			real mfaca = (D.f[DIR_MPM ])[kbw];//[kbnw ];// +  c1over216;(D.f[DIR_MPM ])[kbw];//kbnw
+			real mfccc = (D.f[DIR_PPP])[k  ];//[ktne ];// +  c1over216;(D.f[DIR_PPP])[k  ];//ktne
+			real mfaac = (D.f[DIR_MMP])[ksw];//[ktsw ];// +  c1over216;(D.f[DIR_MMP])[ksw];//ktsw
+			real mfcac = (D.f[DIR_PMP])[ks ];//[ktse ];// +  c1over216;(D.f[DIR_PMP])[ks ];//ktse
+			real mfacc = (D.f[DIR_MPP])[kw ];//[ktnw ];// +  c1over216;(D.f[DIR_MPP])[kw ];//ktnw
+			real mfcca = (D.f[DIR_PPM])[kb ];//[kbne ];// +  c1over216;(D.f[DIR_PPM])[kb ];//kbne
+			real mfaaa = (D.f[DIR_MMM])[kbsw];//[kbsw ];// +  c1over216;(D.f[DIR_MMM])[kbsw];
+			real mfcaa = (D.f[DIR_PMM])[kbs];//[kbse ];// +  c1over216;(D.f[DIR_PMM])[kbs];//kbse
+			real mfaca = (D.f[DIR_MPM])[kbw];//[kbnw ];// +  c1over216;(D.f[DIR_MPM])[kbw];//kbnw
 			////////////////////////////////////////////////////////////////////////////////////
 			//slow
 			//real oMdrho = one - ((((mfccc+mfaaa) + (mfaca+mfcac)) + ((mfacc+mfcaa) + (mfaac+mfcca))) + 
@@ -2016,7 +2016,7 @@ __global__ void LB_Kernel_Kum_1h_SP_27(  real omega,
 													real* coordY,
 													real* coordZ,
 													real* DDStart,
-													int size_Mat,
+													unsigned long long numberOfLBnodes,
 													bool EvenOrOdd)
 {
 	////////////////////////////////////////////////////////////////////////////////
@@ -2030,7 +2030,7 @@ __global__ void LB_Kernel_Kum_1h_SP_27(  real omega,
 	const unsigned k = nx*(ny*z + y) + x;
 	//////////////////////////////////////////////////////////////////////////
 
-	if(k<size_Mat)
+	if(k<numberOfLBnodes)
 	{
 		////////////////////////////////////////////////////////////////////////////////
 		unsigned int BC;
@@ -2041,63 +2041,63 @@ __global__ void LB_Kernel_Kum_1h_SP_27(  real omega,
 			Distributions27 D;
 			if (EvenOrOdd==true)
 			{
-				D.f[DIR_P00   ] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_M00   ] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0P0   ] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0M0   ] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00P   ] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00M   ] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_PP0  ] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_MM0  ] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_PM0  ] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_MP0  ] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_P0P  ] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_M0M  ] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_P0M  ] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_M0P  ] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0PP  ] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0MM  ] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0PM  ] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0MP  ] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_PPP ] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_MMP ] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_PMP ] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_MPP ] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_PPM ] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_MMM ] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_PMM ] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_MPM ] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_P00 * numberOfLBnodes];
+				D.f[DIR_M00] = &DDStart[DIR_M00 * numberOfLBnodes];
+				D.f[DIR_0P0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+				D.f[DIR_0M0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+				D.f[DIR_00P] = &DDStart[DIR_00P * numberOfLBnodes];
+				D.f[DIR_00M] = &DDStart[DIR_00M * numberOfLBnodes];
+				D.f[DIR_PP0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+				D.f[DIR_MM0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+				D.f[DIR_PM0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+				D.f[DIR_MP0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+				D.f[DIR_P0P] = &DDStart[DIR_P0P * numberOfLBnodes];
+				D.f[DIR_M0M] = &DDStart[DIR_M0M * numberOfLBnodes];
+				D.f[DIR_P0M] = &DDStart[DIR_P0M * numberOfLBnodes];
+				D.f[DIR_M0P] = &DDStart[DIR_M0P * numberOfLBnodes];
+				D.f[DIR_0PP] = &DDStart[DIR_0PP * numberOfLBnodes];
+				D.f[DIR_0MM] = &DDStart[DIR_0MM * numberOfLBnodes];
+				D.f[DIR_0PM] = &DDStart[DIR_0PM * numberOfLBnodes];
+				D.f[DIR_0MP] = &DDStart[DIR_0MP * numberOfLBnodes];
+				D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+				D.f[DIR_PPP] = &DDStart[DIR_PPP * numberOfLBnodes];
+				D.f[DIR_MMP] = &DDStart[DIR_MMP * numberOfLBnodes];
+				D.f[DIR_PMP] = &DDStart[DIR_PMP * numberOfLBnodes];
+				D.f[DIR_MPP] = &DDStart[DIR_MPP * numberOfLBnodes];
+				D.f[DIR_PPM] = &DDStart[DIR_PPM * numberOfLBnodes];
+				D.f[DIR_MMM] = &DDStart[DIR_MMM * numberOfLBnodes];
+				D.f[DIR_PMM] = &DDStart[DIR_PMM * numberOfLBnodes];
+				D.f[DIR_MPM] = &DDStart[DIR_MPM * numberOfLBnodes];
 			}
 			else
 			{
-				D.f[DIR_M00   ] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_P00   ] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0M0   ] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0P0   ] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00M   ] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00P   ] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_MM0  ] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_PP0  ] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_MP0  ] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_PM0  ] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_M0M  ] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_P0P  ] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_M0P  ] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_P0M  ] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0MM  ] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0PP  ] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0MP  ] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0PM  ] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_MMM ] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_PPM ] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_MPM ] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_PMM ] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_MMP ] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_PPP ] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_MPP ] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_PMP ] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_P00 * numberOfLBnodes];
+				D.f[DIR_P00] = &DDStart[DIR_M00 * numberOfLBnodes];
+				D.f[DIR_0M0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+				D.f[DIR_0P0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+				D.f[DIR_00M] = &DDStart[DIR_00P * numberOfLBnodes];
+				D.f[DIR_00P] = &DDStart[DIR_00M * numberOfLBnodes];
+				D.f[DIR_MM0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+				D.f[DIR_PP0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+				D.f[DIR_MP0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+				D.f[DIR_PM0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+				D.f[DIR_M0M] = &DDStart[DIR_P0P * numberOfLBnodes];
+				D.f[DIR_P0P] = &DDStart[DIR_M0M * numberOfLBnodes];
+				D.f[DIR_M0P] = &DDStart[DIR_P0M * numberOfLBnodes];
+				D.f[DIR_P0M] = &DDStart[DIR_M0P * numberOfLBnodes];
+				D.f[DIR_0MM] = &DDStart[DIR_0PP * numberOfLBnodes];
+				D.f[DIR_0PP] = &DDStart[DIR_0MM * numberOfLBnodes];
+				D.f[DIR_0MP] = &DDStart[DIR_0PM * numberOfLBnodes];
+				D.f[DIR_0PM] = &DDStart[DIR_0MP * numberOfLBnodes];
+				D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+				D.f[DIR_MMM] = &DDStart[DIR_PPP * numberOfLBnodes];
+				D.f[DIR_PPM] = &DDStart[DIR_MMP * numberOfLBnodes];
+				D.f[DIR_MPM] = &DDStart[DIR_PMP * numberOfLBnodes];
+				D.f[DIR_PMM] = &DDStart[DIR_MPP * numberOfLBnodes];
+				D.f[DIR_MMP] = &DDStart[DIR_PPM * numberOfLBnodes];
+				D.f[DIR_PPP] = &DDStart[DIR_MMM * numberOfLBnodes];
+				D.f[DIR_MPP] = &DDStart[DIR_PMM * numberOfLBnodes];
+				D.f[DIR_PMP] = &DDStart[DIR_MPM * numberOfLBnodes];
 			}
 
 			////////////////////////////////////////////////////////////////////////////////
@@ -2158,33 +2158,33 @@ __global__ void LB_Kernel_Kum_1h_SP_27(  real omega,
 			//unsigned int ktne = k;
 			//unsigned int kbsw = neighborZ[ksw];
 			//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-			real mfcbb = (D.f[DIR_P00   ])[k  ];//[ke   ];// +  c2over27 ;(D.f[DIR_P00   ])[k  ];//ke
-			real mfabb = (D.f[DIR_M00   ])[kw ];//[kw   ];// +  c2over27 ;(D.f[DIR_M00   ])[kw ];
-			real mfbcb = (D.f[DIR_0P0   ])[k  ];//[kn   ];// +  c2over27 ;(D.f[DIR_0P0   ])[k  ];//kn
-			real mfbab = (D.f[DIR_0M0   ])[ks ];//[ks   ];// +  c2over27 ;(D.f[DIR_0M0   ])[ks ];
-			real mfbbc = (D.f[DIR_00P   ])[k  ];//[kt   ];// +  c2over27 ;(D.f[DIR_00P   ])[k  ];//kt
-			real mfbba = (D.f[DIR_00M   ])[kb ];//[kb   ];// +  c2over27 ;(D.f[DIR_00M   ])[kb ];
-			real mfccb = (D.f[DIR_PP0  ])[k  ];//[kne  ];// +  c1over54 ;(D.f[DIR_PP0  ])[k  ];//kne
-			real mfaab = (D.f[DIR_MM0  ])[ksw];//[ksw  ];// +  c1over54 ;(D.f[DIR_MM0  ])[ksw];
-			real mfcab = (D.f[DIR_PM0  ])[ks ];//[kse  ];// +  c1over54 ;(D.f[DIR_PM0  ])[ks ];//kse
-			real mfacb = (D.f[DIR_MP0  ])[kw ];//[knw  ];// +  c1over54 ;(D.f[DIR_MP0  ])[kw ];//knw
-			real mfcbc = (D.f[DIR_P0P  ])[k  ];//[kte  ];// +  c1over54 ;(D.f[DIR_P0P  ])[k  ];//kte
-			real mfaba = (D.f[DIR_M0M  ])[kbw];//[kbw  ];// +  c1over54 ;(D.f[DIR_M0M  ])[kbw];
-			real mfcba = (D.f[DIR_P0M  ])[kb ];//[kbe  ];// +  c1over54 ;(D.f[DIR_P0M  ])[kb ];//kbe
-			real mfabc = (D.f[DIR_M0P  ])[kw ];//[ktw  ];// +  c1over54 ;(D.f[DIR_M0P  ])[kw ];//ktw
-			real mfbcc = (D.f[DIR_0PP  ])[k  ];//[ktn  ];// +  c1over54 ;(D.f[DIR_0PP  ])[k  ];//ktn
-			real mfbaa = (D.f[DIR_0MM  ])[kbs];//[kbs  ];// +  c1over54 ;(D.f[DIR_0MM  ])[kbs];
-			real mfbca = (D.f[DIR_0PM  ])[kb ];//[kbn  ];// +  c1over54 ;(D.f[DIR_0PM  ])[kb ];//kbn
-			real mfbac = (D.f[DIR_0MP  ])[ks ];//[kts  ];// +  c1over54 ;(D.f[DIR_0MP  ])[ks ];//kts
+			real mfcbb = (D.f[DIR_P00])[k  ];//[ke   ];// +  c2over27 ;(D.f[DIR_P00])[k  ];//ke
+			real mfabb = (D.f[DIR_M00])[kw ];//[kw   ];// +  c2over27 ;(D.f[DIR_M00])[kw ];
+			real mfbcb = (D.f[DIR_0P0])[k  ];//[kn   ];// +  c2over27 ;(D.f[DIR_0P0])[k  ];//kn
+			real mfbab = (D.f[DIR_0M0])[ks ];//[ks   ];// +  c2over27 ;(D.f[DIR_0M0])[ks ];
+			real mfbbc = (D.f[DIR_00P])[k  ];//[kt   ];// +  c2over27 ;(D.f[DIR_00P])[k  ];//kt
+			real mfbba = (D.f[DIR_00M])[kb ];//[kb   ];// +  c2over27 ;(D.f[DIR_00M])[kb ];
+			real mfccb = (D.f[DIR_PP0])[k  ];//[kne  ];// +  c1over54 ;(D.f[DIR_PP0])[k  ];//kne
+			real mfaab = (D.f[DIR_MM0])[ksw];//[ksw  ];// +  c1over54 ;(D.f[DIR_MM0])[ksw];
+			real mfcab = (D.f[DIR_PM0])[ks ];//[kse  ];// +  c1over54 ;(D.f[DIR_PM0])[ks ];//kse
+			real mfacb = (D.f[DIR_MP0])[kw ];//[knw  ];// +  c1over54 ;(D.f[DIR_MP0])[kw ];//knw
+			real mfcbc = (D.f[DIR_P0P])[k  ];//[kte  ];// +  c1over54 ;(D.f[DIR_P0P])[k  ];//kte
+			real mfaba = (D.f[DIR_M0M])[kbw];//[kbw  ];// +  c1over54 ;(D.f[DIR_M0M])[kbw];
+			real mfcba = (D.f[DIR_P0M])[kb ];//[kbe  ];// +  c1over54 ;(D.f[DIR_P0M])[kb ];//kbe
+			real mfabc = (D.f[DIR_M0P])[kw ];//[ktw  ];// +  c1over54 ;(D.f[DIR_M0P])[kw ];//ktw
+			real mfbcc = (D.f[DIR_0PP])[k  ];//[ktn  ];// +  c1over54 ;(D.f[DIR_0PP])[k  ];//ktn
+			real mfbaa = (D.f[DIR_0MM])[kbs];//[kbs  ];// +  c1over54 ;(D.f[DIR_0MM])[kbs];
+			real mfbca = (D.f[DIR_0PM])[kb ];//[kbn  ];// +  c1over54 ;(D.f[DIR_0PM])[kb ];//kbn
+			real mfbac = (D.f[DIR_0MP])[ks ];//[kts  ];// +  c1over54 ;(D.f[DIR_0MP])[ks ];//kts
 			real mfbbb = (D.f[DIR_000])[k  ];//[kzero];// +  c8over27 ;(D.f[DIR_000])[k  ];//kzero
-			real mfccc = (D.f[DIR_PPP ])[k  ];//[ktne ];// +  c1over216;(D.f[DIR_PPP ])[k  ];//ktne
-			real mfaac = (D.f[DIR_MMP ])[ksw];//[ktsw ];// +  c1over216;(D.f[DIR_MMP ])[ksw];//ktsw
-			real mfcac = (D.f[DIR_PMP ])[ks ];//[ktse ];// +  c1over216;(D.f[DIR_PMP ])[ks ];//ktse
-			real mfacc = (D.f[DIR_MPP ])[kw ];//[ktnw ];// +  c1over216;(D.f[DIR_MPP ])[kw ];//ktnw
-			real mfcca = (D.f[DIR_PPM ])[kb ];//[kbne ];// +  c1over216;(D.f[DIR_PPM ])[kb ];//kbne
-			real mfaaa = (D.f[DIR_MMM ])[kbsw];//[kbsw ];// +  c1over216;(D.f[DIR_MMM ])[kbsw];
-			real mfcaa = (D.f[DIR_PMM ])[kbs];//[kbse ];// +  c1over216;(D.f[DIR_PMM ])[kbs];//kbse
-			real mfaca = (D.f[DIR_MPM ])[kbw];//[kbnw ];// +  c1over216;(D.f[DIR_MPM ])[kbw];//kbnw
+			real mfccc = (D.f[DIR_PPP])[k  ];//[ktne ];// +  c1over216;(D.f[DIR_PPP])[k  ];//ktne
+			real mfaac = (D.f[DIR_MMP])[ksw];//[ktsw ];// +  c1over216;(D.f[DIR_MMP])[ksw];//ktsw
+			real mfcac = (D.f[DIR_PMP])[ks ];//[ktse ];// +  c1over216;(D.f[DIR_PMP])[ks ];//ktse
+			real mfacc = (D.f[DIR_MPP])[kw ];//[ktnw ];// +  c1over216;(D.f[DIR_MPP])[kw ];//ktnw
+			real mfcca = (D.f[DIR_PPM])[kb ];//[kbne ];// +  c1over216;(D.f[DIR_PPM])[kb ];//kbne
+			real mfaaa = (D.f[DIR_MMM])[kbsw];//[kbsw ];// +  c1over216;(D.f[DIR_MMM])[kbsw];
+			real mfcaa = (D.f[DIR_PMM])[kbs];//[kbse ];// +  c1over216;(D.f[DIR_PMM])[kbs];//kbse
+			real mfaca = (D.f[DIR_MPM])[kbw];//[kbnw ];// +  c1over216;(D.f[DIR_MPM])[kbw];//kbnw
 			////////////////////////////////////////////////////////////////////////////////////
 			//Ship
 			real coord0X = 281.125f;//7.5f;
@@ -3238,7 +3238,7 @@ __global__ void LB_Kernel_Kum_New_SP_27(     real omega,
 														unsigned int* neighborY,
 														unsigned int* neighborZ,
 														real* DDStart,
-														int size_Mat,
+														unsigned long long numberOfLBnodes,
 														bool EvenOrOdd)
 {
 	////////////////////////////////////////////////////////////////////////////////
@@ -3252,7 +3252,7 @@ __global__ void LB_Kernel_Kum_New_SP_27(     real omega,
 	const unsigned k = nx*(ny*z + y) + x;
 	//////////////////////////////////////////////////////////////////////////
 
-	if(k<size_Mat)
+	if(k<numberOfLBnodes)
 	{
 		////////////////////////////////////////////////////////////////////////////////
 		unsigned int BC;
@@ -3263,63 +3263,63 @@ __global__ void LB_Kernel_Kum_New_SP_27(     real omega,
 			Distributions27 D;
 			if (EvenOrOdd==true)
 			{
-				D.f[DIR_P00   ] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_M00   ] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0P0   ] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0M0   ] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00P   ] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00M   ] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_PP0  ] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_MM0  ] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_PM0  ] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_MP0  ] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_P0P  ] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_M0M  ] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_P0M  ] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_M0P  ] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0PP  ] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0MM  ] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0PM  ] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0MP  ] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_PPP ] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_MMP ] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_PMP ] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_MPP ] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_PPM ] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_MMM ] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_PMM ] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_MPM ] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_P00 * numberOfLBnodes];
+				D.f[DIR_M00] = &DDStart[DIR_M00 * numberOfLBnodes];
+				D.f[DIR_0P0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+				D.f[DIR_0M0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+				D.f[DIR_00P] = &DDStart[DIR_00P * numberOfLBnodes];
+				D.f[DIR_00M] = &DDStart[DIR_00M * numberOfLBnodes];
+				D.f[DIR_PP0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+				D.f[DIR_MM0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+				D.f[DIR_PM0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+				D.f[DIR_MP0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+				D.f[DIR_P0P] = &DDStart[DIR_P0P * numberOfLBnodes];
+				D.f[DIR_M0M] = &DDStart[DIR_M0M * numberOfLBnodes];
+				D.f[DIR_P0M] = &DDStart[DIR_P0M * numberOfLBnodes];
+				D.f[DIR_M0P] = &DDStart[DIR_M0P * numberOfLBnodes];
+				D.f[DIR_0PP] = &DDStart[DIR_0PP * numberOfLBnodes];
+				D.f[DIR_0MM] = &DDStart[DIR_0MM * numberOfLBnodes];
+				D.f[DIR_0PM] = &DDStart[DIR_0PM * numberOfLBnodes];
+				D.f[DIR_0MP] = &DDStart[DIR_0MP * numberOfLBnodes];
+				D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+				D.f[DIR_PPP] = &DDStart[DIR_PPP * numberOfLBnodes];
+				D.f[DIR_MMP] = &DDStart[DIR_MMP * numberOfLBnodes];
+				D.f[DIR_PMP] = &DDStart[DIR_PMP * numberOfLBnodes];
+				D.f[DIR_MPP] = &DDStart[DIR_MPP * numberOfLBnodes];
+				D.f[DIR_PPM] = &DDStart[DIR_PPM * numberOfLBnodes];
+				D.f[DIR_MMM] = &DDStart[DIR_MMM * numberOfLBnodes];
+				D.f[DIR_PMM] = &DDStart[DIR_PMM * numberOfLBnodes];
+				D.f[DIR_MPM] = &DDStart[DIR_MPM * numberOfLBnodes];
 			}
 			else
 			{
-				D.f[DIR_M00   ] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_P00   ] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0M0   ] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0P0   ] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00M   ] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00P   ] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_MM0  ] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_PP0  ] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_MP0  ] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_PM0  ] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_M0M  ] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_P0P  ] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_M0P  ] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_P0M  ] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0MM  ] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0PP  ] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0MP  ] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0PM  ] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_MMM ] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_PPM ] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_MPM ] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_PMM ] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_MMP ] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_PPP ] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_MPP ] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_PMP ] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_P00 * numberOfLBnodes];
+				D.f[DIR_P00] = &DDStart[DIR_M00 * numberOfLBnodes];
+				D.f[DIR_0M0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+				D.f[DIR_0P0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+				D.f[DIR_00M] = &DDStart[DIR_00P * numberOfLBnodes];
+				D.f[DIR_00P] = &DDStart[DIR_00M * numberOfLBnodes];
+				D.f[DIR_MM0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+				D.f[DIR_PP0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+				D.f[DIR_MP0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+				D.f[DIR_PM0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+				D.f[DIR_M0M] = &DDStart[DIR_P0P * numberOfLBnodes];
+				D.f[DIR_P0P] = &DDStart[DIR_M0M * numberOfLBnodes];
+				D.f[DIR_M0P] = &DDStart[DIR_P0M * numberOfLBnodes];
+				D.f[DIR_P0M] = &DDStart[DIR_M0P * numberOfLBnodes];
+				D.f[DIR_0MM] = &DDStart[DIR_0PP * numberOfLBnodes];
+				D.f[DIR_0PP] = &DDStart[DIR_0MM * numberOfLBnodes];
+				D.f[DIR_0MP] = &DDStart[DIR_0PM * numberOfLBnodes];
+				D.f[DIR_0PM] = &DDStart[DIR_0MP * numberOfLBnodes];
+				D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+				D.f[DIR_MMM] = &DDStart[DIR_PPP * numberOfLBnodes];
+				D.f[DIR_PPM] = &DDStart[DIR_MMP * numberOfLBnodes];
+				D.f[DIR_MPM] = &DDStart[DIR_PMP * numberOfLBnodes];
+				D.f[DIR_PMM] = &DDStart[DIR_MPP * numberOfLBnodes];
+				D.f[DIR_MMP] = &DDStart[DIR_PPM * numberOfLBnodes];
+				D.f[DIR_PPP] = &DDStart[DIR_MMM * numberOfLBnodes];
+				D.f[DIR_MPP] = &DDStart[DIR_PMM * numberOfLBnodes];
+				D.f[DIR_PMP] = &DDStart[DIR_MPM * numberOfLBnodes];
 			}
 
 			////////////////////////////////////////////////////////////////////////////////
@@ -3380,33 +3380,33 @@ __global__ void LB_Kernel_Kum_New_SP_27(     real omega,
 			//unsigned int ktne = k;
 			//unsigned int kbsw = neighborZ[ksw];
 			//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-			real mfcbb = (D.f[DIR_P00   ])[k  ];//[ke   ];// +  c2over27 ;(D.f[DIR_P00   ])[k  ];//ke
-			real mfabb = (D.f[DIR_M00   ])[kw ];//[kw   ];// +  c2over27 ;(D.f[DIR_M00   ])[kw ];
-			real mfbcb = (D.f[DIR_0P0   ])[k  ];//[kn   ];// +  c2over27 ;(D.f[DIR_0P0   ])[k  ];//kn
-			real mfbab = (D.f[DIR_0M0   ])[ks ];//[ks   ];// +  c2over27 ;(D.f[DIR_0M0   ])[ks ];
-			real mfbbc = (D.f[DIR_00P   ])[k  ];//[kt   ];// +  c2over27 ;(D.f[DIR_00P   ])[k  ];//kt
-			real mfbba = (D.f[DIR_00M   ])[kb ];//[kb   ];// +  c2over27 ;(D.f[DIR_00M   ])[kb ];
-			real mfccb = (D.f[DIR_PP0  ])[k  ];//[kne  ];// +  c1over54 ;(D.f[DIR_PP0  ])[k  ];//kne
-			real mfaab = (D.f[DIR_MM0  ])[ksw];//[ksw  ];// +  c1over54 ;(D.f[DIR_MM0  ])[ksw];
-			real mfcab = (D.f[DIR_PM0  ])[ks ];//[kse  ];// +  c1over54 ;(D.f[DIR_PM0  ])[ks ];//kse
-			real mfacb = (D.f[DIR_MP0  ])[kw ];//[knw  ];// +  c1over54 ;(D.f[DIR_MP0  ])[kw ];//knw
-			real mfcbc = (D.f[DIR_P0P  ])[k  ];//[kte  ];// +  c1over54 ;(D.f[DIR_P0P  ])[k  ];//kte
-			real mfaba = (D.f[DIR_M0M  ])[kbw];//[kbw  ];// +  c1over54 ;(D.f[DIR_M0M  ])[kbw];
-			real mfcba = (D.f[DIR_P0M  ])[kb ];//[kbe  ];// +  c1over54 ;(D.f[DIR_P0M  ])[kb ];//kbe
-			real mfabc = (D.f[DIR_M0P  ])[kw ];//[ktw  ];// +  c1over54 ;(D.f[DIR_M0P  ])[kw ];//ktw
-			real mfbcc = (D.f[DIR_0PP  ])[k  ];//[ktn  ];// +  c1over54 ;(D.f[DIR_0PP  ])[k  ];//ktn
-			real mfbaa = (D.f[DIR_0MM  ])[kbs];//[kbs  ];// +  c1over54 ;(D.f[DIR_0MM  ])[kbs];
-			real mfbca = (D.f[DIR_0PM  ])[kb ];//[kbn  ];// +  c1over54 ;(D.f[DIR_0PM  ])[kb ];//kbn
-			real mfbac = (D.f[DIR_0MP  ])[ks ];//[kts  ];// +  c1over54 ;(D.f[DIR_0MP  ])[ks ];//kts
+			real mfcbb = (D.f[DIR_P00])[k  ];//[ke   ];// +  c2over27 ;(D.f[DIR_P00])[k  ];//ke
+			real mfabb = (D.f[DIR_M00])[kw ];//[kw   ];// +  c2over27 ;(D.f[DIR_M00])[kw ];
+			real mfbcb = (D.f[DIR_0P0])[k  ];//[kn   ];// +  c2over27 ;(D.f[DIR_0P0])[k  ];//kn
+			real mfbab = (D.f[DIR_0M0])[ks ];//[ks   ];// +  c2over27 ;(D.f[DIR_0M0])[ks ];
+			real mfbbc = (D.f[DIR_00P])[k  ];//[kt   ];// +  c2over27 ;(D.f[DIR_00P])[k  ];//kt
+			real mfbba = (D.f[DIR_00M])[kb ];//[kb   ];// +  c2over27 ;(D.f[DIR_00M])[kb ];
+			real mfccb = (D.f[DIR_PP0])[k  ];//[kne  ];// +  c1over54 ;(D.f[DIR_PP0])[k  ];//kne
+			real mfaab = (D.f[DIR_MM0])[ksw];//[ksw  ];// +  c1over54 ;(D.f[DIR_MM0])[ksw];
+			real mfcab = (D.f[DIR_PM0])[ks ];//[kse  ];// +  c1over54 ;(D.f[DIR_PM0])[ks ];//kse
+			real mfacb = (D.f[DIR_MP0])[kw ];//[knw  ];// +  c1over54 ;(D.f[DIR_MP0])[kw ];//knw
+			real mfcbc = (D.f[DIR_P0P])[k  ];//[kte  ];// +  c1over54 ;(D.f[DIR_P0P])[k  ];//kte
+			real mfaba = (D.f[DIR_M0M])[kbw];//[kbw  ];// +  c1over54 ;(D.f[DIR_M0M])[kbw];
+			real mfcba = (D.f[DIR_P0M])[kb ];//[kbe  ];// +  c1over54 ;(D.f[DIR_P0M])[kb ];//kbe
+			real mfabc = (D.f[DIR_M0P])[kw ];//[ktw  ];// +  c1over54 ;(D.f[DIR_M0P])[kw ];//ktw
+			real mfbcc = (D.f[DIR_0PP])[k  ];//[ktn  ];// +  c1over54 ;(D.f[DIR_0PP])[k  ];//ktn
+			real mfbaa = (D.f[DIR_0MM])[kbs];//[kbs  ];// +  c1over54 ;(D.f[DIR_0MM])[kbs];
+			real mfbca = (D.f[DIR_0PM])[kb ];//[kbn  ];// +  c1over54 ;(D.f[DIR_0PM])[kb ];//kbn
+			real mfbac = (D.f[DIR_0MP])[ks ];//[kts  ];// +  c1over54 ;(D.f[DIR_0MP])[ks ];//kts
 			real mfbbb = (D.f[DIR_000])[k  ];//[kzero];// +  c8over27 ;(D.f[DIR_000])[k  ];//kzero
-			real mfccc = (D.f[DIR_PPP ])[k  ];//[ktne ];// +  c1over216;(D.f[DIR_PPP ])[k  ];//ktne
-			real mfaac = (D.f[DIR_MMP ])[ksw];//[ktsw ];// +  c1over216;(D.f[DIR_MMP ])[ksw];//ktsw
-			real mfcac = (D.f[DIR_PMP ])[ks ];//[ktse ];// +  c1over216;(D.f[DIR_PMP ])[ks ];//ktse
-			real mfacc = (D.f[DIR_MPP ])[kw ];//[ktnw ];// +  c1over216;(D.f[DIR_MPP ])[kw ];//ktnw
-			real mfcca = (D.f[DIR_PPM ])[kb ];//[kbne ];// +  c1over216;(D.f[DIR_PPM ])[kb ];//kbne
-			real mfaaa = (D.f[DIR_MMM ])[kbsw];//[kbsw ];// +  c1over216;(D.f[DIR_MMM ])[kbsw];
-			real mfcaa = (D.f[DIR_PMM ])[kbs];//[kbse ];// +  c1over216;(D.f[DIR_PMM ])[kbs];//kbse
-			real mfaca = (D.f[DIR_MPM ])[kbw];//[kbnw ];// +  c1over216;(D.f[DIR_MPM ])[kbw];//kbnw
+			real mfccc = (D.f[DIR_PPP])[k  ];//[ktne ];// +  c1over216;(D.f[DIR_PPP])[k  ];//ktne
+			real mfaac = (D.f[DIR_MMP])[ksw];//[ktsw ];// +  c1over216;(D.f[DIR_MMP])[ksw];//ktsw
+			real mfcac = (D.f[DIR_PMP])[ks ];//[ktse ];// +  c1over216;(D.f[DIR_PMP])[ks ];//ktse
+			real mfacc = (D.f[DIR_MPP])[kw ];//[ktnw ];// +  c1over216;(D.f[DIR_MPP])[kw ];//ktnw
+			real mfcca = (D.f[DIR_PPM])[kb ];//[kbne ];// +  c1over216;(D.f[DIR_PPM])[kb ];//kbne
+			real mfaaa = (D.f[DIR_MMM])[kbsw];//[kbsw ];// +  c1over216;(D.f[DIR_MMM])[kbsw];
+			real mfcaa = (D.f[DIR_PMM])[kbs];//[kbse ];// +  c1over216;(D.f[DIR_PMM])[kbs];//kbse
+			real mfaca = (D.f[DIR_MPM])[kbw];//[kbnw ];// +  c1over216;(D.f[DIR_MPM])[kbw];//kbnw
 			////////////////////////////////////////////////////////////////////////////////////
 			//slow
 			//real oMdrho = one - ((((mfccc+mfaaa) + (mfaca+mfcac)) + ((mfacc+mfcaa) + (mfaac+mfcca))) + 
@@ -4510,7 +4510,7 @@ __global__ void LB_Kernel_Kum_Comp_SP_27(    real omega,
 														unsigned int* neighborY,
 														unsigned int* neighborZ,
 														real* DDStart,
-														int size_Mat,
+														unsigned long long numberOfLBnodes,
 														bool EvenOrOdd)
 {
 	////////////////////////////////////////////////////////////////////////////////
@@ -4524,7 +4524,7 @@ __global__ void LB_Kernel_Kum_Comp_SP_27(    real omega,
 	const unsigned k = nx*(ny*z + y) + x;
 	//////////////////////////////////////////////////////////////////////////
 
-	if(k<size_Mat)
+	if(k<numberOfLBnodes)
 	{
 		////////////////////////////////////////////////////////////////////////////////
 		unsigned int BC;
@@ -4535,63 +4535,63 @@ __global__ void LB_Kernel_Kum_Comp_SP_27(    real omega,
 			Distributions27 D;
 			if (EvenOrOdd==true)
 			{
-				D.f[DIR_P00   ] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_M00   ] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0P0   ] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0M0   ] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00P   ] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00M   ] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_PP0  ] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_MM0  ] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_PM0  ] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_MP0  ] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_P0P  ] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_M0M  ] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_P0M  ] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_M0P  ] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0PP  ] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0MM  ] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0PM  ] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0MP  ] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_PPP ] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_MMP ] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_PMP ] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_MPP ] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_PPM ] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_MMM ] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_PMM ] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_MPM ] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_P00 * numberOfLBnodes];
+				D.f[DIR_M00] = &DDStart[DIR_M00 * numberOfLBnodes];
+				D.f[DIR_0P0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+				D.f[DIR_0M0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+				D.f[DIR_00P] = &DDStart[DIR_00P * numberOfLBnodes];
+				D.f[DIR_00M] = &DDStart[DIR_00M * numberOfLBnodes];
+				D.f[DIR_PP0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+				D.f[DIR_MM0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+				D.f[DIR_PM0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+				D.f[DIR_MP0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+				D.f[DIR_P0P] = &DDStart[DIR_P0P * numberOfLBnodes];
+				D.f[DIR_M0M] = &DDStart[DIR_M0M * numberOfLBnodes];
+				D.f[DIR_P0M] = &DDStart[DIR_P0M * numberOfLBnodes];
+				D.f[DIR_M0P] = &DDStart[DIR_M0P * numberOfLBnodes];
+				D.f[DIR_0PP] = &DDStart[DIR_0PP * numberOfLBnodes];
+				D.f[DIR_0MM] = &DDStart[DIR_0MM * numberOfLBnodes];
+				D.f[DIR_0PM] = &DDStart[DIR_0PM * numberOfLBnodes];
+				D.f[DIR_0MP] = &DDStart[DIR_0MP * numberOfLBnodes];
+				D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+				D.f[DIR_PPP] = &DDStart[DIR_PPP * numberOfLBnodes];
+				D.f[DIR_MMP] = &DDStart[DIR_MMP * numberOfLBnodes];
+				D.f[DIR_PMP] = &DDStart[DIR_PMP * numberOfLBnodes];
+				D.f[DIR_MPP] = &DDStart[DIR_MPP * numberOfLBnodes];
+				D.f[DIR_PPM] = &DDStart[DIR_PPM * numberOfLBnodes];
+				D.f[DIR_MMM] = &DDStart[DIR_MMM * numberOfLBnodes];
+				D.f[DIR_PMM] = &DDStart[DIR_PMM * numberOfLBnodes];
+				D.f[DIR_MPM] = &DDStart[DIR_MPM * numberOfLBnodes];
 			}
 			else
 			{
-				D.f[DIR_M00   ] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_P00   ] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0M0   ] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0P0   ] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00M   ] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00P   ] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_MM0  ] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_PP0  ] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_MP0  ] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_PM0  ] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_M0M  ] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_P0P  ] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_M0P  ] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_P0M  ] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0MM  ] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0PP  ] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0MP  ] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0PM  ] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_MMM ] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_PPM ] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_MPM ] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_PMM ] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_MMP ] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_PPP ] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_MPP ] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_PMP ] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_P00 * numberOfLBnodes];
+				D.f[DIR_P00] = &DDStart[DIR_M00 * numberOfLBnodes];
+				D.f[DIR_0M0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+				D.f[DIR_0P0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+				D.f[DIR_00M] = &DDStart[DIR_00P * numberOfLBnodes];
+				D.f[DIR_00P] = &DDStart[DIR_00M * numberOfLBnodes];
+				D.f[DIR_MM0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+				D.f[DIR_PP0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+				D.f[DIR_MP0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+				D.f[DIR_PM0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+				D.f[DIR_M0M] = &DDStart[DIR_P0P * numberOfLBnodes];
+				D.f[DIR_P0P] = &DDStart[DIR_M0M * numberOfLBnodes];
+				D.f[DIR_M0P] = &DDStart[DIR_P0M * numberOfLBnodes];
+				D.f[DIR_P0M] = &DDStart[DIR_M0P * numberOfLBnodes];
+				D.f[DIR_0MM] = &DDStart[DIR_0PP * numberOfLBnodes];
+				D.f[DIR_0PP] = &DDStart[DIR_0MM * numberOfLBnodes];
+				D.f[DIR_0MP] = &DDStart[DIR_0PM * numberOfLBnodes];
+				D.f[DIR_0PM] = &DDStart[DIR_0MP * numberOfLBnodes];
+				D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+				D.f[DIR_MMM] = &DDStart[DIR_PPP * numberOfLBnodes];
+				D.f[DIR_PPM] = &DDStart[DIR_MMP * numberOfLBnodes];
+				D.f[DIR_MPM] = &DDStart[DIR_PMP * numberOfLBnodes];
+				D.f[DIR_PMM] = &DDStart[DIR_MPP * numberOfLBnodes];
+				D.f[DIR_MMP] = &DDStart[DIR_PPM * numberOfLBnodes];
+				D.f[DIR_PPP] = &DDStart[DIR_MMM * numberOfLBnodes];
+				D.f[DIR_MPP] = &DDStart[DIR_PMM * numberOfLBnodes];
+				D.f[DIR_PMP] = &DDStart[DIR_MPM * numberOfLBnodes];
 			}
 
 			////////////////////////////////////////////////////////////////////////////////
@@ -4624,33 +4624,33 @@ __global__ void LB_Kernel_Kum_Comp_SP_27(    real omega,
 			unsigned int ktne = k;
 			unsigned int kbsw = neighborZ[ksw];
 			//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-			real f_E     = (D.f[DIR_P00   ])[ke   ];// +  c2over27 ;
-			real f_W     = (D.f[DIR_M00   ])[kw   ];// +  c2over27 ;
-			real f_N     = (D.f[DIR_0P0   ])[kn   ];// +  c2over27 ;
-			real f_S     = (D.f[DIR_0M0   ])[ks   ];// +  c2over27 ;
-			real f_T     = (D.f[DIR_00P   ])[kt   ];// +  c2over27 ;
-			real f_B     = (D.f[DIR_00M   ])[kb   ];// +  c2over27 ;
-			real f_NE    = (D.f[DIR_PP0  ])[kne  ];// +  c1over54 ;
-			real f_SW    = (D.f[DIR_MM0  ])[ksw  ];// +  c1over54 ;
-			real f_SE    = (D.f[DIR_PM0  ])[kse  ];// +  c1over54 ;
-			real f_NW    = (D.f[DIR_MP0  ])[knw  ];// +  c1over54 ;
-			real f_TE    = (D.f[DIR_P0P  ])[kte  ];// +  c1over54 ;
-			real f_BW    = (D.f[DIR_M0M  ])[kbw  ];// +  c1over54 ;
-			real f_BE    = (D.f[DIR_P0M  ])[kbe  ];// +  c1over54 ;
-			real f_TW    = (D.f[DIR_M0P  ])[ktw  ];// +  c1over54 ;
-			real f_TN    = (D.f[DIR_0PP  ])[ktn  ];// +  c1over54 ;
-			real f_BS    = (D.f[DIR_0MM  ])[kbs  ];// +  c1over54 ;
-			real f_BN    = (D.f[DIR_0PM  ])[kbn  ];// +  c1over54 ;
-			real f_TS    = (D.f[DIR_0MP  ])[kts  ];// +  c1over54 ;
+			real f_E     = (D.f[DIR_P00])[ke   ];// +  c2over27 ;
+			real f_W     = (D.f[DIR_M00])[kw   ];// +  c2over27 ;
+			real f_N     = (D.f[DIR_0P0])[kn   ];// +  c2over27 ;
+			real f_S     = (D.f[DIR_0M0])[ks   ];// +  c2over27 ;
+			real f_T     = (D.f[DIR_00P])[kt   ];// +  c2over27 ;
+			real f_B     = (D.f[DIR_00M])[kb   ];// +  c2over27 ;
+			real f_NE    = (D.f[DIR_PP0])[kne  ];// +  c1over54 ;
+			real f_SW    = (D.f[DIR_MM0])[ksw  ];// +  c1over54 ;
+			real f_SE    = (D.f[DIR_PM0])[kse  ];// +  c1over54 ;
+			real f_NW    = (D.f[DIR_MP0])[knw  ];// +  c1over54 ;
+			real f_TE    = (D.f[DIR_P0P])[kte  ];// +  c1over54 ;
+			real f_BW    = (D.f[DIR_M0M])[kbw  ];// +  c1over54 ;
+			real f_BE    = (D.f[DIR_P0M])[kbe  ];// +  c1over54 ;
+			real f_TW    = (D.f[DIR_M0P])[ktw  ];// +  c1over54 ;
+			real f_TN    = (D.f[DIR_0PP])[ktn  ];// +  c1over54 ;
+			real f_BS    = (D.f[DIR_0MM])[kbs  ];// +  c1over54 ;
+			real f_BN    = (D.f[DIR_0PM])[kbn  ];// +  c1over54 ;
+			real f_TS    = (D.f[DIR_0MP])[kts  ];// +  c1over54 ;
 			real f_R     = (D.f[DIR_000])[kzero];// +  c8over27 ;
-			real f_TNE   = (D.f[DIR_PPP ])[ktne ];// +  c1over216;
-			real f_TSW   = (D.f[DIR_MMP ])[ktsw ];// +  c1over216;
-			real f_TSE   = (D.f[DIR_PMP ])[ktse ];// +  c1over216;
-			real f_TNW   = (D.f[DIR_MPP ])[ktnw ];// +  c1over216;
-			real f_BNE   = (D.f[DIR_PPM ])[kbne ];// +  c1over216;
-			real f_BSW   = (D.f[DIR_MMM ])[kbsw ];// +  c1over216;
-			real f_BSE   = (D.f[DIR_PMM ])[kbse ];// +  c1over216;
-			real f_BNW   = (D.f[DIR_MPM ])[kbnw ];// +  c1over216;
+			real f_TNE   = (D.f[DIR_PPP])[ktne ];// +  c1over216;
+			real f_TSW   = (D.f[DIR_MMP])[ktsw ];// +  c1over216;
+			real f_TSE   = (D.f[DIR_PMP])[ktse ];// +  c1over216;
+			real f_TNW   = (D.f[DIR_MPP])[ktnw ];// +  c1over216;
+			real f_BNE   = (D.f[DIR_PPM])[kbne ];// +  c1over216;
+			real f_BSW   = (D.f[DIR_MMM])[kbsw ];// +  c1over216;
+			real f_BSE   = (D.f[DIR_PMM])[kbse ];// +  c1over216;
+			real f_BNW   = (D.f[DIR_MPM])[kbnw ];// +  c1over216;
 			////////////////////////////////////////////////////////////////////////////////////
 			real fx = c0o1;
 			real fy = c0o1;
@@ -5451,7 +5451,7 @@ __global__ void LB_Kernel_Kum_New_Comp_SRT_SP_27(
 	unsigned int* neighborY,
 	unsigned int* neighborZ,
 	real* DDStart,
-	int size_Mat,
+	unsigned long long numberOfLBnodes,
 	int level,
 	real* forces,
 	bool EvenOrOdd)
@@ -5467,7 +5467,7 @@ __global__ void LB_Kernel_Kum_New_Comp_SRT_SP_27(
 	const unsigned k = nx*(ny*z + y) + x;
 	//////////////////////////////////////////////////////////////////////////
 
-	if (k<size_Mat)
+	if (k<numberOfLBnodes)
 	{
 		////////////////////////////////////////////////////////////////////////////////
 		unsigned int BC;
@@ -5478,63 +5478,63 @@ __global__ void LB_Kernel_Kum_New_Comp_SRT_SP_27(
 			Distributions27 D;
 			if (EvenOrOdd == true)
 			{
-				D.f[DIR_P00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_M00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_P00 * numberOfLBnodes];
+				D.f[DIR_M00] = &DDStart[DIR_M00 * numberOfLBnodes];
+				D.f[DIR_0P0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+				D.f[DIR_0M0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+				D.f[DIR_00P] = &DDStart[DIR_00P * numberOfLBnodes];
+				D.f[DIR_00M] = &DDStart[DIR_00M * numberOfLBnodes];
+				D.f[DIR_PP0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+				D.f[DIR_MM0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+				D.f[DIR_PM0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+				D.f[DIR_MP0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+				D.f[DIR_P0P] = &DDStart[DIR_P0P * numberOfLBnodes];
+				D.f[DIR_M0M] = &DDStart[DIR_M0M * numberOfLBnodes];
+				D.f[DIR_P0M] = &DDStart[DIR_P0M * numberOfLBnodes];
+				D.f[DIR_M0P] = &DDStart[DIR_M0P * numberOfLBnodes];
+				D.f[DIR_0PP] = &DDStart[DIR_0PP * numberOfLBnodes];
+				D.f[DIR_0MM] = &DDStart[DIR_0MM * numberOfLBnodes];
+				D.f[DIR_0PM] = &DDStart[DIR_0PM * numberOfLBnodes];
+				D.f[DIR_0MP] = &DDStart[DIR_0MP * numberOfLBnodes];
+				D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+				D.f[DIR_PPP] = &DDStart[DIR_PPP * numberOfLBnodes];
+				D.f[DIR_MMP] = &DDStart[DIR_MMP * numberOfLBnodes];
+				D.f[DIR_PMP] = &DDStart[DIR_PMP * numberOfLBnodes];
+				D.f[DIR_MPP] = &DDStart[DIR_MPP * numberOfLBnodes];
+				D.f[DIR_PPM] = &DDStart[DIR_PPM * numberOfLBnodes];
+				D.f[DIR_MMM] = &DDStart[DIR_MMM * numberOfLBnodes];
+				D.f[DIR_PMM] = &DDStart[DIR_PMM * numberOfLBnodes];
+				D.f[DIR_MPM] = &DDStart[DIR_MPM * numberOfLBnodes];
 			}
 			else
 			{
-				D.f[DIR_M00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_P00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_P00 * numberOfLBnodes];
+				D.f[DIR_P00] = &DDStart[DIR_M00 * numberOfLBnodes];
+				D.f[DIR_0M0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+				D.f[DIR_0P0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+				D.f[DIR_00M] = &DDStart[DIR_00P * numberOfLBnodes];
+				D.f[DIR_00P] = &DDStart[DIR_00M * numberOfLBnodes];
+				D.f[DIR_MM0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+				D.f[DIR_PP0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+				D.f[DIR_MP0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+				D.f[DIR_PM0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+				D.f[DIR_M0M] = &DDStart[DIR_P0P * numberOfLBnodes];
+				D.f[DIR_P0P] = &DDStart[DIR_M0M * numberOfLBnodes];
+				D.f[DIR_M0P] = &DDStart[DIR_P0M * numberOfLBnodes];
+				D.f[DIR_P0M] = &DDStart[DIR_M0P * numberOfLBnodes];
+				D.f[DIR_0MM] = &DDStart[DIR_0PP * numberOfLBnodes];
+				D.f[DIR_0PP] = &DDStart[DIR_0MM * numberOfLBnodes];
+				D.f[DIR_0MP] = &DDStart[DIR_0PM * numberOfLBnodes];
+				D.f[DIR_0PM] = &DDStart[DIR_0MP * numberOfLBnodes];
+				D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+				D.f[DIR_MMM] = &DDStart[DIR_PPP * numberOfLBnodes];
+				D.f[DIR_PPM] = &DDStart[DIR_MMP * numberOfLBnodes];
+				D.f[DIR_MPM] = &DDStart[DIR_PMP * numberOfLBnodes];
+				D.f[DIR_PMM] = &DDStart[DIR_MPP * numberOfLBnodes];
+				D.f[DIR_MMP] = &DDStart[DIR_PPM * numberOfLBnodes];
+				D.f[DIR_PPP] = &DDStart[DIR_MMM * numberOfLBnodes];
+				D.f[DIR_MPP] = &DDStart[DIR_PMM * numberOfLBnodes];
+				D.f[DIR_PMP] = &DDStart[DIR_MPM * numberOfLBnodes];
 			}
 
 			////////////////////////////////////////////////////////////////////////////////
@@ -5568,33 +5568,33 @@ __global__ void LB_Kernel_Kum_New_Comp_SRT_SP_27(
 			unsigned int kbsw = neighborZ[ksw];
 
 			//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-			real mfcbb = (D.f[DIR_P00   ])[k   ];
-			real mfabb = (D.f[DIR_M00   ])[kw  ];
-			real mfbcb = (D.f[DIR_0P0   ])[k   ];
-			real mfbab = (D.f[DIR_0M0   ])[ks  ];
-			real mfbbc = (D.f[DIR_00P   ])[k   ];
-			real mfbba = (D.f[DIR_00M   ])[kb  ];
-			real mfccb = (D.f[DIR_PP0  ])[k   ];
-			real mfaab = (D.f[DIR_MM0  ])[ksw ];
-			real mfcab = (D.f[DIR_PM0  ])[ks  ];
-			real mfacb = (D.f[DIR_MP0  ])[kw  ];
-			real mfcbc = (D.f[DIR_P0P  ])[k   ];
-			real mfaba = (D.f[DIR_M0M  ])[kbw ];
-			real mfcba = (D.f[DIR_P0M  ])[kb  ];
-			real mfabc = (D.f[DIR_M0P  ])[kw  ];
-			real mfbcc = (D.f[DIR_0PP  ])[k   ];
-			real mfbaa = (D.f[DIR_0MM  ])[kbs ];
-			real mfbca = (D.f[DIR_0PM  ])[kb  ];
-			real mfbac = (D.f[DIR_0MP  ])[ks  ];
+			real mfcbb = (D.f[DIR_P00])[k   ];
+			real mfabb = (D.f[DIR_M00])[kw  ];
+			real mfbcb = (D.f[DIR_0P0])[k   ];
+			real mfbab = (D.f[DIR_0M0])[ks  ];
+			real mfbbc = (D.f[DIR_00P])[k   ];
+			real mfbba = (D.f[DIR_00M])[kb  ];
+			real mfccb = (D.f[DIR_PP0])[k   ];
+			real mfaab = (D.f[DIR_MM0])[ksw ];
+			real mfcab = (D.f[DIR_PM0])[ks  ];
+			real mfacb = (D.f[DIR_MP0])[kw  ];
+			real mfcbc = (D.f[DIR_P0P])[k   ];
+			real mfaba = (D.f[DIR_M0M])[kbw ];
+			real mfcba = (D.f[DIR_P0M])[kb  ];
+			real mfabc = (D.f[DIR_M0P])[kw  ];
+			real mfbcc = (D.f[DIR_0PP])[k   ];
+			real mfbaa = (D.f[DIR_0MM])[kbs ];
+			real mfbca = (D.f[DIR_0PM])[kb  ];
+			real mfbac = (D.f[DIR_0MP])[ks  ];
 			real mfbbb = (D.f[DIR_000])[k   ];
-			real mfccc = (D.f[DIR_PPP ])[k   ];
-			real mfaac = (D.f[DIR_MMP ])[ksw ];
-			real mfcac = (D.f[DIR_PMP ])[ks  ];
-			real mfacc = (D.f[DIR_MPP ])[kw  ];
-			real mfcca = (D.f[DIR_PPM ])[kb  ];
-			real mfaaa = (D.f[DIR_MMM ])[kbsw];
-			real mfcaa = (D.f[DIR_PMM ])[kbs ];
-			real mfaca = (D.f[DIR_MPM ])[kbw ];
+			real mfccc = (D.f[DIR_PPP])[k   ];
+			real mfaac = (D.f[DIR_MMP])[ksw ];
+			real mfcac = (D.f[DIR_PMP])[ks  ];
+			real mfacc = (D.f[DIR_MPP])[kw  ];
+			real mfcca = (D.f[DIR_PPM])[kb  ];
+			real mfaaa = (D.f[DIR_MMM])[kbsw];
+			real mfcaa = (D.f[DIR_PMM])[kbs ];
+			real mfaca = (D.f[DIR_MPM])[kbw ];
 			////////////////////////////////////////////////////////////////////////////////////
 			real drho = ((((mfccc + mfaaa) + (mfaca + mfcac)) + ((mfacc + mfcaa) + (mfaac + mfcca))) +
 				(((mfbac + mfbca) + (mfbaa + mfbcc)) + ((mfabc + mfcba) + (mfaba + mfcbc)) + ((mfacb + mfcab) + (mfaab + mfccb))) +
@@ -6349,33 +6349,33 @@ __global__ void LB_Kernel_Kum_New_Comp_SRT_SP_27(
 					((mfabb + mfcbb) + (mfbab + mfbcb) + (mfbba + mfbbc))) + mfbbb;
 			mfbbb += drho - drhoPost;
 			////////////////////////////////////////////////////////////////////////////////////
-			(D.f[DIR_P00   ])[k   ] = mfabb;                                                                   
-			(D.f[DIR_M00   ])[kw  ] = mfcbb;                                                                 
-			(D.f[DIR_0P0   ])[k   ] = mfbab;
-			(D.f[DIR_0M0   ])[ks  ] = mfbcb;
-			(D.f[DIR_00P   ])[k   ] = mfbba;
-			(D.f[DIR_00M   ])[kb  ] = mfbbc;
-			(D.f[DIR_PP0  ])[k   ] = mfaab;
-			(D.f[DIR_MM0  ])[ksw ] = mfccb;
-			(D.f[DIR_PM0  ])[ks  ] = mfacb;
-			(D.f[DIR_MP0  ])[kw  ] = mfcab;
-			(D.f[DIR_P0P  ])[k   ] = mfaba;
-			(D.f[DIR_M0M  ])[kbw ] = mfcbc;
-			(D.f[DIR_P0M  ])[kb  ] = mfabc;
-			(D.f[DIR_M0P  ])[kw  ] = mfcba;
-			(D.f[DIR_0PP  ])[k   ] = mfbaa;
-			(D.f[DIR_0MM  ])[kbs ] = mfbcc;
-			(D.f[DIR_0PM  ])[kb  ] = mfbac;
-			(D.f[DIR_0MP  ])[ks  ] = mfbca;
+			(D.f[DIR_P00])[k   ] = mfabb;                                                                   
+			(D.f[DIR_M00])[kw  ] = mfcbb;                                                                 
+			(D.f[DIR_0P0])[k   ] = mfbab;
+			(D.f[DIR_0M0])[ks  ] = mfbcb;
+			(D.f[DIR_00P])[k   ] = mfbba;
+			(D.f[DIR_00M])[kb  ] = mfbbc;
+			(D.f[DIR_PP0])[k   ] = mfaab;
+			(D.f[DIR_MM0])[ksw ] = mfccb;
+			(D.f[DIR_PM0])[ks  ] = mfacb;
+			(D.f[DIR_MP0])[kw  ] = mfcab;
+			(D.f[DIR_P0P])[k   ] = mfaba;
+			(D.f[DIR_M0M])[kbw ] = mfcbc;
+			(D.f[DIR_P0M])[kb  ] = mfabc;
+			(D.f[DIR_M0P])[kw  ] = mfcba;
+			(D.f[DIR_0PP])[k   ] = mfbaa;
+			(D.f[DIR_0MM])[kbs ] = mfbcc;
+			(D.f[DIR_0PM])[kb  ] = mfbac;
+			(D.f[DIR_0MP])[ks  ] = mfbca;
 			(D.f[DIR_000])[k   ] = mfbbb;
-			(D.f[DIR_PPP ])[k   ] = mfaaa;
-			(D.f[DIR_PMP ])[ks  ] = mfaca;
-			(D.f[DIR_PPM ])[kb  ] = mfaac;
-			(D.f[DIR_PMM ])[kbs ] = mfacc;
-			(D.f[DIR_MPP ])[kw  ] = mfcaa;
-			(D.f[DIR_MMP ])[ksw ] = mfcca;
-			(D.f[DIR_MPM ])[kbw ] = mfcac;
-			(D.f[DIR_MMM ])[kbsw] = mfccc;
+			(D.f[DIR_PPP])[k   ] = mfaaa;
+			(D.f[DIR_PMP])[ks  ] = mfaca;
+			(D.f[DIR_PPM])[kb  ] = mfaac;
+			(D.f[DIR_PMM])[kbs ] = mfacc;
+			(D.f[DIR_MPP])[kw  ] = mfcaa;
+			(D.f[DIR_MMP])[ksw ] = mfcca;
+			(D.f[DIR_MPM])[kbw ] = mfcac;
+			(D.f[DIR_MMM])[kbsw] = mfccc;
 		}
 	}
 }
diff --git a/src/gpu/VirtualFluids_GPU/GPU/Cumulant27chim.cu b/src/gpu/VirtualFluids_GPU/GPU/Cumulant27chim.cu
index 97c1aff4d26cb85deaf1dd0d145245f28affc2e3..3706e5f929b50a2a72c107a982525ec3172eb144 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/Cumulant27chim.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/Cumulant27chim.cu
@@ -51,7 +51,7 @@ __global__ void Cumulant_One_preconditioned_errorDiffusion_chim_Comp_SP_27(
     unsigned int* neighborY,
     unsigned int* neighborZ,
     real* DDStart,
-    int size_Mat,
+    unsigned long long numberOfLBnodes,
     int level,
     real* forces,
     bool EvenOrOdd)
@@ -67,7 +67,7 @@ __global__ void Cumulant_One_preconditioned_errorDiffusion_chim_Comp_SP_27(
     const unsigned k = nx*(ny*z + y) + x;
     //////////////////////////////////////////////////////////////////////////
 
-    if (k<size_Mat)
+    if (k<numberOfLBnodes)
     {
         ////////////////////////////////////////////////////////////////////////////////
         unsigned int BC;
@@ -78,63 +78,63 @@ __global__ void Cumulant_One_preconditioned_errorDiffusion_chim_Comp_SP_27(
             Distributions27 D;
             if (EvenOrOdd == true)
             {
-                D.f[DIR_P00] = &DDStart[DIR_P00   *size_Mat];
-                D.f[DIR_M00] = &DDStart[DIR_M00   *size_Mat];
-                D.f[DIR_0P0] = &DDStart[DIR_0P0   *size_Mat];
-                D.f[DIR_0M0] = &DDStart[DIR_0M0   *size_Mat];
-                D.f[DIR_00P] = &DDStart[DIR_00P   *size_Mat];
-                D.f[DIR_00M] = &DDStart[DIR_00M   *size_Mat];
-                D.f[DIR_PP0] = &DDStart[DIR_PP0  *size_Mat];
-                D.f[DIR_MM0] = &DDStart[DIR_MM0  *size_Mat];
-                D.f[DIR_PM0] = &DDStart[DIR_PM0  *size_Mat];
-                D.f[DIR_MP0] = &DDStart[DIR_MP0  *size_Mat];
-                D.f[DIR_P0P] = &DDStart[DIR_P0P  *size_Mat];
-                D.f[DIR_M0M] = &DDStart[DIR_M0M  *size_Mat];
-                D.f[DIR_P0M] = &DDStart[DIR_P0M  *size_Mat];
-                D.f[DIR_M0P] = &DDStart[DIR_M0P  *size_Mat];
-                D.f[DIR_0PP] = &DDStart[DIR_0PP  *size_Mat];
-                D.f[DIR_0MM] = &DDStart[DIR_0MM  *size_Mat];
-                D.f[DIR_0PM] = &DDStart[DIR_0PM  *size_Mat];
-                D.f[DIR_0MP] = &DDStart[DIR_0MP  *size_Mat];
-                D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-                D.f[DIR_PPP] = &DDStart[DIR_PPP *size_Mat];
-                D.f[DIR_MMP] = &DDStart[DIR_MMP *size_Mat];
-                D.f[DIR_PMP] = &DDStart[DIR_PMP *size_Mat];
-                D.f[DIR_MPP] = &DDStart[DIR_MPP *size_Mat];
-                D.f[DIR_PPM] = &DDStart[DIR_PPM *size_Mat];
-                D.f[DIR_MMM] = &DDStart[DIR_MMM *size_Mat];
-                D.f[DIR_PMM] = &DDStart[DIR_PMM *size_Mat];
-                D.f[DIR_MPM] = &DDStart[DIR_MPM *size_Mat];
+                D.f[DIR_P00] = &DDStart[DIR_P00 * numberOfLBnodes];
+                D.f[DIR_M00] = &DDStart[DIR_M00 * numberOfLBnodes];
+                D.f[DIR_0P0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+                D.f[DIR_0M0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+                D.f[DIR_00P] = &DDStart[DIR_00P * numberOfLBnodes];
+                D.f[DIR_00M] = &DDStart[DIR_00M * numberOfLBnodes];
+                D.f[DIR_PP0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+                D.f[DIR_MM0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+                D.f[DIR_PM0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+                D.f[DIR_MP0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+                D.f[DIR_P0P] = &DDStart[DIR_P0P * numberOfLBnodes];
+                D.f[DIR_M0M] = &DDStart[DIR_M0M * numberOfLBnodes];
+                D.f[DIR_P0M] = &DDStart[DIR_P0M * numberOfLBnodes];
+                D.f[DIR_M0P] = &DDStart[DIR_M0P * numberOfLBnodes];
+                D.f[DIR_0PP] = &DDStart[DIR_0PP * numberOfLBnodes];
+                D.f[DIR_0MM] = &DDStart[DIR_0MM * numberOfLBnodes];
+                D.f[DIR_0PM] = &DDStart[DIR_0PM * numberOfLBnodes];
+                D.f[DIR_0MP] = &DDStart[DIR_0MP * numberOfLBnodes];
+                D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+                D.f[DIR_PPP] = &DDStart[DIR_PPP * numberOfLBnodes];
+                D.f[DIR_MMP] = &DDStart[DIR_MMP * numberOfLBnodes];
+                D.f[DIR_PMP] = &DDStart[DIR_PMP * numberOfLBnodes];
+                D.f[DIR_MPP] = &DDStart[DIR_MPP * numberOfLBnodes];
+                D.f[DIR_PPM] = &DDStart[DIR_PPM * numberOfLBnodes];
+                D.f[DIR_MMM] = &DDStart[DIR_MMM * numberOfLBnodes];
+                D.f[DIR_PMM] = &DDStart[DIR_PMM * numberOfLBnodes];
+                D.f[DIR_MPM] = &DDStart[DIR_MPM * numberOfLBnodes];
             }
             else
             {
-                D.f[DIR_M00] = &DDStart[DIR_P00   *size_Mat];
-                D.f[DIR_P00] = &DDStart[DIR_M00   *size_Mat];
-                D.f[DIR_0M0] = &DDStart[DIR_0P0   *size_Mat];
-                D.f[DIR_0P0] = &DDStart[DIR_0M0   *size_Mat];
-                D.f[DIR_00M] = &DDStart[DIR_00P   *size_Mat];
-                D.f[DIR_00P] = &DDStart[DIR_00M   *size_Mat];
-                D.f[DIR_MM0] = &DDStart[DIR_PP0  *size_Mat];
-                D.f[DIR_PP0] = &DDStart[DIR_MM0  *size_Mat];
-                D.f[DIR_MP0] = &DDStart[DIR_PM0  *size_Mat];
-                D.f[DIR_PM0] = &DDStart[DIR_MP0  *size_Mat];
-                D.f[DIR_M0M] = &DDStart[DIR_P0P  *size_Mat];
-                D.f[DIR_P0P] = &DDStart[DIR_M0M  *size_Mat];
-                D.f[DIR_M0P] = &DDStart[DIR_P0M  *size_Mat];
-                D.f[DIR_P0M] = &DDStart[DIR_M0P  *size_Mat];
-                D.f[DIR_0MM] = &DDStart[DIR_0PP  *size_Mat];
-                D.f[DIR_0PP] = &DDStart[DIR_0MM  *size_Mat];
-                D.f[DIR_0MP] = &DDStart[DIR_0PM  *size_Mat];
-                D.f[DIR_0PM] = &DDStart[DIR_0MP  *size_Mat];
-                D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-                D.f[DIR_MMM] = &DDStart[DIR_PPP *size_Mat];
-                D.f[DIR_PPM] = &DDStart[DIR_MMP *size_Mat];
-                D.f[DIR_MPM] = &DDStart[DIR_PMP *size_Mat];
-                D.f[DIR_PMM] = &DDStart[DIR_MPP *size_Mat];
-                D.f[DIR_MMP] = &DDStart[DIR_PPM *size_Mat];
-                D.f[DIR_PPP] = &DDStart[DIR_MMM *size_Mat];
-                D.f[DIR_MPP] = &DDStart[DIR_PMM *size_Mat];
-                D.f[DIR_PMP] = &DDStart[DIR_MPM *size_Mat];
+                D.f[DIR_M00] = &DDStart[DIR_P00 * numberOfLBnodes];
+                D.f[DIR_P00] = &DDStart[DIR_M00 * numberOfLBnodes];
+                D.f[DIR_0M0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+                D.f[DIR_0P0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+                D.f[DIR_00M] = &DDStart[DIR_00P * numberOfLBnodes];
+                D.f[DIR_00P] = &DDStart[DIR_00M * numberOfLBnodes];
+                D.f[DIR_MM0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+                D.f[DIR_PP0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+                D.f[DIR_MP0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+                D.f[DIR_PM0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+                D.f[DIR_M0M] = &DDStart[DIR_P0P * numberOfLBnodes];
+                D.f[DIR_P0P] = &DDStart[DIR_M0M * numberOfLBnodes];
+                D.f[DIR_M0P] = &DDStart[DIR_P0M * numberOfLBnodes];
+                D.f[DIR_P0M] = &DDStart[DIR_M0P * numberOfLBnodes];
+                D.f[DIR_0MM] = &DDStart[DIR_0PP * numberOfLBnodes];
+                D.f[DIR_0PP] = &DDStart[DIR_0MM * numberOfLBnodes];
+                D.f[DIR_0MP] = &DDStart[DIR_0PM * numberOfLBnodes];
+                D.f[DIR_0PM] = &DDStart[DIR_0MP * numberOfLBnodes];
+                D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+                D.f[DIR_MMM] = &DDStart[DIR_PPP * numberOfLBnodes];
+                D.f[DIR_PPM] = &DDStart[DIR_MMP * numberOfLBnodes];
+                D.f[DIR_MPM] = &DDStart[DIR_PMP * numberOfLBnodes];
+                D.f[DIR_PMM] = &DDStart[DIR_MPP * numberOfLBnodes];
+                D.f[DIR_MMP] = &DDStart[DIR_PPM * numberOfLBnodes];
+                D.f[DIR_PPP] = &DDStart[DIR_MMM * numberOfLBnodes];
+                D.f[DIR_MPP] = &DDStart[DIR_PMM * numberOfLBnodes];
+                D.f[DIR_PMP] = &DDStart[DIR_MPM * numberOfLBnodes];
             }
 
             ////////////////////////////////////////////////////////////////////////////////
@@ -170,33 +170,33 @@ __global__ void Cumulant_One_preconditioned_errorDiffusion_chim_Comp_SP_27(
 
 
             //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-            real mfcbb = (D.f[DIR_P00])[k];//[ke   ];// +  c2over27 ;(D.f[DIR_P00   ])[k  ];//ke
-            real mfabb = (D.f[DIR_M00])[kw];//[kw   ];// +  c2over27 ;(D.f[DIR_M00   ])[kw ];
-            real mfbcb = (D.f[DIR_0P0])[k];//[kn   ];// +  c2over27 ;(D.f[DIR_0P0   ])[k  ];//kn
-            real mfbab = (D.f[DIR_0M0])[ks];//[ks   ];// +  c2over27 ;(D.f[DIR_0M0   ])[ks ];
-            real mfbbc = (D.f[DIR_00P])[k];//[kt   ];// +  c2over27 ;(D.f[DIR_00P   ])[k  ];//kt
-            real mfbba = (D.f[DIR_00M])[kb];//[kb   ];// +  c2over27 ;(D.f[DIR_00M   ])[kb ];
-            real mfccb = (D.f[DIR_PP0])[k];//[kne  ];// +  c1over54 ;(D.f[DIR_PP0  ])[k  ];//kne
-            real mfaab = (D.f[DIR_MM0])[ksw];//[ksw  ];// +  c1over54 ;(D.f[DIR_MM0  ])[ksw];
-            real mfcab = (D.f[DIR_PM0])[ks];//[kse  ];// +  c1over54 ;(D.f[DIR_PM0  ])[ks ];//kse
-            real mfacb = (D.f[DIR_MP0])[kw];//[knw  ];// +  c1over54 ;(D.f[DIR_MP0  ])[kw ];//knw
-            real mfcbc = (D.f[DIR_P0P])[k];//[kte  ];// +  c1over54 ;(D.f[DIR_P0P  ])[k  ];//kte
-            real mfaba = (D.f[DIR_M0M])[kbw];//[kbw  ];// +  c1over54 ;(D.f[DIR_M0M  ])[kbw];
-            real mfcba = (D.f[DIR_P0M])[kb];//[kbe  ];// +  c1over54 ;(D.f[DIR_P0M  ])[kb ];//kbe
-            real mfabc = (D.f[DIR_M0P])[kw];//[ktw  ];// +  c1over54 ;(D.f[DIR_M0P  ])[kw ];//ktw
-            real mfbcc = (D.f[DIR_0PP])[k];//[ktn  ];// +  c1over54 ;(D.f[DIR_0PP  ])[k  ];//ktn
-            real mfbaa = (D.f[DIR_0MM])[kbs];//[kbs  ];// +  c1over54 ;(D.f[DIR_0MM  ])[kbs];
-            real mfbca = (D.f[DIR_0PM])[kb];//[kbn  ];// +  c1over54 ;(D.f[DIR_0PM  ])[kb ];//kbn
-            real mfbac = (D.f[DIR_0MP])[ks];//[kts  ];// +  c1over54 ;(D.f[DIR_0MP  ])[ks ];//kts
+            real mfcbb = (D.f[DIR_P00])[k];//[ke   ];// +  c2over27 ;(D.f[DIR_P00])[k  ];//ke
+            real mfabb = (D.f[DIR_M00])[kw];//[kw   ];// +  c2over27 ;(D.f[DIR_M00])[kw ];
+            real mfbcb = (D.f[DIR_0P0])[k];//[kn   ];// +  c2over27 ;(D.f[DIR_0P0])[k  ];//kn
+            real mfbab = (D.f[DIR_0M0])[ks];//[ks   ];// +  c2over27 ;(D.f[DIR_0M0])[ks ];
+            real mfbbc = (D.f[DIR_00P])[k];//[kt   ];// +  c2over27 ;(D.f[DIR_00P])[k  ];//kt
+            real mfbba = (D.f[DIR_00M])[kb];//[kb   ];// +  c2over27 ;(D.f[DIR_00M])[kb ];
+            real mfccb = (D.f[DIR_PP0])[k];//[kne  ];// +  c1over54 ;(D.f[DIR_PP0])[k  ];//kne
+            real mfaab = (D.f[DIR_MM0])[ksw];//[ksw  ];// +  c1over54 ;(D.f[DIR_MM0])[ksw];
+            real mfcab = (D.f[DIR_PM0])[ks];//[kse  ];// +  c1over54 ;(D.f[DIR_PM0])[ks ];//kse
+            real mfacb = (D.f[DIR_MP0])[kw];//[knw  ];// +  c1over54 ;(D.f[DIR_MP0])[kw ];//knw
+            real mfcbc = (D.f[DIR_P0P])[k];//[kte  ];// +  c1over54 ;(D.f[DIR_P0P])[k  ];//kte
+            real mfaba = (D.f[DIR_M0M])[kbw];//[kbw  ];// +  c1over54 ;(D.f[DIR_M0M])[kbw];
+            real mfcba = (D.f[DIR_P0M])[kb];//[kbe  ];// +  c1over54 ;(D.f[DIR_P0M])[kb ];//kbe
+            real mfabc = (D.f[DIR_M0P])[kw];//[ktw  ];// +  c1over54 ;(D.f[DIR_M0P])[kw ];//ktw
+            real mfbcc = (D.f[DIR_0PP])[k];//[ktn  ];// +  c1over54 ;(D.f[DIR_0PP])[k  ];//ktn
+            real mfbaa = (D.f[DIR_0MM])[kbs];//[kbs  ];// +  c1over54 ;(D.f[DIR_0MM])[kbs];
+            real mfbca = (D.f[DIR_0PM])[kb];//[kbn  ];// +  c1over54 ;(D.f[DIR_0PM])[kb ];//kbn
+            real mfbac = (D.f[DIR_0MP])[ks];//[kts  ];// +  c1over54 ;(D.f[DIR_0MP])[ks ];//kts
             real mfbbb = (D.f[DIR_000])[k];//[kzero];// +  c8over27 ;(D.f[DIR_000])[k  ];//kzero
-            real mfccc = (D.f[DIR_PPP])[k];//[ktne ];// +  c1over216;(D.f[DIR_PPP ])[k  ];//ktne
-            real mfaac = (D.f[DIR_MMP])[ksw];//[ktsw ];// +  c1over216;(D.f[DIR_MMP ])[ksw];//ktsw
-            real mfcac = (D.f[DIR_PMP])[ks];//[ktse ];// +  c1over216;(D.f[DIR_PMP ])[ks ];//ktse
-            real mfacc = (D.f[DIR_MPP])[kw];//[ktnw ];// +  c1over216;(D.f[DIR_MPP ])[kw ];//ktnw
-            real mfcca = (D.f[DIR_PPM])[kb];//[kbne ];// +  c1over216;(D.f[DIR_PPM ])[kb ];//kbne
-            real mfaaa = (D.f[DIR_MMM])[kbsw];//[kbsw ];// +  c1over216;(D.f[DIR_MMM ])[kbsw];
-            real mfcaa = (D.f[DIR_PMM])[kbs];//[kbse ];// +  c1over216;(D.f[DIR_PMM ])[kbs];//kbse
-            real mfaca = (D.f[DIR_MPM])[kbw];//[kbnw ];// +  c1over216;(D.f[DIR_MPM ])[kbw];//kbnw
+            real mfccc = (D.f[DIR_PPP])[k];//[ktne ];// +  c1over216;(D.f[DIR_PPP])[k  ];//ktne
+            real mfaac = (D.f[DIR_MMP])[ksw];//[ktsw ];// +  c1over216;(D.f[DIR_MMP])[ksw];//ktsw
+            real mfcac = (D.f[DIR_PMP])[ks];//[ktse ];// +  c1over216;(D.f[DIR_PMP])[ks ];//ktse
+            real mfacc = (D.f[DIR_MPP])[kw];//[ktnw ];// +  c1over216;(D.f[DIR_MPP])[kw ];//ktnw
+            real mfcca = (D.f[DIR_PPM])[kb];//[kbne ];// +  c1over216;(D.f[DIR_PPM])[kb ];//kbne
+            real mfaaa = (D.f[DIR_MMM])[kbsw];//[kbsw ];// +  c1over216;(D.f[DIR_MMM])[kbsw];
+            real mfcaa = (D.f[DIR_PMM])[kbs];//[kbse ];// +  c1over216;(D.f[DIR_PMM])[kbs];//kbse
+            real mfaca = (D.f[DIR_MPM])[kbw];//[kbnw ];// +  c1over216;(D.f[DIR_MPM])[kbw];//kbnw
                                                ////////////////////////////////////////////////////////////////////////////////////
             real drho = ((((mfccc + mfaaa) + (mfaca + mfcac)) + ((mfacc + mfcaa) + (mfaac + mfcca))) +
                 (((mfbac + mfbca) + (mfbaa + mfbcc)) + ((mfabc + mfcba) + (mfaba + mfcbc)) + ((mfacb + mfcab) + (mfaab + mfccb))) +
@@ -966,7 +966,7 @@ __global__ void Cumulant_One_preconditioned_chim_Comp_SP_27(
     unsigned int* neighborY,
     unsigned int* neighborZ,
     real* DDStart,
-    int size_Mat,
+    unsigned long long numberOfLBnodes,
     int level,
     real* forces,
     bool EvenOrOdd)
@@ -982,7 +982,7 @@ __global__ void Cumulant_One_preconditioned_chim_Comp_SP_27(
     const unsigned k = nx*(ny*z + y) + x;
     //////////////////////////////////////////////////////////////////////////
 
-    if (k<size_Mat)
+    if (k<numberOfLBnodes)
     {
         ////////////////////////////////////////////////////////////////////////////////
         unsigned int BC;
@@ -993,63 +993,63 @@ __global__ void Cumulant_One_preconditioned_chim_Comp_SP_27(
             Distributions27 D;
             if (EvenOrOdd == true)
             {
-                D.f[DIR_P00] = &DDStart[DIR_P00   *size_Mat];
-                D.f[DIR_M00] = &DDStart[DIR_M00   *size_Mat];
-                D.f[DIR_0P0] = &DDStart[DIR_0P0   *size_Mat];
-                D.f[DIR_0M0] = &DDStart[DIR_0M0   *size_Mat];
-                D.f[DIR_00P] = &DDStart[DIR_00P   *size_Mat];
-                D.f[DIR_00M] = &DDStart[DIR_00M   *size_Mat];
-                D.f[DIR_PP0] = &DDStart[DIR_PP0  *size_Mat];
-                D.f[DIR_MM0] = &DDStart[DIR_MM0  *size_Mat];
-                D.f[DIR_PM0] = &DDStart[DIR_PM0  *size_Mat];
-                D.f[DIR_MP0] = &DDStart[DIR_MP0  *size_Mat];
-                D.f[DIR_P0P] = &DDStart[DIR_P0P  *size_Mat];
-                D.f[DIR_M0M] = &DDStart[DIR_M0M  *size_Mat];
-                D.f[DIR_P0M] = &DDStart[DIR_P0M  *size_Mat];
-                D.f[DIR_M0P] = &DDStart[DIR_M0P  *size_Mat];
-                D.f[DIR_0PP] = &DDStart[DIR_0PP  *size_Mat];
-                D.f[DIR_0MM] = &DDStart[DIR_0MM  *size_Mat];
-                D.f[DIR_0PM] = &DDStart[DIR_0PM  *size_Mat];
-                D.f[DIR_0MP] = &DDStart[DIR_0MP  *size_Mat];
-                D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-                D.f[DIR_PPP] = &DDStart[DIR_PPP *size_Mat];
-                D.f[DIR_MMP] = &DDStart[DIR_MMP *size_Mat];
-                D.f[DIR_PMP] = &DDStart[DIR_PMP *size_Mat];
-                D.f[DIR_MPP] = &DDStart[DIR_MPP *size_Mat];
-                D.f[DIR_PPM] = &DDStart[DIR_PPM *size_Mat];
-                D.f[DIR_MMM] = &DDStart[DIR_MMM *size_Mat];
-                D.f[DIR_PMM] = &DDStart[DIR_PMM *size_Mat];
-                D.f[DIR_MPM] = &DDStart[DIR_MPM *size_Mat];
+                D.f[DIR_P00] = &DDStart[DIR_P00 * numberOfLBnodes];
+                D.f[DIR_M00] = &DDStart[DIR_M00 * numberOfLBnodes];
+                D.f[DIR_0P0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+                D.f[DIR_0M0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+                D.f[DIR_00P] = &DDStart[DIR_00P * numberOfLBnodes];
+                D.f[DIR_00M] = &DDStart[DIR_00M * numberOfLBnodes];
+                D.f[DIR_PP0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+                D.f[DIR_MM0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+                D.f[DIR_PM0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+                D.f[DIR_MP0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+                D.f[DIR_P0P] = &DDStart[DIR_P0P * numberOfLBnodes];
+                D.f[DIR_M0M] = &DDStart[DIR_M0M * numberOfLBnodes];
+                D.f[DIR_P0M] = &DDStart[DIR_P0M * numberOfLBnodes];
+                D.f[DIR_M0P] = &DDStart[DIR_M0P * numberOfLBnodes];
+                D.f[DIR_0PP] = &DDStart[DIR_0PP * numberOfLBnodes];
+                D.f[DIR_0MM] = &DDStart[DIR_0MM * numberOfLBnodes];
+                D.f[DIR_0PM] = &DDStart[DIR_0PM * numberOfLBnodes];
+                D.f[DIR_0MP] = &DDStart[DIR_0MP * numberOfLBnodes];
+                D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+                D.f[DIR_PPP] = &DDStart[DIR_PPP * numberOfLBnodes];
+                D.f[DIR_MMP] = &DDStart[DIR_MMP * numberOfLBnodes];
+                D.f[DIR_PMP] = &DDStart[DIR_PMP * numberOfLBnodes];
+                D.f[DIR_MPP] = &DDStart[DIR_MPP * numberOfLBnodes];
+                D.f[DIR_PPM] = &DDStart[DIR_PPM * numberOfLBnodes];
+                D.f[DIR_MMM] = &DDStart[DIR_MMM * numberOfLBnodes];
+                D.f[DIR_PMM] = &DDStart[DIR_PMM * numberOfLBnodes];
+                D.f[DIR_MPM] = &DDStart[DIR_MPM * numberOfLBnodes];
             }
             else
             {
-                D.f[DIR_M00] = &DDStart[DIR_P00   *size_Mat];
-                D.f[DIR_P00] = &DDStart[DIR_M00   *size_Mat];
-                D.f[DIR_0M0] = &DDStart[DIR_0P0   *size_Mat];
-                D.f[DIR_0P0] = &DDStart[DIR_0M0   *size_Mat];
-                D.f[DIR_00M] = &DDStart[DIR_00P   *size_Mat];
-                D.f[DIR_00P] = &DDStart[DIR_00M   *size_Mat];
-                D.f[DIR_MM0] = &DDStart[DIR_PP0  *size_Mat];
-                D.f[DIR_PP0] = &DDStart[DIR_MM0  *size_Mat];
-                D.f[DIR_MP0] = &DDStart[DIR_PM0  *size_Mat];
-                D.f[DIR_PM0] = &DDStart[DIR_MP0  *size_Mat];
-                D.f[DIR_M0M] = &DDStart[DIR_P0P  *size_Mat];
-                D.f[DIR_P0P] = &DDStart[DIR_M0M  *size_Mat];
-                D.f[DIR_M0P] = &DDStart[DIR_P0M  *size_Mat];
-                D.f[DIR_P0M] = &DDStart[DIR_M0P  *size_Mat];
-                D.f[DIR_0MM] = &DDStart[DIR_0PP  *size_Mat];
-                D.f[DIR_0PP] = &DDStart[DIR_0MM  *size_Mat];
-                D.f[DIR_0MP] = &DDStart[DIR_0PM  *size_Mat];
-                D.f[DIR_0PM] = &DDStart[DIR_0MP  *size_Mat];
-                D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-                D.f[DIR_MMM] = &DDStart[DIR_PPP *size_Mat];
-                D.f[DIR_PPM] = &DDStart[DIR_MMP *size_Mat];
-                D.f[DIR_MPM] = &DDStart[DIR_PMP *size_Mat];
-                D.f[DIR_PMM] = &DDStart[DIR_MPP *size_Mat];
-                D.f[DIR_MMP] = &DDStart[DIR_PPM *size_Mat];
-                D.f[DIR_PPP] = &DDStart[DIR_MMM *size_Mat];
-                D.f[DIR_MPP] = &DDStart[DIR_PMM *size_Mat];
-                D.f[DIR_PMP] = &DDStart[DIR_MPM *size_Mat];
+                D.f[DIR_M00] = &DDStart[DIR_P00 * numberOfLBnodes];
+                D.f[DIR_P00] = &DDStart[DIR_M00 * numberOfLBnodes];
+                D.f[DIR_0M0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+                D.f[DIR_0P0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+                D.f[DIR_00M] = &DDStart[DIR_00P * numberOfLBnodes];
+                D.f[DIR_00P] = &DDStart[DIR_00M * numberOfLBnodes];
+                D.f[DIR_MM0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+                D.f[DIR_PP0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+                D.f[DIR_MP0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+                D.f[DIR_PM0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+                D.f[DIR_M0M] = &DDStart[DIR_P0P * numberOfLBnodes];
+                D.f[DIR_P0P] = &DDStart[DIR_M0M * numberOfLBnodes];
+                D.f[DIR_M0P] = &DDStart[DIR_P0M * numberOfLBnodes];
+                D.f[DIR_P0M] = &DDStart[DIR_M0P * numberOfLBnodes];
+                D.f[DIR_0MM] = &DDStart[DIR_0PP * numberOfLBnodes];
+                D.f[DIR_0PP] = &DDStart[DIR_0MM * numberOfLBnodes];
+                D.f[DIR_0MP] = &DDStart[DIR_0PM * numberOfLBnodes];
+                D.f[DIR_0PM] = &DDStart[DIR_0MP * numberOfLBnodes];
+                D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+                D.f[DIR_MMM] = &DDStart[DIR_PPP * numberOfLBnodes];
+                D.f[DIR_PPM] = &DDStart[DIR_MMP * numberOfLBnodes];
+                D.f[DIR_MPM] = &DDStart[DIR_PMP * numberOfLBnodes];
+                D.f[DIR_PMM] = &DDStart[DIR_MPP * numberOfLBnodes];
+                D.f[DIR_MMP] = &DDStart[DIR_PPM * numberOfLBnodes];
+                D.f[DIR_PPP] = &DDStart[DIR_MMM * numberOfLBnodes];
+                D.f[DIR_MPP] = &DDStart[DIR_PMM * numberOfLBnodes];
+                D.f[DIR_PMP] = &DDStart[DIR_MPM * numberOfLBnodes];
             }
 
             ////////////////////////////////////////////////////////////////////////////////
@@ -1085,33 +1085,33 @@ __global__ void Cumulant_One_preconditioned_chim_Comp_SP_27(
 
 
             //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-            real mfcbb = (D.f[DIR_P00])[k];//[ke   ];// +  c2over27 ;(D.f[DIR_P00   ])[k  ];//ke
-            real mfabb = (D.f[DIR_M00])[kw];//[kw   ];// +  c2over27 ;(D.f[DIR_M00   ])[kw ];
-            real mfbcb = (D.f[DIR_0P0])[k];//[kn   ];// +  c2over27 ;(D.f[DIR_0P0   ])[k  ];//kn
-            real mfbab = (D.f[DIR_0M0])[ks];//[ks   ];// +  c2over27 ;(D.f[DIR_0M0   ])[ks ];
-            real mfbbc = (D.f[DIR_00P])[k];//[kt   ];// +  c2over27 ;(D.f[DIR_00P   ])[k  ];//kt
-            real mfbba = (D.f[DIR_00M])[kb];//[kb   ];// +  c2over27 ;(D.f[DIR_00M   ])[kb ];
-            real mfccb = (D.f[DIR_PP0])[k];//[kne  ];// +  c1over54 ;(D.f[DIR_PP0  ])[k  ];//kne
-            real mfaab = (D.f[DIR_MM0])[ksw];//[ksw  ];// +  c1over54 ;(D.f[DIR_MM0  ])[ksw];
-            real mfcab = (D.f[DIR_PM0])[ks];//[kse  ];// +  c1over54 ;(D.f[DIR_PM0  ])[ks ];//kse
-            real mfacb = (D.f[DIR_MP0])[kw];//[knw  ];// +  c1over54 ;(D.f[DIR_MP0  ])[kw ];//knw
-            real mfcbc = (D.f[DIR_P0P])[k];//[kte  ];// +  c1over54 ;(D.f[DIR_P0P  ])[k  ];//kte
-            real mfaba = (D.f[DIR_M0M])[kbw];//[kbw  ];// +  c1over54 ;(D.f[DIR_M0M  ])[kbw];
-            real mfcba = (D.f[DIR_P0M])[kb];//[kbe  ];// +  c1over54 ;(D.f[DIR_P0M  ])[kb ];//kbe
-            real mfabc = (D.f[DIR_M0P])[kw];//[ktw  ];// +  c1over54 ;(D.f[DIR_M0P  ])[kw ];//ktw
-            real mfbcc = (D.f[DIR_0PP])[k];//[ktn  ];// +  c1over54 ;(D.f[DIR_0PP  ])[k  ];//ktn
-            real mfbaa = (D.f[DIR_0MM])[kbs];//[kbs  ];// +  c1over54 ;(D.f[DIR_0MM  ])[kbs];
-            real mfbca = (D.f[DIR_0PM])[kb];//[kbn  ];// +  c1over54 ;(D.f[DIR_0PM  ])[kb ];//kbn
-            real mfbac = (D.f[DIR_0MP])[ks];//[kts  ];// +  c1over54 ;(D.f[DIR_0MP  ])[ks ];//kts
+            real mfcbb = (D.f[DIR_P00])[k];//[ke   ];// +  c2over27 ;(D.f[DIR_P00])[k  ];//ke
+            real mfabb = (D.f[DIR_M00])[kw];//[kw   ];// +  c2over27 ;(D.f[DIR_M00])[kw ];
+            real mfbcb = (D.f[DIR_0P0])[k];//[kn   ];// +  c2over27 ;(D.f[DIR_0P0])[k  ];//kn
+            real mfbab = (D.f[DIR_0M0])[ks];//[ks   ];// +  c2over27 ;(D.f[DIR_0M0])[ks ];
+            real mfbbc = (D.f[DIR_00P])[k];//[kt   ];// +  c2over27 ;(D.f[DIR_00P])[k  ];//kt
+            real mfbba = (D.f[DIR_00M])[kb];//[kb   ];// +  c2over27 ;(D.f[DIR_00M])[kb ];
+            real mfccb = (D.f[DIR_PP0])[k];//[kne  ];// +  c1over54 ;(D.f[DIR_PP0])[k  ];//kne
+            real mfaab = (D.f[DIR_MM0])[ksw];//[ksw  ];// +  c1over54 ;(D.f[DIR_MM0])[ksw];
+            real mfcab = (D.f[DIR_PM0])[ks];//[kse  ];// +  c1over54 ;(D.f[DIR_PM0])[ks ];//kse
+            real mfacb = (D.f[DIR_MP0])[kw];//[knw  ];// +  c1over54 ;(D.f[DIR_MP0])[kw ];//knw
+            real mfcbc = (D.f[DIR_P0P])[k];//[kte  ];// +  c1over54 ;(D.f[DIR_P0P])[k  ];//kte
+            real mfaba = (D.f[DIR_M0M])[kbw];//[kbw  ];// +  c1over54 ;(D.f[DIR_M0M])[kbw];
+            real mfcba = (D.f[DIR_P0M])[kb];//[kbe  ];// +  c1over54 ;(D.f[DIR_P0M])[kb ];//kbe
+            real mfabc = (D.f[DIR_M0P])[kw];//[ktw  ];// +  c1over54 ;(D.f[DIR_M0P])[kw ];//ktw
+            real mfbcc = (D.f[DIR_0PP])[k];//[ktn  ];// +  c1over54 ;(D.f[DIR_0PP])[k  ];//ktn
+            real mfbaa = (D.f[DIR_0MM])[kbs];//[kbs  ];// +  c1over54 ;(D.f[DIR_0MM])[kbs];
+            real mfbca = (D.f[DIR_0PM])[kb];//[kbn  ];// +  c1over54 ;(D.f[DIR_0PM])[kb ];//kbn
+            real mfbac = (D.f[DIR_0MP])[ks];//[kts  ];// +  c1over54 ;(D.f[DIR_0MP])[ks ];//kts
             real mfbbb = (D.f[DIR_000])[k];//[kzero];// +  c8over27 ;(D.f[DIR_000])[k  ];//kzero
-            real mfccc = (D.f[DIR_PPP])[k];//[ktne ];// +  c1over216;(D.f[DIR_PPP ])[k  ];//ktne
-            real mfaac = (D.f[DIR_MMP])[ksw];//[ktsw ];// +  c1over216;(D.f[DIR_MMP ])[ksw];//ktsw
-            real mfcac = (D.f[DIR_PMP])[ks];//[ktse ];// +  c1over216;(D.f[DIR_PMP ])[ks ];//ktse
-            real mfacc = (D.f[DIR_MPP])[kw];//[ktnw ];// +  c1over216;(D.f[DIR_MPP ])[kw ];//ktnw
-            real mfcca = (D.f[DIR_PPM])[kb];//[kbne ];// +  c1over216;(D.f[DIR_PPM ])[kb ];//kbne
-            real mfaaa = (D.f[DIR_MMM])[kbsw];//[kbsw ];// +  c1over216;(D.f[DIR_MMM ])[kbsw];
-            real mfcaa = (D.f[DIR_PMM])[kbs];//[kbse ];// +  c1over216;(D.f[DIR_PMM ])[kbs];//kbse
-            real mfaca = (D.f[DIR_MPM])[kbw];//[kbnw ];// +  c1over216;(D.f[DIR_MPM ])[kbw];//kbnw
+            real mfccc = (D.f[DIR_PPP])[k];//[ktne ];// +  c1over216;(D.f[DIR_PPP])[k  ];//ktne
+            real mfaac = (D.f[DIR_MMP])[ksw];//[ktsw ];// +  c1over216;(D.f[DIR_MMP])[ksw];//ktsw
+            real mfcac = (D.f[DIR_PMP])[ks];//[ktse ];// +  c1over216;(D.f[DIR_PMP])[ks ];//ktse
+            real mfacc = (D.f[DIR_MPP])[kw];//[ktnw ];// +  c1over216;(D.f[DIR_MPP])[kw ];//ktnw
+            real mfcca = (D.f[DIR_PPM])[kb];//[kbne ];// +  c1over216;(D.f[DIR_PPM])[kb ];//kbne
+            real mfaaa = (D.f[DIR_MMM])[kbsw];//[kbsw ];// +  c1over216;(D.f[DIR_MMM])[kbsw];
+            real mfcaa = (D.f[DIR_PMM])[kbs];//[kbse ];// +  c1over216;(D.f[DIR_PMM])[kbs];//kbse
+            real mfaca = (D.f[DIR_MPM])[kbw];//[kbnw ];// +  c1over216;(D.f[DIR_MPM])[kbw];//kbnw
                                                ////////////////////////////////////////////////////////////////////////////////////
             real drho = ((((mfccc + mfaaa) + (mfaca + mfcac)) + ((mfacc + mfcaa) + (mfaac + mfcca))) +
                 (((mfbac + mfbca) + (mfbaa + mfbcc)) + ((mfabc + mfcba) + (mfaba + mfcbc)) + ((mfacb + mfcab) + (mfaab + mfccb))) +
@@ -1762,7 +1762,7 @@ __global__ void Cumulant_One_chim_Comp_SP_27(
     unsigned int* neighborY,
     unsigned int* neighborZ,
     real* DDStart,
-    int size_Mat,
+    unsigned long long numberOfLBnodes,
     int level,
     real* forces,
     bool EvenOrOdd)
@@ -1778,7 +1778,7 @@ __global__ void Cumulant_One_chim_Comp_SP_27(
     const unsigned k = nx*(ny*z + y) + x;
     //////////////////////////////////////////////////////////////////////////
 
-    if (k<size_Mat)
+    if (k<numberOfLBnodes)
     {
         ////////////////////////////////////////////////////////////////////////////////
         unsigned int BC;
@@ -1789,63 +1789,63 @@ __global__ void Cumulant_One_chim_Comp_SP_27(
             Distributions27 D;
             if (EvenOrOdd == true)
             {
-                D.f[DIR_P00   ] = &DDStart[DIR_P00   *size_Mat];
-                D.f[DIR_M00   ] = &DDStart[DIR_M00   *size_Mat];
-                D.f[DIR_0P0   ] = &DDStart[DIR_0P0   *size_Mat];
-                D.f[DIR_0M0   ] = &DDStart[DIR_0M0   *size_Mat];
-                D.f[DIR_00P   ] = &DDStart[DIR_00P   *size_Mat];
-                D.f[DIR_00M   ] = &DDStart[DIR_00M   *size_Mat];
-                D.f[DIR_PP0  ] = &DDStart[DIR_PP0  *size_Mat];
-                D.f[DIR_MM0  ] = &DDStart[DIR_MM0  *size_Mat];
-                D.f[DIR_PM0  ] = &DDStart[DIR_PM0  *size_Mat];
-                D.f[DIR_MP0  ] = &DDStart[DIR_MP0  *size_Mat];
-                D.f[DIR_P0P  ] = &DDStart[DIR_P0P  *size_Mat];
-                D.f[DIR_M0M  ] = &DDStart[DIR_M0M  *size_Mat];
-                D.f[DIR_P0M  ] = &DDStart[DIR_P0M  *size_Mat];
-                D.f[DIR_M0P  ] = &DDStart[DIR_M0P  *size_Mat];
-                D.f[DIR_0PP  ] = &DDStart[DIR_0PP  *size_Mat];
-                D.f[DIR_0MM  ] = &DDStart[DIR_0MM  *size_Mat];
-                D.f[DIR_0PM  ] = &DDStart[DIR_0PM  *size_Mat];
-                D.f[DIR_0MP  ] = &DDStart[DIR_0MP  *size_Mat];
-                D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-                D.f[DIR_PPP ] = &DDStart[DIR_PPP *size_Mat];
-                D.f[DIR_MMP ] = &DDStart[DIR_MMP *size_Mat];
-                D.f[DIR_PMP ] = &DDStart[DIR_PMP *size_Mat];
-                D.f[DIR_MPP ] = &DDStart[DIR_MPP *size_Mat];
-                D.f[DIR_PPM ] = &DDStart[DIR_PPM *size_Mat];
-                D.f[DIR_MMM ] = &DDStart[DIR_MMM *size_Mat];
-                D.f[DIR_PMM ] = &DDStart[DIR_PMM *size_Mat];
-                D.f[DIR_MPM ] = &DDStart[DIR_MPM *size_Mat];
+                D.f[DIR_P00] = &DDStart[DIR_P00 * numberOfLBnodes];
+                D.f[DIR_M00] = &DDStart[DIR_M00 * numberOfLBnodes];
+                D.f[DIR_0P0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+                D.f[DIR_0M0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+                D.f[DIR_00P] = &DDStart[DIR_00P * numberOfLBnodes];
+                D.f[DIR_00M] = &DDStart[DIR_00M * numberOfLBnodes];
+                D.f[DIR_PP0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+                D.f[DIR_MM0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+                D.f[DIR_PM0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+                D.f[DIR_MP0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+                D.f[DIR_P0P] = &DDStart[DIR_P0P * numberOfLBnodes];
+                D.f[DIR_M0M] = &DDStart[DIR_M0M * numberOfLBnodes];
+                D.f[DIR_P0M] = &DDStart[DIR_P0M * numberOfLBnodes];
+                D.f[DIR_M0P] = &DDStart[DIR_M0P * numberOfLBnodes];
+                D.f[DIR_0PP] = &DDStart[DIR_0PP * numberOfLBnodes];
+                D.f[DIR_0MM] = &DDStart[DIR_0MM * numberOfLBnodes];
+                D.f[DIR_0PM] = &DDStart[DIR_0PM * numberOfLBnodes];
+                D.f[DIR_0MP] = &DDStart[DIR_0MP * numberOfLBnodes];
+                D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+                D.f[DIR_PPP] = &DDStart[DIR_PPP * numberOfLBnodes];
+                D.f[DIR_MMP] = &DDStart[DIR_MMP * numberOfLBnodes];
+                D.f[DIR_PMP] = &DDStart[DIR_PMP * numberOfLBnodes];
+                D.f[DIR_MPP] = &DDStart[DIR_MPP * numberOfLBnodes];
+                D.f[DIR_PPM] = &DDStart[DIR_PPM * numberOfLBnodes];
+                D.f[DIR_MMM] = &DDStart[DIR_MMM * numberOfLBnodes];
+                D.f[DIR_PMM] = &DDStart[DIR_PMM * numberOfLBnodes];
+                D.f[DIR_MPM] = &DDStart[DIR_MPM * numberOfLBnodes];
             }
             else
             {
-                D.f[DIR_M00   ] = &DDStart[DIR_P00   *size_Mat];
-                D.f[DIR_P00   ] = &DDStart[DIR_M00   *size_Mat];
-                D.f[DIR_0M0   ] = &DDStart[DIR_0P0   *size_Mat];
-                D.f[DIR_0P0   ] = &DDStart[DIR_0M0   *size_Mat];
-                D.f[DIR_00M   ] = &DDStart[DIR_00P   *size_Mat];
-                D.f[DIR_00P   ] = &DDStart[DIR_00M   *size_Mat];
-                D.f[DIR_MM0  ] = &DDStart[DIR_PP0  *size_Mat];
-                D.f[DIR_PP0  ] = &DDStart[DIR_MM0  *size_Mat];
-                D.f[DIR_MP0  ] = &DDStart[DIR_PM0  *size_Mat];
-                D.f[DIR_PM0  ] = &DDStart[DIR_MP0  *size_Mat];
-                D.f[DIR_M0M  ] = &DDStart[DIR_P0P  *size_Mat];
-                D.f[DIR_P0P  ] = &DDStart[DIR_M0M  *size_Mat];
-                D.f[DIR_M0P  ] = &DDStart[DIR_P0M  *size_Mat];
-                D.f[DIR_P0M  ] = &DDStart[DIR_M0P  *size_Mat];
-                D.f[DIR_0MM  ] = &DDStart[DIR_0PP  *size_Mat];
-                D.f[DIR_0PP  ] = &DDStart[DIR_0MM  *size_Mat];
-                D.f[DIR_0MP  ] = &DDStart[DIR_0PM  *size_Mat];
-                D.f[DIR_0PM  ] = &DDStart[DIR_0MP  *size_Mat];
-                D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-                D.f[DIR_MMM ] = &DDStart[DIR_PPP *size_Mat];
-                D.f[DIR_PPM ] = &DDStart[DIR_MMP *size_Mat];
-                D.f[DIR_MPM ] = &DDStart[DIR_PMP *size_Mat];
-                D.f[DIR_PMM ] = &DDStart[DIR_MPP *size_Mat];
-                D.f[DIR_MMP ] = &DDStart[DIR_PPM *size_Mat];
-                D.f[DIR_PPP ] = &DDStart[DIR_MMM *size_Mat];
-                D.f[DIR_MPP ] = &DDStart[DIR_PMM *size_Mat];
-                D.f[DIR_PMP ] = &DDStart[DIR_MPM *size_Mat];
+                D.f[DIR_M00] = &DDStart[DIR_P00 * numberOfLBnodes];
+                D.f[DIR_P00] = &DDStart[DIR_M00 * numberOfLBnodes];
+                D.f[DIR_0M0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+                D.f[DIR_0P0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+                D.f[DIR_00M] = &DDStart[DIR_00P * numberOfLBnodes];
+                D.f[DIR_00P] = &DDStart[DIR_00M * numberOfLBnodes];
+                D.f[DIR_MM0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+                D.f[DIR_PP0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+                D.f[DIR_MP0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+                D.f[DIR_PM0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+                D.f[DIR_M0M] = &DDStart[DIR_P0P * numberOfLBnodes];
+                D.f[DIR_P0P] = &DDStart[DIR_M0M * numberOfLBnodes];
+                D.f[DIR_M0P] = &DDStart[DIR_P0M * numberOfLBnodes];
+                D.f[DIR_P0M] = &DDStart[DIR_M0P * numberOfLBnodes];
+                D.f[DIR_0MM] = &DDStart[DIR_0PP * numberOfLBnodes];
+                D.f[DIR_0PP] = &DDStart[DIR_0MM * numberOfLBnodes];
+                D.f[DIR_0MP] = &DDStart[DIR_0PM * numberOfLBnodes];
+                D.f[DIR_0PM] = &DDStart[DIR_0MP * numberOfLBnodes];
+                D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+                D.f[DIR_MMM] = &DDStart[DIR_PPP * numberOfLBnodes];
+                D.f[DIR_PPM] = &DDStart[DIR_MMP * numberOfLBnodes];
+                D.f[DIR_MPM] = &DDStart[DIR_PMP * numberOfLBnodes];
+                D.f[DIR_PMM] = &DDStart[DIR_MPP * numberOfLBnodes];
+                D.f[DIR_MMP] = &DDStart[DIR_PPM * numberOfLBnodes];
+                D.f[DIR_PPP] = &DDStart[DIR_MMM * numberOfLBnodes];
+                D.f[DIR_MPP] = &DDStart[DIR_PMM * numberOfLBnodes];
+                D.f[DIR_PMP] = &DDStart[DIR_MPM * numberOfLBnodes];
             }
             ////////////////////////////////////////////////////////////////////////////////
             //index
@@ -1857,33 +1857,33 @@ __global__ void Cumulant_One_chim_Comp_SP_27(
             unsigned int kbs = neighborZ[ks];
             unsigned int kbsw = neighborZ[ksw];
             ////////////////////////////////////////////////////////////////////////////////////
-            real mfcbb = (D.f[DIR_P00   ])[k   ];
-            real mfabb = (D.f[DIR_M00   ])[kw  ];
-            real mfbcb = (D.f[DIR_0P0   ])[k   ];
-            real mfbab = (D.f[DIR_0M0   ])[ks  ];
-            real mfbbc = (D.f[DIR_00P   ])[k   ];
-            real mfbba = (D.f[DIR_00M   ])[kb  ];
-            real mfccb = (D.f[DIR_PP0  ])[k   ];
-            real mfaab = (D.f[DIR_MM0  ])[ksw ];
-            real mfcab = (D.f[DIR_PM0  ])[ks  ];
-            real mfacb = (D.f[DIR_MP0  ])[kw  ];
-            real mfcbc = (D.f[DIR_P0P  ])[k   ];
-            real mfaba = (D.f[DIR_M0M  ])[kbw ];
-            real mfcba = (D.f[DIR_P0M  ])[kb  ];
-            real mfabc = (D.f[DIR_M0P  ])[kw  ];
-            real mfbcc = (D.f[DIR_0PP  ])[k   ];
-            real mfbaa = (D.f[DIR_0MM  ])[kbs ];
-            real mfbca = (D.f[DIR_0PM  ])[kb  ];
-            real mfbac = (D.f[DIR_0MP  ])[ks  ];
+            real mfcbb = (D.f[DIR_P00])[k   ];
+            real mfabb = (D.f[DIR_M00])[kw  ];
+            real mfbcb = (D.f[DIR_0P0])[k   ];
+            real mfbab = (D.f[DIR_0M0])[ks  ];
+            real mfbbc = (D.f[DIR_00P])[k   ];
+            real mfbba = (D.f[DIR_00M])[kb  ];
+            real mfccb = (D.f[DIR_PP0])[k   ];
+            real mfaab = (D.f[DIR_MM0])[ksw ];
+            real mfcab = (D.f[DIR_PM0])[ks  ];
+            real mfacb = (D.f[DIR_MP0])[kw  ];
+            real mfcbc = (D.f[DIR_P0P])[k   ];
+            real mfaba = (D.f[DIR_M0M])[kbw ];
+            real mfcba = (D.f[DIR_P0M])[kb  ];
+            real mfabc = (D.f[DIR_M0P])[kw  ];
+            real mfbcc = (D.f[DIR_0PP])[k   ];
+            real mfbaa = (D.f[DIR_0MM])[kbs ];
+            real mfbca = (D.f[DIR_0PM])[kb  ];
+            real mfbac = (D.f[DIR_0MP])[ks  ];
             real mfbbb = (D.f[DIR_000])[k   ];
-            real mfccc = (D.f[DIR_PPP ])[k   ];
-            real mfaac = (D.f[DIR_MMP ])[ksw ];
-            real mfcac = (D.f[DIR_PMP ])[ks  ];
-            real mfacc = (D.f[DIR_MPP ])[kw  ];
-            real mfcca = (D.f[DIR_PPM ])[kb  ];
-            real mfaaa = (D.f[DIR_MMM ])[kbsw];
-            real mfcaa = (D.f[DIR_PMM ])[kbs ];
-            real mfaca = (D.f[DIR_MPM ])[kbw ];
+            real mfccc = (D.f[DIR_PPP])[k   ];
+            real mfaac = (D.f[DIR_MMP])[ksw ];
+            real mfcac = (D.f[DIR_PMP])[ks  ];
+            real mfacc = (D.f[DIR_MPP])[kw  ];
+            real mfcca = (D.f[DIR_PPM])[kb  ];
+            real mfaaa = (D.f[DIR_MMM])[kbsw];
+            real mfcaa = (D.f[DIR_PMM])[kbs ];
+            real mfaca = (D.f[DIR_MPM])[kbw ];
             ////////////////////////////////////////////////////////////////////////////////////
             real drho = ((((mfccc + mfaaa) + (mfaca + mfcac)) + ((mfacc + mfcaa) + (mfaac + mfcca))) +
                 (((mfbac + mfbca) + (mfbaa + mfbcc)) + ((mfabc + mfcba) + (mfaba + mfcbc)) + ((mfacb + mfcab) + (mfaab + mfccb))) +
@@ -2204,33 +2204,33 @@ __global__ void Cumulant_One_chim_Comp_SP_27(
                     ((mfabb + mfcbb) + (mfbab + mfbcb) + (mfbba + mfbbc))) + mfbbb;
             mfbbb += drho - drhoPost;
             ////////////////////////////////////////////////////////////////////////////////////
-            (D.f[DIR_P00   ])[k   ] = mfabb;                                                                   
-            (D.f[DIR_M00   ])[kw  ] = mfcbb;                                                                 
-            (D.f[DIR_0P0   ])[k   ] = mfbab;
-            (D.f[DIR_0M0   ])[ks  ] = mfbcb;
-            (D.f[DIR_00P   ])[k   ] = mfbba;
-            (D.f[DIR_00M   ])[kb  ] = mfbbc;
-            (D.f[DIR_PP0  ])[k   ] = mfaab;
-            (D.f[DIR_MM0  ])[ksw ] = mfccb;
-            (D.f[DIR_PM0  ])[ks  ] = mfacb;
-            (D.f[DIR_MP0  ])[kw  ] = mfcab;
-            (D.f[DIR_P0P  ])[k   ] = mfaba;
-            (D.f[DIR_M0M  ])[kbw ] = mfcbc;
-            (D.f[DIR_P0M  ])[kb  ] = mfabc;
-            (D.f[DIR_M0P  ])[kw  ] = mfcba;
-            (D.f[DIR_0PP  ])[k   ] = mfbaa;
-            (D.f[DIR_0MM  ])[kbs ] = mfbcc;
-            (D.f[DIR_0PM  ])[kb  ] = mfbac;
-            (D.f[DIR_0MP  ])[ks  ] = mfbca;
+            (D.f[DIR_P00])[k   ] = mfabb;                                                                   
+            (D.f[DIR_M00])[kw  ] = mfcbb;                                                                 
+            (D.f[DIR_0P0])[k   ] = mfbab;
+            (D.f[DIR_0M0])[ks  ] = mfbcb;
+            (D.f[DIR_00P])[k   ] = mfbba;
+            (D.f[DIR_00M])[kb  ] = mfbbc;
+            (D.f[DIR_PP0])[k   ] = mfaab;
+            (D.f[DIR_MM0])[ksw ] = mfccb;
+            (D.f[DIR_PM0])[ks  ] = mfacb;
+            (D.f[DIR_MP0])[kw  ] = mfcab;
+            (D.f[DIR_P0P])[k   ] = mfaba;
+            (D.f[DIR_M0M])[kbw ] = mfcbc;
+            (D.f[DIR_P0M])[kb  ] = mfabc;
+            (D.f[DIR_M0P])[kw  ] = mfcba;
+            (D.f[DIR_0PP])[k   ] = mfbaa;
+            (D.f[DIR_0MM])[kbs ] = mfbcc;
+            (D.f[DIR_0PM])[kb  ] = mfbac;
+            (D.f[DIR_0MP])[ks  ] = mfbca;
             (D.f[DIR_000])[k   ] = mfbbb;
-            (D.f[DIR_PPP ])[k   ] = mfaaa;
-            (D.f[DIR_PMP ])[ks  ] = mfaca;
-            (D.f[DIR_PPM ])[kb  ] = mfaac;
-            (D.f[DIR_PMM ])[kbs ] = mfacc;
-            (D.f[DIR_MPP ])[kw  ] = mfcaa;
-            (D.f[DIR_MMP ])[ksw ] = mfcca;
-            (D.f[DIR_MPM ])[kbw ] = mfcac;
-            (D.f[DIR_MMM ])[kbsw] = mfccc;
+            (D.f[DIR_PPP])[k   ] = mfaaa;
+            (D.f[DIR_PMP])[ks  ] = mfaca;
+            (D.f[DIR_PPM])[kb  ] = mfaac;
+            (D.f[DIR_PMM])[kbs ] = mfacc;
+            (D.f[DIR_MPP])[kw  ] = mfcaa;
+            (D.f[DIR_MMP])[ksw ] = mfcca;
+            (D.f[DIR_MPM])[kbw ] = mfcac;
+            (D.f[DIR_MMM])[kbsw] = mfccc;
         }
     }
 }
diff --git a/src/gpu/VirtualFluids_GPU/GPU/Cumulant_F3_27.cu b/src/gpu/VirtualFluids_GPU/GPU/Cumulant_F3_27.cu
index 7adfd40da157d825d83c63b084bf1f855ea6dca2..c89c3cfe87560c808d47163b45d512fa0d7e494f 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/Cumulant_F3_27.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/Cumulant_F3_27.cu
@@ -27,7 +27,7 @@ __global__ void LB_PostProcessor_F3_2018_Fehlberg(real omega,
 															 real* vzOut,
 															 real* DDStart,
 															 real* G6,
-															 int size_Mat,
+															 unsigned long long numberOfLBnodes,
 															 int level,
 															 real* forces,
 															 bool EvenOrOdd)
@@ -43,7 +43,7 @@ __global__ void LB_PostProcessor_F3_2018_Fehlberg(real omega,
 	const unsigned k = nx*(ny*z + y) + x;
 	//////////////////////////////////////////////////////////////////////////
 
-	if (k < size_Mat)
+	if (k < numberOfLBnodes)
 	{
 		////////////////////////////////////////////////////////////////////////////////
 		unsigned int BC;
@@ -54,83 +54,83 @@ __global__ void LB_PostProcessor_F3_2018_Fehlberg(real omega,
 			Distributions27 D;
 			if (EvenOrOdd == true)
 			{
-				D.f[DIR_P00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_M00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_P00 * numberOfLBnodes];
+				D.f[DIR_M00] = &DDStart[DIR_M00 * numberOfLBnodes];
+				D.f[DIR_0P0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+				D.f[DIR_0M0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+				D.f[DIR_00P] = &DDStart[DIR_00P * numberOfLBnodes];
+				D.f[DIR_00M] = &DDStart[DIR_00M * numberOfLBnodes];
+				D.f[DIR_PP0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+				D.f[DIR_MM0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+				D.f[DIR_PM0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+				D.f[DIR_MP0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+				D.f[DIR_P0P] = &DDStart[DIR_P0P * numberOfLBnodes];
+				D.f[DIR_M0M] = &DDStart[DIR_M0M * numberOfLBnodes];
+				D.f[DIR_P0M] = &DDStart[DIR_P0M * numberOfLBnodes];
+				D.f[DIR_M0P] = &DDStart[DIR_M0P * numberOfLBnodes];
+				D.f[DIR_0PP] = &DDStart[DIR_0PP * numberOfLBnodes];
+				D.f[DIR_0MM] = &DDStart[DIR_0MM * numberOfLBnodes];
+				D.f[DIR_0PM] = &DDStart[DIR_0PM * numberOfLBnodes];
+				D.f[DIR_0MP] = &DDStart[DIR_0MP * numberOfLBnodes];
+				D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+				D.f[DIR_PPP] = &DDStart[DIR_PPP * numberOfLBnodes];
+				D.f[DIR_MMP] = &DDStart[DIR_MMP * numberOfLBnodes];
+				D.f[DIR_PMP] = &DDStart[DIR_PMP * numberOfLBnodes];
+				D.f[DIR_MPP] = &DDStart[DIR_MPP * numberOfLBnodes];
+				D.f[DIR_PPM] = &DDStart[DIR_PPM * numberOfLBnodes];
+				D.f[DIR_MMM] = &DDStart[DIR_MMM * numberOfLBnodes];
+				D.f[DIR_PMM] = &DDStart[DIR_PMM * numberOfLBnodes];
+				D.f[DIR_MPM] = &DDStart[DIR_MPM * numberOfLBnodes];
 			}
 			else
 			{
-				D.f[DIR_M00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_P00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_P00 * numberOfLBnodes];
+				D.f[DIR_P00] = &DDStart[DIR_M00 * numberOfLBnodes];
+				D.f[DIR_0M0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+				D.f[DIR_0P0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+				D.f[DIR_00M] = &DDStart[DIR_00P * numberOfLBnodes];
+				D.f[DIR_00P] = &DDStart[DIR_00M * numberOfLBnodes];
+				D.f[DIR_MM0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+				D.f[DIR_PP0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+				D.f[DIR_MP0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+				D.f[DIR_PM0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+				D.f[DIR_M0M] = &DDStart[DIR_P0P * numberOfLBnodes];
+				D.f[DIR_P0P] = &DDStart[DIR_M0M * numberOfLBnodes];
+				D.f[DIR_M0P] = &DDStart[DIR_P0M * numberOfLBnodes];
+				D.f[DIR_P0M] = &DDStart[DIR_M0P * numberOfLBnodes];
+				D.f[DIR_0MM] = &DDStart[DIR_0PP * numberOfLBnodes];
+				D.f[DIR_0PP] = &DDStart[DIR_0MM * numberOfLBnodes];
+				D.f[DIR_0MP] = &DDStart[DIR_0PM * numberOfLBnodes];
+				D.f[DIR_0PM] = &DDStart[DIR_0MP * numberOfLBnodes];
+				D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+				D.f[DIR_MMM] = &DDStart[DIR_PPP * numberOfLBnodes];
+				D.f[DIR_PPM] = &DDStart[DIR_MMP * numberOfLBnodes];
+				D.f[DIR_MPM] = &DDStart[DIR_PMP * numberOfLBnodes];
+				D.f[DIR_PMM] = &DDStart[DIR_MPP * numberOfLBnodes];
+				D.f[DIR_MMP] = &DDStart[DIR_PPM * numberOfLBnodes];
+				D.f[DIR_PPP] = &DDStart[DIR_MMM * numberOfLBnodes];
+				D.f[DIR_MPP] = &DDStart[DIR_PMM * numberOfLBnodes];
+				D.f[DIR_PMP] = &DDStart[DIR_MPM * numberOfLBnodes];
 			}
 
 			Distributions6 G;
 			if (EvenOrOdd == true)
 			{
-				G.g[DIR_P00] = &G6[DIR_P00   *size_Mat];
-				G.g[DIR_M00] = &G6[DIR_M00   *size_Mat];
-				G.g[DIR_0P0] = &G6[DIR_0P0   *size_Mat];
-				G.g[DIR_0M0] = &G6[DIR_0M0   *size_Mat];
-				G.g[DIR_00P] = &G6[DIR_00P   *size_Mat];
-				G.g[DIR_00M] = &G6[DIR_00M   *size_Mat];
+				G.g[DIR_P00] = &G6[DIR_P00 * numberOfLBnodes];
+				G.g[DIR_M00] = &G6[DIR_M00 * numberOfLBnodes];
+				G.g[DIR_0P0] = &G6[DIR_0P0 * numberOfLBnodes];
+				G.g[DIR_0M0] = &G6[DIR_0M0 * numberOfLBnodes];
+				G.g[DIR_00P] = &G6[DIR_00P * numberOfLBnodes];
+				G.g[DIR_00M] = &G6[DIR_00M * numberOfLBnodes];
 			}
 			else
 			{
-				G.g[DIR_M00] = &G6[DIR_P00   *size_Mat];
-				G.g[DIR_P00] = &G6[DIR_M00   *size_Mat];
-				G.g[DIR_0M0] = &G6[DIR_0P0   *size_Mat];
-				G.g[DIR_0P0] = &G6[DIR_0M0   *size_Mat];
-				G.g[DIR_00M] = &G6[DIR_00P   *size_Mat];
-				G.g[DIR_00P] = &G6[DIR_00M   *size_Mat];
+				G.g[DIR_M00] = &G6[DIR_P00 * numberOfLBnodes];
+				G.g[DIR_P00] = &G6[DIR_M00 * numberOfLBnodes];
+				G.g[DIR_0M0] = &G6[DIR_0P0 * numberOfLBnodes];
+				G.g[DIR_0P0] = &G6[DIR_0M0 * numberOfLBnodes];
+				G.g[DIR_00M] = &G6[DIR_00P * numberOfLBnodes];
+				G.g[DIR_00P] = &G6[DIR_00M * numberOfLBnodes];
 			}
 
 			////////////////////////////////////////////////////////////////////////////////
@@ -1026,83 +1026,83 @@ __global__ void LB_PostProcessor_F3_2018_Fehlberg(real omega,
 //			Distributions27 D;
 //			if (EvenOrOdd == true)
 //			{
-//				D.f[DIR_P00] = &DDStart[DIR_P00   *size_Mat];
-//				D.f[DIR_M00] = &DDStart[DIR_M00   *size_Mat];
-//				D.f[DIR_0P0] = &DDStart[DIR_0P0   *size_Mat];
-//				D.f[DIR_0M0] = &DDStart[DIR_0M0   *size_Mat];
-//				D.f[DIR_00P] = &DDStart[DIR_00P   *size_Mat];
-//				D.f[DIR_00M] = &DDStart[DIR_00M   *size_Mat];
-//				D.f[DIR_PP0] = &DDStart[DIR_PP0  *size_Mat];
-//				D.f[DIR_MM0] = &DDStart[DIR_MM0  *size_Mat];
-//				D.f[DIR_PM0] = &DDStart[DIR_PM0  *size_Mat];
-//				D.f[DIR_MP0] = &DDStart[DIR_MP0  *size_Mat];
-//				D.f[DIR_P0P] = &DDStart[DIR_P0P  *size_Mat];
-//				D.f[DIR_M0M] = &DDStart[DIR_M0M  *size_Mat];
-//				D.f[DIR_P0M] = &DDStart[DIR_P0M  *size_Mat];
-//				D.f[DIR_M0P] = &DDStart[DIR_M0P  *size_Mat];
-//				D.f[DIR_0PP] = &DDStart[DIR_0PP  *size_Mat];
-//				D.f[DIR_0MM] = &DDStart[DIR_0MM  *size_Mat];
-//				D.f[DIR_0PM] = &DDStart[DIR_0PM  *size_Mat];
-//				D.f[DIR_0MP] = &DDStart[DIR_0MP  *size_Mat];
-//				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-//				D.f[DIR_PPP] = &DDStart[DIR_PPP *size_Mat];
-//				D.f[DIR_MMP] = &DDStart[DIR_MMP *size_Mat];
-//				D.f[DIR_PMP] = &DDStart[DIR_PMP *size_Mat];
-//				D.f[DIR_MPP] = &DDStart[DIR_MPP *size_Mat];
-//				D.f[DIR_PPM] = &DDStart[DIR_PPM *size_Mat];
-//				D.f[DIR_MMM] = &DDStart[DIR_MMM *size_Mat];
-//				D.f[DIR_PMM] = &DDStart[DIR_PMM *size_Mat];
-//				D.f[DIR_MPM] = &DDStart[DIR_MPM *size_Mat];
+//				D.f[DIR_P00] = &DDStart[DIR_P00 * size_Mat];
+//				D.f[DIR_M00] = &DDStart[DIR_M00 * size_Mat];
+//				D.f[DIR_0P0] = &DDStart[DIR_0P0 * size_Mat];
+//				D.f[DIR_0M0] = &DDStart[DIR_0M0 * size_Mat];
+//				D.f[DIR_00P] = &DDStart[DIR_00P * size_Mat];
+//				D.f[DIR_00M] = &DDStart[DIR_00M * size_Mat];
+//				D.f[DIR_PP0] = &DDStart[DIR_PP0 * size_Mat];
+//				D.f[DIR_MM0] = &DDStart[DIR_MM0 * size_Mat];
+//				D.f[DIR_PM0] = &DDStart[DIR_PM0 * size_Mat];
+//				D.f[DIR_MP0] = &DDStart[DIR_MP0 * size_Mat];
+//				D.f[DIR_P0P] = &DDStart[DIR_P0P * size_Mat];
+//				D.f[DIR_M0M] = &DDStart[DIR_M0M * size_Mat];
+//				D.f[DIR_P0M] = &DDStart[DIR_P0M * size_Mat];
+//				D.f[DIR_M0P] = &DDStart[DIR_M0P * size_Mat];
+//				D.f[DIR_0PP] = &DDStart[DIR_0PP * size_Mat];
+//				D.f[DIR_0MM] = &DDStart[DIR_0MM * size_Mat];
+//				D.f[DIR_0PM] = &DDStart[DIR_0PM * size_Mat];
+//				D.f[DIR_0MP] = &DDStart[DIR_0MP * size_Mat];
+//				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+//				D.f[DIR_PPP] = &DDStart[DIR_PPP * size_Mat];
+//				D.f[DIR_MMP] = &DDStart[DIR_MMP * size_Mat];
+//				D.f[DIR_PMP] = &DDStart[DIR_PMP * size_Mat];
+//				D.f[DIR_MPP] = &DDStart[DIR_MPP * size_Mat];
+//				D.f[DIR_PPM] = &DDStart[DIR_PPM * size_Mat];
+//				D.f[DIR_MMM] = &DDStart[DIR_MMM * size_Mat];
+//				D.f[DIR_PMM] = &DDStart[DIR_PMM * size_Mat];
+//				D.f[DIR_MPM] = &DDStart[DIR_MPM * size_Mat];
 //			}
 //			else
 //			{
-//				D.f[DIR_M00] = &DDStart[DIR_P00   *size_Mat];
-//				D.f[DIR_P00] = &DDStart[DIR_M00   *size_Mat];
-//				D.f[DIR_0M0] = &DDStart[DIR_0P0   *size_Mat];
-//				D.f[DIR_0P0] = &DDStart[DIR_0M0   *size_Mat];
-//				D.f[DIR_00M] = &DDStart[DIR_00P   *size_Mat];
-//				D.f[DIR_00P] = &DDStart[DIR_00M   *size_Mat];
-//				D.f[DIR_MM0] = &DDStart[DIR_PP0  *size_Mat];
-//				D.f[DIR_PP0] = &DDStart[DIR_MM0  *size_Mat];
-//				D.f[DIR_MP0] = &DDStart[DIR_PM0  *size_Mat];
-//				D.f[DIR_PM0] = &DDStart[DIR_MP0  *size_Mat];
-//				D.f[DIR_M0M] = &DDStart[DIR_P0P  *size_Mat];
-//				D.f[DIR_P0P] = &DDStart[DIR_M0M  *size_Mat];
-//				D.f[DIR_M0P] = &DDStart[DIR_P0M  *size_Mat];
-//				D.f[DIR_P0M] = &DDStart[DIR_M0P  *size_Mat];
-//				D.f[DIR_0MM] = &DDStart[DIR_0PP  *size_Mat];
-//				D.f[DIR_0PP] = &DDStart[DIR_0MM  *size_Mat];
-//				D.f[DIR_0MP] = &DDStart[DIR_0PM  *size_Mat];
-//				D.f[DIR_0PM] = &DDStart[DIR_0MP  *size_Mat];
-//				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-//				D.f[DIR_MMM] = &DDStart[DIR_PPP *size_Mat];
-//				D.f[DIR_PPM] = &DDStart[DIR_MMP *size_Mat];
-//				D.f[DIR_MPM] = &DDStart[DIR_PMP *size_Mat];
-//				D.f[DIR_PMM] = &DDStart[DIR_MPP *size_Mat];
-//				D.f[DIR_MMP] = &DDStart[DIR_PPM *size_Mat];
-//				D.f[DIR_PPP] = &DDStart[DIR_MMM *size_Mat];
-//				D.f[DIR_MPP] = &DDStart[DIR_PMM *size_Mat];
-//				D.f[DIR_PMP] = &DDStart[DIR_MPM *size_Mat];
+//				D.f[DIR_M00] = &DDStart[DIR_P00 * size_Mat];
+//				D.f[DIR_P00] = &DDStart[DIR_M00 * size_Mat];
+//				D.f[DIR_0M0] = &DDStart[DIR_0P0 * size_Mat];
+//				D.f[DIR_0P0] = &DDStart[DIR_0M0 * size_Mat];
+//				D.f[DIR_00M] = &DDStart[DIR_00P * size_Mat];
+//				D.f[DIR_00P] = &DDStart[DIR_00M * size_Mat];
+//				D.f[DIR_MM0] = &DDStart[DIR_PP0 * size_Mat];
+//				D.f[DIR_PP0] = &DDStart[DIR_MM0 * size_Mat];
+//				D.f[DIR_MP0] = &DDStart[DIR_PM0 * size_Mat];
+//				D.f[DIR_PM0] = &DDStart[DIR_MP0 * size_Mat];
+//				D.f[DIR_M0M] = &DDStart[DIR_P0P * size_Mat];
+//				D.f[DIR_P0P] = &DDStart[DIR_M0M * size_Mat];
+//				D.f[DIR_M0P] = &DDStart[DIR_P0M * size_Mat];
+//				D.f[DIR_P0M] = &DDStart[DIR_M0P * size_Mat];
+//				D.f[DIR_0MM] = &DDStart[DIR_0PP * size_Mat];
+//				D.f[DIR_0PP] = &DDStart[DIR_0MM * size_Mat];
+//				D.f[DIR_0MP] = &DDStart[DIR_0PM * size_Mat];
+//				D.f[DIR_0PM] = &DDStart[DIR_0MP * size_Mat];
+//				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+//				D.f[DIR_MMM] = &DDStart[DIR_PPP * size_Mat];
+//				D.f[DIR_PPM] = &DDStart[DIR_MMP * size_Mat];
+//				D.f[DIR_MPM] = &DDStart[DIR_PMP * size_Mat];
+//				D.f[DIR_PMM] = &DDStart[DIR_MPP * size_Mat];
+//				D.f[DIR_MMP] = &DDStart[DIR_PPM * size_Mat];
+//				D.f[DIR_PPP] = &DDStart[DIR_MMM * size_Mat];
+//				D.f[DIR_MPP] = &DDStart[DIR_PMM * size_Mat];
+//				D.f[DIR_PMP] = &DDStart[DIR_MPM * size_Mat];
 //			}
 //
 //			Distributions6 G;
 //			if (EvenOrOdd == true)
 //			{
-//				G.g[DIR_P00] = &G6[DIR_P00   *size_Mat];
-//				G.g[DIR_M00] = &G6[DIR_M00   *size_Mat];
-//				G.g[DIR_0P0] = &G6[DIR_0P0   *size_Mat];
-//				G.g[DIR_0M0] = &G6[DIR_0M0   *size_Mat];
-//				G.g[DIR_00P] = &G6[DIR_00P   *size_Mat];
-//				G.g[DIR_00M] = &G6[DIR_00M   *size_Mat];
+//				G.g[DIR_P00] = &G6[DIR_P00 * size_Mat];
+//				G.g[DIR_M00] = &G6[DIR_M00 * size_Mat];
+//				G.g[DIR_0P0] = &G6[DIR_0P0 * size_Mat];
+//				G.g[DIR_0M0] = &G6[DIR_0M0 * size_Mat];
+//				G.g[DIR_00P] = &G6[DIR_00P * size_Mat];
+//				G.g[DIR_00M] = &G6[DIR_00M * size_Mat];
 //			}
 //			else
 //			{
-//				G.g[DIR_M00] = &G6[DIR_P00   *size_Mat];
-//				G.g[DIR_P00] = &G6[DIR_M00   *size_Mat];
-//				G.g[DIR_0M0] = &G6[DIR_0P0   *size_Mat];
-//				G.g[DIR_0P0] = &G6[DIR_0M0   *size_Mat];
-//				G.g[DIR_00M] = &G6[DIR_00P   *size_Mat];
-//				G.g[DIR_00P] = &G6[DIR_00M   *size_Mat];
+//				G.g[DIR_M00] = &G6[DIR_P00 * size_Mat];
+//				G.g[DIR_P00] = &G6[DIR_M00 * size_Mat];
+//				G.g[DIR_0M0] = &G6[DIR_0P0 * size_Mat];
+//				G.g[DIR_0P0] = &G6[DIR_0M0 * size_Mat];
+//				G.g[DIR_00M] = &G6[DIR_00P * size_Mat];
+//				G.g[DIR_00P] = &G6[DIR_00M * size_Mat];
 //			}
 //
 //			////////////////////////////////////////////////////////////////////////////////
@@ -2006,83 +2006,83 @@ __global__ void LB_PostProcessor_F3_2018_Fehlberg(real omega,
 //			Distributions27 D;
 //			if (EvenOrOdd == true)
 //			{
-//				D.f[DIR_P00] = &DDStart[DIR_P00   *size_Mat];
-//				D.f[DIR_M00] = &DDStart[DIR_M00   *size_Mat];
-//				D.f[DIR_0P0] = &DDStart[DIR_0P0   *size_Mat];
-//				D.f[DIR_0M0] = &DDStart[DIR_0M0   *size_Mat];
-//				D.f[DIR_00P] = &DDStart[DIR_00P   *size_Mat];
-//				D.f[DIR_00M] = &DDStart[DIR_00M   *size_Mat];
-//				D.f[DIR_PP0] = &DDStart[DIR_PP0  *size_Mat];
-//				D.f[DIR_MM0] = &DDStart[DIR_MM0  *size_Mat];
-//				D.f[DIR_PM0] = &DDStart[DIR_PM0  *size_Mat];
-//				D.f[DIR_MP0] = &DDStart[DIR_MP0  *size_Mat];
-//				D.f[DIR_P0P] = &DDStart[DIR_P0P  *size_Mat];
-//				D.f[DIR_M0M] = &DDStart[DIR_M0M  *size_Mat];
-//				D.f[DIR_P0M] = &DDStart[DIR_P0M  *size_Mat];
-//				D.f[DIR_M0P] = &DDStart[DIR_M0P  *size_Mat];
-//				D.f[DIR_0PP] = &DDStart[DIR_0PP  *size_Mat];
-//				D.f[DIR_0MM] = &DDStart[DIR_0MM  *size_Mat];
-//				D.f[DIR_0PM] = &DDStart[DIR_0PM  *size_Mat];
-//				D.f[DIR_0MP] = &DDStart[DIR_0MP  *size_Mat];
-//				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-//				D.f[DIR_PPP] = &DDStart[DIR_PPP *size_Mat];
-//				D.f[DIR_MMP] = &DDStart[DIR_MMP *size_Mat];
-//				D.f[DIR_PMP] = &DDStart[DIR_PMP *size_Mat];
-//				D.f[DIR_MPP] = &DDStart[DIR_MPP *size_Mat];
-//				D.f[DIR_PPM] = &DDStart[DIR_PPM *size_Mat];
-//				D.f[DIR_MMM] = &DDStart[DIR_MMM *size_Mat];
-//				D.f[DIR_PMM] = &DDStart[DIR_PMM *size_Mat];
-//				D.f[DIR_MPM] = &DDStart[DIR_MPM *size_Mat];
+//				D.f[DIR_P00] = &DDStart[DIR_P00 * size_Mat];
+//				D.f[DIR_M00] = &DDStart[DIR_M00 * size_Mat];
+//				D.f[DIR_0P0] = &DDStart[DIR_0P0 * size_Mat];
+//				D.f[DIR_0M0] = &DDStart[DIR_0M0 * size_Mat];
+//				D.f[DIR_00P] = &DDStart[DIR_00P * size_Mat];
+//				D.f[DIR_00M] = &DDStart[DIR_00M * size_Mat];
+//				D.f[DIR_PP0] = &DDStart[DIR_PP0 * size_Mat];
+//				D.f[DIR_MM0] = &DDStart[DIR_MM0 * size_Mat];
+//				D.f[DIR_PM0] = &DDStart[DIR_PM0 * size_Mat];
+//				D.f[DIR_MP0] = &DDStart[DIR_MP0 * size_Mat];
+//				D.f[DIR_P0P] = &DDStart[DIR_P0P * size_Mat];
+//				D.f[DIR_M0M] = &DDStart[DIR_M0M * size_Mat];
+//				D.f[DIR_P0M] = &DDStart[DIR_P0M * size_Mat];
+//				D.f[DIR_M0P] = &DDStart[DIR_M0P * size_Mat];
+//				D.f[DIR_0PP] = &DDStart[DIR_0PP * size_Mat];
+//				D.f[DIR_0MM] = &DDStart[DIR_0MM * size_Mat];
+//				D.f[DIR_0PM] = &DDStart[DIR_0PM * size_Mat];
+//				D.f[DIR_0MP] = &DDStart[DIR_0MP * size_Mat];
+//				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+//				D.f[DIR_PPP] = &DDStart[DIR_PPP * size_Mat];
+//				D.f[DIR_MMP] = &DDStart[DIR_MMP * size_Mat];
+//				D.f[DIR_PMP] = &DDStart[DIR_PMP * size_Mat];
+//				D.f[DIR_MPP] = &DDStart[DIR_MPP * size_Mat];
+//				D.f[DIR_PPM] = &DDStart[DIR_PPM * size_Mat];
+//				D.f[DIR_MMM] = &DDStart[DIR_MMM * size_Mat];
+//				D.f[DIR_PMM] = &DDStart[DIR_PMM * size_Mat];
+//				D.f[DIR_MPM] = &DDStart[DIR_MPM * size_Mat];
 //			}
 //			else
 //			{
-//				D.f[DIR_M00] = &DDStart[DIR_P00   *size_Mat];
-//				D.f[DIR_P00] = &DDStart[DIR_M00   *size_Mat];
-//				D.f[DIR_0M0] = &DDStart[DIR_0P0   *size_Mat];
-//				D.f[DIR_0P0] = &DDStart[DIR_0M0   *size_Mat];
-//				D.f[DIR_00M] = &DDStart[DIR_00P   *size_Mat];
-//				D.f[DIR_00P] = &DDStart[DIR_00M   *size_Mat];
-//				D.f[DIR_MM0] = &DDStart[DIR_PP0  *size_Mat];
-//				D.f[DIR_PP0] = &DDStart[DIR_MM0  *size_Mat];
-//				D.f[DIR_MP0] = &DDStart[DIR_PM0  *size_Mat];
-//				D.f[DIR_PM0] = &DDStart[DIR_MP0  *size_Mat];
-//				D.f[DIR_M0M] = &DDStart[DIR_P0P  *size_Mat];
-//				D.f[DIR_P0P] = &DDStart[DIR_M0M  *size_Mat];
-//				D.f[DIR_M0P] = &DDStart[DIR_P0M  *size_Mat];
-//				D.f[DIR_P0M] = &DDStart[DIR_M0P  *size_Mat];
-//				D.f[DIR_0MM] = &DDStart[DIR_0PP  *size_Mat];
-//				D.f[DIR_0PP] = &DDStart[DIR_0MM  *size_Mat];
-//				D.f[DIR_0MP] = &DDStart[DIR_0PM  *size_Mat];
-//				D.f[DIR_0PM] = &DDStart[DIR_0MP  *size_Mat];
-//				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-//				D.f[DIR_MMM] = &DDStart[DIR_PPP *size_Mat];
-//				D.f[DIR_PPM] = &DDStart[DIR_MMP *size_Mat];
-//				D.f[DIR_MPM] = &DDStart[DIR_PMP *size_Mat];
-//				D.f[DIR_PMM] = &DDStart[DIR_MPP *size_Mat];
-//				D.f[DIR_MMP] = &DDStart[DIR_PPM *size_Mat];
-//				D.f[DIR_PPP] = &DDStart[DIR_MMM *size_Mat];
-//				D.f[DIR_MPP] = &DDStart[DIR_PMM *size_Mat];
-//				D.f[DIR_PMP] = &DDStart[DIR_MPM *size_Mat];
+//				D.f[DIR_M00] = &DDStart[DIR_P00 * size_Mat];
+//				D.f[DIR_P00] = &DDStart[DIR_M00 * size_Mat];
+//				D.f[DIR_0M0] = &DDStart[DIR_0P0 * size_Mat];
+//				D.f[DIR_0P0] = &DDStart[DIR_0M0 * size_Mat];
+//				D.f[DIR_00M] = &DDStart[DIR_00P * size_Mat];
+//				D.f[DIR_00P] = &DDStart[DIR_00M * size_Mat];
+//				D.f[DIR_MM0] = &DDStart[DIR_PP0 * size_Mat];
+//				D.f[DIR_PP0] = &DDStart[DIR_MM0 * size_Mat];
+//				D.f[DIR_MP0] = &DDStart[DIR_PM0 * size_Mat];
+//				D.f[DIR_PM0] = &DDStart[DIR_MP0 * size_Mat];
+//				D.f[DIR_M0M] = &DDStart[DIR_P0P * size_Mat];
+//				D.f[DIR_P0P] = &DDStart[DIR_M0M * size_Mat];
+//				D.f[DIR_M0P] = &DDStart[DIR_P0M * size_Mat];
+//				D.f[DIR_P0M] = &DDStart[DIR_M0P * size_Mat];
+//				D.f[DIR_0MM] = &DDStart[DIR_0PP * size_Mat];
+//				D.f[DIR_0PP] = &DDStart[DIR_0MM * size_Mat];
+//				D.f[DIR_0MP] = &DDStart[DIR_0PM * size_Mat];
+//				D.f[DIR_0PM] = &DDStart[DIR_0MP * size_Mat];
+//				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+//				D.f[DIR_MMM] = &DDStart[DIR_PPP * size_Mat];
+//				D.f[DIR_PPM] = &DDStart[DIR_MMP * size_Mat];
+//				D.f[DIR_MPM] = &DDStart[DIR_PMP * size_Mat];
+//				D.f[DIR_PMM] = &DDStart[DIR_MPP * size_Mat];
+//				D.f[DIR_MMP] = &DDStart[DIR_PPM * size_Mat];
+//				D.f[DIR_PPP] = &DDStart[DIR_MMM * size_Mat];
+//				D.f[DIR_MPP] = &DDStart[DIR_PMM * size_Mat];
+//				D.f[DIR_PMP] = &DDStart[DIR_MPM * size_Mat];
 //			}
 //
 //			Distributions6 G;
 //			if (EvenOrOdd == true)
 //			{
-//				G.g[DIR_P00] = &G6[DIR_P00   *size_Mat];
-//				G.g[DIR_M00] = &G6[DIR_M00   *size_Mat];
-//				G.g[DIR_0P0] = &G6[DIR_0P0   *size_Mat];
-//				G.g[DIR_0M0] = &G6[DIR_0M0   *size_Mat];
-//				G.g[DIR_00P] = &G6[DIR_00P   *size_Mat];
-//				G.g[DIR_00M] = &G6[DIR_00M   *size_Mat];
+//				G.g[DIR_P00] = &G6[DIR_P00 * size_Mat];
+//				G.g[DIR_M00] = &G6[DIR_M00 * size_Mat];
+//				G.g[DIR_0P0] = &G6[DIR_0P0 * size_Mat];
+//				G.g[DIR_0M0] = &G6[DIR_0M0 * size_Mat];
+//				G.g[DIR_00P] = &G6[DIR_00P * size_Mat];
+//				G.g[DIR_00M] = &G6[DIR_00M * size_Mat];
 //			}
 //			else
 //			{
-//				G.g[DIR_M00] = &G6[DIR_P00   *size_Mat];
-//				G.g[DIR_P00] = &G6[DIR_M00   *size_Mat];
-//				G.g[DIR_0M0] = &G6[DIR_0P0   *size_Mat];
-//				G.g[DIR_0P0] = &G6[DIR_0M0   *size_Mat];
-//				G.g[DIR_00M] = &G6[DIR_00P   *size_Mat];
-//				G.g[DIR_00P] = &G6[DIR_00M   *size_Mat];
+//				G.g[DIR_M00] = &G6[DIR_P00 * size_Mat];
+//				G.g[DIR_P00] = &G6[DIR_M00 * size_Mat];
+//				G.g[DIR_0M0] = &G6[DIR_0P0 * size_Mat];
+//				G.g[DIR_0P0] = &G6[DIR_0M0 * size_Mat];
+//				G.g[DIR_00M] = &G6[DIR_00P * size_Mat];
+//				G.g[DIR_00P] = &G6[DIR_00M * size_Mat];
 //			}
 //
 //			////////////////////////////////////////////////////////////////////////////////
@@ -2153,33 +2153,33 @@ __global__ void LB_PostProcessor_F3_2018_Fehlberg(real omega,
 //			real dyyuy = c1o2 * (-mgbcb + mgbab);
 //			real dzzuz = c1o2 * (-mgbbc + mgbba);
 //			//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-//			real mfcbb = (D.f[DIR_P00])[k];//[ke   ];// +  c2over27 ;(D.f[DIR_P00   ])[k  ];//ke
-//			real mfabb = (D.f[DIR_M00])[kw];//[kw   ];// +  c2over27 ;(D.f[DIR_M00   ])[kw ];
-//			real mfbcb = (D.f[DIR_0P0])[k];//[kn   ];// +  c2over27 ;(D.f[DIR_0P0   ])[k  ];//kn
-//			real mfbab = (D.f[DIR_0M0])[ks];//[ks   ];// +  c2over27 ;(D.f[DIR_0M0   ])[ks ];
-//			real mfbbc = (D.f[DIR_00P])[k];//[kt   ];// +  c2over27 ;(D.f[DIR_00P   ])[k  ];//kt
-//			real mfbba = (D.f[DIR_00M])[kb];//[kb   ];// +  c2over27 ;(D.f[DIR_00M   ])[kb ];
-//			real mfccb = (D.f[DIR_PP0])[k];//[kne  ];// +  c1over54 ;(D.f[DIR_PP0  ])[k  ];//kne
-//			real mfaab = (D.f[DIR_MM0])[ksw];//[ksw  ];// +  c1over54 ;(D.f[DIR_MM0  ])[ksw];
-//			real mfcab = (D.f[DIR_PM0])[ks];//[kse  ];// +  c1over54 ;(D.f[DIR_PM0  ])[ks ];//kse
-//			real mfacb = (D.f[DIR_MP0])[kw];//[knw  ];// +  c1over54 ;(D.f[DIR_MP0  ])[kw ];//knw
-//			real mfcbc = (D.f[DIR_P0P])[k];//[kte  ];// +  c1over54 ;(D.f[DIR_P0P  ])[k  ];//kte
-//			real mfaba = (D.f[DIR_M0M])[kbw];//[kbw  ];// +  c1over54 ;(D.f[DIR_M0M  ])[kbw];
-//			real mfcba = (D.f[DIR_P0M])[kb];//[kbe  ];// +  c1over54 ;(D.f[DIR_P0M  ])[kb ];//kbe
-//			real mfabc = (D.f[DIR_M0P])[kw];//[ktw  ];// +  c1over54 ;(D.f[DIR_M0P  ])[kw ];//ktw
-//			real mfbcc = (D.f[DIR_0PP])[k];//[ktn  ];// +  c1over54 ;(D.f[DIR_0PP  ])[k  ];//ktn
-//			real mfbaa = (D.f[DIR_0MM])[kbs];//[kbs  ];// +  c1over54 ;(D.f[DIR_0MM  ])[kbs];
-//			real mfbca = (D.f[DIR_0PM])[kb];//[kbn  ];// +  c1over54 ;(D.f[DIR_0PM  ])[kb ];//kbn
-//			real mfbac = (D.f[DIR_0MP])[ks];//[kts  ];// +  c1over54 ;(D.f[DIR_0MP  ])[ks ];//kts
+//			real mfcbb = (D.f[DIR_P00])[k];//[ke   ];// +  c2over27 ;(D.f[DIR_P00])[k  ];//ke
+//			real mfabb = (D.f[DIR_M00])[kw];//[kw   ];// +  c2over27 ;(D.f[DIR_M00])[kw ];
+//			real mfbcb = (D.f[DIR_0P0])[k];//[kn   ];// +  c2over27 ;(D.f[DIR_0P0])[k  ];//kn
+//			real mfbab = (D.f[DIR_0M0])[ks];//[ks   ];// +  c2over27 ;(D.f[DIR_0M0])[ks ];
+//			real mfbbc = (D.f[DIR_00P])[k];//[kt   ];// +  c2over27 ;(D.f[DIR_00P])[k  ];//kt
+//			real mfbba = (D.f[DIR_00M])[kb];//[kb   ];// +  c2over27 ;(D.f[DIR_00M])[kb ];
+//			real mfccb = (D.f[DIR_PP0])[k];//[kne  ];// +  c1over54 ;(D.f[DIR_PP0])[k  ];//kne
+//			real mfaab = (D.f[DIR_MM0])[ksw];//[ksw  ];// +  c1over54 ;(D.f[DIR_MM0])[ksw];
+//			real mfcab = (D.f[DIR_PM0])[ks];//[kse  ];// +  c1over54 ;(D.f[DIR_PM0])[ks ];//kse
+//			real mfacb = (D.f[DIR_MP0])[kw];//[knw  ];// +  c1over54 ;(D.f[DIR_MP0])[kw ];//knw
+//			real mfcbc = (D.f[DIR_P0P])[k];//[kte  ];// +  c1over54 ;(D.f[DIR_P0P])[k  ];//kte
+//			real mfaba = (D.f[DIR_M0M])[kbw];//[kbw  ];// +  c1over54 ;(D.f[DIR_M0M])[kbw];
+//			real mfcba = (D.f[DIR_P0M])[kb];//[kbe  ];// +  c1over54 ;(D.f[DIR_P0M])[kb ];//kbe
+//			real mfabc = (D.f[DIR_M0P])[kw];//[ktw  ];// +  c1over54 ;(D.f[DIR_M0P])[kw ];//ktw
+//			real mfbcc = (D.f[DIR_0PP])[k];//[ktn  ];// +  c1over54 ;(D.f[DIR_0PP])[k  ];//ktn
+//			real mfbaa = (D.f[DIR_0MM])[kbs];//[kbs  ];// +  c1over54 ;(D.f[DIR_0MM])[kbs];
+//			real mfbca = (D.f[DIR_0PM])[kb];//[kbn  ];// +  c1over54 ;(D.f[DIR_0PM])[kb ];//kbn
+//			real mfbac = (D.f[DIR_0MP])[ks];//[kts  ];// +  c1over54 ;(D.f[DIR_0MP])[ks ];//kts
 //			real mfbbb = (D.f[DIR_000])[k];//[kzero];// +  c8over27 ;(D.f[DIR_000])[k  ];//kzero
-//			real mfccc = (D.f[DIR_PPP])[k];//[ktne ];// +  c1over216;(D.f[DIR_PPP ])[k  ];//ktne
-//			real mfaac = (D.f[DIR_MMP])[ksw];//[ktsw ];// +  c1over216;(D.f[DIR_MMP ])[ksw];//ktsw
-//			real mfcac = (D.f[DIR_PMP])[ks];//[ktse ];// +  c1over216;(D.f[DIR_PMP ])[ks ];//ktse
-//			real mfacc = (D.f[DIR_MPP])[kw];//[ktnw ];// +  c1over216;(D.f[DIR_MPP ])[kw ];//ktnw
-//			real mfcca = (D.f[DIR_PPM])[kb];//[kbne ];// +  c1over216;(D.f[DIR_PPM ])[kb ];//kbne
-//			real mfaaa = (D.f[DIR_MMM])[kbsw];//[kbsw ];// +  c1over216;(D.f[DIR_MMM ])[kbsw];
-//			real mfcaa = (D.f[DIR_PMM])[kbs];//[kbse ];// +  c1over216;(D.f[DIR_PMM ])[kbs];//kbse
-//			real mfaca = (D.f[DIR_MPM])[kbw];//[kbnw ];// +  c1over216;(D.f[DIR_MPM ])[kbw];//kbnw
+//			real mfccc = (D.f[DIR_PPP])[k];//[ktne ];// +  c1over216;(D.f[DIR_PPP])[k  ];//ktne
+//			real mfaac = (D.f[DIR_MMP])[ksw];//[ktsw ];// +  c1over216;(D.f[DIR_MMP])[ksw];//ktsw
+//			real mfcac = (D.f[DIR_PMP])[ks];//[ktse ];// +  c1over216;(D.f[DIR_PMP])[ks ];//ktse
+//			real mfacc = (D.f[DIR_MPP])[kw];//[ktnw ];// +  c1over216;(D.f[DIR_MPP])[kw ];//ktnw
+//			real mfcca = (D.f[DIR_PPM])[kb];//[kbne ];// +  c1over216;(D.f[DIR_PPM])[kb ];//kbne
+//			real mfaaa = (D.f[DIR_MMM])[kbsw];//[kbsw ];// +  c1over216;(D.f[DIR_MMM])[kbsw];
+//			real mfcaa = (D.f[DIR_PMM])[kbs];//[kbse ];// +  c1over216;(D.f[DIR_PMM])[kbs];//kbse
+//			real mfaca = (D.f[DIR_MPM])[kbw];//[kbnw ];// +  c1over216;(D.f[DIR_MPM])[kbw];//kbnw
 //			////////////////////////////////////////////////////////////////////////////////////
 //			real drho = ((((mfccc + mfaaa) + (mfaca + mfcac)) + ((mfacc + mfcaa) + (mfaac + mfcca))) +
 //				(((mfbac + mfbca) + (mfbaa + mfbcc)) + ((mfabc + mfcba) + (mfaba + mfcbc)) + ((mfacb + mfcab) + (mfaab + mfccb))) +
diff --git a/src/gpu/VirtualFluids_GPU/GPU/DragLift27.cu b/src/gpu/VirtualFluids_GPU/GPU/DragLift27.cu
index 5146242fed374a919b6dcc02774db1d8ce4f864a..0e3945829725c0614ed4da01d0bae3b99ba2720a 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/DragLift27.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/DragLift27.cu
@@ -17,69 +17,69 @@ __global__ void DragLiftPost27(  real* DD,
 											unsigned int* neighborX,
 											unsigned int* neighborY,
 											unsigned int* neighborZ,
-											unsigned int size_Mat, 
+											unsigned long long numberOfLBnodes, 
 											bool isEvenTimestep)
 {
 	Distributions27 D;
 	if (isEvenTimestep==true)
 	{
-		D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-		D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-		D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-		D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-		D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-		D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-		D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-		D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-		D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-		D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-		D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-		D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-		D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-		D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-		D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-		D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-		D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-		D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-		D.f[DIR_000] = &DD[DIR_000*size_Mat];
-		D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-		D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-		D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-		D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-		D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-		D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-		D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-		D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+		D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+		D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+		D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+		D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+		D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+		D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+		D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+		D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+		D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+		D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+		D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+		D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+		D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+		D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+		D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+		D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+		D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+		D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+		D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+		D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+		D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+		D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+		D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+		D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+		D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+		D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+		D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
 	} 
 	else
 	{
-		D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-		D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-		D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-		D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-		D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-		D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-		D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-		D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-		D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-		D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-		D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-		D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-		D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-		D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-		D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-		D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-		D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-		D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-		D.f[DIR_000] = &DD[DIR_000*size_Mat];
-		D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-		D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-		D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-		D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-		D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-		D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-		D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-		D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+		D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+		D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+		D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+		D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+		D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+		D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+		D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+		D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+		D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+		D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+		D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+		D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+		D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+		D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+		D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+		D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+		D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+		D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+		D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+		D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+		D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+		D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+		D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+		D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+		D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+		D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+		D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
 	}
 	////////////////////////////////////////////////////////////////////////////////
 	const unsigned  x = threadIdx.x;  // Globaler x-Index 
@@ -100,24 +100,24 @@ __global__ void DragLiftPost27(  real* DD,
 			*q_dirBE,  *q_dirTW,  *q_dirTN,  *q_dirBS,  *q_dirBN,  *q_dirTS,
 			*q_dirTNE, *q_dirTSW, *q_dirTSE, *q_dirTNW, *q_dirBNE, *q_dirBSW,
 			*q_dirBSE, *q_dirBNW; 
-		q_dirE   = &QQ[DIR_P00   * numberOfBCnodes];
-		q_dirW   = &QQ[DIR_M00   * numberOfBCnodes];
-		q_dirN   = &QQ[DIR_0P0   * numberOfBCnodes];
-		q_dirS   = &QQ[DIR_0M0   * numberOfBCnodes];
-		q_dirT   = &QQ[DIR_00P   * numberOfBCnodes];
-		q_dirB   = &QQ[DIR_00M   * numberOfBCnodes];
-		q_dirNE  = &QQ[DIR_PP0  * numberOfBCnodes];
-		q_dirSW  = &QQ[DIR_MM0  * numberOfBCnodes];
-		q_dirSE  = &QQ[DIR_PM0  * numberOfBCnodes];
-		q_dirNW  = &QQ[DIR_MP0  * numberOfBCnodes];
-		q_dirTE  = &QQ[DIR_P0P  * numberOfBCnodes];
-		q_dirBW  = &QQ[DIR_M0M  * numberOfBCnodes];
-		q_dirBE  = &QQ[DIR_P0M  * numberOfBCnodes];
-		q_dirTW  = &QQ[DIR_M0P  * numberOfBCnodes];
-		q_dirTN  = &QQ[DIR_0PP  * numberOfBCnodes];
-		q_dirBS  = &QQ[DIR_0MM  * numberOfBCnodes];
-		q_dirBN  = &QQ[DIR_0PM  * numberOfBCnodes];
-		q_dirTS  = &QQ[DIR_0MP  * numberOfBCnodes];
+		q_dirE   = &QQ[DIR_P00 * numberOfBCnodes];
+		q_dirW   = &QQ[DIR_M00 * numberOfBCnodes];
+		q_dirN   = &QQ[DIR_0P0 * numberOfBCnodes];
+		q_dirS   = &QQ[DIR_0M0 * numberOfBCnodes];
+		q_dirT   = &QQ[DIR_00P * numberOfBCnodes];
+		q_dirB   = &QQ[DIR_00M * numberOfBCnodes];
+		q_dirNE  = &QQ[DIR_PP0 * numberOfBCnodes];
+		q_dirSW  = &QQ[DIR_MM0 * numberOfBCnodes];
+		q_dirSE  = &QQ[DIR_PM0 * numberOfBCnodes];
+		q_dirNW  = &QQ[DIR_MP0 * numberOfBCnodes];
+		q_dirTE  = &QQ[DIR_P0P * numberOfBCnodes];
+		q_dirBW  = &QQ[DIR_M0M * numberOfBCnodes];
+		q_dirBE  = &QQ[DIR_P0M * numberOfBCnodes];
+		q_dirTW  = &QQ[DIR_M0P * numberOfBCnodes];
+		q_dirTN  = &QQ[DIR_0PP * numberOfBCnodes];
+		q_dirBS  = &QQ[DIR_0MM * numberOfBCnodes];
+		q_dirBN  = &QQ[DIR_0PM * numberOfBCnodes];
+		q_dirTS  = &QQ[DIR_0MP * numberOfBCnodes];
 		q_dirTNE = &QQ[DIR_PPP * numberOfBCnodes];
 		q_dirTSW = &QQ[DIR_MMP * numberOfBCnodes];
 		q_dirTSE = &QQ[DIR_PMP * numberOfBCnodes];
@@ -160,32 +160,32 @@ __global__ void DragLiftPost27(  real* DD,
 		real f_E,  f_W,  f_N,  f_S,  f_T,  f_B,   f_NE,  f_SW,  f_SE,  f_NW,  f_TE,  f_BW,  f_BE,
                 f_TW, f_TN, f_BS, f_BN, f_TS, f_TNE, f_TSW, f_TSE, f_TNW, f_BNE, f_BSW, f_BSE, f_BNW;
 
-		f_W    = (D.f[DIR_P00   ])[ke   ];
-		f_E    = (D.f[DIR_M00   ])[kw   ];
-		f_S    = (D.f[DIR_0P0   ])[kn   ];
-		f_N    = (D.f[DIR_0M0   ])[ks   ];
-		f_B    = (D.f[DIR_00P   ])[kt   ];
-		f_T    = (D.f[DIR_00M   ])[kb   ];
-		f_SW   = (D.f[DIR_PP0  ])[kne  ];
-		f_NE   = (D.f[DIR_MM0  ])[ksw  ];
-		f_NW   = (D.f[DIR_PM0  ])[kse  ];
-		f_SE   = (D.f[DIR_MP0  ])[knw  ];
-		f_BW   = (D.f[DIR_P0P  ])[kte  ];
-		f_TE   = (D.f[DIR_M0M  ])[kbw  ];
-		f_TW   = (D.f[DIR_P0M  ])[kbe  ];
-		f_BE   = (D.f[DIR_M0P  ])[ktw  ];
-		f_BS   = (D.f[DIR_0PP  ])[ktn  ];
-		f_TN   = (D.f[DIR_0MM  ])[kbs  ];
-		f_TS   = (D.f[DIR_0PM  ])[kbn  ];
-		f_BN   = (D.f[DIR_0MP  ])[kts  ];
-		f_BSW  = (D.f[DIR_PPP ])[ktne ];
-		f_BNE  = (D.f[DIR_MMP ])[ktsw ];
-		f_BNW  = (D.f[DIR_PMP ])[ktse ];
-		f_BSE  = (D.f[DIR_MPP ])[ktnw ];
-		f_TSW  = (D.f[DIR_PPM ])[kbne ];
-		f_TNE  = (D.f[DIR_MMM ])[kbsw ];
-		f_TNW  = (D.f[DIR_PMM ])[kbse ];
-		f_TSE  = (D.f[DIR_MPM ])[kbnw ];
+		f_W    = (D.f[DIR_P00])[ke   ];
+		f_E    = (D.f[DIR_M00])[kw   ];
+		f_S    = (D.f[DIR_0P0])[kn   ];
+		f_N    = (D.f[DIR_0M0])[ks   ];
+		f_B    = (D.f[DIR_00P])[kt   ];
+		f_T    = (D.f[DIR_00M])[kb   ];
+		f_SW   = (D.f[DIR_PP0])[kne  ];
+		f_NE   = (D.f[DIR_MM0])[ksw  ];
+		f_NW   = (D.f[DIR_PM0])[kse  ];
+		f_SE   = (D.f[DIR_MP0])[knw  ];
+		f_BW   = (D.f[DIR_P0P])[kte  ];
+		f_TE   = (D.f[DIR_M0M])[kbw  ];
+		f_TW   = (D.f[DIR_P0M])[kbe  ];
+		f_BE   = (D.f[DIR_M0P])[ktw  ];
+		f_BS   = (D.f[DIR_0PP])[ktn  ];
+		f_TN   = (D.f[DIR_0MM])[kbs  ];
+		f_TS   = (D.f[DIR_0PM])[kbn  ];
+		f_BN   = (D.f[DIR_0MP])[kts  ];
+		f_BSW  = (D.f[DIR_PPP])[ktne ];
+		f_BNE  = (D.f[DIR_MMP])[ktsw ];
+		f_BNW  = (D.f[DIR_PMP])[ktse ];
+		f_BSE  = (D.f[DIR_MPP])[ktnw ];
+		f_TSW  = (D.f[DIR_PPM])[kbne ];
+		f_TNE  = (D.f[DIR_MMM])[kbsw ];
+		f_TNW  = (D.f[DIR_PMM])[kbse ];
+		f_TSE  = (D.f[DIR_MPM])[kbnw ];
 		////////////////////////////////////////////////////////////////////////////////
 		double	OnE   = c0o1, OnW   = c0o1, OnN   = c0o1, OnS   = c0o1, OnT = c0o1, OnB = c0o1, 
 				OnNE  = c0o1, OnSW  = c0o1, OnSE  = c0o1, OnNW  = c0o1, 
@@ -282,69 +282,69 @@ __global__ void DragLiftPre27(   real* DD,
 											unsigned int* neighborX,
 											unsigned int* neighborY,
 											unsigned int* neighborZ,
-											unsigned int size_Mat, 
+											unsigned long long numberOfLBnodes, 
 											bool isEvenTimestep)
 {
 	Distributions27 D;
 	if (isEvenTimestep==true)
 	{
-		D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-		D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-		D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-		D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-		D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-		D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-		D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-		D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-		D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-		D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-		D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-		D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-		D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-		D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-		D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-		D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-		D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-		D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-		D.f[DIR_000] = &DD[DIR_000*size_Mat];
-		D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-		D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-		D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-		D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-		D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-		D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-		D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-		D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+		D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+		D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+		D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+		D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+		D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+		D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+		D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+		D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+		D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+		D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+		D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+		D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+		D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+		D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+		D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+		D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+		D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+		D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+		D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+		D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+		D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+		D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+		D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+		D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+		D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+		D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+		D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
 	} 
 	else
 	{
-		D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-		D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-		D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-		D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-		D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-		D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-		D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-		D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-		D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-		D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-		D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-		D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-		D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-		D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-		D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-		D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-		D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-		D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-		D.f[DIR_000] = &DD[DIR_000*size_Mat];
-		D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-		D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-		D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-		D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-		D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-		D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-		D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-		D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+		D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+		D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+		D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+		D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+		D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+		D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+		D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+		D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+		D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+		D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+		D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+		D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+		D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+		D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+		D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+		D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+		D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+		D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+		D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+		D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+		D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+		D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+		D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+		D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+		D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+		D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+		D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
 	}
 	////////////////////////////////////////////////////////////////////////////////
 	const unsigned  x = threadIdx.x;  // Globaler x-Index 
@@ -365,24 +365,24 @@ __global__ void DragLiftPre27(   real* DD,
 			*q_dirBE,  *q_dirTW,  *q_dirTN,  *q_dirBS,  *q_dirBN,  *q_dirTS,
 			*q_dirTNE, *q_dirTSW, *q_dirTSE, *q_dirTNW, *q_dirBNE, *q_dirBSW,
 			*q_dirBSE, *q_dirBNW; 
-		q_dirE   = &QQ[DIR_P00   * numberOfBCnodes];
-		q_dirW   = &QQ[DIR_M00   * numberOfBCnodes];
-		q_dirN   = &QQ[DIR_0P0   * numberOfBCnodes];
-		q_dirS   = &QQ[DIR_0M0   * numberOfBCnodes];
-		q_dirT   = &QQ[DIR_00P   * numberOfBCnodes];
-		q_dirB   = &QQ[DIR_00M   * numberOfBCnodes];
-		q_dirNE  = &QQ[DIR_PP0  * numberOfBCnodes];
-		q_dirSW  = &QQ[DIR_MM0  * numberOfBCnodes];
-		q_dirSE  = &QQ[DIR_PM0  * numberOfBCnodes];
-		q_dirNW  = &QQ[DIR_MP0  * numberOfBCnodes];
-		q_dirTE  = &QQ[DIR_P0P  * numberOfBCnodes];
-		q_dirBW  = &QQ[DIR_M0M  * numberOfBCnodes];
-		q_dirBE  = &QQ[DIR_P0M  * numberOfBCnodes];
-		q_dirTW  = &QQ[DIR_M0P  * numberOfBCnodes];
-		q_dirTN  = &QQ[DIR_0PP  * numberOfBCnodes];
-		q_dirBS  = &QQ[DIR_0MM  * numberOfBCnodes];
-		q_dirBN  = &QQ[DIR_0PM  * numberOfBCnodes];
-		q_dirTS  = &QQ[DIR_0MP  * numberOfBCnodes];
+		q_dirE   = &QQ[DIR_P00 * numberOfBCnodes];
+		q_dirW   = &QQ[DIR_M00 * numberOfBCnodes];
+		q_dirN   = &QQ[DIR_0P0 * numberOfBCnodes];
+		q_dirS   = &QQ[DIR_0M0 * numberOfBCnodes];
+		q_dirT   = &QQ[DIR_00P * numberOfBCnodes];
+		q_dirB   = &QQ[DIR_00M * numberOfBCnodes];
+		q_dirNE  = &QQ[DIR_PP0 * numberOfBCnodes];
+		q_dirSW  = &QQ[DIR_MM0 * numberOfBCnodes];
+		q_dirSE  = &QQ[DIR_PM0 * numberOfBCnodes];
+		q_dirNW  = &QQ[DIR_MP0 * numberOfBCnodes];
+		q_dirTE  = &QQ[DIR_P0P * numberOfBCnodes];
+		q_dirBW  = &QQ[DIR_M0M * numberOfBCnodes];
+		q_dirBE  = &QQ[DIR_P0M * numberOfBCnodes];
+		q_dirTW  = &QQ[DIR_M0P * numberOfBCnodes];
+		q_dirTN  = &QQ[DIR_0PP * numberOfBCnodes];
+		q_dirBS  = &QQ[DIR_0MM * numberOfBCnodes];
+		q_dirBN  = &QQ[DIR_0PM * numberOfBCnodes];
+		q_dirTS  = &QQ[DIR_0MP * numberOfBCnodes];
 		q_dirTNE = &QQ[DIR_PPP * numberOfBCnodes];
 		q_dirTSW = &QQ[DIR_MMP * numberOfBCnodes];
 		q_dirTSE = &QQ[DIR_PMP * numberOfBCnodes];
@@ -425,32 +425,32 @@ __global__ void DragLiftPre27(   real* DD,
 		real f_E,  f_W,  f_N,  f_S,  f_T,  f_B,   f_NE,  f_SW,  f_SE,  f_NW,  f_TE,  f_BW,  f_BE,
                 f_TW, f_TN, f_BS, f_BN, f_TS, f_TNE, f_TSW, f_TSE, f_TNW, f_BNE, f_BSW, f_BSE, f_BNW;
 
-		f_E   = (D.f[DIR_P00   ])[ke   ];
-		f_W   = (D.f[DIR_M00   ])[kw   ];
-		f_N   = (D.f[DIR_0P0   ])[kn   ];
-		f_S   = (D.f[DIR_0M0   ])[ks   ];
-		f_T   = (D.f[DIR_00P   ])[kt   ];
-		f_B   = (D.f[DIR_00M   ])[kb   ];
-		f_NE  = (D.f[DIR_PP0  ])[kne  ];
-		f_SW  = (D.f[DIR_MM0  ])[ksw  ];
-		f_SE  = (D.f[DIR_PM0  ])[kse  ];
-		f_NW  = (D.f[DIR_MP0  ])[knw  ];
-		f_TE  = (D.f[DIR_P0P  ])[kte  ];
-		f_BW  = (D.f[DIR_M0M  ])[kbw  ];
-		f_BE  = (D.f[DIR_P0M  ])[kbe  ];
-		f_TW  = (D.f[DIR_M0P  ])[ktw  ];
-		f_TN  = (D.f[DIR_0PP  ])[ktn  ];
-		f_BS  = (D.f[DIR_0MM  ])[kbs  ];
-		f_BN  = (D.f[DIR_0PM  ])[kbn  ];
-		f_TS  = (D.f[DIR_0MP  ])[kts  ];
-		f_TNE = (D.f[DIR_PPP ])[ktne ];
-		f_TSW = (D.f[DIR_MMP ])[ktsw ];
-		f_TSE = (D.f[DIR_PMP ])[ktse ];
-		f_TNW = (D.f[DIR_MPP ])[ktnw ];
-		f_BNE = (D.f[DIR_PPM ])[kbne ];
-		f_BSW = (D.f[DIR_MMM ])[kbsw ];
-		f_BSE = (D.f[DIR_PMM ])[kbse ];
-		f_BNW = (D.f[DIR_MPM ])[kbnw ];
+		f_E   = (D.f[DIR_P00])[ke   ];
+		f_W   = (D.f[DIR_M00])[kw   ];
+		f_N   = (D.f[DIR_0P0])[kn   ];
+		f_S   = (D.f[DIR_0M0])[ks   ];
+		f_T   = (D.f[DIR_00P])[kt   ];
+		f_B   = (D.f[DIR_00M])[kb   ];
+		f_NE  = (D.f[DIR_PP0])[kne  ];
+		f_SW  = (D.f[DIR_MM0])[ksw  ];
+		f_SE  = (D.f[DIR_PM0])[kse  ];
+		f_NW  = (D.f[DIR_MP0])[knw  ];
+		f_TE  = (D.f[DIR_P0P])[kte  ];
+		f_BW  = (D.f[DIR_M0M])[kbw  ];
+		f_BE  = (D.f[DIR_P0M])[kbe  ];
+		f_TW  = (D.f[DIR_M0P])[ktw  ];
+		f_TN  = (D.f[DIR_0PP])[ktn  ];
+		f_BS  = (D.f[DIR_0MM])[kbs  ];
+		f_BN  = (D.f[DIR_0PM])[kbn  ];
+		f_TS  = (D.f[DIR_0MP])[kts  ];
+		f_TNE = (D.f[DIR_PPP])[ktne ];
+		f_TSW = (D.f[DIR_MMP])[ktsw ];
+		f_TSE = (D.f[DIR_PMP])[ktse ];
+		f_TNW = (D.f[DIR_MPP])[ktnw ];
+		f_BNE = (D.f[DIR_PPM])[kbne ];
+		f_BSW = (D.f[DIR_MMM])[kbsw ];
+		f_BSE = (D.f[DIR_PMM])[kbse ];
+		f_BNW = (D.f[DIR_MPM])[kbnw ];
 		 ////////////////////////////////////////////////////////////////////////////////
 		double	OnE   = c0o1, OnW   = c0o1, OnN   = c0o1, OnS   = c0o1, OnT = c0o1, OnB = c0o1, 
 				OnNE  = c0o1, OnSW  = c0o1, OnSE  = c0o1, OnNW  = c0o1, 
diff --git a/src/gpu/VirtualFluids_GPU/GPU/EnstrophyAnalyzer.cu b/src/gpu/VirtualFluids_GPU/GPU/EnstrophyAnalyzer.cu
index acd62b46c5666fc5f621c3772438e42b7ebef5c6..93879d73a32458d5403fd3fd16e68e0fcea7753d 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/EnstrophyAnalyzer.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/EnstrophyAnalyzer.cu
@@ -13,7 +13,7 @@
 
 #include <iomanip>
 
-//#include "Core/Logger/Logger.h"
+#include "cuda/CudaGrid.h"
 
 #include "Parameter/Parameter.h"
 // includes, kernels
@@ -22,7 +22,7 @@
 
 using namespace vf::lbm::constant;
 
-__global__                 void enstrophyKernel  ( real* veloX, real* veloY, real* veloZ, real* rho, uint* neighborX, uint* neighborY, uint* neighborZ, uint* neighborWSB, uint* geo, real* enstrophy, uint* isFluid, uint size_Mat );
+__global__                 void enstrophyKernel  ( real* veloX, real* veloY, real* veloZ, real* rho, uint* neighborX, uint* neighborY, uint* neighborZ, uint* neighborWSB, uint* geo, real* enstrophy, uint* isFluid, unsigned long long numberOfLBnodes );
 
 __host__ __device__ inline void enstrophyFunction( real* veloX, real* veloY, real* veloZ, real* rho, uint* neighborX, uint* neighborY, uint* neighborZ, uint* neighborWSB, uint* geo, real* enstrophy, uint* isFluid, uint index );
 
@@ -32,55 +32,40 @@ bool EnstrophyAnalyzer::run(uint iter)
 {
     if( iter % this->analyzeIter != 0 ) return false;
 
-	int lev = 0;
-	int size_Mat = this->para->getParD(lev)->numberOfNodes;
-	
-	thrust::device_vector<real> enstrophy( size_Mat, c0o1 );
-    thrust::device_vector<uint> isFluid  ( size_Mat, 0);
-
-	unsigned int numberOfThreads = 128;
-    int Grid = (size_Mat / numberOfThreads)+1;
-    int Grid1, Grid2;
-    if (Grid>512)
-    {
-       Grid1 = 512;
-       Grid2 = (Grid/Grid1)+1;
-    } 
-    else
-    {
-       Grid1 = 1;
-       Grid2 = Grid;
-    }
-    dim3 grid(Grid1, Grid2);
-    dim3 threads(numberOfThreads, 1, 1 );
-
-    LBCalcMacCompSP27<<< grid, threads >>> (para->getParD(lev)->velocityX,
-										    para->getParD(lev)->velocityY,
-										    para->getParD(lev)->velocityZ,
-										    para->getParD(lev)->rho,
-										    para->getParD(lev)->pressure,
-										    para->getParD(lev)->typeOfGridNode,
-										    para->getParD(lev)->neighborX,
-										    para->getParD(lev)->neighborY,
-										    para->getParD(lev)->neighborZ,
-										    para->getParD(lev)->numberOfNodes,
-										    para->getParD(lev)->distributions.f[0],
-										    para->getParD(lev)->isEvenTimestep); 
-	//cudaDeviceSynchronize();
-	getLastCudaError("LBCalcMacSP27 execution failed"); 
-
-	enstrophyKernel <<< grid, threads >>> ( para->getParD(lev)->velocityX,
-											para->getParD(lev)->velocityY, 
-											para->getParD(lev)->velocityZ, 
-											para->getParD(lev)->rho, 
-											para->getParD(lev)->neighborX,
-											para->getParD(lev)->neighborY,
-											para->getParD(lev)->neighborZ,
-											para->getParD(lev)->neighborInverse,
-											para->getParD(lev)->typeOfGridNode,
-											enstrophy.data().get(), 
-                                            isFluid.data().get(),
-											size_Mat);
+    int lev = 0;
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(para->getParD(lev)->numberofthreads, para->getParD(lev)->numberOfNodes);
+
+    thrust::device_vector<real> enstrophy( this->para->getParD(lev)->numberOfNodes, c0o1);
+    thrust::device_vector<uint> isFluid  ( this->para->getParD(lev)->numberOfNodes, 0);
+
+    LBCalcMacCompSP27<<< grid.grid, grid.threads >>>(
+        para->getParD(lev)->velocityX,
+        para->getParD(lev)->velocityY,
+        para->getParD(lev)->velocityZ,
+        para->getParD(lev)->rho,
+        para->getParD(lev)->pressure,
+        para->getParD(lev)->typeOfGridNode,
+        para->getParD(lev)->neighborX,
+        para->getParD(lev)->neighborY,
+        para->getParD(lev)->neighborZ,
+        para->getParD(lev)->numberOfNodes,
+        para->getParD(lev)->distributions.f[0],
+        para->getParD(lev)->isEvenTimestep); 
+    getLastCudaError("LBCalcMacCompSP27 execution failed");
+
+    enstrophyKernel<<< grid.grid, grid.threads >>>(
+        para->getParD(lev)->velocityX,
+        para->getParD(lev)->velocityY, 
+        para->getParD(lev)->velocityZ, 
+        para->getParD(lev)->rho, 
+        para->getParD(lev)->neighborX,
+        para->getParD(lev)->neighborY,
+        para->getParD(lev)->neighborZ,
+        para->getParD(lev)->neighborInverse,
+        para->getParD(lev)->typeOfGridNode,
+        enstrophy.data().get(), 
+        isFluid.data().get(),
+        para->getParD(lev)->numberOfNodes);
 	cudaDeviceSynchronize(); 
 	getLastCudaError("enstrophyKernel execution failed");
 
@@ -97,7 +82,7 @@ bool EnstrophyAnalyzer::run(uint iter)
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-__global__ void enstrophyKernel(real* veloX, real* veloY, real* veloZ, real* rho, uint* neighborX, uint* neighborY, uint* neighborZ, uint* neighborWSB, uint* geo, real* enstrophy, uint* isFluid, uint size_Mat)
+__global__ void enstrophyKernel(real* veloX, real* veloY, real* veloZ, real* rho, uint* neighborX, uint* neighborY, uint* neighborZ, uint* neighborWSB, uint* geo, real* enstrophy, uint* isFluid, unsigned long long numberOfLBnodes)
 {
     //////////////////////////////////////////////////////////////////////////
     const uint x = threadIdx.x;  // Globaler x-Index 
@@ -113,7 +98,7 @@ __global__ void enstrophyKernel(real* veloX, real* veloY, real* veloZ, real* rho
 
     //if( index % 34 == 0 || index % 34 == 33 ) return;
 
-    if( index >= size_Mat) return;
+    if( index >= (uint)numberOfLBnodes) return;
 
 	unsigned int BC;
 	BC = geo[index];
diff --git a/src/gpu/VirtualFluids_GPU/GPU/ExchangeData27.cu b/src/gpu/VirtualFluids_GPU/GPU/ExchangeData27.cu
index 5470da46342c85e57370227313c8c82674a17e6e..4ced64c0152bdbbd9752f736e2edca2c51fbc2ff 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/ExchangeData27.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/ExchangeData27.cu
@@ -14,7 +14,7 @@ __global__ void getSendFsPost27(real* DD,
                                            unsigned int* neighborX,
                                            unsigned int* neighborY,
                                            unsigned int* neighborZ,
-                                           unsigned int size_Mat, 
+                                           unsigned long long numberOfLBnodes, 
                                            bool isEvenTimestep)
 {
    ////////////////////////////////////////////////////////////////////////////////
@@ -65,150 +65,150 @@ __global__ void getSendFsPost27(real* DD,
       Distributions27 D;
       if (isEvenTimestep==true)
       {
-         D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+         D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
       } 
       else
       {
-         D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+         D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
       }
       //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 	  //set Pointer for Buffer Fs
       Distributions27 Dbuff;
-      Dbuff.f[DIR_P00   ] = &bufferFs[DIR_P00   *buffmax];
-      Dbuff.f[DIR_M00   ] = &bufferFs[DIR_M00   *buffmax];
-      Dbuff.f[DIR_0P0   ] = &bufferFs[DIR_0P0   *buffmax];
-      Dbuff.f[DIR_0M0   ] = &bufferFs[DIR_0M0   *buffmax];
-      Dbuff.f[DIR_00P   ] = &bufferFs[DIR_00P   *buffmax];
-      Dbuff.f[DIR_00M   ] = &bufferFs[DIR_00M   *buffmax];
-      Dbuff.f[DIR_PP0  ] = &bufferFs[DIR_PP0  *buffmax];
-      Dbuff.f[DIR_MM0  ] = &bufferFs[DIR_MM0  *buffmax];
-      Dbuff.f[DIR_PM0  ] = &bufferFs[DIR_PM0  *buffmax];
-      Dbuff.f[DIR_MP0  ] = &bufferFs[DIR_MP0  *buffmax];
-      Dbuff.f[DIR_P0P  ] = &bufferFs[DIR_P0P  *buffmax];
-      Dbuff.f[DIR_M0M  ] = &bufferFs[DIR_M0M  *buffmax];
-      Dbuff.f[DIR_P0M  ] = &bufferFs[DIR_P0M  *buffmax];
-      Dbuff.f[DIR_M0P  ] = &bufferFs[DIR_M0P  *buffmax];
-      Dbuff.f[DIR_0PP  ] = &bufferFs[DIR_0PP  *buffmax];
-      Dbuff.f[DIR_0MM  ] = &bufferFs[DIR_0MM  *buffmax];
-      Dbuff.f[DIR_0PM  ] = &bufferFs[DIR_0PM  *buffmax];
-      Dbuff.f[DIR_0MP  ] = &bufferFs[DIR_0MP  *buffmax];
-      Dbuff.f[DIR_000] = &bufferFs[DIR_000*buffmax];
-      Dbuff.f[DIR_PPP ] = &bufferFs[DIR_PPP *buffmax];
-      Dbuff.f[DIR_MMP ] = &bufferFs[DIR_MMP *buffmax];
-      Dbuff.f[DIR_PMP ] = &bufferFs[DIR_PMP *buffmax];
-      Dbuff.f[DIR_MPP ] = &bufferFs[DIR_MPP *buffmax];
-      Dbuff.f[DIR_PPM ] = &bufferFs[DIR_PPM *buffmax];
-      Dbuff.f[DIR_MMM ] = &bufferFs[DIR_MMM *buffmax];
-      Dbuff.f[DIR_PMM ] = &bufferFs[DIR_PMM *buffmax];
-      Dbuff.f[DIR_MPM ] = &bufferFs[DIR_MPM *buffmax];
+      Dbuff.f[DIR_P00] = &bufferFs[DIR_P00 * buffmax];
+      Dbuff.f[DIR_M00] = &bufferFs[DIR_M00 * buffmax];
+      Dbuff.f[DIR_0P0] = &bufferFs[DIR_0P0 * buffmax];
+      Dbuff.f[DIR_0M0] = &bufferFs[DIR_0M0 * buffmax];
+      Dbuff.f[DIR_00P] = &bufferFs[DIR_00P * buffmax];
+      Dbuff.f[DIR_00M] = &bufferFs[DIR_00M * buffmax];
+      Dbuff.f[DIR_PP0] = &bufferFs[DIR_PP0 * buffmax];
+      Dbuff.f[DIR_MM0] = &bufferFs[DIR_MM0 * buffmax];
+      Dbuff.f[DIR_PM0] = &bufferFs[DIR_PM0 * buffmax];
+      Dbuff.f[DIR_MP0] = &bufferFs[DIR_MP0 * buffmax];
+      Dbuff.f[DIR_P0P] = &bufferFs[DIR_P0P * buffmax];
+      Dbuff.f[DIR_M0M] = &bufferFs[DIR_M0M * buffmax];
+      Dbuff.f[DIR_P0M] = &bufferFs[DIR_P0M * buffmax];
+      Dbuff.f[DIR_M0P] = &bufferFs[DIR_M0P * buffmax];
+      Dbuff.f[DIR_0PP] = &bufferFs[DIR_0PP * buffmax];
+      Dbuff.f[DIR_0MM] = &bufferFs[DIR_0MM * buffmax];
+      Dbuff.f[DIR_0PM] = &bufferFs[DIR_0PM * buffmax];
+      Dbuff.f[DIR_0MP] = &bufferFs[DIR_0MP * buffmax];
+      Dbuff.f[DIR_000] = &bufferFs[DIR_000 * buffmax];
+      Dbuff.f[DIR_PPP] = &bufferFs[DIR_PPP * buffmax];
+      Dbuff.f[DIR_MMP] = &bufferFs[DIR_MMP * buffmax];
+      Dbuff.f[DIR_PMP] = &bufferFs[DIR_PMP * buffmax];
+      Dbuff.f[DIR_MPP] = &bufferFs[DIR_MPP * buffmax];
+      Dbuff.f[DIR_PPM] = &bufferFs[DIR_PPM * buffmax];
+      Dbuff.f[DIR_MMM] = &bufferFs[DIR_MMM * buffmax];
+      Dbuff.f[DIR_PMM] = &bufferFs[DIR_PMM * buffmax];
+      Dbuff.f[DIR_MPM] = &bufferFs[DIR_MPM * buffmax];
       //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 	  //copy to buffer
-      //(Dbuff.f[DIR_P00   ])[k] = (D.f[DIR_P00   ])[ke   ];
-      //(Dbuff.f[DIR_M00   ])[k] = (D.f[DIR_M00   ])[kw   ];
-      //(Dbuff.f[DIR_0P0   ])[k] = (D.f[DIR_0P0   ])[kn   ];
-      //(Dbuff.f[DIR_0M0   ])[k] = (D.f[DIR_0M0   ])[ks   ];
-      //(Dbuff.f[DIR_00P   ])[k] = (D.f[DIR_00P   ])[kt   ];
-      //(Dbuff.f[DIR_00M   ])[k] = (D.f[DIR_00M   ])[kb   ];
-      //(Dbuff.f[DIR_PP0  ])[k] = (D.f[DIR_PP0  ])[kne  ];
-      //(Dbuff.f[DIR_MM0  ])[k] = (D.f[DIR_MM0  ])[ksw  ];
-      //(Dbuff.f[DIR_PM0  ])[k] = (D.f[DIR_PM0  ])[kse  ];
-      //(Dbuff.f[DIR_MP0  ])[k] = (D.f[DIR_MP0  ])[knw  ];
-      //(Dbuff.f[DIR_P0P  ])[k] = (D.f[DIR_P0P  ])[kte  ];
-      //(Dbuff.f[DIR_M0M  ])[k] = (D.f[DIR_M0M  ])[kbw  ];
-      //(Dbuff.f[DIR_P0M  ])[k] = (D.f[DIR_P0M  ])[kbe  ];
-      //(Dbuff.f[DIR_M0P  ])[k] = (D.f[DIR_M0P  ])[ktw  ];
-      //(Dbuff.f[DIR_0PP  ])[k] = (D.f[DIR_0PP  ])[ktn  ];
-      //(Dbuff.f[DIR_0MM  ])[k] = (D.f[DIR_0MM  ])[kbs  ];
-      //(Dbuff.f[DIR_0PM  ])[k] = (D.f[DIR_0PM  ])[kbn  ];
-      //(Dbuff.f[DIR_0MP  ])[k] = (D.f[DIR_0MP  ])[kts  ];
+      //(Dbuff.f[DIR_P00])[k] = (D.f[DIR_P00])[ke   ];
+      //(Dbuff.f[DIR_M00])[k] = (D.f[DIR_M00])[kw   ];
+      //(Dbuff.f[DIR_0P0])[k] = (D.f[DIR_0P0])[kn   ];
+      //(Dbuff.f[DIR_0M0])[k] = (D.f[DIR_0M0])[ks   ];
+      //(Dbuff.f[DIR_00P])[k] = (D.f[DIR_00P])[kt   ];
+      //(Dbuff.f[DIR_00M])[k] = (D.f[DIR_00M])[kb   ];
+      //(Dbuff.f[DIR_PP0])[k] = (D.f[DIR_PP0])[kne  ];
+      //(Dbuff.f[DIR_MM0])[k] = (D.f[DIR_MM0])[ksw  ];
+      //(Dbuff.f[DIR_PM0])[k] = (D.f[DIR_PM0])[kse  ];
+      //(Dbuff.f[DIR_MP0])[k] = (D.f[DIR_MP0])[knw  ];
+      //(Dbuff.f[DIR_P0P])[k] = (D.f[DIR_P0P])[kte  ];
+      //(Dbuff.f[DIR_M0M])[k] = (D.f[DIR_M0M])[kbw  ];
+      //(Dbuff.f[DIR_P0M])[k] = (D.f[DIR_P0M])[kbe  ];
+      //(Dbuff.f[DIR_M0P])[k] = (D.f[DIR_M0P])[ktw  ];
+      //(Dbuff.f[DIR_0PP])[k] = (D.f[DIR_0PP])[ktn  ];
+      //(Dbuff.f[DIR_0MM])[k] = (D.f[DIR_0MM])[kbs  ];
+      //(Dbuff.f[DIR_0PM])[k] = (D.f[DIR_0PM])[kbn  ];
+      //(Dbuff.f[DIR_0MP])[k] = (D.f[DIR_0MP])[kts  ];
       //(Dbuff.f[DIR_000])[k] = (D.f[DIR_000])[kzero];
-      //(Dbuff.f[DIR_PPP ])[k] = (D.f[DIR_PPP ])[ktne ];
-      //(Dbuff.f[DIR_MMP ])[k] = (D.f[DIR_MMP ])[ktsw ];
-      //(Dbuff.f[DIR_PMP ])[k] = (D.f[DIR_PMP ])[ktse ];
-      //(Dbuff.f[DIR_MPP ])[k] = (D.f[DIR_MPP ])[ktnw ];
-      //(Dbuff.f[DIR_PPM ])[k] = (D.f[DIR_PPM ])[kbne ];
-      //(Dbuff.f[DIR_MMM ])[k] = (D.f[DIR_MMM ])[kbsw ];
-      //(Dbuff.f[DIR_PMM ])[k] = (D.f[DIR_PMM ])[kbse ];
-      //(Dbuff.f[DIR_MPM ])[k] = (D.f[DIR_MPM ])[kbnw ];
-      (Dbuff.f[DIR_P00   ])[k] = (D.f[DIR_M00   ])[kw   ];
-      (Dbuff.f[DIR_M00   ])[k] = (D.f[DIR_P00   ])[ke   ];
-      (Dbuff.f[DIR_0P0   ])[k] = (D.f[DIR_0M0   ])[ks   ];
-      (Dbuff.f[DIR_0M0   ])[k] = (D.f[DIR_0P0   ])[kn   ];
-      (Dbuff.f[DIR_00P   ])[k] = (D.f[DIR_00M   ])[kb   ];
-      (Dbuff.f[DIR_00M   ])[k] = (D.f[DIR_00P   ])[kt   ];
-      (Dbuff.f[DIR_PP0  ])[k] = (D.f[DIR_MM0  ])[ksw  ];
-      (Dbuff.f[DIR_MM0  ])[k] = (D.f[DIR_PP0  ])[kne  ];
-      (Dbuff.f[DIR_PM0  ])[k] = (D.f[DIR_MP0  ])[knw  ];
-      (Dbuff.f[DIR_MP0  ])[k] = (D.f[DIR_PM0  ])[kse  ];
-      (Dbuff.f[DIR_P0P  ])[k] = (D.f[DIR_M0M  ])[kbw  ];
-      (Dbuff.f[DIR_M0M  ])[k] = (D.f[DIR_P0P  ])[kte  ];
-      (Dbuff.f[DIR_P0M  ])[k] = (D.f[DIR_M0P  ])[ktw  ];
-      (Dbuff.f[DIR_M0P  ])[k] = (D.f[DIR_P0M  ])[kbe  ];
-      (Dbuff.f[DIR_0PP  ])[k] = (D.f[DIR_0MM  ])[kbs  ];
-      (Dbuff.f[DIR_0MM  ])[k] = (D.f[DIR_0PP  ])[ktn  ];
-      (Dbuff.f[DIR_0PM  ])[k] = (D.f[DIR_0MP  ])[kts  ];
-      (Dbuff.f[DIR_0MP  ])[k] = (D.f[DIR_0PM  ])[kbn  ];
+      //(Dbuff.f[DIR_PPP])[k] = (D.f[DIR_PPP])[ktne ];
+      //(Dbuff.f[DIR_MMP])[k] = (D.f[DIR_MMP])[ktsw ];
+      //(Dbuff.f[DIR_PMP])[k] = (D.f[DIR_PMP])[ktse ];
+      //(Dbuff.f[DIR_MPP])[k] = (D.f[DIR_MPP])[ktnw ];
+      //(Dbuff.f[DIR_PPM])[k] = (D.f[DIR_PPM])[kbne ];
+      //(Dbuff.f[DIR_MMM])[k] = (D.f[DIR_MMM])[kbsw ];
+      //(Dbuff.f[DIR_PMM])[k] = (D.f[DIR_PMM])[kbse ];
+      //(Dbuff.f[DIR_MPM])[k] = (D.f[DIR_MPM])[kbnw ];
+      (Dbuff.f[DIR_P00])[k] = (D.f[DIR_M00])[kw   ];
+      (Dbuff.f[DIR_M00])[k] = (D.f[DIR_P00])[ke   ];
+      (Dbuff.f[DIR_0P0])[k] = (D.f[DIR_0M0])[ks   ];
+      (Dbuff.f[DIR_0M0])[k] = (D.f[DIR_0P0])[kn   ];
+      (Dbuff.f[DIR_00P])[k] = (D.f[DIR_00M])[kb   ];
+      (Dbuff.f[DIR_00M])[k] = (D.f[DIR_00P])[kt   ];
+      (Dbuff.f[DIR_PP0])[k] = (D.f[DIR_MM0])[ksw  ];
+      (Dbuff.f[DIR_MM0])[k] = (D.f[DIR_PP0])[kne  ];
+      (Dbuff.f[DIR_PM0])[k] = (D.f[DIR_MP0])[knw  ];
+      (Dbuff.f[DIR_MP0])[k] = (D.f[DIR_PM0])[kse  ];
+      (Dbuff.f[DIR_P0P])[k] = (D.f[DIR_M0M])[kbw  ];
+      (Dbuff.f[DIR_M0M])[k] = (D.f[DIR_P0P])[kte  ];
+      (Dbuff.f[DIR_P0M])[k] = (D.f[DIR_M0P])[ktw  ];
+      (Dbuff.f[DIR_M0P])[k] = (D.f[DIR_P0M])[kbe  ];
+      (Dbuff.f[DIR_0PP])[k] = (D.f[DIR_0MM])[kbs  ];
+      (Dbuff.f[DIR_0MM])[k] = (D.f[DIR_0PP])[ktn  ];
+      (Dbuff.f[DIR_0PM])[k] = (D.f[DIR_0MP])[kts  ];
+      (Dbuff.f[DIR_0MP])[k] = (D.f[DIR_0PM])[kbn  ];
       (Dbuff.f[DIR_000])[k] = (D.f[DIR_000])[kzero];
-      (Dbuff.f[DIR_PPP ])[k] = (D.f[DIR_MMM ])[kbsw ];
-      (Dbuff.f[DIR_MMP ])[k] = (D.f[DIR_PPM ])[kbne ];
-      (Dbuff.f[DIR_PMP ])[k] = (D.f[DIR_MPM ])[kbnw ];
-      (Dbuff.f[DIR_MPP ])[k] = (D.f[DIR_PMM ])[kbse ];
-      (Dbuff.f[DIR_PPM ])[k] = (D.f[DIR_MMP ])[ktsw ];
-      (Dbuff.f[DIR_MMM ])[k] = (D.f[DIR_PPP ])[ktne ];
-      (Dbuff.f[DIR_PMM ])[k] = (D.f[DIR_MPP ])[ktnw ];
-      (Dbuff.f[DIR_MPM ])[k] = (D.f[DIR_PMP ])[ktse ];
+      (Dbuff.f[DIR_PPP])[k] = (D.f[DIR_MMM])[kbsw ];
+      (Dbuff.f[DIR_MMP])[k] = (D.f[DIR_PPM])[kbne ];
+      (Dbuff.f[DIR_PMP])[k] = (D.f[DIR_MPM])[kbnw ];
+      (Dbuff.f[DIR_MPP])[k] = (D.f[DIR_PMM])[kbse ];
+      (Dbuff.f[DIR_PPM])[k] = (D.f[DIR_MMP])[ktsw ];
+      (Dbuff.f[DIR_MMM])[k] = (D.f[DIR_PPP])[ktne ];
+      (Dbuff.f[DIR_PMM])[k] = (D.f[DIR_MPP])[ktnw ];
+      (Dbuff.f[DIR_MPM])[k] = (D.f[DIR_PMP])[ktse ];
    }
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -250,7 +250,7 @@ __global__ void setRecvFsPost27(real* DD,
                                            unsigned int* neighborX,
                                            unsigned int* neighborY,
                                            unsigned int* neighborZ,
-                                           unsigned int size_Mat, 
+                                           unsigned long long numberOfLBnodes, 
                                            bool isEvenTimestep)
 {
    ////////////////////////////////////////////////////////////////////////////////
@@ -301,150 +301,150 @@ __global__ void setRecvFsPost27(real* DD,
       Distributions27 D;
       if (isEvenTimestep==true)
       {
-         D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+         D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
       } 
       else
       {
-         D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+         D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
       }
       //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 	  //set Pointer for Buffer Fs
       Distributions27 Dbuff;
-      Dbuff.f[DIR_P00   ] = &bufferFs[DIR_P00   *buffmax];
-      Dbuff.f[DIR_M00   ] = &bufferFs[DIR_M00   *buffmax];
-      Dbuff.f[DIR_0P0   ] = &bufferFs[DIR_0P0   *buffmax];
-      Dbuff.f[DIR_0M0   ] = &bufferFs[DIR_0M0   *buffmax];
-      Dbuff.f[DIR_00P   ] = &bufferFs[DIR_00P   *buffmax];
-      Dbuff.f[DIR_00M   ] = &bufferFs[DIR_00M   *buffmax];
-      Dbuff.f[DIR_PP0  ] = &bufferFs[DIR_PP0  *buffmax];
-      Dbuff.f[DIR_MM0  ] = &bufferFs[DIR_MM0  *buffmax];
-      Dbuff.f[DIR_PM0  ] = &bufferFs[DIR_PM0  *buffmax];
-      Dbuff.f[DIR_MP0  ] = &bufferFs[DIR_MP0  *buffmax];
-      Dbuff.f[DIR_P0P  ] = &bufferFs[DIR_P0P  *buffmax];
-      Dbuff.f[DIR_M0M  ] = &bufferFs[DIR_M0M  *buffmax];
-      Dbuff.f[DIR_P0M  ] = &bufferFs[DIR_P0M  *buffmax];
-      Dbuff.f[DIR_M0P  ] = &bufferFs[DIR_M0P  *buffmax];
-      Dbuff.f[DIR_0PP  ] = &bufferFs[DIR_0PP  *buffmax];
-      Dbuff.f[DIR_0MM  ] = &bufferFs[DIR_0MM  *buffmax];
-      Dbuff.f[DIR_0PM  ] = &bufferFs[DIR_0PM  *buffmax];
-      Dbuff.f[DIR_0MP  ] = &bufferFs[DIR_0MP  *buffmax];
-      Dbuff.f[DIR_000] = &bufferFs[DIR_000*buffmax];
-      Dbuff.f[DIR_PPP ] = &bufferFs[DIR_PPP *buffmax];
-      Dbuff.f[DIR_MMP ] = &bufferFs[DIR_MMP *buffmax];
-      Dbuff.f[DIR_PMP ] = &bufferFs[DIR_PMP *buffmax];
-      Dbuff.f[DIR_MPP ] = &bufferFs[DIR_MPP *buffmax];
-      Dbuff.f[DIR_PPM ] = &bufferFs[DIR_PPM *buffmax];
-      Dbuff.f[DIR_MMM ] = &bufferFs[DIR_MMM *buffmax];
-      Dbuff.f[DIR_PMM ] = &bufferFs[DIR_PMM *buffmax];
-      Dbuff.f[DIR_MPM ] = &bufferFs[DIR_MPM *buffmax];
+      Dbuff.f[DIR_P00] = &bufferFs[DIR_P00 * buffmax];
+      Dbuff.f[DIR_M00] = &bufferFs[DIR_M00 * buffmax];
+      Dbuff.f[DIR_0P0] = &bufferFs[DIR_0P0 * buffmax];
+      Dbuff.f[DIR_0M0] = &bufferFs[DIR_0M0 * buffmax];
+      Dbuff.f[DIR_00P] = &bufferFs[DIR_00P * buffmax];
+      Dbuff.f[DIR_00M] = &bufferFs[DIR_00M * buffmax];
+      Dbuff.f[DIR_PP0] = &bufferFs[DIR_PP0 * buffmax];
+      Dbuff.f[DIR_MM0] = &bufferFs[DIR_MM0 * buffmax];
+      Dbuff.f[DIR_PM0] = &bufferFs[DIR_PM0 * buffmax];
+      Dbuff.f[DIR_MP0] = &bufferFs[DIR_MP0 * buffmax];
+      Dbuff.f[DIR_P0P] = &bufferFs[DIR_P0P * buffmax];
+      Dbuff.f[DIR_M0M] = &bufferFs[DIR_M0M * buffmax];
+      Dbuff.f[DIR_P0M] = &bufferFs[DIR_P0M * buffmax];
+      Dbuff.f[DIR_M0P] = &bufferFs[DIR_M0P * buffmax];
+      Dbuff.f[DIR_0PP] = &bufferFs[DIR_0PP * buffmax];
+      Dbuff.f[DIR_0MM] = &bufferFs[DIR_0MM * buffmax];
+      Dbuff.f[DIR_0PM] = &bufferFs[DIR_0PM * buffmax];
+      Dbuff.f[DIR_0MP] = &bufferFs[DIR_0MP * buffmax];
+      Dbuff.f[DIR_000] = &bufferFs[DIR_000 * buffmax];
+      Dbuff.f[DIR_PPP] = &bufferFs[DIR_PPP * buffmax];
+      Dbuff.f[DIR_MMP] = &bufferFs[DIR_MMP * buffmax];
+      Dbuff.f[DIR_PMP] = &bufferFs[DIR_PMP * buffmax];
+      Dbuff.f[DIR_MPP] = &bufferFs[DIR_MPP * buffmax];
+      Dbuff.f[DIR_PPM] = &bufferFs[DIR_PPM * buffmax];
+      Dbuff.f[DIR_MMM] = &bufferFs[DIR_MMM * buffmax];
+      Dbuff.f[DIR_PMM] = &bufferFs[DIR_PMM * buffmax];
+      Dbuff.f[DIR_MPM] = &bufferFs[DIR_MPM * buffmax];
       //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 	  //copy from buffer
-      //(D.f[DIR_P00   ])[ke   ] = (Dbuff.f[DIR_P00   ])[k];
-      //(D.f[DIR_M00   ])[kw   ] = (Dbuff.f[DIR_M00   ])[k];
-      //(D.f[DIR_0P0   ])[kn   ] = (Dbuff.f[DIR_0P0   ])[k];
-      //(D.f[DIR_0M0   ])[ks   ] = (Dbuff.f[DIR_0M0   ])[k];
-      //(D.f[DIR_00P   ])[kt   ] = (Dbuff.f[DIR_00P   ])[k];
-      //(D.f[DIR_00M   ])[kb   ] = (Dbuff.f[DIR_00M   ])[k];
-      //(D.f[DIR_PP0  ])[kne  ] = (Dbuff.f[DIR_PP0  ])[k];
-      //(D.f[DIR_MM0  ])[ksw  ] = (Dbuff.f[DIR_MM0  ])[k];
-      //(D.f[DIR_PM0  ])[kse  ] = (Dbuff.f[DIR_PM0  ])[k];
-      //(D.f[DIR_MP0  ])[knw  ] = (Dbuff.f[DIR_MP0  ])[k];
-      //(D.f[DIR_P0P  ])[kte  ] = (Dbuff.f[DIR_P0P  ])[k];
-      //(D.f[DIR_M0M  ])[kbw  ] = (Dbuff.f[DIR_M0M  ])[k];
-      //(D.f[DIR_P0M  ])[kbe  ] = (Dbuff.f[DIR_P0M  ])[k];
-      //(D.f[DIR_M0P  ])[ktw  ] = (Dbuff.f[DIR_M0P  ])[k];
-      //(D.f[DIR_0PP  ])[ktn  ] = (Dbuff.f[DIR_0PP  ])[k];
-      //(D.f[DIR_0MM  ])[kbs  ] = (Dbuff.f[DIR_0MM  ])[k];
-      //(D.f[DIR_0PM  ])[kbn  ] = (Dbuff.f[DIR_0PM  ])[k];
-      //(D.f[DIR_0MP  ])[kts  ] = (Dbuff.f[DIR_0MP  ])[k];
+      //(D.f[DIR_P00])[ke   ] = (Dbuff.f[DIR_P00])[k];
+      //(D.f[DIR_M00])[kw   ] = (Dbuff.f[DIR_M00])[k];
+      //(D.f[DIR_0P0])[kn   ] = (Dbuff.f[DIR_0P0])[k];
+      //(D.f[DIR_0M0])[ks   ] = (Dbuff.f[DIR_0M0])[k];
+      //(D.f[DIR_00P])[kt   ] = (Dbuff.f[DIR_00P])[k];
+      //(D.f[DIR_00M])[kb   ] = (Dbuff.f[DIR_00M])[k];
+      //(D.f[DIR_PP0])[kne  ] = (Dbuff.f[DIR_PP0])[k];
+      //(D.f[DIR_MM0])[ksw  ] = (Dbuff.f[DIR_MM0])[k];
+      //(D.f[DIR_PM0])[kse  ] = (Dbuff.f[DIR_PM0])[k];
+      //(D.f[DIR_MP0])[knw  ] = (Dbuff.f[DIR_MP0])[k];
+      //(D.f[DIR_P0P])[kte  ] = (Dbuff.f[DIR_P0P])[k];
+      //(D.f[DIR_M0M])[kbw  ] = (Dbuff.f[DIR_M0M])[k];
+      //(D.f[DIR_P0M])[kbe  ] = (Dbuff.f[DIR_P0M])[k];
+      //(D.f[DIR_M0P])[ktw  ] = (Dbuff.f[DIR_M0P])[k];
+      //(D.f[DIR_0PP])[ktn  ] = (Dbuff.f[DIR_0PP])[k];
+      //(D.f[DIR_0MM])[kbs  ] = (Dbuff.f[DIR_0MM])[k];
+      //(D.f[DIR_0PM])[kbn  ] = (Dbuff.f[DIR_0PM])[k];
+      //(D.f[DIR_0MP])[kts  ] = (Dbuff.f[DIR_0MP])[k];
       //(D.f[DIR_000])[kzero] = (Dbuff.f[DIR_000])[k];
-      //(D.f[DIR_PPP ])[ktne ] = (Dbuff.f[DIR_PPP ])[k];
-      //(D.f[DIR_MMP ])[ktsw ] = (Dbuff.f[DIR_MMP ])[k];
-      //(D.f[DIR_PMP ])[ktse ] = (Dbuff.f[DIR_PMP ])[k];
-      //(D.f[DIR_MPP ])[ktnw ] = (Dbuff.f[DIR_MPP ])[k];
-      //(D.f[DIR_PPM ])[kbne ] = (Dbuff.f[DIR_PPM ])[k];
-      //(D.f[DIR_MMM ])[kbsw ] = (Dbuff.f[DIR_MMM ])[k];
-      //(D.f[DIR_PMM ])[kbse ] = (Dbuff.f[DIR_PMM ])[k];
-      //(D.f[DIR_MPM ])[kbnw ] = (Dbuff.f[DIR_MPM ])[k];
-      (D.f[DIR_M00   ])[kw   ] = (Dbuff.f[DIR_P00   ])[k];
-      (D.f[DIR_P00   ])[ke   ] = (Dbuff.f[DIR_M00   ])[k];
-      (D.f[DIR_0M0   ])[ks   ] = (Dbuff.f[DIR_0P0   ])[k];
-      (D.f[DIR_0P0   ])[kn   ] = (Dbuff.f[DIR_0M0   ])[k];
-      (D.f[DIR_00M   ])[kb   ] = (Dbuff.f[DIR_00P   ])[k];
-      (D.f[DIR_00P   ])[kt   ] = (Dbuff.f[DIR_00M   ])[k];
-      (D.f[DIR_MM0  ])[ksw  ] = (Dbuff.f[DIR_PP0  ])[k];
-      (D.f[DIR_PP0  ])[kne  ] = (Dbuff.f[DIR_MM0  ])[k];
-      (D.f[DIR_MP0  ])[knw  ] = (Dbuff.f[DIR_PM0  ])[k];
-      (D.f[DIR_PM0  ])[kse  ] = (Dbuff.f[DIR_MP0  ])[k];
-      (D.f[DIR_M0M  ])[kbw  ] = (Dbuff.f[DIR_P0P  ])[k];
-      (D.f[DIR_P0P  ])[kte  ] = (Dbuff.f[DIR_M0M  ])[k];
-      (D.f[DIR_M0P  ])[ktw  ] = (Dbuff.f[DIR_P0M  ])[k];
-      (D.f[DIR_P0M  ])[kbe  ] = (Dbuff.f[DIR_M0P  ])[k];
-      (D.f[DIR_0MM  ])[kbs  ] = (Dbuff.f[DIR_0PP  ])[k];
-      (D.f[DIR_0PP  ])[ktn  ] = (Dbuff.f[DIR_0MM  ])[k];
-      (D.f[DIR_0MP  ])[kts  ] = (Dbuff.f[DIR_0PM  ])[k];
-      (D.f[DIR_0PM  ])[kbn  ] = (Dbuff.f[DIR_0MP  ])[k];
+      //(D.f[DIR_PPP])[ktne ] = (Dbuff.f[DIR_PPP])[k];
+      //(D.f[DIR_MMP])[ktsw ] = (Dbuff.f[DIR_MMP])[k];
+      //(D.f[DIR_PMP])[ktse ] = (Dbuff.f[DIR_PMP])[k];
+      //(D.f[DIR_MPP])[ktnw ] = (Dbuff.f[DIR_MPP])[k];
+      //(D.f[DIR_PPM])[kbne ] = (Dbuff.f[DIR_PPM])[k];
+      //(D.f[DIR_MMM])[kbsw ] = (Dbuff.f[DIR_MMM])[k];
+      //(D.f[DIR_PMM])[kbse ] = (Dbuff.f[DIR_PMM])[k];
+      //(D.f[DIR_MPM])[kbnw ] = (Dbuff.f[DIR_MPM])[k];
+      (D.f[DIR_M00])[kw   ] = (Dbuff.f[DIR_P00])[k];
+      (D.f[DIR_P00])[ke   ] = (Dbuff.f[DIR_M00])[k];
+      (D.f[DIR_0M0])[ks   ] = (Dbuff.f[DIR_0P0])[k];
+      (D.f[DIR_0P0])[kn   ] = (Dbuff.f[DIR_0M0])[k];
+      (D.f[DIR_00M])[kb   ] = (Dbuff.f[DIR_00P])[k];
+      (D.f[DIR_00P])[kt   ] = (Dbuff.f[DIR_00M])[k];
+      (D.f[DIR_MM0])[ksw  ] = (Dbuff.f[DIR_PP0])[k];
+      (D.f[DIR_PP0])[kne  ] = (Dbuff.f[DIR_MM0])[k];
+      (D.f[DIR_MP0])[knw  ] = (Dbuff.f[DIR_PM0])[k];
+      (D.f[DIR_PM0])[kse  ] = (Dbuff.f[DIR_MP0])[k];
+      (D.f[DIR_M0M])[kbw  ] = (Dbuff.f[DIR_P0P])[k];
+      (D.f[DIR_P0P])[kte  ] = (Dbuff.f[DIR_M0M])[k];
+      (D.f[DIR_M0P])[ktw  ] = (Dbuff.f[DIR_P0M])[k];
+      (D.f[DIR_P0M])[kbe  ] = (Dbuff.f[DIR_M0P])[k];
+      (D.f[DIR_0MM])[kbs  ] = (Dbuff.f[DIR_0PP])[k];
+      (D.f[DIR_0PP])[ktn  ] = (Dbuff.f[DIR_0MM])[k];
+      (D.f[DIR_0MP])[kts  ] = (Dbuff.f[DIR_0PM])[k];
+      (D.f[DIR_0PM])[kbn  ] = (Dbuff.f[DIR_0MP])[k];
       (D.f[DIR_000])[kzero] = (Dbuff.f[DIR_000])[k];
-      (D.f[DIR_MMM ])[kbsw ] = (Dbuff.f[DIR_PPP ])[k];
-      (D.f[DIR_PPM ])[kbne ] = (Dbuff.f[DIR_MMP ])[k];
-      (D.f[DIR_MPM ])[kbnw ] = (Dbuff.f[DIR_PMP ])[k];
-      (D.f[DIR_PMM ])[kbse ] = (Dbuff.f[DIR_MPP ])[k];
-      (D.f[DIR_MMP ])[ktsw ] = (Dbuff.f[DIR_PPM ])[k];
-      (D.f[DIR_PPP ])[ktne ] = (Dbuff.f[DIR_MMM ])[k];
-      (D.f[DIR_MPP ])[ktnw ] = (Dbuff.f[DIR_PMM ])[k];
-      (D.f[DIR_PMP ])[ktse ] = (Dbuff.f[DIR_MPM ])[k];
+      (D.f[DIR_MMM])[kbsw ] = (Dbuff.f[DIR_PPP])[k];
+      (D.f[DIR_PPM])[kbne ] = (Dbuff.f[DIR_MMP])[k];
+      (D.f[DIR_MPM])[kbnw ] = (Dbuff.f[DIR_PMP])[k];
+      (D.f[DIR_PMM])[kbse ] = (Dbuff.f[DIR_MPP])[k];
+      (D.f[DIR_MMP])[ktsw ] = (Dbuff.f[DIR_PPM])[k];
+      (D.f[DIR_PPP])[ktne ] = (Dbuff.f[DIR_MMM])[k];
+      (D.f[DIR_MPP])[ktnw ] = (Dbuff.f[DIR_PMM])[k];
+      (D.f[DIR_PMP])[ktse ] = (Dbuff.f[DIR_MPM])[k];
    }
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -485,7 +485,7 @@ __global__ void getSendFsPre27(real* DD,
                                           unsigned int* neighborX,
                                           unsigned int* neighborY,
                                           unsigned int* neighborZ,
-                                          unsigned int size_Mat, 
+                                          unsigned long long numberOfLBnodes, 
                                           bool isEvenTimestep)
 {
    ////////////////////////////////////////////////////////////////////////////////
@@ -536,123 +536,123 @@ __global__ void getSendFsPre27(real* DD,
       Distributions27 D;
       if (isEvenTimestep==true)
       {
-         D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+         D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
       } 
       else
       {
-         D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+         D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
       }
       //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 	  //set Pointer for Buffer Fs
       Distributions27 Dbuff;
-      Dbuff.f[DIR_P00   ] = &bufferFs[DIR_P00   *buffmax];
-      Dbuff.f[DIR_M00   ] = &bufferFs[DIR_M00   *buffmax];
-      Dbuff.f[DIR_0P0   ] = &bufferFs[DIR_0P0   *buffmax];
-      Dbuff.f[DIR_0M0   ] = &bufferFs[DIR_0M0   *buffmax];
-      Dbuff.f[DIR_00P   ] = &bufferFs[DIR_00P   *buffmax];
-      Dbuff.f[DIR_00M   ] = &bufferFs[DIR_00M   *buffmax];
-      Dbuff.f[DIR_PP0  ] = &bufferFs[DIR_PP0  *buffmax];
-      Dbuff.f[DIR_MM0  ] = &bufferFs[DIR_MM0  *buffmax];
-      Dbuff.f[DIR_PM0  ] = &bufferFs[DIR_PM0  *buffmax];
-      Dbuff.f[DIR_MP0  ] = &bufferFs[DIR_MP0  *buffmax];
-      Dbuff.f[DIR_P0P  ] = &bufferFs[DIR_P0P  *buffmax];
-      Dbuff.f[DIR_M0M  ] = &bufferFs[DIR_M0M  *buffmax];
-      Dbuff.f[DIR_P0M  ] = &bufferFs[DIR_P0M  *buffmax];
-      Dbuff.f[DIR_M0P  ] = &bufferFs[DIR_M0P  *buffmax];
-      Dbuff.f[DIR_0PP  ] = &bufferFs[DIR_0PP  *buffmax];
-      Dbuff.f[DIR_0MM  ] = &bufferFs[DIR_0MM  *buffmax];
-      Dbuff.f[DIR_0PM  ] = &bufferFs[DIR_0PM  *buffmax];
-      Dbuff.f[DIR_0MP  ] = &bufferFs[DIR_0MP  *buffmax];
-      Dbuff.f[DIR_000] = &bufferFs[DIR_000*buffmax];
-      Dbuff.f[DIR_PPP ] = &bufferFs[DIR_PPP *buffmax];
-      Dbuff.f[DIR_MMP ] = &bufferFs[DIR_MMP *buffmax];
-      Dbuff.f[DIR_PMP ] = &bufferFs[DIR_PMP *buffmax];
-      Dbuff.f[DIR_MPP ] = &bufferFs[DIR_MPP *buffmax];
-      Dbuff.f[DIR_PPM ] = &bufferFs[DIR_PPM *buffmax];
-      Dbuff.f[DIR_MMM ] = &bufferFs[DIR_MMM *buffmax];
-      Dbuff.f[DIR_PMM ] = &bufferFs[DIR_PMM *buffmax];
-      Dbuff.f[DIR_MPM ] = &bufferFs[DIR_MPM *buffmax];
+      Dbuff.f[DIR_P00] = &bufferFs[DIR_P00 * buffmax];
+      Dbuff.f[DIR_M00] = &bufferFs[DIR_M00 * buffmax];
+      Dbuff.f[DIR_0P0] = &bufferFs[DIR_0P0 * buffmax];
+      Dbuff.f[DIR_0M0] = &bufferFs[DIR_0M0 * buffmax];
+      Dbuff.f[DIR_00P] = &bufferFs[DIR_00P * buffmax];
+      Dbuff.f[DIR_00M] = &bufferFs[DIR_00M * buffmax];
+      Dbuff.f[DIR_PP0] = &bufferFs[DIR_PP0 * buffmax];
+      Dbuff.f[DIR_MM0] = &bufferFs[DIR_MM0 * buffmax];
+      Dbuff.f[DIR_PM0] = &bufferFs[DIR_PM0 * buffmax];
+      Dbuff.f[DIR_MP0] = &bufferFs[DIR_MP0 * buffmax];
+      Dbuff.f[DIR_P0P] = &bufferFs[DIR_P0P * buffmax];
+      Dbuff.f[DIR_M0M] = &bufferFs[DIR_M0M * buffmax];
+      Dbuff.f[DIR_P0M] = &bufferFs[DIR_P0M * buffmax];
+      Dbuff.f[DIR_M0P] = &bufferFs[DIR_M0P * buffmax];
+      Dbuff.f[DIR_0PP] = &bufferFs[DIR_0PP * buffmax];
+      Dbuff.f[DIR_0MM] = &bufferFs[DIR_0MM * buffmax];
+      Dbuff.f[DIR_0PM] = &bufferFs[DIR_0PM * buffmax];
+      Dbuff.f[DIR_0MP] = &bufferFs[DIR_0MP * buffmax];
+      Dbuff.f[DIR_000] = &bufferFs[DIR_000 * buffmax];
+      Dbuff.f[DIR_PPP] = &bufferFs[DIR_PPP * buffmax];
+      Dbuff.f[DIR_MMP] = &bufferFs[DIR_MMP * buffmax];
+      Dbuff.f[DIR_PMP] = &bufferFs[DIR_PMP * buffmax];
+      Dbuff.f[DIR_MPP] = &bufferFs[DIR_MPP * buffmax];
+      Dbuff.f[DIR_PPM] = &bufferFs[DIR_PPM * buffmax];
+      Dbuff.f[DIR_MMM] = &bufferFs[DIR_MMM * buffmax];
+      Dbuff.f[DIR_PMM] = &bufferFs[DIR_PMM * buffmax];
+      Dbuff.f[DIR_MPM] = &bufferFs[DIR_MPM * buffmax];
       //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 	  //copy to buffer
-      (Dbuff.f[DIR_P00   ])[k] = (D.f[DIR_P00   ])[ke   ];
-      (Dbuff.f[DIR_M00   ])[k] = (D.f[DIR_M00   ])[kw   ];
-      (Dbuff.f[DIR_0P0   ])[k] = (D.f[DIR_0P0   ])[kn   ];
-      (Dbuff.f[DIR_0M0   ])[k] = (D.f[DIR_0M0   ])[ks   ];
-      (Dbuff.f[DIR_00P   ])[k] = (D.f[DIR_00P   ])[kt   ];
-      (Dbuff.f[DIR_00M   ])[k] = (D.f[DIR_00M   ])[kb   ];
-      (Dbuff.f[DIR_PP0  ])[k] = (D.f[DIR_PP0  ])[kne  ];
-      (Dbuff.f[DIR_MM0  ])[k] = (D.f[DIR_MM0  ])[ksw  ];
-      (Dbuff.f[DIR_PM0  ])[k] = (D.f[DIR_PM0  ])[kse  ];
-      (Dbuff.f[DIR_MP0  ])[k] = (D.f[DIR_MP0  ])[knw  ];
-      (Dbuff.f[DIR_P0P  ])[k] = (D.f[DIR_P0P  ])[kte  ];
-      (Dbuff.f[DIR_M0M  ])[k] = (D.f[DIR_M0M  ])[kbw  ];
-      (Dbuff.f[DIR_P0M  ])[k] = (D.f[DIR_P0M  ])[kbe  ];
-      (Dbuff.f[DIR_M0P  ])[k] = (D.f[DIR_M0P  ])[ktw  ];
-      (Dbuff.f[DIR_0PP  ])[k] = (D.f[DIR_0PP  ])[ktn  ];
-      (Dbuff.f[DIR_0MM  ])[k] = (D.f[DIR_0MM  ])[kbs  ];
-      (Dbuff.f[DIR_0PM  ])[k] = (D.f[DIR_0PM  ])[kbn  ];
-      (Dbuff.f[DIR_0MP  ])[k] = (D.f[DIR_0MP  ])[kts  ];
+      (Dbuff.f[DIR_P00])[k] = (D.f[DIR_P00])[ke   ];
+      (Dbuff.f[DIR_M00])[k] = (D.f[DIR_M00])[kw   ];
+      (Dbuff.f[DIR_0P0])[k] = (D.f[DIR_0P0])[kn   ];
+      (Dbuff.f[DIR_0M0])[k] = (D.f[DIR_0M0])[ks   ];
+      (Dbuff.f[DIR_00P])[k] = (D.f[DIR_00P])[kt   ];
+      (Dbuff.f[DIR_00M])[k] = (D.f[DIR_00M])[kb   ];
+      (Dbuff.f[DIR_PP0])[k] = (D.f[DIR_PP0])[kne  ];
+      (Dbuff.f[DIR_MM0])[k] = (D.f[DIR_MM0])[ksw  ];
+      (Dbuff.f[DIR_PM0])[k] = (D.f[DIR_PM0])[kse  ];
+      (Dbuff.f[DIR_MP0])[k] = (D.f[DIR_MP0])[knw  ];
+      (Dbuff.f[DIR_P0P])[k] = (D.f[DIR_P0P])[kte  ];
+      (Dbuff.f[DIR_M0M])[k] = (D.f[DIR_M0M])[kbw  ];
+      (Dbuff.f[DIR_P0M])[k] = (D.f[DIR_P0M])[kbe  ];
+      (Dbuff.f[DIR_M0P])[k] = (D.f[DIR_M0P])[ktw  ];
+      (Dbuff.f[DIR_0PP])[k] = (D.f[DIR_0PP])[ktn  ];
+      (Dbuff.f[DIR_0MM])[k] = (D.f[DIR_0MM])[kbs  ];
+      (Dbuff.f[DIR_0PM])[k] = (D.f[DIR_0PM])[kbn  ];
+      (Dbuff.f[DIR_0MP])[k] = (D.f[DIR_0MP])[kts  ];
       (Dbuff.f[DIR_000])[k] = (D.f[DIR_000])[kzero];
-      (Dbuff.f[DIR_PPP ])[k] = (D.f[DIR_PPP ])[ktne ];
-      (Dbuff.f[DIR_MMP ])[k] = (D.f[DIR_MMP ])[ktsw ];
-      (Dbuff.f[DIR_PMP ])[k] = (D.f[DIR_PMP ])[ktse ];
-      (Dbuff.f[DIR_MPP ])[k] = (D.f[DIR_MPP ])[ktnw ];
-      (Dbuff.f[DIR_PPM ])[k] = (D.f[DIR_PPM ])[kbne ];
-      (Dbuff.f[DIR_MMM ])[k] = (D.f[DIR_MMM ])[kbsw ];
-      (Dbuff.f[DIR_PMM ])[k] = (D.f[DIR_PMM ])[kbse ];
-      (Dbuff.f[DIR_MPM ])[k] = (D.f[DIR_MPM ])[kbnw ];
+      (Dbuff.f[DIR_PPP])[k] = (D.f[DIR_PPP])[ktne ];
+      (Dbuff.f[DIR_MMP])[k] = (D.f[DIR_MMP])[ktsw ];
+      (Dbuff.f[DIR_PMP])[k] = (D.f[DIR_PMP])[ktse ];
+      (Dbuff.f[DIR_MPP])[k] = (D.f[DIR_MPP])[ktnw ];
+      (Dbuff.f[DIR_PPM])[k] = (D.f[DIR_PPM])[kbne ];
+      (Dbuff.f[DIR_MMM])[k] = (D.f[DIR_MMM])[kbsw ];
+      (Dbuff.f[DIR_PMM])[k] = (D.f[DIR_PMM])[kbse ];
+      (Dbuff.f[DIR_MPM])[k] = (D.f[DIR_MPM])[kbnw ];
    }
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -694,7 +694,7 @@ __global__ void setRecvFsPre27(real* DD,
                                           unsigned int* neighborX,
                                           unsigned int* neighborY,
                                           unsigned int* neighborZ,
-                                          unsigned int size_Mat, 
+                                          unsigned long long numberOfLBnodes, 
                                           bool isEvenTimestep)
 {
    ////////////////////////////////////////////////////////////////////////////////
@@ -745,123 +745,123 @@ __global__ void setRecvFsPre27(real* DD,
       Distributions27 D;
       if (isEvenTimestep==true)
       {
-         D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+         D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
       } 
       else
       {
-         D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+         D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
       }
       //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 	  //set Pointer for Buffer Fs
       Distributions27 Dbuff;
-      Dbuff.f[DIR_P00   ] = &bufferFs[DIR_P00   *buffmax];
-      Dbuff.f[DIR_M00   ] = &bufferFs[DIR_M00   *buffmax];
-      Dbuff.f[DIR_0P0   ] = &bufferFs[DIR_0P0   *buffmax];
-      Dbuff.f[DIR_0M0   ] = &bufferFs[DIR_0M0   *buffmax];
-      Dbuff.f[DIR_00P   ] = &bufferFs[DIR_00P   *buffmax];
-      Dbuff.f[DIR_00M   ] = &bufferFs[DIR_00M   *buffmax];
-      Dbuff.f[DIR_PP0  ] = &bufferFs[DIR_PP0  *buffmax];
-      Dbuff.f[DIR_MM0  ] = &bufferFs[DIR_MM0  *buffmax];
-      Dbuff.f[DIR_PM0  ] = &bufferFs[DIR_PM0  *buffmax];
-      Dbuff.f[DIR_MP0  ] = &bufferFs[DIR_MP0  *buffmax];
-      Dbuff.f[DIR_P0P  ] = &bufferFs[DIR_P0P  *buffmax];
-      Dbuff.f[DIR_M0M  ] = &bufferFs[DIR_M0M  *buffmax];
-      Dbuff.f[DIR_P0M  ] = &bufferFs[DIR_P0M  *buffmax];
-      Dbuff.f[DIR_M0P  ] = &bufferFs[DIR_M0P  *buffmax];
-      Dbuff.f[DIR_0PP  ] = &bufferFs[DIR_0PP  *buffmax];
-      Dbuff.f[DIR_0MM  ] = &bufferFs[DIR_0MM  *buffmax];
-      Dbuff.f[DIR_0PM  ] = &bufferFs[DIR_0PM  *buffmax];
-      Dbuff.f[DIR_0MP  ] = &bufferFs[DIR_0MP  *buffmax];
-      Dbuff.f[DIR_000] = &bufferFs[DIR_000*buffmax];
-      Dbuff.f[DIR_PPP ] = &bufferFs[DIR_PPP *buffmax];
-      Dbuff.f[DIR_MMP ] = &bufferFs[DIR_MMP *buffmax];
-      Dbuff.f[DIR_PMP ] = &bufferFs[DIR_PMP *buffmax];
-      Dbuff.f[DIR_MPP ] = &bufferFs[DIR_MPP *buffmax];
-      Dbuff.f[DIR_PPM ] = &bufferFs[DIR_PPM *buffmax];
-      Dbuff.f[DIR_MMM ] = &bufferFs[DIR_MMM *buffmax];
-      Dbuff.f[DIR_PMM ] = &bufferFs[DIR_PMM *buffmax];
-      Dbuff.f[DIR_MPM ] = &bufferFs[DIR_MPM *buffmax];
+      Dbuff.f[DIR_P00] = &bufferFs[DIR_P00 * buffmax];
+      Dbuff.f[DIR_M00] = &bufferFs[DIR_M00 * buffmax];
+      Dbuff.f[DIR_0P0] = &bufferFs[DIR_0P0 * buffmax];
+      Dbuff.f[DIR_0M0] = &bufferFs[DIR_0M0 * buffmax];
+      Dbuff.f[DIR_00P] = &bufferFs[DIR_00P * buffmax];
+      Dbuff.f[DIR_00M] = &bufferFs[DIR_00M * buffmax];
+      Dbuff.f[DIR_PP0] = &bufferFs[DIR_PP0 * buffmax];
+      Dbuff.f[DIR_MM0] = &bufferFs[DIR_MM0 * buffmax];
+      Dbuff.f[DIR_PM0] = &bufferFs[DIR_PM0 * buffmax];
+      Dbuff.f[DIR_MP0] = &bufferFs[DIR_MP0 * buffmax];
+      Dbuff.f[DIR_P0P] = &bufferFs[DIR_P0P * buffmax];
+      Dbuff.f[DIR_M0M] = &bufferFs[DIR_M0M * buffmax];
+      Dbuff.f[DIR_P0M] = &bufferFs[DIR_P0M * buffmax];
+      Dbuff.f[DIR_M0P] = &bufferFs[DIR_M0P * buffmax];
+      Dbuff.f[DIR_0PP] = &bufferFs[DIR_0PP * buffmax];
+      Dbuff.f[DIR_0MM] = &bufferFs[DIR_0MM * buffmax];
+      Dbuff.f[DIR_0PM] = &bufferFs[DIR_0PM * buffmax];
+      Dbuff.f[DIR_0MP] = &bufferFs[DIR_0MP * buffmax];
+      Dbuff.f[DIR_000] = &bufferFs[DIR_000 * buffmax];
+      Dbuff.f[DIR_PPP] = &bufferFs[DIR_PPP * buffmax];
+      Dbuff.f[DIR_MMP] = &bufferFs[DIR_MMP * buffmax];
+      Dbuff.f[DIR_PMP] = &bufferFs[DIR_PMP * buffmax];
+      Dbuff.f[DIR_MPP] = &bufferFs[DIR_MPP * buffmax];
+      Dbuff.f[DIR_PPM] = &bufferFs[DIR_PPM * buffmax];
+      Dbuff.f[DIR_MMM] = &bufferFs[DIR_MMM * buffmax];
+      Dbuff.f[DIR_PMM] = &bufferFs[DIR_PMM * buffmax];
+      Dbuff.f[DIR_MPM] = &bufferFs[DIR_MPM * buffmax];
       //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 	  //copy from buffer
-      (D.f[DIR_P00   ])[ke   ] = (Dbuff.f[DIR_P00   ])[k];
-      (D.f[DIR_M00   ])[kw   ] = (Dbuff.f[DIR_M00   ])[k];
-      (D.f[DIR_0P0   ])[kn   ] = (Dbuff.f[DIR_0P0   ])[k];
-      (D.f[DIR_0M0   ])[ks   ] = (Dbuff.f[DIR_0M0   ])[k];
-      (D.f[DIR_00P   ])[kt   ] = (Dbuff.f[DIR_00P   ])[k];
-      (D.f[DIR_00M   ])[kb   ] = (Dbuff.f[DIR_00M   ])[k];
-      (D.f[DIR_PP0  ])[kne  ] = (Dbuff.f[DIR_PP0  ])[k];
-      (D.f[DIR_MM0  ])[ksw  ] = (Dbuff.f[DIR_MM0  ])[k];
-      (D.f[DIR_PM0  ])[kse  ] = (Dbuff.f[DIR_PM0  ])[k];
-      (D.f[DIR_MP0  ])[knw  ] = (Dbuff.f[DIR_MP0  ])[k];
-      (D.f[DIR_P0P  ])[kte  ] = (Dbuff.f[DIR_P0P  ])[k];
-      (D.f[DIR_M0M  ])[kbw  ] = (Dbuff.f[DIR_M0M  ])[k];
-      (D.f[DIR_P0M  ])[kbe  ] = (Dbuff.f[DIR_P0M  ])[k];
-      (D.f[DIR_M0P  ])[ktw  ] = (Dbuff.f[DIR_M0P  ])[k];
-      (D.f[DIR_0PP  ])[ktn  ] = (Dbuff.f[DIR_0PP  ])[k];
-      (D.f[DIR_0MM  ])[kbs  ] = (Dbuff.f[DIR_0MM  ])[k];
-      (D.f[DIR_0PM  ])[kbn  ] = (Dbuff.f[DIR_0PM  ])[k];
-      (D.f[DIR_0MP  ])[kts  ] = (Dbuff.f[DIR_0MP  ])[k];
+      (D.f[DIR_P00])[ke   ] = (Dbuff.f[DIR_P00])[k];
+      (D.f[DIR_M00])[kw   ] = (Dbuff.f[DIR_M00])[k];
+      (D.f[DIR_0P0])[kn   ] = (Dbuff.f[DIR_0P0])[k];
+      (D.f[DIR_0M0])[ks   ] = (Dbuff.f[DIR_0M0])[k];
+      (D.f[DIR_00P])[kt   ] = (Dbuff.f[DIR_00P])[k];
+      (D.f[DIR_00M])[kb   ] = (Dbuff.f[DIR_00M])[k];
+      (D.f[DIR_PP0])[kne  ] = (Dbuff.f[DIR_PP0])[k];
+      (D.f[DIR_MM0])[ksw  ] = (Dbuff.f[DIR_MM0])[k];
+      (D.f[DIR_PM0])[kse  ] = (Dbuff.f[DIR_PM0])[k];
+      (D.f[DIR_MP0])[knw  ] = (Dbuff.f[DIR_MP0])[k];
+      (D.f[DIR_P0P])[kte  ] = (Dbuff.f[DIR_P0P])[k];
+      (D.f[DIR_M0M])[kbw  ] = (Dbuff.f[DIR_M0M])[k];
+      (D.f[DIR_P0M])[kbe  ] = (Dbuff.f[DIR_P0M])[k];
+      (D.f[DIR_M0P])[ktw  ] = (Dbuff.f[DIR_M0P])[k];
+      (D.f[DIR_0PP])[ktn  ] = (Dbuff.f[DIR_0PP])[k];
+      (D.f[DIR_0MM])[kbs  ] = (Dbuff.f[DIR_0MM])[k];
+      (D.f[DIR_0PM])[kbn  ] = (Dbuff.f[DIR_0PM])[k];
+      (D.f[DIR_0MP])[kts  ] = (Dbuff.f[DIR_0MP])[k];
       (D.f[DIR_000])[kzero] = (Dbuff.f[DIR_000])[k];
-      (D.f[DIR_PPP ])[ktne ] = (Dbuff.f[DIR_PPP ])[k];
-      (D.f[DIR_MMP ])[ktsw ] = (Dbuff.f[DIR_MMP ])[k];
-      (D.f[DIR_PMP ])[ktse ] = (Dbuff.f[DIR_PMP ])[k];
-      (D.f[DIR_MPP ])[ktnw ] = (Dbuff.f[DIR_MPP ])[k];
-      (D.f[DIR_PPM ])[kbne ] = (Dbuff.f[DIR_PPM ])[k];
-      (D.f[DIR_MMM ])[kbsw ] = (Dbuff.f[DIR_MMM ])[k];
-      (D.f[DIR_PMM ])[kbse ] = (Dbuff.f[DIR_PMM ])[k];
-      (D.f[DIR_MPM ])[kbnw ] = (Dbuff.f[DIR_MPM ])[k];
+      (D.f[DIR_PPP])[ktne ] = (Dbuff.f[DIR_PPP])[k];
+      (D.f[DIR_MMP])[ktsw ] = (Dbuff.f[DIR_MMP])[k];
+      (D.f[DIR_PMP])[ktse ] = (Dbuff.f[DIR_PMP])[k];
+      (D.f[DIR_MPP])[ktnw ] = (Dbuff.f[DIR_MPP])[k];
+      (D.f[DIR_PPM])[kbne ] = (Dbuff.f[DIR_PPM])[k];
+      (D.f[DIR_MMM])[kbsw ] = (Dbuff.f[DIR_MMM])[k];
+      (D.f[DIR_PMM])[kbse ] = (Dbuff.f[DIR_PMM])[k];
+      (D.f[DIR_MPM])[kbnw ] = (Dbuff.f[DIR_MPM])[k];
    }
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -903,7 +903,7 @@ __global__ void getSendGsF3(
 	unsigned int* neighborX,
 	unsigned int* neighborY,
 	unsigned int* neighborZ,
-	unsigned int size_Mat,
+	unsigned long long numberOfLBnodes,
 	bool isEvenTimestep)
 {
 	////////////////////////////////////////////////////////////////////////////////
@@ -931,31 +931,31 @@ __global__ void getSendGsF3(
 		Distributions6 G;
 		if (isEvenTimestep)
 		{
-			G.g[DIR_P00] = &G6[DIR_P00   *size_Mat];
-			G.g[DIR_M00] = &G6[DIR_M00   *size_Mat];
-			G.g[DIR_0P0] = &G6[DIR_0P0   *size_Mat];
-			G.g[DIR_0M0] = &G6[DIR_0M0   *size_Mat];
-			G.g[DIR_00P] = &G6[DIR_00P   *size_Mat];
-			G.g[DIR_00M] = &G6[DIR_00M   *size_Mat];
+			G.g[DIR_P00] = &G6[DIR_P00 * numberOfLBnodes];
+			G.g[DIR_M00] = &G6[DIR_M00 * numberOfLBnodes];
+			G.g[DIR_0P0] = &G6[DIR_0P0 * numberOfLBnodes];
+			G.g[DIR_0M0] = &G6[DIR_0M0 * numberOfLBnodes];
+			G.g[DIR_00P] = &G6[DIR_00P * numberOfLBnodes];
+			G.g[DIR_00M] = &G6[DIR_00M * numberOfLBnodes];
 		}
 		else
 		{
-			G.g[DIR_M00] = &G6[DIR_P00   *size_Mat];
-			G.g[DIR_P00] = &G6[DIR_M00   *size_Mat];
-			G.g[DIR_0M0] = &G6[DIR_0P0   *size_Mat];
-			G.g[DIR_0P0] = &G6[DIR_0M0   *size_Mat];
-			G.g[DIR_00M] = &G6[DIR_00P   *size_Mat];
-			G.g[DIR_00P] = &G6[DIR_00M   *size_Mat];
+			G.g[DIR_M00] = &G6[DIR_P00 * numberOfLBnodes];
+			G.g[DIR_P00] = &G6[DIR_M00 * numberOfLBnodes];
+			G.g[DIR_0M0] = &G6[DIR_0P0 * numberOfLBnodes];
+			G.g[DIR_0P0] = &G6[DIR_0M0 * numberOfLBnodes];
+			G.g[DIR_00M] = &G6[DIR_00P * numberOfLBnodes];
+			G.g[DIR_00P] = &G6[DIR_00M * numberOfLBnodes];
 		}
 		//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 		//set Pointer for Buffer Gs
 		Distributions6 Dbuff;
-		Dbuff.g[DIR_P00] = &bufferGs[DIR_P00   *buffmax];
-		Dbuff.g[DIR_M00] = &bufferGs[DIR_M00   *buffmax];
-		Dbuff.g[DIR_0P0] = &bufferGs[DIR_0P0   *buffmax];
-		Dbuff.g[DIR_0M0] = &bufferGs[DIR_0M0   *buffmax];
-		Dbuff.g[DIR_00P] = &bufferGs[DIR_00P   *buffmax];
-		Dbuff.g[DIR_00M] = &bufferGs[DIR_00M   *buffmax];
+		Dbuff.g[DIR_P00] = &bufferGs[DIR_P00 * buffmax];
+		Dbuff.g[DIR_M00] = &bufferGs[DIR_M00 * buffmax];
+		Dbuff.g[DIR_0P0] = &bufferGs[DIR_0P0 * buffmax];
+		Dbuff.g[DIR_0M0] = &bufferGs[DIR_0M0 * buffmax];
+		Dbuff.g[DIR_00P] = &bufferGs[DIR_00P * buffmax];
+		Dbuff.g[DIR_00M] = &bufferGs[DIR_00M * buffmax];
 		//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 		//write Gs to buffer
 		(Dbuff.g[DIR_P00])[k] = (G.g[DIR_M00])[kw];
@@ -1006,7 +1006,7 @@ __global__ void setRecvGsF3(
 	unsigned int* neighborX,
 	unsigned int* neighborY,
 	unsigned int* neighborZ,
-	unsigned int size_Mat,
+	unsigned long long numberOfLBnodes,
 	bool isEvenTimestep)
 {
 	////////////////////////////////////////////////////////////////////////////////
@@ -1034,31 +1034,31 @@ __global__ void setRecvGsF3(
 		Distributions6 G;
 		if (isEvenTimestep)
 		{
-			G.g[DIR_P00] = &G6[DIR_P00   *size_Mat];
-			G.g[DIR_M00] = &G6[DIR_M00   *size_Mat];
-			G.g[DIR_0P0] = &G6[DIR_0P0   *size_Mat];
-			G.g[DIR_0M0] = &G6[DIR_0M0   *size_Mat];
-			G.g[DIR_00P] = &G6[DIR_00P   *size_Mat];
-			G.g[DIR_00M] = &G6[DIR_00M   *size_Mat];
+			G.g[DIR_P00] = &G6[DIR_P00 * numberOfLBnodes];
+			G.g[DIR_M00] = &G6[DIR_M00 * numberOfLBnodes];
+			G.g[DIR_0P0] = &G6[DIR_0P0 * numberOfLBnodes];
+			G.g[DIR_0M0] = &G6[DIR_0M0 * numberOfLBnodes];
+			G.g[DIR_00P] = &G6[DIR_00P * numberOfLBnodes];
+			G.g[DIR_00M] = &G6[DIR_00M * numberOfLBnodes];
 		}
 		else
 		{
-			G.g[DIR_M00] = &G6[DIR_P00   *size_Mat];
-			G.g[DIR_P00] = &G6[DIR_M00   *size_Mat];
-			G.g[DIR_0M0] = &G6[DIR_0P0   *size_Mat];
-			G.g[DIR_0P0] = &G6[DIR_0M0   *size_Mat];
-			G.g[DIR_00M] = &G6[DIR_00P   *size_Mat];
-			G.g[DIR_00P] = &G6[DIR_00M   *size_Mat];
+			G.g[DIR_M00] = &G6[DIR_P00 * numberOfLBnodes];
+			G.g[DIR_P00] = &G6[DIR_M00 * numberOfLBnodes];
+			G.g[DIR_0M0] = &G6[DIR_0P0 * numberOfLBnodes];
+			G.g[DIR_0P0] = &G6[DIR_0M0 * numberOfLBnodes];
+			G.g[DIR_00M] = &G6[DIR_00P * numberOfLBnodes];
+			G.g[DIR_00P] = &G6[DIR_00M * numberOfLBnodes];
 		}
 		//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 		//set Pointer for Buffer Gs
 		Distributions6 Dbuff;
-		Dbuff.g[DIR_P00] = &bufferGs[DIR_P00   *buffmax];
-		Dbuff.g[DIR_M00] = &bufferGs[DIR_M00   *buffmax];
-		Dbuff.g[DIR_0P0] = &bufferGs[DIR_0P0   *buffmax];
-		Dbuff.g[DIR_0M0] = &bufferGs[DIR_0M0   *buffmax];
-		Dbuff.g[DIR_00P] = &bufferGs[DIR_00P   *buffmax];
-		Dbuff.g[DIR_00M] = &bufferGs[DIR_00M   *buffmax];
+		Dbuff.g[DIR_P00] = &bufferGs[DIR_P00 * buffmax];
+		Dbuff.g[DIR_M00] = &bufferGs[DIR_M00 * buffmax];
+		Dbuff.g[DIR_0P0] = &bufferGs[DIR_0P0 * buffmax];
+		Dbuff.g[DIR_0M0] = &bufferGs[DIR_0M0 * buffmax];
+		Dbuff.g[DIR_00P] = &bufferGs[DIR_00P * buffmax];
+		Dbuff.g[DIR_00M] = &bufferGs[DIR_00M * buffmax];
 		//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 		//write buffer to Gs
 		(G.g[DIR_M00])[kw] = (Dbuff.g[DIR_P00])[k];
diff --git a/src/gpu/VirtualFluids_GPU/GPU/GPU_Interface.h b/src/gpu/VirtualFluids_GPU/GPU/GPU_Interface.h
index ee987ae23402ef304220349db77084cc341ccd5a..ae8cbb77ec2493126d64b90a7119cbfa3efee666 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/GPU_Interface.h
+++ b/src/gpu/VirtualFluids_GPU/GPU/GPU_Interface.h
@@ -29,7 +29,7 @@ void KernelCas27(unsigned int grid_nx,
                             unsigned int* neighborY,
                             unsigned int* neighborZ,
                             real* DD,
-                            int size_Mat,
+                            unsigned long long numberOfLBnodes,
                             bool EvenOrOdd);
 
 void KernelCasSP27(unsigned int numberOfThreads, 
@@ -39,7 +39,7 @@ void KernelCasSP27(unsigned int numberOfThreads,
                               unsigned int* neighborY,
                               unsigned int* neighborZ,
                               real* DD,
-                              int size_Mat,
+                              unsigned long long numberOfLBnodes,
                               bool EvenOrOdd);
 
 void KernelCasSPMS27(unsigned int numberOfThreads, 
@@ -49,7 +49,7 @@ void KernelCasSPMS27(unsigned int numberOfThreads,
                                 unsigned int* neighborY,
                                 unsigned int* neighborZ,
                                 real* DD,
-                                int size_Mat,
+                                unsigned long long numberOfLBnodes,
                                 bool EvenOrOdd);
 
 void KernelCasSPMSOHM27( unsigned int numberOfThreads, 
@@ -59,7 +59,7 @@ void KernelCasSPMSOHM27( unsigned int numberOfThreads,
                                    unsigned int* neighborY,
                                    unsigned int* neighborZ,
                                    real* DD,
-                                   int size_Mat,
+                                   unsigned long long numberOfLBnodes,
                                    bool EvenOrOdd);
 
 void KernelKumCompSRTSP27(
@@ -70,7 +70,7 @@ void KernelKumCompSRTSP27(
 	unsigned int* neighborY,
 	unsigned int* neighborZ,
 	real* DDStart,
-	int size_Mat,
+	unsigned long long numberOfLBnodes,
 	int level,
 	real* forces,
 	bool EvenOrOdd);
@@ -82,7 +82,7 @@ void KernelCumulantD3Q27All4(unsigned int numberOfThreads,
 									    unsigned int* neighborY,
 									    unsigned int* neighborZ,
 									    real* DD,
-									    int size_Mat,
+									    unsigned long long numberOfLBnodes,
 									    int level,
 									    real* forces,
 									    bool EvenOrOdd);
@@ -94,7 +94,7 @@ void KernelKumAA2016CompBulkSP27(unsigned int numberOfThreads,
 											unsigned int* neighborY,
 											unsigned int* neighborZ,
 											real* DD,
-											int size_Mat,
+											unsigned long long numberOfLBnodes,
 											int size_Array,
 											int level,
 											real* forces,
@@ -112,7 +112,7 @@ void KernelKum1hSP27(    unsigned int numberOfThreads,
 									real* coordY,
 									real* coordZ,
 									real* DDStart,
-									int size_Mat,
+									unsigned long long numberOfLBnodes,
 									bool EvenOrOdd);
 
 void KernelCascadeSP27(unsigned int numberOfThreads, 
@@ -122,7 +122,7 @@ void KernelCascadeSP27(unsigned int numberOfThreads,
 								  unsigned int* neighborY,
 								  unsigned int* neighborZ,
 								  real* DD,
-								  int size_Mat,
+								  unsigned long long numberOfLBnodes,
 								  bool EvenOrOdd);
 
 void KernelKumNewSP27(   unsigned int numberOfThreads, 
@@ -132,7 +132,7 @@ void KernelKumNewSP27(   unsigned int numberOfThreads,
 									unsigned int* neighborY,
 									unsigned int* neighborZ,
 									real* DD,
-									int size_Mat,
+									unsigned long long numberOfLBnodes,
 									bool EvenOrOdd);
 
 
@@ -144,7 +144,7 @@ void CumulantOnePreconditionedErrorDiffusionChimCompSP27(
 	unsigned int* neighborY,
 	unsigned int* neighborZ,
 	real* DD,
-	int size_Mat,
+	unsigned long long numberOfLBnodes,
 	int size_Array,
 	int level,
 	real* forces,
@@ -158,7 +158,7 @@ void CumulantOnePreconditionedChimCompSP27(
 	unsigned int* neighborY,
 	unsigned int* neighborZ,
 	real* DD,
-	int size_Mat,
+	unsigned long long numberOfLBnodes,
 	int size_Array,
 	int level,
 	real* forces,
@@ -172,7 +172,7 @@ void CumulantOneChimCompSP27(
 	unsigned int* neighborY,
 	unsigned int* neighborZ,
 	real* DD,
-	int size_Mat,
+	unsigned long long numberOfLBnodes,
 	int size_Array,
 	int level,
 	real* forces,
@@ -189,7 +189,7 @@ void KernelKumIsoTestSP27(unsigned int numberOfThreads,
 									 real* dxxUx,
 									 real* dyyUy,
 									 real* dzzUz,
-									 int size_Mat,
+									 unsigned long long numberOfLBnodes,
 									 bool EvenOrOdd);
 
 void KernelKumCompSP27(  unsigned int numberOfThreads, 
@@ -199,7 +199,7 @@ void KernelKumCompSP27(  unsigned int numberOfThreads,
 									unsigned int* neighborY,
 									unsigned int* neighborZ,
 									real* DD,
-									int size_Mat,
+									unsigned long long numberOfLBnodes,
 									bool EvenOrOdd);
 
 void KernelWaleBySoniMalavCumAA2016CompSP27(
@@ -215,7 +215,7 @@ void KernelWaleBySoniMalavCumAA2016CompSP27(
 	real* veloZ,
 	real* DD,
 	real* turbulentViscosity,
-	int size_Mat,
+	unsigned long long numberOfLBnodes,
 	int size_Array,
 	int level,
 	real* forces,
@@ -227,7 +227,7 @@ void KernelPMCumOneCompSP27(unsigned int numberOfThreads,
 									   unsigned int* neighborY,
 									   unsigned int* neighborZ,
 									   real* DD,
-									   int size_Mat,
+									   unsigned long long numberOfLBnodes,
 									   int level,
 									   real* forces,
 									   real porosity,
@@ -245,7 +245,7 @@ void KernelADincomp7(   unsigned int numberOfThreads,
 								   unsigned int* neighborZ,
 								   real* DD,
 								   real* DD7,
-								   int size_Mat,
+								   unsigned long long numberOfLBnodes,
 								   bool EvenOrOdd);
 
 void KernelADincomp27(   unsigned int numberOfThreads, 
@@ -256,7 +256,7 @@ void KernelADincomp27(   unsigned int numberOfThreads,
 									unsigned int* neighborZ,
 									real* DD,
 									real* DD7,
-									int size_Mat,
+									unsigned long long numberOfLBnodes,
 									bool EvenOrOdd);
 
 void Init27(int myid,
@@ -267,7 +267,7 @@ void Init27(int myid,
                        unsigned int* neighborY,
                        unsigned int* neighborZ,
                        real* vParab,
-                       unsigned int size_Mat,
+                       unsigned long long numberOfLBnodes,
                        unsigned int grid_nx, 
                        unsigned int grid_ny, 
                        unsigned int grid_nz, 
@@ -285,7 +285,7 @@ void InitNonEqPartSP27(unsigned int numberOfThreads,
                                   real* ux,
                                   real* uy,
                                   real* uz,
-                                  unsigned int size_Mat,
+                                  unsigned long long numberOfLBnodes,
                                   real* DD,
                                   real omega,
                                   bool EvenOrOdd);
@@ -300,7 +300,7 @@ void InitThS7(  unsigned int numberOfThreads,
                            real* ux,
                            real* uy,
                            real* uz,
-                           unsigned int size_Mat,
+                           unsigned long long numberOfLBnodes,
                            real* DD7,
                            bool EvenOrOdd);
 
@@ -313,7 +313,7 @@ void InitADDev27( unsigned int numberOfThreads,
                            real* ux,
                            real* uy,
                            real* uz,
-                           unsigned int size_Mat,
+                           unsigned long long numberOfLBnodes,
                            real* DD27,
                            bool EvenOrOdd);
 
@@ -330,7 +330,7 @@ void PostProcessorF3_2018Fehlberg(
 	real* vzOut,
 	real* DDStart,
 	real* G6,
-	int size_Mat,
+	unsigned long long numberOfLBnodes,
 	int level,
 	real* forces,
 	bool EvenOrOdd);
@@ -343,7 +343,7 @@ void CalcMac27( real* vxD,
                           unsigned int* neighborX,
                           unsigned int* neighborY,
                           unsigned int* neighborZ,
-                          unsigned int size_Mat,
+                          unsigned long long numberOfLBnodes,
                           unsigned int grid_nx, 
                           unsigned int grid_ny, 
                           unsigned int grid_nz, 
@@ -359,7 +359,7 @@ void CalcMacSP27(real* vxD,
                             unsigned int* neighborX,
                             unsigned int* neighborY,
                             unsigned int* neighborZ,
-                            unsigned int size_Mat,
+                            unsigned long long numberOfLBnodes,
                             unsigned int numberOfThreads, 
                             real* DD,
                             bool isEvenTimestep);
@@ -373,7 +373,7 @@ void CalcMacCompSP27(real* vxD,
 								unsigned int* neighborX,
 								unsigned int* neighborY,
 								unsigned int* neighborZ,
-								unsigned int size_Mat,
+								unsigned long long numberOfLBnodes,
 								unsigned int numberOfThreads, 
 								real* DD,
 								bool isEvenTimestep);
@@ -383,7 +383,7 @@ void CalcMacThS7(  real* Conc,
                               unsigned int* neighborX,
                               unsigned int* neighborY,
                               unsigned int* neighborZ,
-                              unsigned int size_Mat,
+                              unsigned long long numberOfLBnodes,
                               unsigned int numberOfThreads, 
                               real* DD7,
                               bool isEvenTimestep);
@@ -395,7 +395,7 @@ void PlaneConcThS7(real* Conc,
 							  unsigned int* neighborX,
 							  unsigned int* neighborY,
 							  unsigned int* neighborZ,
-							  unsigned int size_Mat,
+							  unsigned long long numberOfLBnodes,
 							  unsigned int numberOfThreads, 
 							  real* DD7,
 							  bool isEvenTimestep);
@@ -407,7 +407,7 @@ void PlaneConcThS27(real* Conc,
 							   unsigned int* neighborX,
 							   unsigned int* neighborY,
 							   unsigned int* neighborZ,
-							   unsigned int size_Mat,
+							   unsigned long long numberOfLBnodes,
 							   unsigned int numberOfThreads, 
 							   real* DD27,
 							   bool isEvenTimestep);
@@ -418,7 +418,7 @@ void CalcConcentration27( unsigned int numberOfThreads,
                                      unsigned int* neighborX,
                                      unsigned int* neighborY,
                                      unsigned int* neighborZ,
-                                     unsigned int size_Mat,
+                                     unsigned long long numberOfLBnodes,
                                      real* DD27,
                                      bool isEvenTimestep);
 
@@ -431,7 +431,7 @@ void CalcMedSP27(  real* vxD,
                               unsigned int* neighborX,
                               unsigned int* neighborY,
                               unsigned int* neighborZ,
-                              unsigned int size_Mat,
+                              unsigned long long numberOfLBnodes,
                               unsigned int numberOfThreads, 
                               real* DD,
                               bool isEvenTimestep);
@@ -445,7 +445,7 @@ void CalcMedCompSP27(real* vxD,
 								unsigned int* neighborX,
 								unsigned int* neighborY,
 								unsigned int* neighborZ,
-								unsigned int size_Mat,
+								unsigned long long numberOfLBnodes,
 								unsigned int numberOfThreads, 
 								real* DD,
 								bool isEvenTimestep);
@@ -461,7 +461,7 @@ void CalcMedCompAD27(
 	unsigned int* neighborX,
 	unsigned int* neighborY,
 	unsigned int* neighborZ,
-	unsigned int size_Mat,
+	unsigned long long numberOfLBnodes,
 	unsigned int numberOfThreads,
 	real* DD,
 	real* DD_AD,
@@ -477,7 +477,7 @@ void CalcMacMedSP27(  real* vxD,
                                  unsigned int* neighborY,
                                  unsigned int* neighborZ,
                                  unsigned int tdiff,
-                                 unsigned int size_Mat,
+                                 unsigned long long numberOfLBnodes,
                                  unsigned int numberOfThreads, 
                                  bool isEvenTimestep);
 
@@ -487,7 +487,7 @@ void ResetMedianValuesSP27(
 	real* vzD,
 	real* rhoD,
 	real* pressD,
-	unsigned int size_Mat,
+	unsigned long long numberOfLBnodes,
 	unsigned int numberOfThreads,
 	bool isEvenTimestep);
 
@@ -498,7 +498,7 @@ void ResetMedianValuesAD27(
 	real* rhoD,
 	real* pressD,
 	real* concD,
-	unsigned int size_Mat,
+	unsigned long long numberOfLBnodes,
 	unsigned int numberOfThreads,
 	bool isEvenTimestep);
 
@@ -511,7 +511,7 @@ void Calc2ndMomentsIncompSP27(real* kxyFromfcNEQ,
 										 unsigned int* neighborX,
 										 unsigned int* neighborY,
 										 unsigned int* neighborZ,
-										 unsigned int size_Mat,
+										 unsigned long long numberOfLBnodes,
 										 unsigned int numberOfThreads, 
 										 real* DD,
 										 bool isEvenTimestep);
@@ -525,7 +525,7 @@ void Calc2ndMomentsCompSP27(real* kxyFromfcNEQ,
 									   unsigned int* neighborX,
 									   unsigned int* neighborY,
 									   unsigned int* neighborZ,
-									   unsigned int size_Mat,
+									   unsigned long long numberOfLBnodes,
 									   unsigned int numberOfThreads, 
 									   real* DD,
 									   bool isEvenTimestep);
@@ -541,7 +541,7 @@ void Calc3rdMomentsIncompSP27(real* CUMbbb,
 										 unsigned int* neighborX,
 										 unsigned int* neighborY,
 										 unsigned int* neighborZ,
-										 unsigned int size_Mat,
+										 unsigned long long numberOfLBnodes,
 										 unsigned int numberOfThreads, 
 										 real* DD,
 										 bool isEvenTimestep);
@@ -557,7 +557,7 @@ void Calc3rdMomentsCompSP27(real* CUMbbb,
 									   unsigned int* neighborX,
 									   unsigned int* neighborY,
 									   unsigned int* neighborZ,
-									   unsigned int size_Mat,
+									   unsigned long long numberOfLBnodes,
 									   unsigned int numberOfThreads, 
 									   real* DD,
 									   bool isEvenTimestep);
@@ -576,7 +576,7 @@ void CalcHigherMomentsIncompSP27(real* CUMcbb,
 											unsigned int* neighborX,
 											unsigned int* neighborY,
 											unsigned int* neighborZ,
-											unsigned int size_Mat,
+											unsigned long long numberOfLBnodes,
 											unsigned int numberOfThreads, 
 											real* DD,
 											bool isEvenTimestep);
@@ -595,7 +595,7 @@ void CalcHigherMomentsCompSP27(real* CUMcbb,
 										  unsigned int* neighborX,
 										  unsigned int* neighborY,
 										  unsigned int* neighborZ,
-										  unsigned int size_Mat,
+										  unsigned long long numberOfLBnodes,
 										  unsigned int numberOfThreads, 
 										  real* DD,
 										  bool isEvenTimestep);
@@ -612,7 +612,7 @@ void LBCalcMeasurePoints27(real* vxMP,
                                       unsigned int* neighborX,
                                       unsigned int* neighborY,
                                       unsigned int* neighborZ,
-                                      unsigned int size_Mat,
+                                      unsigned long long numberOfLBnodes,
                                       real* DD,
                                       unsigned int numberOfThreads, 
                                       bool isEvenTimestep);
@@ -627,7 +627,7 @@ void BcPress27(int nx,
                           unsigned int* neighborY,
                           unsigned int* neighborZ,
                           real* DD, 
-                          unsigned int size_Mat, 
+                          unsigned long long numberOfLBnodes, 
                           bool isEvenTimestep);
 
 void BcVel27(int nx, 
@@ -641,7 +641,7 @@ void BcVel27(int nx,
                         unsigned int* neighborY,
                         unsigned int* neighborZ,
                         real* DD, 
-                        unsigned int size_Mat, 
+                        unsigned long long numberOfLBnodes, 
                         bool isEvenTimestep, 
                         real u0x, 
                         real om);
@@ -661,7 +661,7 @@ void QDevCompThinWalls27(unsigned int numberOfThreads,
 									unsigned int* neighborY,
 									unsigned int* neighborZ,
 									unsigned int* neighborWSB,
-									unsigned int size_Mat, 
+									unsigned long long numberOfLBnodes, 
 									bool isEvenTimestep);
 
 void QDev3rdMomentsComp27(LBMSimulationParameter* parameterDevice, QforBoundaryConditions* boundaryCondition);
@@ -675,7 +675,7 @@ void QDevIncompHighNu27(  unsigned int numberOfThreads,
 									 unsigned int* neighborX,
 									 unsigned int* neighborY,
 									 unsigned int* neighborZ,
-									 unsigned int size_Mat, 
+									 unsigned long long numberOfLBnodes, 
 									 bool isEvenTimestep);
 
 void QDevCompHighNu27(unsigned int numberOfThreads,
@@ -687,7 +687,7 @@ void QDevCompHighNu27(unsigned int numberOfThreads,
 								 unsigned int* neighborX,
 								 unsigned int* neighborY,
 								 unsigned int* neighborZ,
-								 unsigned int size_Mat, 
+								 unsigned long long numberOfLBnodes, 
 								 bool isEvenTimestep);
 
 void QVelDevicePlainBB27(LBMSimulationParameter* parameterDevice, QforBoundaryConditions* boundaryCondition);
@@ -704,7 +704,7 @@ void QVelDeviceCouette27(unsigned int numberOfThreads,
 									unsigned int* neighborX,
 									unsigned int* neighborY,
 									unsigned int* neighborZ,
-									unsigned int size_Mat, 
+									unsigned long long numberOfLBnodes, 
 									bool isEvenTimestep);
 
 void QVelDevice1h27( unsigned int numberOfThreads,
@@ -726,7 +726,7 @@ void QVelDevice1h27( unsigned int numberOfThreads,
 								real* coordX,
 								real* coordY,
 								real* coordZ,
-								unsigned int size_Mat, 
+								unsigned long long numberOfLBnodes, 
 								bool isEvenTimestep);
 
 void QVelDev27(LBMSimulationParameter* parameterDevice, QforBoundaryConditions* boundaryCondition);
@@ -743,7 +743,7 @@ void QVelDevCompPlusSlip27(unsigned int numberOfThreads,
 									  unsigned int* neighborX,
 									  unsigned int* neighborY,
 									  unsigned int* neighborZ,
-									  unsigned int size_Mat, 
+									  unsigned long long numberOfLBnodes, 
 									  bool isEvenTimestep);
 
 void QVelDevComp27(LBMSimulationParameter* parameterDevice, QforBoundaryConditions* boundaryCondition);
@@ -762,7 +762,7 @@ void QVelDevCompThinWalls27(unsigned int numberOfThreads,
 							           unsigned int* neighborY,
 							           unsigned int* neighborZ,
 									   unsigned int* neighborWSB,
-							           unsigned int size_Mat, 
+							           unsigned long long numberOfLBnodes, 
 							           bool isEvenTimestep);
 
 void QVelDevCompZeroPress27(LBMSimulationParameter* parameterDevice, QforBoundaryConditions* boundaryCondition);
@@ -779,7 +779,7 @@ void QVelDevIncompHighNu27(  unsigned int numberOfThreads,
 										unsigned int* neighborX,
 										unsigned int* neighborY,
 										unsigned int* neighborZ,
-										unsigned int size_Mat, 
+										unsigned long long numberOfLBnodes, 
 										bool isEvenTimestep);
 
 void QVelDevCompHighNu27(unsigned int numberOfThreads,
@@ -794,7 +794,7 @@ void QVelDevCompHighNu27(unsigned int numberOfThreads,
 									unsigned int* neighborX,
 									unsigned int* neighborY,
 									unsigned int* neighborZ,
-									unsigned int size_Mat, 
+									unsigned long long numberOfLBnodes, 
 									bool isEvenTimestep);
 
 void QVeloDevEQ27(unsigned int numberOfThreads,
@@ -808,7 +808,7 @@ void QVeloDevEQ27(unsigned int numberOfThreads,
 							 unsigned int* neighborX,
 							 unsigned int* neighborY,
 							 unsigned int* neighborZ,
-							 unsigned int size_Mat, 
+							 unsigned long long numberOfLBnodes, 
 							 bool isEvenTimestep);
 
 void QVeloStreetDevEQ27(
@@ -848,7 +848,7 @@ void QSlipGeomDevComp27( unsigned int numberOfThreads,
 									unsigned int* neighborX,
 									unsigned int* neighborY,
 									unsigned int* neighborZ,
-									unsigned int size_Mat, 
+									unsigned long long numberOfLBnodes, 
 									bool isEvenTimestep);
 
 void QSlipNormDevComp27(unsigned int numberOfThreads,
@@ -863,7 +863,7 @@ void QSlipNormDevComp27(unsigned int numberOfThreads,
 								   unsigned int* neighborX,
 								   unsigned int* neighborY,
 								   unsigned int* neighborZ,
-								   unsigned int size_Mat, 
+								   unsigned long long numberOfLBnodes, 
 								   bool isEvenTimestep);
 
 void QStressDevComp27(Parameter *para,  QforBoundaryConditions* boundaryCondition, const int level);
@@ -883,7 +883,7 @@ void QPressDevFixBackflow27(unsigned int numberOfThreads,
                                        unsigned int* neighborX,
                                        unsigned int* neighborY,
                                        unsigned int* neighborZ,
-                                       unsigned int size_Mat, 
+                                       unsigned long long numberOfLBnodes, 
                                        bool isEvenTimestep);
 
 void QPressDevDirDepBot27(unsigned int numberOfThreads,
@@ -895,11 +895,13 @@ void QPressDevDirDepBot27(unsigned int numberOfThreads,
                                      unsigned int* neighborX,
                                      unsigned int* neighborY,
                                      unsigned int* neighborZ,
-                                     unsigned int size_Mat, 
+                                     unsigned long long numberOfLBnodes, 
                                      bool isEvenTimestep);
 
 void QPressNoRhoDev27(LBMSimulationParameter* parameterDevice, QforBoundaryConditions* boundaryCondition);
 
+void QPressZeroRhoOutflowDev27(LBMSimulationParameter* parameterDevice, QforBoundaryConditions* boundaryCondition);
+
 void QInflowScaleByPressDev27(LBMSimulationParameter* parameterDevice, QforBoundaryConditions* boundaryCondition);
 
 void QPressDevOld27(unsigned int numberOfThreads,
@@ -912,7 +914,7 @@ void QPressDevOld27(unsigned int numberOfThreads,
                                unsigned int* neighborX,
                                unsigned int* neighborY,
                                unsigned int* neighborZ,
-                               unsigned int size_Mat, 
+                               unsigned long long numberOfLBnodes, 
                                bool isEvenTimestep);
 
 void QPressDevIncompNEQ27(LBMSimulationParameter* parameterDevice, QforBoundaryConditions* boundaryCondition);
@@ -928,7 +930,7 @@ void QPressDevZero27(unsigned int numberOfThreads,
                                 unsigned int* neighborX,
                                 unsigned int* neighborY,
                                 unsigned int* neighborZ,
-                                unsigned int size_Mat, 
+                                unsigned long long numberOfLBnodes, 
                                 bool isEvenTimestep);
 
 void QPressDevFake27(   unsigned int numberOfThreads,
@@ -941,7 +943,7 @@ void QPressDevFake27(   unsigned int numberOfThreads,
 								   unsigned int* neighborX,
 								   unsigned int* neighborY,
 								   unsigned int* neighborZ,
-								   unsigned int size_Mat, 
+								   unsigned long long numberOfLBnodes, 
 								   bool isEvenTimestep);
 
 void BBDev27(LBMSimulationParameter* parameterDevice, QforBoundaryConditions* boundaryCondition);
@@ -956,7 +958,7 @@ void QPressDev27_IntBB(  unsigned int numberOfThreads,
 									unsigned int* neighborX,
 									unsigned int* neighborY,
 									unsigned int* neighborZ,
-									unsigned int size_Mat, 
+									unsigned long long numberOfLBnodes, 
 									bool isEvenTimestep);
 
 void QPressDevAntiBB27(  unsigned int numberOfThreads,
@@ -972,7 +974,7 @@ void QPressDevAntiBB27(  unsigned int numberOfThreads,
 								  unsigned int* neighborX,
 								  unsigned int* neighborY,
 								  unsigned int* neighborZ,
-								  unsigned int size_Mat, 
+								  unsigned long long numberOfLBnodes, 
 								  bool isEvenTimestep);
 
 void PressSchlaffer27(unsigned int numberOfThreads,
@@ -989,7 +991,7 @@ void PressSchlaffer27(unsigned int numberOfThreads,
                                  unsigned int* neighborX,
                                  unsigned int* neighborY,
                                  unsigned int* neighborZ,
-                                 unsigned int size_Mat, 
+                                 unsigned long long numberOfLBnodes, 
                                  bool isEvenTimestep);
 
 void VelSchlaffer27(  unsigned int numberOfThreads,
@@ -1004,9 +1006,17 @@ void VelSchlaffer27(  unsigned int numberOfThreads,
                                  unsigned int* neighborX,
                                  unsigned int* neighborY,
                                  unsigned int* neighborZ,
-                                 unsigned int size_Mat, 
+                                 unsigned long long numberOfLBnodes, 
                                  bool isEvenTimestep);
 
+void QPrecursorDevCompZeroPress(LBMSimulationParameter* parameterDevice, QforPrecursorBoundaryConditions* boundaryCondition, real tRatio, real velocityRatio);
+
+void PrecursorDevEQ27(LBMSimulationParameter* parameterDevice, QforPrecursorBoundaryConditions* boundaryCondition, real tRatio, real velocityRatio);
+
+void PrecursorDevDistributions(LBMSimulationParameter* parameterDevice, QforPrecursorBoundaryConditions* boundaryCondition, real tRatio, real velocityRatio);
+
+void QPrecursorDevDistributions(LBMSimulationParameter* parameterDevice, QforPrecursorBoundaryConditions* boundaryCondition, real tRatio, real velocityRatio);
+
 void QADDev7(unsigned int numberOfThreads,
                         real* DD, 
                         real* DD7,
@@ -1019,7 +1029,7 @@ void QADDev7(unsigned int numberOfThreads,
                         unsigned int* neighborX,
                         unsigned int* neighborY,
                         unsigned int* neighborZ,
-                        unsigned int size_Mat, 
+                        unsigned long long numberOfLBnodes, 
                         bool isEvenTimestep);
 
 //////////////////////////////////////////////////////////////////////////
@@ -1033,7 +1043,7 @@ void FactorizedCentralMomentsAdvectionDiffusionDeviceKernel(
 	uint* neighborZ,
 	real* distributions,
 	real* distributionsAD,
-	int size_Mat,
+	unsigned long long numberOfLBnodes,
 	real* forces,
 	bool isEvenTimestep);
 
@@ -1053,7 +1063,7 @@ void ADSlipVelDevComp(
 	uint * neighborX,
 	uint * neighborY,
 	uint * neighborZ,
-	uint size_Mat,
+	unsigned long long numberOfLBnodes,
 	bool isEvenTimestep);
 	
 void QADDirichletDev27( unsigned int numberOfThreads,
@@ -1068,7 +1078,7 @@ void QADDirichletDev27( unsigned int numberOfThreads,
 								   unsigned int* neighborX,
 								   unsigned int* neighborY,
 								   unsigned int* neighborZ,
-								   unsigned int size_Mat, 
+								   unsigned long long numberOfLBnodes, 
 								   bool isEvenTimestep);
 
 void QADBBDev27(  unsigned int numberOfThreads,
@@ -1083,7 +1093,7 @@ void QADBBDev27(  unsigned int numberOfThreads,
 							 unsigned int* neighborX,
 							 unsigned int* neighborY,
 							 unsigned int* neighborZ,
-							 unsigned int size_Mat, 
+							 unsigned long long numberOfLBnodes, 
 							 bool isEvenTimestep);
 
 void QADVelDev7(unsigned int numberOfThreads,
@@ -1099,7 +1109,7 @@ void QADVelDev7(unsigned int numberOfThreads,
                            unsigned int* neighborX,
                            unsigned int* neighborY,
                            unsigned int* neighborZ,
-                           unsigned int size_Mat, 
+                           unsigned long long numberOfLBnodes, 
                            bool isEvenTimestep);
 
 
@@ -1116,7 +1126,7 @@ void QADVelDev27(  unsigned int numberOfThreads,
                               unsigned int* neighborX,
                               unsigned int* neighborY,
                               unsigned int* neighborZ,
-                              unsigned int size_Mat, 
+                              unsigned long long numberOfLBnodes, 
                               bool isEvenTimestep);
 
 void QADPressDev7( unsigned int numberOfThreads,
@@ -1132,7 +1142,7 @@ void QADPressDev7( unsigned int numberOfThreads,
                               unsigned int* neighborX,
                               unsigned int* neighborY,
                               unsigned int* neighborZ,
-                              unsigned int size_Mat, 
+                              unsigned long long numberOfLBnodes, 
                               bool isEvenTimestep);
 
 void QADPressDev27(unsigned int numberOfThreads,
@@ -1148,7 +1158,7 @@ void QADPressDev27(unsigned int numberOfThreads,
                               unsigned int* neighborX,
                               unsigned int* neighborY,
                               unsigned int* neighborZ,
-                              unsigned int size_Mat, 
+                              unsigned long long numberOfLBnodes, 
                               bool isEvenTimestep);
 
 void QADPressNEQNeighborDev27(
@@ -1161,7 +1171,7 @@ void QADPressNEQNeighborDev27(
 											unsigned int* neighborX,
 											unsigned int* neighborY,
 											unsigned int* neighborZ,
-											unsigned int size_Mat,
+											unsigned long long numberOfLBnodes,
 											bool isEvenTimestep
 										);
 
@@ -1177,7 +1187,7 @@ void QNoSlipADincompDev7(unsigned int numberOfThreads,
 									unsigned int* neighborX,
 									unsigned int* neighborY,
 									unsigned int* neighborZ,
-									unsigned int size_Mat, 
+									unsigned long long numberOfLBnodes, 
 									bool isEvenTimestep);
 
 void QNoSlipADincompDev27(unsigned int numberOfThreads,
@@ -1192,7 +1202,7 @@ void QNoSlipADincompDev27(unsigned int numberOfThreads,
 									 unsigned int* neighborX,
 									 unsigned int* neighborY,
 									 unsigned int* neighborZ,
-									 unsigned int size_Mat, 
+									 unsigned long long numberOfLBnodes, 
 									 bool isEvenTimestep);
 
 void QADVeloIncompDev7( unsigned int numberOfThreads,
@@ -1208,7 +1218,7 @@ void QADVeloIncompDev7( unsigned int numberOfThreads,
 								   unsigned int* neighborX,
 								   unsigned int* neighborY,
 								   unsigned int* neighborZ,
-								   unsigned int size_Mat, 
+								   unsigned long long numberOfLBnodes, 
 								   bool isEvenTimestep);
 
 
@@ -1225,7 +1235,7 @@ void QADVeloIncompDev27( unsigned int numberOfThreads,
 									unsigned int* neighborX,
 									unsigned int* neighborY,
 									unsigned int* neighborZ,
-									unsigned int size_Mat, 
+									unsigned long long numberOfLBnodes, 
 									bool isEvenTimestep);
 
 void QADPressIncompDev7(  unsigned int numberOfThreads,
@@ -1241,7 +1251,7 @@ void QADPressIncompDev7(  unsigned int numberOfThreads,
 									 unsigned int* neighborX,
 									 unsigned int* neighborY,
 									 unsigned int* neighborZ,
-									 unsigned int size_Mat, 
+									 unsigned long long numberOfLBnodes, 
 									 bool isEvenTimestep);
 
 void QADPressIncompDev27(  unsigned int numberOfThreads,
@@ -1257,7 +1267,7 @@ void QADPressIncompDev27(  unsigned int numberOfThreads,
 									  unsigned int* neighborX,
 									  unsigned int* neighborY,
 									  unsigned int* neighborZ,
-									  unsigned int size_Mat, 
+									  unsigned long long numberOfLBnodes, 
 									  bool isEvenTimestep);
 
 void PropVelo(   unsigned int numberOfThreads,
@@ -1270,7 +1280,7 @@ void PropVelo(   unsigned int numberOfThreads,
 							real* uz,
 							int* k_Q, 
 							unsigned int size_Prop,
-							unsigned int size_Mat,
+							unsigned long long numberOfLBnodes,
 							unsigned int* bcMatD,
 							real* DD,
 							bool EvenOrOdd);
@@ -1283,8 +1293,8 @@ void ScaleCF27( real* DC,
                            unsigned int* neighborFX,
                            unsigned int* neighborFY,
                            unsigned int* neighborFZ,
-                           unsigned int size_MatC, 
-                           unsigned int size_MatF, 
+                           unsigned long long numberOfLBnodesC, 
+                           unsigned long long numberOfLBnodesF, 
                            bool isEvenTimestep,
                            unsigned int* posCSWB, 
                            unsigned int* posFSWB, 
@@ -1306,8 +1316,8 @@ void ScaleFC27( real* DC,
                            unsigned int* neighborFX,
                            unsigned int* neighborFY,
                            unsigned int* neighborFZ,
-                           unsigned int size_MatC, 
-                           unsigned int size_MatF, 
+                           unsigned long long numberOfLBnodesC, 
+                           unsigned long long numberOfLBnodesF, 
                            bool isEvenTimestep,
                            unsigned int* posC, 
                            unsigned int* posFSWB, 
@@ -1329,8 +1339,8 @@ void ScaleCFEff27(real* DC,
                              unsigned int* neighborFX,
                              unsigned int* neighborFY,
                              unsigned int* neighborFZ,
-                             unsigned int size_MatC, 
-                             unsigned int size_MatF, 
+                             unsigned long long numberOfLBnodesC, 
+                             unsigned long long numberOfLBnodesF, 
                              bool isEvenTimestep,
                              unsigned int* posCSWB, 
                              unsigned int* posFSWB, 
@@ -1353,8 +1363,8 @@ void ScaleFCEff27(real* DC,
                              unsigned int* neighborFX,
                              unsigned int* neighborFY,
                              unsigned int* neighborFZ,
-                             unsigned int size_MatC, 
-                             unsigned int size_MatF, 
+                             unsigned long long numberOfLBnodesC, 
+                             unsigned long long numberOfLBnodesF, 
                              bool isEvenTimestep,
                              unsigned int* posC, 
                              unsigned int* posFSWB, 
@@ -1377,8 +1387,8 @@ void ScaleCFLast27(real* DC,
                               unsigned int* neighborFX,
                               unsigned int* neighborFY,
                               unsigned int* neighborFZ,
-                              unsigned int size_MatC, 
-                              unsigned int size_MatF, 
+                              unsigned long long numberOfLBnodesC, 
+                              unsigned long long numberOfLBnodesF, 
                               bool isEvenTimestep,
                               unsigned int* posCSWB, 
                               unsigned int* posFSWB, 
@@ -1401,8 +1411,8 @@ void ScaleFCLast27(real* DC,
                               unsigned int* neighborFX,
                               unsigned int* neighborFY,
                               unsigned int* neighborFZ,
-                              unsigned int size_MatC, 
-                              unsigned int size_MatF, 
+                              unsigned long long numberOfLBnodesC, 
+                              unsigned long long numberOfLBnodesF, 
                               bool isEvenTimestep,
                               unsigned int* posC, 
                               unsigned int* posFSWB, 
@@ -1425,8 +1435,8 @@ void ScaleCFpress27(real* DC,
                               unsigned int* neighborFX,
                               unsigned int* neighborFY,
                               unsigned int* neighborFZ,
-                              unsigned int size_MatC, 
-                              unsigned int size_MatF, 
+                              unsigned long long numberOfLBnodesC, 
+                              unsigned long long numberOfLBnodesF, 
                               bool isEvenTimestep,
                               unsigned int* posCSWB, 
                               unsigned int* posFSWB, 
@@ -1449,8 +1459,8 @@ void ScaleFCpress27(  real* DC,
                                  unsigned int* neighborFX,
                                  unsigned int* neighborFY,
                                  unsigned int* neighborFZ,
-                                 unsigned int size_MatC, 
-                                 unsigned int size_MatF, 
+                                 unsigned long long numberOfLBnodesC, 
+                                 unsigned long long numberOfLBnodesF, 
                                  bool isEvenTimestep,
                                  unsigned int* posC, 
                                  unsigned int* posFSWB, 
@@ -1473,8 +1483,8 @@ void ScaleCF_Fix_27(real* DC,
                               unsigned int* neighborFX,
                               unsigned int* neighborFY,
                               unsigned int* neighborFZ,
-                              unsigned int size_MatC, 
-                              unsigned int size_MatF, 
+                              unsigned long long numberOfLBnodesC, 
+                              unsigned long long numberOfLBnodesF, 
                               bool isEvenTimestep,
                               unsigned int* posCSWB, 
                               unsigned int* posFSWB, 
@@ -1497,8 +1507,8 @@ void ScaleCF_Fix_comp_27(   real* DC,
 									   unsigned int* neighborFX,
 									   unsigned int* neighborFY,
 									   unsigned int* neighborFZ,
-									   unsigned int size_MatC, 
-									   unsigned int size_MatF, 
+									   unsigned long long numberOfLBnodesC, 
+									   unsigned long long numberOfLBnodesF, 
 									   bool isEvenTimestep,
 									   unsigned int* posCSWB, 
 									   unsigned int* posFSWB, 
@@ -1521,8 +1531,8 @@ void ScaleCF_0817_comp_27(  real* DC,
 									   unsigned int* neighborFX,
 									   unsigned int* neighborFY,
 									   unsigned int* neighborFZ,
-									   unsigned int size_MatC, 
-									   unsigned int size_MatF, 
+									   unsigned long long numberOfLBnodesC, 
+									   unsigned long long numberOfLBnodesF, 
 									   bool isEvenTimestep,
 									   unsigned int* posCSWB, 
 									   unsigned int* posFSWB, 
@@ -1547,8 +1557,8 @@ void ScaleCF_comp_D3Q27F3_2018(	real* DC,
 											unsigned int* neighborFX,
 											unsigned int* neighborFY,
 											unsigned int* neighborFZ,
-											unsigned int size_MatC, 
-											unsigned int size_MatF, 
+											unsigned long long numberOfLBnodesC, 
+											unsigned long long numberOfLBnodesF, 
 											bool isEvenTimestep,
 											unsigned int* posCSWB, 
 											unsigned int* posFSWB, 
@@ -1572,8 +1582,8 @@ void ScaleCF_comp_D3Q27F3(real* DC,
 									 unsigned int* neighborFX,
 									 unsigned int* neighborFY,
 									 unsigned int* neighborFZ,
-									 unsigned int size_MatC, 
-									 unsigned int size_MatF, 
+									 unsigned long long numberOfLBnodesC, 
+									 unsigned long long numberOfLBnodesF, 
 									 bool isEvenTimestep,
 									 unsigned int* posCSWB, 
 									 unsigned int* posFSWB, 
@@ -1597,8 +1607,8 @@ void ScaleCF_staggered_time_comp_27( real* DC,
 												unsigned int* neighborFX,
 												unsigned int* neighborFY,
 												unsigned int* neighborFZ,
-												unsigned int size_MatC, 
-												unsigned int size_MatF, 
+												unsigned long long numberOfLBnodesC, 
+												unsigned long long numberOfLBnodesF, 
 												bool isEvenTimestep,
 												unsigned int* posCSWB, 
 												unsigned int* posFSWB, 
@@ -1624,8 +1634,8 @@ void ScaleCF_RhoSq_3rdMom_comp_27( real* DC,
 											  unsigned int* neighborFX,
 											  unsigned int* neighborFY,
 											  unsigned int* neighborFZ,
-											  unsigned int size_MatC, 
-											  unsigned int size_MatF, 
+											  unsigned long long numberOfLBnodesC, 
+											  unsigned long long numberOfLBnodesF, 
 											  bool isEvenTimestep,
 											  unsigned int* posCSWB, 
 											  unsigned int* posFSWB, 
@@ -1649,8 +1659,8 @@ void ScaleCF_AA2016_comp_27( real* DC,
 										unsigned int* neighborFX,
 										unsigned int* neighborFY,
 										unsigned int* neighborFZ,
-										unsigned int size_MatC, 
-										unsigned int size_MatF, 
+										unsigned long long numberOfLBnodesC, 
+										unsigned long long numberOfLBnodesF, 
 										bool isEvenTimestep,
 										unsigned int* posCSWB, 
 										unsigned int* posFSWB, 
@@ -1674,8 +1684,8 @@ void ScaleCF_NSPress_27(real* DC,
 								  unsigned int* neighborFX,
 								  unsigned int* neighborFY,
 								  unsigned int* neighborFZ,
-								  unsigned int size_MatC, 
-								  unsigned int size_MatF, 
+								  unsigned long long numberOfLBnodesC, 
+								  unsigned long long numberOfLBnodesF, 
 								  bool isEvenTimestep,
 								  unsigned int* posCSWB, 
 								  unsigned int* posFSWB, 
@@ -1698,8 +1708,8 @@ void ScaleFC_Fix_27(  real* DC,
                                  unsigned int* neighborFX,
                                  unsigned int* neighborFY,
                                  unsigned int* neighborFZ,
-                                 unsigned int size_MatC, 
-                                 unsigned int size_MatF, 
+                                 unsigned long long numberOfLBnodesC, 
+                                 unsigned long long numberOfLBnodesF, 
                                  bool isEvenTimestep,
                                  unsigned int* posC, 
                                  unsigned int* posFSWB, 
@@ -1722,8 +1732,8 @@ void ScaleFC_Fix_comp_27(   real* DC,
 									   unsigned int* neighborFX,
 									   unsigned int* neighborFY,
 									   unsigned int* neighborFZ,
-									   unsigned int size_MatC, 
-									   unsigned int size_MatF, 
+									   unsigned long long numberOfLBnodesC, 
+									   unsigned long long numberOfLBnodesF, 
 									   bool isEvenTimestep,
 									   unsigned int* posC, 
 									   unsigned int* posFSWB, 
@@ -1746,8 +1756,8 @@ void ScaleFC_0817_comp_27(  real* DC,
 									   unsigned int* neighborFX,
 									   unsigned int* neighborFY,
 									   unsigned int* neighborFZ,
-									   unsigned int size_MatC, 
-									   unsigned int size_MatF, 
+									   unsigned long long numberOfLBnodesC, 
+									   unsigned long long numberOfLBnodesF, 
 									   bool isEvenTimestep,
 									   unsigned int* posC, 
 									   unsigned int* posFSWB, 
@@ -1772,8 +1782,8 @@ void ScaleFC_comp_D3Q27F3_2018(real* DC,
 										  unsigned int* neighborFX,
 										  unsigned int* neighborFY,
 										  unsigned int* neighborFZ,
-										  unsigned int size_MatC, 
-										  unsigned int size_MatF, 
+										  unsigned long long numberOfLBnodesC, 
+										  unsigned long long numberOfLBnodesF, 
 										  bool isEvenTimestep,
 										  unsigned int* posC, 
 										  unsigned int* posFSWB, 
@@ -1797,8 +1807,8 @@ void ScaleFC_comp_D3Q27F3( real* DC,
 									  unsigned int* neighborFX,
 									  unsigned int* neighborFY,
 									  unsigned int* neighborFZ,
-									  unsigned int size_MatC, 
-									  unsigned int size_MatF, 
+									  unsigned long long numberOfLBnodesC, 
+									  unsigned long long numberOfLBnodesF, 
 									  bool isEvenTimestep,
 									  unsigned int* posC, 
 									  unsigned int* posFSWB, 
@@ -1822,8 +1832,8 @@ void ScaleFC_staggered_time_comp_27( real* DC,
 												unsigned int* neighborFX,
 												unsigned int* neighborFY,
 												unsigned int* neighborFZ,
-												unsigned int size_MatC, 
-												unsigned int size_MatF, 
+												unsigned long long numberOfLBnodesC, 
+												unsigned long long numberOfLBnodesF, 
 												bool isEvenTimestep,
 												unsigned int* posC, 
 												unsigned int* posFSWB, 
@@ -1849,8 +1859,8 @@ void ScaleFC_RhoSq_3rdMom_comp_27( real* DC,
 											  unsigned int* neighborFX,
 											  unsigned int* neighborFY,
 											  unsigned int* neighborFZ,
-											  unsigned int size_MatC, 
-											  unsigned int size_MatF, 
+											  unsigned long long numberOfLBnodesC, 
+											  unsigned long long numberOfLBnodesF, 
 											  bool isEvenTimestep,
 											  unsigned int* posC, 
 											  unsigned int* posFSWB, 
@@ -1874,8 +1884,8 @@ void ScaleFC_AA2016_comp_27( real* DC,
 										unsigned int* neighborFX,
 										unsigned int* neighborFY,
 										unsigned int* neighborFZ,
-										unsigned int size_MatC, 
-										unsigned int size_MatF, 
+										unsigned long long numberOfLBnodesC, 
+										unsigned long long numberOfLBnodesF, 
 										bool isEvenTimestep,
 										unsigned int* posC, 
 										unsigned int* posFSWB, 
@@ -1899,8 +1909,8 @@ void ScaleFC_NSPress_27(  real* DC,
 									 unsigned int* neighborFX,
 									 unsigned int* neighborFY,
 									 unsigned int* neighborFZ,
-									 unsigned int size_MatC, 
-									 unsigned int size_MatF, 
+									 unsigned long long numberOfLBnodesC, 
+									 unsigned long long numberOfLBnodesF, 
 									 bool isEvenTimestep,
 									 unsigned int* posC, 
 									 unsigned int* posFSWB, 
@@ -1925,8 +1935,8 @@ void ScaleCFThS7(  real* DC,
                               unsigned int* neighborFX,
                               unsigned int* neighborFY,
                               unsigned int* neighborFZ,
-                              unsigned int size_MatC, 
-                              unsigned int size_MatF, 
+                              unsigned long long numberOfLBnodesC, 
+                              unsigned long long numberOfLBnodesF, 
                               bool isEvenTimestep,
                               unsigned int* posCSWB, 
                               unsigned int* posFSWB, 
@@ -1945,8 +1955,8 @@ void ScaleFCThS7(  real* DC,
                               unsigned int* neighborFX,
                               unsigned int* neighborFY,
                               unsigned int* neighborFZ,
-                              unsigned int size_MatC, 
-                              unsigned int size_MatF, 
+                              unsigned long long numberOfLBnodesC, 
+                              unsigned long long numberOfLBnodesF, 
                               bool isEvenTimestep,
                               unsigned int* posC, 
                               unsigned int* posFSWB, 
@@ -1965,8 +1975,8 @@ void ScaleCFThSMG7(   real* DC,
                                  unsigned int* neighborFX,
                                  unsigned int* neighborFY,
                                  unsigned int* neighborFZ,
-                                 unsigned int size_MatC, 
-                                 unsigned int size_MatF, 
+                                 unsigned long long numberOfLBnodesC, 
+                                 unsigned long long numberOfLBnodesF, 
                                  bool isEvenTimestep,
                                  unsigned int* posCSWB, 
                                  unsigned int* posFSWB, 
@@ -1986,8 +1996,8 @@ void ScaleFCThSMG7(real* DC,
                               unsigned int* neighborFX,
                               unsigned int* neighborFY,
                               unsigned int* neighborFZ,
-                              unsigned int size_MatC, 
-                              unsigned int size_MatF, 
+                              unsigned long long numberOfLBnodesC, 
+                              unsigned long long numberOfLBnodesF, 
                               bool isEvenTimestep,
                               unsigned int* posC, 
                               unsigned int* posFSWB, 
@@ -2007,8 +2017,8 @@ void ScaleCFThS27( real* DC,
                               unsigned int* neighborFX,
                               unsigned int* neighborFY,
                               unsigned int* neighborFZ,
-                              unsigned int size_MatC, 
-                              unsigned int size_MatF, 
+                              unsigned long long numberOfLBnodesC, 
+                              unsigned long long numberOfLBnodesF, 
                               bool isEvenTimestep,
                               unsigned int* posCSWB, 
                               unsigned int* posFSWB, 
@@ -2028,8 +2038,8 @@ void ScaleFCThS27( real* DC,
                               unsigned int* neighborFX,
                               unsigned int* neighborFY,
                               unsigned int* neighborFZ,
-                              unsigned int size_MatC, 
-                              unsigned int size_MatF, 
+                              unsigned long long numberOfLBnodesC, 
+                              unsigned long long numberOfLBnodesF, 
                               bool isEvenTimestep,
                               unsigned int* posC, 
                               unsigned int* posFSWB, 
@@ -2049,7 +2059,7 @@ void DragLiftPostD27(real* DD,
 								unsigned int* neighborX,
 								unsigned int* neighborY,
 								unsigned int* neighborZ,
-								unsigned int size_Mat, 
+								unsigned long long numberOfLBnodes, 
 								bool isEvenTimestep,
 								unsigned int numberOfThreads);
 
@@ -2063,7 +2073,7 @@ void DragLiftPreD27( real* DD,
 								unsigned int* neighborX,
 								unsigned int* neighborY,
 								unsigned int* neighborZ,
-								unsigned int size_Mat, 
+								unsigned long long numberOfLBnodes, 
 								bool isEvenTimestep,
 								unsigned int numberOfThreads);
 
@@ -2074,7 +2084,7 @@ void CalcCPtop27(real* DD,
 							unsigned int* neighborX,
 							unsigned int* neighborY,
 							unsigned int* neighborZ,
-							unsigned int size_Mat, 
+							unsigned long long numberOfLBnodes, 
 							bool isEvenTimestep,
 							unsigned int numberOfThreads);
 
@@ -2085,7 +2095,7 @@ void CalcCPbottom27(real* DD,
 							   unsigned int* neighborX,
 							   unsigned int* neighborY,
 							   unsigned int* neighborZ,
-							   unsigned int size_Mat, 
+							   unsigned long long numberOfLBnodes, 
 							   bool isEvenTimestep,
 							   unsigned int numberOfThreads);
 
@@ -2096,7 +2106,7 @@ void GetSendFsPreDev27(real* DD,
 								  unsigned int* neighborX,
 								  unsigned int* neighborY,
 								  unsigned int* neighborZ,
-								  unsigned int size_Mat, 
+								  unsigned long long numberOfLBnodes, 
 								  bool isEvenTimestep,
 								  unsigned int numberOfThreads, 
 	                              cudaStream_t stream = CU_STREAM_LEGACY);
@@ -2108,7 +2118,7 @@ void GetSendFsPostDev27(real* DD,
 								   unsigned int* neighborX,
 								   unsigned int* neighborY,
 								   unsigned int* neighborZ,
-								   unsigned int size_Mat, 
+								   unsigned long long numberOfLBnodes, 
 								   bool isEvenTimestep,
 								   unsigned int numberOfThreads, 
 	                               cudaStream_t stream = CU_STREAM_LEGACY);
@@ -2120,7 +2130,7 @@ void SetRecvFsPreDev27(real* DD,
 								  unsigned int* neighborX,
 								  unsigned int* neighborY,
 								  unsigned int* neighborZ,
-								  unsigned int size_Mat, 
+								  unsigned long long numberOfLBnodes, 
 								  bool isEvenTimestep, unsigned int numberOfThreads, 
 	                              cudaStream_t stream = CU_STREAM_LEGACY);
 
@@ -2131,7 +2141,7 @@ void SetRecvFsPostDev27(real* DD,
 								   unsigned int* neighborX,
 								   unsigned int* neighborY,
 								   unsigned int* neighborZ,
-								   unsigned int size_Mat, 
+								   unsigned long long numberOfLBnodes, 
 								   bool isEvenTimestep,
 								   unsigned int numberOfThreads,
                                    cudaStream_t stream = CU_STREAM_LEGACY);
@@ -2144,7 +2154,7 @@ void getSendGsDevF3(
 	unsigned int* neighborX,
 	unsigned int* neighborY,
 	unsigned int* neighborZ,
-	unsigned int size_Mat,
+	unsigned long long numberOfLBnodes,
 	bool isEvenTimestep,
 	unsigned int numberOfThreads);
 
@@ -2156,7 +2166,7 @@ void setRecvGsDevF3(
 	unsigned int* neighborX,
 	unsigned int* neighborY,
 	unsigned int* neighborZ,
-	unsigned int size_Mat,
+	unsigned long long numberOfLBnodes,
 	bool isEvenTimestep,
 	unsigned int numberOfThreads);
 
@@ -2172,7 +2182,7 @@ void WallFuncDev27(unsigned int numberOfThreads,
 							  unsigned int* neighborX,
 							  unsigned int* neighborY,
 							  unsigned int* neighborZ,
-							  unsigned int size_Mat, 
+							  unsigned long long numberOfLBnodes, 
 							  bool isEvenTimestep);
 
 void SetOutputWallVelocitySP27(unsigned int numberOfThreads,
@@ -2190,7 +2200,7 @@ void SetOutputWallVelocitySP27(unsigned int numberOfThreads,
 										  unsigned int* neighborX,
 										  unsigned int* neighborY,
 										  unsigned int* neighborZ,
-										  unsigned int size_Mat,
+										  unsigned long long numberOfLBnodes,
 										  real* DD,
 										  bool isEvenTimestep);
 
@@ -2204,7 +2214,7 @@ void GetVelotoForce27(unsigned int numberOfThreads,
 								 unsigned int* neighborX,
 								 unsigned int* neighborY,
 								 unsigned int* neighborZ,
-								 unsigned int size_Mat, 
+								 unsigned long long numberOfLBnodes, 
 								 bool isEvenTimestep);
 
 void InitParticlesDevice(real* coordX,
@@ -2229,7 +2239,7 @@ void InitParticlesDevice(real* coordX,
 									unsigned int* neighborWSB,
 									int level,
 									unsigned int numberOfParticles, 
-									unsigned int size_Mat,
+									unsigned long long numberOfLBnodes,
 									unsigned int numberOfThreads);
 
 void MoveParticlesDevice(real* coordX,
@@ -2257,16 +2267,16 @@ void MoveParticlesDevice(real* coordX,
 									unsigned int timestep, 
 									unsigned int numberOfTimesteps, 
 									unsigned int numberOfParticles, 
-									unsigned int size_Mat,
+									unsigned long long numberOfLBnodes,
 									unsigned int numberOfThreads,
 									bool isEvenTimestep);
 
 void initRandomDevice(curandState* state,
-								 unsigned int size_Mat,
+								 unsigned long long numberOfLBnodes,
 								 unsigned int numberOfThreads);
 
 void generateRandomValuesDevice(curandState* state,
-										   unsigned int size_Mat,
+										   unsigned long long numberOfLBnodes,
 										   real* randArray,
 										   unsigned int numberOfThreads);
 
@@ -2285,7 +2295,7 @@ void CalcTurbulenceIntensityDevice(
    unsigned int* neighborX,
    unsigned int* neighborY,
    unsigned int* neighborZ,
-   unsigned int size_Mat, 
+   unsigned long long numberOfLBnodes, 
    bool isEvenTimestep,
    uint numberOfThreads);
 
diff --git a/src/gpu/VirtualFluids_GPU/GPU/GPU_Kernels.cuh b/src/gpu/VirtualFluids_GPU/GPU/GPU_Kernels.cuh
index 94b9704b7ca57df4cd985f5aff9521b8a087b97f..3134db44346ee7f465a5c8f04505ee5749482fbf 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/GPU_Kernels.cuh
+++ b/src/gpu/VirtualFluids_GPU/GPU/GPU_Kernels.cuh
@@ -22,7 +22,7 @@ __global__ void LB_Kernel_Casc27(real s9,
                                             unsigned int* neighborY,
                                             unsigned int* neighborZ,
                                             real* DDStart,
-                                            int size_Mat,
+                                            unsigned long long numberOfLBnodes,
                                             bool EvenOrOdd);
 
 __global__ void LB_Kernel_Casc_SP_27(  real s9,
@@ -31,7 +31,7 @@ __global__ void LB_Kernel_Casc_SP_27(  real s9,
                                                   unsigned int* neighborY,
                                                   unsigned int* neighborZ,
                                                   real* DDStart,
-                                                  int size_Mat,
+                                                  unsigned long long numberOfLBnodes,
                                                   bool EvenOrOdd);
 
 __global__ void LB_Kernel_Casc_SP_MS_27(   real s9,
@@ -40,7 +40,7 @@ __global__ void LB_Kernel_Casc_SP_MS_27(   real s9,
                                                       unsigned int* neighborY,
                                                       unsigned int* neighborZ,
                                                       real* DDStart,
-                                                      int size_Mat,
+                                                      unsigned long long numberOfLBnodes,
                                                       bool EvenOrOdd);
 
 __global__ void LB_Kernel_Casc_SP_MS_OHM_27(  real s9,
@@ -49,134 +49,134 @@ __global__ void LB_Kernel_Casc_SP_MS_OHM_27(  real s9,
                                                          unsigned int* neighborY,
                                                          unsigned int* neighborZ,
                                                          real* DDStart,
-                                                         int size_Mat,
+                                                         unsigned long long numberOfLBnodes,
                                                          bool EvenOrOdd);
 
 __global__ void LB_Kernel_Kum_New_Comp_SRT_SP_27(
-	real omega,
-	unsigned int* bcMatD,
-	unsigned int* neighborX,
-	unsigned int* neighborY,
-	unsigned int* neighborZ,
-	real* DDStart,
-	int size_Mat,
-	int level,
-	real* forces,
-	bool EvenOrOdd);
+    real omega,
+    unsigned int* bcMatD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    real* DDStart,
+    unsigned long long numberOfLBnodes,
+    int level,
+    real* forces,
+    bool EvenOrOdd);
 
 __global__ void LB_Kernel_Cumulant_D3Q27All4(real omega,
-														unsigned int* bcMatD,
-														unsigned int* neighborX,
-														unsigned int* neighborY,
-														unsigned int* neighborZ,
-														real* DDStart,
-														int size_Mat,
-														int level,
-														real* forces,
-														bool EvenOrOdd);
+                                                        unsigned int* bcMatD,
+                                                        unsigned int* neighborX,
+                                                        unsigned int* neighborY,
+                                                        unsigned int* neighborZ,
+                                                        real* DDStart,
+                                                        unsigned long long numberOfLBnodes,
+                                                        int level,
+                                                        real* forces,
+                                                        bool EvenOrOdd);
 
 
 __global__ void LB_Kernel_Kum_AA2016_Comp_Bulk_SP_27(real omega,
-																unsigned int* bcMatD,
-																unsigned int* neighborX,
-																unsigned int* neighborY,
-																unsigned int* neighborZ,
-																real* DDStart,
-																int size_Mat,
-																int level,
-																real* forces,
-																bool EvenOrOdd);
+                                                                unsigned int* bcMatD,
+                                                                unsigned int* neighborX,
+                                                                unsigned int* neighborY,
+                                                                unsigned int* neighborZ,
+                                                                real* DDStart,
+                                                                unsigned long long numberOfLBnodes,
+                                                                int level,
+                                                                real* forces,
+                                                                bool EvenOrOdd);
 
 
 
 __global__ void LB_Kernel_Kum_1h_SP_27(  real omega,
-													real deltaPhi,
-													real angularVelocity,
-													unsigned int* bcMatD,
-													unsigned int* neighborX,
-													unsigned int* neighborY,
-													unsigned int* neighborZ,
-													real* coordX,
-													real* coordY,
-													real* coordZ,
-													real* DDStart,
-													int size_Mat,
-													bool EvenOrOdd);
+                                                    real deltaPhi,
+                                                    real angularVelocity,
+                                                    unsigned int* bcMatD,
+                                                    unsigned int* neighborX,
+                                                    unsigned int* neighborY,
+                                                    unsigned int* neighborZ,
+                                                    real* coordX,
+                                                    real* coordY,
+                                                    real* coordZ,
+                                                    real* DDStart,
+                                                    unsigned long long numberOfLBnodes,
+                                                    bool EvenOrOdd);
 
 __global__ void LB_Kernel_Cascade_SP_27( real s9,
-													unsigned int* bcMatD,
-													unsigned int* neighborX,
-													unsigned int* neighborY,
-													unsigned int* neighborZ,
-													real* DDStart,
-													int size_Mat,
-													bool EvenOrOdd);
+                                                    unsigned int* bcMatD,
+                                                    unsigned int* neighborX,
+                                                    unsigned int* neighborY,
+                                                    unsigned int* neighborZ,
+                                                    real* DDStart,
+                                                    unsigned long long numberOfLBnodes,
+                                                    bool EvenOrOdd);
 
 __global__ void LB_Kernel_Kum_New_SP_27( real s9,
-													unsigned int* bcMatD,
-													unsigned int* neighborX,
-													unsigned int* neighborY,
-													unsigned int* neighborZ,
-													real* DDStart,
-													int size_Mat,
-													bool EvenOrOdd);
+                                                    unsigned int* bcMatD,
+                                                    unsigned int* neighborX,
+                                                    unsigned int* neighborY,
+                                                    unsigned int* neighborZ,
+                                                    real* DDStart,
+                                                    unsigned long long numberOfLBnodes,
+                                                    bool EvenOrOdd);
 
 __global__ void LB_Kernel_Kum_IsoTest_SP_27( real omega,
-														unsigned int* bcMatD,
-														unsigned int* neighborX,
-														unsigned int* neighborY,
-														unsigned int* neighborZ,
-														real* DDStart,
-														real* dxxUx,
-														real* dyyUy,
-														real* dzzUz,
-														int size_Mat,
-														bool EvenOrOdd);
+                                                        unsigned int* bcMatD,
+                                                        unsigned int* neighborX,
+                                                        unsigned int* neighborY,
+                                                        unsigned int* neighborZ,
+                                                        real* DDStart,
+                                                        real* dxxUx,
+                                                        real* dyyUy,
+                                                        real* dzzUz,
+                                                        unsigned long long numberOfLBnodes,
+                                                        bool EvenOrOdd);
 
 __global__ void LB_Kernel_Kum_Comp_SP_27(real s9,
-													unsigned int* bcMatD,
-													unsigned int* neighborX,
-													unsigned int* neighborY,
-													unsigned int* neighborZ,
-													real* DDStart,
-													int size_Mat,
-													bool EvenOrOdd);
+                                                    unsigned int* bcMatD,
+                                                    unsigned int* neighborX,
+                                                    unsigned int* neighborY,
+                                                    unsigned int* neighborZ,
+                                                    real* DDStart,
+                                                    unsigned long long numberOfLBnodes,
+                                                    bool EvenOrOdd);
 
 __global__ void Cumulant_One_preconditioned_errorDiffusion_chim_Comp_SP_27(
-	real omega,
-	unsigned int* bcMatD,
-	unsigned int* neighborX,
-	unsigned int* neighborY,
-	unsigned int* neighborZ,
-	real* DDStart,
-	int size_Mat,
-	int level,
-	real* forces,
-	bool EvenOrOdd);
+    real omega,
+    unsigned int* bcMatD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    real* DDStart,
+    unsigned long long numberOfLBnodes,
+    int level,
+    real* forces,
+    bool EvenOrOdd);
 
 __global__ void Cumulant_One_preconditioned_chim_Comp_SP_27(
-	real omega,
-	unsigned int* bcMatD,
-	unsigned int* neighborX,
-	unsigned int* neighborY,
-	unsigned int* neighborZ,
-	real* DDStart,
-	int size_Mat,
-	int level,
-	real* forces,
-	bool EvenOrOdd);
+    real omega,
+    unsigned int* bcMatD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    real* DDStart,
+    unsigned long long numberOfLBnodes,
+    int level,
+    real* forces,
+    bool EvenOrOdd);
 
 __global__ void Cumulant_One_chim_Comp_SP_27(
-	real omega,
-	unsigned int* bcMatD,
-	unsigned int* neighborX,
-	unsigned int* neighborY,
-	unsigned int* neighborZ,
-	real* DDStart,
-	int size_Mat,
-	int level,
-	real* forces,
-	bool EvenOrOdd);
+    real omega,
+    unsigned int* bcMatD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    real* DDStart,
+    unsigned long long numberOfLBnodes,
+    int level,
+    real* forces,
+    bool EvenOrOdd);
 
 inline __device__ void forwardChimeraWithK(real &mfa, real &mfb, real &mfc, real vv, real v2, real K);
 
@@ -189,57 +189,57 @@ inline __device__ void backwardChimeraWithK(real &mfa, real &mfb, real &mfc, rea
 
 
 __global__ void LB_Kernel_WaleBySoniMalav_Cum_AA2016_Comp_SP_27(
-	real omega_in,
-	unsigned int* bcMatD,
-	unsigned int* neighborX,
-	unsigned int* neighborY,
-	unsigned int* neighborZ,
-	unsigned int* neighborWSB,
-	real* veloX,
-	real* veloY,
-	real* veloZ,
-	real* DDStart,
-	real* turbulentViscosity,
-	int size_Mat,
-	int level,
-	real* forces,
-	bool EvenOrOdd);
+    real omega_in,
+    unsigned int* bcMatD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned int* neighborWSB,
+    real* veloX,
+    real* veloY,
+    real* veloZ,
+    real* DDStart,
+    real* turbulentViscosity,
+    unsigned long long numberOfLBnodes,
+    int level,
+    real* forces,
+    bool EvenOrOdd);
 
 
 __global__ void LB_Kernel_PM_Cum_One_Comp_SP_27( real omega,
-															unsigned int* neighborX,
-															unsigned int* neighborY,
-															unsigned int* neighborZ,
-															real* DDStart,
-															int size_Mat,
-															int level,
-															real* forces,
-															real porosity,
-															real darcy,
-															real forchheimer,
-															unsigned int sizeOfPorousMedia,
-															unsigned int* nodeIdsPorousMedia,
-															bool EvenOrOdd);
+                                                            unsigned int* neighborX,
+                                                            unsigned int* neighborY,
+                                                            unsigned int* neighborZ,
+                                                            real* DDStart,
+                                                            unsigned long long numberOfLBnodes,
+                                                            int level,
+                                                            real* forces,
+                                                            real porosity,
+                                                            real darcy,
+                                                            real forchheimer,
+                                                            unsigned int sizeOfPorousMedia,
+                                                            unsigned int* nodeIdsPorousMedia,
+                                                            bool EvenOrOdd);
 
 __global__ void LB_Kernel_AD_Incomp_7( real diffusivity,
-												  unsigned int* bcMatD,
-												  unsigned int* neighborX,
-												  unsigned int* neighborY,
-												  unsigned int* neighborZ,
-												  real* DDStart,
-												  real* DD7,
-												  int size_Mat,
-												  bool EvenOrOdd);
+                                                  unsigned int* bcMatD,
+                                                  unsigned int* neighborX,
+                                                  unsigned int* neighborY,
+                                                  unsigned int* neighborZ,
+                                                  real* DDStart,
+                                                  real* DD7,
+                                                  unsigned long long numberOfLBnodes,
+                                                  bool EvenOrOdd);
 
 __global__ void LB_Kernel_AD_Incomp_27( real diffusivity,
-												   unsigned int* bcMatD,
-												   unsigned int* neighborX,
-												   unsigned int* neighborY,
-												   unsigned int* neighborZ,
-												   real* DDStart,
-												   real* DD27,
-												   int size_Mat,
-												   bool EvenOrOdd);
+                                                   unsigned int* bcMatD,
+                                                   unsigned int* neighborX,
+                                                   unsigned int* neighborY,
+                                                   unsigned int* neighborZ,
+                                                   real* DDStart,
+                                                   real* DD27,
+                                                   unsigned long long numberOfLBnodes,
+                                                   bool EvenOrOdd);
 
 __global__ void LBInit27( int myid,
                                      int numprocs,
@@ -249,7 +249,7 @@ __global__ void LBInit27( int myid,
                                      unsigned int* neighborY,
                                      unsigned int* neighborZ,
                                      real* vParabel,
-                                     unsigned int size_Mat,
+                                     unsigned long long numberOfLBnodes,
                                      unsigned int grid_nx,
                                      unsigned int grid_ny,
                                      unsigned int grid_nz,
@@ -266,7 +266,7 @@ __global__ void LBInitNonEqPartSP27(unsigned int* neighborX,
                                                real* ux,
                                                real* uy,
                                                real* uz,
-                                               unsigned int size_Mat,
+                                               unsigned long long numberOfLBnodes,
                                                real* DD,
                                                real omega,
                                                bool EvenOrOdd);
@@ -279,7 +279,7 @@ __global__ void InitAD7( unsigned int* neighborX,
                                        real* ux,
                                        real* uy,
                                        real* uz,
-                                       unsigned int size_Mat,
+                                       unsigned long long numberOfLBnodes,
                                        real* DD7,
                                        bool EvenOrOdd);
 
@@ -291,26 +291,26 @@ __global__ void InitAD27(unsigned int* neighborX,
                                        real* ux,
                                        real* uy,
                                        real* uz,
-                                       unsigned int size_Mat,
+                                       unsigned long long numberOfLBnodes,
                                        real* DD27,
                                        bool EvenOrOdd);
 
 __global__ void LB_PostProcessor_F3_2018_Fehlberg(
-	real omega,
-	unsigned int* bcMatD,
-	unsigned int* neighborX,
-	unsigned int* neighborY,
-	unsigned int* neighborZ,
-	real* rhoOut,
-	real* vxOut,
-	real* vyOut,
-	real* vzOut,
-	real* DDStart,
-	real* G6,
-	int size_Mat,
-	int level,
-	real* forces,
-	bool EvenOrOdd);
+    real omega,
+    unsigned int* bcMatD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    real* rhoOut,
+    real* vxOut,
+    real* vyOut,
+    real* vzOut,
+    real* DDStart,
+    real* G6,
+    unsigned long long numberOfLBnodes,
+    int level,
+    real* forces,
+    bool EvenOrOdd);
 
 __global__ void LBCalcMac27( real* vxD,
                                         real* vyD,
@@ -320,7 +320,7 @@ __global__ void LBCalcMac27( real* vxD,
                                         unsigned int* neighborY,
                                         unsigned int* neighborZ,
                                         unsigned int* geoD,
-                                        unsigned int size_Mat,
+                                        unsigned long long numberOfLBnodes,
                                         real* DD,
                                         bool isEvenTimestep);
 
@@ -333,60 +333,60 @@ __global__ void LBCalcMacSP27( real* vxD,
                                           unsigned int* neighborX,
                                           unsigned int* neighborY,
                                           unsigned int* neighborZ,
-                                          unsigned int size_Mat,
+                                          unsigned long long numberOfLBnodes,
                                           real* DD,
                                           bool isEvenTimestep);
 
 __global__ void LBCalcMacCompSP27( real* vxD,
-											  real* vyD,
-											  real* vzD,
-											  real* rhoD,
-											  real* pressD,
-											  unsigned int* geoD,
-											  unsigned int* neighborX,
-											  unsigned int* neighborY,
-											  unsigned int* neighborZ,
-											  unsigned int size_Mat,
-											  real* DD,
-											  bool isEvenTimestep);
+                                              real* vyD,
+                                              real* vzD,
+                                              real* rhoD,
+                                              real* pressD,
+                                              unsigned int* geoD,
+                                              unsigned int* neighborX,
+                                              unsigned int* neighborY,
+                                              unsigned int* neighborZ,
+                                              unsigned long long numberOfLBnodes,
+                                              real* DD,
+                                              bool isEvenTimestep);
 
 __global__ void CalcConc7( real* Conc,
                                           unsigned int* geoD,
                                           unsigned int* neighborX,
                                           unsigned int* neighborY,
                                           unsigned int* neighborZ,
-                                          unsigned int size_Mat,
+                                          unsigned long long numberOfLBnodes,
                                           real* DD7,
                                           bool isEvenTimestep);
 
 __global__ void GetPlaneConc7(real* Conc,
-								            int* kPC,
-								            unsigned int numberOfPointskPC,
-											unsigned int* geoD,
-											unsigned int* neighborX,
-											unsigned int* neighborY,
-											unsigned int* neighborZ,
-											unsigned int size_Mat,
-											real* DD7,
-											bool isEvenTimestep);
+                                            int* kPC,
+                                            unsigned int numberOfPointskPC,
+                                            unsigned int* geoD,
+                                            unsigned int* neighborX,
+                                            unsigned int* neighborY,
+                                            unsigned int* neighborZ,
+                                            unsigned long long numberOfLBnodes,
+                                            real* DD7,
+                                            bool isEvenTimestep);
 
 __global__ void GetPlaneConc27(real* Conc,
-								             int* kPC,
-								             unsigned int numberOfPointskPC,
-											 unsigned int* geoD,
-											 unsigned int* neighborX,
-											 unsigned int* neighborY,
-											 unsigned int* neighborZ,
-											 unsigned int size_Mat,
-											 real* DD27,
-											 bool isEvenTimestep);
+                                             int* kPC,
+                                             unsigned int numberOfPointskPC,
+                                             unsigned int* geoD,
+                                             unsigned int* neighborX,
+                                             unsigned int* neighborY,
+                                             unsigned int* neighborZ,
+                                             unsigned long long numberOfLBnodes,
+                                             real* DD27,
+                                             bool isEvenTimestep);
 
 __global__ void CalcConc27(real* Conc,
                                           unsigned int* geoD,
                                           unsigned int* neighborX,
                                           unsigned int* neighborY,
                                           unsigned int* neighborZ,
-                                          unsigned int size_Mat,
+                                          unsigned long long numberOfLBnodes,
                                           real* DD27,
                                           bool isEvenTimestep);
 
@@ -399,38 +399,38 @@ __global__ void LBCalcMedSP27( real* vxD,
                                           unsigned int* neighborX,
                                           unsigned int* neighborY,
                                           unsigned int* neighborZ,
-                                          unsigned int size_Mat,
+                                          unsigned long long numberOfLBnodes,
                                           real* DD,
                                           bool isEvenTimestep);
 
 __global__ void LBCalcMedCompSP27( real* vxD,
-											  real* vyD,
-											  real* vzD,
-											  real* rhoD,
-											  real* pressD,
-											  unsigned int* geoD,
-											  unsigned int* neighborX,
-											  unsigned int* neighborY,
-											  unsigned int* neighborZ,
-											  unsigned int size_Mat,
-											  real* DD,
-											  bool isEvenTimestep);
+                                              real* vyD,
+                                              real* vzD,
+                                              real* rhoD,
+                                              real* pressD,
+                                              unsigned int* geoD,
+                                              unsigned int* neighborX,
+                                              unsigned int* neighborY,
+                                              unsigned int* neighborZ,
+                                              unsigned long long numberOfLBnodes,
+                                              real* DD,
+                                              bool isEvenTimestep);
 
 __global__ void LBCalcMedCompAD27(
-	real* vxD,
-	real* vyD,
-	real* vzD,
-	real* rhoD,
-	real* pressD,
-	real* concD,
-	unsigned int* geoD,
-	unsigned int* neighborX,
-	unsigned int* neighborY,
-	unsigned int* neighborZ,
-	unsigned int size_Mat,
-	real* DD,
-	real* DD_AD,
-	bool isEvenTimestep);
+    real* vxD,
+    real* vyD,
+    real* vzD,
+    real* rhoD,
+    real* pressD,
+    real* concD,
+    unsigned int* geoD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    real* DD,
+    real* DD_AD,
+    bool isEvenTimestep);
 
 __global__ void LBCalcMacMedSP27( real* vxD,
                                              real* vyD,
@@ -442,119 +442,119 @@ __global__ void LBCalcMacMedSP27( real* vxD,
                                              unsigned int* neighborY,
                                              unsigned int* neighborZ,
                                              unsigned int tdiff,
-                                             unsigned int size_Mat,
+                                             unsigned long long numberOfLBnodes,
                                              bool isEvenTimestep);
 
 __global__ void LBResetMedianValuesSP27(
-	real* vxD,
-	real* vyD,
-	real* vzD,
-	real* rhoD,
-	real* pressD,
-	unsigned int size_Mat,
-	bool isEvenTimestep);
+    real* vxD,
+    real* vyD,
+    real* vzD,
+    real* rhoD,
+    real* pressD,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep);
 
 __global__ void LBResetMedianValuesAD27(
-	real* vxD,
-	real* vyD,
-	real* vzD,
-	real* rhoD,
-	real* pressD,
-	real* concD,
-	unsigned int size_Mat,
-	bool isEvenTimestep);
+    real* vxD,
+    real* vyD,
+    real* vzD,
+    real* rhoD,
+    real* pressD,
+    real* concD,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep);
 
 __global__ void LBCalc2ndMomentsIncompSP27(  real* kxyFromfcNEQ,
-														real* kyzFromfcNEQ,
-														real* kxzFromfcNEQ,
-														real* kxxMyyFromfcNEQ,
-														real* kxxMzzFromfcNEQ,
-														unsigned int* geoD,
-														unsigned int* neighborX,
-														unsigned int* neighborY,
-														unsigned int* neighborZ,
-														unsigned int size_Mat,
-														real* DD,
-														bool isEvenTimestep);
+                                                        real* kyzFromfcNEQ,
+                                                        real* kxzFromfcNEQ,
+                                                        real* kxxMyyFromfcNEQ,
+                                                        real* kxxMzzFromfcNEQ,
+                                                        unsigned int* geoD,
+                                                        unsigned int* neighborX,
+                                                        unsigned int* neighborY,
+                                                        unsigned int* neighborZ,
+                                                        unsigned long long numberOfLBnodes,
+                                                        real* DD,
+                                                        bool isEvenTimestep);
 
 __global__ void LBCalc2ndMomentsCompSP27(real* kxyFromfcNEQ,
-													real* kyzFromfcNEQ,
-													real* kxzFromfcNEQ,
-													real* kxxMyyFromfcNEQ,
-													real* kxxMzzFromfcNEQ,
-													unsigned int* geoD,
-													unsigned int* neighborX,
-													unsigned int* neighborY,
-													unsigned int* neighborZ,
-													unsigned int size_Mat,
-													real* DD,
-													bool isEvenTimestep);
+                                                    real* kyzFromfcNEQ,
+                                                    real* kxzFromfcNEQ,
+                                                    real* kxxMyyFromfcNEQ,
+                                                    real* kxxMzzFromfcNEQ,
+                                                    unsigned int* geoD,
+                                                    unsigned int* neighborX,
+                                                    unsigned int* neighborY,
+                                                    unsigned int* neighborZ,
+                                                    unsigned long long numberOfLBnodes,
+                                                    real* DD,
+                                                    bool isEvenTimestep);
 
 __global__ void LBCalc3rdMomentsIncompSP27(  real* CUMbbb,
-														real* CUMabc,
-														real* CUMbac,
-														real* CUMbca,
-														real* CUMcba,
-														real* CUMacb,
-														real* CUMcab,
-														unsigned int* bcMatD,
-														unsigned int* neighborX,
-														unsigned int* neighborY,
-														unsigned int* neighborZ,
-														real* DDStart,
-														int size_Mat,
-														bool EvenOrOdd);
+                                                        real* CUMabc,
+                                                        real* CUMbac,
+                                                        real* CUMbca,
+                                                        real* CUMcba,
+                                                        real* CUMacb,
+                                                        real* CUMcab,
+                                                        unsigned int* bcMatD,
+                                                        unsigned int* neighborX,
+                                                        unsigned int* neighborY,
+                                                        unsigned int* neighborZ,
+                                                        real* DDStart,
+                                                        unsigned long long numberOfLBnodes,
+                                                        bool EvenOrOdd);
 
 __global__ void LBCalc3rdMomentsCompSP27(real* CUMbbb,
-													real* CUMabc,
-													real* CUMbac,
-													real* CUMbca,
-													real* CUMcba,
-													real* CUMacb,
-													real* CUMcab,
-													unsigned int* bcMatD,
-													unsigned int* neighborX,
-													unsigned int* neighborY,
-													unsigned int* neighborZ,
-													real* DDStart,
-													int size_Mat,
-													bool EvenOrOdd);
+                                                    real* CUMabc,
+                                                    real* CUMbac,
+                                                    real* CUMbca,
+                                                    real* CUMcba,
+                                                    real* CUMacb,
+                                                    real* CUMcab,
+                                                    unsigned int* bcMatD,
+                                                    unsigned int* neighborX,
+                                                    unsigned int* neighborY,
+                                                    unsigned int* neighborZ,
+                                                    real* DDStart,
+                                                    unsigned long long numberOfLBnodes,
+                                                    bool EvenOrOdd);
 
 __global__ void LBCalcHigherMomentsIncompSP27(   real* CUMcbb,
-															real* CUMbcb,
-															real* CUMbbc,
-															real* CUMcca,
-															real* CUMcac,
-															real* CUMacc,
-															real* CUMbcc,
-															real* CUMcbc,
-															real* CUMccb,
-															real* CUMccc,
-															unsigned int* bcMatD,
-															unsigned int* neighborX,
-															unsigned int* neighborY,
-															unsigned int* neighborZ,
-															real* DDStart,
-															int size_Mat,
-															bool EvenOrOdd);
+                                                            real* CUMbcb,
+                                                            real* CUMbbc,
+                                                            real* CUMcca,
+                                                            real* CUMcac,
+                                                            real* CUMacc,
+                                                            real* CUMbcc,
+                                                            real* CUMcbc,
+                                                            real* CUMccb,
+                                                            real* CUMccc,
+                                                            unsigned int* bcMatD,
+                                                            unsigned int* neighborX,
+                                                            unsigned int* neighborY,
+                                                            unsigned int* neighborZ,
+                                                            real* DDStart,
+                                                            unsigned long long numberOfLBnodes,
+                                                            bool EvenOrOdd);
 
 __global__ void LBCalcHigherMomentsCompSP27( real* CUMcbb,
-														real* CUMbcb,
-														real* CUMbbc,
-														real* CUMcca,
-														real* CUMcac,
-														real* CUMacc,
-														real* CUMbcc,
-														real* CUMcbc,
-														real* CUMccb,
-														real* CUMccc,
-														unsigned int* bcMatD,
-														unsigned int* neighborX,
-														unsigned int* neighborY,
-														unsigned int* neighborZ,
-														real* DDStart,
-														int size_Mat,
-														bool EvenOrOdd);
+                                                        real* CUMbcb,
+                                                        real* CUMbbc,
+                                                        real* CUMcca,
+                                                        real* CUMcac,
+                                                        real* CUMacc,
+                                                        real* CUMbcc,
+                                                        real* CUMcbc,
+                                                        real* CUMccb,
+                                                        real* CUMccc,
+                                                        unsigned int* bcMatD,
+                                                        unsigned int* neighborX,
+                                                        unsigned int* neighborY,
+                                                        unsigned int* neighborZ,
+                                                        real* DDStart,
+                                                        unsigned long long numberOfLBnodes,
+                                                        bool EvenOrOdd);
 
 __global__ void LBCalcMeasurePoints(real* vxMP,
                                                real* vyMP,
@@ -568,7 +568,7 @@ __global__ void LBCalcMeasurePoints(real* vxMP,
                                                unsigned int* neighborX,
                                                unsigned int* neighborY,
                                                unsigned int* neighborZ,
-                                               unsigned int size_Mat,
+                                               unsigned long long numberOfLBnodes,
                                                real* DD,
                                                bool isEvenTimestep);
 
@@ -580,7 +580,7 @@ __global__ void LB_BC_Press_East27( int nx,
                                                unsigned int* neighborY,
                                                unsigned int* neighborZ,
                                                real* DD,
-                                               unsigned int size_Mat,
+                                               unsigned long long numberOfLBnodes,
                                                bool isEvenTimestep) ;
 
 __global__ void LB_BC_Vel_West_27( int nx,
@@ -592,7 +592,7 @@ __global__ void LB_BC_Vel_West_27( int nx,
                                               unsigned int* neighborY,
                                               unsigned int* neighborZ,
                                               real* DD,
-                                              unsigned int size_Mat,
+                                              unsigned long long numberOfLBnodes,
                                               bool isEvenTimestep,
                                               real u0x,
                                               unsigned int grid_nx,
@@ -608,64 +608,64 @@ __global__ void QDevice27(real* distributions,
                                      unsigned int* neighborX,
                                      unsigned int* neighborY,
                                      unsigned int* neighborZ,
-                                     unsigned int numberOfLBnodes,
+                                     unsigned long long numberOfLBnodes,
                                      bool isEvenTimestep);
 
 __global__ void QDeviceComp27(
-										 real* distributions,
-										 int* subgridDistanceIndices,
-										 real* subgridDistances,
-										 unsigned int numberOfBCnodes,
-										 real omega,
-										 unsigned int* neighborX,
-										 unsigned int* neighborY,
-										 unsigned int* neighborZ,
-										 unsigned int numberOfLBnodes,
-										 bool isEvenTimestep);
+                                         real* distributions,
+                                         int* subgridDistanceIndices,
+                                         real* subgridDistances,
+                                         unsigned int numberOfBCnodes,
+                                         real omega,
+                                         unsigned int* neighborX,
+                                         unsigned int* neighborY,
+                                         unsigned int* neighborZ,
+                                         unsigned long long numberOfLBnodes,
+                                         bool isEvenTimestep);
 
 __global__ void QDeviceCompThinWallsPartOne27(real* DD,
-														 int* k_Q,
-														 real* QQ,
-														 unsigned int numberOfBCnodes,
-														 real om1,
-														 unsigned int* neighborX,
-														 unsigned int* neighborY,
-														 unsigned int* neighborZ,
-														 unsigned int size_Mat,
-														 bool isEvenTimestep);
-
-__global__ void QDevice3rdMomentsComp27(	 real* distributions, 
-													 int* subgridDistanceIndices, 
-													 real* subgridDistances,
-													 unsigned int numberOfBCnodes, 
-													 real omega, 
-													 unsigned int* neighborX,
-													 unsigned int* neighborY,
-													 unsigned int* neighborZ,
-													 unsigned int numberOfLBnodes, 
-													 bool isEvenTimestep);
+                                                         int* k_Q,
+                                                         real* QQ,
+                                                         unsigned int numberOfBCnodes,
+                                                         real om1,
+                                                         unsigned int* neighborX,
+                                                         unsigned int* neighborY,
+                                                         unsigned int* neighborZ,
+                                                         unsigned long long numberOfLBnodes,
+                                                         bool isEvenTimestep);
+
+__global__ void QDevice3rdMomentsComp27(	 real* distributions,
+                                                     int* subgridDistanceIndices,
+                                                     real* subgridDistances,
+                                                     unsigned int numberOfBCnodes,
+                                                     real omega,
+                                                     unsigned int* neighborX,
+                                                     unsigned int* neighborY,
+                                                     unsigned int* neighborZ,
+                                                     unsigned long long numberOfLBnodes,
+                                                     bool isEvenTimestep);
 
 __global__ void QDeviceIncompHighNu27(real* DD,
-												 int* k_Q,
-												 real* QQ,
-												 unsigned int numberOfBCnodes,
-												 real om1,
-												 unsigned int* neighborX,
-												 unsigned int* neighborY,
-												 unsigned int* neighborZ,
-												 unsigned int numberOfLBnodes,
-												 bool isEvenTimestep);
+                                                 int* k_Q,
+                                                 real* QQ,
+                                                 unsigned int numberOfBCnodes,
+                                                 real om1,
+                                                 unsigned int* neighborX,
+                                                 unsigned int* neighborY,
+                                                 unsigned int* neighborZ,
+                                                 unsigned long long numberOfLBnodes,
+                                                 bool isEvenTimestep);
 
 __global__ void QDeviceCompHighNu27(	 real* DD,
-												 int* k_Q,
-												 real* QQ,
-												 unsigned int numberOfBCnodes,
-												 real om1,
-												 unsigned int* neighborX,
-												 unsigned int* neighborY,
-												 unsigned int* neighborZ,
-												 unsigned int size_Mat,
-												 bool isEvenTimestep);
+                                                 int* k_Q,
+                                                 real* QQ,
+                                                 unsigned int numberOfBCnodes,
+                                                 real om1,
+                                                 unsigned int* neighborX,
+                                                 unsigned int* neighborY,
+                                                 unsigned int* neighborZ,
+                                                 unsigned long long numberOfLBnodes,
+                                                 bool isEvenTimestep);
 
 //Velocity BCs
 __global__ void QVelDevPlainBB27(
@@ -679,43 +679,43 @@ __global__ void QVelDevPlainBB27(
     uint* neighborX,
     uint* neighborY,
     uint* neighborZ,
-    uint numberOfLBnodes,
+    unsigned long long numberOfLBnodes,
     bool isEvenTimestep);
 
 __global__ void QVelDevCouette27(real* vx,
-											real* vy,
-											real* vz,
-											real* DD,
-											int* k_Q,
-											real* QQ,
-											unsigned int numberOfBCnodes,
-											real om1,
-											unsigned int* neighborX,
-											unsigned int* neighborY,
-											unsigned int* neighborZ,
-											unsigned int size_Mat,
-											bool isEvenTimestep);
+                                            real* vy,
+                                            real* vz,
+                                            real* DD,
+                                            int* k_Q,
+                                            real* QQ,
+                                            unsigned int numberOfBCnodes,
+                                            real om1,
+                                            unsigned int* neighborX,
+                                            unsigned int* neighborY,
+                                            unsigned int* neighborZ,
+                                            unsigned long long numberOfLBnodes,
+                                            bool isEvenTimestep);
 
 __global__ void QVelDev1h27( int inx,
-										int iny,
-										real* vx,
-										real* vy,
-										real* vz,
-										real* DD,
-										int* k_Q,
-										real* QQ,
-										unsigned int numberOfBCnodes,
-										real om1,
-										real Phi,
-										real angularVelocity,
-										unsigned int* neighborX,
-										unsigned int* neighborY,
-										unsigned int* neighborZ,
-										real* coordX,
-										real* coordY,
-										real* coordZ,
-										unsigned int size_Mat,
-										bool isEvenTimestep);
+                                        int iny,
+                                        real* vx,
+                                        real* vy,
+                                        real* vz,
+                                        real* DD,
+                                        int* k_Q,
+                                        real* QQ,
+                                        unsigned int numberOfBCnodes,
+                                        real om1,
+                                        real Phi,
+                                        real angularVelocity,
+                                        unsigned int* neighborX,
+                                        unsigned int* neighborY,
+                                        unsigned int* neighborZ,
+                                        real* coordX,
+                                        real* coordY,
+                                        real* coordZ,
+                                        unsigned long long numberOfLBnodes,
+                                        bool isEvenTimestep);
 
 __global__ void QVelDevice27(int inx,
                                         int iny,
@@ -730,111 +730,111 @@ __global__ void QVelDevice27(int inx,
                                         unsigned int* neighborX,
                                         unsigned int* neighborY,
                                         unsigned int* neighborZ,
-                                        unsigned int size_Mat,
+                                        unsigned long long numberOfLBnodes,
                                         bool isEvenTimestep);
 
 __global__ void QVelDeviceCompPlusSlip27(real* vx,
-													real* vy,
-													real* vz,
-													real* DD,
-													int* k_Q,
-													real* QQ,
-													unsigned int numberOfBCnodes,
-													real om1,
-													unsigned int* neighborX,
-													unsigned int* neighborY,
-													unsigned int* neighborZ,
-													unsigned int size_Mat,
-													bool isEvenTimestep);
+                                                    real* vy,
+                                                    real* vz,
+                                                    real* DD,
+                                                    int* k_Q,
+                                                    real* QQ,
+                                                    unsigned int numberOfBCnodes,
+                                                    real om1,
+                                                    unsigned int* neighborX,
+                                                    unsigned int* neighborY,
+                                                    unsigned int* neighborZ,
+                                                    unsigned long long numberOfLBnodes,
+                                                    bool isEvenTimestep);
 
 __global__ void QVelDeviceComp27(real* velocityX,
-											real* velocityY,
-											real* velocityZ,
-											real* distribution,
-											int* subgridDistanceIndices,
-											real* subgridDistances,
-											unsigned int numberOfBCnodes,
-											real omega,
-											unsigned int* neighborX,
-											unsigned int* neighborY,
-											unsigned int* neighborZ,
-											unsigned int numberOfLBnodes,
-											bool isEvenTimestep);
+                                            real* velocityY,
+                                            real* velocityZ,
+                                            real* distribution,
+                                            int* subgridDistanceIndices,
+                                            real* subgridDistances,
+                                            unsigned int numberOfBCnodes,
+                                            real omega,
+                                            unsigned int* neighborX,
+                                            unsigned int* neighborY,
+                                            unsigned int* neighborZ,
+                                            unsigned long long numberOfLBnodes,
+                                            bool isEvenTimestep);
 
 __global__ void QVelDeviceCompThinWallsPartOne27(
-	real* vx,
-	real* vy,
-	real* vz,
-	real* DD,
-	int* k_Q,
-	real* QQ,
-	uint numberOfBCnodes,
-	real om1,
-	uint* neighborX,
-	uint* neighborY,
-	uint* neighborZ,
-	uint size_Mat,
-	bool isEvenTimestep);
+    real* vx,
+    real* vy,
+    real* vz,
+    real* DD,
+    int* k_Q,
+    real* QQ,
+    uint numberOfBCnodes,
+    real om1,
+    uint* neighborX,
+    uint* neighborY,
+    uint* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep);
 
 __global__ void QThinWallsPartTwo27(
-	real* DD,
-	int* k_Q,
-	real* QQ,
-	uint numberOfBCnodes,
-	uint* geom,
-	uint* neighborX,
-	uint* neighborY,
-	uint* neighborZ,
-	uint* neighborWSB,
-	uint size_Mat,
-	bool isEvenTimestep);
+    real* DD,
+    int* k_Q,
+    real* QQ,
+    uint numberOfBCnodes,
+    uint* geom,
+    uint* neighborX,
+    uint* neighborY,
+    uint* neighborZ,
+    uint* neighborWSB,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep);
 
 __global__ void QVelDeviceCompZeroPress27(
-	real* velocityX,
-	real* velocityY,
-	real* velocityZ,
-	real* distribution,
-	int* subgridDistanceIndices,
-	real* subgridDistances,
-	unsigned int numberOfBCnodes,
-	real omega,
-	unsigned int* neighborX,
-	unsigned int* neighborY,
-	unsigned int* neighborZ,
-	unsigned int numberOfLBnodes,
-	bool isEvenTimestep);
+    real* velocityX,
+    real* velocityY,
+    real* velocityZ,
+    real* distribution,
+    int* subgridDistanceIndices,
+    real* subgridDistances,
+    unsigned int numberOfBCnodes,
+    real omega,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep);
 
 __global__ void QVelDeviceIncompHighNu27(real* vx,
-													real* vy,
-													real* vz,
-													real* DD,
-													int* k_Q,
-													real* QQ,
-													unsigned int numberOfBCnodes,
-													real om1,
-													unsigned int* neighborX,
-													unsigned int* neighborY,
-													unsigned int* neighborZ,
-													unsigned int size_Mat,
-													bool isEvenTimestep);
+                                                    real* vy,
+                                                    real* vz,
+                                                    real* DD,
+                                                    int* k_Q,
+                                                    real* QQ,
+                                                    unsigned int numberOfBCnodes,
+                                                    real om1,
+                                                    unsigned int* neighborX,
+                                                    unsigned int* neighborY,
+                                                    unsigned int* neighborZ,
+                                                    unsigned long long numberOfLBnodes,
+                                                    bool isEvenTimestep);
 
 __global__ void QVelDeviceCompHighNu27(	real* vx,
-													real* vy,
-													real* vz,
-													real* DD,
-													int* k_Q,
-													real* QQ,
-													unsigned int numberOfBCnodes,
-													real om1,
-													unsigned int* neighborX,
-													unsigned int* neighborY,
-													unsigned int* neighborZ,
-													unsigned int size_Mat,
-													bool isEvenTimestep);
+                                                    real* vy,
+                                                    real* vz,
+                                                    real* DD,
+                                                    int* k_Q,
+                                                    real* QQ,
+                                                    unsigned int numberOfBCnodes,
+                                                    real om1,
+                                                    unsigned int* neighborX,
+                                                    unsigned int* neighborY,
+                                                    unsigned int* neighborZ,
+                                                    unsigned long long numberOfLBnodes,
+                                                    bool isEvenTimestep);
 
 __global__ void QVeloDeviceEQ27(real* VeloX,
-										   real* VeloY,
-										   real* VeloZ,
+                                           real* VeloY,
+                                           real* VeloZ,
                                            real* DD,
                                            int* k_Q,
                                            int numberOfBCnodes,
@@ -842,22 +842,22 @@ __global__ void QVeloDeviceEQ27(real* VeloX,
                                            unsigned int* neighborX,
                                            unsigned int* neighborY,
                                            unsigned int* neighborZ,
-                                           unsigned int size_Mat,
+                                           unsigned long long numberOfLBnodes,
                                            bool isEvenTimestep);
 
 __global__ void QVeloStreetDeviceEQ27(
-	real* veloXfraction,
-	real* veloYfraction,
-	int*  naschVelo,
-	real* DD,
-	int*  naschIndex,
-	int   numberOfStreetNodes,
-	real  velocityRatio,
-	uint* neighborX,
-	uint* neighborY,
-	uint* neighborZ,
-	uint  size_Mat,
-	bool  isEvenTimestep);
+    real* veloXfraction,
+    real* veloYfraction,
+    int*  naschVelo,
+    real* DD,
+    int*  naschIndex,
+    int   numberOfStreetNodes,
+    real  velocityRatio,
+    uint* neighborX,
+    uint* neighborY,
+    uint* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool  isEvenTimestep);
 
 //Slip BCs
 __global__ void QSlipDevice27(real* DD,
@@ -868,139 +868,150 @@ __global__ void QSlipDevice27(real* DD,
                                          unsigned int* neighborX,
                                          unsigned int* neighborY,
                                          unsigned int* neighborZ,
-                                         unsigned int size_Mat,
+                                         unsigned long long numberOfLBnodes,
                                          bool isEvenTimestep);
 
 __global__ void QSlipDeviceComp27(real* DD,
-											 int* k_Q,
-											 real* QQ,
-											 unsigned int numberOfBCnodes,
-											 real om1,
-											 unsigned int* neighborX,
-											 unsigned int* neighborY,
-											 unsigned int* neighborZ,
-											 unsigned int size_Mat,
-											 bool isEvenTimestep);
+                                             int* k_Q,
+                                             real* QQ,
+                                             unsigned int numberOfBCnodes,
+                                             real om1,
+                                             unsigned int* neighborX,
+                                             unsigned int* neighborY,
+                                             unsigned int* neighborZ,
+                                             unsigned long long numberOfLBnodes,
+                                             bool isEvenTimestep);
 
 __global__ void QSlipDeviceComp27TurbViscosity(
-                                    real* distributions, 
-                                    int* subgridDistanceIndices, 
+                                    real* distributions,
+                                    int* subgridDistanceIndices,
                                     real* subgridDistances,
                                     unsigned int numberOfBCnodes,
-                                    real omega, 
+                                    real omega,
                                     unsigned int* neighborX,
                                     unsigned int* neighborY,
                                     unsigned int* neighborZ,
                                     real* turbViscosity,
-                                    unsigned int numberOfLBnodes, 
+                                    unsigned long long numberOfLBnodes,
                                     bool isEvenTimestep);
 
 __global__ void QSlipPressureDeviceComp27TurbViscosity(
-                                    real* distributions, 
-                                    int* subgridDistanceIndices, 
+                                    real* distributions,
+                                    int* subgridDistanceIndices,
                                     real* subgridDistances,
                                     unsigned int numberOfBCnodes,
-                                    real omega, 
+                                    real omega,
                                     unsigned int* neighborX,
                                     unsigned int* neighborY,
                                     unsigned int* neighborZ,
                                     real* turbViscosity,
-                                    unsigned int numberOfLBnodes, 
+                                    unsigned long long numberOfLBnodes,
                                     bool isEvenTimestep);
 
 __global__ void QSlipGeomDeviceComp27(real* DD,
-												 int* k_Q,
-												 real* QQ,
-												 unsigned int numberOfBCnodes,
-												 real om1,
-												 real* NormalX,
-												 real* NormalY,
-												 real* NormalZ,
-												 unsigned int* neighborX,
-												 unsigned int* neighborY,
-												 unsigned int* neighborZ,
-												 unsigned int size_Mat,
-												 bool isEvenTimestep);
+                                                 int* k_Q,
+                                                 real* QQ,
+                                                 unsigned int numberOfBCnodes,
+                                                 real om1,
+                                                 real* NormalX,
+                                                 real* NormalY,
+                                                 real* NormalZ,
+                                                 unsigned int* neighborX,
+                                                 unsigned int* neighborY,
+                                                 unsigned int* neighborZ,
+                                                 unsigned long long numberOfLBnodes,
+                                                 bool isEvenTimestep);
 
 __global__ void QSlipNormDeviceComp27(real* DD,
-												 int* k_Q,
-												 real* QQ,
-												 unsigned int numberOfBCnodes,
-												 real om1,
-												 real* NormalX,
-												 real* NormalY,
-												 real* NormalZ,
-												 unsigned int* neighborX,
-												 unsigned int* neighborY,
-												 unsigned int* neighborZ,
-												 unsigned int size_Mat,
-												 bool isEvenTimestep);
+                                                 int* k_Q,
+                                                 real* QQ,
+                                                 unsigned int numberOfBCnodes,
+                                                 real om1,
+                                                 real* NormalX,
+                                                 real* NormalY,
+                                                 real* NormalZ,
+                                                 unsigned int* neighborX,
+                                                 unsigned int* neighborY,
+                                                 unsigned int* neighborZ,
+                                                 unsigned long long numberOfLBnodes,
+                                                 bool isEvenTimestep);
+
+__global__ void BBSlipDeviceComp27(
+    real* distributions,
+    int* subgridDistanceIndices,
+    real* subgridDistances,
+    unsigned int numberOfBCnodes,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep);
 
 // Stress BCs (wall model)
 __global__ void QStressDeviceComp27(real* DD,
-											   int* k_Q,
-											 int* k_N,
-											 real* QQ,
-											 unsigned int numberOfBCnodes,
-											 real om1,
-											 real* turbViscosity,
-										     real* vx,
-											 real* vy,
-                                    	     real* vz,
-											 real* normalX,
-											 real* normalY,
-                                    	     real* normalZ,
-											 real* vx_bc,
-											 real* vy_bc,
-                                    	     real* vz_bc,
-											 real* vx1,
-                                    		 real* vy1,
-                                    		 real* vz1,
-											 int* samplingOffset,
-											 real* z0,
-											 bool  hasWallModelMonitor,
-											real* u_star_monitor,
-											real* Fx_monitor,
-											real* Fy_monitor,
-											real* Fz_monitor,
-											 unsigned int* neighborX,
-											 unsigned int* neighborY,
-											 unsigned int* neighborZ,
-											 unsigned int size_Mat,
-											 bool isEvenTimestep);
+                                               int* k_Q,
+                                             int* k_N,
+                                             real* QQ,
+                                             unsigned int numberOfBCnodes,
+                                             real om1,
+                                             real* turbViscosity,
+                                             real* vx,
+                                             real* vy,
+                                             real* vz,
+                                             real* normalX,
+                                             real* normalY,
+                                             real* normalZ,
+                                             real* vx_bc,
+                                             real* vy_bc,
+                                             real* vz_bc,
+                                             real* vx1,
+                                             real* vy1,
+                                             real* vz1,
+                                             int* samplingOffset,
+                                             real* z0,
+                                             bool  hasWallModelMonitor,
+                                            real* u_star_monitor,
+                                            real* Fx_monitor,
+                                            real* Fy_monitor,
+                                            real* Fz_monitor,
+                                             unsigned int* neighborX,
+                                             unsigned int* neighborY,
+                                             unsigned int* neighborZ,
+                                             unsigned long long numberOfLBnodes,
+                                             bool isEvenTimestep);
 
 __global__ void BBStressDevice27( real* DD,
-												int* k_Q,
-												int* k_N,
-												real* QQ,
-												unsigned int numberOfBCnodes,
-												real* vx,
-												real* vy,
-												real* vz,
-												real* normalX,
-												real* normalY,
-												real* normalZ,
-												real* vx_bc,
-												real* vy_bc,
-												real* vz_bc,
-												real* vx1,
-												real* vy1,
-												real* vz1,
-												int* samplingOffset,
-												real* z0,
-												bool  hasWallModelMonitor,
-												real* u_star_monitor,
-												real* Fx_monitor,
-												real* Fy_monitor,
-												real* Fz_monitor,
-												unsigned int* neighborX,
-												unsigned int* neighborY,
-												unsigned int* neighborZ,
-												unsigned int size_Mat,
-												bool isEvenTimestep);
+                                                int* k_Q,
+                                                int* k_N,
+                                                real* QQ,
+                                                unsigned int numberOfBCnodes,
+                                                real* vx,
+                                                real* vy,
+                                                real* vz,
+                                                real* normalX,
+                                                real* normalY,
+                                                real* normalZ,
+                                                real* vx_bc,
+                                                real* vy_bc,
+                                                real* vz_bc,
+                                                real* vx1,
+                                                real* vy1,
+                                                real* vz1,
+                                                int* samplingOffset,
+                                                real* z0,
+                                                bool  hasWallModelMonitor,
+                                                real* u_star_monitor,
+                                                real* Fx_monitor,
+                                                real* Fy_monitor,
+                                                real* Fz_monitor,
+                                                unsigned int* neighborX,
+                                                unsigned int* neighborY,
+                                                unsigned int* neighborZ,
+                                                unsigned long long numberOfLBnodes,
+                                                bool isEvenTimestep);
 
 __global__ void BBStressPressureDevice27( real* DD,
-											            int* k_Q,
+                                                        int* k_Q,
                                              int* k_N,
                                              real* QQ,
                                              unsigned int  numberOfBCnodes,
@@ -1026,7 +1037,7 @@ __global__ void BBStressPressureDevice27( real* DD,
                                              unsigned int* neighborX,
                                              unsigned int* neighborY,
                                              unsigned int* neighborZ,
-                                             unsigned int size_Mat,
+                                             unsigned long long numberOfLBnodes,
                                              bool isEvenTimestep);
 
 //Pressure BCs
@@ -1039,23 +1050,23 @@ __global__ void QPressDevice27( real* rhoBC,
                                            unsigned int* neighborX,
                                            unsigned int* neighborY,
                                            unsigned int* neighborZ,
-                                           unsigned int size_Mat,
+                                           unsigned long long numberOfLBnodes,
                                            bool isEvenTimestep);
 
 __global__ void QPressDeviceAntiBB27(   real* rhoBC,
-												   real* vx,
-												   real* vy,
-												   real* vz,
-												   real* DD,
-												   int* k_Q,
-												   real* QQ,
-												   int numberOfBCnodes,
-												   real om1,
-												   unsigned int* neighborX,
-												   unsigned int* neighborY,
-												   unsigned int* neighborZ,
-												   unsigned int size_Mat,
-												   bool isEvenTimestep);
+                                                   real* vx,
+                                                   real* vy,
+                                                   real* vz,
+                                                   real* DD,
+                                                   int* k_Q,
+                                                   real* QQ,
+                                                   int numberOfBCnodes,
+                                                   real om1,
+                                                   unsigned int* neighborX,
+                                                   unsigned int* neighborY,
+                                                   unsigned int* neighborZ,
+                                                   unsigned long long numberOfLBnodes,
+                                                   bool isEvenTimestep);
 
 __global__ void QPressDeviceFixBackflow27( real* rhoBC,
                                                       real* DD,
@@ -1065,7 +1076,7 @@ __global__ void QPressDeviceFixBackflow27( real* rhoBC,
                                                       unsigned int* neighborX,
                                                       unsigned int* neighborY,
                                                       unsigned int* neighborZ,
-                                                      unsigned int size_Mat,
+                                                      unsigned long long numberOfLBnodes,
                                                       bool isEvenTimestep);
 
 __global__ void QPressDeviceDirDepBot27(  real* rhoBC,
@@ -1076,32 +1087,47 @@ __global__ void QPressDeviceDirDepBot27(  real* rhoBC,
                                                      unsigned int* neighborX,
                                                      unsigned int* neighborY,
                                                      unsigned int* neighborZ,
-                                                     unsigned int size_Mat,
+                                                     unsigned long long numberOfLBnodes,
                                                      bool isEvenTimestep);
 
 __global__ void QPressNoRhoDevice27(  real* rhoBC,
-												 real* DD,
-												 int* k_Q,
-												 int* k_N,
-												 int numberOfBCnodes,
-												 real om1,
-												 unsigned int* neighborX,
-												 unsigned int* neighborY,
-												 unsigned int* neighborZ,
-												 unsigned int size_Mat,
-												 bool isEvenTimestep);
+                                                 real* distributions,
+                                                 int* k_Q,
+                                                 int* k_N,
+                                                 int numberOfBCnodes,
+                                                 real om1,
+                                                 unsigned int* neighborX,
+                                                 unsigned int* neighborY,
+                                                 unsigned int* neighborZ,
+                                                 unsigned long long numberOfLBnodes,
+                                                 bool isEvenTimestep,
+                                                 int direction);
+
+//! \brief Declaration only: pressure-BC kernel taking the QPressNoRhoDevice27 parameter list plus a trailing densityCorrectionFactor.
+__global__ void QPressZeroRhoOutflowDevice27(
+    real* rhoBC,
+    real* distributions,
+    int* k_Q,
+    int* k_N,
+    int numberOfBCnodes,
+    real om1,
+    unsigned int* neighborX, unsigned int* neighborY, unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep,
+    int direction,
+    real densityCorrectionFactor);
 
 __global__ void QInflowScaleByPressDevice27(  real* rhoBC,
-														 real* DD,
-														 int* k_Q,
-														 int* k_N,
-														 int numberOfBCnodes,
-														 real om1,
-														 unsigned int* neighborX,
-														 unsigned int* neighborY,
-														 unsigned int* neighborZ,
-														 unsigned int size_Mat,
-														 bool isEvenTimestep);
+                                                         real* DD,
+                                                         int* k_Q,
+                                                         int* k_N,
+                                                         int numberOfBCnodes,
+                                                         real om1,
+                                                         unsigned int* neighborX,
+                                                         unsigned int* neighborY,
+                                                         unsigned int* neighborZ,
+                                                         unsigned long long numberOfLBnodes,
+                                                         bool isEvenTimestep);
 
 __global__ void QPressDeviceOld27(real* rhoBC,
                                              real* DD,
@@ -1112,20 +1138,20 @@ __global__ void QPressDeviceOld27(real* rhoBC,
                                              unsigned int* neighborX,
                                              unsigned int* neighborY,
                                              unsigned int* neighborZ,
-                                             unsigned int size_Mat,
+                                             unsigned long long numberOfLBnodes,
                                              bool isEvenTimestep);
 
 __global__ void QPressDeviceIncompNEQ27( real* rhoBC,
-													real* DD,
-													int* k_Q,
-													int* k_N,
-													int numberOfBCnodes,
-													real om1,
-													unsigned int* neighborX,
-													unsigned int* neighborY,
-													unsigned int* neighborZ,
-													unsigned int size_Mat,
-													bool isEvenTimestep);
+                                                    real* DD,
+                                                    int* k_Q,
+                                                    int* k_N,
+                                                    int numberOfBCnodes,
+                                                    real om1,
+                                                    unsigned int* neighborX,
+                                                    unsigned int* neighborY,
+                                                    unsigned int* neighborZ,
+                                                    unsigned long long numberOfLBnodes,
+                                                    bool isEvenTimestep);
 
 __global__ void QPressDeviceNEQ27(real* rhoBC,
                                              real* distribution,
@@ -1136,7 +1162,7 @@ __global__ void QPressDeviceNEQ27(real* rhoBC,
                                              unsigned int* neighborX,
                                              unsigned int* neighborY,
                                              unsigned int* neighborZ,
-                                             unsigned int size_Mat,
+                                             unsigned long long numberOfLBnodes,
                                              bool isEvenTimestep);
 
 __global__ void QPressDeviceEQZ27(real* rhoBC,
@@ -1149,17 +1175,17 @@ __global__ void QPressDeviceEQZ27(real* rhoBC,
                                              unsigned int* neighborX,
                                              unsigned int* neighborY,
                                              unsigned int* neighborZ,
-                                             unsigned int size_Mat,
+                                             unsigned long long numberOfLBnodes,
                                              bool isEvenTimestep);
 
 __global__ void QPressDeviceZero27(  real* DD,
-												int* k_Q,
-												unsigned int numberOfBCnodes,
-												unsigned int* neighborX,
-												unsigned int* neighborY,
-												unsigned int* neighborZ,
-												unsigned int size_Mat,
-												bool isEvenTimestep);
+                                                int* k_Q,
+                                                unsigned int numberOfBCnodes,
+                                                unsigned int* neighborX,
+                                                unsigned int* neighborY,
+                                                unsigned int* neighborZ,
+                                                unsigned long long numberOfLBnodes,
+                                                bool isEvenTimestep);
 
 __global__ void QPressDeviceFake27(real* rhoBC,
                                              real* DD,
@@ -1170,7 +1196,7 @@ __global__ void QPressDeviceFake27(real* rhoBC,
                                              unsigned int* neighborX,
                                              unsigned int* neighborY,
                                              unsigned int* neighborZ,
-                                             unsigned int size_Mat,
+                                             unsigned long long numberOfLBnodes,
                                              bool isEvenTimestep);
 
 __global__ void BBDevice27(real* distributions,
@@ -1180,20 +1206,20 @@ __global__ void BBDevice27(real* distributions,
                                      unsigned int* neighborX,
                                      unsigned int* neighborY,
                                      unsigned int* neighborZ,
-                                     unsigned int numberOfLBnodes,
+                                     unsigned long long numberOfLBnodes,
                                      bool isEvenTimestep);
 
 __global__ void QPressDevice27_IntBB(real* rho,
-												real* DD,
-												int* k_Q,
-												real* QQ,
-												unsigned int numberOfBCnodes,
-												real om1,
-												unsigned int* neighborX,
-												unsigned int* neighborY,
-												unsigned int* neighborZ,
-												unsigned int size_Mat,
-												bool isEvenTimestep);
+                                                real* DD,
+                                                int* k_Q,
+                                                real* QQ,
+                                                unsigned int numberOfBCnodes,
+                                                real om1,
+                                                unsigned int* neighborX,
+                                                unsigned int* neighborY,
+                                                unsigned int* neighborZ,
+                                                unsigned long long numberOfLBnodes,
+                                                bool isEvenTimestep);
 
 // TODO: https://git.rz.tu-bs.de/irmb/VirtualFluids_dev/-/issues/29
 //Schlaffer BCs
@@ -1210,7 +1236,7 @@ __global__ void PressSchlaff27(real* rhoBC,
                                           unsigned int* neighborX,
                                           unsigned int* neighborY,
                                           unsigned int* neighborZ,
-                                          unsigned int size_Mat,
+                                          unsigned long long numberOfLBnodes,
                                           bool isEvenTimestep);
 
 // TODO: https://git.rz.tu-bs.de/irmb/VirtualFluids_dev/-/issues/29
@@ -1225,9 +1251,106 @@ __global__ void VelSchlaff27(  int t,
                                           unsigned int* neighborX,
                                           unsigned int* neighborY,
                                           unsigned int* neighborZ,
-                                          unsigned int size_Mat,
+                                          unsigned long long numberOfLBnodes,
                                           bool isEvenTimestep);
 
+//! \brief Declaration only of the CUDA kernel QPrecursorDeviceCompZeroPress; the kernel body is not part of this hunk.
+//! NOTE(review): the parameter roles noted below are inferred from names only - confirm against the kernel definition.
+__global__ void QPrecursorDeviceCompZeroPress(
+    int* subgridDistanceIndices,
+    int numberOfBCnodes,
+    int numberOfPrecursorNodes,
+    int sizeQ,
+    real omega,
+    real* distributions,
+    real* subgridDistances,
+    // index arrays named after the three coordinate directions
+    uint* neighborX, uint* neighborY, uint* neighborZ,
+    // four further index arrays (NT, NB, ST, SB) ...
+    uint* neighborsNT, uint* neighborsNB, uint* neighborsST, uint* neighborsSB,
+    // ... and four weight arrays, presumably paired with them one-to-one - TODO confirm
+    real* weights0PP, real* weights0PM, real* weights0MP, real* weights0MM,
+    // presumably velocity samples at two time levels, blended via timeRatio - TODO confirm
+    real* vLast,
+    real* vCurrent,
+    real velocityX,
+    real velocityY,
+    real velocityZ,
+    real timeRatio,
+    real velocityRatio,
+    // at least 64 bits wide, like the numberOfLBnodes parameter of the other kernel declarations here
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep);
+
+//! \brief Declaration only of the CUDA kernel PrecursorDeviceEQ27; the kernel body is not part of this hunk.
+//! NOTE(review): the parameter roles noted below are inferred from names only - confirm against the kernel definition.
+__global__ void PrecursorDeviceEQ27(
+    int* subgridDistanceIndices,
+    int numberOfBCnodes,
+    int numberOfPrecursorNodes,
+    real omega,
+    real* distributions,
+    // index arrays named after the three coordinate directions
+    uint* neighborX, uint* neighborY, uint* neighborZ,
+    // four further index arrays (NT, NB, ST, SB) ...
+    uint* neighborsNT, uint* neighborsNB, uint* neighborsST, uint* neighborsSB,
+    // ... and four weight arrays, presumably paired with them one-to-one - TODO confirm
+    real* weights0PP, real* weights0PM, real* weights0MP, real* weights0MM,
+    // presumably velocity samples at two time levels, blended via timeRatio - TODO confirm
+    real* vLast,
+    real* vCurrent,
+    real velocityX,
+    real velocityY,
+    real velocityZ,
+    real timeRatio,
+    real velocityRatio,
+    // at least 64 bits wide, like the numberOfLBnodes parameter of the other kernel declarations here
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep);
+
+//! \brief Declaration only of the CUDA kernel PrecursorDeviceDistributions; the kernel body is not part of this hunk.
+//! NOTE(review): the parameter roles noted below are inferred from names only - confirm against the kernel definition.
+__global__ void PrecursorDeviceDistributions(
+    int* subgridDistanceIndices,
+    int numberOfBCNodes,
+    int numberOfPrecursorNodes,
+    real* distributions,
+    // index arrays named after the three coordinate directions
+    uint* neighborX, uint* neighborY, uint* neighborZ,
+    // four further index arrays (NT, NB, ST, SB) ...
+    uint* neighborsNT, uint* neighborsNB, uint* neighborsST, uint* neighborsSB,
+    // ... and four weight arrays, presumably paired with them one-to-one - TODO confirm
+    real* weights0PP, real* weights0PM, real* weights0MP, real* weights0MM,
+    // presumably distribution samples at two time levels, blended via timeRatio - TODO confirm
+    real* fsLast,
+    real* fsNext,
+    real timeRatio,
+    // at least 64 bits wide, like the numberOfLBnodes parameter of the other kernel declarations here
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep);
+
+//! \brief Declaration only of the CUDA kernel QPrecursorDeviceDistributions; the kernel body is not part of this hunk.
+//! NOTE(review): the parameter roles noted below are inferred from names only - confirm against the kernel definition.
+__global__ void QPrecursorDeviceDistributions(
+    int* subgridDistanceIndices,
+    real* subgridDistances,
+    int sizeQ,
+    int numberOfBCNodes,
+    int numberOfPrecursorNodes,
+    real* distributions,
+    // index arrays named after the three coordinate directions
+    uint* neighborX, uint* neighborY, uint* neighborZ,
+    // four further index arrays (NT, NB, ST, SB) ...
+    uint* neighborsNT, uint* neighborsNB, uint* neighborsST, uint* neighborsSB,
+    // ... and four weight arrays, presumably paired with them one-to-one - TODO confirm
+    real* weights0PP, real* weights0PM, real* weights0MP, real* weights0MM,
+    // presumably distribution samples at two time levels, blended via timeRatio - TODO confirm
+    real* fsLast,
+    real* fsNext,
+    real timeRatio,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep);
+
 //Advection / Diffusion BCs
 __global__ void QAD7( real* DD,
                                  real* DD7,
@@ -1240,68 +1363,68 @@ __global__ void QAD7( real* DD,
                                  unsigned int* neighborX,
                                  unsigned int* neighborY,
                                  unsigned int* neighborZ,
-                                 unsigned int size_Mat,
+                                 unsigned long long numberOfLBnodes,
                                  bool isEvenTimestep);
 
 //////////////////////////////////////////////////////////////////////////
 //! \brief \ref Advection_Diffusion_Device_Kernel : Factorized central moments for Advection Diffusion Equation
 __global__ void Factorized_Central_Moments_Advection_Diffusion_Device_Kernel(
-	real omegaDiffusivity,
-	uint* typeOfGridNode,
-	uint* neighborX,
-	uint* neighborY,
-	uint* neighborZ,
-	real* distributions,
-	real* distributionsAD,
-	int size_Mat,
-	real* forces,
-	bool isEvenTimestep);
+    real omegaDiffusivity,
+    uint* typeOfGridNode,
+    uint* neighborX,
+    uint* neighborY,
+    uint* neighborZ,
+    real* distributions,
+    real* distributionsAD,
+    unsigned long long numberOfLBnodes,
+    real* forces,
+    bool isEvenTimestep);
 
 //////////////////////////////////////////////////////////////////////////
 //! \brief \ref AD_SlipVelDeviceComp : device function for the slip-AD boundary condition
 __global__ void AD_SlipVelDeviceComp(
-	real * normalX,
-	real * normalY,
-	real * normalZ,
-	real * distributions,
-	real * distributionsAD,
-	int* QindexArray,
-	real * Qarrays,
-	uint numberOfBCnodes,
-	real omegaDiffusivity,
-	uint * neighborX,
-	uint * neighborY,
-	uint * neighborZ,
-	uint size_Mat,
-	bool isEvenTimestep);
+    real * normalX,
+    real * normalY,
+    real * normalZ,
+    real * distributions,
+    real * distributionsAD,
+    int* QindexArray,
+    real * Qarrays,
+    uint numberOfBCnodes,
+    real omegaDiffusivity,
+    uint * neighborX,
+    uint * neighborY,
+    uint * neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep);
 
 __global__ void QADDirichlet27(   real* DD,
-											 real* DD27,
-											 real* temp,
-											 real diffusivity,
-											 int* k_Q,
-											 real* QQ,
-											 unsigned int numberOfBCnodes,
-											 real om1,
-											 unsigned int* neighborX,
-											 unsigned int* neighborY,
-											 unsigned int* neighborZ,
-											 unsigned int size_Mat,
-											 bool isEvenTimestep);
+                                             real* DD27,
+                                             real* temp,
+                                             real diffusivity,
+                                             int* k_Q,
+                                             real* QQ,
+                                             unsigned int numberOfBCnodes,
+                                             real om1,
+                                             unsigned int* neighborX,
+                                             unsigned int* neighborY,
+                                             unsigned int* neighborZ,
+                                             unsigned long long numberOfLBnodes,
+                                             bool isEvenTimestep);
 
 __global__ void QADBB27(  real* DD,
-									 real* DD27,
-									 real* temp,
-									 real diffusivity,
-									 int* k_Q,
-									 real* QQ,
-									 unsigned int numberOfBCnodes,
-									 real om1,
-									 unsigned int* neighborX,
-									 unsigned int* neighborY,
-									 unsigned int* neighborZ,
-									 unsigned int size_Mat,
-									 bool isEvenTimestep);
+                                     real* DD27,
+                                     real* temp,
+                                     real diffusivity,
+                                     int* k_Q,
+                                     real* QQ,
+                                     unsigned int numberOfBCnodes,
+                                     real om1,
+                                     unsigned int* neighborX,
+                                     unsigned int* neighborY,
+                                     unsigned int* neighborZ,
+                                     unsigned long long numberOfLBnodes,
+                                     bool isEvenTimestep);
 
 __global__ void QADVel7( real* DD,
                                     real* DD7,
@@ -1315,7 +1438,7 @@ __global__ void QADVel7( real* DD,
                                     unsigned int* neighborX,
                                     unsigned int* neighborY,
                                     unsigned int* neighborZ,
-                                    unsigned int size_Mat,
+                                    unsigned long long numberOfLBnodes,
                                     bool isEvenTimestep);
 
 __global__ void QADVel27(real* DD,
@@ -1330,7 +1453,7 @@ __global__ void QADVel27(real* DD,
                                     unsigned int* neighborX,
                                     unsigned int* neighborY,
                                     unsigned int* neighborZ,
-                                    unsigned int size_Mat,
+                                    unsigned long long numberOfLBnodes,
                                     bool isEvenTimestep);
 
 __global__ void QADPress7(  real* DD,
@@ -1345,7 +1468,7 @@ __global__ void QADPress7(  real* DD,
                                        unsigned int* neighborX,
                                        unsigned int* neighborY,
                                        unsigned int* neighborZ,
-                                       unsigned int size_Mat,
+                                       unsigned long long numberOfLBnodes,
                                        bool isEvenTimestep);
 
 __global__ void QADPress27( real* DD,
@@ -1360,109 +1483,109 @@ __global__ void QADPress27( real* DD,
                                        unsigned int* neighborX,
                                        unsigned int* neighborY,
                                        unsigned int* neighborZ,
-                                       unsigned int size_Mat,
+                                       unsigned long long numberOfLBnodes,
                                        bool isEvenTimestep);
 
 __global__ void QADPressNEQNeighbor27(
-												 real* DD,
-												 real* DD27,
-												 int* k_Q,
-												 int* k_N,
-												 int numberOfBCnodes,
-												 unsigned int* neighborX,
-												 unsigned int* neighborY,
-												 unsigned int* neighborZ,
-												 unsigned int size_Mat,
-												 bool isEvenTimestep
-												);
+                                                 real* DD,
+                                                 real* DD27,
+                                                 int* k_Q,
+                                                 int* k_N,
+                                                 int numberOfBCnodes,
+                                                 unsigned int* neighborX,
+                                                 unsigned int* neighborY,
+                                                 unsigned int* neighborZ,
+                                                 unsigned long long numberOfLBnodes,
+                                                 bool isEvenTimestep
+                                                );
 
 __global__ void QNoSlipADincomp7( real* DD,
-											 real* DD7,
-											 real* temp,
-											 real diffusivity,
-											 int* k_Q,
-											 real* QQ,
-											 unsigned int numberOfBCnodes,
-											 real om1,
-											 unsigned int* neighborX,
-											 unsigned int* neighborY,
-											 unsigned int* neighborZ,
-											 unsigned int size_Mat,
-											 bool isEvenTimestep);
+                                             real* DD7,
+                                             real* temp,
+                                             real diffusivity,
+                                             int* k_Q,
+                                             real* QQ,
+                                             unsigned int numberOfBCnodes,
+                                             real om1,
+                                             unsigned int* neighborX,
+                                             unsigned int* neighborY,
+                                             unsigned int* neighborZ,
+                                             unsigned long long numberOfLBnodes,
+                                             bool isEvenTimestep);
 
 __global__ void QNoSlipADincomp27( real* DD,
-											 real* DD27,
-											 real* temp,
-											 real diffusivity,
-											 int* k_Q,
-											 real* QQ,
-											 unsigned int numberOfBCnodes,
-											 real om1,
-											 unsigned int* neighborX,
-											 unsigned int* neighborY,
-											 unsigned int* neighborZ,
-											 unsigned int size_Mat,
-											 bool isEvenTimestep);
+                                             real* DD27,
+                                             real* temp,
+                                             real diffusivity,
+                                             int* k_Q,
+                                             real* QQ,
+                                             unsigned int numberOfBCnodes,
+                                             real om1,
+                                             unsigned int* neighborX,
+                                             unsigned int* neighborY,
+                                             unsigned int* neighborZ,
+                                             unsigned long long numberOfLBnodes,
+                                             bool isEvenTimestep);
 
 __global__ void QADVeloIncomp7(  real* DD,
-											real* DD7,
-											real* temp,
-											real* velo,
-											real diffusivity,
-											int* k_Q,
-											real* QQ,
-											unsigned int numberOfBCnodes,
-											real om1,
-											unsigned int* neighborX,
-											unsigned int* neighborY,
-											unsigned int* neighborZ,
-											unsigned int size_Mat,
-											bool isEvenTimestep);
+                                            real* DD7,
+                                            real* temp,
+                                            real* velo,
+                                            real diffusivity,
+                                            int* k_Q,
+                                            real* QQ,
+                                            unsigned int numberOfBCnodes,
+                                            real om1,
+                                            unsigned int* neighborX,
+                                            unsigned int* neighborY,
+                                            unsigned int* neighborZ,
+                                            unsigned long long numberOfLBnodes,
+                                            bool isEvenTimestep);
 
 __global__ void QADVeloIncomp27( real* DD,
-											real* DD27,
-											real* temp,
-											real* velo,
-											real diffusivity,
-											int* k_Q,
-											real* QQ,
-											unsigned int numberOfBCnodes,
-											real om1,
-											unsigned int* neighborX,
-											unsigned int* neighborY,
-											unsigned int* neighborZ,
-											unsigned int size_Mat,
-											bool isEvenTimestep);
+                                            real* DD27,
+                                            real* temp,
+                                            real* velo,
+                                            real diffusivity,
+                                            int* k_Q,
+                                            real* QQ,
+                                            unsigned int numberOfBCnodes,
+                                            real om1,
+                                            unsigned int* neighborX,
+                                            unsigned int* neighborY,
+                                            unsigned int* neighborZ,
+                                            unsigned long long numberOfLBnodes,
+                                            bool isEvenTimestep);
 
 __global__ void QADPressIncomp7(real* DD,
-										   real* DD7,
-										   real* temp,
-										   real* velo,
-										   real diffusivity,
-										   int* k_Q,
-										   real* QQ,
-										   unsigned int numberOfBCnodes,
-										   real om1,
-										   unsigned int* neighborX,
-										   unsigned int* neighborY,
-										   unsigned int* neighborZ,
-										   unsigned int size_Mat,
-										   bool isEvenTimestep);
+                                           real* DD7,
+                                           real* temp,
+                                           real* velo,
+                                           real diffusivity,
+                                           int* k_Q,
+                                           real* QQ,
+                                           unsigned int numberOfBCnodes,
+                                           real om1,
+                                           unsigned int* neighborX,
+                                           unsigned int* neighborY,
+                                           unsigned int* neighborZ,
+                                           unsigned long long numberOfLBnodes,
+                                           bool isEvenTimestep);
 
 __global__ void QADPressIncomp27(   real* DD,
-											   real* DD27,
-											   real* temp,
-											   real* velo,
-											   real diffusivity,
-											   int* k_Q,
-											   real* QQ,
-											   unsigned int numberOfBCnodes,
-											   real om1,
-											   unsigned int* neighborX,
-											   unsigned int* neighborY,
-											   unsigned int* neighborZ,
-											   unsigned int size_Mat,
-											   bool isEvenTimestep);
+                                               real* DD27,
+                                               real* temp,
+                                               real* velo,
+                                               real diffusivity,
+                                               int* k_Q,
+                                               real* QQ,
+                                               unsigned int numberOfBCnodes,
+                                               real om1,
+                                               unsigned int* neighborX,
+                                               unsigned int* neighborY,
+                                               unsigned int* neighborZ,
+                                               unsigned long long numberOfLBnodes,
+                                               bool isEvenTimestep);
 
 //Propeller BC
 __global__ void PropellerBC(unsigned int* neighborX,
@@ -1473,8 +1596,8 @@ __global__ void PropellerBC(unsigned int* neighborX,
                                        real* uy,
                                        real* uz,
                                        int* k_Q,
-									   unsigned int size_Prop,
-                                       unsigned int size_Mat,
+                                       unsigned int size_Prop,
+                                       unsigned long long numberOfLBnodes,
                                        unsigned int* bcMatD,
                                        real* DD,
                                        bool EvenOrOdd);
@@ -1490,19 +1613,19 @@ __global__ void scaleCF27(real* DC,
                                     unsigned int* neighborFX,
                                     unsigned int* neighborFY,
                                     unsigned int* neighborFZ,
-										       unsigned int size_MatC,
-										       unsigned int size_MatF,
-										       bool isEvenTimestep,
+                                               unsigned long long numberOfLBnodesC,
+                                               unsigned long long numberOfLBnodesF,
+                                               bool isEvenTimestep,
                                      unsigned int* posCSWB,
                                      unsigned int* posFSWB,
                                      unsigned int kCF,
-										       real omCoarse,
-										       real omFine,
-										       real nu,
-										       unsigned int nxC,
-										       unsigned int nyC,
-										       unsigned int nxF,
-										       unsigned int nyF);
+                                               real omCoarse,
+                                               real omFine,
+                                               real nu,
+                                               unsigned int nxC,
+                                               unsigned int nyC,
+                                               unsigned int nxF,
+                                               unsigned int nyF);
 
 __global__ void scaleCFEff27(real* DC,
                                         real* DF,
@@ -1512,18 +1635,18 @@ __global__ void scaleCFEff27(real* DC,
                                         unsigned int* neighborFX,
                                         unsigned int* neighborFY,
                                         unsigned int* neighborFZ,
-									             unsigned int size_MatC,
-									             unsigned int size_MatF,
-									             bool isEvenTimestep,
+                                                 unsigned long long numberOfLBnodesC,
+                                                 unsigned long long numberOfLBnodesF,
+                                                 bool isEvenTimestep,
                                         unsigned int* posCSWB,
                                         unsigned int* posFSWB,
                                         unsigned int kCF,
-									             real omCoarse,
-									             real omFine,
-									             real nu,
-									             unsigned int nxC,
-									             unsigned int nyC,
-									             unsigned int nxF,
+                                                 real omCoarse,
+                                                 real omFine,
+                                                 real nu,
+                                                 unsigned int nxC,
+                                                 unsigned int nyC,
+                                                 unsigned int nxF,
                                         unsigned int nyF,
                                         OffCF offCF);
 
@@ -1535,8 +1658,8 @@ __global__ void scaleCFLast27( real* DC,
                                           unsigned int* neighborFX,
                                           unsigned int* neighborFY,
                                           unsigned int* neighborFZ,
-                                          unsigned int size_MatC,
-                                          unsigned int size_MatF,
+                                          unsigned long long numberOfLBnodesC,
+                                          unsigned long long numberOfLBnodesF,
                                           bool isEvenTimestep,
                                           unsigned int* posCSWB,
                                           unsigned int* posFSWB,
@@ -1558,8 +1681,8 @@ __global__ void scaleCFpress27(real* DC,
                                           unsigned int* neighborFX,
                                           unsigned int* neighborFY,
                                           unsigned int* neighborFZ,
-                                          unsigned int size_MatC,
-                                          unsigned int size_MatF,
+                                          unsigned long long numberOfLBnodesC,
+                                          unsigned long long numberOfLBnodesF,
                                           bool isEvenTimestep,
                                           unsigned int* posCSWB,
                                           unsigned int* posFSWB,
@@ -1581,8 +1704,8 @@ __global__ void scaleCF_Fix_27(real* DC,
                                           unsigned int* neighborFX,
                                           unsigned int* neighborFY,
                                           unsigned int* neighborFZ,
-                                          unsigned int size_MatC,
-                                          unsigned int size_MatF,
+                                          unsigned long long numberOfLBnodesC,
+                                          unsigned long long numberOfLBnodesF,
                                           bool isEvenTimestep,
                                           unsigned int* posCSWB,
                                           unsigned int* posFSWB,
@@ -1597,233 +1720,233 @@ __global__ void scaleCF_Fix_27(real* DC,
                                           OffCF offCF);
 
 __global__ void scaleCF_Fix_comp_27(   real* DC,
-												  real* DF,
-												  unsigned int* neighborCX,
-												  unsigned int* neighborCY,
-												  unsigned int* neighborCZ,
-												  unsigned int* neighborFX,
-												  unsigned int* neighborFY,
-												  unsigned int* neighborFZ,
-												  unsigned int size_MatC,
-												  unsigned int size_MatF,
-												  bool isEvenTimestep,
-												  unsigned int* posCSWB,
-												  unsigned int* posFSWB,
-												  unsigned int kCF,
-												  real omCoarse,
-												  real omFine,
-												  real nu,
-												  unsigned int nxC,
-												  unsigned int nyC,
-												  unsigned int nxF,
-												  unsigned int nyF,
-												  OffCF offCF);
+                                                  real* DF,
+                                                  unsigned int* neighborCX,
+                                                  unsigned int* neighborCY,
+                                                  unsigned int* neighborCZ,
+                                                  unsigned int* neighborFX,
+                                                  unsigned int* neighborFY,
+                                                  unsigned int* neighborFZ,
+                                                  unsigned long long numberOfLBnodesC,
+                                                  unsigned long long numberOfLBnodesF,
+                                                  bool isEvenTimestep,
+                                                  unsigned int* posCSWB,
+                                                  unsigned int* posFSWB,
+                                                  unsigned int kCF,
+                                                  real omCoarse,
+                                                  real omFine,
+                                                  real nu,
+                                                  unsigned int nxC,
+                                                  unsigned int nyC,
+                                                  unsigned int nxF,
+                                                  unsigned int nyF,
+                                                  OffCF offCF);
 
 __global__ void scaleCF_0817_comp_27(  real* DC,
-												  real* DF,
-												  unsigned int* neighborCX,
-												  unsigned int* neighborCY,
-												  unsigned int* neighborCZ,
-												  unsigned int* neighborFX,
-												  unsigned int* neighborFY,
-												  unsigned int* neighborFZ,
-												  unsigned int size_MatC,
-												  unsigned int size_MatF,
-												  bool isEvenTimestep,
-												  unsigned int* posCSWB,
-												  unsigned int* posFSWB,
-												  unsigned int kCF,
-												  real omCoarse,
-												  real omFine,
-												  real nu,
-												  unsigned int nxC,
-												  unsigned int nyC,
-												  unsigned int nxF,
-												  unsigned int nyF,
-												  OffCF offCF);
+                                                  real* DF,
+                                                  unsigned int* neighborCX,
+                                                  unsigned int* neighborCY,
+                                                  unsigned int* neighborCZ,
+                                                  unsigned int* neighborFX,
+                                                  unsigned int* neighborFY,
+                                                  unsigned int* neighborFZ,
+                                                  unsigned long long numberOfLBnodesC,
+                                                  unsigned long long numberOfLBnodesF,
+                                                  bool isEvenTimestep,
+                                                  unsigned int* posCSWB,
+                                                  unsigned int* posFSWB,
+                                                  unsigned int kCF,
+                                                  real omCoarse,
+                                                  real omFine,
+                                                  real nu,
+                                                  unsigned int nxC,
+                                                  unsigned int nyC,
+                                                  unsigned int nxF,
+                                                  unsigned int nyF,
+                                                  OffCF offCF);
 
 __global__ void scaleCF_comp_D3Q27F3_2018( real* DC,
-													  real* DF,
-													  real* G6,
-													  unsigned int* neighborCX,
-													  unsigned int* neighborCY,
-													  unsigned int* neighborCZ,
-													  unsigned int* neighborFX,
-													  unsigned int* neighborFY,
-													  unsigned int* neighborFZ,
-													  unsigned int size_MatC,
-													  unsigned int size_MatF,
-													  bool isEvenTimestep,
-													  unsigned int* posCSWB,
-													  unsigned int* posFSWB,
-													  unsigned int kCF,
-													  real omCoarse,
-													  real omFine,
-													  real nu,
-													  unsigned int nxC,
-													  unsigned int nyC,
-													  unsigned int nxF,
-													  unsigned int nyF,
-													  OffCF offCF);
+                                                      real* DF,
+                                                      real* G6,
+                                                      unsigned int* neighborCX,
+                                                      unsigned int* neighborCY,
+                                                      unsigned int* neighborCZ,
+                                                      unsigned int* neighborFX,
+                                                      unsigned int* neighborFY,
+                                                      unsigned int* neighborFZ,
+                                                      unsigned long long numberOfLBnodesC,
+                                                      unsigned long long numberOfLBnodesF,
+                                                      bool isEvenTimestep,
+                                                      unsigned int* posCSWB,
+                                                      unsigned int* posFSWB,
+                                                      unsigned int kCF,
+                                                      real omCoarse,
+                                                      real omFine,
+                                                      real nu,
+                                                      unsigned int nxC,
+                                                      unsigned int nyC,
+                                                      unsigned int nxF,
+                                                      unsigned int nyF,
+                                                      OffCF offCF);
 
 __global__ void scaleCF_comp_D3Q27F3( real* DC,
-												 real* DF,
-												 real* G6,
-												 unsigned int* neighborCX,
-												 unsigned int* neighborCY,
-												 unsigned int* neighborCZ,
-												 unsigned int* neighborFX,
-												 unsigned int* neighborFY,
-												 unsigned int* neighborFZ,
-												 unsigned int size_MatC,
-												 unsigned int size_MatF,
-												 bool isEvenTimestep,
-												 unsigned int* posCSWB,
-												 unsigned int* posFSWB,
-												 unsigned int kCF,
-												 real omCoarse,
-												 real omFine,
-												 real nu,
-												 unsigned int nxC,
-												 unsigned int nyC,
-												 unsigned int nxF,
-												 unsigned int nyF,
-												 OffCF offCF);
+                                                 real* DF,
+                                                 real* G6,
+                                                 unsigned int* neighborCX,
+                                                 unsigned int* neighborCY,
+                                                 unsigned int* neighborCZ,
+                                                 unsigned int* neighborFX,
+                                                 unsigned int* neighborFY,
+                                                 unsigned int* neighborFZ,
+                                                 unsigned long long numberOfLBnodesC,
+                                                 unsigned long long numberOfLBnodesF,
+                                                 bool isEvenTimestep,
+                                                 unsigned int* posCSWB,
+                                                 unsigned int* posFSWB,
+                                                 unsigned int kCF,
+                                                 real omCoarse,
+                                                 real omFine,
+                                                 real nu,
+                                                 unsigned int nxC,
+                                                 unsigned int nyC,
+                                                 unsigned int nxF,
+                                                 unsigned int nyF,
+                                                 OffCF offCF);
 
 
 __global__ void scaleCF_staggered_time_comp_27(real* DC,
-														  real* DF,
-														  unsigned int* neighborCX,
-														  unsigned int* neighborCY,
-														  unsigned int* neighborCZ,
-														  unsigned int* neighborFX,
-														  unsigned int* neighborFY,
-														  unsigned int* neighborFZ,
-														  unsigned int size_MatC,
-														  unsigned int size_MatF,
-														  bool isEvenTimestep,
-														  unsigned int* posCSWB,
-														  unsigned int* posFSWB,
-														  unsigned int kCF,
-														  real omCoarse,
-														  real omFine,
-														  real nu,
-														  unsigned int nxC,
-														  unsigned int nyC,
-														  unsigned int nxF,
-														  unsigned int nyF,
-														  OffCF offCF);
+                                                          real* DF,
+                                                          unsigned int* neighborCX,
+                                                          unsigned int* neighborCY,
+                                                          unsigned int* neighborCZ,
+                                                          unsigned int* neighborFX,
+                                                          unsigned int* neighborFY,
+                                                          unsigned int* neighborFZ,
+                                                          unsigned long long numberOfLBnodesC,
+                                                          unsigned long long numberOfLBnodesF,
+                                                          bool isEvenTimestep,
+                                                          unsigned int* posCSWB,
+                                                          unsigned int* posFSWB,
+                                                          unsigned int kCF,
+                                                          real omCoarse,
+                                                          real omFine,
+                                                          real nu,
+                                                          unsigned int nxC,
+                                                          unsigned int nyC,
+                                                          unsigned int nxF,
+                                                          unsigned int nyF,
+                                                          OffCF offCF);
 
 __global__ void scaleCF_RhoSq_comp_27( real* DC,
-												  real* DF,
-												  unsigned int* neighborCX,
-												  unsigned int* neighborCY,
-												  unsigned int* neighborCZ,
-												  unsigned int* neighborFX,
-												  unsigned int* neighborFY,
-												  unsigned int* neighborFZ,
-												  unsigned int size_MatC,
-												  unsigned int size_MatF,
-												  bool isEvenTimestep,
-												  unsigned int* posCSWB,
-												  unsigned int* posFSWB,
-												  unsigned int kCF,
-												  real omCoarse,
-												  real omFine,
-												  real nu,
-												  unsigned int nxC,
-												  unsigned int nyC,
-												  unsigned int nxF,
-												  unsigned int nyF,
-												  OffCF offCF);
+                                                  real* DF,
+                                                  unsigned int* neighborCX,
+                                                  unsigned int* neighborCY,
+                                                  unsigned int* neighborCZ,
+                                                  unsigned int* neighborFX,
+                                                  unsigned int* neighborFY,
+                                                  unsigned int* neighborFZ,
+                                                  unsigned long long numberOfLBnodesC,
+                                                  unsigned long long numberOfLBnodesF,
+                                                  bool isEvenTimestep,
+                                                  unsigned int* posCSWB,
+                                                  unsigned int* posFSWB,
+                                                  unsigned int kCF,
+                                                  real omCoarse,
+                                                  real omFine,
+                                                  real nu,
+                                                  unsigned int nxC,
+                                                  unsigned int nyC,
+                                                  unsigned int nxF,
+                                                  unsigned int nyF,
+                                                  OffCF offCF);
 
 __global__ void scaleCF_compressible(
-    real* distributionsCoarse, 
-    real* distributionsFine, 
+    real* distributionsCoarse,
+    real* distributionsFine,
     unsigned int* neighborXcoarse,
     unsigned int* neighborYcoarse,
     unsigned int* neighborZcoarse,
     unsigned int* neighborXfine,
     unsigned int* neighborYfine,
     unsigned int* neighborZfine,
-    unsigned int numberOfLBnodesCoarse, 
-    unsigned int numberOfLBnodesFine, 
+    unsigned long long numberOfLBnodesCoarse,
+    unsigned long long numberOfLBnodesFine,
     bool isEvenTimestep,
-    unsigned int* indicesCoarseMMM, 
-    unsigned int* indicesFineMMM, 
-    unsigned int numberOfInterfaceNodes, 
-    real omegaCoarse, 
-    real omegaFine, 
+    unsigned int* indicesCoarseMMM,
+    unsigned int* indicesFineMMM,
+    unsigned int numberOfInterfaceNodes,
+    real omegaCoarse,
+    real omegaFine,
     OffCF offsetCF);
 
 __global__ void scaleCF_RhoSq_3rdMom_comp_27(real* DC,
-														real* DF,
-														unsigned int* neighborCX,
-														unsigned int* neighborCY,
-														unsigned int* neighborCZ,
-														unsigned int* neighborFX,
-														unsigned int* neighborFY,
-														unsigned int* neighborFZ,
-														unsigned int size_MatC,
-														unsigned int size_MatF,
-														bool isEvenTimestep,
-														unsigned int* posCSWB,
-														unsigned int* posFSWB,
-														unsigned int kCF,
-														real omCoarse,
-														real omFine,
-														real nu,
-														unsigned int nxC,
-														unsigned int nyC,
-														unsigned int nxF,
-														unsigned int nyF,
-														OffCF offCF);
+                                                        real* DF,
+                                                        unsigned int* neighborCX,
+                                                        unsigned int* neighborCY,
+                                                        unsigned int* neighborCZ,
+                                                        unsigned int* neighborFX,
+                                                        unsigned int* neighborFY,
+                                                        unsigned int* neighborFZ,
+                                                        unsigned long long numberOfLBnodesC,
+                                                        unsigned long long numberOfLBnodesF,
+                                                        bool isEvenTimestep,
+                                                        unsigned int* posCSWB,
+                                                        unsigned int* posFSWB,
+                                                        unsigned int kCF,
+                                                        real omCoarse,
+                                                        real omFine,
+                                                        real nu,
+                                                        unsigned int nxC,
+                                                        unsigned int nyC,
+                                                        unsigned int nxF,
+                                                        unsigned int nyF,
+                                                        OffCF offCF);
 
 __global__ void scaleCF_AA2016_comp_27(real* DC,
-												  real* DF,
-												  unsigned int* neighborCX,
-												  unsigned int* neighborCY,
-												  unsigned int* neighborCZ,
-												  unsigned int* neighborFX,
-												  unsigned int* neighborFY,
-												  unsigned int* neighborFZ,
-												  unsigned int size_MatC,
-												  unsigned int size_MatF,
-												  bool isEvenTimestep,
-												  unsigned int* posCSWB,
-												  unsigned int* posFSWB,
-												  unsigned int kCF,
-												  real omCoarse,
-												  real omFine,
-												  real nu,
-												  unsigned int nxC,
-												  unsigned int nyC,
-												  unsigned int nxF,
-												  unsigned int nyF,
-												  OffCF offCF);
+                                                  real* DF,
+                                                  unsigned int* neighborCX,
+                                                  unsigned int* neighborCY,
+                                                  unsigned int* neighborCZ,
+                                                  unsigned int* neighborFX,
+                                                  unsigned int* neighborFY,
+                                                  unsigned int* neighborFZ,
+                                                  unsigned long long numberOfLBnodesC,
+                                                  unsigned long long numberOfLBnodesF,
+                                                  bool isEvenTimestep,
+                                                  unsigned int* posCSWB,
+                                                  unsigned int* posFSWB,
+                                                  unsigned int kCF,
+                                                  real omCoarse,
+                                                  real omFine,
+                                                  real nu,
+                                                  unsigned int nxC,
+                                                  unsigned int nyC,
+                                                  unsigned int nxF,
+                                                  unsigned int nyF,
+                                                  OffCF offCF);
 
 __global__ void scaleCF_NSPress_27(real* DC,
-											  real* DF,
-											  unsigned int* neighborCX,
-											  unsigned int* neighborCY,
-											  unsigned int* neighborCZ,
-											  unsigned int* neighborFX,
-											  unsigned int* neighborFY,
-											  unsigned int* neighborFZ,
-											  unsigned int size_MatC,
-											  unsigned int size_MatF,
-											  bool isEvenTimestep,
-											  unsigned int* posCSWB,
-											  unsigned int* posFSWB,
-											  unsigned int kCF,
-											  real omCoarse,
-											  real omFine,
-											  real nu,
-											  unsigned int nxC,
-											  unsigned int nyC,
-											  unsigned int nxF,
-											  unsigned int nyF,
-											  OffCF offCF);
+                                              real* DF,
+                                              unsigned int* neighborCX,
+                                              unsigned int* neighborCY,
+                                              unsigned int* neighborCZ,
+                                              unsigned int* neighborFX,
+                                              unsigned int* neighborFY,
+                                              unsigned int* neighborFZ,
+                                              unsigned long long numberOfLBnodesC,
+                                              unsigned long long numberOfLBnodesF,
+                                              bool isEvenTimestep,
+                                              unsigned int* posCSWB,
+                                              unsigned int* posFSWB,
+                                              unsigned int kCF,
+                                              real omCoarse,
+                                              real omFine,
+                                              real nu,
+                                              unsigned int nxC,
+                                              unsigned int nyC,
+                                              unsigned int nxF,
+                                              unsigned int nyF,
+                                              OffCF offCF);
 
 __global__ void scaleCFThSMG7( real* DC,
                                           real* DF,
@@ -1835,8 +1958,8 @@ __global__ void scaleCFThSMG7( real* DC,
                                           unsigned int* neighborFX,
                                           unsigned int* neighborFY,
                                           unsigned int* neighborFZ,
-                                          unsigned int size_MatC,
-                                          unsigned int size_MatF,
+                                          unsigned long long numberOfLBnodesC,
+                                          unsigned long long numberOfLBnodesF,
                                           bool isEvenTimestep,
                                           unsigned int* posCSWB,
                                           unsigned int* posFSWB,
@@ -1855,8 +1978,8 @@ __global__ void scaleCFThS7(real* DC,
                                        unsigned int* neighborFX,
                                        unsigned int* neighborFY,
                                        unsigned int* neighborFZ,
-                                       unsigned int size_MatC,
-                                       unsigned int size_MatF,
+                                       unsigned long long numberOfLBnodesC,
+                                       unsigned long long numberOfLBnodesF,
                                        bool isEvenTimestep,
                                        unsigned int* posCSWB,
                                        unsigned int* posFSWB,
@@ -1874,15 +1997,15 @@ __global__ void scaleCFThS27(real* DC,
                                         unsigned int* neighborFX,
                                         unsigned int* neighborFY,
                                         unsigned int* neighborFZ,
-                                        unsigned int size_MatC,
-                                        unsigned int size_MatF,
+                                        unsigned long long numberOfLBnodesC,
+                                        unsigned long long numberOfLBnodesF,
                                         bool isEvenTimestep,
                                         unsigned int* posCSWB,
                                         unsigned int* posFSWB,
                                         unsigned int kCF,
                                         real nu,
                                         real diffusivity_fine,
-										OffCF offCF);
+                                        OffCF offCF);
 
 //fine to coarse
 __global__ void scaleFC27(real* DC,
@@ -1893,18 +2016,18 @@ __global__ void scaleFC27(real* DC,
                                     unsigned int* neighborFX,
                                     unsigned int* neighborFY,
                                     unsigned int* neighborFZ,
-										       unsigned int size_MatC,
-										       unsigned int size_MatF,
-										       bool isEvenTimestep,
+                                               unsigned long long numberOfLBnodesC,
+                                               unsigned long long numberOfLBnodesF,
+                                               bool isEvenTimestep,
                                      unsigned int* posC,
                                      unsigned int* posFSWB,
                                      unsigned int kFC,
-										       real omCoarse,
-										       real omFine,
-										       real nu,
-										       unsigned int nxC,
-										       unsigned int nyC,
-										       unsigned int nxF,
+                                               real omCoarse,
+                                               real omFine,
+                                               real nu,
+                                               unsigned int nxC,
+                                               unsigned int nyC,
+                                               unsigned int nxF,
                                      unsigned int nyF);
 
 __global__ void scaleFCEff27(real* DC,
@@ -1915,8 +2038,8 @@ __global__ void scaleFCEff27(real* DC,
                                         unsigned int* neighborFX,
                                         unsigned int* neighborFY,
                                         unsigned int* neighborFZ,
-                                        unsigned int size_MatC,
-                                        unsigned int size_MatF,
+                                        unsigned long long numberOfLBnodesC,
+                                        unsigned long long numberOfLBnodesF,
                                         bool isEvenTimestep,
                                         unsigned int* posC,
                                         unsigned int* posFSWB,
@@ -1938,8 +2061,8 @@ __global__ void scaleFCLast27( real* DC,
                                           unsigned int* neighborFX,
                                           unsigned int* neighborFY,
                                           unsigned int* neighborFZ,
-                                          unsigned int size_MatC,
-                                          unsigned int size_MatF,
+                                          unsigned long long numberOfLBnodesC,
+                                          unsigned long long numberOfLBnodesF,
                                           bool isEvenTimestep,
                                           unsigned int* posC,
                                           unsigned int* posFSWB,
@@ -1961,8 +2084,8 @@ __global__ void scaleFCpress27( real* DC,
                                           unsigned int* neighborFX,
                                           unsigned int* neighborFY,
                                           unsigned int* neighborFZ,
-                                          unsigned int size_MatC,
-                                          unsigned int size_MatF,
+                                          unsigned long long numberOfLBnodesC,
+                                          unsigned long long numberOfLBnodesF,
                                           bool isEvenTimestep,
                                           unsigned int* posC,
                                           unsigned int* posFSWB,
@@ -1984,8 +2107,8 @@ __global__ void scaleFC_Fix_27( real* DC,
                                           unsigned int* neighborFX,
                                           unsigned int* neighborFY,
                                           unsigned int* neighborFZ,
-                                          unsigned int size_MatC,
-                                          unsigned int size_MatF,
+                                          unsigned long long numberOfLBnodesC,
+                                          unsigned long long numberOfLBnodesF,
                                           bool isEvenTimestep,
                                           unsigned int* posC,
                                           unsigned int* posFSWB,
@@ -2000,145 +2123,145 @@ __global__ void scaleFC_Fix_27( real* DC,
                                           OffFC offFC);
 
 __global__ void scaleFC_Fix_comp_27(   real* DC,
-												  real* DF,
-												  unsigned int* neighborCX,
-												  unsigned int* neighborCY,
-												  unsigned int* neighborCZ,
-												  unsigned int* neighborFX,
-												  unsigned int* neighborFY,
-												  unsigned int* neighborFZ,
-												  unsigned int size_MatC,
-												  unsigned int size_MatF,
-												  bool isEvenTimestep,
-												  unsigned int* posC,
-												  unsigned int* posFSWB,
-												  unsigned int kFC,
-												  real omCoarse,
-												  real omFine,
-												  real nu,
-												  unsigned int nxC,
-												  unsigned int nyC,
-												  unsigned int nxF,
-												  unsigned int nyF,
-												  OffFC offFC);
+                                                  real* DF,
+                                                  unsigned int* neighborCX,
+                                                  unsigned int* neighborCY,
+                                                  unsigned int* neighborCZ,
+                                                  unsigned int* neighborFX,
+                                                  unsigned int* neighborFY,
+                                                  unsigned int* neighborFZ,
+                                                  unsigned long long numberOfLBnodesC,
+                                                  unsigned long long numberOfLBnodesF,
+                                                  bool isEvenTimestep,
+                                                  unsigned int* posC,
+                                                  unsigned int* posFSWB,
+                                                  unsigned int kFC,
+                                                  real omCoarse,
+                                                  real omFine,
+                                                  real nu,
+                                                  unsigned int nxC,
+                                                  unsigned int nyC,
+                                                  unsigned int nxF,
+                                                  unsigned int nyF,
+                                                  OffFC offFC);
 
 __global__ void scaleFC_0817_comp_27(  real* DC,
-												  real* DF,
-												  unsigned int* neighborCX,
-												  unsigned int* neighborCY,
-												  unsigned int* neighborCZ,
-												  unsigned int* neighborFX,
-												  unsigned int* neighborFY,
-												  unsigned int* neighborFZ,
-												  unsigned int size_MatC,
-												  unsigned int size_MatF,
-												  bool isEvenTimestep,
-												  unsigned int* posC,
-												  unsigned int* posFSWB,
-												  unsigned int kFC,
-												  real omCoarse,
-												  real omFine,
-												  real nu,
-												  unsigned int nxC,
-												  unsigned int nyC,
-												  unsigned int nxF,
-												  unsigned int nyF,
-												  OffFC offFC);
+                                                  real* DF,
+                                                  unsigned int* neighborCX,
+                                                  unsigned int* neighborCY,
+                                                  unsigned int* neighborCZ,
+                                                  unsigned int* neighborFX,
+                                                  unsigned int* neighborFY,
+                                                  unsigned int* neighborFZ,
+                                                  unsigned long long numberOfLBnodesC,
+                                                  unsigned long long numberOfLBnodesF,
+                                                  bool isEvenTimestep,
+                                                  unsigned int* posC,
+                                                  unsigned int* posFSWB,
+                                                  unsigned int kFC,
+                                                  real omCoarse,
+                                                  real omFine,
+                                                  real nu,
+                                                  unsigned int nxC,
+                                                  unsigned int nyC,
+                                                  unsigned int nxF,
+                                                  unsigned int nyF,
+                                                  OffFC offFC);
 
 __global__ void scaleFC_comp_D3Q27F3_2018( real* DC,
-													  real* DF,
-													  real* G6,
-													  unsigned int* neighborCX,
-													  unsigned int* neighborCY,
-													  unsigned int* neighborCZ,
-													  unsigned int* neighborFX,
-													  unsigned int* neighborFY,
-													  unsigned int* neighborFZ,
-													  unsigned int size_MatC,
-													  unsigned int size_MatF,
-													  bool isEvenTimestep,
-													  unsigned int* posC,
-													  unsigned int* posFSWB,
-													  unsigned int kFC,
-													  real omCoarse,
-													  real omFine,
-													  real nu,
-													  unsigned int nxC,
-													  unsigned int nyC,
-													  unsigned int nxF,
-													  unsigned int nyF,
-													  OffFC offFC);
+                                                      real* DF,
+                                                      real* G6,
+                                                      unsigned int* neighborCX,
+                                                      unsigned int* neighborCY,
+                                                      unsigned int* neighborCZ,
+                                                      unsigned int* neighborFX,
+                                                      unsigned int* neighborFY,
+                                                      unsigned int* neighborFZ,
+                                                      unsigned long long numberOfLBnodesC,
+                                                      unsigned long long numberOfLBnodesF,
+                                                      bool isEvenTimestep,
+                                                      unsigned int* posC,
+                                                      unsigned int* posFSWB,
+                                                      unsigned int kFC,
+                                                      real omCoarse,
+                                                      real omFine,
+                                                      real nu,
+                                                      unsigned int nxC,
+                                                      unsigned int nyC,
+                                                      unsigned int nxF,
+                                                      unsigned int nyF,
+                                                      OffFC offFC);
 
 __global__ void scaleFC_comp_D3Q27F3( real* DC,
-												 real* DF,
-												 real* G6,
-												 unsigned int* neighborCX,
-												 unsigned int* neighborCY,
-												 unsigned int* neighborCZ,
-												 unsigned int* neighborFX,
-												 unsigned int* neighborFY,
-												 unsigned int* neighborFZ,
-												 unsigned int size_MatC,
-												 unsigned int size_MatF,
-												 bool isEvenTimestep,
-												 unsigned int* posC,
-												 unsigned int* posFSWB,
-												 unsigned int kFC,
-												 real omCoarse,
-												 real omFine,
-												 real nu,
-												 unsigned int nxC,
-												 unsigned int nyC,
-												 unsigned int nxF,
-												 unsigned int nyF,
-												 OffFC offFC);
+                                                 real* DF,
+                                                 real* G6,
+                                                 unsigned int* neighborCX,
+                                                 unsigned int* neighborCY,
+                                                 unsigned int* neighborCZ,
+                                                 unsigned int* neighborFX,
+                                                 unsigned int* neighborFY,
+                                                 unsigned int* neighborFZ,
+                                                 unsigned long long numberOfLBnodesC,
+                                                 unsigned long long numberOfLBnodesF,
+                                                 bool isEvenTimestep,
+                                                 unsigned int* posC,
+                                                 unsigned int* posFSWB,
+                                                 unsigned int kFC,
+                                                 real omCoarse,
+                                                 real omFine,
+                                                 real nu,
+                                                 unsigned int nxC,
+                                                 unsigned int nyC,
+                                                 unsigned int nxF,
+                                                 unsigned int nyF,
+                                                 OffFC offFC);
 
 
 __global__ void scaleFC_staggered_time_comp_27(real* DC,
-														  real* DF,
-														  unsigned int* neighborCX,
-														  unsigned int* neighborCY,
-														  unsigned int* neighborCZ,
-														  unsigned int* neighborFX,
-														  unsigned int* neighborFY,
-														  unsigned int* neighborFZ,
-														  unsigned int size_MatC,
-														  unsigned int size_MatF,
-														  bool isEvenTimestep,
-														  unsigned int* posC,
-														  unsigned int* posFSWB,
-														  unsigned int kFC,
-														  real omCoarse,
-														  real omFine,
-														  real nu,
-														  unsigned int nxC,
-														  unsigned int nyC,
-														  unsigned int nxF,
-														  unsigned int nyF,
-														  OffFC offFC);
+                                                          real* DF,
+                                                          unsigned int* neighborCX,
+                                                          unsigned int* neighborCY,
+                                                          unsigned int* neighborCZ,
+                                                          unsigned int* neighborFX,
+                                                          unsigned int* neighborFY,
+                                                          unsigned int* neighborFZ,
+                                                          unsigned long long numberOfLBnodesC,
+                                                          unsigned long long numberOfLBnodesF,
+                                                          bool isEvenTimestep,
+                                                          unsigned int* posC,
+                                                          unsigned int* posFSWB,
+                                                          unsigned int kFC,
+                                                          real omCoarse,
+                                                          real omFine,
+                                                          real nu,
+                                                          unsigned int nxC,
+                                                          unsigned int nyC,
+                                                          unsigned int nxF,
+                                                          unsigned int nyF,
+                                                          OffFC offFC);
 
 __global__ void scaleFC_RhoSq_comp_27( real* DC,
-												  real* DF,
-												  unsigned int* neighborCX,
-												  unsigned int* neighborCY,
-												  unsigned int* neighborCZ,
-												  unsigned int* neighborFX,
-												  unsigned int* neighborFY,
-												  unsigned int* neighborFZ,
-												  unsigned int size_MatC,
-												  unsigned int size_MatF,
-												  bool isEvenTimestep,
-												  unsigned int* posC,
-												  unsigned int* posFSWB,
-												  unsigned int kFC,
-												  real omCoarse,
-												  real omFine,
-												  real nu,
-												  unsigned int nxC,
-												  unsigned int nyC,
-												  unsigned int nxF,
-												  unsigned int nyF,
-												  OffFC offFC);
+                                                  real* DF,
+                                                  unsigned int* neighborCX,
+                                                  unsigned int* neighborCY,
+                                                  unsigned int* neighborCZ,
+                                                  unsigned int* neighborFX,
+                                                  unsigned int* neighborFY,
+                                                  unsigned int* neighborFZ,
+                                                  unsigned long long numberOfLBnodesC,
+                                                  unsigned long long numberOfLBnodesF,
+                                                  bool isEvenTimestep,
+                                                  unsigned int* posC,
+                                                  unsigned int* posFSWB,
+                                                  unsigned int kFC,
+                                                  real omCoarse,
+                                                  real omFine,
+                                                  real nu,
+                                                  unsigned int nxC,
+                                                  unsigned int nyC,
+                                                  unsigned int nxF,
+                                                  unsigned int nyF,
+                                                  OffFC offFC);
 
 __global__ void scaleFC_compressible(
     real *distributionsCoarse,
@@ -2149,8 +2272,8 @@ __global__ void scaleFC_compressible(
     unsigned int *neighborXfine,
     unsigned int *neighborYfine,
     unsigned int *neighborZfine,
-    unsigned int numberOfLBnodesCoarse,
-    unsigned int numberOfLBnodesFine,
+    unsigned long long numberOfLBnodesCoarse,
+    unsigned long long numberOfLBnodesFine,
     bool isEvenTimestep,
     unsigned int *indicesCoarse000,
     unsigned int *indicesFineMMM,
@@ -2160,73 +2283,73 @@ __global__ void scaleFC_compressible(
     OffFC offsetFC);
 
 __global__ void scaleFC_RhoSq_3rdMom_comp_27(real* DC,
-														real* DF,
-														unsigned int* neighborCX,
-														unsigned int* neighborCY,
-														unsigned int* neighborCZ,
-														unsigned int* neighborFX,
-														unsigned int* neighborFY,
-														unsigned int* neighborFZ,
-														unsigned int size_MatC,
-														unsigned int size_MatF,
-														bool isEvenTimestep,
-														unsigned int* posC,
-														unsigned int* posFSWB,
-														unsigned int kFC,
-														real omCoarse,
-														real omFine,
-														real nu,
-														unsigned int nxC,
-														unsigned int nyC,
-														unsigned int nxF,
-														unsigned int nyF,
-														OffFC offFC);
+                                                        real* DF,
+                                                        unsigned int* neighborCX,
+                                                        unsigned int* neighborCY,
+                                                        unsigned int* neighborCZ,
+                                                        unsigned int* neighborFX,
+                                                        unsigned int* neighborFY,
+                                                        unsigned int* neighborFZ,
+                                                        unsigned long long numberOfLBnodesC,
+                                                        unsigned long long numberOfLBnodesF,
+                                                        bool isEvenTimestep,
+                                                        unsigned int* posC,
+                                                        unsigned int* posFSWB,
+                                                        unsigned int kFC,
+                                                        real omCoarse,
+                                                        real omFine,
+                                                        real nu,
+                                                        unsigned int nxC,
+                                                        unsigned int nyC,
+                                                        unsigned int nxF,
+                                                        unsigned int nyF,
+                                                        OffFC offFC);
 
 __global__ void scaleFC_AA2016_comp_27(real* DC,
-												  real* DF,
-												  unsigned int* neighborCX,
-												  unsigned int* neighborCY,
-												  unsigned int* neighborCZ,
-												  unsigned int* neighborFX,
-												  unsigned int* neighborFY,
-												  unsigned int* neighborFZ,
-												  unsigned int size_MatC,
-												  unsigned int size_MatF,
-												  bool isEvenTimestep,
-												  unsigned int* posC,
-												  unsigned int* posFSWB,
-												  unsigned int kFC,
-												  real omCoarse,
-												  real omFine,
-												  real nu,
-												  unsigned int nxC,
-												  unsigned int nyC,
-												  unsigned int nxF,
-												  unsigned int nyF,
-												  OffFC offFC);
+                                                  real* DF,
+                                                  unsigned int* neighborCX,
+                                                  unsigned int* neighborCY,
+                                                  unsigned int* neighborCZ,
+                                                  unsigned int* neighborFX,
+                                                  unsigned int* neighborFY,
+                                                  unsigned int* neighborFZ,
+                                                  unsigned long long numberOfLBnodesC,
+                                                  unsigned long long numberOfLBnodesF,
+                                                  bool isEvenTimestep,
+                                                  unsigned int* posC,
+                                                  unsigned int* posFSWB,
+                                                  unsigned int kFC,
+                                                  real omCoarse,
+                                                  real omFine,
+                                                  real nu,
+                                                  unsigned int nxC,
+                                                  unsigned int nyC,
+                                                  unsigned int nxF,
+                                                  unsigned int nyF,
+                                                  OffFC offFC);
 
 __global__ void scaleFC_NSPress_27(real* DC,
-											  real* DF,
-											  unsigned int* neighborCX,
-											  unsigned int* neighborCY,
-											  unsigned int* neighborCZ,
-											  unsigned int* neighborFX,
-											  unsigned int* neighborFY,
-											  unsigned int* neighborFZ,
-											  unsigned int size_MatC,
-											  unsigned int size_MatF,
-											  bool isEvenTimestep,
-											  unsigned int* posC,
-											  unsigned int* posFSWB,
-											  unsigned int kFC,
-											  real omCoarse,
-											  real omFine,
-											  real nu,
-											  unsigned int nxC,
-											  unsigned int nyC,
-											  unsigned int nxF,
-											  unsigned int nyF,
-											  OffFC offFC);
+                                              real* DF,
+                                              unsigned int* neighborCX,
+                                              unsigned int* neighborCY,
+                                              unsigned int* neighborCZ,
+                                              unsigned int* neighborFX,
+                                              unsigned int* neighborFY,
+                                              unsigned int* neighborFZ,
+                                              unsigned long long numberOfLBnodesC,
+                                              unsigned long long numberOfLBnodesF,
+                                              bool isEvenTimestep,
+                                              unsigned int* posC,
+                                              unsigned int* posFSWB,
+                                              unsigned int kFC,
+                                              real omCoarse,
+                                              real omFine,
+                                              real nu,
+                                              unsigned int nxC,
+                                              unsigned int nyC,
+                                              unsigned int nxF,
+                                              unsigned int nyF,
+                                              OffFC offFC);
 
 __global__ void scaleFCThSMG7( real* DC,
                                           real* DF,
@@ -2238,8 +2361,8 @@ __global__ void scaleFCThSMG7( real* DC,
                                           unsigned int* neighborFX,
                                           unsigned int* neighborFY,
                                           unsigned int* neighborFZ,
-                                          unsigned int size_MatC,
-                                          unsigned int size_MatF,
+                                          unsigned long long numberOfLBnodesC,
+                                          unsigned long long numberOfLBnodesF,
                                           bool isEvenTimestep,
                                           unsigned int* posC,
                                           unsigned int* posFSWB,
@@ -2258,8 +2381,8 @@ __global__ void scaleFCThS7(real* DC,
                                        unsigned int* neighborFX,
                                        unsigned int* neighborFY,
                                        unsigned int* neighborFZ,
-                                       unsigned int size_MatC,
-                                       unsigned int size_MatF,
+                                       unsigned long long numberOfLBnodesC,
+                                       unsigned long long numberOfLBnodesF,
                                        bool isEvenTimestep,
                                        unsigned int* posC,
                                        unsigned int* posFSWB,
@@ -2277,242 +2400,242 @@ __global__ void scaleFCThS27(  real* DC,
                                           unsigned int* neighborFX,
                                           unsigned int* neighborFY,
                                           unsigned int* neighborFZ,
-                                          unsigned int size_MatC,
-                                          unsigned int size_MatF,
+                                          unsigned long long numberOfLBnodesC,
+                                          unsigned long long numberOfLBnodesF,
                                           bool isEvenTimestep,
                                           unsigned int* posC,
                                           unsigned int* posFSWB,
                                           unsigned int kFC,
                                           real nu,
                                           real diffusivity_coarse,
-										  OffFC offFC);
+                                          OffFC offFC);
 
 __global__ void DragLiftPost27(  real* DD,
-											int* k_Q,
-											real* QQ,
-											int numberOfBCnodes,
-											double *DragX,
-											double *DragY,
-											double *DragZ,
-											unsigned int* neighborX,
-											unsigned int* neighborY,
-											unsigned int* neighborZ,
-											unsigned int size_Mat,
-											bool isEvenTimestep);
+                                            int* k_Q,
+                                            real* QQ,
+                                            int numberOfBCnodes,
+                                            double *DragX,
+                                            double *DragY,
+                                            double *DragZ,
+                                            unsigned int* neighborX,
+                                            unsigned int* neighborY,
+                                            unsigned int* neighborZ,
+                                            unsigned long long numberOfLBnodes,
+                                            bool isEvenTimestep);
 
 __global__ void DragLiftPre27(   real* DD,
-											int* k_Q,
-											real* QQ,
-											int numberOfBCnodes,
-											double *DragX,
-											double *DragY,
-											double *DragZ,
-											unsigned int* neighborX,
-											unsigned int* neighborY,
-											unsigned int* neighborZ,
-											unsigned int size_Mat,
-											bool isEvenTimestep);
+                                            int* k_Q,
+                                            real* QQ,
+                                            int numberOfBCnodes,
+                                            double *DragX,
+                                            double *DragY,
+                                            double *DragZ,
+                                            unsigned int* neighborX,
+                                            unsigned int* neighborY,
+                                            unsigned int* neighborZ,
+                                            unsigned long long numberOfLBnodes,
+                                            bool isEvenTimestep);
 
 __global__ void CalcCP27(real* DD,
-									int* cpIndex,
-									int nonCp,
-									double *cpPress,
-									unsigned int* neighborX,
-									unsigned int* neighborY,
-									unsigned int* neighborZ,
-									unsigned int size_Mat,
-									bool isEvenTimestep);
+                                    int* cpIndex,
+                                    int nonCp,
+                                    double *cpPress,
+                                    unsigned int* neighborX,
+                                    unsigned int* neighborY,
+                                    unsigned int* neighborZ,
+                                    unsigned long long numberOfLBnodes,
+                                    bool isEvenTimestep);
 
 __global__ void getSendFsPre27(real* DD,
-										  real* bufferFs,
-										  int* sendIndex,
+                                          real* bufferFs,
+                                          int* sendIndex,
                                           int buffmax,
                                           unsigned int* neighborX,
                                           unsigned int* neighborY,
                                           unsigned int* neighborZ,
-                                          unsigned int size_Mat,
+                                          unsigned long long numberOfLBnodes,
                                           bool isEvenTimestep);
 
 __global__ void getSendFsPost27(real* DD,
-										   real* bufferFs,
-										   int* sendIndex,
+                                           real* bufferFs,
+                                           int* sendIndex,
                                            int buffmax,
                                            unsigned int* neighborX,
                                            unsigned int* neighborY,
                                            unsigned int* neighborZ,
-                                           unsigned int size_Mat,
+                                           unsigned long long numberOfLBnodes,
                                            bool isEvenTimestep);
 
 __global__ void setRecvFsPre27(real* DD,
-										  real* bufferFs,
-										  int* recvIndex,
+                                          real* bufferFs,
+                                          int* recvIndex,
                                           int buffmax,
                                           unsigned int* neighborX,
                                           unsigned int* neighborY,
                                           unsigned int* neighborZ,
-                                          unsigned int size_Mat,
+                                          unsigned long long numberOfLBnodes,
                                           bool isEvenTimestep);
 
 __global__ void setRecvFsPost27(real* DD,
-										   real* bufferFs,
-										   int* recvIndex,
+                                           real* bufferFs,
+                                           int* recvIndex,
                                            int buffmax,
                                            unsigned int* neighborX,
                                            unsigned int* neighborY,
                                            unsigned int* neighborZ,
-                                           unsigned int size_Mat,
+                                           unsigned long long numberOfLBnodes,
                                            bool isEvenTimestep);
 
 __global__ void getSendGsF3(
-	real* G6,
-	real* bufferGs,
-	int* sendIndex,
-	int buffmax,
-	unsigned int* neighborX,
-	unsigned int* neighborY,
-	unsigned int* neighborZ,
-	unsigned int size_Mat,
-	bool isEvenTimestep);
+    real* G6,
+    real* bufferGs,
+    int* sendIndex,
+    int buffmax,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep);
 
 __global__ void setRecvGsF3(
-	real* G6,
-	real* bufferGs,
-	int* recvIndex,
-	int buffmax,
-	unsigned int* neighborX,
-	unsigned int* neighborY,
-	unsigned int* neighborZ,
-	unsigned int size_Mat,
-	bool isEvenTimestep);
+    real* G6,
+    real* bufferGs,
+    int* recvIndex,
+    int buffmax,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep);
 
 __global__ void WallFunction27( 	real* vx,
-											real* vy,
-											real* vz,
-											real* DD,
-											int* k_Q,
-											real* QQ,
-											unsigned int numberOfBCnodes,
-											real om1,
-											unsigned int* neighborX,
-											unsigned int* neighborY,
-											unsigned int* neighborZ,
-											unsigned int size_Mat,
-											bool isEvenTimestep);
+                                            real* vy,
+                                            real* vz,
+                                            real* DD,
+                                            int* k_Q,
+                                            real* QQ,
+                                            unsigned int numberOfBCnodes,
+                                            real om1,
+                                            unsigned int* neighborX,
+                                            unsigned int* neighborY,
+                                            unsigned int* neighborZ,
+                                            unsigned long long numberOfLBnodes,
+                                            bool isEvenTimestep);
 
 __global__ void LBSetOutputWallVelocitySP27( real* vxD,
-														real* vyD,
-														real* vzD,
-														real* vxWall,
-														real* vyWall,
-														real* vzWall,
-														int numberOfWallNodes,
-														int* kWallNodes,
-														real* rhoD,
-														real* pressD,
-														unsigned int* geoD,
-														unsigned int* neighborX,
-														unsigned int* neighborY,
-														unsigned int* neighborZ,
-														unsigned int size_Mat,
-														real* DD,
-														bool isEvenTimestep);
+                                                        real* vyD,
+                                                        real* vzD,
+                                                        real* vxWall,
+                                                        real* vyWall,
+                                                        real* vzWall,
+                                                        int numberOfWallNodes,
+                                                        int* kWallNodes,
+                                                        real* rhoD,
+                                                        real* pressD,
+                                                        unsigned int* geoD,
+                                                        unsigned int* neighborX,
+                                                        unsigned int* neighborY,
+                                                        unsigned int* neighborZ,
+                                                        unsigned long long numberOfLBnodes,
+                                                        real* DD,
+                                                        bool isEvenTimestep);
 
 __global__ void GetVeloforForcing27( real* DD,
-												int* bcIndex,
-												int nonAtBC,
-												real* Vx,
-												real* Vy,
-												real* Vz,
-												unsigned int* neighborX,
-												unsigned int* neighborY,
-												unsigned int* neighborZ,
-												unsigned int size_Mat,
-												bool isEvenTimestep);
+                                                int* bcIndex,
+                                                int nonAtBC,
+                                                real* Vx,
+                                                real* Vy,
+                                                real* Vz,
+                                                unsigned int* neighborX,
+                                                unsigned int* neighborY,
+                                                unsigned int* neighborZ,
+                                                unsigned long long numberOfLBnodes,
+                                                bool isEvenTimestep);
 
 __global__ void InitParticles( real* coordX,
-										  real* coordY,
-										  real* coordZ,
-										  real* coordParticleXlocal,
-										  real* coordParticleYlocal,
-										  real* coordParticleZlocal,
-										  real* coordParticleXglobal,
-										  real* coordParticleYglobal,
-										  real* coordParticleZglobal,
-										  real* veloParticleX,
-										  real* veloParticleY,
-										  real* veloParticleZ,
-										  real* randArray,
-										  unsigned int* particleID,
-										  unsigned int* cellBaseID,
-										  unsigned int* bcMatD,
-										  unsigned int* neighborX,
-										  unsigned int* neighborY,
-										  unsigned int* neighborZ,
-										  unsigned int* neighborWSB,
-										  int level,
-									      unsigned int numberOfParticles,
-										  unsigned int size_Mat);
+                                          real* coordY,
+                                          real* coordZ,
+                                          real* coordParticleXlocal,
+                                          real* coordParticleYlocal,
+                                          real* coordParticleZlocal,
+                                          real* coordParticleXglobal,
+                                          real* coordParticleYglobal,
+                                          real* coordParticleZglobal,
+                                          real* veloParticleX,
+                                          real* veloParticleY,
+                                          real* veloParticleZ,
+                                          real* randArray,
+                                          unsigned int* particleID,
+                                          unsigned int* cellBaseID,
+                                          unsigned int* bcMatD,
+                                          unsigned int* neighborX,
+                                          unsigned int* neighborY,
+                                          unsigned int* neighborZ,
+                                          unsigned int* neighborWSB,
+                                          int level,
+                                          unsigned int numberOfParticles,
+                                          unsigned long long numberOfLBnodes);
 
 __global__ void MoveParticles( real* coordX,
-										  real* coordY,
-										  real* coordZ,
-										  real* coordParticleXlocal,
-										  real* coordParticleYlocal,
-										  real* coordParticleZlocal,
-										  real* coordParticleXglobal,
-										  real* coordParticleYglobal,
-										  real* coordParticleZglobal,
-										  real* veloParticleX,
-										  real* veloParticleY,
-										  real* veloParticleZ,
-										  real* DD,
-										  real  omega,
-										  unsigned int* particleID,
-										  unsigned int* cellBaseID,
-										  unsigned int* bcMatD,
-										  unsigned int* neighborX,
-										  unsigned int* neighborY,
-										  unsigned int* neighborZ,
-										  unsigned int* neighborWSB,
-										  int level,
-										  unsigned int timestep,
-										  unsigned int numberOfTimesteps,
-									      unsigned int numberOfParticles,
-										  unsigned int size_Mat,
-										  bool isEvenTimestep);
+                                          real* coordY,
+                                          real* coordZ,
+                                          real* coordParticleXlocal,
+                                          real* coordParticleYlocal,
+                                          real* coordParticleZlocal,
+                                          real* coordParticleXglobal,
+                                          real* coordParticleYglobal,
+                                          real* coordParticleZglobal,
+                                          real* veloParticleX,
+                                          real* veloParticleY,
+                                          real* veloParticleZ,
+                                          real* DD,
+                                          real  omega,
+                                          unsigned int* particleID,
+                                          unsigned int* cellBaseID,
+                                          unsigned int* bcMatD,
+                                          unsigned int* neighborX,
+                                          unsigned int* neighborY,
+                                          unsigned int* neighborZ,
+                                          unsigned int* neighborWSB,
+                                          int level,
+                                          unsigned int timestep,
+                                          unsigned int numberOfTimesteps,
+                                          unsigned int numberOfParticles,
+                                          unsigned long long numberOfLBnodes,
+                                          bool isEvenTimestep);
 
 __global__ void MoveParticlesWithoutBCs(   real* coordX,
-													  real* coordY,
-													  real* coordZ,
-													  real* coordParticleXlocal,
-													  real* coordParticleYlocal,
-													  real* coordParticleZlocal,
-													  real* coordParticleXglobal,
-													  real* coordParticleYglobal,
-													  real* coordParticleZglobal,
-													  real* veloParticleX,
-													  real* veloParticleY,
-													  real* veloParticleZ,
-													  real* DD,
-													  real  omega,
-													  unsigned int* particleID,
-													  unsigned int* cellBaseID,
-													  unsigned int* bcMatD,
-													  unsigned int* neighborX,
-													  unsigned int* neighborY,
-													  unsigned int* neighborZ,
-													  unsigned int* neighborWSB,
-													  int level,
-													  unsigned int timestep,
-													  unsigned int numberOfTimesteps,
-													  unsigned int numberOfParticles,
-													  unsigned int size_Mat,
-													  bool isEvenTimestep);
+                                                      real* coordY,
+                                                      real* coordZ,
+                                                      real* coordParticleXlocal,
+                                                      real* coordParticleYlocal,
+                                                      real* coordParticleZlocal,
+                                                      real* coordParticleXglobal,
+                                                      real* coordParticleYglobal,
+                                                      real* coordParticleZglobal,
+                                                      real* veloParticleX,
+                                                      real* veloParticleY,
+                                                      real* veloParticleZ,
+                                                      real* DD,
+                                                      real  omega,
+                                                      unsigned int* particleID,
+                                                      unsigned int* cellBaseID,
+                                                      unsigned int* bcMatD,
+                                                      unsigned int* neighborX,
+                                                      unsigned int* neighborY,
+                                                      unsigned int* neighborZ,
+                                                      unsigned int* neighborWSB,
+                                                      int level,
+                                                      unsigned int timestep,
+                                                      unsigned int numberOfTimesteps,
+                                                      unsigned int numberOfParticles,
+                                                      unsigned long long numberOfLBnodes,
+                                                      bool isEvenTimestep);
 
 __global__ void initRandom(curandState* state);
 
 __global__ void generateRandomValues(curandState* state,
-												real* randArray);
+                                                real* randArray);
 
 __global__ void CalcTurbulenceIntensity(
    real* vxx,
@@ -2529,7 +2652,7 @@ __global__ void CalcTurbulenceIntensity(
    unsigned int* neighborX,
    unsigned int* neighborY,
    unsigned int* neighborZ,
-   unsigned int size_Mat,
+   unsigned long long numberOfLBnodes,
    bool isEvenTimestep);
 
 #endif
diff --git a/src/gpu/VirtualFluids_GPU/GPU/GridScaling/ScaleCF27.cu b/src/gpu/VirtualFluids_GPU/GPU/GridScaling/ScaleCF27.cu
index 619d68c87d7a707e70be4c56d434191994144148..641d6519669b1522430fe88990c00d0630d00e9b 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/GridScaling/ScaleCF27.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/GridScaling/ScaleCF27.cu
@@ -22,8 +22,8 @@ __global__ void scaleCF_0817_comp_27( real* DC,
 												 unsigned int* neighborFX,
 												 unsigned int* neighborFY,
 												 unsigned int* neighborFZ,
-												 unsigned int size_MatC, 
-												 unsigned int size_MatF, 
+												 unsigned long long numberOfLBnodesCoarse, 
+												 unsigned long long numberOfLBnodesFine, 
 												 bool isEvenTimestep,
 												 unsigned int* posCSWB, 
 												 unsigned int* posFSWB, 
@@ -43,33 +43,33 @@ __global__ void scaleCF_0817_comp_27( real* DC,
 		*f000dest, *fMMMdest, *fMMPdest, *fMPPdest, *fMPMdest, *fPPMdest, *fPPPdest, *fPMPdest, *fPMMdest;
 
 
-	fP00dest = &DF[DIR_P00   *size_MatF];
-	fM00dest = &DF[DIR_M00   *size_MatF];
-	f0P0dest = &DF[DIR_0P0   *size_MatF];
-	f0M0dest = &DF[DIR_0M0   *size_MatF];
-	f00Pdest = &DF[DIR_00P   *size_MatF];
-	f00Mdest = &DF[DIR_00M   *size_MatF];
-	fPP0dest = &DF[DIR_PP0  *size_MatF];
-	fMM0dest = &DF[DIR_MM0  *size_MatF];
-	fPM0dest = &DF[DIR_PM0  *size_MatF];
-	fMP0dest = &DF[DIR_MP0  *size_MatF];
-	fP0Pdest = &DF[DIR_P0P  *size_MatF];
-	fM0Mdest = &DF[DIR_M0M  *size_MatF];
-	fP0Mdest = &DF[DIR_P0M  *size_MatF];
-	fM0Pdest = &DF[DIR_M0P  *size_MatF];
-	f0PPdest = &DF[DIR_0PP  *size_MatF];
-	f0MMdest = &DF[DIR_0MM  *size_MatF];
-	f0PMdest = &DF[DIR_0PM  *size_MatF];
-	f0MPdest = &DF[DIR_0MP  *size_MatF];
-	f000dest = &DF[DIR_000*size_MatF];
-	fMMMdest = &DF[DIR_MMM *size_MatF];
-	fMMPdest = &DF[DIR_MMP *size_MatF];
-	fMPPdest = &DF[DIR_MPP *size_MatF];
-	fMPMdest = &DF[DIR_MPM *size_MatF];
-	fPPMdest = &DF[DIR_PPM *size_MatF];
-	fPPPdest = &DF[DIR_PPP *size_MatF];
-	fPMPdest = &DF[DIR_PMP *size_MatF];
-	fPMMdest = &DF[DIR_PMM *size_MatF];
+	fP00dest = &DF[DIR_P00 * numberOfLBnodesFine];
+	fM00dest = &DF[DIR_M00 * numberOfLBnodesFine];
+	f0P0dest = &DF[DIR_0P0 * numberOfLBnodesFine];
+	f0M0dest = &DF[DIR_0M0 * numberOfLBnodesFine];
+	f00Pdest = &DF[DIR_00P * numberOfLBnodesFine];
+	f00Mdest = &DF[DIR_00M * numberOfLBnodesFine];
+	fPP0dest = &DF[DIR_PP0 * numberOfLBnodesFine];
+	fMM0dest = &DF[DIR_MM0 * numberOfLBnodesFine];
+	fPM0dest = &DF[DIR_PM0 * numberOfLBnodesFine];
+	fMP0dest = &DF[DIR_MP0 * numberOfLBnodesFine];
+	fP0Pdest = &DF[DIR_P0P * numberOfLBnodesFine];
+	fM0Mdest = &DF[DIR_M0M * numberOfLBnodesFine];
+	fP0Mdest = &DF[DIR_P0M * numberOfLBnodesFine];
+	fM0Pdest = &DF[DIR_M0P * numberOfLBnodesFine];
+	f0PPdest = &DF[DIR_0PP * numberOfLBnodesFine];
+	f0MMdest = &DF[DIR_0MM * numberOfLBnodesFine];
+	f0PMdest = &DF[DIR_0PM * numberOfLBnodesFine];
+	f0MPdest = &DF[DIR_0MP * numberOfLBnodesFine];
+	f000dest = &DF[DIR_000 * numberOfLBnodesFine];
+	fMMMdest = &DF[DIR_MMM * numberOfLBnodesFine];
+	fMMPdest = &DF[DIR_MMP * numberOfLBnodesFine];
+	fMPPdest = &DF[DIR_MPP * numberOfLBnodesFine];
+	fMPMdest = &DF[DIR_MPM * numberOfLBnodesFine];
+	fPPMdest = &DF[DIR_PPM * numberOfLBnodesFine];
+	fPPPdest = &DF[DIR_PPP * numberOfLBnodesFine];
+	fPMPdest = &DF[DIR_PMP * numberOfLBnodesFine];
+	fPMMdest = &DF[DIR_PMM * numberOfLBnodesFine];
 
 	real
 		*fP00source, *fM00source, *f0P0source, *f0M0source, *f00Psource, *f00Msource, *fPP0source, *fMM0source, *fPM0source,
@@ -78,63 +78,63 @@ __global__ void scaleCF_0817_comp_27( real* DC,
 
 	if (isEvenTimestep == true)
 	{
-		fP00source = &DC[DIR_P00   *size_MatC];
-		fM00source = &DC[DIR_M00   *size_MatC];
-		f0P0source = &DC[DIR_0P0   *size_MatC];
-		f0M0source = &DC[DIR_0M0   *size_MatC];
-		f00Psource = &DC[DIR_00P   *size_MatC];
-		f00Msource = &DC[DIR_00M   *size_MatC];
-		fPP0source = &DC[DIR_PP0  *size_MatC];
-		fMM0source = &DC[DIR_MM0  *size_MatC];
-		fPM0source = &DC[DIR_PM0  *size_MatC];
-		fMP0source = &DC[DIR_MP0  *size_MatC];
-		fP0Psource = &DC[DIR_P0P  *size_MatC];
-		fM0Msource = &DC[DIR_M0M  *size_MatC];
-		fP0Msource = &DC[DIR_P0M  *size_MatC];
-		fM0Psource = &DC[DIR_M0P  *size_MatC];
-		f0PPsource = &DC[DIR_0PP  *size_MatC];
-		f0MMsource = &DC[DIR_0MM  *size_MatC];
-		f0PMsource = &DC[DIR_0PM  *size_MatC];
-		f0MPsource = &DC[DIR_0MP  *size_MatC];
-		f000source = &DC[DIR_000*size_MatC];
-		fMMMsource = &DC[DIR_MMM *size_MatC];
-		fMMPsource = &DC[DIR_MMP *size_MatC];
-		fMPPsource = &DC[DIR_MPP *size_MatC];
-		fMPMsource = &DC[DIR_MPM *size_MatC];
-		fPPMsource = &DC[DIR_PPM *size_MatC];
-		fPPPsource = &DC[DIR_PPP *size_MatC];
-		fPMPsource = &DC[DIR_PMP *size_MatC];
-		fPMMsource = &DC[DIR_PMM *size_MatC];
+		fP00source = &DC[DIR_P00 * numberOfLBnodesCoarse];
+		fM00source = &DC[DIR_M00 * numberOfLBnodesCoarse];
+		f0P0source = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+		f0M0source = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+		f00Psource = &DC[DIR_00P * numberOfLBnodesCoarse];
+		f00Msource = &DC[DIR_00M * numberOfLBnodesCoarse];
+		fPP0source = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+		fMM0source = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+		fPM0source = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+		fMP0source = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+		fP0Psource = &DC[DIR_P0P * numberOfLBnodesCoarse];
+		fM0Msource = &DC[DIR_M0M * numberOfLBnodesCoarse];
+		fP0Msource = &DC[DIR_P0M * numberOfLBnodesCoarse];
+		fM0Psource = &DC[DIR_M0P * numberOfLBnodesCoarse];
+		f0PPsource = &DC[DIR_0PP * numberOfLBnodesCoarse];
+		f0MMsource = &DC[DIR_0MM * numberOfLBnodesCoarse];
+		f0PMsource = &DC[DIR_0PM * numberOfLBnodesCoarse];
+		f0MPsource = &DC[DIR_0MP * numberOfLBnodesCoarse];
+		f000source = &DC[DIR_000 * numberOfLBnodesCoarse];
+		fMMMsource = &DC[DIR_MMM * numberOfLBnodesCoarse];
+		fMMPsource = &DC[DIR_MMP * numberOfLBnodesCoarse];
+		fMPPsource = &DC[DIR_MPP * numberOfLBnodesCoarse];
+		fMPMsource = &DC[DIR_MPM * numberOfLBnodesCoarse];
+		fPPMsource = &DC[DIR_PPM * numberOfLBnodesCoarse];
+		fPPPsource = &DC[DIR_PPP * numberOfLBnodesCoarse];
+		fPMPsource = &DC[DIR_PMP * numberOfLBnodesCoarse];
+		fPMMsource = &DC[DIR_PMM * numberOfLBnodesCoarse];
 	}
 	else
 	{
-		fP00source = &DC[DIR_M00   *size_MatC];
-		fM00source = &DC[DIR_P00   *size_MatC];
-		f0P0source = &DC[DIR_0M0   *size_MatC];
-		f0M0source = &DC[DIR_0P0   *size_MatC];
-		f00Psource = &DC[DIR_00M   *size_MatC];
-		f00Msource = &DC[DIR_00P   *size_MatC];
-		fPP0source = &DC[DIR_MM0  *size_MatC];
-		fMM0source = &DC[DIR_PP0  *size_MatC];
-		fPM0source = &DC[DIR_MP0  *size_MatC];
-		fMP0source = &DC[DIR_PM0  *size_MatC];
-		fP0Psource = &DC[DIR_M0M  *size_MatC];
-		fM0Msource = &DC[DIR_P0P  *size_MatC];
-		fP0Msource = &DC[DIR_M0P  *size_MatC];
-		fM0Psource = &DC[DIR_P0M  *size_MatC];
-		f0PPsource = &DC[DIR_0MM  *size_MatC];
-		f0MMsource = &DC[DIR_0PP  *size_MatC];
-		f0PMsource = &DC[DIR_0MP  *size_MatC];
-		f0MPsource = &DC[DIR_0PM  *size_MatC];
-		f000source = &DC[DIR_000*size_MatC];
-		fMMMsource = &DC[DIR_PPP *size_MatC];
-		fMMPsource = &DC[DIR_PPM *size_MatC];
-		fMPPsource = &DC[DIR_PMM *size_MatC];
-		fMPMsource = &DC[DIR_PMP *size_MatC];
-		fPPMsource = &DC[DIR_MMP *size_MatC];
-		fPPPsource = &DC[DIR_MMM *size_MatC];
-		fPMPsource = &DC[DIR_MPM *size_MatC];
-		fPMMsource = &DC[DIR_MPP *size_MatC];
+		fP00source = &DC[DIR_M00 * numberOfLBnodesCoarse];
+		fM00source = &DC[DIR_P00 * numberOfLBnodesCoarse];
+		f0P0source = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+		f0M0source = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+		f00Psource = &DC[DIR_00M * numberOfLBnodesCoarse];
+		f00Msource = &DC[DIR_00P * numberOfLBnodesCoarse];
+		fPP0source = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+		fMM0source = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+		fPM0source = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+		fMP0source = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+		fP0Psource = &DC[DIR_M0M * numberOfLBnodesCoarse];
+		fM0Msource = &DC[DIR_P0P * numberOfLBnodesCoarse];
+		fP0Msource = &DC[DIR_M0P * numberOfLBnodesCoarse];
+		fM0Psource = &DC[DIR_P0M * numberOfLBnodesCoarse];
+		f0PPsource = &DC[DIR_0MM * numberOfLBnodesCoarse];
+		f0MMsource = &DC[DIR_0PP * numberOfLBnodesCoarse];
+		f0PMsource = &DC[DIR_0MP * numberOfLBnodesCoarse];
+		f0MPsource = &DC[DIR_0PM * numberOfLBnodesCoarse];
+		f000source = &DC[DIR_000 * numberOfLBnodesCoarse];
+		fMMMsource = &DC[DIR_PPP * numberOfLBnodesCoarse];
+		fMMPsource = &DC[DIR_PPM * numberOfLBnodesCoarse];
+		fMPPsource = &DC[DIR_PMM * numberOfLBnodesCoarse];
+		fMPMsource = &DC[DIR_PMP * numberOfLBnodesCoarse];
+		fPPMsource = &DC[DIR_MMP * numberOfLBnodesCoarse];
+		fPPPsource = &DC[DIR_MMM * numberOfLBnodesCoarse];
+		fPMPsource = &DC[DIR_MPM * numberOfLBnodesCoarse];
+		fPMMsource = &DC[DIR_MPP * numberOfLBnodesCoarse];
 	}
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  ix = threadIdx.x;  // Globaler x-Index 
@@ -4091,8 +4091,8 @@ __global__ void scaleCF_AA2016_comp_27(real* DC,
 												  unsigned int* neighborFX,
 												  unsigned int* neighborFY,
 												  unsigned int* neighborFZ,
-												  unsigned int size_MatC, 
-												  unsigned int size_MatF, 
+												  unsigned long long numberOfLBnodesCoarse, 
+												  unsigned long long numberOfLBnodesFine, 
 												  bool isEvenTimestep,
 												  unsigned int* posCSWB, 
 												  unsigned int* posFSWB, 
@@ -4109,96 +4109,96 @@ __global__ void scaleCF_AA2016_comp_27(real* DC,
    real *feF, *fwF, *fnF, *fsF, *ftF, *fbF, *fneF, *fswF, *fseF, *fnwF, *fteF, *fbwF, *fbeF, *ftwF, *ftnF, *fbsF, *fbnF, *ftsF, *fzeroF, 
       *ftneF, *ftswF, *ftseF, *ftnwF, *fbneF, *fbswF, *fbseF, *fbnwF;
 
-   feF    = &DF[DIR_P00   *size_MatF];
-   fwF    = &DF[DIR_M00   *size_MatF];
-   fnF    = &DF[DIR_0P0   *size_MatF];
-   fsF    = &DF[DIR_0M0   *size_MatF];
-   ftF    = &DF[DIR_00P   *size_MatF];
-   fbF    = &DF[DIR_00M   *size_MatF];
-   fneF   = &DF[DIR_PP0  *size_MatF];
-   fswF   = &DF[DIR_MM0  *size_MatF];
-   fseF   = &DF[DIR_PM0  *size_MatF];
-   fnwF   = &DF[DIR_MP0  *size_MatF];
-   fteF   = &DF[DIR_P0P  *size_MatF];
-   fbwF   = &DF[DIR_M0M  *size_MatF];
-   fbeF   = &DF[DIR_P0M  *size_MatF];
-   ftwF   = &DF[DIR_M0P  *size_MatF];
-   ftnF   = &DF[DIR_0PP  *size_MatF];
-   fbsF   = &DF[DIR_0MM  *size_MatF];
-   fbnF   = &DF[DIR_0PM  *size_MatF];
-   ftsF   = &DF[DIR_0MP  *size_MatF];
-   fzeroF = &DF[DIR_000*size_MatF];
-   ftneF  = &DF[DIR_PPP *size_MatF];
-   ftswF  = &DF[DIR_MMP *size_MatF];
-   ftseF  = &DF[DIR_PMP *size_MatF];
-   ftnwF  = &DF[DIR_MPP *size_MatF];
-   fbneF  = &DF[DIR_PPM *size_MatF];
-   fbswF  = &DF[DIR_MMM *size_MatF];
-   fbseF  = &DF[DIR_PMM *size_MatF];
-   fbnwF  = &DF[DIR_MPM *size_MatF];
+   feF    = &DF[DIR_P00 * numberOfLBnodesFine];
+   fwF    = &DF[DIR_M00 * numberOfLBnodesFine];
+   fnF    = &DF[DIR_0P0 * numberOfLBnodesFine];
+   fsF    = &DF[DIR_0M0 * numberOfLBnodesFine];
+   ftF    = &DF[DIR_00P * numberOfLBnodesFine];
+   fbF    = &DF[DIR_00M * numberOfLBnodesFine];
+   fneF   = &DF[DIR_PP0 * numberOfLBnodesFine];
+   fswF   = &DF[DIR_MM0 * numberOfLBnodesFine];
+   fseF   = &DF[DIR_PM0 * numberOfLBnodesFine];
+   fnwF   = &DF[DIR_MP0 * numberOfLBnodesFine];
+   fteF   = &DF[DIR_P0P * numberOfLBnodesFine];
+   fbwF   = &DF[DIR_M0M * numberOfLBnodesFine];
+   fbeF   = &DF[DIR_P0M * numberOfLBnodesFine];
+   ftwF   = &DF[DIR_M0P * numberOfLBnodesFine];
+   ftnF   = &DF[DIR_0PP * numberOfLBnodesFine];
+   fbsF   = &DF[DIR_0MM * numberOfLBnodesFine];
+   fbnF   = &DF[DIR_0PM * numberOfLBnodesFine];
+   ftsF   = &DF[DIR_0MP * numberOfLBnodesFine];
+   fzeroF = &DF[DIR_000 * numberOfLBnodesFine];
+   ftneF  = &DF[DIR_PPP * numberOfLBnodesFine];
+   ftswF  = &DF[DIR_MMP * numberOfLBnodesFine];
+   ftseF  = &DF[DIR_PMP * numberOfLBnodesFine];
+   ftnwF  = &DF[DIR_MPP * numberOfLBnodesFine];
+   fbneF  = &DF[DIR_PPM * numberOfLBnodesFine];
+   fbswF  = &DF[DIR_MMM * numberOfLBnodesFine];
+   fbseF  = &DF[DIR_PMM * numberOfLBnodesFine];
+   fbnwF  = &DF[DIR_MPM * numberOfLBnodesFine];
 
    real *feC, *fwC, *fnC, *fsC, *ftC, *fbC, *fneC, *fswC, *fseC, *fnwC, *fteC, *fbwC, *fbeC, *ftwC, *ftnC, *fbsC, *fbnC, *ftsC, *fzeroC,
       *ftneC, *ftswC, *ftseC, *ftnwC, *fbneC, *fbswC, *fbseC, *fbnwC;
 
    if (isEvenTimestep==true)
    {
-      feC    = &DC[DIR_P00   *size_MatC];
-      fwC    = &DC[DIR_M00   *size_MatC];
-      fnC    = &DC[DIR_0P0   *size_MatC];
-      fsC    = &DC[DIR_0M0   *size_MatC];
-      ftC    = &DC[DIR_00P   *size_MatC];
-      fbC    = &DC[DIR_00M   *size_MatC];
-      fneC   = &DC[DIR_PP0  *size_MatC];
-      fswC   = &DC[DIR_MM0  *size_MatC];
-      fseC   = &DC[DIR_PM0  *size_MatC];
-      fnwC   = &DC[DIR_MP0  *size_MatC];
-      fteC   = &DC[DIR_P0P  *size_MatC];
-      fbwC   = &DC[DIR_M0M  *size_MatC];
-      fbeC   = &DC[DIR_P0M  *size_MatC];
-      ftwC   = &DC[DIR_M0P  *size_MatC];
-      ftnC   = &DC[DIR_0PP  *size_MatC];
-      fbsC   = &DC[DIR_0MM  *size_MatC];
-      fbnC   = &DC[DIR_0PM  *size_MatC];
-      ftsC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      ftneC  = &DC[DIR_PPP *size_MatC];
-      ftswC  = &DC[DIR_MMP *size_MatC];
-      ftseC  = &DC[DIR_PMP *size_MatC];
-      ftnwC  = &DC[DIR_MPP *size_MatC];
-      fbneC  = &DC[DIR_PPM *size_MatC];
-      fbswC  = &DC[DIR_MMM *size_MatC];
-      fbseC  = &DC[DIR_PMM *size_MatC];
-      fbnwC  = &DC[DIR_MPM *size_MatC];
+      feC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      fwC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    } 
    else
    {
-      fwC    = &DC[DIR_P00   *size_MatC];
-      feC    = &DC[DIR_M00   *size_MatC];
-      fsC    = &DC[DIR_0P0   *size_MatC];
-      fnC    = &DC[DIR_0M0   *size_MatC];
-      fbC    = &DC[DIR_00P   *size_MatC];
-      ftC    = &DC[DIR_00M   *size_MatC];
-      fswC   = &DC[DIR_PP0  *size_MatC];
-      fneC   = &DC[DIR_MM0  *size_MatC];
-      fnwC   = &DC[DIR_PM0  *size_MatC];
-      fseC   = &DC[DIR_MP0  *size_MatC];
-      fbwC   = &DC[DIR_P0P  *size_MatC];
-      fteC   = &DC[DIR_M0M  *size_MatC];
-      ftwC   = &DC[DIR_P0M  *size_MatC];
-      fbeC   = &DC[DIR_M0P  *size_MatC];
-      fbsC   = &DC[DIR_0PP  *size_MatC];
-      ftnC   = &DC[DIR_0MM  *size_MatC];
-      ftsC   = &DC[DIR_0PM  *size_MatC];
-      fbnC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      fbswC  = &DC[DIR_PPP *size_MatC];
-      fbneC  = &DC[DIR_MMP *size_MatC];
-      fbnwC  = &DC[DIR_PMP *size_MatC];
-      fbseC  = &DC[DIR_MPP *size_MatC];
-      ftswC  = &DC[DIR_PPM *size_MatC];
-      ftneC  = &DC[DIR_MMM *size_MatC];
-      ftnwC  = &DC[DIR_PMM *size_MatC];
-      ftseC  = &DC[DIR_MPM *size_MatC];
+      fwC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      feC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    }
 
    ////////////////////////////////////////////////////////////////////////////////
@@ -10974,8 +10974,8 @@ __global__ void scaleCF_RhoSq_3rdMom_comp_27(real* DC,
 														unsigned int* neighborFX,
 														unsigned int* neighborFY,
 														unsigned int* neighborFZ,
-														unsigned int size_MatC, 
-														unsigned int size_MatF, 
+														unsigned long long numberOfLBnodesCoarse, 
+														unsigned long long numberOfLBnodesFine, 
 														bool isEvenTimestep,
 														unsigned int* posCSWB, 
 														unsigned int* posFSWB, 
@@ -10992,96 +10992,96 @@ __global__ void scaleCF_RhoSq_3rdMom_comp_27(real* DC,
    real *feF, *fwF, *fnF, *fsF, *ftF, *fbF, *fneF, *fswF, *fseF, *fnwF, *fteF, *fbwF, *fbeF, *ftwF, *ftnF, *fbsF, *fbnF, *ftsF, *fzeroF, 
       *ftneF, *ftswF, *ftseF, *ftnwF, *fbneF, *fbswF, *fbseF, *fbnwF;
 
-   feF    = &DF[DIR_P00   *size_MatF];
-   fwF    = &DF[DIR_M00   *size_MatF];
-   fnF    = &DF[DIR_0P0   *size_MatF];
-   fsF    = &DF[DIR_0M0   *size_MatF];
-   ftF    = &DF[DIR_00P   *size_MatF];
-   fbF    = &DF[DIR_00M   *size_MatF];
-   fneF   = &DF[DIR_PP0  *size_MatF];
-   fswF   = &DF[DIR_MM0  *size_MatF];
-   fseF   = &DF[DIR_PM0  *size_MatF];
-   fnwF   = &DF[DIR_MP0  *size_MatF];
-   fteF   = &DF[DIR_P0P  *size_MatF];
-   fbwF   = &DF[DIR_M0M  *size_MatF];
-   fbeF   = &DF[DIR_P0M  *size_MatF];
-   ftwF   = &DF[DIR_M0P  *size_MatF];
-   ftnF   = &DF[DIR_0PP  *size_MatF];
-   fbsF   = &DF[DIR_0MM  *size_MatF];
-   fbnF   = &DF[DIR_0PM  *size_MatF];
-   ftsF   = &DF[DIR_0MP  *size_MatF];
-   fzeroF = &DF[DIR_000*size_MatF];
-   ftneF  = &DF[DIR_PPP *size_MatF];
-   ftswF  = &DF[DIR_MMP *size_MatF];
-   ftseF  = &DF[DIR_PMP *size_MatF];
-   ftnwF  = &DF[DIR_MPP *size_MatF];
-   fbneF  = &DF[DIR_PPM *size_MatF];
-   fbswF  = &DF[DIR_MMM *size_MatF];
-   fbseF  = &DF[DIR_PMM *size_MatF];
-   fbnwF  = &DF[DIR_MPM *size_MatF];
+   feF    = &DF[DIR_P00 * numberOfLBnodesFine];
+   fwF    = &DF[DIR_M00 * numberOfLBnodesFine];
+   fnF    = &DF[DIR_0P0 * numberOfLBnodesFine];
+   fsF    = &DF[DIR_0M0 * numberOfLBnodesFine];
+   ftF    = &DF[DIR_00P * numberOfLBnodesFine];
+   fbF    = &DF[DIR_00M * numberOfLBnodesFine];
+   fneF   = &DF[DIR_PP0 * numberOfLBnodesFine];
+   fswF   = &DF[DIR_MM0 * numberOfLBnodesFine];
+   fseF   = &DF[DIR_PM0 * numberOfLBnodesFine];
+   fnwF   = &DF[DIR_MP0 * numberOfLBnodesFine];
+   fteF   = &DF[DIR_P0P * numberOfLBnodesFine];
+   fbwF   = &DF[DIR_M0M * numberOfLBnodesFine];
+   fbeF   = &DF[DIR_P0M * numberOfLBnodesFine];
+   ftwF   = &DF[DIR_M0P * numberOfLBnodesFine];
+   ftnF   = &DF[DIR_0PP * numberOfLBnodesFine];
+   fbsF   = &DF[DIR_0MM * numberOfLBnodesFine];
+   fbnF   = &DF[DIR_0PM * numberOfLBnodesFine];
+   ftsF   = &DF[DIR_0MP * numberOfLBnodesFine];
+   fzeroF = &DF[DIR_000 * numberOfLBnodesFine];
+   ftneF  = &DF[DIR_PPP * numberOfLBnodesFine];
+   ftswF  = &DF[DIR_MMP * numberOfLBnodesFine];
+   ftseF  = &DF[DIR_PMP * numberOfLBnodesFine];
+   ftnwF  = &DF[DIR_MPP * numberOfLBnodesFine];
+   fbneF  = &DF[DIR_PPM * numberOfLBnodesFine];
+   fbswF  = &DF[DIR_MMM * numberOfLBnodesFine];
+   fbseF  = &DF[DIR_PMM * numberOfLBnodesFine];
+   fbnwF  = &DF[DIR_MPM * numberOfLBnodesFine];
 
    real *feC, *fwC, *fnC, *fsC, *ftC, *fbC, *fneC, *fswC, *fseC, *fnwC, *fteC, *fbwC, *fbeC, *ftwC, *ftnC, *fbsC, *fbnC, *ftsC, *fzeroC,
       *ftneC, *ftswC, *ftseC, *ftnwC, *fbneC, *fbswC, *fbseC, *fbnwC;
 
    if (isEvenTimestep==true)
    {
-      feC    = &DC[DIR_P00   *size_MatC];
-      fwC    = &DC[DIR_M00   *size_MatC];
-      fnC    = &DC[DIR_0P0   *size_MatC];
-      fsC    = &DC[DIR_0M0   *size_MatC];
-      ftC    = &DC[DIR_00P   *size_MatC];
-      fbC    = &DC[DIR_00M   *size_MatC];
-      fneC   = &DC[DIR_PP0  *size_MatC];
-      fswC   = &DC[DIR_MM0  *size_MatC];
-      fseC   = &DC[DIR_PM0  *size_MatC];
-      fnwC   = &DC[DIR_MP0  *size_MatC];
-      fteC   = &DC[DIR_P0P  *size_MatC];
-      fbwC   = &DC[DIR_M0M  *size_MatC];
-      fbeC   = &DC[DIR_P0M  *size_MatC];
-      ftwC   = &DC[DIR_M0P  *size_MatC];
-      ftnC   = &DC[DIR_0PP  *size_MatC];
-      fbsC   = &DC[DIR_0MM  *size_MatC];
-      fbnC   = &DC[DIR_0PM  *size_MatC];
-      ftsC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      ftneC  = &DC[DIR_PPP *size_MatC];
-      ftswC  = &DC[DIR_MMP *size_MatC];
-      ftseC  = &DC[DIR_PMP *size_MatC];
-      ftnwC  = &DC[DIR_MPP *size_MatC];
-      fbneC  = &DC[DIR_PPM *size_MatC];
-      fbswC  = &DC[DIR_MMM *size_MatC];
-      fbseC  = &DC[DIR_PMM *size_MatC];
-      fbnwC  = &DC[DIR_MPM *size_MatC];
+      feC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      fwC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    } 
    else
    {
-      fwC    = &DC[DIR_P00   *size_MatC];
-      feC    = &DC[DIR_M00   *size_MatC];
-      fsC    = &DC[DIR_0P0   *size_MatC];
-      fnC    = &DC[DIR_0M0   *size_MatC];
-      fbC    = &DC[DIR_00P   *size_MatC];
-      ftC    = &DC[DIR_00M   *size_MatC];
-      fswC   = &DC[DIR_PP0  *size_MatC];
-      fneC   = &DC[DIR_MM0  *size_MatC];
-      fnwC   = &DC[DIR_PM0  *size_MatC];
-      fseC   = &DC[DIR_MP0  *size_MatC];
-      fbwC   = &DC[DIR_P0P  *size_MatC];
-      fteC   = &DC[DIR_M0M  *size_MatC];
-      ftwC   = &DC[DIR_P0M  *size_MatC];
-      fbeC   = &DC[DIR_M0P  *size_MatC];
-      fbsC   = &DC[DIR_0PP  *size_MatC];
-      ftnC   = &DC[DIR_0MM  *size_MatC];
-      ftsC   = &DC[DIR_0PM  *size_MatC];
-      fbnC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      fbswC  = &DC[DIR_PPP *size_MatC];
-      fbneC  = &DC[DIR_MMP *size_MatC];
-      fbnwC  = &DC[DIR_PMP *size_MatC];
-      fbseC  = &DC[DIR_MPP *size_MatC];
-      ftswC  = &DC[DIR_PPM *size_MatC];
-      ftneC  = &DC[DIR_MMM *size_MatC];
-      ftnwC  = &DC[DIR_PMM *size_MatC];
-      ftseC  = &DC[DIR_MPM *size_MatC];
+      fwC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      feC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    }
 
    ////////////////////////////////////////////////////////////////////////////////
@@ -17849,8 +17849,8 @@ __global__ void scaleCF_RhoSq_comp_27(real* DC,
 												 unsigned int* neighborFX,
 												 unsigned int* neighborFY,
 												 unsigned int* neighborFZ,
-												 unsigned int size_MatC, 
-												 unsigned int size_MatF, 
+												 unsigned long long numberOfLBnodesCoarse, 
+												 unsigned long long numberOfLBnodesFine, 
 												 bool isEvenTimestep,
 												 unsigned int* posCSWB, 
 												 unsigned int* posFSWB, 
@@ -17867,96 +17867,96 @@ __global__ void scaleCF_RhoSq_comp_27(real* DC,
    real *feF, *fwF, *fnF, *fsF, *ftF, *fbF, *fneF, *fswF, *fseF, *fnwF, *fteF, *fbwF, *fbeF, *ftwF, *ftnF, *fbsF, *fbnF, *ftsF, *fzeroF, 
       *ftneF, *ftswF, *ftseF, *ftnwF, *fbneF, *fbswF, *fbseF, *fbnwF;
 
-   feF    = &DF[DIR_P00   *size_MatF];
-   fwF    = &DF[DIR_M00   *size_MatF];
-   fnF    = &DF[DIR_0P0   *size_MatF];
-   fsF    = &DF[DIR_0M0   *size_MatF];
-   ftF    = &DF[DIR_00P   *size_MatF];
-   fbF    = &DF[DIR_00M   *size_MatF];
-   fneF   = &DF[DIR_PP0  *size_MatF];
-   fswF   = &DF[DIR_MM0  *size_MatF];
-   fseF   = &DF[DIR_PM0  *size_MatF];
-   fnwF   = &DF[DIR_MP0  *size_MatF];
-   fteF   = &DF[DIR_P0P  *size_MatF];
-   fbwF   = &DF[DIR_M0M  *size_MatF];
-   fbeF   = &DF[DIR_P0M  *size_MatF];
-   ftwF   = &DF[DIR_M0P  *size_MatF];
-   ftnF   = &DF[DIR_0PP  *size_MatF];
-   fbsF   = &DF[DIR_0MM  *size_MatF];
-   fbnF   = &DF[DIR_0PM  *size_MatF];
-   ftsF   = &DF[DIR_0MP  *size_MatF];
-   fzeroF = &DF[DIR_000*size_MatF];
-   ftneF  = &DF[DIR_PPP *size_MatF];
-   ftswF  = &DF[DIR_MMP *size_MatF];
-   ftseF  = &DF[DIR_PMP *size_MatF];
-   ftnwF  = &DF[DIR_MPP *size_MatF];
-   fbneF  = &DF[DIR_PPM *size_MatF];
-   fbswF  = &DF[DIR_MMM *size_MatF];
-   fbseF  = &DF[DIR_PMM *size_MatF];
-   fbnwF  = &DF[DIR_MPM *size_MatF];
+   feF    = &DF[DIR_P00 * numberOfLBnodesFine];
+   fwF    = &DF[DIR_M00 * numberOfLBnodesFine];
+   fnF    = &DF[DIR_0P0 * numberOfLBnodesFine];
+   fsF    = &DF[DIR_0M0 * numberOfLBnodesFine];
+   ftF    = &DF[DIR_00P * numberOfLBnodesFine];
+   fbF    = &DF[DIR_00M * numberOfLBnodesFine];
+   fneF   = &DF[DIR_PP0 * numberOfLBnodesFine];
+   fswF   = &DF[DIR_MM0 * numberOfLBnodesFine];
+   fseF   = &DF[DIR_PM0 * numberOfLBnodesFine];
+   fnwF   = &DF[DIR_MP0 * numberOfLBnodesFine];
+   fteF   = &DF[DIR_P0P * numberOfLBnodesFine];
+   fbwF   = &DF[DIR_M0M * numberOfLBnodesFine];
+   fbeF   = &DF[DIR_P0M * numberOfLBnodesFine];
+   ftwF   = &DF[DIR_M0P * numberOfLBnodesFine];
+   ftnF   = &DF[DIR_0PP * numberOfLBnodesFine];
+   fbsF   = &DF[DIR_0MM * numberOfLBnodesFine];
+   fbnF   = &DF[DIR_0PM * numberOfLBnodesFine];
+   ftsF   = &DF[DIR_0MP * numberOfLBnodesFine];
+   fzeroF = &DF[DIR_000 * numberOfLBnodesFine];
+   ftneF  = &DF[DIR_PPP * numberOfLBnodesFine];
+   ftswF  = &DF[DIR_MMP * numberOfLBnodesFine];
+   ftseF  = &DF[DIR_PMP * numberOfLBnodesFine];
+   ftnwF  = &DF[DIR_MPP * numberOfLBnodesFine];
+   fbneF  = &DF[DIR_PPM * numberOfLBnodesFine];
+   fbswF  = &DF[DIR_MMM * numberOfLBnodesFine];
+   fbseF  = &DF[DIR_PMM * numberOfLBnodesFine];
+   fbnwF  = &DF[DIR_MPM * numberOfLBnodesFine];
 
    real *feC, *fwC, *fnC, *fsC, *ftC, *fbC, *fneC, *fswC, *fseC, *fnwC, *fteC, *fbwC, *fbeC, *ftwC, *ftnC, *fbsC, *fbnC, *ftsC, *fzeroC,
       *ftneC, *ftswC, *ftseC, *ftnwC, *fbneC, *fbswC, *fbseC, *fbnwC;
 
    if (isEvenTimestep==true)
    {
-      feC    = &DC[DIR_P00   *size_MatC];
-      fwC    = &DC[DIR_M00   *size_MatC];
-      fnC    = &DC[DIR_0P0   *size_MatC];
-      fsC    = &DC[DIR_0M0   *size_MatC];
-      ftC    = &DC[DIR_00P   *size_MatC];
-      fbC    = &DC[DIR_00M   *size_MatC];
-      fneC   = &DC[DIR_PP0  *size_MatC];
-      fswC   = &DC[DIR_MM0  *size_MatC];
-      fseC   = &DC[DIR_PM0  *size_MatC];
-      fnwC   = &DC[DIR_MP0  *size_MatC];
-      fteC   = &DC[DIR_P0P  *size_MatC];
-      fbwC   = &DC[DIR_M0M  *size_MatC];
-      fbeC   = &DC[DIR_P0M  *size_MatC];
-      ftwC   = &DC[DIR_M0P  *size_MatC];
-      ftnC   = &DC[DIR_0PP  *size_MatC];
-      fbsC   = &DC[DIR_0MM  *size_MatC];
-      fbnC   = &DC[DIR_0PM  *size_MatC];
-      ftsC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      ftneC  = &DC[DIR_PPP *size_MatC];
-      ftswC  = &DC[DIR_MMP *size_MatC];
-      ftseC  = &DC[DIR_PMP *size_MatC];
-      ftnwC  = &DC[DIR_MPP *size_MatC];
-      fbneC  = &DC[DIR_PPM *size_MatC];
-      fbswC  = &DC[DIR_MMM *size_MatC];
-      fbseC  = &DC[DIR_PMM *size_MatC];
-      fbnwC  = &DC[DIR_MPM *size_MatC];
+      feC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      fwC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    } 
    else
    {
-      fwC    = &DC[DIR_P00   *size_MatC];
-      feC    = &DC[DIR_M00   *size_MatC];
-      fsC    = &DC[DIR_0P0   *size_MatC];
-      fnC    = &DC[DIR_0M0   *size_MatC];
-      fbC    = &DC[DIR_00P   *size_MatC];
-      ftC    = &DC[DIR_00M   *size_MatC];
-      fswC   = &DC[DIR_PP0  *size_MatC];
-      fneC   = &DC[DIR_MM0  *size_MatC];
-      fnwC   = &DC[DIR_PM0  *size_MatC];
-      fseC   = &DC[DIR_MP0  *size_MatC];
-      fbwC   = &DC[DIR_P0P  *size_MatC];
-      fteC   = &DC[DIR_M0M  *size_MatC];
-      ftwC   = &DC[DIR_P0M  *size_MatC];
-      fbeC   = &DC[DIR_M0P  *size_MatC];
-      fbsC   = &DC[DIR_0PP  *size_MatC];
-      ftnC   = &DC[DIR_0MM  *size_MatC];
-      ftsC   = &DC[DIR_0PM  *size_MatC];
-      fbnC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      fbswC  = &DC[DIR_PPP *size_MatC];
-      fbneC  = &DC[DIR_MMP *size_MatC];
-      fbnwC  = &DC[DIR_PMP *size_MatC];
-      fbseC  = &DC[DIR_MPP *size_MatC];
-      ftswC  = &DC[DIR_PPM *size_MatC];
-      ftneC  = &DC[DIR_MMM *size_MatC];
-      ftnwC  = &DC[DIR_PMM *size_MatC];
-      ftseC  = &DC[DIR_MPM *size_MatC];
+      fwC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      feC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    }
 
    ////////////////////////////////////////////////////////////////////////////////
@@ -22133,8 +22133,8 @@ __global__ void scaleCF_staggered_time_comp_27(   real* DC,
 															 unsigned int* neighborFX,
 															 unsigned int* neighborFY,
 															 unsigned int* neighborFZ,
-															 unsigned int size_MatC, 
-															 unsigned int size_MatF, 
+															 unsigned long long numberOfLBnodesCoarse, 
+															 unsigned long long numberOfLBnodesFine, 
 															 bool isEvenTimestep,
 															 unsigned int* posCSWB, 
 															 unsigned int* posFSWB, 
@@ -22151,96 +22151,96 @@ __global__ void scaleCF_staggered_time_comp_27(   real* DC,
    real *feF, *fwF, *fnF, *fsF, *ftF, *fbF, *fneF, *fswF, *fseF, *fnwF, *fteF, *fbwF, *fbeF, *ftwF, *ftnF, *fbsF, *fbnF, *ftsF, *fzeroF, 
       *ftneF, *ftswF, *ftseF, *ftnwF, *fbneF, *fbswF, *fbseF, *fbnwF;
 
-   feF    = &DF[DIR_P00   *size_MatF];
-   fwF    = &DF[DIR_M00   *size_MatF];
-   fnF    = &DF[DIR_0P0   *size_MatF];
-   fsF    = &DF[DIR_0M0   *size_MatF];
-   ftF    = &DF[DIR_00P   *size_MatF];
-   fbF    = &DF[DIR_00M   *size_MatF];
-   fneF   = &DF[DIR_PP0  *size_MatF];
-   fswF   = &DF[DIR_MM0  *size_MatF];
-   fseF   = &DF[DIR_PM0  *size_MatF];
-   fnwF   = &DF[DIR_MP0  *size_MatF];
-   fteF   = &DF[DIR_P0P  *size_MatF];
-   fbwF   = &DF[DIR_M0M  *size_MatF];
-   fbeF   = &DF[DIR_P0M  *size_MatF];
-   ftwF   = &DF[DIR_M0P  *size_MatF];
-   ftnF   = &DF[DIR_0PP  *size_MatF];
-   fbsF   = &DF[DIR_0MM  *size_MatF];
-   fbnF   = &DF[DIR_0PM  *size_MatF];
-   ftsF   = &DF[DIR_0MP  *size_MatF];
-   fzeroF = &DF[DIR_000*size_MatF];
-   ftneF  = &DF[DIR_PPP *size_MatF];
-   ftswF  = &DF[DIR_MMP *size_MatF];
-   ftseF  = &DF[DIR_PMP *size_MatF];
-   ftnwF  = &DF[DIR_MPP *size_MatF];
-   fbneF  = &DF[DIR_PPM *size_MatF];
-   fbswF  = &DF[DIR_MMM *size_MatF];
-   fbseF  = &DF[DIR_PMM *size_MatF];
-   fbnwF  = &DF[DIR_MPM *size_MatF];
+   feF    = &DF[DIR_P00 * numberOfLBnodesFine];
+   fwF    = &DF[DIR_M00 * numberOfLBnodesFine];
+   fnF    = &DF[DIR_0P0 * numberOfLBnodesFine];
+   fsF    = &DF[DIR_0M0 * numberOfLBnodesFine];
+   ftF    = &DF[DIR_00P * numberOfLBnodesFine];
+   fbF    = &DF[DIR_00M * numberOfLBnodesFine];
+   fneF   = &DF[DIR_PP0 * numberOfLBnodesFine];
+   fswF   = &DF[DIR_MM0 * numberOfLBnodesFine];
+   fseF   = &DF[DIR_PM0 * numberOfLBnodesFine];
+   fnwF   = &DF[DIR_MP0 * numberOfLBnodesFine];
+   fteF   = &DF[DIR_P0P * numberOfLBnodesFine];
+   fbwF   = &DF[DIR_M0M * numberOfLBnodesFine];
+   fbeF   = &DF[DIR_P0M * numberOfLBnodesFine];
+   ftwF   = &DF[DIR_M0P * numberOfLBnodesFine];
+   ftnF   = &DF[DIR_0PP * numberOfLBnodesFine];
+   fbsF   = &DF[DIR_0MM * numberOfLBnodesFine];
+   fbnF   = &DF[DIR_0PM * numberOfLBnodesFine];
+   ftsF   = &DF[DIR_0MP * numberOfLBnodesFine];
+   fzeroF = &DF[DIR_000 * numberOfLBnodesFine];
+   ftneF  = &DF[DIR_PPP * numberOfLBnodesFine];
+   ftswF  = &DF[DIR_MMP * numberOfLBnodesFine];
+   ftseF  = &DF[DIR_PMP * numberOfLBnodesFine];
+   ftnwF  = &DF[DIR_MPP * numberOfLBnodesFine];
+   fbneF  = &DF[DIR_PPM * numberOfLBnodesFine];
+   fbswF  = &DF[DIR_MMM * numberOfLBnodesFine];
+   fbseF  = &DF[DIR_PMM * numberOfLBnodesFine];
+   fbnwF  = &DF[DIR_MPM * numberOfLBnodesFine];
 
    real *feC, *fwC, *fnC, *fsC, *ftC, *fbC, *fneC, *fswC, *fseC, *fnwC, *fteC, *fbwC, *fbeC, *ftwC, *ftnC, *fbsC, *fbnC, *ftsC, *fzeroC,
       *ftneC, *ftswC, *ftseC, *ftnwC, *fbneC, *fbswC, *fbseC, *fbnwC;
 
    if (isEvenTimestep==true)
    {
-      feC    = &DC[DIR_P00   *size_MatC];
-      fwC    = &DC[DIR_M00   *size_MatC];
-      fnC    = &DC[DIR_0P0   *size_MatC];
-      fsC    = &DC[DIR_0M0   *size_MatC];
-      ftC    = &DC[DIR_00P   *size_MatC];
-      fbC    = &DC[DIR_00M   *size_MatC];
-      fneC   = &DC[DIR_PP0  *size_MatC];
-      fswC   = &DC[DIR_MM0  *size_MatC];
-      fseC   = &DC[DIR_PM0  *size_MatC];
-      fnwC   = &DC[DIR_MP0  *size_MatC];
-      fteC   = &DC[DIR_P0P  *size_MatC];
-      fbwC   = &DC[DIR_M0M  *size_MatC];
-      fbeC   = &DC[DIR_P0M  *size_MatC];
-      ftwC   = &DC[DIR_M0P  *size_MatC];
-      ftnC   = &DC[DIR_0PP  *size_MatC];
-      fbsC   = &DC[DIR_0MM  *size_MatC];
-      fbnC   = &DC[DIR_0PM  *size_MatC];
-      ftsC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      ftneC  = &DC[DIR_PPP *size_MatC];
-      ftswC  = &DC[DIR_MMP *size_MatC];
-      ftseC  = &DC[DIR_PMP *size_MatC];
-      ftnwC  = &DC[DIR_MPP *size_MatC];
-      fbneC  = &DC[DIR_PPM *size_MatC];
-      fbswC  = &DC[DIR_MMM *size_MatC];
-      fbseC  = &DC[DIR_PMM *size_MatC];
-      fbnwC  = &DC[DIR_MPM *size_MatC];
+      feC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      fwC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    } 
    else
    {
-      fwC    = &DC[DIR_P00   *size_MatC];
-      feC    = &DC[DIR_M00   *size_MatC];
-      fsC    = &DC[DIR_0P0   *size_MatC];
-      fnC    = &DC[DIR_0M0   *size_MatC];
-      fbC    = &DC[DIR_00P   *size_MatC];
-      ftC    = &DC[DIR_00M   *size_MatC];
-      fswC   = &DC[DIR_PP0  *size_MatC];
-      fneC   = &DC[DIR_MM0  *size_MatC];
-      fnwC   = &DC[DIR_PM0  *size_MatC];
-      fseC   = &DC[DIR_MP0  *size_MatC];
-      fbwC   = &DC[DIR_P0P  *size_MatC];
-      fteC   = &DC[DIR_M0M  *size_MatC];
-      ftwC   = &DC[DIR_P0M  *size_MatC];
-      fbeC   = &DC[DIR_M0P  *size_MatC];
-      fbsC   = &DC[DIR_0PP  *size_MatC];
-      ftnC   = &DC[DIR_0MM  *size_MatC];
-      ftsC   = &DC[DIR_0PM  *size_MatC];
-      fbnC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      fbswC  = &DC[DIR_PPP *size_MatC];
-      fbneC  = &DC[DIR_MMP *size_MatC];
-      fbnwC  = &DC[DIR_PMP *size_MatC];
-      fbseC  = &DC[DIR_MPP *size_MatC];
-      ftswC  = &DC[DIR_PPM *size_MatC];
-      ftneC  = &DC[DIR_MMM *size_MatC];
-      ftnwC  = &DC[DIR_PMM *size_MatC];
-      ftseC  = &DC[DIR_MPM *size_MatC];
+      fwC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      feC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    }
 
    ////////////////////////////////////////////////////////////////////////////////
@@ -26369,8 +26369,8 @@ __global__ void scaleCF_Fix_comp_27(  real* DC,
 												 unsigned int* neighborFX,
 												 unsigned int* neighborFY,
 												 unsigned int* neighborFZ,
-												 unsigned int size_MatC, 
-												 unsigned int size_MatF, 
+												 unsigned long long numberOfLBnodesCoarse, 
+												 unsigned long long numberOfLBnodesFine, 
 												 bool isEvenTimestep,
 												 unsigned int* posCSWB, 
 												 unsigned int* posFSWB, 
@@ -26387,96 +26387,96 @@ __global__ void scaleCF_Fix_comp_27(  real* DC,
    real *feF, *fwF, *fnF, *fsF, *ftF, *fbF, *fneF, *fswF, *fseF, *fnwF, *fteF, *fbwF, *fbeF, *ftwF, *ftnF, *fbsF, *fbnF, *ftsF, *fzeroF, 
       *ftneF, *ftswF, *ftseF, *ftnwF, *fbneF, *fbswF, *fbseF, *fbnwF;
 
-   feF    = &DF[DIR_P00   *size_MatF];
-   fwF    = &DF[DIR_M00   *size_MatF];
-   fnF    = &DF[DIR_0P0   *size_MatF];
-   fsF    = &DF[DIR_0M0   *size_MatF];
-   ftF    = &DF[DIR_00P   *size_MatF];
-   fbF    = &DF[DIR_00M   *size_MatF];
-   fneF   = &DF[DIR_PP0  *size_MatF];
-   fswF   = &DF[DIR_MM0  *size_MatF];
-   fseF   = &DF[DIR_PM0  *size_MatF];
-   fnwF   = &DF[DIR_MP0  *size_MatF];
-   fteF   = &DF[DIR_P0P  *size_MatF];
-   fbwF   = &DF[DIR_M0M  *size_MatF];
-   fbeF   = &DF[DIR_P0M  *size_MatF];
-   ftwF   = &DF[DIR_M0P  *size_MatF];
-   ftnF   = &DF[DIR_0PP  *size_MatF];
-   fbsF   = &DF[DIR_0MM  *size_MatF];
-   fbnF   = &DF[DIR_0PM  *size_MatF];
-   ftsF   = &DF[DIR_0MP  *size_MatF];
-   fzeroF = &DF[DIR_000*size_MatF];
-   ftneF  = &DF[DIR_PPP *size_MatF];
-   ftswF  = &DF[DIR_MMP *size_MatF];
-   ftseF  = &DF[DIR_PMP *size_MatF];
-   ftnwF  = &DF[DIR_MPP *size_MatF];
-   fbneF  = &DF[DIR_PPM *size_MatF];
-   fbswF  = &DF[DIR_MMM *size_MatF];
-   fbseF  = &DF[DIR_PMM *size_MatF];
-   fbnwF  = &DF[DIR_MPM *size_MatF];
+   feF    = &DF[DIR_P00 * numberOfLBnodesFine];
+   fwF    = &DF[DIR_M00 * numberOfLBnodesFine];
+   fnF    = &DF[DIR_0P0 * numberOfLBnodesFine];
+   fsF    = &DF[DIR_0M0 * numberOfLBnodesFine];
+   ftF    = &DF[DIR_00P * numberOfLBnodesFine];
+   fbF    = &DF[DIR_00M * numberOfLBnodesFine];
+   fneF   = &DF[DIR_PP0 * numberOfLBnodesFine];
+   fswF   = &DF[DIR_MM0 * numberOfLBnodesFine];
+   fseF   = &DF[DIR_PM0 * numberOfLBnodesFine];
+   fnwF   = &DF[DIR_MP0 * numberOfLBnodesFine];
+   fteF   = &DF[DIR_P0P * numberOfLBnodesFine];
+   fbwF   = &DF[DIR_M0M * numberOfLBnodesFine];
+   fbeF   = &DF[DIR_P0M * numberOfLBnodesFine];
+   ftwF   = &DF[DIR_M0P * numberOfLBnodesFine];
+   ftnF   = &DF[DIR_0PP * numberOfLBnodesFine];
+   fbsF   = &DF[DIR_0MM * numberOfLBnodesFine];
+   fbnF   = &DF[DIR_0PM * numberOfLBnodesFine];
+   ftsF   = &DF[DIR_0MP * numberOfLBnodesFine];
+   fzeroF = &DF[DIR_000 * numberOfLBnodesFine];
+   ftneF  = &DF[DIR_PPP * numberOfLBnodesFine];
+   ftswF  = &DF[DIR_MMP * numberOfLBnodesFine];
+   ftseF  = &DF[DIR_PMP * numberOfLBnodesFine];
+   ftnwF  = &DF[DIR_MPP * numberOfLBnodesFine];
+   fbneF  = &DF[DIR_PPM * numberOfLBnodesFine];
+   fbswF  = &DF[DIR_MMM * numberOfLBnodesFine];
+   fbseF  = &DF[DIR_PMM * numberOfLBnodesFine];
+   fbnwF  = &DF[DIR_MPM * numberOfLBnodesFine];
 
    real *feC, *fwC, *fnC, *fsC, *ftC, *fbC, *fneC, *fswC, *fseC, *fnwC, *fteC, *fbwC, *fbeC, *ftwC, *ftnC, *fbsC, *fbnC, *ftsC, *fzeroC,
       *ftneC, *ftswC, *ftseC, *ftnwC, *fbneC, *fbswC, *fbseC, *fbnwC;
 
    if (isEvenTimestep==true)
    {
-      feC    = &DC[DIR_P00   *size_MatC];
-      fwC    = &DC[DIR_M00   *size_MatC];
-      fnC    = &DC[DIR_0P0   *size_MatC];
-      fsC    = &DC[DIR_0M0   *size_MatC];
-      ftC    = &DC[DIR_00P   *size_MatC];
-      fbC    = &DC[DIR_00M   *size_MatC];
-      fneC   = &DC[DIR_PP0  *size_MatC];
-      fswC   = &DC[DIR_MM0  *size_MatC];
-      fseC   = &DC[DIR_PM0  *size_MatC];
-      fnwC   = &DC[DIR_MP0  *size_MatC];
-      fteC   = &DC[DIR_P0P  *size_MatC];
-      fbwC   = &DC[DIR_M0M  *size_MatC];
-      fbeC   = &DC[DIR_P0M  *size_MatC];
-      ftwC   = &DC[DIR_M0P  *size_MatC];
-      ftnC   = &DC[DIR_0PP  *size_MatC];
-      fbsC   = &DC[DIR_0MM  *size_MatC];
-      fbnC   = &DC[DIR_0PM  *size_MatC];
-      ftsC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      ftneC  = &DC[DIR_PPP *size_MatC];
-      ftswC  = &DC[DIR_MMP *size_MatC];
-      ftseC  = &DC[DIR_PMP *size_MatC];
-      ftnwC  = &DC[DIR_MPP *size_MatC];
-      fbneC  = &DC[DIR_PPM *size_MatC];
-      fbswC  = &DC[DIR_MMM *size_MatC];
-      fbseC  = &DC[DIR_PMM *size_MatC];
-      fbnwC  = &DC[DIR_MPM *size_MatC];
+      feC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      fwC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    } 
    else
    {
-      fwC    = &DC[DIR_P00   *size_MatC];
-      feC    = &DC[DIR_M00   *size_MatC];
-      fsC    = &DC[DIR_0P0   *size_MatC];
-      fnC    = &DC[DIR_0M0   *size_MatC];
-      fbC    = &DC[DIR_00P   *size_MatC];
-      ftC    = &DC[DIR_00M   *size_MatC];
-      fswC   = &DC[DIR_PP0  *size_MatC];
-      fneC   = &DC[DIR_MM0  *size_MatC];
-      fnwC   = &DC[DIR_PM0  *size_MatC];
-      fseC   = &DC[DIR_MP0  *size_MatC];
-      fbwC   = &DC[DIR_P0P  *size_MatC];
-      fteC   = &DC[DIR_M0M  *size_MatC];
-      ftwC   = &DC[DIR_P0M  *size_MatC];
-      fbeC   = &DC[DIR_M0P  *size_MatC];
-      fbsC   = &DC[DIR_0PP  *size_MatC];
-      ftnC   = &DC[DIR_0MM  *size_MatC];
-      ftsC   = &DC[DIR_0PM  *size_MatC];
-      fbnC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      fbswC  = &DC[DIR_PPP *size_MatC];
-      fbneC  = &DC[DIR_MMP *size_MatC];
-      fbnwC  = &DC[DIR_PMP *size_MatC];
-      fbseC  = &DC[DIR_MPP *size_MatC];
-      ftswC  = &DC[DIR_PPM *size_MatC];
-      ftneC  = &DC[DIR_MMM *size_MatC];
-      ftnwC  = &DC[DIR_PMM *size_MatC];
-      ftseC  = &DC[DIR_MPM *size_MatC];
+      fwC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      feC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    }
 
    ////////////////////////////////////////////////////////////////////////////////
@@ -31136,8 +31136,8 @@ __global__ void scaleCF_NSPress_27(   real* DC,
 												 unsigned int* neighborFX,
 												 unsigned int* neighborFY,
 												 unsigned int* neighborFZ,
-												 unsigned int size_MatC, 
-												 unsigned int size_MatF, 
+												 unsigned long long numberOfLBnodesCoarse, 
+												 unsigned long long numberOfLBnodesFine, 
 												 bool isEvenTimestep,
 												 unsigned int* posCSWB, 
 												 unsigned int* posFSWB, 
@@ -31154,96 +31154,96 @@ __global__ void scaleCF_NSPress_27(   real* DC,
    real *feF, *fwF, *fnF, *fsF, *ftF, *fbF, *fneF, *fswF, *fseF, *fnwF, *fteF, *fbwF, *fbeF, *ftwF, *ftnF, *fbsF, *fbnF, *ftsF, *fzeroF, 
       *ftneF, *ftswF, *ftseF, *ftnwF, *fbneF, *fbswF, *fbseF, *fbnwF;
 
-   feF    = &DF[DIR_P00   *size_MatF];
-   fwF    = &DF[DIR_M00   *size_MatF];
-   fnF    = &DF[DIR_0P0   *size_MatF];
-   fsF    = &DF[DIR_0M0   *size_MatF];
-   ftF    = &DF[DIR_00P   *size_MatF];
-   fbF    = &DF[DIR_00M   *size_MatF];
-   fneF   = &DF[DIR_PP0  *size_MatF];
-   fswF   = &DF[DIR_MM0  *size_MatF];
-   fseF   = &DF[DIR_PM0  *size_MatF];
-   fnwF   = &DF[DIR_MP0  *size_MatF];
-   fteF   = &DF[DIR_P0P  *size_MatF];
-   fbwF   = &DF[DIR_M0M  *size_MatF];
-   fbeF   = &DF[DIR_P0M  *size_MatF];
-   ftwF   = &DF[DIR_M0P  *size_MatF];
-   ftnF   = &DF[DIR_0PP  *size_MatF];
-   fbsF   = &DF[DIR_0MM  *size_MatF];
-   fbnF   = &DF[DIR_0PM  *size_MatF];
-   ftsF   = &DF[DIR_0MP  *size_MatF];
-   fzeroF = &DF[DIR_000*size_MatF];
-   ftneF  = &DF[DIR_PPP *size_MatF];
-   ftswF  = &DF[DIR_MMP *size_MatF];
-   ftseF  = &DF[DIR_PMP *size_MatF];
-   ftnwF  = &DF[DIR_MPP *size_MatF];
-   fbneF  = &DF[DIR_PPM *size_MatF];
-   fbswF  = &DF[DIR_MMM *size_MatF];
-   fbseF  = &DF[DIR_PMM *size_MatF];
-   fbnwF  = &DF[DIR_MPM *size_MatF];
+   feF    = &DF[DIR_P00 * numberOfLBnodesFine];
+   fwF    = &DF[DIR_M00 * numberOfLBnodesFine];
+   fnF    = &DF[DIR_0P0 * numberOfLBnodesFine];
+   fsF    = &DF[DIR_0M0 * numberOfLBnodesFine];
+   ftF    = &DF[DIR_00P * numberOfLBnodesFine];
+   fbF    = &DF[DIR_00M * numberOfLBnodesFine];
+   fneF   = &DF[DIR_PP0 * numberOfLBnodesFine];
+   fswF   = &DF[DIR_MM0 * numberOfLBnodesFine];
+   fseF   = &DF[DIR_PM0 * numberOfLBnodesFine];
+   fnwF   = &DF[DIR_MP0 * numberOfLBnodesFine];
+   fteF   = &DF[DIR_P0P * numberOfLBnodesFine];
+   fbwF   = &DF[DIR_M0M * numberOfLBnodesFine];
+   fbeF   = &DF[DIR_P0M * numberOfLBnodesFine];
+   ftwF   = &DF[DIR_M0P * numberOfLBnodesFine];
+   ftnF   = &DF[DIR_0PP * numberOfLBnodesFine];
+   fbsF   = &DF[DIR_0MM * numberOfLBnodesFine];
+   fbnF   = &DF[DIR_0PM * numberOfLBnodesFine];
+   ftsF   = &DF[DIR_0MP * numberOfLBnodesFine];
+   fzeroF = &DF[DIR_000 * numberOfLBnodesFine];
+   ftneF  = &DF[DIR_PPP * numberOfLBnodesFine];
+   ftswF  = &DF[DIR_MMP * numberOfLBnodesFine];
+   ftseF  = &DF[DIR_PMP * numberOfLBnodesFine];
+   ftnwF  = &DF[DIR_MPP * numberOfLBnodesFine];
+   fbneF  = &DF[DIR_PPM * numberOfLBnodesFine];
+   fbswF  = &DF[DIR_MMM * numberOfLBnodesFine];
+   fbseF  = &DF[DIR_PMM * numberOfLBnodesFine];
+   fbnwF  = &DF[DIR_MPM * numberOfLBnodesFine];
 
    real *feC, *fwC, *fnC, *fsC, *ftC, *fbC, *fneC, *fswC, *fseC, *fnwC, *fteC, *fbwC, *fbeC, *ftwC, *ftnC, *fbsC, *fbnC, *ftsC, *fzeroC,
       *ftneC, *ftswC, *ftseC, *ftnwC, *fbneC, *fbswC, *fbseC, *fbnwC;
 
    if (isEvenTimestep==true)
    {
-      feC    = &DC[DIR_P00   *size_MatC];
-      fwC    = &DC[DIR_M00   *size_MatC];
-      fnC    = &DC[DIR_0P0   *size_MatC];
-      fsC    = &DC[DIR_0M0   *size_MatC];
-      ftC    = &DC[DIR_00P   *size_MatC];
-      fbC    = &DC[DIR_00M   *size_MatC];
-      fneC   = &DC[DIR_PP0  *size_MatC];
-      fswC   = &DC[DIR_MM0  *size_MatC];
-      fseC   = &DC[DIR_PM0  *size_MatC];
-      fnwC   = &DC[DIR_MP0  *size_MatC];
-      fteC   = &DC[DIR_P0P  *size_MatC];
-      fbwC   = &DC[DIR_M0M  *size_MatC];
-      fbeC   = &DC[DIR_P0M  *size_MatC];
-      ftwC   = &DC[DIR_M0P  *size_MatC];
-      ftnC   = &DC[DIR_0PP  *size_MatC];
-      fbsC   = &DC[DIR_0MM  *size_MatC];
-      fbnC   = &DC[DIR_0PM  *size_MatC];
-      ftsC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      ftneC  = &DC[DIR_PPP *size_MatC];
-      ftswC  = &DC[DIR_MMP *size_MatC];
-      ftseC  = &DC[DIR_PMP *size_MatC];
-      ftnwC  = &DC[DIR_MPP *size_MatC];
-      fbneC  = &DC[DIR_PPM *size_MatC];
-      fbswC  = &DC[DIR_MMM *size_MatC];
-      fbseC  = &DC[DIR_PMM *size_MatC];
-      fbnwC  = &DC[DIR_MPM *size_MatC];
+      feC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      fwC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    } 
    else
    {
-      fwC    = &DC[DIR_P00   *size_MatC];
-      feC    = &DC[DIR_M00   *size_MatC];
-      fsC    = &DC[DIR_0P0   *size_MatC];
-      fnC    = &DC[DIR_0M0   *size_MatC];
-      fbC    = &DC[DIR_00P   *size_MatC];
-      ftC    = &DC[DIR_00M   *size_MatC];
-      fswC   = &DC[DIR_PP0  *size_MatC];
-      fneC   = &DC[DIR_MM0  *size_MatC];
-      fnwC   = &DC[DIR_PM0  *size_MatC];
-      fseC   = &DC[DIR_MP0  *size_MatC];
-      fbwC   = &DC[DIR_P0P  *size_MatC];
-      fteC   = &DC[DIR_M0M  *size_MatC];
-      ftwC   = &DC[DIR_P0M  *size_MatC];
-      fbeC   = &DC[DIR_M0P  *size_MatC];
-      fbsC   = &DC[DIR_0PP  *size_MatC];
-      ftnC   = &DC[DIR_0MM  *size_MatC];
-      ftsC   = &DC[DIR_0PM  *size_MatC];
-      fbnC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      fbswC  = &DC[DIR_PPP *size_MatC];
-      fbneC  = &DC[DIR_MMP *size_MatC];
-      fbnwC  = &DC[DIR_PMP *size_MatC];
-      fbseC  = &DC[DIR_MPP *size_MatC];
-      ftswC  = &DC[DIR_PPM *size_MatC];
-      ftneC  = &DC[DIR_MMM *size_MatC];
-      ftnwC  = &DC[DIR_PMM *size_MatC];
-      ftseC  = &DC[DIR_MPM *size_MatC];
+      fwC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      feC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    }
 
    ////////////////////////////////////////////////////////////////////////////////
@@ -35080,8 +35080,8 @@ __global__ void scaleCF_Fix_27(   real* DC,
                                              unsigned int* neighborFX,
                                              unsigned int* neighborFY,
                                              unsigned int* neighborFZ,
-                                             unsigned int size_MatC, 
-                                             unsigned int size_MatF, 
+                                             unsigned long long numberOfLBnodesCoarse, 
+                                             unsigned long long numberOfLBnodesFine, 
                                              bool isEvenTimestep,
                                              unsigned int* posCSWB, 
                                              unsigned int* posFSWB, 
@@ -35098,96 +35098,96 @@ __global__ void scaleCF_Fix_27(   real* DC,
    real *feF, *fwF, *fnF, *fsF, *ftF, *fbF, *fneF, *fswF, *fseF, *fnwF, *fteF, *fbwF, *fbeF, *ftwF, *ftnF, *fbsF, *fbnF, *ftsF, *fzeroF, 
       *ftneF, *ftswF, *ftseF, *ftnwF, *fbneF, *fbswF, *fbseF, *fbnwF;
 
-   feF    = &DF[DIR_P00   *size_MatF];
-   fwF    = &DF[DIR_M00   *size_MatF];
-   fnF    = &DF[DIR_0P0   *size_MatF];
-   fsF    = &DF[DIR_0M0   *size_MatF];
-   ftF    = &DF[DIR_00P   *size_MatF];
-   fbF    = &DF[DIR_00M   *size_MatF];
-   fneF   = &DF[DIR_PP0  *size_MatF];
-   fswF   = &DF[DIR_MM0  *size_MatF];
-   fseF   = &DF[DIR_PM0  *size_MatF];
-   fnwF   = &DF[DIR_MP0  *size_MatF];
-   fteF   = &DF[DIR_P0P  *size_MatF];
-   fbwF   = &DF[DIR_M0M  *size_MatF];
-   fbeF   = &DF[DIR_P0M  *size_MatF];
-   ftwF   = &DF[DIR_M0P  *size_MatF];
-   ftnF   = &DF[DIR_0PP  *size_MatF];
-   fbsF   = &DF[DIR_0MM  *size_MatF];
-   fbnF   = &DF[DIR_0PM  *size_MatF];
-   ftsF   = &DF[DIR_0MP  *size_MatF];
-   fzeroF = &DF[DIR_000*size_MatF];
-   ftneF  = &DF[DIR_PPP *size_MatF];
-   ftswF  = &DF[DIR_MMP *size_MatF];
-   ftseF  = &DF[DIR_PMP *size_MatF];
-   ftnwF  = &DF[DIR_MPP *size_MatF];
-   fbneF  = &DF[DIR_PPM *size_MatF];
-   fbswF  = &DF[DIR_MMM *size_MatF];
-   fbseF  = &DF[DIR_PMM *size_MatF];
-   fbnwF  = &DF[DIR_MPM *size_MatF];
+   feF    = &DF[DIR_P00 * numberOfLBnodesFine];
+   fwF    = &DF[DIR_M00 * numberOfLBnodesFine];
+   fnF    = &DF[DIR_0P0 * numberOfLBnodesFine];
+   fsF    = &DF[DIR_0M0 * numberOfLBnodesFine];
+   ftF    = &DF[DIR_00P * numberOfLBnodesFine];
+   fbF    = &DF[DIR_00M * numberOfLBnodesFine];
+   fneF   = &DF[DIR_PP0 * numberOfLBnodesFine];
+   fswF   = &DF[DIR_MM0 * numberOfLBnodesFine];
+   fseF   = &DF[DIR_PM0 * numberOfLBnodesFine];
+   fnwF   = &DF[DIR_MP0 * numberOfLBnodesFine];
+   fteF   = &DF[DIR_P0P * numberOfLBnodesFine];
+   fbwF   = &DF[DIR_M0M * numberOfLBnodesFine];
+   fbeF   = &DF[DIR_P0M * numberOfLBnodesFine];
+   ftwF   = &DF[DIR_M0P * numberOfLBnodesFine];
+   ftnF   = &DF[DIR_0PP * numberOfLBnodesFine];
+   fbsF   = &DF[DIR_0MM * numberOfLBnodesFine];
+   fbnF   = &DF[DIR_0PM * numberOfLBnodesFine];
+   ftsF   = &DF[DIR_0MP * numberOfLBnodesFine];
+   fzeroF = &DF[DIR_000 * numberOfLBnodesFine];
+   ftneF  = &DF[DIR_PPP * numberOfLBnodesFine];
+   ftswF  = &DF[DIR_MMP * numberOfLBnodesFine];
+   ftseF  = &DF[DIR_PMP * numberOfLBnodesFine];
+   ftnwF  = &DF[DIR_MPP * numberOfLBnodesFine];
+   fbneF  = &DF[DIR_PPM * numberOfLBnodesFine];
+   fbswF  = &DF[DIR_MMM * numberOfLBnodesFine];
+   fbseF  = &DF[DIR_PMM * numberOfLBnodesFine];
+   fbnwF  = &DF[DIR_MPM * numberOfLBnodesFine];
 
    real *feC, *fwC, *fnC, *fsC, *ftC, *fbC, *fneC, *fswC, *fseC, *fnwC, *fteC, *fbwC, *fbeC, *ftwC, *ftnC, *fbsC, *fbnC, *ftsC, *fzeroC,
       *ftneC, *ftswC, *ftseC, *ftnwC, *fbneC, *fbswC, *fbseC, *fbnwC;
 
    if (isEvenTimestep==true)
    {
-      feC    = &DC[DIR_P00   *size_MatC];
-      fwC    = &DC[DIR_M00   *size_MatC];
-      fnC    = &DC[DIR_0P0   *size_MatC];
-      fsC    = &DC[DIR_0M0   *size_MatC];
-      ftC    = &DC[DIR_00P   *size_MatC];
-      fbC    = &DC[DIR_00M   *size_MatC];
-      fneC   = &DC[DIR_PP0  *size_MatC];
-      fswC   = &DC[DIR_MM0  *size_MatC];
-      fseC   = &DC[DIR_PM0  *size_MatC];
-      fnwC   = &DC[DIR_MP0  *size_MatC];
-      fteC   = &DC[DIR_P0P  *size_MatC];
-      fbwC   = &DC[DIR_M0M  *size_MatC];
-      fbeC   = &DC[DIR_P0M  *size_MatC];
-      ftwC   = &DC[DIR_M0P  *size_MatC];
-      ftnC   = &DC[DIR_0PP  *size_MatC];
-      fbsC   = &DC[DIR_0MM  *size_MatC];
-      fbnC   = &DC[DIR_0PM  *size_MatC];
-      ftsC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      ftneC  = &DC[DIR_PPP *size_MatC];
-      ftswC  = &DC[DIR_MMP *size_MatC];
-      ftseC  = &DC[DIR_PMP *size_MatC];
-      ftnwC  = &DC[DIR_MPP *size_MatC];
-      fbneC  = &DC[DIR_PPM *size_MatC];
-      fbswC  = &DC[DIR_MMM *size_MatC];
-      fbseC  = &DC[DIR_PMM *size_MatC];
-      fbnwC  = &DC[DIR_MPM *size_MatC];
+      feC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      fwC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    } 
    else
    {
-      fwC    = &DC[DIR_P00   *size_MatC];
-      feC    = &DC[DIR_M00   *size_MatC];
-      fsC    = &DC[DIR_0P0   *size_MatC];
-      fnC    = &DC[DIR_0M0   *size_MatC];
-      fbC    = &DC[DIR_00P   *size_MatC];
-      ftC    = &DC[DIR_00M   *size_MatC];
-      fswC   = &DC[DIR_PP0  *size_MatC];
-      fneC   = &DC[DIR_MM0  *size_MatC];
-      fnwC   = &DC[DIR_PM0  *size_MatC];
-      fseC   = &DC[DIR_MP0  *size_MatC];
-      fbwC   = &DC[DIR_P0P  *size_MatC];
-      fteC   = &DC[DIR_M0M  *size_MatC];
-      ftwC   = &DC[DIR_P0M  *size_MatC];
-      fbeC   = &DC[DIR_M0P  *size_MatC];
-      fbsC   = &DC[DIR_0PP  *size_MatC];
-      ftnC   = &DC[DIR_0MM  *size_MatC];
-      ftsC   = &DC[DIR_0PM  *size_MatC];
-      fbnC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      fbswC  = &DC[DIR_PPP *size_MatC];
-      fbneC  = &DC[DIR_MMP *size_MatC];
-      fbnwC  = &DC[DIR_PMP *size_MatC];
-      fbseC  = &DC[DIR_MPP *size_MatC];
-      ftswC  = &DC[DIR_PPM *size_MatC];
-      ftneC  = &DC[DIR_MMM *size_MatC];
-      ftnwC  = &DC[DIR_PMM *size_MatC];
-      ftseC  = &DC[DIR_MPM *size_MatC];
+      fwC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      feC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    }
 
    ////////////////////////////////////////////////////////////////////////////////
@@ -39338,8 +39338,8 @@ __global__ void scaleCFpress27(   real* DC,
                                              unsigned int* neighborFX,
                                              unsigned int* neighborFY,
                                              unsigned int* neighborFZ,
-                                             unsigned int size_MatC, 
-                                             unsigned int size_MatF, 
+                                             unsigned long long numberOfLBnodesCoarse, 
+                                             unsigned long long numberOfLBnodesFine, 
                                              bool isEvenTimestep,
                                              unsigned int* posCSWB, 
                                              unsigned int* posFSWB, 
@@ -39356,96 +39356,96 @@ __global__ void scaleCFpress27(   real* DC,
    real *feF, *fwF, *fnF, *fsF, *ftF, *fbF, *fneF, *fswF, *fseF, *fnwF, *fteF, *fbwF, *fbeF, *ftwF, *ftnF, *fbsF, *fbnF, *ftsF, *fzeroF, 
       *ftneF, *ftswF, *ftseF, *ftnwF, *fbneF, *fbswF, *fbseF, *fbnwF;
 
-   feF    = &DF[DIR_P00   *size_MatF];
-   fwF    = &DF[DIR_M00   *size_MatF];
-   fnF    = &DF[DIR_0P0   *size_MatF];
-   fsF    = &DF[DIR_0M0   *size_MatF];
-   ftF    = &DF[DIR_00P   *size_MatF];
-   fbF    = &DF[DIR_00M   *size_MatF];
-   fneF   = &DF[DIR_PP0  *size_MatF];
-   fswF   = &DF[DIR_MM0  *size_MatF];
-   fseF   = &DF[DIR_PM0  *size_MatF];
-   fnwF   = &DF[DIR_MP0  *size_MatF];
-   fteF   = &DF[DIR_P0P  *size_MatF];
-   fbwF   = &DF[DIR_M0M  *size_MatF];
-   fbeF   = &DF[DIR_P0M  *size_MatF];
-   ftwF   = &DF[DIR_M0P  *size_MatF];
-   ftnF   = &DF[DIR_0PP  *size_MatF];
-   fbsF   = &DF[DIR_0MM  *size_MatF];
-   fbnF   = &DF[DIR_0PM  *size_MatF];
-   ftsF   = &DF[DIR_0MP  *size_MatF];
-   fzeroF = &DF[DIR_000*size_MatF];
-   ftneF  = &DF[DIR_PPP *size_MatF];
-   ftswF  = &DF[DIR_MMP *size_MatF];
-   ftseF  = &DF[DIR_PMP *size_MatF];
-   ftnwF  = &DF[DIR_MPP *size_MatF];
-   fbneF  = &DF[DIR_PPM *size_MatF];
-   fbswF  = &DF[DIR_MMM *size_MatF];
-   fbseF  = &DF[DIR_PMM *size_MatF];
-   fbnwF  = &DF[DIR_MPM *size_MatF];
+   feF    = &DF[DIR_P00 * numberOfLBnodesFine];
+   fwF    = &DF[DIR_M00 * numberOfLBnodesFine];
+   fnF    = &DF[DIR_0P0 * numberOfLBnodesFine];
+   fsF    = &DF[DIR_0M0 * numberOfLBnodesFine];
+   ftF    = &DF[DIR_00P * numberOfLBnodesFine];
+   fbF    = &DF[DIR_00M * numberOfLBnodesFine];
+   fneF   = &DF[DIR_PP0 * numberOfLBnodesFine];
+   fswF   = &DF[DIR_MM0 * numberOfLBnodesFine];
+   fseF   = &DF[DIR_PM0 * numberOfLBnodesFine];
+   fnwF   = &DF[DIR_MP0 * numberOfLBnodesFine];
+   fteF   = &DF[DIR_P0P * numberOfLBnodesFine];
+   fbwF   = &DF[DIR_M0M * numberOfLBnodesFine];
+   fbeF   = &DF[DIR_P0M * numberOfLBnodesFine];
+   ftwF   = &DF[DIR_M0P * numberOfLBnodesFine];
+   ftnF   = &DF[DIR_0PP * numberOfLBnodesFine];
+   fbsF   = &DF[DIR_0MM * numberOfLBnodesFine];
+   fbnF   = &DF[DIR_0PM * numberOfLBnodesFine];
+   ftsF   = &DF[DIR_0MP * numberOfLBnodesFine];
+   fzeroF = &DF[DIR_000 * numberOfLBnodesFine];
+   ftneF  = &DF[DIR_PPP * numberOfLBnodesFine];
+   ftswF  = &DF[DIR_MMP * numberOfLBnodesFine];
+   ftseF  = &DF[DIR_PMP * numberOfLBnodesFine];
+   ftnwF  = &DF[DIR_MPP * numberOfLBnodesFine];
+   fbneF  = &DF[DIR_PPM * numberOfLBnodesFine];
+   fbswF  = &DF[DIR_MMM * numberOfLBnodesFine];
+   fbseF  = &DF[DIR_PMM * numberOfLBnodesFine];
+   fbnwF  = &DF[DIR_MPM * numberOfLBnodesFine];
 
    real *feC, *fwC, *fnC, *fsC, *ftC, *fbC, *fneC, *fswC, *fseC, *fnwC, *fteC, *fbwC, *fbeC, *ftwC, *ftnC, *fbsC, *fbnC, *ftsC, *fzeroC,
       *ftneC, *ftswC, *ftseC, *ftnwC, *fbneC, *fbswC, *fbseC, *fbnwC;
 
    if (isEvenTimestep==true)
    {
-      feC    = &DC[DIR_P00   *size_MatC];
-      fwC    = &DC[DIR_M00   *size_MatC];
-      fnC    = &DC[DIR_0P0   *size_MatC];
-      fsC    = &DC[DIR_0M0   *size_MatC];
-      ftC    = &DC[DIR_00P   *size_MatC];
-      fbC    = &DC[DIR_00M   *size_MatC];
-      fneC   = &DC[DIR_PP0  *size_MatC];
-      fswC   = &DC[DIR_MM0  *size_MatC];
-      fseC   = &DC[DIR_PM0  *size_MatC];
-      fnwC   = &DC[DIR_MP0  *size_MatC];
-      fteC   = &DC[DIR_P0P  *size_MatC];
-      fbwC   = &DC[DIR_M0M  *size_MatC];
-      fbeC   = &DC[DIR_P0M  *size_MatC];
-      ftwC   = &DC[DIR_M0P  *size_MatC];
-      ftnC   = &DC[DIR_0PP  *size_MatC];
-      fbsC   = &DC[DIR_0MM  *size_MatC];
-      fbnC   = &DC[DIR_0PM  *size_MatC];
-      ftsC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      ftneC  = &DC[DIR_PPP *size_MatC];
-      ftswC  = &DC[DIR_MMP *size_MatC];
-      ftseC  = &DC[DIR_PMP *size_MatC];
-      ftnwC  = &DC[DIR_MPP *size_MatC];
-      fbneC  = &DC[DIR_PPM *size_MatC];
-      fbswC  = &DC[DIR_MMM *size_MatC];
-      fbseC  = &DC[DIR_PMM *size_MatC];
-      fbnwC  = &DC[DIR_MPM *size_MatC];
+      feC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      fwC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    } 
    else
    {
-      fwC    = &DC[DIR_P00   *size_MatC];
-      feC    = &DC[DIR_M00   *size_MatC];
-      fsC    = &DC[DIR_0P0   *size_MatC];
-      fnC    = &DC[DIR_0M0   *size_MatC];
-      fbC    = &DC[DIR_00P   *size_MatC];
-      ftC    = &DC[DIR_00M   *size_MatC];
-      fswC   = &DC[DIR_PP0  *size_MatC];
-      fneC   = &DC[DIR_MM0  *size_MatC];
-      fnwC   = &DC[DIR_PM0  *size_MatC];
-      fseC   = &DC[DIR_MP0  *size_MatC];
-      fbwC   = &DC[DIR_P0P  *size_MatC];
-      fteC   = &DC[DIR_M0M  *size_MatC];
-      ftwC   = &DC[DIR_P0M  *size_MatC];
-      fbeC   = &DC[DIR_M0P  *size_MatC];
-      fbsC   = &DC[DIR_0PP  *size_MatC];
-      ftnC   = &DC[DIR_0MM  *size_MatC];
-      ftsC   = &DC[DIR_0PM  *size_MatC];
-      fbnC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      fbswC  = &DC[DIR_PPP *size_MatC];
-      fbneC  = &DC[DIR_MMP *size_MatC];
-      fbnwC  = &DC[DIR_PMP *size_MatC];
-      fbseC  = &DC[DIR_MPP *size_MatC];
-      ftswC  = &DC[DIR_PPM *size_MatC];
-      ftneC  = &DC[DIR_MMM *size_MatC];
-      ftnwC  = &DC[DIR_PMM *size_MatC];
-      ftseC  = &DC[DIR_MPM *size_MatC];
+      fwC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      feC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    }
 
    ////////////////////////////////////////////////////////////////////////////////
@@ -41012,8 +41012,8 @@ __global__ void scaleCFLast27( real* DC,
                                           unsigned int* neighborFX,
                                           unsigned int* neighborFY,
                                           unsigned int* neighborFZ,
-                                          unsigned int size_MatC, 
-                                          unsigned int size_MatF, 
+                                          unsigned long long numberOfLBnodesCoarse, 
+                                          unsigned long long numberOfLBnodesFine, 
                                           bool isEvenTimestep,
                                           unsigned int* posCSWB, 
                                           unsigned int* posFSWB, 
@@ -41030,96 +41030,96 @@ __global__ void scaleCFLast27( real* DC,
    real *feF, *fwF, *fnF, *fsF, *ftF, *fbF, *fneF, *fswF, *fseF, *fnwF, *fteF, *fbwF, *fbeF, *ftwF, *ftnF, *fbsF, *fbnF, *ftsF, *fzeroF, 
       *ftneF, *ftswF, *ftseF, *ftnwF, *fbneF, *fbswF, *fbseF, *fbnwF;
 
-   feF    = &DF[DIR_P00   *size_MatF];
-   fwF    = &DF[DIR_M00   *size_MatF];
-   fnF    = &DF[DIR_0P0   *size_MatF];
-   fsF    = &DF[DIR_0M0   *size_MatF];
-   ftF    = &DF[DIR_00P   *size_MatF];
-   fbF    = &DF[DIR_00M   *size_MatF];
-   fneF   = &DF[DIR_PP0  *size_MatF];
-   fswF   = &DF[DIR_MM0  *size_MatF];
-   fseF   = &DF[DIR_PM0  *size_MatF];
-   fnwF   = &DF[DIR_MP0  *size_MatF];
-   fteF   = &DF[DIR_P0P  *size_MatF];
-   fbwF   = &DF[DIR_M0M  *size_MatF];
-   fbeF   = &DF[DIR_P0M  *size_MatF];
-   ftwF   = &DF[DIR_M0P  *size_MatF];
-   ftnF   = &DF[DIR_0PP  *size_MatF];
-   fbsF   = &DF[DIR_0MM  *size_MatF];
-   fbnF   = &DF[DIR_0PM  *size_MatF];
-   ftsF   = &DF[DIR_0MP  *size_MatF];
-   fzeroF = &DF[DIR_000*size_MatF];
-   ftneF  = &DF[DIR_PPP *size_MatF];
-   ftswF  = &DF[DIR_MMP *size_MatF];
-   ftseF  = &DF[DIR_PMP *size_MatF];
-   ftnwF  = &DF[DIR_MPP *size_MatF];
-   fbneF  = &DF[DIR_PPM *size_MatF];
-   fbswF  = &DF[DIR_MMM *size_MatF];
-   fbseF  = &DF[DIR_PMM *size_MatF];
-   fbnwF  = &DF[DIR_MPM *size_MatF];
+   feF    = &DF[DIR_P00 * numberOfLBnodesFine];
+   fwF    = &DF[DIR_M00 * numberOfLBnodesFine];
+   fnF    = &DF[DIR_0P0 * numberOfLBnodesFine];
+   fsF    = &DF[DIR_0M0 * numberOfLBnodesFine];
+   ftF    = &DF[DIR_00P * numberOfLBnodesFine];
+   fbF    = &DF[DIR_00M * numberOfLBnodesFine];
+   fneF   = &DF[DIR_PP0 * numberOfLBnodesFine];
+   fswF   = &DF[DIR_MM0 * numberOfLBnodesFine];
+   fseF   = &DF[DIR_PM0 * numberOfLBnodesFine];
+   fnwF   = &DF[DIR_MP0 * numberOfLBnodesFine];
+   fteF   = &DF[DIR_P0P * numberOfLBnodesFine];
+   fbwF   = &DF[DIR_M0M * numberOfLBnodesFine];
+   fbeF   = &DF[DIR_P0M * numberOfLBnodesFine];
+   ftwF   = &DF[DIR_M0P * numberOfLBnodesFine];
+   ftnF   = &DF[DIR_0PP * numberOfLBnodesFine];
+   fbsF   = &DF[DIR_0MM * numberOfLBnodesFine];
+   fbnF   = &DF[DIR_0PM * numberOfLBnodesFine];
+   ftsF   = &DF[DIR_0MP * numberOfLBnodesFine];
+   fzeroF = &DF[DIR_000 * numberOfLBnodesFine];
+   ftneF  = &DF[DIR_PPP * numberOfLBnodesFine];
+   ftswF  = &DF[DIR_MMP * numberOfLBnodesFine];
+   ftseF  = &DF[DIR_PMP * numberOfLBnodesFine];
+   ftnwF  = &DF[DIR_MPP * numberOfLBnodesFine];
+   fbneF  = &DF[DIR_PPM * numberOfLBnodesFine];
+   fbswF  = &DF[DIR_MMM * numberOfLBnodesFine];
+   fbseF  = &DF[DIR_PMM * numberOfLBnodesFine];
+   fbnwF  = &DF[DIR_MPM * numberOfLBnodesFine];
 
    real *feC, *fwC, *fnC, *fsC, *ftC, *fbC, *fneC, *fswC, *fseC, *fnwC, *fteC, *fbwC, *fbeC, *ftwC, *ftnC, *fbsC, *fbnC, *ftsC, *fzeroC,
       *ftneC, *ftswC, *ftseC, *ftnwC, *fbneC, *fbswC, *fbseC, *fbnwC;
 
    if (isEvenTimestep==true)
    {
-      feC    = &DC[DIR_P00   *size_MatC];
-      fwC    = &DC[DIR_M00   *size_MatC];
-      fnC    = &DC[DIR_0P0   *size_MatC];
-      fsC    = &DC[DIR_0M0   *size_MatC];
-      ftC    = &DC[DIR_00P   *size_MatC];
-      fbC    = &DC[DIR_00M   *size_MatC];
-      fneC   = &DC[DIR_PP0  *size_MatC];
-      fswC   = &DC[DIR_MM0  *size_MatC];
-      fseC   = &DC[DIR_PM0  *size_MatC];
-      fnwC   = &DC[DIR_MP0  *size_MatC];
-      fteC   = &DC[DIR_P0P  *size_MatC];
-      fbwC   = &DC[DIR_M0M  *size_MatC];
-      fbeC   = &DC[DIR_P0M  *size_MatC];
-      ftwC   = &DC[DIR_M0P  *size_MatC];
-      ftnC   = &DC[DIR_0PP  *size_MatC];
-      fbsC   = &DC[DIR_0MM  *size_MatC];
-      fbnC   = &DC[DIR_0PM  *size_MatC];
-      ftsC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      ftneC  = &DC[DIR_PPP *size_MatC];
-      ftswC  = &DC[DIR_MMP *size_MatC];
-      ftseC  = &DC[DIR_PMP *size_MatC];
-      ftnwC  = &DC[DIR_MPP *size_MatC];
-      fbneC  = &DC[DIR_PPM *size_MatC];
-      fbswC  = &DC[DIR_MMM *size_MatC];
-      fbseC  = &DC[DIR_PMM *size_MatC];
-      fbnwC  = &DC[DIR_MPM *size_MatC];
+      feC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      fwC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    } 
    else
    {
-      fwC    = &DC[DIR_P00   *size_MatC];
-      feC    = &DC[DIR_M00   *size_MatC];
-      fsC    = &DC[DIR_0P0   *size_MatC];
-      fnC    = &DC[DIR_0M0   *size_MatC];
-      fbC    = &DC[DIR_00P   *size_MatC];
-      ftC    = &DC[DIR_00M   *size_MatC];
-      fswC   = &DC[DIR_PP0  *size_MatC];
-      fneC   = &DC[DIR_MM0  *size_MatC];
-      fnwC   = &DC[DIR_PM0  *size_MatC];
-      fseC   = &DC[DIR_MP0  *size_MatC];
-      fbwC   = &DC[DIR_P0P  *size_MatC];
-      fteC   = &DC[DIR_M0M  *size_MatC];
-      ftwC   = &DC[DIR_P0M  *size_MatC];
-      fbeC   = &DC[DIR_M0P  *size_MatC];
-      fbsC   = &DC[DIR_0PP  *size_MatC];
-      ftnC   = &DC[DIR_0MM  *size_MatC];
-      ftsC   = &DC[DIR_0PM  *size_MatC];
-      fbnC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      fbswC  = &DC[DIR_PPP *size_MatC];
-      fbneC  = &DC[DIR_MMP *size_MatC];
-      fbnwC  = &DC[DIR_PMP *size_MatC];
-      fbseC  = &DC[DIR_MPP *size_MatC];
-      ftswC  = &DC[DIR_PPM *size_MatC];
-      ftneC  = &DC[DIR_MMM *size_MatC];
-      ftnwC  = &DC[DIR_PMM *size_MatC];
-      ftseC  = &DC[DIR_MPM *size_MatC];
+      fwC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      feC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    }
 
    ////////////////////////////////////////////////////////////////////////////////
@@ -43249,8 +43249,8 @@ __global__ void scaleCFThSMG7(    real* DC,
                                              unsigned int* neighborFX,
                                              unsigned int* neighborFY,
                                              unsigned int* neighborFZ,
-                                             unsigned int size_MatC, 
-                                             unsigned int size_MatF, 
+                                             unsigned long long numberOfLBnodesCoarse, 
+                                             unsigned long long numberOfLBnodesFine, 
                                              bool isEvenTimestep,
                                              unsigned int* posCSWB, 
                                              unsigned int* posFSWB, 
@@ -43261,128 +43261,128 @@ __global__ void scaleCFThSMG7(    real* DC,
 {
    real *feF, *fwF, *fnF, *fsF, *ftF, *fbF, *fneF, *fswF, *fseF, *fnwF, *fteF, *fbwF, *fbeF, *ftwF, *ftnF, *fbsF, *fbnF, *ftsF, /**fzeroF,*/ *ftneF, *ftswF, *ftseF, *ftnwF, *fbneF, *fbswF, *fbseF, *fbnwF;
 
-   feF    = &DF[DIR_P00   *size_MatF];
-   fwF    = &DF[DIR_M00   *size_MatF];
-   fnF    = &DF[DIR_0P0   *size_MatF];
-   fsF    = &DF[DIR_0M0   *size_MatF];
-   ftF    = &DF[DIR_00P   *size_MatF];
-   fbF    = &DF[DIR_00M   *size_MatF];
-   fneF   = &DF[DIR_PP0  *size_MatF];
-   fswF   = &DF[DIR_MM0  *size_MatF];
-   fseF   = &DF[DIR_PM0  *size_MatF];
-   fnwF   = &DF[DIR_MP0  *size_MatF];
-   fteF   = &DF[DIR_P0P  *size_MatF];
-   fbwF   = &DF[DIR_M0M  *size_MatF];
-   fbeF   = &DF[DIR_P0M  *size_MatF];
-   ftwF   = &DF[DIR_M0P  *size_MatF];
-   ftnF   = &DF[DIR_0PP  *size_MatF];
-   fbsF   = &DF[DIR_0MM  *size_MatF];
-   fbnF   = &DF[DIR_0PM  *size_MatF];
-   ftsF   = &DF[DIR_0MP  *size_MatF];
-   //fzeroF = &DF[DIR_000*size_MatF];
-   ftneF  = &DF[DIR_PPP *size_MatF];
-   ftswF  = &DF[DIR_MMP *size_MatF];
-   ftseF  = &DF[DIR_PMP *size_MatF];
-   ftnwF  = &DF[DIR_MPP *size_MatF];
-   fbneF  = &DF[DIR_PPM *size_MatF];
-   fbswF  = &DF[DIR_MMM *size_MatF];
-   fbseF  = &DF[DIR_PMM *size_MatF];
-   fbnwF  = &DF[DIR_MPM *size_MatF];
+   feF    = &DF[DIR_P00 * numberOfLBnodesFine];
+   fwF    = &DF[DIR_M00 * numberOfLBnodesFine];
+   fnF    = &DF[DIR_0P0 * numberOfLBnodesFine];
+   fsF    = &DF[DIR_0M0 * numberOfLBnodesFine];
+   ftF    = &DF[DIR_00P * numberOfLBnodesFine];
+   fbF    = &DF[DIR_00M * numberOfLBnodesFine];
+   fneF   = &DF[DIR_PP0 * numberOfLBnodesFine];
+   fswF   = &DF[DIR_MM0 * numberOfLBnodesFine];
+   fseF   = &DF[DIR_PM0 * numberOfLBnodesFine];
+   fnwF   = &DF[DIR_MP0 * numberOfLBnodesFine];
+   fteF   = &DF[DIR_P0P * numberOfLBnodesFine];
+   fbwF   = &DF[DIR_M0M * numberOfLBnodesFine];
+   fbeF   = &DF[DIR_P0M * numberOfLBnodesFine];
+   ftwF   = &DF[DIR_M0P * numberOfLBnodesFine];
+   ftnF   = &DF[DIR_0PP * numberOfLBnodesFine];
+   fbsF   = &DF[DIR_0MM * numberOfLBnodesFine];
+   fbnF   = &DF[DIR_0PM * numberOfLBnodesFine];
+   ftsF   = &DF[DIR_0MP * numberOfLBnodesFine];
+   //fzeroF = &DF[DIR_000 * numberOfLBnodesFine];
+   ftneF  = &DF[DIR_PPP * numberOfLBnodesFine];
+   ftswF  = &DF[DIR_MMP * numberOfLBnodesFine];
+   ftseF  = &DF[DIR_PMP * numberOfLBnodesFine];
+   ftnwF  = &DF[DIR_MPP * numberOfLBnodesFine];
+   fbneF  = &DF[DIR_PPM * numberOfLBnodesFine];
+   fbswF  = &DF[DIR_MMM * numberOfLBnodesFine];
+   fbseF  = &DF[DIR_PMM * numberOfLBnodesFine];
+   fbnwF  = &DF[DIR_MPM * numberOfLBnodesFine];
 
    real *feC, *fwC, *fnC, *fsC, *ftC, *fbC, *fneC, *fswC, *fseC, *fnwC, *fteC, *fbwC, *fbeC, *ftwC, *ftnC, *fbsC, *fbnC, *ftsC, //*fzeroC,
       *ftneC, *ftswC, *ftseC, *ftnwC, *fbneC, *fbswC, *fbseC, *fbnwC;
 
    if (isEvenTimestep==true)
    {
-      feC    = &DC[DIR_P00   *size_MatC];
-      fwC    = &DC[DIR_M00   *size_MatC];
-      fnC    = &DC[DIR_0P0   *size_MatC];
-      fsC    = &DC[DIR_0M0   *size_MatC];
-      ftC    = &DC[DIR_00P   *size_MatC];
-      fbC    = &DC[DIR_00M   *size_MatC];
-      fneC   = &DC[DIR_PP0  *size_MatC];
-      fswC   = &DC[DIR_MM0  *size_MatC];
-      fseC   = &DC[DIR_PM0  *size_MatC];
-      fnwC   = &DC[DIR_MP0  *size_MatC];
-      fteC   = &DC[DIR_P0P  *size_MatC];
-      fbwC   = &DC[DIR_M0M  *size_MatC];
-      fbeC   = &DC[DIR_P0M  *size_MatC];
-      ftwC   = &DC[DIR_M0P  *size_MatC];
-      ftnC   = &DC[DIR_0PP  *size_MatC];
-      fbsC   = &DC[DIR_0MM  *size_MatC];
-      fbnC   = &DC[DIR_0PM  *size_MatC];
-      ftsC   = &DC[DIR_0MP  *size_MatC];
-      //fzeroC = &DC[DIR_000*size_MatC];
-      ftneC  = &DC[DIR_PPP *size_MatC];
-      ftswC  = &DC[DIR_MMP *size_MatC];
-      ftseC  = &DC[DIR_PMP *size_MatC];
-      ftnwC  = &DC[DIR_MPP *size_MatC];
-      fbneC  = &DC[DIR_PPM *size_MatC];
-      fbswC  = &DC[DIR_MMM *size_MatC];
-      fbseC  = &DC[DIR_PMM *size_MatC];
-      fbnwC  = &DC[DIR_MPM *size_MatC];
+      feC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      fwC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      //fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    } 
    else
    {
-      fwC    = &DC[DIR_P00   *size_MatC];
-      feC    = &DC[DIR_M00   *size_MatC];
-      fsC    = &DC[DIR_0P0   *size_MatC];
-      fnC    = &DC[DIR_0M0   *size_MatC];
-      fbC    = &DC[DIR_00P   *size_MatC];
-      ftC    = &DC[DIR_00M   *size_MatC];
-      fswC   = &DC[DIR_PP0  *size_MatC];
-      fneC   = &DC[DIR_MM0  *size_MatC];
-      fnwC   = &DC[DIR_PM0  *size_MatC];
-      fseC   = &DC[DIR_MP0  *size_MatC];
-      fbwC   = &DC[DIR_P0P  *size_MatC];
-      fteC   = &DC[DIR_M0M  *size_MatC];
-      ftwC   = &DC[DIR_P0M  *size_MatC];
-      fbeC   = &DC[DIR_M0P  *size_MatC];
-      fbsC   = &DC[DIR_0PP  *size_MatC];
-      ftnC   = &DC[DIR_0MM  *size_MatC];
-      ftsC   = &DC[DIR_0PM  *size_MatC];
-      fbnC   = &DC[DIR_0MP  *size_MatC];
-      //fzeroC = &DC[DIR_000*size_MatC];
-      fbswC  = &DC[DIR_PPP *size_MatC];
-      fbneC  = &DC[DIR_MMP *size_MatC];
-      fbnwC  = &DC[DIR_PMP *size_MatC];
-      fbseC  = &DC[DIR_MPP *size_MatC];
-      ftswC  = &DC[DIR_PPM *size_MatC];
-      ftneC  = &DC[DIR_MMM *size_MatC];
-      ftnwC  = &DC[DIR_PMM *size_MatC];
-      ftseC  = &DC[DIR_MPM *size_MatC];
+      fwC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      feC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      //fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    }
 
 
    Distributions7 D7F;
-   D7F.f[0] = &DD7F[0*size_MatF];
-   D7F.f[1] = &DD7F[1*size_MatF];
-   D7F.f[2] = &DD7F[2*size_MatF];
-   D7F.f[3] = &DD7F[3*size_MatF];
-   D7F.f[4] = &DD7F[4*size_MatF];
-   D7F.f[5] = &DD7F[5*size_MatF];
-   D7F.f[6] = &DD7F[6*size_MatF];
+   D7F.f[0] = &DD7F[0*numberOfLBnodesFine];
+   D7F.f[1] = &DD7F[1*numberOfLBnodesFine];
+   D7F.f[2] = &DD7F[2*numberOfLBnodesFine];
+   D7F.f[3] = &DD7F[3*numberOfLBnodesFine];
+   D7F.f[4] = &DD7F[4*numberOfLBnodesFine];
+   D7F.f[5] = &DD7F[5*numberOfLBnodesFine];
+   D7F.f[6] = &DD7F[6*numberOfLBnodesFine];
                       
    Distributions7 D7C;
    if (isEvenTimestep==true)
    {
-      D7C.f[0] = &DD7C[0*size_MatC];
-      D7C.f[1] = &DD7C[1*size_MatC];
-      D7C.f[2] = &DD7C[2*size_MatC];
-      D7C.f[3] = &DD7C[3*size_MatC];
-      D7C.f[4] = &DD7C[4*size_MatC];
-      D7C.f[5] = &DD7C[5*size_MatC];
-      D7C.f[6] = &DD7C[6*size_MatC];
+      D7C.f[0] = &DD7C[0*numberOfLBnodesCoarse];
+      D7C.f[1] = &DD7C[1*numberOfLBnodesCoarse];
+      D7C.f[2] = &DD7C[2*numberOfLBnodesCoarse];
+      D7C.f[3] = &DD7C[3*numberOfLBnodesCoarse];
+      D7C.f[4] = &DD7C[4*numberOfLBnodesCoarse];
+      D7C.f[5] = &DD7C[5*numberOfLBnodesCoarse];
+      D7C.f[6] = &DD7C[6*numberOfLBnodesCoarse];
    }
    else
    {
-      D7C.f[0] = &DD7C[0*size_MatC];
-      D7C.f[2] = &DD7C[1*size_MatC];
-      D7C.f[1] = &DD7C[2*size_MatC];
-      D7C.f[4] = &DD7C[3*size_MatC];
-      D7C.f[3] = &DD7C[4*size_MatC];
-      D7C.f[6] = &DD7C[5*size_MatC];
-      D7C.f[5] = &DD7C[6*size_MatC];
+      D7C.f[0] = &DD7C[0*numberOfLBnodesCoarse];
+      D7C.f[2] = &DD7C[1*numberOfLBnodesCoarse];
+      D7C.f[1] = &DD7C[2*numberOfLBnodesCoarse];
+      D7C.f[4] = &DD7C[3*numberOfLBnodesCoarse];
+      D7C.f[3] = &DD7C[4*numberOfLBnodesCoarse];
+      D7C.f[6] = &DD7C[5*numberOfLBnodesCoarse];
+      D7C.f[5] = &DD7C[6*numberOfLBnodesCoarse];
    }
 
    ////////////////////////////////////////////////////////////////////////////////
@@ -44476,8 +44476,8 @@ __global__ void scaleCFThS7(   real* DC,
                                           unsigned int* neighborFX,
                                           unsigned int* neighborFY,
                                           unsigned int* neighborFZ,
-                                          unsigned int size_MatC, 
-                                          unsigned int size_MatF, 
+                                          unsigned long long numberOfLBnodesCoarse, 
+                                          unsigned long long numberOfLBnodesFine, 
                                           bool isEvenTimestep,
                                           unsigned int* posCSWB, 
                                           unsigned int* posFSWB, 
@@ -44487,128 +44487,128 @@ __global__ void scaleCFThS7(   real* DC,
 {
    real *feF, *fwF, *fnF, *fsF, *ftF, *fbF, *fneF, *fswF, *fseF, *fnwF, *fteF, *fbwF, *fbeF, *ftwF, *ftnF, *fbsF, *fbnF, *ftsF, /**fzeroF,*/ *ftneF, *ftswF, *ftseF, *ftnwF, *fbneF, *fbswF, *fbseF, *fbnwF;
 
-   feF    = &DF[DIR_P00   *size_MatF];
-   fwF    = &DF[DIR_M00   *size_MatF];
-   fnF    = &DF[DIR_0P0   *size_MatF];
-   fsF    = &DF[DIR_0M0   *size_MatF];
-   ftF    = &DF[DIR_00P   *size_MatF];
-   fbF    = &DF[DIR_00M   *size_MatF];
-   fneF   = &DF[DIR_PP0  *size_MatF];
-   fswF   = &DF[DIR_MM0  *size_MatF];
-   fseF   = &DF[DIR_PM0  *size_MatF];
-   fnwF   = &DF[DIR_MP0  *size_MatF];
-   fteF   = &DF[DIR_P0P  *size_MatF];
-   fbwF   = &DF[DIR_M0M  *size_MatF];
-   fbeF   = &DF[DIR_P0M  *size_MatF];
-   ftwF   = &DF[DIR_M0P  *size_MatF];
-   ftnF   = &DF[DIR_0PP  *size_MatF];
-   fbsF   = &DF[DIR_0MM  *size_MatF];
-   fbnF   = &DF[DIR_0PM  *size_MatF];
-   ftsF   = &DF[DIR_0MP  *size_MatF];
-   //fzeroF = &DF[DIR_000*size_MatF];
-   ftneF  = &DF[DIR_PPP *size_MatF];
-   ftswF  = &DF[DIR_MMP *size_MatF];
-   ftseF  = &DF[DIR_PMP *size_MatF];
-   ftnwF  = &DF[DIR_MPP *size_MatF];
-   fbneF  = &DF[DIR_PPM *size_MatF];
-   fbswF  = &DF[DIR_MMM *size_MatF];
-   fbseF  = &DF[DIR_PMM *size_MatF];
-   fbnwF  = &DF[DIR_MPM *size_MatF];
+   feF    = &DF[DIR_P00 * numberOfLBnodesFine];
+   fwF    = &DF[DIR_M00 * numberOfLBnodesFine];
+   fnF    = &DF[DIR_0P0 * numberOfLBnodesFine];
+   fsF    = &DF[DIR_0M0 * numberOfLBnodesFine];
+   ftF    = &DF[DIR_00P * numberOfLBnodesFine];
+   fbF    = &DF[DIR_00M * numberOfLBnodesFine];
+   fneF   = &DF[DIR_PP0 * numberOfLBnodesFine];
+   fswF   = &DF[DIR_MM0 * numberOfLBnodesFine];
+   fseF   = &DF[DIR_PM0 * numberOfLBnodesFine];
+   fnwF   = &DF[DIR_MP0 * numberOfLBnodesFine];
+   fteF   = &DF[DIR_P0P * numberOfLBnodesFine];
+   fbwF   = &DF[DIR_M0M * numberOfLBnodesFine];
+   fbeF   = &DF[DIR_P0M * numberOfLBnodesFine];
+   ftwF   = &DF[DIR_M0P * numberOfLBnodesFine];
+   ftnF   = &DF[DIR_0PP * numberOfLBnodesFine];
+   fbsF   = &DF[DIR_0MM * numberOfLBnodesFine];
+   fbnF   = &DF[DIR_0PM * numberOfLBnodesFine];
+   ftsF   = &DF[DIR_0MP * numberOfLBnodesFine];
+   //fzeroF = &DF[DIR_000 * numberOfLBnodesFine];
+   ftneF  = &DF[DIR_PPP * numberOfLBnodesFine];
+   ftswF  = &DF[DIR_MMP * numberOfLBnodesFine];
+   ftseF  = &DF[DIR_PMP * numberOfLBnodesFine];
+   ftnwF  = &DF[DIR_MPP * numberOfLBnodesFine];
+   fbneF  = &DF[DIR_PPM * numberOfLBnodesFine];
+   fbswF  = &DF[DIR_MMM * numberOfLBnodesFine];
+   fbseF  = &DF[DIR_PMM * numberOfLBnodesFine];
+   fbnwF  = &DF[DIR_MPM * numberOfLBnodesFine];
 
    real *feC, *fwC, *fnC, *fsC, *ftC, *fbC, *fneC, *fswC, *fseC, *fnwC, *fteC, *fbwC, *fbeC, *ftwC, *ftnC, *fbsC, *fbnC, *ftsC, //*fzeroC,
       *ftneC, *ftswC, *ftseC, *ftnwC, *fbneC, *fbswC, *fbseC, *fbnwC;
 
    if (isEvenTimestep==true)
    {
-      feC    = &DC[DIR_P00   *size_MatC];
-      fwC    = &DC[DIR_M00   *size_MatC];
-      fnC    = &DC[DIR_0P0   *size_MatC];
-      fsC    = &DC[DIR_0M0   *size_MatC];
-      ftC    = &DC[DIR_00P   *size_MatC];
-      fbC    = &DC[DIR_00M   *size_MatC];
-      fneC   = &DC[DIR_PP0  *size_MatC];
-      fswC   = &DC[DIR_MM0  *size_MatC];
-      fseC   = &DC[DIR_PM0  *size_MatC];
-      fnwC   = &DC[DIR_MP0  *size_MatC];
-      fteC   = &DC[DIR_P0P  *size_MatC];
-      fbwC   = &DC[DIR_M0M  *size_MatC];
-      fbeC   = &DC[DIR_P0M  *size_MatC];
-      ftwC   = &DC[DIR_M0P  *size_MatC];
-      ftnC   = &DC[DIR_0PP  *size_MatC];
-      fbsC   = &DC[DIR_0MM  *size_MatC];
-      fbnC   = &DC[DIR_0PM  *size_MatC];
-      ftsC   = &DC[DIR_0MP  *size_MatC];
-      //fzeroC = &DC[DIR_000*size_MatC];
-      ftneC  = &DC[DIR_PPP *size_MatC];
-      ftswC  = &DC[DIR_MMP *size_MatC];
-      ftseC  = &DC[DIR_PMP *size_MatC];
-      ftnwC  = &DC[DIR_MPP *size_MatC];
-      fbneC  = &DC[DIR_PPM *size_MatC];
-      fbswC  = &DC[DIR_MMM *size_MatC];
-      fbseC  = &DC[DIR_PMM *size_MatC];
-      fbnwC  = &DC[DIR_MPM *size_MatC];
+      feC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      fwC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      //fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    } 
    else
    {
-      fwC    = &DC[DIR_P00   *size_MatC];
-      feC    = &DC[DIR_M00   *size_MatC];
-      fsC    = &DC[DIR_0P0   *size_MatC];
-      fnC    = &DC[DIR_0M0   *size_MatC];
-      fbC    = &DC[DIR_00P   *size_MatC];
-      ftC    = &DC[DIR_00M   *size_MatC];
-      fswC   = &DC[DIR_PP0  *size_MatC];
-      fneC   = &DC[DIR_MM0  *size_MatC];
-      fnwC   = &DC[DIR_PM0  *size_MatC];
-      fseC   = &DC[DIR_MP0  *size_MatC];
-      fbwC   = &DC[DIR_P0P  *size_MatC];
-      fteC   = &DC[DIR_M0M  *size_MatC];
-      ftwC   = &DC[DIR_P0M  *size_MatC];
-      fbeC   = &DC[DIR_M0P  *size_MatC];
-      fbsC   = &DC[DIR_0PP  *size_MatC];
-      ftnC   = &DC[DIR_0MM  *size_MatC];
-      ftsC   = &DC[DIR_0PM  *size_MatC];
-      fbnC   = &DC[DIR_0MP  *size_MatC];
-      //fzeroC = &DC[DIR_000*size_MatC];
-      fbswC  = &DC[DIR_PPP *size_MatC];
-      fbneC  = &DC[DIR_MMP *size_MatC];
-      fbnwC  = &DC[DIR_PMP *size_MatC];
-      fbseC  = &DC[DIR_MPP *size_MatC];
-      ftswC  = &DC[DIR_PPM *size_MatC];
-      ftneC  = &DC[DIR_MMM *size_MatC];
-      ftnwC  = &DC[DIR_PMM *size_MatC];
-      ftseC  = &DC[DIR_MPM *size_MatC];
+      fwC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      feC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      //fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    }
 
 
    Distributions7 D7F;
-   D7F.f[0] = &DD7F[0*size_MatF];
-   D7F.f[1] = &DD7F[1*size_MatF];
-   D7F.f[2] = &DD7F[2*size_MatF];
-   D7F.f[3] = &DD7F[3*size_MatF];
-   D7F.f[4] = &DD7F[4*size_MatF];
-   D7F.f[5] = &DD7F[5*size_MatF];
-   D7F.f[6] = &DD7F[6*size_MatF];
+   D7F.f[0] = &DD7F[0*numberOfLBnodesFine];
+   D7F.f[1] = &DD7F[1*numberOfLBnodesFine];
+   D7F.f[2] = &DD7F[2*numberOfLBnodesFine];
+   D7F.f[3] = &DD7F[3*numberOfLBnodesFine];
+   D7F.f[4] = &DD7F[4*numberOfLBnodesFine];
+   D7F.f[5] = &DD7F[5*numberOfLBnodesFine];
+   D7F.f[6] = &DD7F[6*numberOfLBnodesFine];
                       
    Distributions7 D7C;
    if (isEvenTimestep==true)
    {
-      D7C.f[0] = &DD7C[0*size_MatC];
-      D7C.f[1] = &DD7C[1*size_MatC];
-      D7C.f[2] = &DD7C[2*size_MatC];
-      D7C.f[3] = &DD7C[3*size_MatC];
-      D7C.f[4] = &DD7C[4*size_MatC];
-      D7C.f[5] = &DD7C[5*size_MatC];
-      D7C.f[6] = &DD7C[6*size_MatC];
+      D7C.f[0] = &DD7C[0*numberOfLBnodesCoarse];
+      D7C.f[1] = &DD7C[1*numberOfLBnodesCoarse];
+      D7C.f[2] = &DD7C[2*numberOfLBnodesCoarse];
+      D7C.f[3] = &DD7C[3*numberOfLBnodesCoarse];
+      D7C.f[4] = &DD7C[4*numberOfLBnodesCoarse];
+      D7C.f[5] = &DD7C[5*numberOfLBnodesCoarse];
+      D7C.f[6] = &DD7C[6*numberOfLBnodesCoarse];
    }
    else
    {
-      D7C.f[0] = &DD7C[0*size_MatC];
-      D7C.f[2] = &DD7C[1*size_MatC];
-      D7C.f[1] = &DD7C[2*size_MatC];
-      D7C.f[4] = &DD7C[3*size_MatC];
-      D7C.f[3] = &DD7C[4*size_MatC];
-      D7C.f[6] = &DD7C[5*size_MatC];
-      D7C.f[5] = &DD7C[6*size_MatC];
+      D7C.f[0] = &DD7C[0*numberOfLBnodesCoarse];
+      D7C.f[2] = &DD7C[1*numberOfLBnodesCoarse];
+      D7C.f[1] = &DD7C[2*numberOfLBnodesCoarse];
+      D7C.f[4] = &DD7C[3*numberOfLBnodesCoarse];
+      D7C.f[3] = &DD7C[4*numberOfLBnodesCoarse];
+      D7C.f[6] = &DD7C[5*numberOfLBnodesCoarse];
+      D7C.f[5] = &DD7C[6*numberOfLBnodesCoarse];
    }
 
    ////////////////////////////////////////////////////////////////////////////////
@@ -45599,8 +45599,8 @@ __global__ void scaleCFThS27(     real* DC,
                                              unsigned int* neighborFX,
                                              unsigned int* neighborFY,
                                              unsigned int* neighborFZ,
-                                             unsigned int size_MatC, 
-                                             unsigned int size_MatF, 
+                                             unsigned long long numberOfLBnodesCoarse, 
+                                             unsigned long long numberOfLBnodesFine, 
                                              bool isEvenTimestep,
                                              unsigned int* posCSWB, 
                                              unsigned int* posFSWB, 
@@ -45611,188 +45611,188 @@ __global__ void scaleCFThS27(     real* DC,
 {
    real *feF, *fwF, *fnF, *fsF, *ftF, *fbF, *fneF, *fswF, *fseF, *fnwF, *fteF, *fbwF, *fbeF, *ftwF, *ftnF, *fbsF, *fbnF, *ftsF, /**fzeroF,*/ *ftneF, *ftswF, *ftseF, *ftnwF, *fbneF, *fbswF, *fbseF, *fbnwF;
 
-   feF    = &DF[DIR_P00   *size_MatF];
-   fwF    = &DF[DIR_M00   *size_MatF];
-   fnF    = &DF[DIR_0P0   *size_MatF];
-   fsF    = &DF[DIR_0M0   *size_MatF];
-   ftF    = &DF[DIR_00P   *size_MatF];
-   fbF    = &DF[DIR_00M   *size_MatF];
-   fneF   = &DF[DIR_PP0  *size_MatF];
-   fswF   = &DF[DIR_MM0  *size_MatF];
-   fseF   = &DF[DIR_PM0  *size_MatF];
-   fnwF   = &DF[DIR_MP0  *size_MatF];
-   fteF   = &DF[DIR_P0P  *size_MatF];
-   fbwF   = &DF[DIR_M0M  *size_MatF];
-   fbeF   = &DF[DIR_P0M  *size_MatF];
-   ftwF   = &DF[DIR_M0P  *size_MatF];
-   ftnF   = &DF[DIR_0PP  *size_MatF];
-   fbsF   = &DF[DIR_0MM  *size_MatF];
-   fbnF   = &DF[DIR_0PM  *size_MatF];
-   ftsF   = &DF[DIR_0MP  *size_MatF];
-   //fzeroF = &DF[DIR_000*size_MatF];
-   ftneF  = &DF[DIR_PPP *size_MatF];
-   ftswF  = &DF[DIR_MMP *size_MatF];
-   ftseF  = &DF[DIR_PMP *size_MatF];
-   ftnwF  = &DF[DIR_MPP *size_MatF];
-   fbneF  = &DF[DIR_PPM *size_MatF];
-   fbswF  = &DF[DIR_MMM *size_MatF];
-   fbseF  = &DF[DIR_PMM *size_MatF];
-   fbnwF  = &DF[DIR_MPM *size_MatF];
+   feF    = &DF[DIR_P00 * numberOfLBnodesFine];
+   fwF    = &DF[DIR_M00 * numberOfLBnodesFine];
+   fnF    = &DF[DIR_0P0 * numberOfLBnodesFine];
+   fsF    = &DF[DIR_0M0 * numberOfLBnodesFine];
+   ftF    = &DF[DIR_00P * numberOfLBnodesFine];
+   fbF    = &DF[DIR_00M * numberOfLBnodesFine];
+   fneF   = &DF[DIR_PP0 * numberOfLBnodesFine];
+   fswF   = &DF[DIR_MM0 * numberOfLBnodesFine];
+   fseF   = &DF[DIR_PM0 * numberOfLBnodesFine];
+   fnwF   = &DF[DIR_MP0 * numberOfLBnodesFine];
+   fteF   = &DF[DIR_P0P * numberOfLBnodesFine];
+   fbwF   = &DF[DIR_M0M * numberOfLBnodesFine];
+   fbeF   = &DF[DIR_P0M * numberOfLBnodesFine];
+   ftwF   = &DF[DIR_M0P * numberOfLBnodesFine];
+   ftnF   = &DF[DIR_0PP * numberOfLBnodesFine];
+   fbsF   = &DF[DIR_0MM * numberOfLBnodesFine];
+   fbnF   = &DF[DIR_0PM * numberOfLBnodesFine];
+   ftsF   = &DF[DIR_0MP * numberOfLBnodesFine];
+   //fzeroF = &DF[DIR_000 * numberOfLBnodesFine];
+   ftneF  = &DF[DIR_PPP * numberOfLBnodesFine];
+   ftswF  = &DF[DIR_MMP * numberOfLBnodesFine];
+   ftseF  = &DF[DIR_PMP * numberOfLBnodesFine];
+   ftnwF  = &DF[DIR_MPP * numberOfLBnodesFine];
+   fbneF  = &DF[DIR_PPM * numberOfLBnodesFine];
+   fbswF  = &DF[DIR_MMM * numberOfLBnodesFine];
+   fbseF  = &DF[DIR_PMM * numberOfLBnodesFine];
+   fbnwF  = &DF[DIR_MPM * numberOfLBnodesFine];
 
    real *feC, *fwC, *fnC, *fsC, *ftC, *fbC, *fneC, *fswC, *fseC, *fnwC, *fteC, *fbwC, *fbeC, *ftwC, *ftnC, *fbsC, *fbnC, *ftsC, //*fzeroC,
       *ftneC, *ftswC, *ftseC, *ftnwC, *fbneC, *fbswC, *fbseC, *fbnwC;
 
    if (isEvenTimestep==true)
    {
-      feC    = &DC[DIR_P00   *size_MatC];
-      fwC    = &DC[DIR_M00   *size_MatC];
-      fnC    = &DC[DIR_0P0   *size_MatC];
-      fsC    = &DC[DIR_0M0   *size_MatC];
-      ftC    = &DC[DIR_00P   *size_MatC];
-      fbC    = &DC[DIR_00M   *size_MatC];
-      fneC   = &DC[DIR_PP0  *size_MatC];
-      fswC   = &DC[DIR_MM0  *size_MatC];
-      fseC   = &DC[DIR_PM0  *size_MatC];
-      fnwC   = &DC[DIR_MP0  *size_MatC];
-      fteC   = &DC[DIR_P0P  *size_MatC];
-      fbwC   = &DC[DIR_M0M  *size_MatC];
-      fbeC   = &DC[DIR_P0M  *size_MatC];
-      ftwC   = &DC[DIR_M0P  *size_MatC];
-      ftnC   = &DC[DIR_0PP  *size_MatC];
-      fbsC   = &DC[DIR_0MM  *size_MatC];
-      fbnC   = &DC[DIR_0PM  *size_MatC];
-      ftsC   = &DC[DIR_0MP  *size_MatC];
-      //fzeroC = &DC[DIR_000*size_MatC];
-      ftneC  = &DC[DIR_PPP *size_MatC];
-      ftswC  = &DC[DIR_MMP *size_MatC];
-      ftseC  = &DC[DIR_PMP *size_MatC];
-      ftnwC  = &DC[DIR_MPP *size_MatC];
-      fbneC  = &DC[DIR_PPM *size_MatC];
-      fbswC  = &DC[DIR_MMM *size_MatC];
-      fbseC  = &DC[DIR_PMM *size_MatC];
-      fbnwC  = &DC[DIR_MPM *size_MatC];
+      feC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      fwC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      //fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    } 
    else
    {
-      fwC    = &DC[DIR_P00   *size_MatC];
-      feC    = &DC[DIR_M00   *size_MatC];
-      fsC    = &DC[DIR_0P0   *size_MatC];
-      fnC    = &DC[DIR_0M0   *size_MatC];
-      fbC    = &DC[DIR_00P   *size_MatC];
-      ftC    = &DC[DIR_00M   *size_MatC];
-      fswC   = &DC[DIR_PP0  *size_MatC];
-      fneC   = &DC[DIR_MM0  *size_MatC];
-      fnwC   = &DC[DIR_PM0  *size_MatC];
-      fseC   = &DC[DIR_MP0  *size_MatC];
-      fbwC   = &DC[DIR_P0P  *size_MatC];
-      fteC   = &DC[DIR_M0M  *size_MatC];
-      ftwC   = &DC[DIR_P0M  *size_MatC];
-      fbeC   = &DC[DIR_M0P  *size_MatC];
-      fbsC   = &DC[DIR_0PP  *size_MatC];
-      ftnC   = &DC[DIR_0MM  *size_MatC];
-      ftsC   = &DC[DIR_0PM  *size_MatC];
-      fbnC   = &DC[DIR_0MP  *size_MatC];
-      //fzeroC = &DC[DIR_000*size_MatC];
-      fbswC  = &DC[DIR_PPP *size_MatC];
-      fbneC  = &DC[DIR_MMP *size_MatC];
-      fbnwC  = &DC[DIR_PMP *size_MatC];
-      fbseC  = &DC[DIR_MPP *size_MatC];
-      ftswC  = &DC[DIR_PPM *size_MatC];
-      ftneC  = &DC[DIR_MMM *size_MatC];
-      ftnwC  = &DC[DIR_PMM *size_MatC];
-      ftseC  = &DC[DIR_MPM *size_MatC];
+      fwC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      feC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      //fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    }
 
 
    Distributions27 D27F;
-   D27F.f[DIR_P00   ] = &DD27F[DIR_P00   *size_MatF];
-   D27F.f[DIR_M00   ] = &DD27F[DIR_M00   *size_MatF];
-   D27F.f[DIR_0P0   ] = &DD27F[DIR_0P0   *size_MatF];
-   D27F.f[DIR_0M0   ] = &DD27F[DIR_0M0   *size_MatF];
-   D27F.f[DIR_00P   ] = &DD27F[DIR_00P   *size_MatF];
-   D27F.f[DIR_00M   ] = &DD27F[DIR_00M   *size_MatF];
-   D27F.f[DIR_PP0  ] = &DD27F[DIR_PP0  *size_MatF];
-   D27F.f[DIR_MM0  ] = &DD27F[DIR_MM0  *size_MatF];
-   D27F.f[DIR_PM0  ] = &DD27F[DIR_PM0  *size_MatF];
-   D27F.f[DIR_MP0  ] = &DD27F[DIR_MP0  *size_MatF];
-   D27F.f[DIR_P0P  ] = &DD27F[DIR_P0P  *size_MatF];
-   D27F.f[DIR_M0M  ] = &DD27F[DIR_M0M  *size_MatF];
-   D27F.f[DIR_P0M  ] = &DD27F[DIR_P0M  *size_MatF];
-   D27F.f[DIR_M0P  ] = &DD27F[DIR_M0P  *size_MatF];
-   D27F.f[DIR_0PP  ] = &DD27F[DIR_0PP  *size_MatF];
-   D27F.f[DIR_0MM  ] = &DD27F[DIR_0MM  *size_MatF];
-   D27F.f[DIR_0PM  ] = &DD27F[DIR_0PM  *size_MatF];
-   D27F.f[DIR_0MP  ] = &DD27F[DIR_0MP  *size_MatF];
-   D27F.f[DIR_000] = &DD27F[DIR_000*size_MatF];
-   D27F.f[DIR_PPP ] = &DD27F[DIR_PPP *size_MatF];
-   D27F.f[DIR_MMP ] = &DD27F[DIR_MMP *size_MatF];
-   D27F.f[DIR_PMP ] = &DD27F[DIR_PMP *size_MatF];
-   D27F.f[DIR_MPP ] = &DD27F[DIR_MPP *size_MatF];
-   D27F.f[DIR_PPM ] = &DD27F[DIR_PPM *size_MatF];
-   D27F.f[DIR_MMM ] = &DD27F[DIR_MMM *size_MatF];
-   D27F.f[DIR_PMM ] = &DD27F[DIR_PMM *size_MatF];
-   D27F.f[DIR_MPM ] = &DD27F[DIR_MPM *size_MatF];
+   D27F.f[DIR_P00] = &DD27F[DIR_P00 * numberOfLBnodesFine];
+   D27F.f[DIR_M00] = &DD27F[DIR_M00 * numberOfLBnodesFine];
+   D27F.f[DIR_0P0] = &DD27F[DIR_0P0 * numberOfLBnodesFine];
+   D27F.f[DIR_0M0] = &DD27F[DIR_0M0 * numberOfLBnodesFine];
+   D27F.f[DIR_00P] = &DD27F[DIR_00P * numberOfLBnodesFine];
+   D27F.f[DIR_00M] = &DD27F[DIR_00M * numberOfLBnodesFine];
+   D27F.f[DIR_PP0] = &DD27F[DIR_PP0 * numberOfLBnodesFine];
+   D27F.f[DIR_MM0] = &DD27F[DIR_MM0 * numberOfLBnodesFine];
+   D27F.f[DIR_PM0] = &DD27F[DIR_PM0 * numberOfLBnodesFine];
+   D27F.f[DIR_MP0] = &DD27F[DIR_MP0 * numberOfLBnodesFine];
+   D27F.f[DIR_P0P] = &DD27F[DIR_P0P * numberOfLBnodesFine];
+   D27F.f[DIR_M0M] = &DD27F[DIR_M0M * numberOfLBnodesFine];
+   D27F.f[DIR_P0M] = &DD27F[DIR_P0M * numberOfLBnodesFine];
+   D27F.f[DIR_M0P] = &DD27F[DIR_M0P * numberOfLBnodesFine];
+   D27F.f[DIR_0PP] = &DD27F[DIR_0PP * numberOfLBnodesFine];
+   D27F.f[DIR_0MM] = &DD27F[DIR_0MM * numberOfLBnodesFine];
+   D27F.f[DIR_0PM] = &DD27F[DIR_0PM * numberOfLBnodesFine];
+   D27F.f[DIR_0MP] = &DD27F[DIR_0MP * numberOfLBnodesFine];
+   D27F.f[DIR_000] = &DD27F[DIR_000 * numberOfLBnodesFine];
+   D27F.f[DIR_PPP] = &DD27F[DIR_PPP * numberOfLBnodesFine];
+   D27F.f[DIR_MMP] = &DD27F[DIR_MMP * numberOfLBnodesFine];
+   D27F.f[DIR_PMP] = &DD27F[DIR_PMP * numberOfLBnodesFine];
+   D27F.f[DIR_MPP] = &DD27F[DIR_MPP * numberOfLBnodesFine];
+   D27F.f[DIR_PPM] = &DD27F[DIR_PPM * numberOfLBnodesFine];
+   D27F.f[DIR_MMM] = &DD27F[DIR_MMM * numberOfLBnodesFine];
+   D27F.f[DIR_PMM] = &DD27F[DIR_PMM * numberOfLBnodesFine];
+   D27F.f[DIR_MPM] = &DD27F[DIR_MPM * numberOfLBnodesFine];
 
    Distributions27 D27C;
    if (isEvenTimestep==true)
    {
-      D27C.f[DIR_P00   ] = &DD27C[DIR_P00   *size_MatC];
-      D27C.f[DIR_M00   ] = &DD27C[DIR_M00   *size_MatC];
-      D27C.f[DIR_0P0   ] = &DD27C[DIR_0P0   *size_MatC];
-      D27C.f[DIR_0M0   ] = &DD27C[DIR_0M0   *size_MatC];
-      D27C.f[DIR_00P   ] = &DD27C[DIR_00P   *size_MatC];
-      D27C.f[DIR_00M   ] = &DD27C[DIR_00M   *size_MatC];
-      D27C.f[DIR_PP0  ] = &DD27C[DIR_PP0  *size_MatC];
-      D27C.f[DIR_MM0  ] = &DD27C[DIR_MM0  *size_MatC];
-      D27C.f[DIR_PM0  ] = &DD27C[DIR_PM0  *size_MatC];
-      D27C.f[DIR_MP0  ] = &DD27C[DIR_MP0  *size_MatC];
-      D27C.f[DIR_P0P  ] = &DD27C[DIR_P0P  *size_MatC];
-      D27C.f[DIR_M0M  ] = &DD27C[DIR_M0M  *size_MatC];
-      D27C.f[DIR_P0M  ] = &DD27C[DIR_P0M  *size_MatC];
-      D27C.f[DIR_M0P  ] = &DD27C[DIR_M0P  *size_MatC];
-      D27C.f[DIR_0PP  ] = &DD27C[DIR_0PP  *size_MatC];
-      D27C.f[DIR_0MM  ] = &DD27C[DIR_0MM  *size_MatC];
-      D27C.f[DIR_0PM  ] = &DD27C[DIR_0PM  *size_MatC];
-      D27C.f[DIR_0MP  ] = &DD27C[DIR_0MP  *size_MatC];
-      D27C.f[DIR_000] = &DD27C[DIR_000*size_MatC];
-      D27C.f[DIR_PPP ] = &DD27C[DIR_PPP *size_MatC];
-      D27C.f[DIR_MMP ] = &DD27C[DIR_MMP *size_MatC];
-      D27C.f[DIR_PMP ] = &DD27C[DIR_PMP *size_MatC];
-      D27C.f[DIR_MPP ] = &DD27C[DIR_MPP *size_MatC];
-      D27C.f[DIR_PPM ] = &DD27C[DIR_PPM *size_MatC];
-      D27C.f[DIR_MMM ] = &DD27C[DIR_MMM *size_MatC];
-      D27C.f[DIR_PMM ] = &DD27C[DIR_PMM *size_MatC];
-      D27C.f[DIR_MPM ] = &DD27C[DIR_MPM *size_MatC];
+      D27C.f[DIR_P00] = &DD27C[DIR_P00 * numberOfLBnodesCoarse];
+      D27C.f[DIR_M00] = &DD27C[DIR_M00 * numberOfLBnodesCoarse];
+      D27C.f[DIR_0P0] = &DD27C[DIR_0P0 * numberOfLBnodesCoarse];
+      D27C.f[DIR_0M0] = &DD27C[DIR_0M0 * numberOfLBnodesCoarse];
+      D27C.f[DIR_00P] = &DD27C[DIR_00P * numberOfLBnodesCoarse];
+      D27C.f[DIR_00M] = &DD27C[DIR_00M * numberOfLBnodesCoarse];
+      D27C.f[DIR_PP0] = &DD27C[DIR_PP0 * numberOfLBnodesCoarse];
+      D27C.f[DIR_MM0] = &DD27C[DIR_MM0 * numberOfLBnodesCoarse];
+      D27C.f[DIR_PM0] = &DD27C[DIR_PM0 * numberOfLBnodesCoarse];
+      D27C.f[DIR_MP0] = &DD27C[DIR_MP0 * numberOfLBnodesCoarse];
+      D27C.f[DIR_P0P] = &DD27C[DIR_P0P * numberOfLBnodesCoarse];
+      D27C.f[DIR_M0M] = &DD27C[DIR_M0M * numberOfLBnodesCoarse];
+      D27C.f[DIR_P0M] = &DD27C[DIR_P0M * numberOfLBnodesCoarse];
+      D27C.f[DIR_M0P] = &DD27C[DIR_M0P * numberOfLBnodesCoarse];
+      D27C.f[DIR_0PP] = &DD27C[DIR_0PP * numberOfLBnodesCoarse];
+      D27C.f[DIR_0MM] = &DD27C[DIR_0MM * numberOfLBnodesCoarse];
+      D27C.f[DIR_0PM] = &DD27C[DIR_0PM * numberOfLBnodesCoarse];
+      D27C.f[DIR_0MP] = &DD27C[DIR_0MP * numberOfLBnodesCoarse];
+      D27C.f[DIR_000] = &DD27C[DIR_000 * numberOfLBnodesCoarse];
+      D27C.f[DIR_PPP] = &DD27C[DIR_PPP * numberOfLBnodesCoarse];
+      D27C.f[DIR_MMP] = &DD27C[DIR_MMP * numberOfLBnodesCoarse];
+      D27C.f[DIR_PMP] = &DD27C[DIR_PMP * numberOfLBnodesCoarse];
+      D27C.f[DIR_MPP] = &DD27C[DIR_MPP * numberOfLBnodesCoarse];
+      D27C.f[DIR_PPM] = &DD27C[DIR_PPM * numberOfLBnodesCoarse];
+      D27C.f[DIR_MMM] = &DD27C[DIR_MMM * numberOfLBnodesCoarse];
+      D27C.f[DIR_PMM] = &DD27C[DIR_PMM * numberOfLBnodesCoarse];
+      D27C.f[DIR_MPM] = &DD27C[DIR_MPM * numberOfLBnodesCoarse];
    }
    else
    {
-      D27C.f[DIR_M00   ] = &DD27C[DIR_P00   *size_MatC];
-      D27C.f[DIR_P00   ] = &DD27C[DIR_M00   *size_MatC];
-      D27C.f[DIR_0M0   ] = &DD27C[DIR_0P0   *size_MatC];
-      D27C.f[DIR_0P0   ] = &DD27C[DIR_0M0   *size_MatC];
-      D27C.f[DIR_00M   ] = &DD27C[DIR_00P   *size_MatC];
-      D27C.f[DIR_00P   ] = &DD27C[DIR_00M   *size_MatC];
-      D27C.f[DIR_MM0  ] = &DD27C[DIR_PP0  *size_MatC];
-      D27C.f[DIR_PP0  ] = &DD27C[DIR_MM0  *size_MatC];
-      D27C.f[DIR_MP0  ] = &DD27C[DIR_PM0  *size_MatC];
-      D27C.f[DIR_PM0  ] = &DD27C[DIR_MP0  *size_MatC];
-      D27C.f[DIR_M0M  ] = &DD27C[DIR_P0P  *size_MatC];
-      D27C.f[DIR_P0P  ] = &DD27C[DIR_M0M  *size_MatC];
-      D27C.f[DIR_M0P  ] = &DD27C[DIR_P0M  *size_MatC];
-      D27C.f[DIR_P0M  ] = &DD27C[DIR_M0P  *size_MatC];
-      D27C.f[DIR_0MM  ] = &DD27C[DIR_0PP  *size_MatC];
-      D27C.f[DIR_0PP  ] = &DD27C[DIR_0MM  *size_MatC];
-      D27C.f[DIR_0MP  ] = &DD27C[DIR_0PM  *size_MatC];
-      D27C.f[DIR_0PM  ] = &DD27C[DIR_0MP  *size_MatC];
-      D27C.f[DIR_000] = &DD27C[DIR_000*size_MatC];
-      D27C.f[DIR_MMM ] = &DD27C[DIR_PPP *size_MatC];
-      D27C.f[DIR_PPM ] = &DD27C[DIR_MMP *size_MatC];
-      D27C.f[DIR_MPM ] = &DD27C[DIR_PMP *size_MatC];
-      D27C.f[DIR_PMM ] = &DD27C[DIR_MPP *size_MatC];
-      D27C.f[DIR_MMP ] = &DD27C[DIR_PPM *size_MatC];
-      D27C.f[DIR_PPP ] = &DD27C[DIR_MMM *size_MatC];
-      D27C.f[DIR_MPP ] = &DD27C[DIR_PMM *size_MatC];
-      D27C.f[DIR_PMP ] = &DD27C[DIR_MPM *size_MatC];
+      D27C.f[DIR_M00] = &DD27C[DIR_P00 * numberOfLBnodesCoarse];
+      D27C.f[DIR_P00] = &DD27C[DIR_M00 * numberOfLBnodesCoarse];
+      D27C.f[DIR_0M0] = &DD27C[DIR_0P0 * numberOfLBnodesCoarse];
+      D27C.f[DIR_0P0] = &DD27C[DIR_0M0 * numberOfLBnodesCoarse];
+      D27C.f[DIR_00M] = &DD27C[DIR_00P * numberOfLBnodesCoarse];
+      D27C.f[DIR_00P] = &DD27C[DIR_00M * numberOfLBnodesCoarse];
+      D27C.f[DIR_MM0] = &DD27C[DIR_PP0 * numberOfLBnodesCoarse];
+      D27C.f[DIR_PP0] = &DD27C[DIR_MM0 * numberOfLBnodesCoarse];
+      D27C.f[DIR_MP0] = &DD27C[DIR_PM0 * numberOfLBnodesCoarse];
+      D27C.f[DIR_PM0] = &DD27C[DIR_MP0 * numberOfLBnodesCoarse];
+      D27C.f[DIR_M0M] = &DD27C[DIR_P0P * numberOfLBnodesCoarse];
+      D27C.f[DIR_P0P] = &DD27C[DIR_M0M * numberOfLBnodesCoarse];
+      D27C.f[DIR_M0P] = &DD27C[DIR_P0M * numberOfLBnodesCoarse];
+      D27C.f[DIR_P0M] = &DD27C[DIR_M0P * numberOfLBnodesCoarse];
+      D27C.f[DIR_0MM] = &DD27C[DIR_0PP * numberOfLBnodesCoarse];
+      D27C.f[DIR_0PP] = &DD27C[DIR_0MM * numberOfLBnodesCoarse];
+      D27C.f[DIR_0MP] = &DD27C[DIR_0PM * numberOfLBnodesCoarse];
+      D27C.f[DIR_0PM] = &DD27C[DIR_0MP * numberOfLBnodesCoarse];
+      D27C.f[DIR_000] = &DD27C[DIR_000 * numberOfLBnodesCoarse];
+      D27C.f[DIR_MMM] = &DD27C[DIR_PPP * numberOfLBnodesCoarse];
+      D27C.f[DIR_PPM] = &DD27C[DIR_MMP * numberOfLBnodesCoarse];
+      D27C.f[DIR_MPM] = &DD27C[DIR_PMP * numberOfLBnodesCoarse];
+      D27C.f[DIR_PMM] = &DD27C[DIR_MPP * numberOfLBnodesCoarse];
+      D27C.f[DIR_MMP] = &DD27C[DIR_PPM * numberOfLBnodesCoarse];
+      D27C.f[DIR_PPP] = &DD27C[DIR_MMM * numberOfLBnodesCoarse];
+      D27C.f[DIR_MPP] = &DD27C[DIR_PMM * numberOfLBnodesCoarse];
+      D27C.f[DIR_PMP] = &DD27C[DIR_MPM * numberOfLBnodesCoarse];
    }
 
    ////////////////////////////////////////////////////////////////////////////////
@@ -45892,33 +45892,33 @@ __global__ void scaleCFThS27(     real* DC,
       f_BSE  = fbseC[kbs];
       f_BNW  = fbnwC[kbw];
       //////////////////////////////////////////////////////////////////////////////////
-      f27E    =  (D27C.f[DIR_P00   ])[kzero];//ke
-      f27W    =  (D27C.f[DIR_M00   ])[kw   ];
-      f27N    =  (D27C.f[DIR_0P0   ])[kzero];//kn
-      f27S    =  (D27C.f[DIR_0M0   ])[ks   ];
-      f27T    =  (D27C.f[DIR_00P   ])[kzero];//kt
-      f27B    =  (D27C.f[DIR_00M   ])[kb   ];
-      f27NE   =  (D27C.f[DIR_PP0  ])[kzero];//kne
-      f27SW   =  (D27C.f[DIR_MM0  ])[ksw  ];
-      f27SE   =  (D27C.f[DIR_PM0  ])[ks   ];//kse
-      f27NW   =  (D27C.f[DIR_MP0  ])[kw   ];//knw
-      f27TE   =  (D27C.f[DIR_P0P  ])[kzero];//kte
-      f27BW   =  (D27C.f[DIR_M0M  ])[kbw  ];
-      f27BE   =  (D27C.f[DIR_P0M  ])[kb   ];//kbe
-      f27TW   =  (D27C.f[DIR_M0P  ])[kw   ];//ktw
-      f27TN   =  (D27C.f[DIR_0PP  ])[kzero];//ktn
-      f27BS   =  (D27C.f[DIR_0MM  ])[kbs  ];
-      f27BN   =  (D27C.f[DIR_0PM  ])[kb   ];//kbn
-      f27TS   =  (D27C.f[DIR_0MP  ])[ks   ];//kts
+      f27E    =  (D27C.f[DIR_P00])[kzero];//ke
+      f27W    =  (D27C.f[DIR_M00])[kw   ];
+      f27N    =  (D27C.f[DIR_0P0])[kzero];//kn
+      f27S    =  (D27C.f[DIR_0M0])[ks   ];
+      f27T    =  (D27C.f[DIR_00P])[kzero];//kt
+      f27B    =  (D27C.f[DIR_00M])[kb   ];
+      f27NE   =  (D27C.f[DIR_PP0])[kzero];//kne
+      f27SW   =  (D27C.f[DIR_MM0])[ksw  ];
+      f27SE   =  (D27C.f[DIR_PM0])[ks   ];//kse
+      f27NW   =  (D27C.f[DIR_MP0])[kw   ];//knw
+      f27TE   =  (D27C.f[DIR_P0P])[kzero];//kte
+      f27BW   =  (D27C.f[DIR_M0M])[kbw  ];
+      f27BE   =  (D27C.f[DIR_P0M])[kb   ];//kbe
+      f27TW   =  (D27C.f[DIR_M0P])[kw   ];//ktw
+      f27TN   =  (D27C.f[DIR_0PP])[kzero];//ktn
+      f27BS   =  (D27C.f[DIR_0MM])[kbs  ];
+      f27BN   =  (D27C.f[DIR_0PM])[kb   ];//kbn
+      f27TS   =  (D27C.f[DIR_0MP])[ks   ];//kts
       f27ZERO =  (D27C.f[DIR_000])[kzero];//kzero
-      f27TNE   = (D27C.f[DIR_PPP ])[kzero];//ktne
-      f27TSW   = (D27C.f[DIR_MMP ])[ksw  ];//ktsw
-      f27TSE   = (D27C.f[DIR_PMP ])[ks   ];//ktse
-      f27TNW   = (D27C.f[DIR_MPP ])[kw   ];//ktnw
-      f27BNE   = (D27C.f[DIR_PPM ])[kb   ];//kbne
-      f27BSW   = (D27C.f[DIR_MMM ])[kbsw ];
-      f27BSE   = (D27C.f[DIR_PMM ])[kbs  ];//kbse
-      f27BNW   = (D27C.f[DIR_MPM ])[kbw  ];//kbnw
+      f27TNE   = (D27C.f[DIR_PPP])[kzero];//ktne
+      f27TSW   = (D27C.f[DIR_MMP])[ksw  ];//ktsw
+      f27TSE   = (D27C.f[DIR_PMP])[ks   ];//ktse
+      f27TNW   = (D27C.f[DIR_MPP])[kw   ];//ktnw
+      f27BNE   = (D27C.f[DIR_PPM])[kb   ];//kbne
+      f27BSW   = (D27C.f[DIR_MMM])[kbsw ];
+      f27BSE   = (D27C.f[DIR_PMM])[kbs  ];//kbse
+      f27BNW   = (D27C.f[DIR_MPM])[kbw  ];//kbnw
 
       Conc_C_SWB = f27E + f27W + f27N + f27S + f27T + f27B + f27NE + f27SW + f27SE + f27NW + 
                    f27TE + f27BW + f27BE + f27TW + f27TN + f27BS + f27BN + f27TS + f27ZERO + 
@@ -45979,33 +45979,33 @@ __global__ void scaleCFThS27(     real* DC,
       f_BSE  = fbseC[kbs];
       f_BNW  = fbnwC[kbw];
       //////////////////////////////////////////////////////////////////////////////////
-      f27E    =  (D27C.f[DIR_P00   ])[kzero];//ke
-      f27W    =  (D27C.f[DIR_M00   ])[kw   ];
-      f27N    =  (D27C.f[DIR_0P0   ])[kzero];//kn
-      f27S    =  (D27C.f[DIR_0M0   ])[ks   ];
-      f27T    =  (D27C.f[DIR_00P   ])[kzero];//kt
-      f27B    =  (D27C.f[DIR_00M   ])[kb   ];
-      f27NE   =  (D27C.f[DIR_PP0  ])[kzero];//kne
-      f27SW   =  (D27C.f[DIR_MM0  ])[ksw  ];
-      f27SE   =  (D27C.f[DIR_PM0  ])[ks   ];//kse
-      f27NW   =  (D27C.f[DIR_MP0  ])[kw   ];//knw
-      f27TE   =  (D27C.f[DIR_P0P  ])[kzero];//kte
-      f27BW   =  (D27C.f[DIR_M0M  ])[kbw  ];
-      f27BE   =  (D27C.f[DIR_P0M  ])[kb   ];//kbe
-      f27TW   =  (D27C.f[DIR_M0P  ])[kw   ];//ktw
-      f27TN   =  (D27C.f[DIR_0PP  ])[kzero];//ktn
-      f27BS   =  (D27C.f[DIR_0MM  ])[kbs  ];
-      f27BN   =  (D27C.f[DIR_0PM  ])[kb   ];//kbn
-      f27TS   =  (D27C.f[DIR_0MP  ])[ks   ];//kts
+      f27E    =  (D27C.f[DIR_P00])[kzero];//ke
+      f27W    =  (D27C.f[DIR_M00])[kw   ];
+      f27N    =  (D27C.f[DIR_0P0])[kzero];//kn
+      f27S    =  (D27C.f[DIR_0M0])[ks   ];
+      f27T    =  (D27C.f[DIR_00P])[kzero];//kt
+      f27B    =  (D27C.f[DIR_00M])[kb   ];
+      f27NE   =  (D27C.f[DIR_PP0])[kzero];//kne
+      f27SW   =  (D27C.f[DIR_MM0])[ksw  ];
+      f27SE   =  (D27C.f[DIR_PM0])[ks   ];//kse
+      f27NW   =  (D27C.f[DIR_MP0])[kw   ];//knw
+      f27TE   =  (D27C.f[DIR_P0P])[kzero];//kte
+      f27BW   =  (D27C.f[DIR_M0M])[kbw  ];
+      f27BE   =  (D27C.f[DIR_P0M])[kb   ];//kbe
+      f27TW   =  (D27C.f[DIR_M0P])[kw   ];//ktw
+      f27TN   =  (D27C.f[DIR_0PP])[kzero];//ktn
+      f27BS   =  (D27C.f[DIR_0MM])[kbs  ];
+      f27BN   =  (D27C.f[DIR_0PM])[kb   ];//kbn
+      f27TS   =  (D27C.f[DIR_0MP])[ks   ];//kts
       f27ZERO =  (D27C.f[DIR_000])[kzero];//kzero
-      f27TNE   = (D27C.f[DIR_PPP ])[kzero];//ktne
-      f27TSW   = (D27C.f[DIR_MMP ])[ksw  ];//ktsw
-      f27TSE   = (D27C.f[DIR_PMP ])[ks   ];//ktse
-      f27TNW   = (D27C.f[DIR_MPP ])[kw   ];//ktnw
-      f27BNE   = (D27C.f[DIR_PPM ])[kb   ];//kbne
-      f27BSW   = (D27C.f[DIR_MMM ])[kbsw ];
-      f27BSE   = (D27C.f[DIR_PMM ])[kbs  ];//kbse
-      f27BNW   = (D27C.f[DIR_MPM ])[kbw  ];//kbnw
+      f27TNE   = (D27C.f[DIR_PPP])[kzero];//ktne
+      f27TSW   = (D27C.f[DIR_MMP])[ksw  ];//ktsw
+      f27TSE   = (D27C.f[DIR_PMP])[ks   ];//ktse
+      f27TNW   = (D27C.f[DIR_MPP])[kw   ];//ktnw
+      f27BNE   = (D27C.f[DIR_PPM])[kb   ];//kbne
+      f27BSW   = (D27C.f[DIR_MMM])[kbsw ];
+      f27BSE   = (D27C.f[DIR_PMM])[kbs  ];//kbse
+      f27BNW   = (D27C.f[DIR_MPM])[kbw  ];//kbnw
 
       Conc_C_SWT = f27E + f27W + f27N + f27S + f27T + f27B + f27NE + f27SW + f27SE + f27NW + 
                    f27TE + f27BW + f27BE + f27TW + f27TN + f27BS + f27BN + f27TS + f27ZERO + 
@@ -46066,33 +46066,33 @@ __global__ void scaleCFThS27(     real* DC,
       f_BSE  = fbseC[kbs];
       f_BNW  = fbnwC[kbw];
       //////////////////////////////////////////////////////////////////////////////////
-      f27E    =  (D27C.f[DIR_P00   ])[kzero];//ke
-      f27W    =  (D27C.f[DIR_M00   ])[kw   ];
-      f27N    =  (D27C.f[DIR_0P0   ])[kzero];//kn
-      f27S    =  (D27C.f[DIR_0M0   ])[ks   ];
-      f27T    =  (D27C.f[DIR_00P   ])[kzero];//kt
-      f27B    =  (D27C.f[DIR_00M   ])[kb   ];
-      f27NE   =  (D27C.f[DIR_PP0  ])[kzero];//kne
-      f27SW   =  (D27C.f[DIR_MM0  ])[ksw  ];
-      f27SE   =  (D27C.f[DIR_PM0  ])[ks   ];//kse
-      f27NW   =  (D27C.f[DIR_MP0  ])[kw   ];//knw
-      f27TE   =  (D27C.f[DIR_P0P  ])[kzero];//kte
-      f27BW   =  (D27C.f[DIR_M0M  ])[kbw  ];
-      f27BE   =  (D27C.f[DIR_P0M  ])[kb   ];//kbe
-      f27TW   =  (D27C.f[DIR_M0P  ])[kw   ];//ktw
-      f27TN   =  (D27C.f[DIR_0PP  ])[kzero];//ktn
-      f27BS   =  (D27C.f[DIR_0MM  ])[kbs  ];
-      f27BN   =  (D27C.f[DIR_0PM  ])[kb   ];//kbn
-      f27TS   =  (D27C.f[DIR_0MP  ])[ks   ];//kts
+      f27E    =  (D27C.f[DIR_P00])[kzero];//ke
+      f27W    =  (D27C.f[DIR_M00])[kw   ];
+      f27N    =  (D27C.f[DIR_0P0])[kzero];//kn
+      f27S    =  (D27C.f[DIR_0M0])[ks   ];
+      f27T    =  (D27C.f[DIR_00P])[kzero];//kt
+      f27B    =  (D27C.f[DIR_00M])[kb   ];
+      f27NE   =  (D27C.f[DIR_PP0])[kzero];//kne
+      f27SW   =  (D27C.f[DIR_MM0])[ksw  ];
+      f27SE   =  (D27C.f[DIR_PM0])[ks   ];//kse
+      f27NW   =  (D27C.f[DIR_MP0])[kw   ];//knw
+      f27TE   =  (D27C.f[DIR_P0P])[kzero];//kte
+      f27BW   =  (D27C.f[DIR_M0M])[kbw  ];
+      f27BE   =  (D27C.f[DIR_P0M])[kb   ];//kbe
+      f27TW   =  (D27C.f[DIR_M0P])[kw   ];//ktw
+      f27TN   =  (D27C.f[DIR_0PP])[kzero];//ktn
+      f27BS   =  (D27C.f[DIR_0MM])[kbs  ];
+      f27BN   =  (D27C.f[DIR_0PM])[kb   ];//kbn
+      f27TS   =  (D27C.f[DIR_0MP])[ks   ];//kts
       f27ZERO =  (D27C.f[DIR_000])[kzero];//kzero
-      f27TNE   = (D27C.f[DIR_PPP ])[kzero];//ktne
-      f27TSW   = (D27C.f[DIR_MMP ])[ksw  ];//ktsw
-      f27TSE   = (D27C.f[DIR_PMP ])[ks   ];//ktse
-      f27TNW   = (D27C.f[DIR_MPP ])[kw   ];//ktnw
-      f27BNE   = (D27C.f[DIR_PPM ])[kb   ];//kbne
-      f27BSW   = (D27C.f[DIR_MMM ])[kbsw ];
-      f27BSE   = (D27C.f[DIR_PMM ])[kbs  ];//kbse
-      f27BNW   = (D27C.f[DIR_MPM ])[kbw  ];//kbnw
+      f27TNE   = (D27C.f[DIR_PPP])[kzero];//ktne
+      f27TSW   = (D27C.f[DIR_MMP])[ksw  ];//ktsw
+      f27TSE   = (D27C.f[DIR_PMP])[ks   ];//ktse
+      f27TNW   = (D27C.f[DIR_MPP])[kw   ];//ktnw
+      f27BNE   = (D27C.f[DIR_PPM])[kb   ];//kbne
+      f27BSW   = (D27C.f[DIR_MMM])[kbsw ];
+      f27BSE   = (D27C.f[DIR_PMM])[kbs  ];//kbse
+      f27BNW   = (D27C.f[DIR_MPM])[kbw  ];//kbnw
 
       Conc_C_SET = f27E + f27W + f27N + f27S + f27T + f27B + f27NE + f27SW + f27SE + f27NW + 
                    f27TE + f27BW + f27BE + f27TW + f27TN + f27BS + f27BN + f27TS + f27ZERO + 
@@ -46153,33 +46153,33 @@ __global__ void scaleCFThS27(     real* DC,
       f_BSE  = fbseC[kbs];
       f_BNW  = fbnwC[kbw];
       //////////////////////////////////////////////////////////////////////////////////
-      f27E    =  (D27C.f[DIR_P00   ])[kzero];//ke
-      f27W    =  (D27C.f[DIR_M00   ])[kw   ];
-      f27N    =  (D27C.f[DIR_0P0   ])[kzero];//kn
-      f27S    =  (D27C.f[DIR_0M0   ])[ks   ];
-      f27T    =  (D27C.f[DIR_00P   ])[kzero];//kt
-      f27B    =  (D27C.f[DIR_00M   ])[kb   ];
-      f27NE   =  (D27C.f[DIR_PP0  ])[kzero];//kne
-      f27SW   =  (D27C.f[DIR_MM0  ])[ksw  ];
-      f27SE   =  (D27C.f[DIR_PM0  ])[ks   ];//kse
-      f27NW   =  (D27C.f[DIR_MP0  ])[kw   ];//knw
-      f27TE   =  (D27C.f[DIR_P0P  ])[kzero];//kte
-      f27BW   =  (D27C.f[DIR_M0M  ])[kbw  ];
-      f27BE   =  (D27C.f[DIR_P0M  ])[kb   ];//kbe
-      f27TW   =  (D27C.f[DIR_M0P  ])[kw   ];//ktw
-      f27TN   =  (D27C.f[DIR_0PP  ])[kzero];//ktn
-      f27BS   =  (D27C.f[DIR_0MM  ])[kbs  ];
-      f27BN   =  (D27C.f[DIR_0PM  ])[kb   ];//kbn
-      f27TS   =  (D27C.f[DIR_0MP  ])[ks   ];//kts
+      f27E    =  (D27C.f[DIR_P00])[kzero];//ke
+      f27W    =  (D27C.f[DIR_M00])[kw   ];
+      f27N    =  (D27C.f[DIR_0P0])[kzero];//kn
+      f27S    =  (D27C.f[DIR_0M0])[ks   ];
+      f27T    =  (D27C.f[DIR_00P])[kzero];//kt
+      f27B    =  (D27C.f[DIR_00M])[kb   ];
+      f27NE   =  (D27C.f[DIR_PP0])[kzero];//kne
+      f27SW   =  (D27C.f[DIR_MM0])[ksw  ];
+      f27SE   =  (D27C.f[DIR_PM0])[ks   ];//kse
+      f27NW   =  (D27C.f[DIR_MP0])[kw   ];//knw
+      f27TE   =  (D27C.f[DIR_P0P])[kzero];//kte
+      f27BW   =  (D27C.f[DIR_M0M])[kbw  ];
+      f27BE   =  (D27C.f[DIR_P0M])[kb   ];//kbe
+      f27TW   =  (D27C.f[DIR_M0P])[kw   ];//ktw
+      f27TN   =  (D27C.f[DIR_0PP])[kzero];//ktn
+      f27BS   =  (D27C.f[DIR_0MM])[kbs  ];
+      f27BN   =  (D27C.f[DIR_0PM])[kb   ];//kbn
+      f27TS   =  (D27C.f[DIR_0MP])[ks   ];//kts
       f27ZERO =  (D27C.f[DIR_000])[kzero];//kzero
-      f27TNE   = (D27C.f[DIR_PPP ])[kzero];//ktne
-      f27TSW   = (D27C.f[DIR_MMP ])[ksw  ];//ktsw
-      f27TSE   = (D27C.f[DIR_PMP ])[ks   ];//ktse
-      f27TNW   = (D27C.f[DIR_MPP ])[kw   ];//ktnw
-      f27BNE   = (D27C.f[DIR_PPM ])[kb   ];//kbne
-      f27BSW   = (D27C.f[DIR_MMM ])[kbsw ];
-      f27BSE   = (D27C.f[DIR_PMM ])[kbs  ];//kbse
-      f27BNW   = (D27C.f[DIR_MPM ])[kbw  ];//kbnw
+      f27TNE   = (D27C.f[DIR_PPP])[kzero];//ktne
+      f27TSW   = (D27C.f[DIR_MMP])[ksw  ];//ktsw
+      f27TSE   = (D27C.f[DIR_PMP])[ks   ];//ktse
+      f27TNW   = (D27C.f[DIR_MPP])[kw   ];//ktnw
+      f27BNE   = (D27C.f[DIR_PPM])[kb   ];//kbne
+      f27BSW   = (D27C.f[DIR_MMM])[kbsw ];
+      f27BSE   = (D27C.f[DIR_PMM])[kbs  ];//kbse
+      f27BNW   = (D27C.f[DIR_MPM])[kbw  ];//kbnw
 
       Conc_C_SEB = f27E + f27W + f27N + f27S + f27T + f27B + f27NE + f27SW + f27SE + f27NW + 
                    f27TE + f27BW + f27BE + f27TW + f27TN + f27BS + f27BN + f27TS + f27ZERO + 
@@ -46250,33 +46250,33 @@ __global__ void scaleCFThS27(     real* DC,
       f_BSE  = fbseC[kbs];
       f_BNW  = fbnwC[kbw];
       //////////////////////////////////////////////////////////////////////////////////
-      f27E    =  (D27C.f[DIR_P00   ])[kzero];//ke
-      f27W    =  (D27C.f[DIR_M00   ])[kw   ];
-      f27N    =  (D27C.f[DIR_0P0   ])[kzero];//kn
-      f27S    =  (D27C.f[DIR_0M0   ])[ks   ];
-      f27T    =  (D27C.f[DIR_00P   ])[kzero];//kt
-      f27B    =  (D27C.f[DIR_00M   ])[kb   ];
-      f27NE   =  (D27C.f[DIR_PP0  ])[kzero];//kne
-      f27SW   =  (D27C.f[DIR_MM0  ])[ksw  ];
-      f27SE   =  (D27C.f[DIR_PM0  ])[ks   ];//kse
-      f27NW   =  (D27C.f[DIR_MP0  ])[kw   ];//knw
-      f27TE   =  (D27C.f[DIR_P0P  ])[kzero];//kte
-      f27BW   =  (D27C.f[DIR_M0M  ])[kbw  ];
-      f27BE   =  (D27C.f[DIR_P0M  ])[kb   ];//kbe
-      f27TW   =  (D27C.f[DIR_M0P  ])[kw   ];//ktw
-      f27TN   =  (D27C.f[DIR_0PP  ])[kzero];//ktn
-      f27BS   =  (D27C.f[DIR_0MM  ])[kbs  ];
-      f27BN   =  (D27C.f[DIR_0PM  ])[kb   ];//kbn
-      f27TS   =  (D27C.f[DIR_0MP  ])[ks   ];//kts
+      f27E    =  (D27C.f[DIR_P00])[kzero];//ke
+      f27W    =  (D27C.f[DIR_M00])[kw   ];
+      f27N    =  (D27C.f[DIR_0P0])[kzero];//kn
+      f27S    =  (D27C.f[DIR_0M0])[ks   ];
+      f27T    =  (D27C.f[DIR_00P])[kzero];//kt
+      f27B    =  (D27C.f[DIR_00M])[kb   ];
+      f27NE   =  (D27C.f[DIR_PP0])[kzero];//kne
+      f27SW   =  (D27C.f[DIR_MM0])[ksw  ];
+      f27SE   =  (D27C.f[DIR_PM0])[ks   ];//kse
+      f27NW   =  (D27C.f[DIR_MP0])[kw   ];//knw
+      f27TE   =  (D27C.f[DIR_P0P])[kzero];//kte
+      f27BW   =  (D27C.f[DIR_M0M])[kbw  ];
+      f27BE   =  (D27C.f[DIR_P0M])[kb   ];//kbe
+      f27TW   =  (D27C.f[DIR_M0P])[kw   ];//ktw
+      f27TN   =  (D27C.f[DIR_0PP])[kzero];//ktn
+      f27BS   =  (D27C.f[DIR_0MM])[kbs  ];
+      f27BN   =  (D27C.f[DIR_0PM])[kb   ];//kbn
+      f27TS   =  (D27C.f[DIR_0MP])[ks   ];//kts
       f27ZERO =  (D27C.f[DIR_000])[kzero];//kzero
-      f27TNE   = (D27C.f[DIR_PPP ])[kzero];//ktne
-      f27TSW   = (D27C.f[DIR_MMP ])[ksw  ];//ktsw
-      f27TSE   = (D27C.f[DIR_PMP ])[ks   ];//ktse
-      f27TNW   = (D27C.f[DIR_MPP ])[kw   ];//ktnw
-      f27BNE   = (D27C.f[DIR_PPM ])[kb   ];//kbne
-      f27BSW   = (D27C.f[DIR_MMM ])[kbsw ];
-      f27BSE   = (D27C.f[DIR_PMM ])[kbs  ];//kbse
-      f27BNW   = (D27C.f[DIR_MPM ])[kbw  ];//kbnw
+      f27TNE   = (D27C.f[DIR_PPP])[kzero];//ktne
+      f27TSW   = (D27C.f[DIR_MMP])[ksw  ];//ktsw
+      f27TSE   = (D27C.f[DIR_PMP])[ks   ];//ktse
+      f27TNW   = (D27C.f[DIR_MPP])[kw   ];//ktnw
+      f27BNE   = (D27C.f[DIR_PPM])[kb   ];//kbne
+      f27BSW   = (D27C.f[DIR_MMM])[kbsw ];
+      f27BSE   = (D27C.f[DIR_PMM])[kbs  ];//kbse
+      f27BNW   = (D27C.f[DIR_MPM])[kbw  ];//kbnw
 
       Conc_C_NWB = f27E + f27W + f27N + f27S + f27T + f27B + f27NE + f27SW + f27SE + f27NW + 
                    f27TE + f27BW + f27BE + f27TW + f27TN + f27BS + f27BN + f27TS + f27ZERO + 
@@ -46337,33 +46337,33 @@ __global__ void scaleCFThS27(     real* DC,
       f_BSE  = fbseC[kbs];
       f_BNW  = fbnwC[kbw];
       //////////////////////////////////////////////////////////////////////////////////
-      f27E    =  (D27C.f[DIR_P00   ])[kzero];//ke
-      f27W    =  (D27C.f[DIR_M00   ])[kw   ];
-      f27N    =  (D27C.f[DIR_0P0   ])[kzero];//kn
-      f27S    =  (D27C.f[DIR_0M0   ])[ks   ];
-      f27T    =  (D27C.f[DIR_00P   ])[kzero];//kt
-      f27B    =  (D27C.f[DIR_00M   ])[kb   ];
-      f27NE   =  (D27C.f[DIR_PP0  ])[kzero];//kne
-      f27SW   =  (D27C.f[DIR_MM0  ])[ksw  ];
-      f27SE   =  (D27C.f[DIR_PM0  ])[ks   ];//kse
-      f27NW   =  (D27C.f[DIR_MP0  ])[kw   ];//knw
-      f27TE   =  (D27C.f[DIR_P0P  ])[kzero];//kte
-      f27BW   =  (D27C.f[DIR_M0M  ])[kbw  ];
-      f27BE   =  (D27C.f[DIR_P0M  ])[kb   ];//kbe
-      f27TW   =  (D27C.f[DIR_M0P  ])[kw   ];//ktw
-      f27TN   =  (D27C.f[DIR_0PP  ])[kzero];//ktn
-      f27BS   =  (D27C.f[DIR_0MM  ])[kbs  ];
-      f27BN   =  (D27C.f[DIR_0PM  ])[kb   ];//kbn
-      f27TS   =  (D27C.f[DIR_0MP  ])[ks   ];//kts
+      f27E    =  (D27C.f[DIR_P00])[kzero];//ke
+      f27W    =  (D27C.f[DIR_M00])[kw   ];
+      f27N    =  (D27C.f[DIR_0P0])[kzero];//kn
+      f27S    =  (D27C.f[DIR_0M0])[ks   ];
+      f27T    =  (D27C.f[DIR_00P])[kzero];//kt
+      f27B    =  (D27C.f[DIR_00M])[kb   ];
+      f27NE   =  (D27C.f[DIR_PP0])[kzero];//kne
+      f27SW   =  (D27C.f[DIR_MM0])[ksw  ];
+      f27SE   =  (D27C.f[DIR_PM0])[ks   ];//kse
+      f27NW   =  (D27C.f[DIR_MP0])[kw   ];//knw
+      f27TE   =  (D27C.f[DIR_P0P])[kzero];//kte
+      f27BW   =  (D27C.f[DIR_M0M])[kbw  ];
+      f27BE   =  (D27C.f[DIR_P0M])[kb   ];//kbe
+      f27TW   =  (D27C.f[DIR_M0P])[kw   ];//ktw
+      f27TN   =  (D27C.f[DIR_0PP])[kzero];//ktn
+      f27BS   =  (D27C.f[DIR_0MM])[kbs  ];
+      f27BN   =  (D27C.f[DIR_0PM])[kb   ];//kbn
+      f27TS   =  (D27C.f[DIR_0MP])[ks   ];//kts
       f27ZERO =  (D27C.f[DIR_000])[kzero];//kzero
-      f27TNE   = (D27C.f[DIR_PPP ])[kzero];//ktne
-      f27TSW   = (D27C.f[DIR_MMP ])[ksw  ];//ktsw
-      f27TSE   = (D27C.f[DIR_PMP ])[ks   ];//ktse
-      f27TNW   = (D27C.f[DIR_MPP ])[kw   ];//ktnw
-      f27BNE   = (D27C.f[DIR_PPM ])[kb   ];//kbne
-      f27BSW   = (D27C.f[DIR_MMM ])[kbsw ];
-      f27BSE   = (D27C.f[DIR_PMM ])[kbs  ];//kbse
-      f27BNW   = (D27C.f[DIR_MPM ])[kbw  ];//kbnw
+      f27TNE   = (D27C.f[DIR_PPP])[kzero];//ktne
+      f27TSW   = (D27C.f[DIR_MMP])[ksw  ];//ktsw
+      f27TSE   = (D27C.f[DIR_PMP])[ks   ];//ktse
+      f27TNW   = (D27C.f[DIR_MPP])[kw   ];//ktnw
+      f27BNE   = (D27C.f[DIR_PPM])[kb   ];//kbne
+      f27BSW   = (D27C.f[DIR_MMM])[kbsw ];
+      f27BSE   = (D27C.f[DIR_PMM])[kbs  ];//kbse
+      f27BNW   = (D27C.f[DIR_MPM])[kbw  ];//kbnw
 
       Conc_C_NWT = f27E + f27W + f27N + f27S + f27T + f27B + f27NE + f27SW + f27SE + f27NW + 
                    f27TE + f27BW + f27BE + f27TW + f27TN + f27BS + f27BN + f27TS + f27ZERO + 
@@ -46424,33 +46424,33 @@ __global__ void scaleCFThS27(     real* DC,
       f_BSE  = fbseC[kbs];
       f_BNW  = fbnwC[kbw];
       //////////////////////////////////////////////////////////////////////////////////
-      f27E    =  (D27C.f[DIR_P00   ])[kzero];//ke
-      f27W    =  (D27C.f[DIR_M00   ])[kw   ];
-      f27N    =  (D27C.f[DIR_0P0   ])[kzero];//kn
-      f27S    =  (D27C.f[DIR_0M0   ])[ks   ];
-      f27T    =  (D27C.f[DIR_00P   ])[kzero];//kt
-      f27B    =  (D27C.f[DIR_00M   ])[kb   ];
-      f27NE   =  (D27C.f[DIR_PP0  ])[kzero];//kne
-      f27SW   =  (D27C.f[DIR_MM0  ])[ksw  ];
-      f27SE   =  (D27C.f[DIR_PM0  ])[ks   ];//kse
-      f27NW   =  (D27C.f[DIR_MP0  ])[kw   ];//knw
-      f27TE   =  (D27C.f[DIR_P0P  ])[kzero];//kte
-      f27BW   =  (D27C.f[DIR_M0M  ])[kbw  ];
-      f27BE   =  (D27C.f[DIR_P0M  ])[kb   ];//kbe
-      f27TW   =  (D27C.f[DIR_M0P  ])[kw   ];//ktw
-      f27TN   =  (D27C.f[DIR_0PP  ])[kzero];//ktn
-      f27BS   =  (D27C.f[DIR_0MM  ])[kbs  ];
-      f27BN   =  (D27C.f[DIR_0PM  ])[kb   ];//kbn
-      f27TS   =  (D27C.f[DIR_0MP  ])[ks   ];//kts
+      f27E    =  (D27C.f[DIR_P00])[kzero];//ke
+      f27W    =  (D27C.f[DIR_M00])[kw   ];
+      f27N    =  (D27C.f[DIR_0P0])[kzero];//kn
+      f27S    =  (D27C.f[DIR_0M0])[ks   ];
+      f27T    =  (D27C.f[DIR_00P])[kzero];//kt
+      f27B    =  (D27C.f[DIR_00M])[kb   ];
+      f27NE   =  (D27C.f[DIR_PP0])[kzero];//kne
+      f27SW   =  (D27C.f[DIR_MM0])[ksw  ];
+      f27SE   =  (D27C.f[DIR_PM0])[ks   ];//kse
+      f27NW   =  (D27C.f[DIR_MP0])[kw   ];//knw
+      f27TE   =  (D27C.f[DIR_P0P])[kzero];//kte
+      f27BW   =  (D27C.f[DIR_M0M])[kbw  ];
+      f27BE   =  (D27C.f[DIR_P0M])[kb   ];//kbe
+      f27TW   =  (D27C.f[DIR_M0P])[kw   ];//ktw
+      f27TN   =  (D27C.f[DIR_0PP])[kzero];//ktn
+      f27BS   =  (D27C.f[DIR_0MM])[kbs  ];
+      f27BN   =  (D27C.f[DIR_0PM])[kb   ];//kbn
+      f27TS   =  (D27C.f[DIR_0MP])[ks   ];//kts
       f27ZERO =  (D27C.f[DIR_000])[kzero];//kzero
-      f27TNE   = (D27C.f[DIR_PPP ])[kzero];//ktne
-      f27TSW   = (D27C.f[DIR_MMP ])[ksw  ];//ktsw
-      f27TSE   = (D27C.f[DIR_PMP ])[ks   ];//ktse
-      f27TNW   = (D27C.f[DIR_MPP ])[kw   ];//ktnw
-      f27BNE   = (D27C.f[DIR_PPM ])[kb   ];//kbne
-      f27BSW   = (D27C.f[DIR_MMM ])[kbsw ];
-      f27BSE   = (D27C.f[DIR_PMM ])[kbs  ];//kbse
-      f27BNW   = (D27C.f[DIR_MPM ])[kbw  ];//kbnw
+      f27TNE   = (D27C.f[DIR_PPP])[kzero];//ktne
+      f27TSW   = (D27C.f[DIR_MMP])[ksw  ];//ktsw
+      f27TSE   = (D27C.f[DIR_PMP])[ks   ];//ktse
+      f27TNW   = (D27C.f[DIR_MPP])[kw   ];//ktnw
+      f27BNE   = (D27C.f[DIR_PPM])[kb   ];//kbne
+      f27BSW   = (D27C.f[DIR_MMM])[kbsw ];
+      f27BSE   = (D27C.f[DIR_PMM])[kbs  ];//kbse
+      f27BNW   = (D27C.f[DIR_MPM])[kbw  ];//kbnw
 
       Conc_C_NET = f27E + f27W + f27N + f27S + f27T + f27B + f27NE + f27SW + f27SE + f27NW + 
                    f27TE + f27BW + f27BE + f27TW + f27TN + f27BS + f27BN + f27TS + f27ZERO + 
@@ -46511,33 +46511,33 @@ __global__ void scaleCFThS27(     real* DC,
       f_BSE  = fbseC[kbs];
       f_BNW  = fbnwC[kbw];
       ////////////////////////////////////////////////////////////////////////////////
-      f27E    =  (D27C.f[DIR_P00   ])[kzero];//ke
-      f27W    =  (D27C.f[DIR_M00   ])[kw   ];
-      f27N    =  (D27C.f[DIR_0P0   ])[kzero];//kn
-      f27S    =  (D27C.f[DIR_0M0   ])[ks   ];
-      f27T    =  (D27C.f[DIR_00P   ])[kzero];//kt
-      f27B    =  (D27C.f[DIR_00M   ])[kb   ];
-      f27NE   =  (D27C.f[DIR_PP0  ])[kzero];//kne
-      f27SW   =  (D27C.f[DIR_MM0  ])[ksw  ];
-      f27SE   =  (D27C.f[DIR_PM0  ])[ks   ];//kse
-      f27NW   =  (D27C.f[DIR_MP0  ])[kw   ];//knw
-      f27TE   =  (D27C.f[DIR_P0P  ])[kzero];//kte
-      f27BW   =  (D27C.f[DIR_M0M  ])[kbw  ];
-      f27BE   =  (D27C.f[DIR_P0M  ])[kb   ];//kbe
-      f27TW   =  (D27C.f[DIR_M0P  ])[kw   ];//ktw
-      f27TN   =  (D27C.f[DIR_0PP  ])[kzero];//ktn
-      f27BS   =  (D27C.f[DIR_0MM  ])[kbs  ];
-      f27BN   =  (D27C.f[DIR_0PM  ])[kb   ];//kbn
-      f27TS   =  (D27C.f[DIR_0MP  ])[ks   ];//kts
+      f27E    =  (D27C.f[DIR_P00])[kzero];//ke
+      f27W    =  (D27C.f[DIR_M00])[kw   ];
+      f27N    =  (D27C.f[DIR_0P0])[kzero];//kn
+      f27S    =  (D27C.f[DIR_0M0])[ks   ];
+      f27T    =  (D27C.f[DIR_00P])[kzero];//kt
+      f27B    =  (D27C.f[DIR_00M])[kb   ];
+      f27NE   =  (D27C.f[DIR_PP0])[kzero];//kne
+      f27SW   =  (D27C.f[DIR_MM0])[ksw  ];
+      f27SE   =  (D27C.f[DIR_PM0])[ks   ];//kse
+      f27NW   =  (D27C.f[DIR_MP0])[kw   ];//knw
+      f27TE   =  (D27C.f[DIR_P0P])[kzero];//kte
+      f27BW   =  (D27C.f[DIR_M0M])[kbw  ];
+      f27BE   =  (D27C.f[DIR_P0M])[kb   ];//kbe
+      f27TW   =  (D27C.f[DIR_M0P])[kw   ];//ktw
+      f27TN   =  (D27C.f[DIR_0PP])[kzero];//ktn
+      f27BS   =  (D27C.f[DIR_0MM])[kbs  ];
+      f27BN   =  (D27C.f[DIR_0PM])[kb   ];//kbn
+      f27TS   =  (D27C.f[DIR_0MP])[ks   ];//kts
       f27ZERO =  (D27C.f[DIR_000])[kzero];//kzero
-      f27TNE   = (D27C.f[DIR_PPP ])[kzero];//ktne
-      f27TSW   = (D27C.f[DIR_MMP ])[ksw  ];//ktsw
-      f27TSE   = (D27C.f[DIR_PMP ])[ks   ];//ktse
-      f27TNW   = (D27C.f[DIR_MPP ])[kw   ];//ktnw
-      f27BNE   = (D27C.f[DIR_PPM ])[kb   ];//kbne
-      f27BSW   = (D27C.f[DIR_MMM ])[kbsw ];
-      f27BSE   = (D27C.f[DIR_PMM ])[kbs  ];//kbse
-      f27BNW   = (D27C.f[DIR_MPM ])[kbw  ];//kbnw
+      f27TNE   = (D27C.f[DIR_PPP])[kzero];//ktne
+      f27TSW   = (D27C.f[DIR_MMP])[ksw  ];//ktsw
+      f27TSE   = (D27C.f[DIR_PMP])[ks   ];//ktse
+      f27TNW   = (D27C.f[DIR_MPP])[kw   ];//ktnw
+      f27BNE   = (D27C.f[DIR_PPM])[kb   ];//kbne
+      f27BSW   = (D27C.f[DIR_MMM])[kbsw ];
+      f27BSE   = (D27C.f[DIR_PMM])[kbs  ];//kbse
+      f27BNW   = (D27C.f[DIR_MPM])[kbw  ];//kbnw
 
       Conc_C_NEB = f27E + f27W + f27N + f27S + f27T + f27B + f27NE + f27SW + f27SE + f27NW + 
                    f27TE + f27BW + f27BE + f27TW + f27TN + f27BS + f27BN + f27TS + f27ZERO + 
@@ -46656,32 +46656,32 @@ __global__ void scaleCFThS27(     real* DC,
       cu_sq=c3o2*(vx1*vx1+vx2*vx2+vx3*vx3);
 
       (D27F.f[DIR_000])[kzero] =   c8o27* Conc_F*(c1o1-cu_sq);
-      (D27F.f[DIR_P00   ])[kzero] =   c2o27* (c3o1*( Mx        )+Conc_F*(c1o1+c9o2*( vx1        )*( vx1        )-cu_sq));
-      (D27F.f[DIR_M00   ])[kw   ] =   c2o27* (c3o1*(-Mx        )+Conc_F*(c1o1+c9o2*(-vx1        )*(-vx1        )-cu_sq));
-      (D27F.f[DIR_0P0   ])[kzero] =   c2o27* (c3o1*(     My    )+Conc_F*(c1o1+c9o2*(     vx2    )*(     vx2    )-cu_sq));
-      (D27F.f[DIR_0M0   ])[ks   ] =   c2o27* (c3o1*(    -My    )+Conc_F*(c1o1+c9o2*(    -vx2    )*(    -vx2    )-cu_sq));
-      (D27F.f[DIR_00P   ])[kzero] =   c2o27* (c3o1*(         Mz)+Conc_F*(c1o1+c9o2*(         vx3)*(         vx3)-cu_sq));
-      (D27F.f[DIR_00M   ])[kb   ] =   c2o27* (c3o1*(        -Mz)+Conc_F*(c1o1+c9o2*(        -vx3)*(        -vx3)-cu_sq));
-      (D27F.f[DIR_PP0  ])[kzero] =   c1o54* (c3o1*( Mx +My    )+Conc_F*(c1o1+c9o2*( vx1+vx2    )*( vx1+vx2    )-cu_sq));
-      (D27F.f[DIR_MM0  ])[ksw  ] =   c1o54* (c3o1*(-Mx -My    )+Conc_F*(c1o1+c9o2*(-vx1-vx2    )*(-vx1-vx2    )-cu_sq));
-      (D27F.f[DIR_PM0  ])[ks   ] =   c1o54* (c3o1*( Mx -My    )+Conc_F*(c1o1+c9o2*( vx1-vx2    )*( vx1-vx2    )-cu_sq));
-      (D27F.f[DIR_MP0  ])[kw   ] =   c1o54* (c3o1*(-Mx +My    )+Conc_F*(c1o1+c9o2*(-vx1+vx2    )*(-vx1+vx2    )-cu_sq));
-      (D27F.f[DIR_P0P  ])[kzero] =   c1o54* (c3o1*( Mx     +Mz)+Conc_F*(c1o1+c9o2*( vx1    +vx3)*( vx1    +vx3)-cu_sq));
-      (D27F.f[DIR_M0M  ])[kbw  ] =   c1o54* (c3o1*(-Mx     -Mz)+Conc_F*(c1o1+c9o2*(-vx1    -vx3)*(-vx1    -vx3)-cu_sq));
-      (D27F.f[DIR_P0M  ])[kb   ] =   c1o54* (c3o1*( Mx     -Mz)+Conc_F*(c1o1+c9o2*( vx1    -vx3)*( vx1    -vx3)-cu_sq));
-      (D27F.f[DIR_M0P  ])[kw   ] =   c1o54* (c3o1*(-Mx     +Mz)+Conc_F*(c1o1+c9o2*(-vx1    +vx3)*(-vx1    +vx3)-cu_sq));
-      (D27F.f[DIR_0PP  ])[kzero] =   c1o54* (c3o1*(     My +Mz)+Conc_F*(c1o1+c9o2*(     vx2+vx3)*(     vx2+vx3)-cu_sq));
-      (D27F.f[DIR_0MM  ])[kbs  ] =   c1o54* (c3o1*(    -My -Mz)+Conc_F*(c1o1+c9o2*(    -vx2-vx3)*(    -vx2-vx3)-cu_sq));
-      (D27F.f[DIR_0PM  ])[kb   ] =   c1o54* (c3o1*(     My -Mz)+Conc_F*(c1o1+c9o2*(     vx2-vx3)*(     vx2-vx3)-cu_sq));
-      (D27F.f[DIR_0MP  ])[ks   ] =   c1o54* (c3o1*(    -My +Mz)+Conc_F*(c1o1+c9o2*(    -vx2+vx3)*(    -vx2+vx3)-cu_sq));
-      (D27F.f[DIR_PPP ])[kzero] =   c1o216*(c3o1*( Mx +My +Mz)+Conc_F*(c1o1+c9o2*( vx1+vx2+vx3)*( vx1+vx2+vx3)-cu_sq));
-      (D27F.f[DIR_MMM ])[kbsw ] =   c1o216*(c3o1*(-Mx -My -Mz)+Conc_F*(c1o1+c9o2*(-vx1-vx2-vx3)*(-vx1-vx2-vx3)-cu_sq));
-      (D27F.f[DIR_PPM ])[kb   ] =   c1o216*(c3o1*( Mx +My -Mz)+Conc_F*(c1o1+c9o2*( vx1+vx2-vx3)*( vx1+vx2-vx3)-cu_sq));
-      (D27F.f[DIR_MMP ])[ksw  ] =   c1o216*(c3o1*(-Mx -My +Mz)+Conc_F*(c1o1+c9o2*(-vx1-vx2+vx3)*(-vx1-vx2+vx3)-cu_sq));
-      (D27F.f[DIR_PMP ])[ks   ] =   c1o216*(c3o1*( Mx -My +Mz)+Conc_F*(c1o1+c9o2*( vx1-vx2+vx3)*( vx1-vx2+vx3)-cu_sq));
-      (D27F.f[DIR_MPM ])[kbw  ] =   c1o216*(c3o1*(-Mx +My -Mz)+Conc_F*(c1o1+c9o2*(-vx1+vx2-vx3)*(-vx1+vx2-vx3)-cu_sq));
-      (D27F.f[DIR_PMM ])[kbs  ] =   c1o216*(c3o1*( Mx -My -Mz)+Conc_F*(c1o1+c9o2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cu_sq));
-      (D27F.f[DIR_MPP ])[kw   ] =   c1o216*(c3o1*(-Mx +My +Mz)+Conc_F*(c1o1+c9o2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cu_sq));
+      (D27F.f[DIR_P00])[kzero] =   c2o27* (c3o1*( Mx        )+Conc_F*(c1o1+c9o2*( vx1        )*( vx1        )-cu_sq));
+      (D27F.f[DIR_M00])[kw   ] =   c2o27* (c3o1*(-Mx        )+Conc_F*(c1o1+c9o2*(-vx1        )*(-vx1        )-cu_sq));
+      (D27F.f[DIR_0P0])[kzero] =   c2o27* (c3o1*(     My    )+Conc_F*(c1o1+c9o2*(     vx2    )*(     vx2    )-cu_sq));
+      (D27F.f[DIR_0M0])[ks   ] =   c2o27* (c3o1*(    -My    )+Conc_F*(c1o1+c9o2*(    -vx2    )*(    -vx2    )-cu_sq));
+      (D27F.f[DIR_00P])[kzero] =   c2o27* (c3o1*(         Mz)+Conc_F*(c1o1+c9o2*(         vx3)*(         vx3)-cu_sq));
+      (D27F.f[DIR_00M])[kb   ] =   c2o27* (c3o1*(        -Mz)+Conc_F*(c1o1+c9o2*(        -vx3)*(        -vx3)-cu_sq));
+      (D27F.f[DIR_PP0])[kzero] =   c1o54* (c3o1*( Mx +My    )+Conc_F*(c1o1+c9o2*( vx1+vx2    )*( vx1+vx2    )-cu_sq));
+      (D27F.f[DIR_MM0])[ksw  ] =   c1o54* (c3o1*(-Mx -My    )+Conc_F*(c1o1+c9o2*(-vx1-vx2    )*(-vx1-vx2    )-cu_sq));
+      (D27F.f[DIR_PM0])[ks   ] =   c1o54* (c3o1*( Mx -My    )+Conc_F*(c1o1+c9o2*( vx1-vx2    )*( vx1-vx2    )-cu_sq));
+      (D27F.f[DIR_MP0])[kw   ] =   c1o54* (c3o1*(-Mx +My    )+Conc_F*(c1o1+c9o2*(-vx1+vx2    )*(-vx1+vx2    )-cu_sq));
+      (D27F.f[DIR_P0P])[kzero] =   c1o54* (c3o1*( Mx     +Mz)+Conc_F*(c1o1+c9o2*( vx1    +vx3)*( vx1    +vx3)-cu_sq));
+      (D27F.f[DIR_M0M])[kbw  ] =   c1o54* (c3o1*(-Mx     -Mz)+Conc_F*(c1o1+c9o2*(-vx1    -vx3)*(-vx1    -vx3)-cu_sq));
+      (D27F.f[DIR_P0M])[kb   ] =   c1o54* (c3o1*( Mx     -Mz)+Conc_F*(c1o1+c9o2*( vx1    -vx3)*( vx1    -vx3)-cu_sq));
+      (D27F.f[DIR_M0P])[kw   ] =   c1o54* (c3o1*(-Mx     +Mz)+Conc_F*(c1o1+c9o2*(-vx1    +vx3)*(-vx1    +vx3)-cu_sq));
+      (D27F.f[DIR_0PP])[kzero] =   c1o54* (c3o1*(     My +Mz)+Conc_F*(c1o1+c9o2*(     vx2+vx3)*(     vx2+vx3)-cu_sq));
+      (D27F.f[DIR_0MM])[kbs  ] =   c1o54* (c3o1*(    -My -Mz)+Conc_F*(c1o1+c9o2*(    -vx2-vx3)*(    -vx2-vx3)-cu_sq));
+      (D27F.f[DIR_0PM])[kb   ] =   c1o54* (c3o1*(     My -Mz)+Conc_F*(c1o1+c9o2*(     vx2-vx3)*(     vx2-vx3)-cu_sq));
+      (D27F.f[DIR_0MP])[ks   ] =   c1o54* (c3o1*(    -My +Mz)+Conc_F*(c1o1+c9o2*(    -vx2+vx3)*(    -vx2+vx3)-cu_sq));
+      (D27F.f[DIR_PPP])[kzero] =   c1o216*(c3o1*( Mx +My +Mz)+Conc_F*(c1o1+c9o2*( vx1+vx2+vx3)*( vx1+vx2+vx3)-cu_sq));
+      (D27F.f[DIR_MMM])[kbsw ] =   c1o216*(c3o1*(-Mx -My -Mz)+Conc_F*(c1o1+c9o2*(-vx1-vx2-vx3)*(-vx1-vx2-vx3)-cu_sq));
+      (D27F.f[DIR_PPM])[kb   ] =   c1o216*(c3o1*( Mx +My -Mz)+Conc_F*(c1o1+c9o2*( vx1+vx2-vx3)*( vx1+vx2-vx3)-cu_sq));
+      (D27F.f[DIR_MMP])[ksw  ] =   c1o216*(c3o1*(-Mx -My +Mz)+Conc_F*(c1o1+c9o2*(-vx1-vx2+vx3)*(-vx1-vx2+vx3)-cu_sq));
+      (D27F.f[DIR_PMP])[ks   ] =   c1o216*(c3o1*( Mx -My +Mz)+Conc_F*(c1o1+c9o2*( vx1-vx2+vx3)*( vx1-vx2+vx3)-cu_sq));
+      (D27F.f[DIR_MPM])[kbw  ] =   c1o216*(c3o1*(-Mx +My -Mz)+Conc_F*(c1o1+c9o2*(-vx1+vx2-vx3)*(-vx1+vx2-vx3)-cu_sq));
+      (D27F.f[DIR_PMM])[kbs  ] =   c1o216*(c3o1*( Mx -My -Mz)+Conc_F*(c1o1+c9o2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cu_sq));
+      (D27F.f[DIR_MPP])[kw   ] =   c1o216*(c3o1*(-Mx +My +Mz)+Conc_F*(c1o1+c9o2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cu_sq));
 
 
 
@@ -46734,32 +46734,32 @@ __global__ void scaleCFThS27(     real* DC,
       cu_sq=c3o2*(vx1*vx1+vx2*vx2+vx3*vx3);
 
       (D27F.f[DIR_000])[kzero] =   c8o27* Conc_F*(c1o1-cu_sq);
-      (D27F.f[DIR_P00   ])[kzero] =   c2o27* (c3o1*( Mx        )+Conc_F*(c1o1+c9o2*( vx1        )*( vx1        )-cu_sq));
-      (D27F.f[DIR_M00   ])[kw   ] =   c2o27* (c3o1*(-Mx        )+Conc_F*(c1o1+c9o2*(-vx1        )*(-vx1        )-cu_sq));
-      (D27F.f[DIR_0P0   ])[kzero] =   c2o27* (c3o1*(     My    )+Conc_F*(c1o1+c9o2*(     vx2    )*(     vx2    )-cu_sq));
-      (D27F.f[DIR_0M0   ])[ks   ] =   c2o27* (c3o1*(    -My    )+Conc_F*(c1o1+c9o2*(    -vx2    )*(    -vx2    )-cu_sq));
-      (D27F.f[DIR_00P   ])[kzero] =   c2o27* (c3o1*(         Mz)+Conc_F*(c1o1+c9o2*(         vx3)*(         vx3)-cu_sq));
-      (D27F.f[DIR_00M   ])[kb   ] =   c2o27* (c3o1*(        -Mz)+Conc_F*(c1o1+c9o2*(        -vx3)*(        -vx3)-cu_sq));
-      (D27F.f[DIR_PP0  ])[kzero] =   c1o54* (c3o1*( Mx +My    )+Conc_F*(c1o1+c9o2*( vx1+vx2    )*( vx1+vx2    )-cu_sq));
-      (D27F.f[DIR_MM0  ])[ksw  ] =   c1o54* (c3o1*(-Mx -My    )+Conc_F*(c1o1+c9o2*(-vx1-vx2    )*(-vx1-vx2    )-cu_sq));
-      (D27F.f[DIR_PM0  ])[ks   ] =   c1o54* (c3o1*( Mx -My    )+Conc_F*(c1o1+c9o2*( vx1-vx2    )*( vx1-vx2    )-cu_sq));
-      (D27F.f[DIR_MP0  ])[kw   ] =   c1o54* (c3o1*(-Mx +My    )+Conc_F*(c1o1+c9o2*(-vx1+vx2    )*(-vx1+vx2    )-cu_sq));
-      (D27F.f[DIR_P0P  ])[kzero] =   c1o54* (c3o1*( Mx     +Mz)+Conc_F*(c1o1+c9o2*( vx1    +vx3)*( vx1    +vx3)-cu_sq));
-      (D27F.f[DIR_M0M  ])[kbw  ] =   c1o54* (c3o1*(-Mx     -Mz)+Conc_F*(c1o1+c9o2*(-vx1    -vx3)*(-vx1    -vx3)-cu_sq));
-      (D27F.f[DIR_P0M  ])[kb   ] =   c1o54* (c3o1*( Mx     -Mz)+Conc_F*(c1o1+c9o2*( vx1    -vx3)*( vx1    -vx3)-cu_sq));
-      (D27F.f[DIR_M0P  ])[kw   ] =   c1o54* (c3o1*(-Mx     +Mz)+Conc_F*(c1o1+c9o2*(-vx1    +vx3)*(-vx1    +vx3)-cu_sq));
-      (D27F.f[DIR_0PP  ])[kzero] =   c1o54* (c3o1*(     My +Mz)+Conc_F*(c1o1+c9o2*(     vx2+vx3)*(     vx2+vx3)-cu_sq));
-      (D27F.f[DIR_0MM  ])[kbs  ] =   c1o54* (c3o1*(    -My -Mz)+Conc_F*(c1o1+c9o2*(    -vx2-vx3)*(    -vx2-vx3)-cu_sq));
-      (D27F.f[DIR_0PM  ])[kb   ] =   c1o54* (c3o1*(     My -Mz)+Conc_F*(c1o1+c9o2*(     vx2-vx3)*(     vx2-vx3)-cu_sq));
-      (D27F.f[DIR_0MP  ])[ks   ] =   c1o54* (c3o1*(    -My +Mz)+Conc_F*(c1o1+c9o2*(    -vx2+vx3)*(    -vx2+vx3)-cu_sq));
-      (D27F.f[DIR_PPP ])[kzero] =   c1o216*(c3o1*( Mx +My +Mz)+Conc_F*(c1o1+c9o2*( vx1+vx2+vx3)*( vx1+vx2+vx3)-cu_sq));
-      (D27F.f[DIR_MMM ])[kbsw ] =   c1o216*(c3o1*(-Mx -My -Mz)+Conc_F*(c1o1+c9o2*(-vx1-vx2-vx3)*(-vx1-vx2-vx3)-cu_sq));
-      (D27F.f[DIR_PPM ])[kb   ] =   c1o216*(c3o1*( Mx +My -Mz)+Conc_F*(c1o1+c9o2*( vx1+vx2-vx3)*( vx1+vx2-vx3)-cu_sq));
-      (D27F.f[DIR_MMP ])[ksw  ] =   c1o216*(c3o1*(-Mx -My +Mz)+Conc_F*(c1o1+c9o2*(-vx1-vx2+vx3)*(-vx1-vx2+vx3)-cu_sq));
-      (D27F.f[DIR_PMP ])[ks   ] =   c1o216*(c3o1*( Mx -My +Mz)+Conc_F*(c1o1+c9o2*( vx1-vx2+vx3)*( vx1-vx2+vx3)-cu_sq));
-      (D27F.f[DIR_MPM ])[kbw  ] =   c1o216*(c3o1*(-Mx +My -Mz)+Conc_F*(c1o1+c9o2*(-vx1+vx2-vx3)*(-vx1+vx2-vx3)-cu_sq));
-      (D27F.f[DIR_PMM ])[kbs  ] =   c1o216*(c3o1*( Mx -My -Mz)+Conc_F*(c1o1+c9o2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cu_sq));
-      (D27F.f[DIR_MPP ])[kw   ] =   c1o216*(c3o1*(-Mx +My +Mz)+Conc_F*(c1o1+c9o2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cu_sq));
+      (D27F.f[DIR_P00])[kzero] =   c2o27* (c3o1*( Mx        )+Conc_F*(c1o1+c9o2*( vx1        )*( vx1        )-cu_sq));
+      (D27F.f[DIR_M00])[kw   ] =   c2o27* (c3o1*(-Mx        )+Conc_F*(c1o1+c9o2*(-vx1        )*(-vx1        )-cu_sq));
+      (D27F.f[DIR_0P0])[kzero] =   c2o27* (c3o1*(     My    )+Conc_F*(c1o1+c9o2*(     vx2    )*(     vx2    )-cu_sq));
+      (D27F.f[DIR_0M0])[ks   ] =   c2o27* (c3o1*(    -My    )+Conc_F*(c1o1+c9o2*(    -vx2    )*(    -vx2    )-cu_sq));
+      (D27F.f[DIR_00P])[kzero] =   c2o27* (c3o1*(         Mz)+Conc_F*(c1o1+c9o2*(         vx3)*(         vx3)-cu_sq));
+      (D27F.f[DIR_00M])[kb   ] =   c2o27* (c3o1*(        -Mz)+Conc_F*(c1o1+c9o2*(        -vx3)*(        -vx3)-cu_sq));
+      (D27F.f[DIR_PP0])[kzero] =   c1o54* (c3o1*( Mx +My    )+Conc_F*(c1o1+c9o2*( vx1+vx2    )*( vx1+vx2    )-cu_sq));
+      (D27F.f[DIR_MM0])[ksw  ] =   c1o54* (c3o1*(-Mx -My    )+Conc_F*(c1o1+c9o2*(-vx1-vx2    )*(-vx1-vx2    )-cu_sq));
+      (D27F.f[DIR_PM0])[ks   ] =   c1o54* (c3o1*( Mx -My    )+Conc_F*(c1o1+c9o2*( vx1-vx2    )*( vx1-vx2    )-cu_sq));
+      (D27F.f[DIR_MP0])[kw   ] =   c1o54* (c3o1*(-Mx +My    )+Conc_F*(c1o1+c9o2*(-vx1+vx2    )*(-vx1+vx2    )-cu_sq));
+      (D27F.f[DIR_P0P])[kzero] =   c1o54* (c3o1*( Mx     +Mz)+Conc_F*(c1o1+c9o2*( vx1    +vx3)*( vx1    +vx3)-cu_sq));
+      (D27F.f[DIR_M0M])[kbw  ] =   c1o54* (c3o1*(-Mx     -Mz)+Conc_F*(c1o1+c9o2*(-vx1    -vx3)*(-vx1    -vx3)-cu_sq));
+      (D27F.f[DIR_P0M])[kb   ] =   c1o54* (c3o1*( Mx     -Mz)+Conc_F*(c1o1+c9o2*( vx1    -vx3)*( vx1    -vx3)-cu_sq));
+      (D27F.f[DIR_M0P])[kw   ] =   c1o54* (c3o1*(-Mx     +Mz)+Conc_F*(c1o1+c9o2*(-vx1    +vx3)*(-vx1    +vx3)-cu_sq));
+      (D27F.f[DIR_0PP])[kzero] =   c1o54* (c3o1*(     My +Mz)+Conc_F*(c1o1+c9o2*(     vx2+vx3)*(     vx2+vx3)-cu_sq));
+      (D27F.f[DIR_0MM])[kbs  ] =   c1o54* (c3o1*(    -My -Mz)+Conc_F*(c1o1+c9o2*(    -vx2-vx3)*(    -vx2-vx3)-cu_sq));
+      (D27F.f[DIR_0PM])[kb   ] =   c1o54* (c3o1*(     My -Mz)+Conc_F*(c1o1+c9o2*(     vx2-vx3)*(     vx2-vx3)-cu_sq));
+      (D27F.f[DIR_0MP])[ks   ] =   c1o54* (c3o1*(    -My +Mz)+Conc_F*(c1o1+c9o2*(    -vx2+vx3)*(    -vx2+vx3)-cu_sq));
+      (D27F.f[DIR_PPP])[kzero] =   c1o216*(c3o1*( Mx +My +Mz)+Conc_F*(c1o1+c9o2*( vx1+vx2+vx3)*( vx1+vx2+vx3)-cu_sq));
+      (D27F.f[DIR_MMM])[kbsw ] =   c1o216*(c3o1*(-Mx -My -Mz)+Conc_F*(c1o1+c9o2*(-vx1-vx2-vx3)*(-vx1-vx2-vx3)-cu_sq));
+      (D27F.f[DIR_PPM])[kb   ] =   c1o216*(c3o1*( Mx +My -Mz)+Conc_F*(c1o1+c9o2*( vx1+vx2-vx3)*( vx1+vx2-vx3)-cu_sq));
+      (D27F.f[DIR_MMP])[ksw  ] =   c1o216*(c3o1*(-Mx -My +Mz)+Conc_F*(c1o1+c9o2*(-vx1-vx2+vx3)*(-vx1-vx2+vx3)-cu_sq));
+      (D27F.f[DIR_PMP])[ks   ] =   c1o216*(c3o1*( Mx -My +Mz)+Conc_F*(c1o1+c9o2*( vx1-vx2+vx3)*( vx1-vx2+vx3)-cu_sq));
+      (D27F.f[DIR_MPM])[kbw  ] =   c1o216*(c3o1*(-Mx +My -Mz)+Conc_F*(c1o1+c9o2*(-vx1+vx2-vx3)*(-vx1+vx2-vx3)-cu_sq));
+      (D27F.f[DIR_PMM])[kbs  ] =   c1o216*(c3o1*( Mx -My -Mz)+Conc_F*(c1o1+c9o2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cu_sq));
+      (D27F.f[DIR_MPP])[kw   ] =   c1o216*(c3o1*(-Mx +My +Mz)+Conc_F*(c1o1+c9o2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cu_sq));
 
 
 
@@ -46812,32 +46812,32 @@ __global__ void scaleCFThS27(     real* DC,
       cu_sq=c3o2*(vx1*vx1+vx2*vx2+vx3*vx3);
 
       (D27F.f[DIR_000])[kzero] =   c8o27* Conc_F*(c1o1-cu_sq);
-      (D27F.f[DIR_P00   ])[kzero] =   c2o27* (c3o1*( Mx        )+Conc_F*(c1o1+c9o2*( vx1        )*( vx1        )-cu_sq));
-      (D27F.f[DIR_M00   ])[kw   ] =   c2o27* (c3o1*(-Mx        )+Conc_F*(c1o1+c9o2*(-vx1        )*(-vx1        )-cu_sq));
-      (D27F.f[DIR_0P0   ])[kzero] =   c2o27* (c3o1*(     My    )+Conc_F*(c1o1+c9o2*(     vx2    )*(     vx2    )-cu_sq));
-      (D27F.f[DIR_0M0   ])[ks   ] =   c2o27* (c3o1*(    -My    )+Conc_F*(c1o1+c9o2*(    -vx2    )*(    -vx2    )-cu_sq));
-      (D27F.f[DIR_00P   ])[kzero] =   c2o27* (c3o1*(         Mz)+Conc_F*(c1o1+c9o2*(         vx3)*(         vx3)-cu_sq));
-      (D27F.f[DIR_00M   ])[kb   ] =   c2o27* (c3o1*(        -Mz)+Conc_F*(c1o1+c9o2*(        -vx3)*(        -vx3)-cu_sq));
-      (D27F.f[DIR_PP0  ])[kzero] =   c1o54* (c3o1*( Mx +My    )+Conc_F*(c1o1+c9o2*( vx1+vx2    )*( vx1+vx2    )-cu_sq));
-      (D27F.f[DIR_MM0  ])[ksw  ] =   c1o54* (c3o1*(-Mx -My    )+Conc_F*(c1o1+c9o2*(-vx1-vx2    )*(-vx1-vx2    )-cu_sq));
-      (D27F.f[DIR_PM0  ])[ks   ] =   c1o54* (c3o1*( Mx -My    )+Conc_F*(c1o1+c9o2*( vx1-vx2    )*( vx1-vx2    )-cu_sq));
-      (D27F.f[DIR_MP0  ])[kw   ] =   c1o54* (c3o1*(-Mx +My    )+Conc_F*(c1o1+c9o2*(-vx1+vx2    )*(-vx1+vx2    )-cu_sq));
-      (D27F.f[DIR_P0P  ])[kzero] =   c1o54* (c3o1*( Mx     +Mz)+Conc_F*(c1o1+c9o2*( vx1    +vx3)*( vx1    +vx3)-cu_sq));
-      (D27F.f[DIR_M0M  ])[kbw  ] =   c1o54* (c3o1*(-Mx     -Mz)+Conc_F*(c1o1+c9o2*(-vx1    -vx3)*(-vx1    -vx3)-cu_sq));
-      (D27F.f[DIR_P0M  ])[kb   ] =   c1o54* (c3o1*( Mx     -Mz)+Conc_F*(c1o1+c9o2*( vx1    -vx3)*( vx1    -vx3)-cu_sq));
-      (D27F.f[DIR_M0P  ])[kw   ] =   c1o54* (c3o1*(-Mx     +Mz)+Conc_F*(c1o1+c9o2*(-vx1    +vx3)*(-vx1    +vx3)-cu_sq));
-      (D27F.f[DIR_0PP  ])[kzero] =   c1o54* (c3o1*(     My +Mz)+Conc_F*(c1o1+c9o2*(     vx2+vx3)*(     vx2+vx3)-cu_sq));
-      (D27F.f[DIR_0MM  ])[kbs  ] =   c1o54* (c3o1*(    -My -Mz)+Conc_F*(c1o1+c9o2*(    -vx2-vx3)*(    -vx2-vx3)-cu_sq));
-      (D27F.f[DIR_0PM  ])[kb   ] =   c1o54* (c3o1*(     My -Mz)+Conc_F*(c1o1+c9o2*(     vx2-vx3)*(     vx2-vx3)-cu_sq));
-      (D27F.f[DIR_0MP  ])[ks   ] =   c1o54* (c3o1*(    -My +Mz)+Conc_F*(c1o1+c9o2*(    -vx2+vx3)*(    -vx2+vx3)-cu_sq));
-      (D27F.f[DIR_PPP ])[kzero] =   c1o216*(c3o1*( Mx +My +Mz)+Conc_F*(c1o1+c9o2*( vx1+vx2+vx3)*( vx1+vx2+vx3)-cu_sq));
-      (D27F.f[DIR_MMM ])[kbsw ] =   c1o216*(c3o1*(-Mx -My -Mz)+Conc_F*(c1o1+c9o2*(-vx1-vx2-vx3)*(-vx1-vx2-vx3)-cu_sq));
-      (D27F.f[DIR_PPM ])[kb   ] =   c1o216*(c3o1*( Mx +My -Mz)+Conc_F*(c1o1+c9o2*( vx1+vx2-vx3)*( vx1+vx2-vx3)-cu_sq));
-      (D27F.f[DIR_MMP ])[ksw  ] =   c1o216*(c3o1*(-Mx -My +Mz)+Conc_F*(c1o1+c9o2*(-vx1-vx2+vx3)*(-vx1-vx2+vx3)-cu_sq));
-      (D27F.f[DIR_PMP ])[ks   ] =   c1o216*(c3o1*( Mx -My +Mz)+Conc_F*(c1o1+c9o2*( vx1-vx2+vx3)*( vx1-vx2+vx3)-cu_sq));
-      (D27F.f[DIR_MPM ])[kbw  ] =   c1o216*(c3o1*(-Mx +My -Mz)+Conc_F*(c1o1+c9o2*(-vx1+vx2-vx3)*(-vx1+vx2-vx3)-cu_sq));
-      (D27F.f[DIR_PMM ])[kbs  ] =   c1o216*(c3o1*( Mx -My -Mz)+Conc_F*(c1o1+c9o2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cu_sq));
-      (D27F.f[DIR_MPP ])[kw   ] =   c1o216*(c3o1*(-Mx +My +Mz)+Conc_F*(c1o1+c9o2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cu_sq));
+      (D27F.f[DIR_P00])[kzero] =   c2o27* (c3o1*( Mx        )+Conc_F*(c1o1+c9o2*( vx1        )*( vx1        )-cu_sq));
+      (D27F.f[DIR_M00])[kw   ] =   c2o27* (c3o1*(-Mx        )+Conc_F*(c1o1+c9o2*(-vx1        )*(-vx1        )-cu_sq));
+      (D27F.f[DIR_0P0])[kzero] =   c2o27* (c3o1*(     My    )+Conc_F*(c1o1+c9o2*(     vx2    )*(     vx2    )-cu_sq));
+      (D27F.f[DIR_0M0])[ks   ] =   c2o27* (c3o1*(    -My    )+Conc_F*(c1o1+c9o2*(    -vx2    )*(    -vx2    )-cu_sq));
+      (D27F.f[DIR_00P])[kzero] =   c2o27* (c3o1*(         Mz)+Conc_F*(c1o1+c9o2*(         vx3)*(         vx3)-cu_sq));
+      (D27F.f[DIR_00M])[kb   ] =   c2o27* (c3o1*(        -Mz)+Conc_F*(c1o1+c9o2*(        -vx3)*(        -vx3)-cu_sq));
+      (D27F.f[DIR_PP0])[kzero] =   c1o54* (c3o1*( Mx +My    )+Conc_F*(c1o1+c9o2*( vx1+vx2    )*( vx1+vx2    )-cu_sq));
+      (D27F.f[DIR_MM0])[ksw  ] =   c1o54* (c3o1*(-Mx -My    )+Conc_F*(c1o1+c9o2*(-vx1-vx2    )*(-vx1-vx2    )-cu_sq));
+      (D27F.f[DIR_PM0])[ks   ] =   c1o54* (c3o1*( Mx -My    )+Conc_F*(c1o1+c9o2*( vx1-vx2    )*( vx1-vx2    )-cu_sq));
+      (D27F.f[DIR_MP0])[kw   ] =   c1o54* (c3o1*(-Mx +My    )+Conc_F*(c1o1+c9o2*(-vx1+vx2    )*(-vx1+vx2    )-cu_sq));
+      (D27F.f[DIR_P0P])[kzero] =   c1o54* (c3o1*( Mx     +Mz)+Conc_F*(c1o1+c9o2*( vx1    +vx3)*( vx1    +vx3)-cu_sq));
+      (D27F.f[DIR_M0M])[kbw  ] =   c1o54* (c3o1*(-Mx     -Mz)+Conc_F*(c1o1+c9o2*(-vx1    -vx3)*(-vx1    -vx3)-cu_sq));
+      (D27F.f[DIR_P0M])[kb   ] =   c1o54* (c3o1*( Mx     -Mz)+Conc_F*(c1o1+c9o2*( vx1    -vx3)*( vx1    -vx3)-cu_sq));
+      (D27F.f[DIR_M0P])[kw   ] =   c1o54* (c3o1*(-Mx     +Mz)+Conc_F*(c1o1+c9o2*(-vx1    +vx3)*(-vx1    +vx3)-cu_sq));
+      (D27F.f[DIR_0PP])[kzero] =   c1o54* (c3o1*(     My +Mz)+Conc_F*(c1o1+c9o2*(     vx2+vx3)*(     vx2+vx3)-cu_sq));
+      (D27F.f[DIR_0MM])[kbs  ] =   c1o54* (c3o1*(    -My -Mz)+Conc_F*(c1o1+c9o2*(    -vx2-vx3)*(    -vx2-vx3)-cu_sq));
+      (D27F.f[DIR_0PM])[kb   ] =   c1o54* (c3o1*(     My -Mz)+Conc_F*(c1o1+c9o2*(     vx2-vx3)*(     vx2-vx3)-cu_sq));
+      (D27F.f[DIR_0MP])[ks   ] =   c1o54* (c3o1*(    -My +Mz)+Conc_F*(c1o1+c9o2*(    -vx2+vx3)*(    -vx2+vx3)-cu_sq));
+      (D27F.f[DIR_PPP])[kzero] =   c1o216*(c3o1*( Mx +My +Mz)+Conc_F*(c1o1+c9o2*( vx1+vx2+vx3)*( vx1+vx2+vx3)-cu_sq));
+      (D27F.f[DIR_MMM])[kbsw ] =   c1o216*(c3o1*(-Mx -My -Mz)+Conc_F*(c1o1+c9o2*(-vx1-vx2-vx3)*(-vx1-vx2-vx3)-cu_sq));
+      (D27F.f[DIR_PPM])[kb   ] =   c1o216*(c3o1*( Mx +My -Mz)+Conc_F*(c1o1+c9o2*( vx1+vx2-vx3)*( vx1+vx2-vx3)-cu_sq));
+      (D27F.f[DIR_MMP])[ksw  ] =   c1o216*(c3o1*(-Mx -My +Mz)+Conc_F*(c1o1+c9o2*(-vx1-vx2+vx3)*(-vx1-vx2+vx3)-cu_sq));
+      (D27F.f[DIR_PMP])[ks   ] =   c1o216*(c3o1*( Mx -My +Mz)+Conc_F*(c1o1+c9o2*( vx1-vx2+vx3)*( vx1-vx2+vx3)-cu_sq));
+      (D27F.f[DIR_MPM])[kbw  ] =   c1o216*(c3o1*(-Mx +My -Mz)+Conc_F*(c1o1+c9o2*(-vx1+vx2-vx3)*(-vx1+vx2-vx3)-cu_sq));
+      (D27F.f[DIR_PMM])[kbs  ] =   c1o216*(c3o1*( Mx -My -Mz)+Conc_F*(c1o1+c9o2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cu_sq));
+      (D27F.f[DIR_MPP])[kw   ] =   c1o216*(c3o1*(-Mx +My +Mz)+Conc_F*(c1o1+c9o2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cu_sq));
 
 
 
@@ -46890,32 +46890,32 @@ __global__ void scaleCFThS27(     real* DC,
       cu_sq=c3o2*(vx1*vx1+vx2*vx2+vx3*vx3);
 
       (D27F.f[DIR_000])[kzero] =   c8o27* Conc_F*(c1o1-cu_sq);
-      (D27F.f[DIR_P00   ])[kzero] =   c2o27* (c3o1*( Mx        )+Conc_F*(c1o1+c9o2*( vx1        )*( vx1        )-cu_sq));
-      (D27F.f[DIR_M00   ])[kw   ] =   c2o27* (c3o1*(-Mx        )+Conc_F*(c1o1+c9o2*(-vx1        )*(-vx1        )-cu_sq));
-      (D27F.f[DIR_0P0   ])[kzero] =   c2o27* (c3o1*(     My    )+Conc_F*(c1o1+c9o2*(     vx2    )*(     vx2    )-cu_sq));
-      (D27F.f[DIR_0M0   ])[ks   ] =   c2o27* (c3o1*(    -My    )+Conc_F*(c1o1+c9o2*(    -vx2    )*(    -vx2    )-cu_sq));
-      (D27F.f[DIR_00P   ])[kzero] =   c2o27* (c3o1*(         Mz)+Conc_F*(c1o1+c9o2*(         vx3)*(         vx3)-cu_sq));
-      (D27F.f[DIR_00M   ])[kb   ] =   c2o27* (c3o1*(        -Mz)+Conc_F*(c1o1+c9o2*(        -vx3)*(        -vx3)-cu_sq));
-      (D27F.f[DIR_PP0  ])[kzero] =   c1o54* (c3o1*( Mx +My    )+Conc_F*(c1o1+c9o2*( vx1+vx2    )*( vx1+vx2    )-cu_sq));
-      (D27F.f[DIR_MM0  ])[ksw  ] =   c1o54* (c3o1*(-Mx -My    )+Conc_F*(c1o1+c9o2*(-vx1-vx2    )*(-vx1-vx2    )-cu_sq));
-      (D27F.f[DIR_PM0  ])[ks   ] =   c1o54* (c3o1*( Mx -My    )+Conc_F*(c1o1+c9o2*( vx1-vx2    )*( vx1-vx2    )-cu_sq));
-      (D27F.f[DIR_MP0  ])[kw   ] =   c1o54* (c3o1*(-Mx +My    )+Conc_F*(c1o1+c9o2*(-vx1+vx2    )*(-vx1+vx2    )-cu_sq));
-      (D27F.f[DIR_P0P  ])[kzero] =   c1o54* (c3o1*( Mx     +Mz)+Conc_F*(c1o1+c9o2*( vx1    +vx3)*( vx1    +vx3)-cu_sq));
-      (D27F.f[DIR_M0M  ])[kbw  ] =   c1o54* (c3o1*(-Mx     -Mz)+Conc_F*(c1o1+c9o2*(-vx1    -vx3)*(-vx1    -vx3)-cu_sq));
-      (D27F.f[DIR_P0M  ])[kb   ] =   c1o54* (c3o1*( Mx     -Mz)+Conc_F*(c1o1+c9o2*( vx1    -vx3)*( vx1    -vx3)-cu_sq));
-      (D27F.f[DIR_M0P  ])[kw   ] =   c1o54* (c3o1*(-Mx     +Mz)+Conc_F*(c1o1+c9o2*(-vx1    +vx3)*(-vx1    +vx3)-cu_sq));
-      (D27F.f[DIR_0PP  ])[kzero] =   c1o54* (c3o1*(     My +Mz)+Conc_F*(c1o1+c9o2*(     vx2+vx3)*(     vx2+vx3)-cu_sq));
-      (D27F.f[DIR_0MM  ])[kbs  ] =   c1o54* (c3o1*(    -My -Mz)+Conc_F*(c1o1+c9o2*(    -vx2-vx3)*(    -vx2-vx3)-cu_sq));
-      (D27F.f[DIR_0PM  ])[kb   ] =   c1o54* (c3o1*(     My -Mz)+Conc_F*(c1o1+c9o2*(     vx2-vx3)*(     vx2-vx3)-cu_sq));
-      (D27F.f[DIR_0MP  ])[ks   ] =   c1o54* (c3o1*(    -My +Mz)+Conc_F*(c1o1+c9o2*(    -vx2+vx3)*(    -vx2+vx3)-cu_sq));
-      (D27F.f[DIR_PPP ])[kzero] =   c1o216*(c3o1*( Mx +My +Mz)+Conc_F*(c1o1+c9o2*( vx1+vx2+vx3)*( vx1+vx2+vx3)-cu_sq));
-      (D27F.f[DIR_MMM ])[kbsw ] =   c1o216*(c3o1*(-Mx -My -Mz)+Conc_F*(c1o1+c9o2*(-vx1-vx2-vx3)*(-vx1-vx2-vx3)-cu_sq));
-      (D27F.f[DIR_PPM ])[kb   ] =   c1o216*(c3o1*( Mx +My -Mz)+Conc_F*(c1o1+c9o2*( vx1+vx2-vx3)*( vx1+vx2-vx3)-cu_sq));
-      (D27F.f[DIR_MMP ])[ksw  ] =   c1o216*(c3o1*(-Mx -My +Mz)+Conc_F*(c1o1+c9o2*(-vx1-vx2+vx3)*(-vx1-vx2+vx3)-cu_sq));
-      (D27F.f[DIR_PMP ])[ks   ] =   c1o216*(c3o1*( Mx -My +Mz)+Conc_F*(c1o1+c9o2*( vx1-vx2+vx3)*( vx1-vx2+vx3)-cu_sq));
-      (D27F.f[DIR_MPM ])[kbw  ] =   c1o216*(c3o1*(-Mx +My -Mz)+Conc_F*(c1o1+c9o2*(-vx1+vx2-vx3)*(-vx1+vx2-vx3)-cu_sq));
-      (D27F.f[DIR_PMM ])[kbs  ] =   c1o216*(c3o1*( Mx -My -Mz)+Conc_F*(c1o1+c9o2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cu_sq));
-      (D27F.f[DIR_MPP ])[kw   ] =   c1o216*(c3o1*(-Mx +My +Mz)+Conc_F*(c1o1+c9o2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cu_sq));
+      (D27F.f[DIR_P00])[kzero] =   c2o27* (c3o1*( Mx        )+Conc_F*(c1o1+c9o2*( vx1        )*( vx1        )-cu_sq));
+      (D27F.f[DIR_M00])[kw   ] =   c2o27* (c3o1*(-Mx        )+Conc_F*(c1o1+c9o2*(-vx1        )*(-vx1        )-cu_sq));
+      (D27F.f[DIR_0P0])[kzero] =   c2o27* (c3o1*(     My    )+Conc_F*(c1o1+c9o2*(     vx2    )*(     vx2    )-cu_sq));
+      (D27F.f[DIR_0M0])[ks   ] =   c2o27* (c3o1*(    -My    )+Conc_F*(c1o1+c9o2*(    -vx2    )*(    -vx2    )-cu_sq));
+      (D27F.f[DIR_00P])[kzero] =   c2o27* (c3o1*(         Mz)+Conc_F*(c1o1+c9o2*(         vx3)*(         vx3)-cu_sq));
+      (D27F.f[DIR_00M])[kb   ] =   c2o27* (c3o1*(        -Mz)+Conc_F*(c1o1+c9o2*(        -vx3)*(        -vx3)-cu_sq));
+      (D27F.f[DIR_PP0])[kzero] =   c1o54* (c3o1*( Mx +My    )+Conc_F*(c1o1+c9o2*( vx1+vx2    )*( vx1+vx2    )-cu_sq));
+      (D27F.f[DIR_MM0])[ksw  ] =   c1o54* (c3o1*(-Mx -My    )+Conc_F*(c1o1+c9o2*(-vx1-vx2    )*(-vx1-vx2    )-cu_sq));
+      (D27F.f[DIR_PM0])[ks   ] =   c1o54* (c3o1*( Mx -My    )+Conc_F*(c1o1+c9o2*( vx1-vx2    )*( vx1-vx2    )-cu_sq));
+      (D27F.f[DIR_MP0])[kw   ] =   c1o54* (c3o1*(-Mx +My    )+Conc_F*(c1o1+c9o2*(-vx1+vx2    )*(-vx1+vx2    )-cu_sq));
+      (D27F.f[DIR_P0P])[kzero] =   c1o54* (c3o1*( Mx     +Mz)+Conc_F*(c1o1+c9o2*( vx1    +vx3)*( vx1    +vx3)-cu_sq));
+      (D27F.f[DIR_M0M])[kbw  ] =   c1o54* (c3o1*(-Mx     -Mz)+Conc_F*(c1o1+c9o2*(-vx1    -vx3)*(-vx1    -vx3)-cu_sq));
+      (D27F.f[DIR_P0M])[kb   ] =   c1o54* (c3o1*( Mx     -Mz)+Conc_F*(c1o1+c9o2*( vx1    -vx3)*( vx1    -vx3)-cu_sq));
+      (D27F.f[DIR_M0P])[kw   ] =   c1o54* (c3o1*(-Mx     +Mz)+Conc_F*(c1o1+c9o2*(-vx1    +vx3)*(-vx1    +vx3)-cu_sq));
+      (D27F.f[DIR_0PP])[kzero] =   c1o54* (c3o1*(     My +Mz)+Conc_F*(c1o1+c9o2*(     vx2+vx3)*(     vx2+vx3)-cu_sq));
+      (D27F.f[DIR_0MM])[kbs  ] =   c1o54* (c3o1*(    -My -Mz)+Conc_F*(c1o1+c9o2*(    -vx2-vx3)*(    -vx2-vx3)-cu_sq));
+      (D27F.f[DIR_0PM])[kb   ] =   c1o54* (c3o1*(     My -Mz)+Conc_F*(c1o1+c9o2*(     vx2-vx3)*(     vx2-vx3)-cu_sq));
+      (D27F.f[DIR_0MP])[ks   ] =   c1o54* (c3o1*(    -My +Mz)+Conc_F*(c1o1+c9o2*(    -vx2+vx3)*(    -vx2+vx3)-cu_sq));
+      (D27F.f[DIR_PPP])[kzero] =   c1o216*(c3o1*( Mx +My +Mz)+Conc_F*(c1o1+c9o2*( vx1+vx2+vx3)*( vx1+vx2+vx3)-cu_sq));
+      (D27F.f[DIR_MMM])[kbsw ] =   c1o216*(c3o1*(-Mx -My -Mz)+Conc_F*(c1o1+c9o2*(-vx1-vx2-vx3)*(-vx1-vx2-vx3)-cu_sq));
+      (D27F.f[DIR_PPM])[kb   ] =   c1o216*(c3o1*( Mx +My -Mz)+Conc_F*(c1o1+c9o2*( vx1+vx2-vx3)*( vx1+vx2-vx3)-cu_sq));
+      (D27F.f[DIR_MMP])[ksw  ] =   c1o216*(c3o1*(-Mx -My +Mz)+Conc_F*(c1o1+c9o2*(-vx1-vx2+vx3)*(-vx1-vx2+vx3)-cu_sq));
+      (D27F.f[DIR_PMP])[ks   ] =   c1o216*(c3o1*( Mx -My +Mz)+Conc_F*(c1o1+c9o2*( vx1-vx2+vx3)*( vx1-vx2+vx3)-cu_sq));
+      (D27F.f[DIR_MPM])[kbw  ] =   c1o216*(c3o1*(-Mx +My -Mz)+Conc_F*(c1o1+c9o2*(-vx1+vx2-vx3)*(-vx1+vx2-vx3)-cu_sq));
+      (D27F.f[DIR_PMM])[kbs  ] =   c1o216*(c3o1*( Mx -My -Mz)+Conc_F*(c1o1+c9o2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cu_sq));
+      (D27F.f[DIR_MPP])[kw   ] =   c1o216*(c3o1*(-Mx +My +Mz)+Conc_F*(c1o1+c9o2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cu_sq));
 
 
 
@@ -46978,32 +46978,32 @@ __global__ void scaleCFThS27(     real* DC,
       cu_sq=c3o2*(vx1*vx1+vx2*vx2+vx3*vx3);
 
       (D27F.f[DIR_000])[kzero] =   c8o27* Conc_F*(c1o1-cu_sq);
-      (D27F.f[DIR_P00   ])[kzero] =   c2o27* (c3o1*( Mx        )+Conc_F*(c1o1+c9o2*( vx1        )*( vx1        )-cu_sq));
-      (D27F.f[DIR_M00   ])[kw   ] =   c2o27* (c3o1*(-Mx        )+Conc_F*(c1o1+c9o2*(-vx1        )*(-vx1        )-cu_sq));
-      (D27F.f[DIR_0P0   ])[kzero] =   c2o27* (c3o1*(     My    )+Conc_F*(c1o1+c9o2*(     vx2    )*(     vx2    )-cu_sq));
-      (D27F.f[DIR_0M0   ])[ks   ] =   c2o27* (c3o1*(    -My    )+Conc_F*(c1o1+c9o2*(    -vx2    )*(    -vx2    )-cu_sq));
-      (D27F.f[DIR_00P   ])[kzero] =   c2o27* (c3o1*(         Mz)+Conc_F*(c1o1+c9o2*(         vx3)*(         vx3)-cu_sq));
-      (D27F.f[DIR_00M   ])[kb   ] =   c2o27* (c3o1*(        -Mz)+Conc_F*(c1o1+c9o2*(        -vx3)*(        -vx3)-cu_sq));
-      (D27F.f[DIR_PP0  ])[kzero] =   c1o54* (c3o1*( Mx +My    )+Conc_F*(c1o1+c9o2*( vx1+vx2    )*( vx1+vx2    )-cu_sq));
-      (D27F.f[DIR_MM0  ])[ksw  ] =   c1o54* (c3o1*(-Mx -My    )+Conc_F*(c1o1+c9o2*(-vx1-vx2    )*(-vx1-vx2    )-cu_sq));
-      (D27F.f[DIR_PM0  ])[ks   ] =   c1o54* (c3o1*( Mx -My    )+Conc_F*(c1o1+c9o2*( vx1-vx2    )*( vx1-vx2    )-cu_sq));
-      (D27F.f[DIR_MP0  ])[kw   ] =   c1o54* (c3o1*(-Mx +My    )+Conc_F*(c1o1+c9o2*(-vx1+vx2    )*(-vx1+vx2    )-cu_sq));
-      (D27F.f[DIR_P0P  ])[kzero] =   c1o54* (c3o1*( Mx     +Mz)+Conc_F*(c1o1+c9o2*( vx1    +vx3)*( vx1    +vx3)-cu_sq));
-      (D27F.f[DIR_M0M  ])[kbw  ] =   c1o54* (c3o1*(-Mx     -Mz)+Conc_F*(c1o1+c9o2*(-vx1    -vx3)*(-vx1    -vx3)-cu_sq));
-      (D27F.f[DIR_P0M  ])[kb   ] =   c1o54* (c3o1*( Mx     -Mz)+Conc_F*(c1o1+c9o2*( vx1    -vx3)*( vx1    -vx3)-cu_sq));
-      (D27F.f[DIR_M0P  ])[kw   ] =   c1o54* (c3o1*(-Mx     +Mz)+Conc_F*(c1o1+c9o2*(-vx1    +vx3)*(-vx1    +vx3)-cu_sq));
-      (D27F.f[DIR_0PP  ])[kzero] =   c1o54* (c3o1*(     My +Mz)+Conc_F*(c1o1+c9o2*(     vx2+vx3)*(     vx2+vx3)-cu_sq));
-      (D27F.f[DIR_0MM  ])[kbs  ] =   c1o54* (c3o1*(    -My -Mz)+Conc_F*(c1o1+c9o2*(    -vx2-vx3)*(    -vx2-vx3)-cu_sq));
-      (D27F.f[DIR_0PM  ])[kb   ] =   c1o54* (c3o1*(     My -Mz)+Conc_F*(c1o1+c9o2*(     vx2-vx3)*(     vx2-vx3)-cu_sq));
-      (D27F.f[DIR_0MP  ])[ks   ] =   c1o54* (c3o1*(    -My +Mz)+Conc_F*(c1o1+c9o2*(    -vx2+vx3)*(    -vx2+vx3)-cu_sq));
-      (D27F.f[DIR_PPP ])[kzero] =   c1o216*(c3o1*( Mx +My +Mz)+Conc_F*(c1o1+c9o2*( vx1+vx2+vx3)*( vx1+vx2+vx3)-cu_sq));
-      (D27F.f[DIR_MMM ])[kbsw ] =   c1o216*(c3o1*(-Mx -My -Mz)+Conc_F*(c1o1+c9o2*(-vx1-vx2-vx3)*(-vx1-vx2-vx3)-cu_sq));
-      (D27F.f[DIR_PPM ])[kb   ] =   c1o216*(c3o1*( Mx +My -Mz)+Conc_F*(c1o1+c9o2*( vx1+vx2-vx3)*( vx1+vx2-vx3)-cu_sq));
-      (D27F.f[DIR_MMP ])[ksw  ] =   c1o216*(c3o1*(-Mx -My +Mz)+Conc_F*(c1o1+c9o2*(-vx1-vx2+vx3)*(-vx1-vx2+vx3)-cu_sq));
-      (D27F.f[DIR_PMP ])[ks   ] =   c1o216*(c3o1*( Mx -My +Mz)+Conc_F*(c1o1+c9o2*( vx1-vx2+vx3)*( vx1-vx2+vx3)-cu_sq));
-      (D27F.f[DIR_MPM ])[kbw  ] =   c1o216*(c3o1*(-Mx +My -Mz)+Conc_F*(c1o1+c9o2*(-vx1+vx2-vx3)*(-vx1+vx2-vx3)-cu_sq));
-      (D27F.f[DIR_PMM ])[kbs  ] =   c1o216*(c3o1*( Mx -My -Mz)+Conc_F*(c1o1+c9o2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cu_sq));
-      (D27F.f[DIR_MPP ])[kw   ] =   c1o216*(c3o1*(-Mx +My +Mz)+Conc_F*(c1o1+c9o2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cu_sq));
+      (D27F.f[DIR_P00])[kzero] =   c2o27* (c3o1*( Mx        )+Conc_F*(c1o1+c9o2*( vx1        )*( vx1        )-cu_sq));
+      (D27F.f[DIR_M00])[kw   ] =   c2o27* (c3o1*(-Mx        )+Conc_F*(c1o1+c9o2*(-vx1        )*(-vx1        )-cu_sq));
+      (D27F.f[DIR_0P0])[kzero] =   c2o27* (c3o1*(     My    )+Conc_F*(c1o1+c9o2*(     vx2    )*(     vx2    )-cu_sq));
+      (D27F.f[DIR_0M0])[ks   ] =   c2o27* (c3o1*(    -My    )+Conc_F*(c1o1+c9o2*(    -vx2    )*(    -vx2    )-cu_sq));
+      (D27F.f[DIR_00P])[kzero] =   c2o27* (c3o1*(         Mz)+Conc_F*(c1o1+c9o2*(         vx3)*(         vx3)-cu_sq));
+      (D27F.f[DIR_00M])[kb   ] =   c2o27* (c3o1*(        -Mz)+Conc_F*(c1o1+c9o2*(        -vx3)*(        -vx3)-cu_sq));
+      (D27F.f[DIR_PP0])[kzero] =   c1o54* (c3o1*( Mx +My    )+Conc_F*(c1o1+c9o2*( vx1+vx2    )*( vx1+vx2    )-cu_sq));
+      (D27F.f[DIR_MM0])[ksw  ] =   c1o54* (c3o1*(-Mx -My    )+Conc_F*(c1o1+c9o2*(-vx1-vx2    )*(-vx1-vx2    )-cu_sq));
+      (D27F.f[DIR_PM0])[ks   ] =   c1o54* (c3o1*( Mx -My    )+Conc_F*(c1o1+c9o2*( vx1-vx2    )*( vx1-vx2    )-cu_sq));
+      (D27F.f[DIR_MP0])[kw   ] =   c1o54* (c3o1*(-Mx +My    )+Conc_F*(c1o1+c9o2*(-vx1+vx2    )*(-vx1+vx2    )-cu_sq));
+      (D27F.f[DIR_P0P])[kzero] =   c1o54* (c3o1*( Mx     +Mz)+Conc_F*(c1o1+c9o2*( vx1    +vx3)*( vx1    +vx3)-cu_sq));
+      (D27F.f[DIR_M0M])[kbw  ] =   c1o54* (c3o1*(-Mx     -Mz)+Conc_F*(c1o1+c9o2*(-vx1    -vx3)*(-vx1    -vx3)-cu_sq));
+      (D27F.f[DIR_P0M])[kb   ] =   c1o54* (c3o1*( Mx     -Mz)+Conc_F*(c1o1+c9o2*( vx1    -vx3)*( vx1    -vx3)-cu_sq));
+      (D27F.f[DIR_M0P])[kw   ] =   c1o54* (c3o1*(-Mx     +Mz)+Conc_F*(c1o1+c9o2*(-vx1    +vx3)*(-vx1    +vx3)-cu_sq));
+      (D27F.f[DIR_0PP])[kzero] =   c1o54* (c3o1*(     My +Mz)+Conc_F*(c1o1+c9o2*(     vx2+vx3)*(     vx2+vx3)-cu_sq));
+      (D27F.f[DIR_0MM])[kbs  ] =   c1o54* (c3o1*(    -My -Mz)+Conc_F*(c1o1+c9o2*(    -vx2-vx3)*(    -vx2-vx3)-cu_sq));
+      (D27F.f[DIR_0PM])[kb   ] =   c1o54* (c3o1*(     My -Mz)+Conc_F*(c1o1+c9o2*(     vx2-vx3)*(     vx2-vx3)-cu_sq));
+      (D27F.f[DIR_0MP])[ks   ] =   c1o54* (c3o1*(    -My +Mz)+Conc_F*(c1o1+c9o2*(    -vx2+vx3)*(    -vx2+vx3)-cu_sq));
+      (D27F.f[DIR_PPP])[kzero] =   c1o216*(c3o1*( Mx +My +Mz)+Conc_F*(c1o1+c9o2*( vx1+vx2+vx3)*( vx1+vx2+vx3)-cu_sq));
+      (D27F.f[DIR_MMM])[kbsw ] =   c1o216*(c3o1*(-Mx -My -Mz)+Conc_F*(c1o1+c9o2*(-vx1-vx2-vx3)*(-vx1-vx2-vx3)-cu_sq));
+      (D27F.f[DIR_PPM])[kb   ] =   c1o216*(c3o1*( Mx +My -Mz)+Conc_F*(c1o1+c9o2*( vx1+vx2-vx3)*( vx1+vx2-vx3)-cu_sq));
+      (D27F.f[DIR_MMP])[ksw  ] =   c1o216*(c3o1*(-Mx -My +Mz)+Conc_F*(c1o1+c9o2*(-vx1-vx2+vx3)*(-vx1-vx2+vx3)-cu_sq));
+      (D27F.f[DIR_PMP])[ks   ] =   c1o216*(c3o1*( Mx -My +Mz)+Conc_F*(c1o1+c9o2*( vx1-vx2+vx3)*( vx1-vx2+vx3)-cu_sq));
+      (D27F.f[DIR_MPM])[kbw  ] =   c1o216*(c3o1*(-Mx +My -Mz)+Conc_F*(c1o1+c9o2*(-vx1+vx2-vx3)*(-vx1+vx2-vx3)-cu_sq));
+      (D27F.f[DIR_PMM])[kbs  ] =   c1o216*(c3o1*( Mx -My -Mz)+Conc_F*(c1o1+c9o2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cu_sq));
+      (D27F.f[DIR_MPP])[kw   ] =   c1o216*(c3o1*(-Mx +My +Mz)+Conc_F*(c1o1+c9o2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cu_sq));
 
 
 
@@ -47056,32 +47056,32 @@ __global__ void scaleCFThS27(     real* DC,
       cu_sq=c3o2*(vx1*vx1+vx2*vx2+vx3*vx3);
 
       (D27F.f[DIR_000])[kzero] =   c8o27* Conc_F*(c1o1-cu_sq);
-      (D27F.f[DIR_P00   ])[kzero] =   c2o27* (c3o1*( Mx        )+Conc_F*(c1o1+c9o2*( vx1        )*( vx1        )-cu_sq));
-      (D27F.f[DIR_M00   ])[kw   ] =   c2o27* (c3o1*(-Mx        )+Conc_F*(c1o1+c9o2*(-vx1        )*(-vx1        )-cu_sq));
-      (D27F.f[DIR_0P0   ])[kzero] =   c2o27* (c3o1*(     My    )+Conc_F*(c1o1+c9o2*(     vx2    )*(     vx2    )-cu_sq));
-      (D27F.f[DIR_0M0   ])[ks   ] =   c2o27* (c3o1*(    -My    )+Conc_F*(c1o1+c9o2*(    -vx2    )*(    -vx2    )-cu_sq));
-      (D27F.f[DIR_00P   ])[kzero] =   c2o27* (c3o1*(         Mz)+Conc_F*(c1o1+c9o2*(         vx3)*(         vx3)-cu_sq));
-      (D27F.f[DIR_00M   ])[kb   ] =   c2o27* (c3o1*(        -Mz)+Conc_F*(c1o1+c9o2*(        -vx3)*(        -vx3)-cu_sq));
-      (D27F.f[DIR_PP0  ])[kzero] =   c1o54* (c3o1*( Mx +My    )+Conc_F*(c1o1+c9o2*( vx1+vx2    )*( vx1+vx2    )-cu_sq));
-      (D27F.f[DIR_MM0  ])[ksw  ] =   c1o54* (c3o1*(-Mx -My    )+Conc_F*(c1o1+c9o2*(-vx1-vx2    )*(-vx1-vx2    )-cu_sq));
-      (D27F.f[DIR_PM0  ])[ks   ] =   c1o54* (c3o1*( Mx -My    )+Conc_F*(c1o1+c9o2*( vx1-vx2    )*( vx1-vx2    )-cu_sq));
-      (D27F.f[DIR_MP0  ])[kw   ] =   c1o54* (c3o1*(-Mx +My    )+Conc_F*(c1o1+c9o2*(-vx1+vx2    )*(-vx1+vx2    )-cu_sq));
-      (D27F.f[DIR_P0P  ])[kzero] =   c1o54* (c3o1*( Mx     +Mz)+Conc_F*(c1o1+c9o2*( vx1    +vx3)*( vx1    +vx3)-cu_sq));
-      (D27F.f[DIR_M0M  ])[kbw  ] =   c1o54* (c3o1*(-Mx     -Mz)+Conc_F*(c1o1+c9o2*(-vx1    -vx3)*(-vx1    -vx3)-cu_sq));
-      (D27F.f[DIR_P0M  ])[kb   ] =   c1o54* (c3o1*( Mx     -Mz)+Conc_F*(c1o1+c9o2*( vx1    -vx3)*( vx1    -vx3)-cu_sq));
-      (D27F.f[DIR_M0P  ])[kw   ] =   c1o54* (c3o1*(-Mx     +Mz)+Conc_F*(c1o1+c9o2*(-vx1    +vx3)*(-vx1    +vx3)-cu_sq));
-      (D27F.f[DIR_0PP  ])[kzero] =   c1o54* (c3o1*(     My +Mz)+Conc_F*(c1o1+c9o2*(     vx2+vx3)*(     vx2+vx3)-cu_sq));
-      (D27F.f[DIR_0MM  ])[kbs  ] =   c1o54* (c3o1*(    -My -Mz)+Conc_F*(c1o1+c9o2*(    -vx2-vx3)*(    -vx2-vx3)-cu_sq));
-      (D27F.f[DIR_0PM  ])[kb   ] =   c1o54* (c3o1*(     My -Mz)+Conc_F*(c1o1+c9o2*(     vx2-vx3)*(     vx2-vx3)-cu_sq));
-      (D27F.f[DIR_0MP  ])[ks   ] =   c1o54* (c3o1*(    -My +Mz)+Conc_F*(c1o1+c9o2*(    -vx2+vx3)*(    -vx2+vx3)-cu_sq));
-      (D27F.f[DIR_PPP ])[kzero] =   c1o216*(c3o1*( Mx +My +Mz)+Conc_F*(c1o1+c9o2*( vx1+vx2+vx3)*( vx1+vx2+vx3)-cu_sq));
-      (D27F.f[DIR_MMM ])[kbsw ] =   c1o216*(c3o1*(-Mx -My -Mz)+Conc_F*(c1o1+c9o2*(-vx1-vx2-vx3)*(-vx1-vx2-vx3)-cu_sq));
-      (D27F.f[DIR_PPM ])[kb   ] =   c1o216*(c3o1*( Mx +My -Mz)+Conc_F*(c1o1+c9o2*( vx1+vx2-vx3)*( vx1+vx2-vx3)-cu_sq));
-      (D27F.f[DIR_MMP ])[ksw  ] =   c1o216*(c3o1*(-Mx -My +Mz)+Conc_F*(c1o1+c9o2*(-vx1-vx2+vx3)*(-vx1-vx2+vx3)-cu_sq));
-      (D27F.f[DIR_PMP ])[ks   ] =   c1o216*(c3o1*( Mx -My +Mz)+Conc_F*(c1o1+c9o2*( vx1-vx2+vx3)*( vx1-vx2+vx3)-cu_sq));
-      (D27F.f[DIR_MPM ])[kbw  ] =   c1o216*(c3o1*(-Mx +My -Mz)+Conc_F*(c1o1+c9o2*(-vx1+vx2-vx3)*(-vx1+vx2-vx3)-cu_sq));
-      (D27F.f[DIR_PMM ])[kbs  ] =   c1o216*(c3o1*( Mx -My -Mz)+Conc_F*(c1o1+c9o2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cu_sq));
-      (D27F.f[DIR_MPP ])[kw   ] =   c1o216*(c3o1*(-Mx +My +Mz)+Conc_F*(c1o1+c9o2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cu_sq));
+      (D27F.f[DIR_P00])[kzero] =   c2o27* (c3o1*( Mx        )+Conc_F*(c1o1+c9o2*( vx1        )*( vx1        )-cu_sq));
+      (D27F.f[DIR_M00])[kw   ] =   c2o27* (c3o1*(-Mx        )+Conc_F*(c1o1+c9o2*(-vx1        )*(-vx1        )-cu_sq));
+      (D27F.f[DIR_0P0])[kzero] =   c2o27* (c3o1*(     My    )+Conc_F*(c1o1+c9o2*(     vx2    )*(     vx2    )-cu_sq));
+      (D27F.f[DIR_0M0])[ks   ] =   c2o27* (c3o1*(    -My    )+Conc_F*(c1o1+c9o2*(    -vx2    )*(    -vx2    )-cu_sq));
+      (D27F.f[DIR_00P])[kzero] =   c2o27* (c3o1*(         Mz)+Conc_F*(c1o1+c9o2*(         vx3)*(         vx3)-cu_sq));
+      (D27F.f[DIR_00M])[kb   ] =   c2o27* (c3o1*(        -Mz)+Conc_F*(c1o1+c9o2*(        -vx3)*(        -vx3)-cu_sq));
+      (D27F.f[DIR_PP0])[kzero] =   c1o54* (c3o1*( Mx +My    )+Conc_F*(c1o1+c9o2*( vx1+vx2    )*( vx1+vx2    )-cu_sq));
+      (D27F.f[DIR_MM0])[ksw  ] =   c1o54* (c3o1*(-Mx -My    )+Conc_F*(c1o1+c9o2*(-vx1-vx2    )*(-vx1-vx2    )-cu_sq));
+      (D27F.f[DIR_PM0])[ks   ] =   c1o54* (c3o1*( Mx -My    )+Conc_F*(c1o1+c9o2*( vx1-vx2    )*( vx1-vx2    )-cu_sq));
+      (D27F.f[DIR_MP0])[kw   ] =   c1o54* (c3o1*(-Mx +My    )+Conc_F*(c1o1+c9o2*(-vx1+vx2    )*(-vx1+vx2    )-cu_sq));
+      (D27F.f[DIR_P0P])[kzero] =   c1o54* (c3o1*( Mx     +Mz)+Conc_F*(c1o1+c9o2*( vx1    +vx3)*( vx1    +vx3)-cu_sq));
+      (D27F.f[DIR_M0M])[kbw  ] =   c1o54* (c3o1*(-Mx     -Mz)+Conc_F*(c1o1+c9o2*(-vx1    -vx3)*(-vx1    -vx3)-cu_sq));
+      (D27F.f[DIR_P0M])[kb   ] =   c1o54* (c3o1*( Mx     -Mz)+Conc_F*(c1o1+c9o2*( vx1    -vx3)*( vx1    -vx3)-cu_sq));
+      (D27F.f[DIR_M0P])[kw   ] =   c1o54* (c3o1*(-Mx     +Mz)+Conc_F*(c1o1+c9o2*(-vx1    +vx3)*(-vx1    +vx3)-cu_sq));
+      (D27F.f[DIR_0PP])[kzero] =   c1o54* (c3o1*(     My +Mz)+Conc_F*(c1o1+c9o2*(     vx2+vx3)*(     vx2+vx3)-cu_sq));
+      (D27F.f[DIR_0MM])[kbs  ] =   c1o54* (c3o1*(    -My -Mz)+Conc_F*(c1o1+c9o2*(    -vx2-vx3)*(    -vx2-vx3)-cu_sq));
+      (D27F.f[DIR_0PM])[kb   ] =   c1o54* (c3o1*(     My -Mz)+Conc_F*(c1o1+c9o2*(     vx2-vx3)*(     vx2-vx3)-cu_sq));
+      (D27F.f[DIR_0MP])[ks   ] =   c1o54* (c3o1*(    -My +Mz)+Conc_F*(c1o1+c9o2*(    -vx2+vx3)*(    -vx2+vx3)-cu_sq));
+      (D27F.f[DIR_PPP])[kzero] =   c1o216*(c3o1*( Mx +My +Mz)+Conc_F*(c1o1+c9o2*( vx1+vx2+vx3)*( vx1+vx2+vx3)-cu_sq));
+      (D27F.f[DIR_MMM])[kbsw ] =   c1o216*(c3o1*(-Mx -My -Mz)+Conc_F*(c1o1+c9o2*(-vx1-vx2-vx3)*(-vx1-vx2-vx3)-cu_sq));
+      (D27F.f[DIR_PPM])[kb   ] =   c1o216*(c3o1*( Mx +My -Mz)+Conc_F*(c1o1+c9o2*( vx1+vx2-vx3)*( vx1+vx2-vx3)-cu_sq));
+      (D27F.f[DIR_MMP])[ksw  ] =   c1o216*(c3o1*(-Mx -My +Mz)+Conc_F*(c1o1+c9o2*(-vx1-vx2+vx3)*(-vx1-vx2+vx3)-cu_sq));
+      (D27F.f[DIR_PMP])[ks   ] =   c1o216*(c3o1*( Mx -My +Mz)+Conc_F*(c1o1+c9o2*( vx1-vx2+vx3)*( vx1-vx2+vx3)-cu_sq));
+      (D27F.f[DIR_MPM])[kbw  ] =   c1o216*(c3o1*(-Mx +My -Mz)+Conc_F*(c1o1+c9o2*(-vx1+vx2-vx3)*(-vx1+vx2-vx3)-cu_sq));
+      (D27F.f[DIR_PMM])[kbs  ] =   c1o216*(c3o1*( Mx -My -Mz)+Conc_F*(c1o1+c9o2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cu_sq));
+      (D27F.f[DIR_MPP])[kw   ] =   c1o216*(c3o1*(-Mx +My +Mz)+Conc_F*(c1o1+c9o2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cu_sq));
 
 
 
@@ -47134,32 +47134,32 @@ __global__ void scaleCFThS27(     real* DC,
       cu_sq=c3o2*(vx1*vx1+vx2*vx2+vx3*vx3);
 
       (D27F.f[DIR_000])[kzero] =   c8o27* Conc_F*(c1o1-cu_sq);
-      (D27F.f[DIR_P00   ])[kzero] =   c2o27* (c3o1*( Mx        )+Conc_F*(c1o1+c9o2*( vx1        )*( vx1        )-cu_sq));
-      (D27F.f[DIR_M00   ])[kw   ] =   c2o27* (c3o1*(-Mx        )+Conc_F*(c1o1+c9o2*(-vx1        )*(-vx1        )-cu_sq));
-      (D27F.f[DIR_0P0   ])[kzero] =   c2o27* (c3o1*(     My    )+Conc_F*(c1o1+c9o2*(     vx2    )*(     vx2    )-cu_sq));
-      (D27F.f[DIR_0M0   ])[ks   ] =   c2o27* (c3o1*(    -My    )+Conc_F*(c1o1+c9o2*(    -vx2    )*(    -vx2    )-cu_sq));
-      (D27F.f[DIR_00P   ])[kzero] =   c2o27* (c3o1*(         Mz)+Conc_F*(c1o1+c9o2*(         vx3)*(         vx3)-cu_sq));
-      (D27F.f[DIR_00M   ])[kb   ] =   c2o27* (c3o1*(        -Mz)+Conc_F*(c1o1+c9o2*(        -vx3)*(        -vx3)-cu_sq));
-      (D27F.f[DIR_PP0  ])[kzero] =   c1o54* (c3o1*( Mx +My    )+Conc_F*(c1o1+c9o2*( vx1+vx2    )*( vx1+vx2    )-cu_sq));
-      (D27F.f[DIR_MM0  ])[ksw  ] =   c1o54* (c3o1*(-Mx -My    )+Conc_F*(c1o1+c9o2*(-vx1-vx2    )*(-vx1-vx2    )-cu_sq));
-      (D27F.f[DIR_PM0  ])[ks   ] =   c1o54* (c3o1*( Mx -My    )+Conc_F*(c1o1+c9o2*( vx1-vx2    )*( vx1-vx2    )-cu_sq));
-      (D27F.f[DIR_MP0  ])[kw   ] =   c1o54* (c3o1*(-Mx +My    )+Conc_F*(c1o1+c9o2*(-vx1+vx2    )*(-vx1+vx2    )-cu_sq));
-      (D27F.f[DIR_P0P  ])[kzero] =   c1o54* (c3o1*( Mx     +Mz)+Conc_F*(c1o1+c9o2*( vx1    +vx3)*( vx1    +vx3)-cu_sq));
-      (D27F.f[DIR_M0M  ])[kbw  ] =   c1o54* (c3o1*(-Mx     -Mz)+Conc_F*(c1o1+c9o2*(-vx1    -vx3)*(-vx1    -vx3)-cu_sq));
-      (D27F.f[DIR_P0M  ])[kb   ] =   c1o54* (c3o1*( Mx     -Mz)+Conc_F*(c1o1+c9o2*( vx1    -vx3)*( vx1    -vx3)-cu_sq));
-      (D27F.f[DIR_M0P  ])[kw   ] =   c1o54* (c3o1*(-Mx     +Mz)+Conc_F*(c1o1+c9o2*(-vx1    +vx3)*(-vx1    +vx3)-cu_sq));
-      (D27F.f[DIR_0PP  ])[kzero] =   c1o54* (c3o1*(     My +Mz)+Conc_F*(c1o1+c9o2*(     vx2+vx3)*(     vx2+vx3)-cu_sq));
-      (D27F.f[DIR_0MM  ])[kbs  ] =   c1o54* (c3o1*(    -My -Mz)+Conc_F*(c1o1+c9o2*(    -vx2-vx3)*(    -vx2-vx3)-cu_sq));
-      (D27F.f[DIR_0PM  ])[kb   ] =   c1o54* (c3o1*(     My -Mz)+Conc_F*(c1o1+c9o2*(     vx2-vx3)*(     vx2-vx3)-cu_sq));
-      (D27F.f[DIR_0MP  ])[ks   ] =   c1o54* (c3o1*(    -My +Mz)+Conc_F*(c1o1+c9o2*(    -vx2+vx3)*(    -vx2+vx3)-cu_sq));
-      (D27F.f[DIR_PPP ])[kzero] =   c1o216*(c3o1*( Mx +My +Mz)+Conc_F*(c1o1+c9o2*( vx1+vx2+vx3)*( vx1+vx2+vx3)-cu_sq));
-      (D27F.f[DIR_MMM ])[kbsw ] =   c1o216*(c3o1*(-Mx -My -Mz)+Conc_F*(c1o1+c9o2*(-vx1-vx2-vx3)*(-vx1-vx2-vx3)-cu_sq));
-      (D27F.f[DIR_PPM ])[kb   ] =   c1o216*(c3o1*( Mx +My -Mz)+Conc_F*(c1o1+c9o2*( vx1+vx2-vx3)*( vx1+vx2-vx3)-cu_sq));
-      (D27F.f[DIR_MMP ])[ksw  ] =   c1o216*(c3o1*(-Mx -My +Mz)+Conc_F*(c1o1+c9o2*(-vx1-vx2+vx3)*(-vx1-vx2+vx3)-cu_sq));
-      (D27F.f[DIR_PMP ])[ks   ] =   c1o216*(c3o1*( Mx -My +Mz)+Conc_F*(c1o1+c9o2*( vx1-vx2+vx3)*( vx1-vx2+vx3)-cu_sq));
-      (D27F.f[DIR_MPM ])[kbw  ] =   c1o216*(c3o1*(-Mx +My -Mz)+Conc_F*(c1o1+c9o2*(-vx1+vx2-vx3)*(-vx1+vx2-vx3)-cu_sq));
-      (D27F.f[DIR_PMM ])[kbs  ] =   c1o216*(c3o1*( Mx -My -Mz)+Conc_F*(c1o1+c9o2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cu_sq));
-      (D27F.f[DIR_MPP ])[kw   ] =   c1o216*(c3o1*(-Mx +My +Mz)+Conc_F*(c1o1+c9o2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cu_sq));
+      (D27F.f[DIR_P00])[kzero] =   c2o27* (c3o1*( Mx        )+Conc_F*(c1o1+c9o2*( vx1        )*( vx1        )-cu_sq));
+      (D27F.f[DIR_M00])[kw   ] =   c2o27* (c3o1*(-Mx        )+Conc_F*(c1o1+c9o2*(-vx1        )*(-vx1        )-cu_sq));
+      (D27F.f[DIR_0P0])[kzero] =   c2o27* (c3o1*(     My    )+Conc_F*(c1o1+c9o2*(     vx2    )*(     vx2    )-cu_sq));
+      (D27F.f[DIR_0M0])[ks   ] =   c2o27* (c3o1*(    -My    )+Conc_F*(c1o1+c9o2*(    -vx2    )*(    -vx2    )-cu_sq));
+      (D27F.f[DIR_00P])[kzero] =   c2o27* (c3o1*(         Mz)+Conc_F*(c1o1+c9o2*(         vx3)*(         vx3)-cu_sq));
+      (D27F.f[DIR_00M])[kb   ] =   c2o27* (c3o1*(        -Mz)+Conc_F*(c1o1+c9o2*(        -vx3)*(        -vx3)-cu_sq));
+      (D27F.f[DIR_PP0])[kzero] =   c1o54* (c3o1*( Mx +My    )+Conc_F*(c1o1+c9o2*( vx1+vx2    )*( vx1+vx2    )-cu_sq));
+      (D27F.f[DIR_MM0])[ksw  ] =   c1o54* (c3o1*(-Mx -My    )+Conc_F*(c1o1+c9o2*(-vx1-vx2    )*(-vx1-vx2    )-cu_sq));
+      (D27F.f[DIR_PM0])[ks   ] =   c1o54* (c3o1*( Mx -My    )+Conc_F*(c1o1+c9o2*( vx1-vx2    )*( vx1-vx2    )-cu_sq));
+      (D27F.f[DIR_MP0])[kw   ] =   c1o54* (c3o1*(-Mx +My    )+Conc_F*(c1o1+c9o2*(-vx1+vx2    )*(-vx1+vx2    )-cu_sq));
+      (D27F.f[DIR_P0P])[kzero] =   c1o54* (c3o1*( Mx     +Mz)+Conc_F*(c1o1+c9o2*( vx1    +vx3)*( vx1    +vx3)-cu_sq));
+      (D27F.f[DIR_M0M])[kbw  ] =   c1o54* (c3o1*(-Mx     -Mz)+Conc_F*(c1o1+c9o2*(-vx1    -vx3)*(-vx1    -vx3)-cu_sq));
+      (D27F.f[DIR_P0M])[kb   ] =   c1o54* (c3o1*( Mx     -Mz)+Conc_F*(c1o1+c9o2*( vx1    -vx3)*( vx1    -vx3)-cu_sq));
+      (D27F.f[DIR_M0P])[kw   ] =   c1o54* (c3o1*(-Mx     +Mz)+Conc_F*(c1o1+c9o2*(-vx1    +vx3)*(-vx1    +vx3)-cu_sq));
+      (D27F.f[DIR_0PP])[kzero] =   c1o54* (c3o1*(     My +Mz)+Conc_F*(c1o1+c9o2*(     vx2+vx3)*(     vx2+vx3)-cu_sq));
+      (D27F.f[DIR_0MM])[kbs  ] =   c1o54* (c3o1*(    -My -Mz)+Conc_F*(c1o1+c9o2*(    -vx2-vx3)*(    -vx2-vx3)-cu_sq));
+      (D27F.f[DIR_0PM])[kb   ] =   c1o54* (c3o1*(     My -Mz)+Conc_F*(c1o1+c9o2*(     vx2-vx3)*(     vx2-vx3)-cu_sq));
+      (D27F.f[DIR_0MP])[ks   ] =   c1o54* (c3o1*(    -My +Mz)+Conc_F*(c1o1+c9o2*(    -vx2+vx3)*(    -vx2+vx3)-cu_sq));
+      (D27F.f[DIR_PPP])[kzero] =   c1o216*(c3o1*( Mx +My +Mz)+Conc_F*(c1o1+c9o2*( vx1+vx2+vx3)*( vx1+vx2+vx3)-cu_sq));
+      (D27F.f[DIR_MMM])[kbsw ] =   c1o216*(c3o1*(-Mx -My -Mz)+Conc_F*(c1o1+c9o2*(-vx1-vx2-vx3)*(-vx1-vx2-vx3)-cu_sq));
+      (D27F.f[DIR_PPM])[kb   ] =   c1o216*(c3o1*( Mx +My -Mz)+Conc_F*(c1o1+c9o2*( vx1+vx2-vx3)*( vx1+vx2-vx3)-cu_sq));
+      (D27F.f[DIR_MMP])[ksw  ] =   c1o216*(c3o1*(-Mx -My +Mz)+Conc_F*(c1o1+c9o2*(-vx1-vx2+vx3)*(-vx1-vx2+vx3)-cu_sq));
+      (D27F.f[DIR_PMP])[ks   ] =   c1o216*(c3o1*( Mx -My +Mz)+Conc_F*(c1o1+c9o2*( vx1-vx2+vx3)*( vx1-vx2+vx3)-cu_sq));
+      (D27F.f[DIR_MPM])[kbw  ] =   c1o216*(c3o1*(-Mx +My -Mz)+Conc_F*(c1o1+c9o2*(-vx1+vx2-vx3)*(-vx1+vx2-vx3)-cu_sq));
+      (D27F.f[DIR_PMM])[kbs  ] =   c1o216*(c3o1*( Mx -My -Mz)+Conc_F*(c1o1+c9o2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cu_sq));
+      (D27F.f[DIR_MPP])[kw   ] =   c1o216*(c3o1*(-Mx +My +Mz)+Conc_F*(c1o1+c9o2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cu_sq));
 
 
 
@@ -47212,32 +47212,32 @@ __global__ void scaleCFThS27(     real* DC,
       cu_sq=c3o2*(vx1*vx1+vx2*vx2+vx3*vx3);
 
       (D27F.f[DIR_000])[kzero] =   c8o27* Conc_F*(c1o1-cu_sq);
-      (D27F.f[DIR_P00   ])[kzero] =   c2o27* (c3o1*( Mx        )+Conc_F*(c1o1+c9o2*( vx1        )*( vx1        )-cu_sq));
-      (D27F.f[DIR_M00   ])[kw   ] =   c2o27* (c3o1*(-Mx        )+Conc_F*(c1o1+c9o2*(-vx1        )*(-vx1        )-cu_sq));
-      (D27F.f[DIR_0P0   ])[kzero] =   c2o27* (c3o1*(     My    )+Conc_F*(c1o1+c9o2*(     vx2    )*(     vx2    )-cu_sq));
-      (D27F.f[DIR_0M0   ])[ks   ] =   c2o27* (c3o1*(    -My    )+Conc_F*(c1o1+c9o2*(    -vx2    )*(    -vx2    )-cu_sq));
-      (D27F.f[DIR_00P   ])[kzero] =   c2o27* (c3o1*(         Mz)+Conc_F*(c1o1+c9o2*(         vx3)*(         vx3)-cu_sq));
-      (D27F.f[DIR_00M   ])[kb   ] =   c2o27* (c3o1*(        -Mz)+Conc_F*(c1o1+c9o2*(        -vx3)*(        -vx3)-cu_sq));
-      (D27F.f[DIR_PP0  ])[kzero] =   c1o54* (c3o1*( Mx +My    )+Conc_F*(c1o1+c9o2*( vx1+vx2    )*( vx1+vx2    )-cu_sq));
-      (D27F.f[DIR_MM0  ])[ksw  ] =   c1o54* (c3o1*(-Mx -My    )+Conc_F*(c1o1+c9o2*(-vx1-vx2    )*(-vx1-vx2    )-cu_sq));
-      (D27F.f[DIR_PM0  ])[ks   ] =   c1o54* (c3o1*( Mx -My    )+Conc_F*(c1o1+c9o2*( vx1-vx2    )*( vx1-vx2    )-cu_sq));
-      (D27F.f[DIR_MP0  ])[kw   ] =   c1o54* (c3o1*(-Mx +My    )+Conc_F*(c1o1+c9o2*(-vx1+vx2    )*(-vx1+vx2    )-cu_sq));
-      (D27F.f[DIR_P0P  ])[kzero] =   c1o54* (c3o1*( Mx     +Mz)+Conc_F*(c1o1+c9o2*( vx1    +vx3)*( vx1    +vx3)-cu_sq));
-      (D27F.f[DIR_M0M  ])[kbw  ] =   c1o54* (c3o1*(-Mx     -Mz)+Conc_F*(c1o1+c9o2*(-vx1    -vx3)*(-vx1    -vx3)-cu_sq));
-      (D27F.f[DIR_P0M  ])[kb   ] =   c1o54* (c3o1*( Mx     -Mz)+Conc_F*(c1o1+c9o2*( vx1    -vx3)*( vx1    -vx3)-cu_sq));
-      (D27F.f[DIR_M0P  ])[kw   ] =   c1o54* (c3o1*(-Mx     +Mz)+Conc_F*(c1o1+c9o2*(-vx1    +vx3)*(-vx1    +vx3)-cu_sq));
-      (D27F.f[DIR_0PP  ])[kzero] =   c1o54* (c3o1*(     My +Mz)+Conc_F*(c1o1+c9o2*(     vx2+vx3)*(     vx2+vx3)-cu_sq));
-      (D27F.f[DIR_0MM  ])[kbs  ] =   c1o54* (c3o1*(    -My -Mz)+Conc_F*(c1o1+c9o2*(    -vx2-vx3)*(    -vx2-vx3)-cu_sq));
-      (D27F.f[DIR_0PM  ])[kb   ] =   c1o54* (c3o1*(     My -Mz)+Conc_F*(c1o1+c9o2*(     vx2-vx3)*(     vx2-vx3)-cu_sq));
-      (D27F.f[DIR_0MP  ])[ks   ] =   c1o54* (c3o1*(    -My +Mz)+Conc_F*(c1o1+c9o2*(    -vx2+vx3)*(    -vx2+vx3)-cu_sq));
-      (D27F.f[DIR_PPP ])[kzero] =   c1o216*(c3o1*( Mx +My +Mz)+Conc_F*(c1o1+c9o2*( vx1+vx2+vx3)*( vx1+vx2+vx3)-cu_sq));
-      (D27F.f[DIR_MMM ])[kbsw ] =   c1o216*(c3o1*(-Mx -My -Mz)+Conc_F*(c1o1+c9o2*(-vx1-vx2-vx3)*(-vx1-vx2-vx3)-cu_sq));
-      (D27F.f[DIR_PPM ])[kb   ] =   c1o216*(c3o1*( Mx +My -Mz)+Conc_F*(c1o1+c9o2*( vx1+vx2-vx3)*( vx1+vx2-vx3)-cu_sq));
-      (D27F.f[DIR_MMP ])[ksw  ] =   c1o216*(c3o1*(-Mx -My +Mz)+Conc_F*(c1o1+c9o2*(-vx1-vx2+vx3)*(-vx1-vx2+vx3)-cu_sq));
-      (D27F.f[DIR_PMP ])[ks   ] =   c1o216*(c3o1*( Mx -My +Mz)+Conc_F*(c1o1+c9o2*( vx1-vx2+vx3)*( vx1-vx2+vx3)-cu_sq));
-      (D27F.f[DIR_MPM ])[kbw  ] =   c1o216*(c3o1*(-Mx +My -Mz)+Conc_F*(c1o1+c9o2*(-vx1+vx2-vx3)*(-vx1+vx2-vx3)-cu_sq));
-      (D27F.f[DIR_PMM ])[kbs  ] =   c1o216*(c3o1*( Mx -My -Mz)+Conc_F*(c1o1+c9o2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cu_sq));
-      (D27F.f[DIR_MPP ])[kw   ] =   c1o216*(c3o1*(-Mx +My +Mz)+Conc_F*(c1o1+c9o2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cu_sq));
+      (D27F.f[DIR_P00])[kzero] =   c2o27* (c3o1*( Mx        )+Conc_F*(c1o1+c9o2*( vx1        )*( vx1        )-cu_sq));
+      (D27F.f[DIR_M00])[kw   ] =   c2o27* (c3o1*(-Mx        )+Conc_F*(c1o1+c9o2*(-vx1        )*(-vx1        )-cu_sq));
+      (D27F.f[DIR_0P0])[kzero] =   c2o27* (c3o1*(     My    )+Conc_F*(c1o1+c9o2*(     vx2    )*(     vx2    )-cu_sq));
+      (D27F.f[DIR_0M0])[ks   ] =   c2o27* (c3o1*(    -My    )+Conc_F*(c1o1+c9o2*(    -vx2    )*(    -vx2    )-cu_sq));
+      (D27F.f[DIR_00P])[kzero] =   c2o27* (c3o1*(         Mz)+Conc_F*(c1o1+c9o2*(         vx3)*(         vx3)-cu_sq));
+      (D27F.f[DIR_00M])[kb   ] =   c2o27* (c3o1*(        -Mz)+Conc_F*(c1o1+c9o2*(        -vx3)*(        -vx3)-cu_sq));
+      (D27F.f[DIR_PP0])[kzero] =   c1o54* (c3o1*( Mx +My    )+Conc_F*(c1o1+c9o2*( vx1+vx2    )*( vx1+vx2    )-cu_sq));
+      (D27F.f[DIR_MM0])[ksw  ] =   c1o54* (c3o1*(-Mx -My    )+Conc_F*(c1o1+c9o2*(-vx1-vx2    )*(-vx1-vx2    )-cu_sq));
+      (D27F.f[DIR_PM0])[ks   ] =   c1o54* (c3o1*( Mx -My    )+Conc_F*(c1o1+c9o2*( vx1-vx2    )*( vx1-vx2    )-cu_sq));
+      (D27F.f[DIR_MP0])[kw   ] =   c1o54* (c3o1*(-Mx +My    )+Conc_F*(c1o1+c9o2*(-vx1+vx2    )*(-vx1+vx2    )-cu_sq));
+      (D27F.f[DIR_P0P])[kzero] =   c1o54* (c3o1*( Mx     +Mz)+Conc_F*(c1o1+c9o2*( vx1    +vx3)*( vx1    +vx3)-cu_sq));
+      (D27F.f[DIR_M0M])[kbw  ] =   c1o54* (c3o1*(-Mx     -Mz)+Conc_F*(c1o1+c9o2*(-vx1    -vx3)*(-vx1    -vx3)-cu_sq));
+      (D27F.f[DIR_P0M])[kb   ] =   c1o54* (c3o1*( Mx     -Mz)+Conc_F*(c1o1+c9o2*( vx1    -vx3)*( vx1    -vx3)-cu_sq));
+      (D27F.f[DIR_M0P])[kw   ] =   c1o54* (c3o1*(-Mx     +Mz)+Conc_F*(c1o1+c9o2*(-vx1    +vx3)*(-vx1    +vx3)-cu_sq));
+      (D27F.f[DIR_0PP])[kzero] =   c1o54* (c3o1*(     My +Mz)+Conc_F*(c1o1+c9o2*(     vx2+vx3)*(     vx2+vx3)-cu_sq));
+      (D27F.f[DIR_0MM])[kbs  ] =   c1o54* (c3o1*(    -My -Mz)+Conc_F*(c1o1+c9o2*(    -vx2-vx3)*(    -vx2-vx3)-cu_sq));
+      (D27F.f[DIR_0PM])[kb   ] =   c1o54* (c3o1*(     My -Mz)+Conc_F*(c1o1+c9o2*(     vx2-vx3)*(     vx2-vx3)-cu_sq));
+      (D27F.f[DIR_0MP])[ks   ] =   c1o54* (c3o1*(    -My +Mz)+Conc_F*(c1o1+c9o2*(    -vx2+vx3)*(    -vx2+vx3)-cu_sq));
+      (D27F.f[DIR_PPP])[kzero] =   c1o216*(c3o1*( Mx +My +Mz)+Conc_F*(c1o1+c9o2*( vx1+vx2+vx3)*( vx1+vx2+vx3)-cu_sq));
+      (D27F.f[DIR_MMM])[kbsw ] =   c1o216*(c3o1*(-Mx -My -Mz)+Conc_F*(c1o1+c9o2*(-vx1-vx2-vx3)*(-vx1-vx2-vx3)-cu_sq));
+      (D27F.f[DIR_PPM])[kb   ] =   c1o216*(c3o1*( Mx +My -Mz)+Conc_F*(c1o1+c9o2*( vx1+vx2-vx3)*( vx1+vx2-vx3)-cu_sq));
+      (D27F.f[DIR_MMP])[ksw  ] =   c1o216*(c3o1*(-Mx -My +Mz)+Conc_F*(c1o1+c9o2*(-vx1-vx2+vx3)*(-vx1-vx2+vx3)-cu_sq));
+      (D27F.f[DIR_PMP])[ks   ] =   c1o216*(c3o1*( Mx -My +Mz)+Conc_F*(c1o1+c9o2*( vx1-vx2+vx3)*( vx1-vx2+vx3)-cu_sq));
+      (D27F.f[DIR_MPM])[kbw  ] =   c1o216*(c3o1*(-Mx +My -Mz)+Conc_F*(c1o1+c9o2*(-vx1+vx2-vx3)*(-vx1+vx2-vx3)-cu_sq));
+      (D27F.f[DIR_PMM])[kbs  ] =   c1o216*(c3o1*( Mx -My -Mz)+Conc_F*(c1o1+c9o2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cu_sq));
+      (D27F.f[DIR_MPP])[kw   ] =   c1o216*(c3o1*(-Mx +My +Mz)+Conc_F*(c1o1+c9o2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cu_sq));
    }
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -47287,8 +47287,8 @@ __global__ void scaleCFEff27(real* DC,
                                         unsigned int* neighborFX,
                                         unsigned int* neighborFY,
                                         unsigned int* neighborFZ,
-									             unsigned int size_MatC, 
-									             unsigned int size_MatF, 
+									             unsigned long long numberOfLBnodesCoarse, 
+									             unsigned long long numberOfLBnodesFine, 
 									             bool isEvenTimestep,
                                         unsigned int* posCSWB, 
                                         unsigned int* posFSWB, 
@@ -47305,96 +47305,96 @@ __global__ void scaleCFEff27(real* DC,
    real *feF, *fwF, *fnF, *fsF, *ftF, *fbF, *fneF, *fswF, *fseF, *fnwF, *fteF, *fbwF, *fbeF, *ftwF, *ftnF, *fbsF, *fbnF, *ftsF, *fzeroF, 
       *ftneF, *ftswF, *ftseF, *ftnwF, *fbneF, *fbswF, *fbseF, *fbnwF;
 
-   feF    = &DF[DIR_P00   *size_MatF];
-   fwF    = &DF[DIR_M00   *size_MatF];
-   fnF    = &DF[DIR_0P0   *size_MatF];
-   fsF    = &DF[DIR_0M0   *size_MatF];
-   ftF    = &DF[DIR_00P   *size_MatF];
-   fbF    = &DF[DIR_00M   *size_MatF];
-   fneF   = &DF[DIR_PP0  *size_MatF];
-   fswF   = &DF[DIR_MM0  *size_MatF];
-   fseF   = &DF[DIR_PM0  *size_MatF];
-   fnwF   = &DF[DIR_MP0  *size_MatF];
-   fteF   = &DF[DIR_P0P  *size_MatF];
-   fbwF   = &DF[DIR_M0M  *size_MatF];
-   fbeF   = &DF[DIR_P0M  *size_MatF];
-   ftwF   = &DF[DIR_M0P  *size_MatF];
-   ftnF   = &DF[DIR_0PP  *size_MatF];
-   fbsF   = &DF[DIR_0MM  *size_MatF];
-   fbnF   = &DF[DIR_0PM  *size_MatF];
-   ftsF   = &DF[DIR_0MP  *size_MatF];
-   fzeroF = &DF[DIR_000*size_MatF];
-   ftneF  = &DF[DIR_PPP *size_MatF];
-   ftswF  = &DF[DIR_MMP *size_MatF];
-   ftseF  = &DF[DIR_PMP *size_MatF];
-   ftnwF  = &DF[DIR_MPP *size_MatF];
-   fbneF  = &DF[DIR_PPM *size_MatF];
-   fbswF  = &DF[DIR_MMM *size_MatF];
-   fbseF  = &DF[DIR_PMM *size_MatF];
-   fbnwF  = &DF[DIR_MPM *size_MatF];
+   feF    = &DF[DIR_P00 * numberOfLBnodesFine];
+   fwF    = &DF[DIR_M00 * numberOfLBnodesFine];
+   fnF    = &DF[DIR_0P0 * numberOfLBnodesFine];
+   fsF    = &DF[DIR_0M0 * numberOfLBnodesFine];
+   ftF    = &DF[DIR_00P * numberOfLBnodesFine];
+   fbF    = &DF[DIR_00M * numberOfLBnodesFine];
+   fneF   = &DF[DIR_PP0 * numberOfLBnodesFine];
+   fswF   = &DF[DIR_MM0 * numberOfLBnodesFine];
+   fseF   = &DF[DIR_PM0 * numberOfLBnodesFine];
+   fnwF   = &DF[DIR_MP0 * numberOfLBnodesFine];
+   fteF   = &DF[DIR_P0P * numberOfLBnodesFine];
+   fbwF   = &DF[DIR_M0M * numberOfLBnodesFine];
+   fbeF   = &DF[DIR_P0M * numberOfLBnodesFine];
+   ftwF   = &DF[DIR_M0P * numberOfLBnodesFine];
+   ftnF   = &DF[DIR_0PP * numberOfLBnodesFine];
+   fbsF   = &DF[DIR_0MM * numberOfLBnodesFine];
+   fbnF   = &DF[DIR_0PM * numberOfLBnodesFine];
+   ftsF   = &DF[DIR_0MP * numberOfLBnodesFine];
+   fzeroF = &DF[DIR_000 * numberOfLBnodesFine];
+   ftneF  = &DF[DIR_PPP * numberOfLBnodesFine];
+   ftswF  = &DF[DIR_MMP * numberOfLBnodesFine];
+   ftseF  = &DF[DIR_PMP * numberOfLBnodesFine];
+   ftnwF  = &DF[DIR_MPP * numberOfLBnodesFine];
+   fbneF  = &DF[DIR_PPM * numberOfLBnodesFine];
+   fbswF  = &DF[DIR_MMM * numberOfLBnodesFine];
+   fbseF  = &DF[DIR_PMM * numberOfLBnodesFine];
+   fbnwF  = &DF[DIR_MPM * numberOfLBnodesFine];
 
    real *feC, *fwC, *fnC, *fsC, *ftC, *fbC, *fneC, *fswC, *fseC, *fnwC, *fteC, *fbwC, *fbeC, *ftwC, *ftnC, *fbsC, *fbnC, *ftsC, *fzeroC,
       *ftneC, *ftswC, *ftseC, *ftnwC, *fbneC, *fbswC, *fbseC, *fbnwC;
 
    if (isEvenTimestep==true)
    {
-      feC    = &DC[DIR_P00   *size_MatC];
-      fwC    = &DC[DIR_M00   *size_MatC];
-      fnC    = &DC[DIR_0P0   *size_MatC];
-      fsC    = &DC[DIR_0M0   *size_MatC];
-      ftC    = &DC[DIR_00P   *size_MatC];
-      fbC    = &DC[DIR_00M   *size_MatC];
-      fneC   = &DC[DIR_PP0  *size_MatC];
-      fswC   = &DC[DIR_MM0  *size_MatC];
-      fseC   = &DC[DIR_PM0  *size_MatC];
-      fnwC   = &DC[DIR_MP0  *size_MatC];
-      fteC   = &DC[DIR_P0P  *size_MatC];
-      fbwC   = &DC[DIR_M0M  *size_MatC];
-      fbeC   = &DC[DIR_P0M  *size_MatC];
-      ftwC   = &DC[DIR_M0P  *size_MatC];
-      ftnC   = &DC[DIR_0PP  *size_MatC];
-      fbsC   = &DC[DIR_0MM  *size_MatC];
-      fbnC   = &DC[DIR_0PM  *size_MatC];
-      ftsC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      ftneC  = &DC[DIR_PPP *size_MatC];
-      ftswC  = &DC[DIR_MMP *size_MatC];
-      ftseC  = &DC[DIR_PMP *size_MatC];
-      ftnwC  = &DC[DIR_MPP *size_MatC];
-      fbneC  = &DC[DIR_PPM *size_MatC];
-      fbswC  = &DC[DIR_MMM *size_MatC];
-      fbseC  = &DC[DIR_PMM *size_MatC];
-      fbnwC  = &DC[DIR_MPM *size_MatC];
+      feC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      fwC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    } 
    else
    {
-      fwC    = &DC[DIR_P00   *size_MatC];
-      feC    = &DC[DIR_M00   *size_MatC];
-      fsC    = &DC[DIR_0P0   *size_MatC];
-      fnC    = &DC[DIR_0M0   *size_MatC];
-      fbC    = &DC[DIR_00P   *size_MatC];
-      ftC    = &DC[DIR_00M   *size_MatC];
-      fswC   = &DC[DIR_PP0  *size_MatC];
-      fneC   = &DC[DIR_MM0  *size_MatC];
-      fnwC   = &DC[DIR_PM0  *size_MatC];
-      fseC   = &DC[DIR_MP0  *size_MatC];
-      fbwC   = &DC[DIR_P0P  *size_MatC];
-      fteC   = &DC[DIR_M0M  *size_MatC];
-      ftwC   = &DC[DIR_P0M  *size_MatC];
-      fbeC   = &DC[DIR_M0P  *size_MatC];
-      fbsC   = &DC[DIR_0PP  *size_MatC];
-      ftnC   = &DC[DIR_0MM  *size_MatC];
-      ftsC   = &DC[DIR_0PM  *size_MatC];
-      fbnC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      fbswC  = &DC[DIR_PPP *size_MatC];
-      fbneC  = &DC[DIR_MMP *size_MatC];
-      fbnwC  = &DC[DIR_PMP *size_MatC];
-      fbseC  = &DC[DIR_MPP *size_MatC];
-      ftswC  = &DC[DIR_PPM *size_MatC];
-      ftneC  = &DC[DIR_MMM *size_MatC];
-      ftnwC  = &DC[DIR_PMM *size_MatC];
-      ftseC  = &DC[DIR_MPM *size_MatC];
+      fwC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      feC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    }
 
    ////////////////////////////////////////////////////////////////////////////////
@@ -48997,8 +48997,8 @@ __global__ void scaleCF27(real* DC,
                                      unsigned int* neighborFX,
                                      unsigned int* neighborFY,
                                      unsigned int* neighborFZ,
-                                     unsigned int size_MatC, 
-                                     unsigned int size_MatF, 
+                                     unsigned long long numberOfLBnodesCoarse, 
+                                     unsigned long long numberOfLBnodesFine, 
                                      bool isEvenTimestep,
                                      unsigned int* posCSWB, 
                                      unsigned int* posFSWB, 
@@ -49014,96 +49014,96 @@ __global__ void scaleCF27(real* DC,
    real *feF, *fwF, *fnF, *fsF, *ftF, *fbF, *fneF, *fswF, *fseF, *fnwF, *fteF, *fbwF, *fbeF, *ftwF, *ftnF, *fbsF, *fbnF, *ftsF, *fzeroF, 
       *ftneF, *ftswF, *ftseF, *ftnwF, *fbneF, *fbswF, *fbseF, *fbnwF;
 
-   feF    = &DF[DIR_P00   *size_MatF];
-   fwF    = &DF[DIR_M00   *size_MatF];
-   fnF    = &DF[DIR_0P0   *size_MatF];
-   fsF    = &DF[DIR_0M0   *size_MatF];
-   ftF    = &DF[DIR_00P   *size_MatF];
-   fbF    = &DF[DIR_00M   *size_MatF];
-   fneF   = &DF[DIR_PP0  *size_MatF];
-   fswF   = &DF[DIR_MM0  *size_MatF];
-   fseF   = &DF[DIR_PM0  *size_MatF];
-   fnwF   = &DF[DIR_MP0  *size_MatF];
-   fteF   = &DF[DIR_P0P  *size_MatF];
-   fbwF   = &DF[DIR_M0M  *size_MatF];
-   fbeF   = &DF[DIR_P0M  *size_MatF];
-   ftwF   = &DF[DIR_M0P  *size_MatF];
-   ftnF   = &DF[DIR_0PP  *size_MatF];
-   fbsF   = &DF[DIR_0MM  *size_MatF];
-   fbnF   = &DF[DIR_0PM  *size_MatF];
-   ftsF   = &DF[DIR_0MP  *size_MatF];
-   fzeroF = &DF[DIR_000*size_MatF];
-   ftneF  = &DF[DIR_PPP *size_MatF];
-   ftswF  = &DF[DIR_MMP *size_MatF];
-   ftseF  = &DF[DIR_PMP *size_MatF];
-   ftnwF  = &DF[DIR_MPP *size_MatF];
-   fbneF  = &DF[DIR_PPM *size_MatF];
-   fbswF  = &DF[DIR_MMM *size_MatF];
-   fbseF  = &DF[DIR_PMM *size_MatF];
-   fbnwF  = &DF[DIR_MPM *size_MatF];
+   feF    = &DF[DIR_P00 * numberOfLBnodesFine];
+   fwF    = &DF[DIR_M00 * numberOfLBnodesFine];
+   fnF    = &DF[DIR_0P0 * numberOfLBnodesFine];
+   fsF    = &DF[DIR_0M0 * numberOfLBnodesFine];
+   ftF    = &DF[DIR_00P * numberOfLBnodesFine];
+   fbF    = &DF[DIR_00M * numberOfLBnodesFine];
+   fneF   = &DF[DIR_PP0 * numberOfLBnodesFine];
+   fswF   = &DF[DIR_MM0 * numberOfLBnodesFine];
+   fseF   = &DF[DIR_PM0 * numberOfLBnodesFine];
+   fnwF   = &DF[DIR_MP0 * numberOfLBnodesFine];
+   fteF   = &DF[DIR_P0P * numberOfLBnodesFine];
+   fbwF   = &DF[DIR_M0M * numberOfLBnodesFine];
+   fbeF   = &DF[DIR_P0M * numberOfLBnodesFine];
+   ftwF   = &DF[DIR_M0P * numberOfLBnodesFine];
+   ftnF   = &DF[DIR_0PP * numberOfLBnodesFine];
+   fbsF   = &DF[DIR_0MM * numberOfLBnodesFine];
+   fbnF   = &DF[DIR_0PM * numberOfLBnodesFine];
+   ftsF   = &DF[DIR_0MP * numberOfLBnodesFine];
+   fzeroF = &DF[DIR_000 * numberOfLBnodesFine];
+   ftneF  = &DF[DIR_PPP * numberOfLBnodesFine];
+   ftswF  = &DF[DIR_MMP * numberOfLBnodesFine];
+   ftseF  = &DF[DIR_PMP * numberOfLBnodesFine];
+   ftnwF  = &DF[DIR_MPP * numberOfLBnodesFine];
+   fbneF  = &DF[DIR_PPM * numberOfLBnodesFine];
+   fbswF  = &DF[DIR_MMM * numberOfLBnodesFine];
+   fbseF  = &DF[DIR_PMM * numberOfLBnodesFine];
+   fbnwF  = &DF[DIR_MPM * numberOfLBnodesFine];
 
    real *feC, *fwC, *fnC, *fsC, *ftC, *fbC, *fneC, *fswC, *fseC, *fnwC, *fteC, *fbwC, *fbeC, *ftwC, *ftnC, *fbsC, *fbnC, *ftsC, *fzeroC,
       *ftneC, *ftswC, *ftseC, *ftnwC, *fbneC, *fbswC, *fbseC, *fbnwC;
 
    if (isEvenTimestep==true)
    {
-      feC    = &DC[DIR_P00   *size_MatC];
-      fwC    = &DC[DIR_M00   *size_MatC];
-      fnC    = &DC[DIR_0P0   *size_MatC];
-      fsC    = &DC[DIR_0M0   *size_MatC];
-      ftC    = &DC[DIR_00P   *size_MatC];
-      fbC    = &DC[DIR_00M   *size_MatC];
-      fneC   = &DC[DIR_PP0  *size_MatC];
-      fswC   = &DC[DIR_MM0  *size_MatC];
-      fseC   = &DC[DIR_PM0  *size_MatC];
-      fnwC   = &DC[DIR_MP0  *size_MatC];
-      fteC   = &DC[DIR_P0P  *size_MatC];
-      fbwC   = &DC[DIR_M0M  *size_MatC];
-      fbeC   = &DC[DIR_P0M  *size_MatC];
-      ftwC   = &DC[DIR_M0P  *size_MatC];
-      ftnC   = &DC[DIR_0PP  *size_MatC];
-      fbsC   = &DC[DIR_0MM  *size_MatC];
-      fbnC   = &DC[DIR_0PM  *size_MatC];
-      ftsC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      ftneC  = &DC[DIR_PPP *size_MatC];
-      ftswC  = &DC[DIR_MMP *size_MatC];
-      ftseC  = &DC[DIR_PMP *size_MatC];
-      ftnwC  = &DC[DIR_MPP *size_MatC];
-      fbneC  = &DC[DIR_PPM *size_MatC];
-      fbswC  = &DC[DIR_MMM *size_MatC];
-      fbseC  = &DC[DIR_PMM *size_MatC];
-      fbnwC  = &DC[DIR_MPM *size_MatC];
+      feC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      fwC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    } 
    else
    {
-      fwC    = &DC[DIR_P00   *size_MatC];
-      feC    = &DC[DIR_M00   *size_MatC];
-      fsC    = &DC[DIR_0P0   *size_MatC];
-      fnC    = &DC[DIR_0M0   *size_MatC];
-      fbC    = &DC[DIR_00P   *size_MatC];
-      ftC    = &DC[DIR_00M   *size_MatC];
-      fswC   = &DC[DIR_PP0  *size_MatC];
-      fneC   = &DC[DIR_MM0  *size_MatC];
-      fnwC   = &DC[DIR_PM0  *size_MatC];
-      fseC   = &DC[DIR_MP0  *size_MatC];
-      fbwC   = &DC[DIR_P0P  *size_MatC];
-      fteC   = &DC[DIR_M0M  *size_MatC];
-      ftwC   = &DC[DIR_P0M  *size_MatC];
-      fbeC   = &DC[DIR_M0P  *size_MatC];
-      fbsC   = &DC[DIR_0PP  *size_MatC];
-      ftnC   = &DC[DIR_0MM  *size_MatC];
-      ftsC   = &DC[DIR_0PM  *size_MatC];
-      fbnC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      fbswC  = &DC[DIR_PPP *size_MatC];
-      fbneC  = &DC[DIR_MMP *size_MatC];
-      fbnwC  = &DC[DIR_PMP *size_MatC];
-      fbseC  = &DC[DIR_MPP *size_MatC];
-      ftswC  = &DC[DIR_PPM *size_MatC];
-      ftneC  = &DC[DIR_MMM *size_MatC];
-      ftnwC  = &DC[DIR_PMM *size_MatC];
-      ftseC  = &DC[DIR_MPM *size_MatC];
+      fwC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      feC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    }
 
    ////////////////////////////////////////////////////////////////////////////////
diff --git a/src/gpu/VirtualFluids_GPU/GPU/GridScaling/ScaleCF_F3_27.cu b/src/gpu/VirtualFluids_GPU/GPU/GridScaling/ScaleCF_F3_27.cu
index cb8bd2a322cc9176cd0aa31625ee386e1f62d63d..386493280fd71fff93c117483e754a248bb0830d 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/GridScaling/ScaleCF_F3_27.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/GridScaling/ScaleCF_F3_27.cu
@@ -23,8 +23,8 @@ __global__ void scaleCF_comp_D3Q27F3_2018(real* DC,
 													 unsigned int* neighborFX,
 													 unsigned int* neighborFY,
 													 unsigned int* neighborFZ,
-													 unsigned int size_MatC, 
-													 unsigned int size_MatF, 
+													 unsigned long long numberOfLBnodesCoarse, 
+													 unsigned long long numberOfLBnodesFine, 
 													 bool isEvenTimestep,
 													 unsigned int* posCSWB, 
 													 unsigned int* posFSWB, 
@@ -44,33 +44,33 @@ __global__ void scaleCF_comp_D3Q27F3_2018(real* DC,
 		*f000dest, *fMMMdest, *fMMPdest, *fMPPdest, *fMPMdest, *fPPMdest, *fPPPdest, *fPMPdest, *fPMMdest;
 
 
-	fP00dest = &DF[DIR_P00   *size_MatF];
-	fM00dest = &DF[DIR_M00   *size_MatF];
-	f0P0dest = &DF[DIR_0P0   *size_MatF];
-	f0M0dest = &DF[DIR_0M0   *size_MatF];
-	f00Pdest = &DF[DIR_00P   *size_MatF];
-	f00Mdest = &DF[DIR_00M   *size_MatF];
-	fPP0dest = &DF[DIR_PP0  *size_MatF];
-	fMM0dest = &DF[DIR_MM0  *size_MatF];
-	fPM0dest = &DF[DIR_PM0  *size_MatF];
-	fMP0dest = &DF[DIR_MP0  *size_MatF];
-	fP0Pdest = &DF[DIR_P0P  *size_MatF];
-	fM0Mdest = &DF[DIR_M0M  *size_MatF];
-	fP0Mdest = &DF[DIR_P0M  *size_MatF];
-	fM0Pdest = &DF[DIR_M0P  *size_MatF];
-	f0PPdest = &DF[DIR_0PP  *size_MatF];
-	f0MMdest = &DF[DIR_0MM  *size_MatF];
-	f0PMdest = &DF[DIR_0PM  *size_MatF];
-	f0MPdest = &DF[DIR_0MP  *size_MatF];
-	f000dest = &DF[DIR_000*size_MatF];
-	fMMMdest = &DF[DIR_MMM *size_MatF];
-	fMMPdest = &DF[DIR_MMP *size_MatF];
-	fMPPdest = &DF[DIR_MPP *size_MatF];
-	fMPMdest = &DF[DIR_MPM *size_MatF];
-	fPPMdest = &DF[DIR_PPM *size_MatF];
-	fPPPdest = &DF[DIR_PPP *size_MatF];
-	fPMPdest = &DF[DIR_PMP *size_MatF];
-	fPMMdest = &DF[DIR_PMM *size_MatF];
+	fP00dest = &DF[DIR_P00 * numberOfLBnodesFine];
+	fM00dest = &DF[DIR_M00 * numberOfLBnodesFine];
+	f0P0dest = &DF[DIR_0P0 * numberOfLBnodesFine];
+	f0M0dest = &DF[DIR_0M0 * numberOfLBnodesFine];
+	f00Pdest = &DF[DIR_00P * numberOfLBnodesFine];
+	f00Mdest = &DF[DIR_00M * numberOfLBnodesFine];
+	fPP0dest = &DF[DIR_PP0 * numberOfLBnodesFine];
+	fMM0dest = &DF[DIR_MM0 * numberOfLBnodesFine];
+	fPM0dest = &DF[DIR_PM0 * numberOfLBnodesFine];
+	fMP0dest = &DF[DIR_MP0 * numberOfLBnodesFine];
+	fP0Pdest = &DF[DIR_P0P * numberOfLBnodesFine];
+	fM0Mdest = &DF[DIR_M0M * numberOfLBnodesFine];
+	fP0Mdest = &DF[DIR_P0M * numberOfLBnodesFine];
+	fM0Pdest = &DF[DIR_M0P * numberOfLBnodesFine];
+	f0PPdest = &DF[DIR_0PP * numberOfLBnodesFine];
+	f0MMdest = &DF[DIR_0MM * numberOfLBnodesFine];
+	f0PMdest = &DF[DIR_0PM * numberOfLBnodesFine];
+	f0MPdest = &DF[DIR_0MP * numberOfLBnodesFine];
+	f000dest = &DF[DIR_000 * numberOfLBnodesFine];
+	fMMMdest = &DF[DIR_MMM * numberOfLBnodesFine];
+	fMMPdest = &DF[DIR_MMP * numberOfLBnodesFine];
+	fMPPdest = &DF[DIR_MPP * numberOfLBnodesFine];
+	fMPMdest = &DF[DIR_MPM * numberOfLBnodesFine];
+	fPPMdest = &DF[DIR_PPM * numberOfLBnodesFine];
+	fPPPdest = &DF[DIR_PPP * numberOfLBnodesFine];
+	fPMPdest = &DF[DIR_PMP * numberOfLBnodesFine];
+	fPMMdest = &DF[DIR_PMM * numberOfLBnodesFine];
 
 	real
 		*fP00source, *fM00source, *f0P0source, *f0M0source, *f00Psource, *f00Msource, *fPP0source, *fMM0source, *fPM0source,
@@ -79,72 +79,72 @@ __global__ void scaleCF_comp_D3Q27F3_2018(real* DC,
 
 	if (isEvenTimestep == true)
 	{
-		fP00source = &DC[DIR_P00   *size_MatC];
-		fM00source = &DC[DIR_M00   *size_MatC];
-		f0P0source = &DC[DIR_0P0   *size_MatC];
-		f0M0source = &DC[DIR_0M0   *size_MatC];
-		f00Psource = &DC[DIR_00P   *size_MatC];
-		f00Msource = &DC[DIR_00M   *size_MatC];
-		fPP0source = &DC[DIR_PP0  *size_MatC];
-		fMM0source = &DC[DIR_MM0  *size_MatC];
-		fPM0source = &DC[DIR_PM0  *size_MatC];
-		fMP0source = &DC[DIR_MP0  *size_MatC];
-		fP0Psource = &DC[DIR_P0P  *size_MatC];
-		fM0Msource = &DC[DIR_M0M  *size_MatC];
-		fP0Msource = &DC[DIR_P0M  *size_MatC];
-		fM0Psource = &DC[DIR_M0P  *size_MatC];
-		f0PPsource = &DC[DIR_0PP  *size_MatC];
-		f0MMsource = &DC[DIR_0MM  *size_MatC];
-		f0PMsource = &DC[DIR_0PM  *size_MatC];
-		f0MPsource = &DC[DIR_0MP  *size_MatC];
-		f000source = &DC[DIR_000*size_MatC];
-		fMMMsource = &DC[DIR_MMM *size_MatC];
-		fMMPsource = &DC[DIR_MMP *size_MatC];
-		fMPPsource = &DC[DIR_MPP *size_MatC];
-		fMPMsource = &DC[DIR_MPM *size_MatC];
-		fPPMsource = &DC[DIR_PPM *size_MatC];
-		fPPPsource = &DC[DIR_PPP *size_MatC];
-		fPMPsource = &DC[DIR_PMP *size_MatC];
-		fPMMsource = &DC[DIR_PMM *size_MatC];
+		fP00source = &DC[DIR_P00 * numberOfLBnodesCoarse];
+		fM00source = &DC[DIR_M00 * numberOfLBnodesCoarse];
+		f0P0source = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+		f0M0source = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+		f00Psource = &DC[DIR_00P * numberOfLBnodesCoarse];
+		f00Msource = &DC[DIR_00M * numberOfLBnodesCoarse];
+		fPP0source = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+		fMM0source = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+		fPM0source = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+		fMP0source = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+		fP0Psource = &DC[DIR_P0P * numberOfLBnodesCoarse];
+		fM0Msource = &DC[DIR_M0M * numberOfLBnodesCoarse];
+		fP0Msource = &DC[DIR_P0M * numberOfLBnodesCoarse];
+		fM0Psource = &DC[DIR_M0P * numberOfLBnodesCoarse];
+		f0PPsource = &DC[DIR_0PP * numberOfLBnodesCoarse];
+		f0MMsource = &DC[DIR_0MM * numberOfLBnodesCoarse];
+		f0PMsource = &DC[DIR_0PM * numberOfLBnodesCoarse];
+		f0MPsource = &DC[DIR_0MP * numberOfLBnodesCoarse];
+		f000source = &DC[DIR_000 * numberOfLBnodesCoarse];
+		fMMMsource = &DC[DIR_MMM * numberOfLBnodesCoarse];
+		fMMPsource = &DC[DIR_MMP * numberOfLBnodesCoarse];
+		fMPPsource = &DC[DIR_MPP * numberOfLBnodesCoarse];
+		fMPMsource = &DC[DIR_MPM * numberOfLBnodesCoarse];
+		fPPMsource = &DC[DIR_PPM * numberOfLBnodesCoarse];
+		fPPPsource = &DC[DIR_PPP * numberOfLBnodesCoarse];
+		fPMPsource = &DC[DIR_PMP * numberOfLBnodesCoarse];
+		fPMMsource = &DC[DIR_PMM * numberOfLBnodesCoarse];
 	}
 	else
 	{
-		fP00source = &DC[DIR_M00   *size_MatC];
-		fM00source = &DC[DIR_P00   *size_MatC];
-		f0P0source = &DC[DIR_0M0   *size_MatC];
-		f0M0source = &DC[DIR_0P0   *size_MatC];
-		f00Psource = &DC[DIR_00M   *size_MatC];
-		f00Msource = &DC[DIR_00P   *size_MatC];
-		fPP0source = &DC[DIR_MM0  *size_MatC];
-		fMM0source = &DC[DIR_PP0  *size_MatC];
-		fPM0source = &DC[DIR_MP0  *size_MatC];
-		fMP0source = &DC[DIR_PM0  *size_MatC];
-		fP0Psource = &DC[DIR_M0M  *size_MatC];
-		fM0Msource = &DC[DIR_P0P  *size_MatC];
-		fP0Msource = &DC[DIR_M0P  *size_MatC];
-		fM0Psource = &DC[DIR_P0M  *size_MatC];
-		f0PPsource = &DC[DIR_0MM  *size_MatC];
-		f0MMsource = &DC[DIR_0PP  *size_MatC];
-		f0PMsource = &DC[DIR_0MP  *size_MatC];
-		f0MPsource = &DC[DIR_0PM  *size_MatC];
-		f000source = &DC[DIR_000*size_MatC];
-		fMMMsource = &DC[DIR_PPP *size_MatC];
-		fMMPsource = &DC[DIR_PPM *size_MatC];
-		fMPPsource = &DC[DIR_PMM *size_MatC];
-		fMPMsource = &DC[DIR_PMP *size_MatC];
-		fPPMsource = &DC[DIR_MMP *size_MatC];
-		fPPPsource = &DC[DIR_MMM *size_MatC];
-		fPMPsource = &DC[DIR_MPM *size_MatC];
-		fPMMsource = &DC[DIR_MPP *size_MatC];
+		fP00source = &DC[DIR_M00 * numberOfLBnodesCoarse];
+		fM00source = &DC[DIR_P00 * numberOfLBnodesCoarse];
+		f0P0source = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+		f0M0source = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+		f00Psource = &DC[DIR_00M * numberOfLBnodesCoarse];
+		f00Msource = &DC[DIR_00P * numberOfLBnodesCoarse];
+		fPP0source = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+		fMM0source = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+		fPM0source = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+		fMP0source = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+		fP0Psource = &DC[DIR_M0M * numberOfLBnodesCoarse];
+		fM0Msource = &DC[DIR_P0P * numberOfLBnodesCoarse];
+		fP0Msource = &DC[DIR_M0P * numberOfLBnodesCoarse];
+		fM0Psource = &DC[DIR_P0M * numberOfLBnodesCoarse];
+		f0PPsource = &DC[DIR_0MM * numberOfLBnodesCoarse];
+		f0MMsource = &DC[DIR_0PP * numberOfLBnodesCoarse];
+		f0PMsource = &DC[DIR_0MP * numberOfLBnodesCoarse];
+		f0MPsource = &DC[DIR_0PM * numberOfLBnodesCoarse];
+		f000source = &DC[DIR_000 * numberOfLBnodesCoarse];
+		fMMMsource = &DC[DIR_PPP * numberOfLBnodesCoarse];
+		fMMPsource = &DC[DIR_PPM * numberOfLBnodesCoarse];
+		fMPPsource = &DC[DIR_PMM * numberOfLBnodesCoarse];
+		fMPMsource = &DC[DIR_PMP * numberOfLBnodesCoarse];
+		fPPMsource = &DC[DIR_MMP * numberOfLBnodesCoarse];
+		fPPPsource = &DC[DIR_MMM * numberOfLBnodesCoarse];
+		fPMPsource = &DC[DIR_MPM * numberOfLBnodesCoarse];
+		fPMMsource = &DC[DIR_MPP * numberOfLBnodesCoarse];
 	}
 
 	Distributions6 G;
-	G.g[DIR_P00] = &G6[DIR_P00   *size_MatF];
-	G.g[DIR_M00] = &G6[DIR_M00   *size_MatF];
-	G.g[DIR_0P0] = &G6[DIR_0P0   *size_MatF];
-	G.g[DIR_0M0] = &G6[DIR_0M0   *size_MatF];
-	G.g[DIR_00P] = &G6[DIR_00P   *size_MatF];
-	G.g[DIR_00M] = &G6[DIR_00M   *size_MatF];
+	G.g[DIR_P00] = &G6[DIR_P00 * numberOfLBnodesFine];
+	G.g[DIR_M00] = &G6[DIR_M00 * numberOfLBnodesFine];
+	G.g[DIR_0P0] = &G6[DIR_0P0 * numberOfLBnodesFine];
+	G.g[DIR_0M0] = &G6[DIR_0M0 * numberOfLBnodesFine];
+	G.g[DIR_00P] = &G6[DIR_00P * numberOfLBnodesFine];
+	G.g[DIR_00M] = &G6[DIR_00M * numberOfLBnodesFine];
 
 	////////////////////////////////////////////////////////////////////////////////
    const unsigned  ix = threadIdx.x;  // Globaler x-Index 
@@ -4370,8 +4370,8 @@ __global__ void scaleCF_comp_D3Q27F3( real* DC,
 												 unsigned int* neighborFX,
 												 unsigned int* neighborFY,
 												 unsigned int* neighborFZ,
-												 unsigned int size_MatC, 
-												 unsigned int size_MatF, 
+												 unsigned long long numberOfLBnodesCoarse, 
+												 unsigned long long numberOfLBnodesFine, 
 												 bool isEvenTimestep,
 												 unsigned int* posCSWB, 
 												 unsigned int* posFSWB, 
@@ -4391,33 +4391,33 @@ __global__ void scaleCF_comp_D3Q27F3( real* DC,
 		*f000dest, *fMMMdest, *fMMPdest, *fMPPdest, *fMPMdest, *fPPMdest, *fPPPdest, *fPMPdest, *fPMMdest;
 
 
-	fP00dest = &DF[DIR_P00   *size_MatF];
-	fM00dest = &DF[DIR_M00   *size_MatF];
-	f0P0dest = &DF[DIR_0P0   *size_MatF];
-	f0M0dest = &DF[DIR_0M0   *size_MatF];
-	f00Pdest = &DF[DIR_00P   *size_MatF];
-	f00Mdest = &DF[DIR_00M   *size_MatF];
-	fPP0dest = &DF[DIR_PP0  *size_MatF];
-	fMM0dest = &DF[DIR_MM0  *size_MatF];
-	fPM0dest = &DF[DIR_PM0  *size_MatF];
-	fMP0dest = &DF[DIR_MP0  *size_MatF];
-	fP0Pdest = &DF[DIR_P0P  *size_MatF];
-	fM0Mdest = &DF[DIR_M0M  *size_MatF];
-	fP0Mdest = &DF[DIR_P0M  *size_MatF];
-	fM0Pdest = &DF[DIR_M0P  *size_MatF];
-	f0PPdest = &DF[DIR_0PP  *size_MatF];
-	f0MMdest = &DF[DIR_0MM  *size_MatF];
-	f0PMdest = &DF[DIR_0PM  *size_MatF];
-	f0MPdest = &DF[DIR_0MP  *size_MatF];
-	f000dest = &DF[DIR_000*size_MatF];
-	fMMMdest = &DF[DIR_MMM *size_MatF];
-	fMMPdest = &DF[DIR_MMP *size_MatF];
-	fMPPdest = &DF[DIR_MPP *size_MatF];
-	fMPMdest = &DF[DIR_MPM *size_MatF];
-	fPPMdest = &DF[DIR_PPM *size_MatF];
-	fPPPdest = &DF[DIR_PPP *size_MatF];
-	fPMPdest = &DF[DIR_PMP *size_MatF];
-	fPMMdest = &DF[DIR_PMM *size_MatF];
+	fP00dest = &DF[DIR_P00 * numberOfLBnodesFine];
+	fM00dest = &DF[DIR_M00 * numberOfLBnodesFine];
+	f0P0dest = &DF[DIR_0P0 * numberOfLBnodesFine];
+	f0M0dest = &DF[DIR_0M0 * numberOfLBnodesFine];
+	f00Pdest = &DF[DIR_00P * numberOfLBnodesFine];
+	f00Mdest = &DF[DIR_00M * numberOfLBnodesFine];
+	fPP0dest = &DF[DIR_PP0 * numberOfLBnodesFine];
+	fMM0dest = &DF[DIR_MM0 * numberOfLBnodesFine];
+	fPM0dest = &DF[DIR_PM0 * numberOfLBnodesFine];
+	fMP0dest = &DF[DIR_MP0 * numberOfLBnodesFine];
+	fP0Pdest = &DF[DIR_P0P * numberOfLBnodesFine];
+	fM0Mdest = &DF[DIR_M0M * numberOfLBnodesFine];
+	fP0Mdest = &DF[DIR_P0M * numberOfLBnodesFine];
+	fM0Pdest = &DF[DIR_M0P * numberOfLBnodesFine];
+	f0PPdest = &DF[DIR_0PP * numberOfLBnodesFine];
+	f0MMdest = &DF[DIR_0MM * numberOfLBnodesFine];
+	f0PMdest = &DF[DIR_0PM * numberOfLBnodesFine];
+	f0MPdest = &DF[DIR_0MP * numberOfLBnodesFine];
+	f000dest = &DF[DIR_000 * numberOfLBnodesFine];
+	fMMMdest = &DF[DIR_MMM * numberOfLBnodesFine];
+	fMMPdest = &DF[DIR_MMP * numberOfLBnodesFine];
+	fMPPdest = &DF[DIR_MPP * numberOfLBnodesFine];
+	fMPMdest = &DF[DIR_MPM * numberOfLBnodesFine];
+	fPPMdest = &DF[DIR_PPM * numberOfLBnodesFine];
+	fPPPdest = &DF[DIR_PPP * numberOfLBnodesFine];
+	fPMPdest = &DF[DIR_PMP * numberOfLBnodesFine];
+	fPMMdest = &DF[DIR_PMM * numberOfLBnodesFine];
 
 	real
 		*fP00source, *fM00source, *f0P0source, *f0M0source, *f00Psource, *f00Msource, *fPP0source, *fMM0source, *fPM0source,
@@ -4426,72 +4426,72 @@ __global__ void scaleCF_comp_D3Q27F3( real* DC,
 
 	if (isEvenTimestep == true)
 	{
-		fP00source = &DC[DIR_P00   *size_MatC];
-		fM00source = &DC[DIR_M00   *size_MatC];
-		f0P0source = &DC[DIR_0P0   *size_MatC];
-		f0M0source = &DC[DIR_0M0   *size_MatC];
-		f00Psource = &DC[DIR_00P   *size_MatC];
-		f00Msource = &DC[DIR_00M   *size_MatC];
-		fPP0source = &DC[DIR_PP0  *size_MatC];
-		fMM0source = &DC[DIR_MM0  *size_MatC];
-		fPM0source = &DC[DIR_PM0  *size_MatC];
-		fMP0source = &DC[DIR_MP0  *size_MatC];
-		fP0Psource = &DC[DIR_P0P  *size_MatC];
-		fM0Msource = &DC[DIR_M0M  *size_MatC];
-		fP0Msource = &DC[DIR_P0M  *size_MatC];
-		fM0Psource = &DC[DIR_M0P  *size_MatC];
-		f0PPsource = &DC[DIR_0PP  *size_MatC];
-		f0MMsource = &DC[DIR_0MM  *size_MatC];
-		f0PMsource = &DC[DIR_0PM  *size_MatC];
-		f0MPsource = &DC[DIR_0MP  *size_MatC];
-		f000source = &DC[DIR_000*size_MatC];
-		fMMMsource = &DC[DIR_MMM *size_MatC];
-		fMMPsource = &DC[DIR_MMP *size_MatC];
-		fMPPsource = &DC[DIR_MPP *size_MatC];
-		fMPMsource = &DC[DIR_MPM *size_MatC];
-		fPPMsource = &DC[DIR_PPM *size_MatC];
-		fPPPsource = &DC[DIR_PPP *size_MatC];
-		fPMPsource = &DC[DIR_PMP *size_MatC];
-		fPMMsource = &DC[DIR_PMM *size_MatC];
+		fP00source = &DC[DIR_P00 * numberOfLBnodesCoarse];
+		fM00source = &DC[DIR_M00 * numberOfLBnodesCoarse];
+		f0P0source = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+		f0M0source = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+		f00Psource = &DC[DIR_00P * numberOfLBnodesCoarse];
+		f00Msource = &DC[DIR_00M * numberOfLBnodesCoarse];
+		fPP0source = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+		fMM0source = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+		fPM0source = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+		fMP0source = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+		fP0Psource = &DC[DIR_P0P * numberOfLBnodesCoarse];
+		fM0Msource = &DC[DIR_M0M * numberOfLBnodesCoarse];
+		fP0Msource = &DC[DIR_P0M * numberOfLBnodesCoarse];
+		fM0Psource = &DC[DIR_M0P * numberOfLBnodesCoarse];
+		f0PPsource = &DC[DIR_0PP * numberOfLBnodesCoarse];
+		f0MMsource = &DC[DIR_0MM * numberOfLBnodesCoarse];
+		f0PMsource = &DC[DIR_0PM * numberOfLBnodesCoarse];
+		f0MPsource = &DC[DIR_0MP * numberOfLBnodesCoarse];
+		f000source = &DC[DIR_000 * numberOfLBnodesCoarse];
+		fMMMsource = &DC[DIR_MMM * numberOfLBnodesCoarse];
+		fMMPsource = &DC[DIR_MMP * numberOfLBnodesCoarse];
+		fMPPsource = &DC[DIR_MPP * numberOfLBnodesCoarse];
+		fMPMsource = &DC[DIR_MPM * numberOfLBnodesCoarse];
+		fPPMsource = &DC[DIR_PPM * numberOfLBnodesCoarse];
+		fPPPsource = &DC[DIR_PPP * numberOfLBnodesCoarse];
+		fPMPsource = &DC[DIR_PMP * numberOfLBnodesCoarse];
+		fPMMsource = &DC[DIR_PMM * numberOfLBnodesCoarse];
 	}
 	else
 	{
-		fP00source = &DC[DIR_M00   *size_MatC];
-		fM00source = &DC[DIR_P00   *size_MatC];
-		f0P0source = &DC[DIR_0M0   *size_MatC];
-		f0M0source = &DC[DIR_0P0   *size_MatC];
-		f00Psource = &DC[DIR_00M   *size_MatC];
-		f00Msource = &DC[DIR_00P   *size_MatC];
-		fPP0source = &DC[DIR_MM0  *size_MatC];
-		fMM0source = &DC[DIR_PP0  *size_MatC];
-		fPM0source = &DC[DIR_MP0  *size_MatC];
-		fMP0source = &DC[DIR_PM0  *size_MatC];
-		fP0Psource = &DC[DIR_M0M  *size_MatC];
-		fM0Msource = &DC[DIR_P0P  *size_MatC];
-		fP0Msource = &DC[DIR_M0P  *size_MatC];
-		fM0Psource = &DC[DIR_P0M  *size_MatC];
-		f0PPsource = &DC[DIR_0MM  *size_MatC];
-		f0MMsource = &DC[DIR_0PP  *size_MatC];
-		f0PMsource = &DC[DIR_0MP  *size_MatC];
-		f0MPsource = &DC[DIR_0PM  *size_MatC];
-		f000source = &DC[DIR_000*size_MatC];
-		fMMMsource = &DC[DIR_PPP *size_MatC];
-		fMMPsource = &DC[DIR_PPM *size_MatC];
-		fMPPsource = &DC[DIR_PMM *size_MatC];
-		fMPMsource = &DC[DIR_PMP *size_MatC];
-		fPPMsource = &DC[DIR_MMP *size_MatC];
-		fPPPsource = &DC[DIR_MMM *size_MatC];
-		fPMPsource = &DC[DIR_MPM *size_MatC];
-		fPMMsource = &DC[DIR_MPP *size_MatC];
+		fP00source = &DC[DIR_M00 * numberOfLBnodesCoarse];
+		fM00source = &DC[DIR_P00 * numberOfLBnodesCoarse];
+		f0P0source = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+		f0M0source = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+		f00Psource = &DC[DIR_00M * numberOfLBnodesCoarse];
+		f00Msource = &DC[DIR_00P * numberOfLBnodesCoarse];
+		fPP0source = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+		fMM0source = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+		fPM0source = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+		fMP0source = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+		fP0Psource = &DC[DIR_M0M * numberOfLBnodesCoarse];
+		fM0Msource = &DC[DIR_P0P * numberOfLBnodesCoarse];
+		fP0Msource = &DC[DIR_M0P * numberOfLBnodesCoarse];
+		fM0Psource = &DC[DIR_P0M * numberOfLBnodesCoarse];
+		f0PPsource = &DC[DIR_0MM * numberOfLBnodesCoarse];
+		f0MMsource = &DC[DIR_0PP * numberOfLBnodesCoarse];
+		f0PMsource = &DC[DIR_0MP * numberOfLBnodesCoarse];
+		f0MPsource = &DC[DIR_0PM * numberOfLBnodesCoarse];
+		f000source = &DC[DIR_000 * numberOfLBnodesCoarse];
+		fMMMsource = &DC[DIR_PPP * numberOfLBnodesCoarse];
+		fMMPsource = &DC[DIR_PPM * numberOfLBnodesCoarse];
+		fMPPsource = &DC[DIR_PMM * numberOfLBnodesCoarse];
+		fMPMsource = &DC[DIR_PMP * numberOfLBnodesCoarse];
+		fPPMsource = &DC[DIR_MMP * numberOfLBnodesCoarse];
+		fPPPsource = &DC[DIR_MMM * numberOfLBnodesCoarse];
+		fPMPsource = &DC[DIR_MPM * numberOfLBnodesCoarse];
+		fPMMsource = &DC[DIR_MPP * numberOfLBnodesCoarse];
 	}
 
 	Distributions6 G;
-	G.g[DIR_P00] = &G6[DIR_P00   *size_MatF];
-	G.g[DIR_M00] = &G6[DIR_M00   *size_MatF];
-	G.g[DIR_0P0] = &G6[DIR_0P0   *size_MatF];
-	G.g[DIR_0M0] = &G6[DIR_0M0   *size_MatF];
-	G.g[DIR_00P] = &G6[DIR_00P   *size_MatF];
-	G.g[DIR_00M] = &G6[DIR_00M   *size_MatF];
+	G.g[DIR_P00] = &G6[DIR_P00 * numberOfLBnodesFine];
+	G.g[DIR_M00] = &G6[DIR_M00 * numberOfLBnodesFine];
+	G.g[DIR_0P0] = &G6[DIR_0P0 * numberOfLBnodesFine];
+	G.g[DIR_0M0] = &G6[DIR_0M0 * numberOfLBnodesFine];
+	G.g[DIR_00P] = &G6[DIR_00P * numberOfLBnodesFine];
+	G.g[DIR_00M] = &G6[DIR_00M * numberOfLBnodesFine];
 
 	////////////////////////////////////////////////////////////////////////////////
    const unsigned  ix = threadIdx.x;  // Globaler x-Index 
diff --git a/src/gpu/VirtualFluids_GPU/GPU/GridScaling/ScaleFC27.cu b/src/gpu/VirtualFluids_GPU/GPU/GridScaling/ScaleFC27.cu
index f2a66876cf39e3519e22fc2b0e236514f05ce85a..b37ab44d81d15fbbde46c875c860acd7198b8041 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/GridScaling/ScaleFC27.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/GridScaling/ScaleFC27.cu
@@ -22,8 +22,8 @@ __global__ void scaleFC_0817_comp_27( real* DC,
 												 unsigned int* neighborFX,
 												 unsigned int* neighborFY,
 												 unsigned int* neighborFZ,
-												 unsigned int size_MatC, 
-												 unsigned int size_MatF, 
+												 unsigned long long numberOfLBnodesCoarse, 
+												 unsigned long long numberOfLBnodesFine, 
 												 bool isEvenTimestep,
 												 unsigned int* posC, 
 												 unsigned int* posFSWB, 
@@ -43,33 +43,33 @@ __global__ void scaleFC_0817_comp_27( real* DC,
 	   *f000source, *fMMMsource, *fMMPsource, *fMPPsource, *fMPMsource, *fPPMsource, *fPPPsource, *fPMPsource, *fPMMsource;
 
 
-   fP00source = &DF[DIR_P00   *size_MatF];
-   fM00source = &DF[DIR_M00   *size_MatF];
-   f0P0source = &DF[DIR_0P0   *size_MatF];
-   f0M0source = &DF[DIR_0M0   *size_MatF];
-   f00Psource = &DF[DIR_00P   *size_MatF];
-   f00Msource = &DF[DIR_00M   *size_MatF];
-   fPP0source = &DF[DIR_PP0  *size_MatF];
-   fMM0source = &DF[DIR_MM0  *size_MatF];
-   fPM0source = &DF[DIR_PM0  *size_MatF];
-   fMP0source = &DF[DIR_MP0  *size_MatF];
-   fP0Psource = &DF[DIR_P0P  *size_MatF];
-   fM0Msource = &DF[DIR_M0M  *size_MatF];
-   fP0Msource = &DF[DIR_P0M  *size_MatF];
-   fM0Psource = &DF[DIR_M0P  *size_MatF];
-   f0PPsource = &DF[DIR_0PP  *size_MatF];
-   f0MMsource = &DF[DIR_0MM  *size_MatF];
-   f0PMsource = &DF[DIR_0PM  *size_MatF];
-   f0MPsource = &DF[DIR_0MP  *size_MatF];
-   f000source = &DF[DIR_000*size_MatF];
-   fMMMsource = &DF[DIR_MMM *size_MatF];
-   fMMPsource = &DF[DIR_MMP *size_MatF];
-   fMPPsource = &DF[DIR_MPP *size_MatF];
-   fMPMsource = &DF[DIR_MPM *size_MatF];
-   fPPMsource = &DF[DIR_PPM *size_MatF];
-   fPPPsource = &DF[DIR_PPP *size_MatF];
-   fPMPsource = &DF[DIR_PMP *size_MatF];
-   fPMMsource = &DF[DIR_PMM *size_MatF];
+   fP00source = &DF[DIR_P00 * numberOfLBnodesFine];
+   fM00source = &DF[DIR_M00 * numberOfLBnodesFine];
+   f0P0source = &DF[DIR_0P0 * numberOfLBnodesFine];
+   f0M0source = &DF[DIR_0M0 * numberOfLBnodesFine];
+   f00Psource = &DF[DIR_00P * numberOfLBnodesFine];
+   f00Msource = &DF[DIR_00M * numberOfLBnodesFine];
+   fPP0source = &DF[DIR_PP0 * numberOfLBnodesFine];
+   fMM0source = &DF[DIR_MM0 * numberOfLBnodesFine];
+   fPM0source = &DF[DIR_PM0 * numberOfLBnodesFine];
+   fMP0source = &DF[DIR_MP0 * numberOfLBnodesFine];
+   fP0Psource = &DF[DIR_P0P * numberOfLBnodesFine];
+   fM0Msource = &DF[DIR_M0M * numberOfLBnodesFine];
+   fP0Msource = &DF[DIR_P0M * numberOfLBnodesFine];
+   fM0Psource = &DF[DIR_M0P * numberOfLBnodesFine];
+   f0PPsource = &DF[DIR_0PP * numberOfLBnodesFine];
+   f0MMsource = &DF[DIR_0MM * numberOfLBnodesFine];
+   f0PMsource = &DF[DIR_0PM * numberOfLBnodesFine];
+   f0MPsource = &DF[DIR_0MP * numberOfLBnodesFine];
+   f000source = &DF[DIR_000 * numberOfLBnodesFine];
+   fMMMsource = &DF[DIR_MMM * numberOfLBnodesFine];
+   fMMPsource = &DF[DIR_MMP * numberOfLBnodesFine];
+   fMPPsource = &DF[DIR_MPP * numberOfLBnodesFine];
+   fMPMsource = &DF[DIR_MPM * numberOfLBnodesFine];
+   fPPMsource = &DF[DIR_PPM * numberOfLBnodesFine];
+   fPPPsource = &DF[DIR_PPP * numberOfLBnodesFine];
+   fPMPsource = &DF[DIR_PMP * numberOfLBnodesFine];
+   fPMMsource = &DF[DIR_PMM * numberOfLBnodesFine];
 
    real
 	   *fP00dest, *fM00dest, *f0P0dest, *f0M0dest, *f00Pdest, *f00Mdest, *fPP0dest, *fMM0dest, *fPM0dest,
@@ -78,63 +78,63 @@ __global__ void scaleFC_0817_comp_27( real* DC,
 
    if (isEvenTimestep==true)
    {
-	   fP00dest = &DC[DIR_P00   *size_MatC];
-	   fM00dest = &DC[DIR_M00   *size_MatC];
-	   f0P0dest = &DC[DIR_0P0   *size_MatC];
-	   f0M0dest = &DC[DIR_0M0   *size_MatC];
-	   f00Pdest = &DC[DIR_00P   *size_MatC];
-	   f00Mdest = &DC[DIR_00M   *size_MatC];
-	   fPP0dest = &DC[DIR_PP0  *size_MatC];
-	   fMM0dest = &DC[DIR_MM0  *size_MatC];
-	   fPM0dest = &DC[DIR_PM0  *size_MatC];
-	   fMP0dest = &DC[DIR_MP0  *size_MatC];
-	   fP0Pdest = &DC[DIR_P0P  *size_MatC];
-	   fM0Mdest = &DC[DIR_M0M  *size_MatC];
-	   fP0Mdest = &DC[DIR_P0M  *size_MatC];
-	   fM0Pdest = &DC[DIR_M0P  *size_MatC];
-	   f0PPdest = &DC[DIR_0PP  *size_MatC];
-	   f0MMdest = &DC[DIR_0MM  *size_MatC];
-	   f0PMdest = &DC[DIR_0PM  *size_MatC];
-	   f0MPdest = &DC[DIR_0MP  *size_MatC];
-	   f000dest = &DC[DIR_000*size_MatC];
-	   fMMMdest = &DC[DIR_MMM *size_MatC];
-	   fMMPdest = &DC[DIR_MMP *size_MatC];
-	   fMPPdest = &DC[DIR_MPP *size_MatC];
-	   fMPMdest = &DC[DIR_MPM *size_MatC];
-	   fPPMdest = &DC[DIR_PPM *size_MatC];
-	   fPPPdest = &DC[DIR_PPP *size_MatC];
-	   fPMPdest = &DC[DIR_PMP *size_MatC];
-	   fPMMdest = &DC[DIR_PMM *size_MatC];
+	   fP00dest = &DC[DIR_P00 * numberOfLBnodesCoarse];
+	   fM00dest = &DC[DIR_M00 * numberOfLBnodesCoarse];
+	   f0P0dest = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+	   f0M0dest = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+	   f00Pdest = &DC[DIR_00P * numberOfLBnodesCoarse];
+	   f00Mdest = &DC[DIR_00M * numberOfLBnodesCoarse];
+	   fPP0dest = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+	   fMM0dest = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+	   fPM0dest = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+	   fMP0dest = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+	   fP0Pdest = &DC[DIR_P0P * numberOfLBnodesCoarse];
+	   fM0Mdest = &DC[DIR_M0M * numberOfLBnodesCoarse];
+	   fP0Mdest = &DC[DIR_P0M * numberOfLBnodesCoarse];
+	   fM0Pdest = &DC[DIR_M0P * numberOfLBnodesCoarse];
+	   f0PPdest = &DC[DIR_0PP * numberOfLBnodesCoarse];
+	   f0MMdest = &DC[DIR_0MM * numberOfLBnodesCoarse];
+	   f0PMdest = &DC[DIR_0PM * numberOfLBnodesCoarse];
+	   f0MPdest = &DC[DIR_0MP * numberOfLBnodesCoarse];
+	   f000dest = &DC[DIR_000 * numberOfLBnodesCoarse];
+	   fMMMdest = &DC[DIR_MMM * numberOfLBnodesCoarse];
+	   fMMPdest = &DC[DIR_MMP * numberOfLBnodesCoarse];
+	   fMPPdest = &DC[DIR_MPP * numberOfLBnodesCoarse];
+	   fMPMdest = &DC[DIR_MPM * numberOfLBnodesCoarse];
+	   fPPMdest = &DC[DIR_PPM * numberOfLBnodesCoarse];
+	   fPPPdest = &DC[DIR_PPP * numberOfLBnodesCoarse];
+	   fPMPdest = &DC[DIR_PMP * numberOfLBnodesCoarse];
+	   fPMMdest = &DC[DIR_PMM * numberOfLBnodesCoarse];
    } 
    else
    {
-	   fP00dest = &DC[DIR_M00   *size_MatC];
-	   fM00dest = &DC[DIR_P00   *size_MatC];
-	   f0P0dest = &DC[DIR_0M0   *size_MatC];
-	   f0M0dest = &DC[DIR_0P0   *size_MatC];
-	   f00Pdest = &DC[DIR_00M   *size_MatC];
-	   f00Mdest = &DC[DIR_00P   *size_MatC];
-	   fPP0dest = &DC[DIR_MM0  *size_MatC];
-	   fMM0dest = &DC[DIR_PP0  *size_MatC];
-	   fPM0dest = &DC[DIR_MP0  *size_MatC];
-	   fMP0dest = &DC[DIR_PM0  *size_MatC];
-	   fP0Pdest = &DC[DIR_M0M  *size_MatC];
-	   fM0Mdest = &DC[DIR_P0P  *size_MatC];
-	   fP0Mdest = &DC[DIR_M0P  *size_MatC];
-	   fM0Pdest = &DC[DIR_P0M  *size_MatC];
-	   f0PPdest = &DC[DIR_0MM  *size_MatC];
-	   f0MMdest = &DC[DIR_0PP  *size_MatC];
-	   f0PMdest = &DC[DIR_0MP  *size_MatC];
-	   f0MPdest = &DC[DIR_0PM  *size_MatC];
-	   f000dest = &DC[DIR_000*size_MatC];
-	   fMMMdest = &DC[DIR_PPP *size_MatC];
-	   fMMPdest = &DC[DIR_PPM *size_MatC];
-	   fMPPdest = &DC[DIR_PMM *size_MatC];
-	   fMPMdest = &DC[DIR_PMP *size_MatC];
-	   fPPMdest = &DC[DIR_MMP *size_MatC];
-	   fPPPdest = &DC[DIR_MMM *size_MatC];
-	   fPMPdest = &DC[DIR_MPM *size_MatC];
-	   fPMMdest = &DC[DIR_MPP *size_MatC];
+	   fP00dest = &DC[DIR_M00 * numberOfLBnodesCoarse];
+	   fM00dest = &DC[DIR_P00 * numberOfLBnodesCoarse];
+	   f0P0dest = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+	   f0M0dest = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+	   f00Pdest = &DC[DIR_00M * numberOfLBnodesCoarse];
+	   f00Mdest = &DC[DIR_00P * numberOfLBnodesCoarse];
+	   fPP0dest = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+	   fMM0dest = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+	   fPM0dest = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+	   fMP0dest = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+	   fP0Pdest = &DC[DIR_M0M * numberOfLBnodesCoarse];
+	   fM0Mdest = &DC[DIR_P0P * numberOfLBnodesCoarse];
+	   fP0Mdest = &DC[DIR_M0P * numberOfLBnodesCoarse];
+	   fM0Pdest = &DC[DIR_P0M * numberOfLBnodesCoarse];
+	   f0PPdest = &DC[DIR_0MM * numberOfLBnodesCoarse];
+	   f0MMdest = &DC[DIR_0PP * numberOfLBnodesCoarse];
+	   f0PMdest = &DC[DIR_0MP * numberOfLBnodesCoarse];
+	   f0MPdest = &DC[DIR_0PM * numberOfLBnodesCoarse];
+	   f000dest = &DC[DIR_000 * numberOfLBnodesCoarse];
+	   fMMMdest = &DC[DIR_PPP * numberOfLBnodesCoarse];
+	   fMMPdest = &DC[DIR_PPM * numberOfLBnodesCoarse];
+	   fMPPdest = &DC[DIR_PMM * numberOfLBnodesCoarse];
+	   fMPMdest = &DC[DIR_PMP * numberOfLBnodesCoarse];
+	   fPPMdest = &DC[DIR_MMP * numberOfLBnodesCoarse];
+	   fPPPdest = &DC[DIR_MMM * numberOfLBnodesCoarse];
+	   fPMPdest = &DC[DIR_MPM * numberOfLBnodesCoarse];
+	   fPMMdest = &DC[DIR_MPP * numberOfLBnodesCoarse];
    }
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  ix = threadIdx.x;  // Globaler x-Index 
@@ -1218,8 +1218,8 @@ __global__ void scaleFC_AA2016_comp_27(real* DC,
 												  unsigned int* neighborFX,
 												  unsigned int* neighborFY,
 												  unsigned int* neighborFZ,
-												  unsigned int size_MatC, 
-												  unsigned int size_MatF, 
+												  unsigned long long numberOfLBnodesCoarse, 
+												  unsigned long long numberOfLBnodesFine, 
 												  bool isEvenTimestep,
 												  unsigned int* posC, 
 												  unsigned int* posFSWB, 
@@ -1236,96 +1236,96 @@ __global__ void scaleFC_AA2016_comp_27(real* DC,
    real *feF, *fwF, *fnF, *fsF, *ftF, *fbF, *fneF, *fswF, *fseF, *fnwF, *fteF, *fbwF, *fbeF, *ftwF, *ftnF, *fbsF, *fbnF, *ftsF, *fzeroF, 
       *ftneF, *ftswF, *ftseF, *ftnwF, *fbneF, *fbswF, *fbseF, *fbnwF;
 
-   feF    = &DF[DIR_P00   *size_MatF];
-   fwF    = &DF[DIR_M00   *size_MatF];
-   fnF    = &DF[DIR_0P0   *size_MatF];
-   fsF    = &DF[DIR_0M0   *size_MatF];
-   ftF    = &DF[DIR_00P   *size_MatF];
-   fbF    = &DF[DIR_00M   *size_MatF];
-   fneF   = &DF[DIR_PP0  *size_MatF];
-   fswF   = &DF[DIR_MM0  *size_MatF];
-   fseF   = &DF[DIR_PM0  *size_MatF];
-   fnwF   = &DF[DIR_MP0  *size_MatF];
-   fteF   = &DF[DIR_P0P  *size_MatF];
-   fbwF   = &DF[DIR_M0M  *size_MatF];
-   fbeF   = &DF[DIR_P0M  *size_MatF];
-   ftwF   = &DF[DIR_M0P  *size_MatF];
-   ftnF   = &DF[DIR_0PP  *size_MatF];
-   fbsF   = &DF[DIR_0MM  *size_MatF];
-   fbnF   = &DF[DIR_0PM  *size_MatF];
-   ftsF   = &DF[DIR_0MP  *size_MatF];
-   fzeroF = &DF[DIR_000*size_MatF];
-   ftneF  = &DF[DIR_PPP *size_MatF];
-   ftswF  = &DF[DIR_MMP *size_MatF];
-   ftseF  = &DF[DIR_PMP *size_MatF];
-   ftnwF  = &DF[DIR_MPP *size_MatF];
-   fbneF  = &DF[DIR_PPM *size_MatF];
-   fbswF  = &DF[DIR_MMM *size_MatF];
-   fbseF  = &DF[DIR_PMM *size_MatF];
-   fbnwF  = &DF[DIR_MPM *size_MatF];
+   feF    = &DF[DIR_P00 * numberOfLBnodesFine];
+   fwF    = &DF[DIR_M00 * numberOfLBnodesFine];
+   fnF    = &DF[DIR_0P0 * numberOfLBnodesFine];
+   fsF    = &DF[DIR_0M0 * numberOfLBnodesFine];
+   ftF    = &DF[DIR_00P * numberOfLBnodesFine];
+   fbF    = &DF[DIR_00M * numberOfLBnodesFine];
+   fneF   = &DF[DIR_PP0 * numberOfLBnodesFine];
+   fswF   = &DF[DIR_MM0 * numberOfLBnodesFine];
+   fseF   = &DF[DIR_PM0 * numberOfLBnodesFine];
+   fnwF   = &DF[DIR_MP0 * numberOfLBnodesFine];
+   fteF   = &DF[DIR_P0P * numberOfLBnodesFine];
+   fbwF   = &DF[DIR_M0M * numberOfLBnodesFine];
+   fbeF   = &DF[DIR_P0M * numberOfLBnodesFine];
+   ftwF   = &DF[DIR_M0P * numberOfLBnodesFine];
+   ftnF   = &DF[DIR_0PP * numberOfLBnodesFine];
+   fbsF   = &DF[DIR_0MM * numberOfLBnodesFine];
+   fbnF   = &DF[DIR_0PM * numberOfLBnodesFine];
+   ftsF   = &DF[DIR_0MP * numberOfLBnodesFine];
+   fzeroF = &DF[DIR_000 * numberOfLBnodesFine];
+   ftneF  = &DF[DIR_PPP * numberOfLBnodesFine];
+   ftswF  = &DF[DIR_MMP * numberOfLBnodesFine];
+   ftseF  = &DF[DIR_PMP * numberOfLBnodesFine];
+   ftnwF  = &DF[DIR_MPP * numberOfLBnodesFine];
+   fbneF  = &DF[DIR_PPM * numberOfLBnodesFine];
+   fbswF  = &DF[DIR_MMM * numberOfLBnodesFine];
+   fbseF  = &DF[DIR_PMM * numberOfLBnodesFine];
+   fbnwF  = &DF[DIR_MPM * numberOfLBnodesFine];
 
    real *feC, *fwC, *fnC, *fsC, *ftC, *fbC, *fneC, *fswC, *fseC, *fnwC, *fteC, *fbwC, *fbeC, *ftwC, *ftnC, *fbsC, *fbnC, *ftsC, *fzeroC,
       *ftneC, *ftswC, *ftseC, *ftnwC, *fbneC, *fbswC, *fbseC, *fbnwC;
 
    if (isEvenTimestep==true)
    {
-      feC    = &DC[DIR_P00   *size_MatC];
-      fwC    = &DC[DIR_M00   *size_MatC];
-      fnC    = &DC[DIR_0P0   *size_MatC];
-      fsC    = &DC[DIR_0M0   *size_MatC];
-      ftC    = &DC[DIR_00P   *size_MatC];
-      fbC    = &DC[DIR_00M   *size_MatC];
-      fneC   = &DC[DIR_PP0  *size_MatC];
-      fswC   = &DC[DIR_MM0  *size_MatC];
-      fseC   = &DC[DIR_PM0  *size_MatC];
-      fnwC   = &DC[DIR_MP0  *size_MatC];
-      fteC   = &DC[DIR_P0P  *size_MatC];
-      fbwC   = &DC[DIR_M0M  *size_MatC];
-      fbeC   = &DC[DIR_P0M  *size_MatC];
-      ftwC   = &DC[DIR_M0P  *size_MatC];
-      ftnC   = &DC[DIR_0PP  *size_MatC];
-      fbsC   = &DC[DIR_0MM  *size_MatC];
-      fbnC   = &DC[DIR_0PM  *size_MatC];
-      ftsC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      ftneC  = &DC[DIR_PPP *size_MatC];
-      ftswC  = &DC[DIR_MMP *size_MatC];
-      ftseC  = &DC[DIR_PMP *size_MatC];
-      ftnwC  = &DC[DIR_MPP *size_MatC];
-      fbneC  = &DC[DIR_PPM *size_MatC];
-      fbswC  = &DC[DIR_MMM *size_MatC];
-      fbseC  = &DC[DIR_PMM *size_MatC];
-      fbnwC  = &DC[DIR_MPM *size_MatC];
+      feC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      fwC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    } 
    else
    {
-      fwC    = &DC[DIR_P00   *size_MatC];
-      feC    = &DC[DIR_M00   *size_MatC];
-      fsC    = &DC[DIR_0P0   *size_MatC];
-      fnC    = &DC[DIR_0M0   *size_MatC];
-      fbC    = &DC[DIR_00P   *size_MatC];
-      ftC    = &DC[DIR_00M   *size_MatC];
-      fswC   = &DC[DIR_PP0  *size_MatC];
-      fneC   = &DC[DIR_MM0  *size_MatC];
-      fnwC   = &DC[DIR_PM0  *size_MatC];
-      fseC   = &DC[DIR_MP0  *size_MatC];
-      fbwC   = &DC[DIR_P0P  *size_MatC];
-      fteC   = &DC[DIR_M0M  *size_MatC];
-      ftwC   = &DC[DIR_P0M  *size_MatC];
-      fbeC   = &DC[DIR_M0P  *size_MatC];
-      fbsC   = &DC[DIR_0PP  *size_MatC];
-      ftnC   = &DC[DIR_0MM  *size_MatC];
-      ftsC   = &DC[DIR_0PM  *size_MatC];
-      fbnC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      fbswC  = &DC[DIR_PPP *size_MatC];
-      fbneC  = &DC[DIR_MMP *size_MatC];
-      fbnwC  = &DC[DIR_PMP *size_MatC];
-      fbseC  = &DC[DIR_MPP *size_MatC];
-      ftswC  = &DC[DIR_PPM *size_MatC];
-      ftneC  = &DC[DIR_MMM *size_MatC];
-      ftnwC  = &DC[DIR_PMM *size_MatC];
-      ftseC  = &DC[DIR_MPM *size_MatC];
+      fwC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      feC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    }
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  ix = threadIdx.x;  // Globaler x-Index 
@@ -5407,8 +5407,8 @@ __global__ void scaleFC_RhoSq_3rdMom_comp_27(real* DC,
 														unsigned int* neighborFX,
 														unsigned int* neighborFY,
 														unsigned int* neighborFZ,
-														unsigned int size_MatC, 
-														unsigned int size_MatF, 
+														unsigned long long numberOfLBnodesCoarse, 
+														unsigned long long numberOfLBnodesFine, 
 														bool isEvenTimestep,
 														unsigned int* posC, 
 														unsigned int* posFSWB, 
@@ -5425,96 +5425,96 @@ __global__ void scaleFC_RhoSq_3rdMom_comp_27(real* DC,
    real *feF, *fwF, *fnF, *fsF, *ftF, *fbF, *fneF, *fswF, *fseF, *fnwF, *fteF, *fbwF, *fbeF, *ftwF, *ftnF, *fbsF, *fbnF, *ftsF, *fzeroF, 
       *ftneF, *ftswF, *ftseF, *ftnwF, *fbneF, *fbswF, *fbseF, *fbnwF;
 
-   feF    = &DF[DIR_P00   *size_MatF];
-   fwF    = &DF[DIR_M00   *size_MatF];
-   fnF    = &DF[DIR_0P0   *size_MatF];
-   fsF    = &DF[DIR_0M0   *size_MatF];
-   ftF    = &DF[DIR_00P   *size_MatF];
-   fbF    = &DF[DIR_00M   *size_MatF];
-   fneF   = &DF[DIR_PP0  *size_MatF];
-   fswF   = &DF[DIR_MM0  *size_MatF];
-   fseF   = &DF[DIR_PM0  *size_MatF];
-   fnwF   = &DF[DIR_MP0  *size_MatF];
-   fteF   = &DF[DIR_P0P  *size_MatF];
-   fbwF   = &DF[DIR_M0M  *size_MatF];
-   fbeF   = &DF[DIR_P0M  *size_MatF];
-   ftwF   = &DF[DIR_M0P  *size_MatF];
-   ftnF   = &DF[DIR_0PP  *size_MatF];
-   fbsF   = &DF[DIR_0MM  *size_MatF];
-   fbnF   = &DF[DIR_0PM  *size_MatF];
-   ftsF   = &DF[DIR_0MP  *size_MatF];
-   fzeroF = &DF[DIR_000*size_MatF];
-   ftneF  = &DF[DIR_PPP *size_MatF];
-   ftswF  = &DF[DIR_MMP *size_MatF];
-   ftseF  = &DF[DIR_PMP *size_MatF];
-   ftnwF  = &DF[DIR_MPP *size_MatF];
-   fbneF  = &DF[DIR_PPM *size_MatF];
-   fbswF  = &DF[DIR_MMM *size_MatF];
-   fbseF  = &DF[DIR_PMM *size_MatF];
-   fbnwF  = &DF[DIR_MPM *size_MatF];
+   feF    = &DF[DIR_P00 * numberOfLBnodesFine];
+   fwF    = &DF[DIR_M00 * numberOfLBnodesFine];
+   fnF    = &DF[DIR_0P0 * numberOfLBnodesFine];
+   fsF    = &DF[DIR_0M0 * numberOfLBnodesFine];
+   ftF    = &DF[DIR_00P * numberOfLBnodesFine];
+   fbF    = &DF[DIR_00M * numberOfLBnodesFine];
+   fneF   = &DF[DIR_PP0 * numberOfLBnodesFine];
+   fswF   = &DF[DIR_MM0 * numberOfLBnodesFine];
+   fseF   = &DF[DIR_PM0 * numberOfLBnodesFine];
+   fnwF   = &DF[DIR_MP0 * numberOfLBnodesFine];
+   fteF   = &DF[DIR_P0P * numberOfLBnodesFine];
+   fbwF   = &DF[DIR_M0M * numberOfLBnodesFine];
+   fbeF   = &DF[DIR_P0M * numberOfLBnodesFine];
+   ftwF   = &DF[DIR_M0P * numberOfLBnodesFine];
+   ftnF   = &DF[DIR_0PP * numberOfLBnodesFine];
+   fbsF   = &DF[DIR_0MM * numberOfLBnodesFine];
+   fbnF   = &DF[DIR_0PM * numberOfLBnodesFine];
+   ftsF   = &DF[DIR_0MP * numberOfLBnodesFine];
+   fzeroF = &DF[DIR_000 * numberOfLBnodesFine];
+   ftneF  = &DF[DIR_PPP * numberOfLBnodesFine];
+   ftswF  = &DF[DIR_MMP * numberOfLBnodesFine];
+   ftseF  = &DF[DIR_PMP * numberOfLBnodesFine];
+   ftnwF  = &DF[DIR_MPP * numberOfLBnodesFine];
+   fbneF  = &DF[DIR_PPM * numberOfLBnodesFine];
+   fbswF  = &DF[DIR_MMM * numberOfLBnodesFine];
+   fbseF  = &DF[DIR_PMM * numberOfLBnodesFine];
+   fbnwF  = &DF[DIR_MPM * numberOfLBnodesFine];
 
    real *feC, *fwC, *fnC, *fsC, *ftC, *fbC, *fneC, *fswC, *fseC, *fnwC, *fteC, *fbwC, *fbeC, *ftwC, *ftnC, *fbsC, *fbnC, *ftsC, *fzeroC,
       *ftneC, *ftswC, *ftseC, *ftnwC, *fbneC, *fbswC, *fbseC, *fbnwC;
 
    if (isEvenTimestep==true)
    {
-      feC    = &DC[DIR_P00   *size_MatC];
-      fwC    = &DC[DIR_M00   *size_MatC];
-      fnC    = &DC[DIR_0P0   *size_MatC];
-      fsC    = &DC[DIR_0M0   *size_MatC];
-      ftC    = &DC[DIR_00P   *size_MatC];
-      fbC    = &DC[DIR_00M   *size_MatC];
-      fneC   = &DC[DIR_PP0  *size_MatC];
-      fswC   = &DC[DIR_MM0  *size_MatC];
-      fseC   = &DC[DIR_PM0  *size_MatC];
-      fnwC   = &DC[DIR_MP0  *size_MatC];
-      fteC   = &DC[DIR_P0P  *size_MatC];
-      fbwC   = &DC[DIR_M0M  *size_MatC];
-      fbeC   = &DC[DIR_P0M  *size_MatC];
-      ftwC   = &DC[DIR_M0P  *size_MatC];
-      ftnC   = &DC[DIR_0PP  *size_MatC];
-      fbsC   = &DC[DIR_0MM  *size_MatC];
-      fbnC   = &DC[DIR_0PM  *size_MatC];
-      ftsC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      ftneC  = &DC[DIR_PPP *size_MatC];
-      ftswC  = &DC[DIR_MMP *size_MatC];
-      ftseC  = &DC[DIR_PMP *size_MatC];
-      ftnwC  = &DC[DIR_MPP *size_MatC];
-      fbneC  = &DC[DIR_PPM *size_MatC];
-      fbswC  = &DC[DIR_MMM *size_MatC];
-      fbseC  = &DC[DIR_PMM *size_MatC];
-      fbnwC  = &DC[DIR_MPM *size_MatC];
+      feC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      fwC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    } 
    else
    {
-      fwC    = &DC[DIR_P00   *size_MatC];
-      feC    = &DC[DIR_M00   *size_MatC];
-      fsC    = &DC[DIR_0P0   *size_MatC];
-      fnC    = &DC[DIR_0M0   *size_MatC];
-      fbC    = &DC[DIR_00P   *size_MatC];
-      ftC    = &DC[DIR_00M   *size_MatC];
-      fswC   = &DC[DIR_PP0  *size_MatC];
-      fneC   = &DC[DIR_MM0  *size_MatC];
-      fnwC   = &DC[DIR_PM0  *size_MatC];
-      fseC   = &DC[DIR_MP0  *size_MatC];
-      fbwC   = &DC[DIR_P0P  *size_MatC];
-      fteC   = &DC[DIR_M0M  *size_MatC];
-      ftwC   = &DC[DIR_P0M  *size_MatC];
-      fbeC   = &DC[DIR_M0P  *size_MatC];
-      fbsC   = &DC[DIR_0PP  *size_MatC];
-      ftnC   = &DC[DIR_0MM  *size_MatC];
-      ftsC   = &DC[DIR_0PM  *size_MatC];
-      fbnC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      fbswC  = &DC[DIR_PPP *size_MatC];
-      fbneC  = &DC[DIR_MMP *size_MatC];
-      fbnwC  = &DC[DIR_PMP *size_MatC];
-      fbseC  = &DC[DIR_MPP *size_MatC];
-      ftswC  = &DC[DIR_PPM *size_MatC];
-      ftneC  = &DC[DIR_MMM *size_MatC];
-      ftnwC  = &DC[DIR_PMM *size_MatC];
-      ftseC  = &DC[DIR_MPM *size_MatC];
+      fwC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      feC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    }
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  ix = threadIdx.x;  // Globaler x-Index 
@@ -9587,103 +9587,120 @@ __global__ void scaleFC_RhoSq_3rdMom_comp_27(real* DC,
 
 
 //////////////////////////////////////////////////////////////////////////
-__device__ void scaleFC_RhoSq_comp_27_Calculation(real *DC, real *DF, unsigned int *neighborCX, unsigned int *neighborCY,
-                                                  unsigned int *neighborCZ, unsigned int *neighborFX, unsigned int *neighborFY,
-                                                  unsigned int *neighborFZ, unsigned int size_MatC, unsigned int size_MatF,
-                                                  bool isEvenTimestep, unsigned int *posC, unsigned int *posFSWB, unsigned int kFC,
-                                                  real omCoarse, real omFine, real nu, unsigned int nxC, unsigned int nyC,
-                                                  unsigned int nxF, unsigned int nyF, OffFC offFC, const unsigned k)
+__device__ void scaleFC_RhoSq_comp_27_Calculation(
+    real *DC, real *DF, 
+    unsigned int *neighborCX,
+    unsigned int *neighborCY,
+    unsigned int *neighborCZ,
+    unsigned int *neighborFX,
+    unsigned int *neighborFY,
+    unsigned int *neighborFZ,
+    unsigned long long numberOfLBnodesCoarse,
+    unsigned long long numberOfLBnodesFine,
+    bool isEvenTimestep,
+    unsigned int *posC,
+    unsigned int *posFSWB,
+    unsigned int kFC,
+    real omCoarse,
+    real omFine,
+    real nu,
+    unsigned int nxC,
+    unsigned int nyC,
+    unsigned int nxF,
+    unsigned int nyF,
+    OffFC offFC,
+    const unsigned k)
 {
     real *feF, *fwF, *fnF, *fsF, *ftF, *fbF, *fneF, *fswF, *fseF, *fnwF, *fteF, *fbwF, *fbeF, *ftwF, *ftnF, *fbsF,
         *fbnF, *ftsF, *fzeroF, *ftneF, *ftswF, *ftseF, *ftnwF, *fbneF, *fbswF, *fbseF, *fbnwF;
 
-    feF    = &DF[DIR_P00 * size_MatF];
-    fwF    = &DF[DIR_M00 * size_MatF];
-    fnF    = &DF[DIR_0P0 * size_MatF];
-    fsF    = &DF[DIR_0M0 * size_MatF];
-    ftF    = &DF[DIR_00P * size_MatF];
-    fbF    = &DF[DIR_00M * size_MatF];
-    fneF   = &DF[DIR_PP0 * size_MatF];
-    fswF   = &DF[DIR_MM0 * size_MatF];
-    fseF   = &DF[DIR_PM0 * size_MatF];
-    fnwF   = &DF[DIR_MP0 * size_MatF];
-    fteF   = &DF[DIR_P0P * size_MatF];
-    fbwF   = &DF[DIR_M0M * size_MatF];
-    fbeF   = &DF[DIR_P0M * size_MatF];
-    ftwF   = &DF[DIR_M0P * size_MatF];
-    ftnF   = &DF[DIR_0PP * size_MatF];
-    fbsF   = &DF[DIR_0MM * size_MatF];
-    fbnF   = &DF[DIR_0PM * size_MatF];
-    ftsF   = &DF[DIR_0MP * size_MatF];
-    fzeroF = &DF[DIR_000 * size_MatF];
-    ftneF  = &DF[DIR_PPP * size_MatF];
-    ftswF  = &DF[DIR_MMP * size_MatF];
-    ftseF  = &DF[DIR_PMP * size_MatF];
-    ftnwF  = &DF[DIR_MPP * size_MatF];
-    fbneF  = &DF[DIR_PPM * size_MatF];
-    fbswF  = &DF[DIR_MMM * size_MatF];
-    fbseF  = &DF[DIR_PMM * size_MatF];
-    fbnwF  = &DF[DIR_MPM * size_MatF];
+    feF    = &DF[DIR_P00 * numberOfLBnodesFine];
+    fwF    = &DF[DIR_M00 * numberOfLBnodesFine];
+    fnF    = &DF[DIR_0P0 * numberOfLBnodesFine];
+    fsF    = &DF[DIR_0M0 * numberOfLBnodesFine];
+    ftF    = &DF[DIR_00P * numberOfLBnodesFine];
+    fbF    = &DF[DIR_00M * numberOfLBnodesFine];
+    fneF   = &DF[DIR_PP0 * numberOfLBnodesFine];
+    fswF   = &DF[DIR_MM0 * numberOfLBnodesFine];
+    fseF   = &DF[DIR_PM0 * numberOfLBnodesFine];
+    fnwF   = &DF[DIR_MP0 * numberOfLBnodesFine];
+    fteF   = &DF[DIR_P0P * numberOfLBnodesFine];
+    fbwF   = &DF[DIR_M0M * numberOfLBnodesFine];
+    fbeF   = &DF[DIR_P0M * numberOfLBnodesFine];
+    ftwF   = &DF[DIR_M0P * numberOfLBnodesFine];
+    ftnF   = &DF[DIR_0PP * numberOfLBnodesFine];
+    fbsF   = &DF[DIR_0MM * numberOfLBnodesFine];
+    fbnF   = &DF[DIR_0PM * numberOfLBnodesFine];
+    ftsF   = &DF[DIR_0MP * numberOfLBnodesFine];
+    fzeroF = &DF[DIR_000 * numberOfLBnodesFine];
+    ftneF  = &DF[DIR_PPP * numberOfLBnodesFine];
+    ftswF  = &DF[DIR_MMP * numberOfLBnodesFine];
+    ftseF  = &DF[DIR_PMP * numberOfLBnodesFine];
+    ftnwF  = &DF[DIR_MPP * numberOfLBnodesFine];
+    fbneF  = &DF[DIR_PPM * numberOfLBnodesFine];
+    fbswF  = &DF[DIR_MMM * numberOfLBnodesFine];
+    fbseF  = &DF[DIR_PMM * numberOfLBnodesFine];
+    fbnwF  = &DF[DIR_MPM * numberOfLBnodesFine];
 
     real *feC, *fwC, *fnC, *fsC, *ftC, *fbC, *fneC, *fswC, *fseC, *fnwC, *fteC, *fbwC, *fbeC, *ftwC, *ftnC, *fbsC,
         *fbnC, *ftsC, *fzeroC, *ftneC, *ftswC, *ftseC, *ftnwC, *fbneC, *fbswC, *fbseC, *fbnwC;
 
     if (isEvenTimestep == true) {
-        feC    = &DC[DIR_P00 * size_MatC];
-        fwC    = &DC[DIR_M00 * size_MatC];
-        fnC    = &DC[DIR_0P0 * size_MatC];
-        fsC    = &DC[DIR_0M0 * size_MatC];
-        ftC    = &DC[DIR_00P * size_MatC];
-        fbC    = &DC[DIR_00M * size_MatC];
-        fneC   = &DC[DIR_PP0 * size_MatC];
-        fswC   = &DC[DIR_MM0 * size_MatC];
-        fseC   = &DC[DIR_PM0 * size_MatC];
-        fnwC   = &DC[DIR_MP0 * size_MatC];
-        fteC   = &DC[DIR_P0P * size_MatC];
-        fbwC   = &DC[DIR_M0M * size_MatC];
-        fbeC   = &DC[DIR_P0M * size_MatC];
-        ftwC   = &DC[DIR_M0P * size_MatC];
-        ftnC   = &DC[DIR_0PP * size_MatC];
-        fbsC   = &DC[DIR_0MM * size_MatC];
-        fbnC   = &DC[DIR_0PM * size_MatC];
-        ftsC   = &DC[DIR_0MP * size_MatC];
-        fzeroC = &DC[DIR_000 * size_MatC];
-        ftneC  = &DC[DIR_PPP * size_MatC];
-        ftswC  = &DC[DIR_MMP * size_MatC];
-        ftseC  = &DC[DIR_PMP * size_MatC];
-        ftnwC  = &DC[DIR_MPP * size_MatC];
-        fbneC  = &DC[DIR_PPM * size_MatC];
-        fbswC  = &DC[DIR_MMM * size_MatC];
-        fbseC  = &DC[DIR_PMM * size_MatC];
-        fbnwC  = &DC[DIR_MPM * size_MatC];
+        feC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+        fwC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+        fnC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+        fsC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+        ftC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+        fbC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+        fneC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+        fswC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+        fseC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+        fnwC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+        fteC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+        fbwC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+        fbeC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+        ftwC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+        ftnC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+        fbsC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+        fbnC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+        ftsC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+        fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+        ftneC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+        ftswC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+        ftseC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+        ftnwC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+        fbneC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+        fbswC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+        fbseC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+        fbnwC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
     } else {
-        fwC    = &DC[DIR_P00 * size_MatC];
-        feC    = &DC[DIR_M00 * size_MatC];
-        fsC    = &DC[DIR_0P0 * size_MatC];
-        fnC    = &DC[DIR_0M0 * size_MatC];
-        fbC    = &DC[DIR_00P * size_MatC];
-        ftC    = &DC[DIR_00M * size_MatC];
-        fswC   = &DC[DIR_PP0 * size_MatC];
-        fneC   = &DC[DIR_MM0 * size_MatC];
-        fnwC   = &DC[DIR_PM0 * size_MatC];
-        fseC   = &DC[DIR_MP0 * size_MatC];
-        fbwC   = &DC[DIR_P0P * size_MatC];
-        fteC   = &DC[DIR_M0M * size_MatC];
-        ftwC   = &DC[DIR_P0M * size_MatC];
-        fbeC   = &DC[DIR_M0P * size_MatC];
-        fbsC   = &DC[DIR_0PP * size_MatC];
-        ftnC   = &DC[DIR_0MM * size_MatC];
-        ftsC   = &DC[DIR_0PM * size_MatC];
-        fbnC   = &DC[DIR_0MP * size_MatC];
-        fzeroC = &DC[DIR_000 * size_MatC];
-        fbswC  = &DC[DIR_PPP * size_MatC];
-        fbneC  = &DC[DIR_MMP * size_MatC];
-        fbnwC  = &DC[DIR_PMP * size_MatC];
-        fbseC  = &DC[DIR_MPP * size_MatC];
-        ftswC  = &DC[DIR_PPM * size_MatC];
-        ftneC  = &DC[DIR_MMM * size_MatC];
-        ftnwC  = &DC[DIR_PMM * size_MatC];
-        ftseC  = &DC[DIR_MPM * size_MatC];
+        fwC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+        feC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+        fsC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+        fnC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+        fbC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+        ftC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+        fswC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+        fneC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+        fnwC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+        fseC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+        fbwC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+        fteC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+        ftwC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+        fbeC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+        fbsC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+        ftnC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+        ftsC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+        fbnC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+        fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+        fbswC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+        fbneC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+        fbnwC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+        fbseC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+        ftswC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+        ftneC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+        ftnwC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+        ftseC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
     }
 
     ////////////////////////////////////////////////////////////////////////////////
@@ -11064,8 +11081,8 @@ __global__ void scaleFC_RhoSq_comp_27(real* DC,
 												 unsigned int* neighborFX,
 												 unsigned int* neighborFY,
 												 unsigned int* neighborFZ,
-												 unsigned int size_MatC, 
-												 unsigned int size_MatF, 
+												 unsigned long long numberOfLBnodesCoarse, 
+												 unsigned long long numberOfLBnodesFine, 
 												 bool isEvenTimestep,
 												 unsigned int* posC, 
 												 unsigned int* posFSWB, 
@@ -11091,7 +11108,7 @@ __global__ void scaleFC_RhoSq_comp_27(real* DC,
    //////////////////////////////////////////////////////////////////////////
 
    scaleFC_RhoSq_comp_27_Calculation(DC, DF, neighborCX, neighborCY, neighborCZ, neighborFX, neighborFY, neighborFZ,
-                                     size_MatC, size_MatF, isEvenTimestep, posC, posFSWB, kFC, omCoarse, omFine, nu, nxC,
+                                     numberOfLBnodesCoarse, numberOfLBnodesFine, isEvenTimestep, posC, posFSWB, kFC, omCoarse, omFine, nu, nxC,
                                      nyC, nxF, nyF, offFC, k);
 }
 
@@ -11157,8 +11174,8 @@ __global__ void scaleFC_staggered_time_comp_27(   real* DC,
 															 unsigned int* neighborFX,
 															 unsigned int* neighborFY,
 															 unsigned int* neighborFZ,
-															 unsigned int size_MatC, 
-															 unsigned int size_MatF, 
+															 unsigned long long numberOfLBnodesCoarse, 
+															 unsigned long long numberOfLBnodesFine, 
 															 bool isEvenTimestep,
 															 unsigned int* posC, 
 															 unsigned int* posFSWB, 
@@ -11175,96 +11192,96 @@ __global__ void scaleFC_staggered_time_comp_27(   real* DC,
    real *feF, *fwF, *fnF, *fsF, *ftF, *fbF, *fneF, *fswF, *fseF, *fnwF, *fteF, *fbwF, *fbeF, *ftwF, *ftnF, *fbsF, *fbnF, *ftsF, *fzeroF, 
       *ftneF, *ftswF, *ftseF, *ftnwF, *fbneF, *fbswF, *fbseF, *fbnwF;
 
-   feF    = &DF[DIR_P00   *size_MatF];
-   fwF    = &DF[DIR_M00   *size_MatF];
-   fnF    = &DF[DIR_0P0   *size_MatF];
-   fsF    = &DF[DIR_0M0   *size_MatF];
-   ftF    = &DF[DIR_00P   *size_MatF];
-   fbF    = &DF[DIR_00M   *size_MatF];
-   fneF   = &DF[DIR_PP0  *size_MatF];
-   fswF   = &DF[DIR_MM0  *size_MatF];
-   fseF   = &DF[DIR_PM0  *size_MatF];
-   fnwF   = &DF[DIR_MP0  *size_MatF];
-   fteF   = &DF[DIR_P0P  *size_MatF];
-   fbwF   = &DF[DIR_M0M  *size_MatF];
-   fbeF   = &DF[DIR_P0M  *size_MatF];
-   ftwF   = &DF[DIR_M0P  *size_MatF];
-   ftnF   = &DF[DIR_0PP  *size_MatF];
-   fbsF   = &DF[DIR_0MM  *size_MatF];
-   fbnF   = &DF[DIR_0PM  *size_MatF];
-   ftsF   = &DF[DIR_0MP  *size_MatF];
-   fzeroF = &DF[DIR_000*size_MatF];
-   ftneF  = &DF[DIR_PPP *size_MatF];
-   ftswF  = &DF[DIR_MMP *size_MatF];
-   ftseF  = &DF[DIR_PMP *size_MatF];
-   ftnwF  = &DF[DIR_MPP *size_MatF];
-   fbneF  = &DF[DIR_PPM *size_MatF];
-   fbswF  = &DF[DIR_MMM *size_MatF];
-   fbseF  = &DF[DIR_PMM *size_MatF];
-   fbnwF  = &DF[DIR_MPM *size_MatF];
+   feF    = &DF[DIR_P00 * numberOfLBnodesFine];
+   fwF    = &DF[DIR_M00 * numberOfLBnodesFine];
+   fnF    = &DF[DIR_0P0 * numberOfLBnodesFine];
+   fsF    = &DF[DIR_0M0 * numberOfLBnodesFine];
+   ftF    = &DF[DIR_00P * numberOfLBnodesFine];
+   fbF    = &DF[DIR_00M * numberOfLBnodesFine];
+   fneF   = &DF[DIR_PP0 * numberOfLBnodesFine];
+   fswF   = &DF[DIR_MM0 * numberOfLBnodesFine];
+   fseF   = &DF[DIR_PM0 * numberOfLBnodesFine];
+   fnwF   = &DF[DIR_MP0 * numberOfLBnodesFine];
+   fteF   = &DF[DIR_P0P * numberOfLBnodesFine];
+   fbwF   = &DF[DIR_M0M * numberOfLBnodesFine];
+   fbeF   = &DF[DIR_P0M * numberOfLBnodesFine];
+   ftwF   = &DF[DIR_M0P * numberOfLBnodesFine];
+   ftnF   = &DF[DIR_0PP * numberOfLBnodesFine];
+   fbsF   = &DF[DIR_0MM * numberOfLBnodesFine];
+   fbnF   = &DF[DIR_0PM * numberOfLBnodesFine];
+   ftsF   = &DF[DIR_0MP * numberOfLBnodesFine];
+   fzeroF = &DF[DIR_000 * numberOfLBnodesFine];
+   ftneF  = &DF[DIR_PPP * numberOfLBnodesFine];
+   ftswF  = &DF[DIR_MMP * numberOfLBnodesFine];
+   ftseF  = &DF[DIR_PMP * numberOfLBnodesFine];
+   ftnwF  = &DF[DIR_MPP * numberOfLBnodesFine];
+   fbneF  = &DF[DIR_PPM * numberOfLBnodesFine];
+   fbswF  = &DF[DIR_MMM * numberOfLBnodesFine];
+   fbseF  = &DF[DIR_PMM * numberOfLBnodesFine];
+   fbnwF  = &DF[DIR_MPM * numberOfLBnodesFine];
 
    real *feC, *fwC, *fnC, *fsC, *ftC, *fbC, *fneC, *fswC, *fseC, *fnwC, *fteC, *fbwC, *fbeC, *ftwC, *ftnC, *fbsC, *fbnC, *ftsC, *fzeroC,
       *ftneC, *ftswC, *ftseC, *ftnwC, *fbneC, *fbswC, *fbseC, *fbnwC;
 
    if (isEvenTimestep==true)
    {
-      feC    = &DC[DIR_P00   *size_MatC];
-      fwC    = &DC[DIR_M00   *size_MatC];
-      fnC    = &DC[DIR_0P0   *size_MatC];
-      fsC    = &DC[DIR_0M0   *size_MatC];
-      ftC    = &DC[DIR_00P   *size_MatC];
-      fbC    = &DC[DIR_00M   *size_MatC];
-      fneC   = &DC[DIR_PP0  *size_MatC];
-      fswC   = &DC[DIR_MM0  *size_MatC];
-      fseC   = &DC[DIR_PM0  *size_MatC];
-      fnwC   = &DC[DIR_MP0  *size_MatC];
-      fteC   = &DC[DIR_P0P  *size_MatC];
-      fbwC   = &DC[DIR_M0M  *size_MatC];
-      fbeC   = &DC[DIR_P0M  *size_MatC];
-      ftwC   = &DC[DIR_M0P  *size_MatC];
-      ftnC   = &DC[DIR_0PP  *size_MatC];
-      fbsC   = &DC[DIR_0MM  *size_MatC];
-      fbnC   = &DC[DIR_0PM  *size_MatC];
-      ftsC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      ftneC  = &DC[DIR_PPP *size_MatC];
-      ftswC  = &DC[DIR_MMP *size_MatC];
-      ftseC  = &DC[DIR_PMP *size_MatC];
-      ftnwC  = &DC[DIR_MPP *size_MatC];
-      fbneC  = &DC[DIR_PPM *size_MatC];
-      fbswC  = &DC[DIR_MMM *size_MatC];
-      fbseC  = &DC[DIR_PMM *size_MatC];
-      fbnwC  = &DC[DIR_MPM *size_MatC];
+      feC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      fwC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    } 
    else
    {
-      fwC    = &DC[DIR_P00   *size_MatC];
-      feC    = &DC[DIR_M00   *size_MatC];
-      fsC    = &DC[DIR_0P0   *size_MatC];
-      fnC    = &DC[DIR_0M0   *size_MatC];
-      fbC    = &DC[DIR_00P   *size_MatC];
-      ftC    = &DC[DIR_00M   *size_MatC];
-      fswC   = &DC[DIR_PP0  *size_MatC];
-      fneC   = &DC[DIR_MM0  *size_MatC];
-      fnwC   = &DC[DIR_PM0  *size_MatC];
-      fseC   = &DC[DIR_MP0  *size_MatC];
-      fbwC   = &DC[DIR_P0P  *size_MatC];
-      fteC   = &DC[DIR_M0M  *size_MatC];
-      ftwC   = &DC[DIR_P0M  *size_MatC];
-      fbeC   = &DC[DIR_M0P  *size_MatC];
-      fbsC   = &DC[DIR_0PP  *size_MatC];
-      ftnC   = &DC[DIR_0MM  *size_MatC];
-      ftsC   = &DC[DIR_0PM  *size_MatC];
-      fbnC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      fbswC  = &DC[DIR_PPP *size_MatC];
-      fbneC  = &DC[DIR_MMP *size_MatC];
-      fbnwC  = &DC[DIR_PMP *size_MatC];
-      fbseC  = &DC[DIR_MPP *size_MatC];
-      ftswC  = &DC[DIR_PPM *size_MatC];
-      ftneC  = &DC[DIR_MMM *size_MatC];
-      ftnwC  = &DC[DIR_PMM *size_MatC];
-      ftseC  = &DC[DIR_MPM *size_MatC];
+      fwC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      feC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    }
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  ix = threadIdx.x;  // Globaler x-Index 
@@ -11755,827 +11772,6 @@ __global__ void scaleFC_staggered_time_comp_27(   real* DC,
       kxxMyyFromfcNEQ_NEB = -c3o2*omegaS *((f_BW+f_W+f_TW-f_BS-f_S-f_TS-f_BN-f_N-f_TN+f_BE+f_E+f_TE             ) / (c1o1 + drho_NEB) - ((vx1_NEB*vx1_NEB-vx2_NEB*vx2_NEB)));
       kxxMzzFromfcNEQ_NEB = -c3o2*omegaS *((f_SW+f_W+f_NW-f_BS-f_TS-f_B-f_T-f_BN-f_TN+f_SE+f_E+f_NE             ) / (c1o1 + drho_NEB) - ((vx1_NEB*vx1_NEB-vx3_NEB*vx3_NEB)));
 
-   //   //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	  ////pointertausch
-	  // if (isEvenTimestep==false)
-	  // {
-		 // feC    = &DC[DIR_P00   *size_MatC];
-		 // fwC    = &DC[DIR_M00   *size_MatC];
-		 // fnC    = &DC[DIR_0P0   *size_MatC];
-		 // fsC    = &DC[DIR_0M0   *size_MatC];
-		 // ftC    = &DC[DIR_00P   *size_MatC];
-		 // fbC    = &DC[DIR_00M   *size_MatC];
-		 // fneC   = &DC[DIR_PP0  *size_MatC];
-		 // fswC   = &DC[DIR_MM0  *size_MatC];
-		 // fseC   = &DC[DIR_PM0  *size_MatC];
-		 // fnwC   = &DC[DIR_MP0  *size_MatC];
-		 // fteC   = &DC[DIR_P0P  *size_MatC];
-		 // fbwC   = &DC[DIR_M0M  *size_MatC];
-		 // fbeC   = &DC[DIR_P0M  *size_MatC];
-		 // ftwC   = &DC[DIR_M0P  *size_MatC];
-		 // ftnC   = &DC[DIR_0PP  *size_MatC];
-		 // fbsC   = &DC[DIR_0MM  *size_MatC];
-		 // fbnC   = &DC[DIR_0PM  *size_MatC];
-		 // ftsC   = &DC[DIR_0MP  *size_MatC];
-		 // fzeroC = &DC[DIR_000*size_MatC];
-		 // ftneC  = &DC[DIR_PPP *size_MatC];
-		 // ftswC  = &DC[DIR_MMP *size_MatC];
-		 // ftseC  = &DC[DIR_PMP *size_MatC];
-		 // ftnwC  = &DC[DIR_MPP *size_MatC];
-		 // fbneC  = &DC[DIR_PPM *size_MatC];
-		 // fbswC  = &DC[DIR_MMM *size_MatC];
-		 // fbseC  = &DC[DIR_PMM *size_MatC];
-		 // fbnwC  = &DC[DIR_MPM *size_MatC];
-	  // } 
-	  // else
-	  // {
-		 // fwC    = &DC[DIR_P00   *size_MatC];
-		 // feC    = &DC[DIR_M00   *size_MatC];
-		 // fsC    = &DC[DIR_0P0   *size_MatC];
-		 // fnC    = &DC[DIR_0M0   *size_MatC];
-		 // fbC    = &DC[DIR_00P   *size_MatC];
-		 // ftC    = &DC[DIR_00M   *size_MatC];
-		 // fswC   = &DC[DIR_PP0  *size_MatC];
-		 // fneC   = &DC[DIR_MM0  *size_MatC];
-		 // fnwC   = &DC[DIR_PM0  *size_MatC];
-		 // fseC   = &DC[DIR_MP0  *size_MatC];
-		 // fbwC   = &DC[DIR_P0P  *size_MatC];
-		 // fteC   = &DC[DIR_M0M  *size_MatC];
-		 // ftwC   = &DC[DIR_P0M  *size_MatC];
-		 // fbeC   = &DC[DIR_M0P  *size_MatC];
-		 // fbsC   = &DC[DIR_0PP  *size_MatC];
-		 // ftnC   = &DC[DIR_0MM  *size_MatC];
-		 // ftsC   = &DC[DIR_0PM  *size_MatC];
-		 // fbnC   = &DC[DIR_0MP  *size_MatC];
-		 // fzeroC = &DC[DIR_000*size_MatC];
-		 // fbswC  = &DC[DIR_PPP *size_MatC];
-		 // fbneC  = &DC[DIR_MMP *size_MatC];
-		 // fbnwC  = &DC[DIR_PMP *size_MatC];
-		 // fbseC  = &DC[DIR_MPP *size_MatC];
-		 // ftswC  = &DC[DIR_PPM *size_MatC];
-		 // ftneC  = &DC[DIR_MMM *size_MatC];
-		 // ftnwC  = &DC[DIR_PMM *size_MatC];
-		 // ftseC  = &DC[DIR_MPM *size_MatC];
-	  // }
-
- 	 // real rho_tmp;
-	  //real vx1_tmp;
-	  //real vx2_tmp;
-	  //real vx3_tmp;
-
-   //  //////////////////////////////////////////////////////////////////////////
-   //   xoff = offFC.xOffFC[k];
-   //   yoff = offFC.yOffFC[k];
-   //   zoff = offFC.zOffFC[k];      
-   //   xoff_sq = xoff * xoff;
-   //   yoff_sq = yoff * yoff;
-   //   zoff_sq = zoff * zoff;
-   //   //////////////////////////////////////////////////////////////////////////
-   //   //SWB//
-   //   //////////////////////////////////////////////////////////////////////////
-   //   //index 0
-   //   k0zero= posFSWB[k];
-   //   k0w   = neighborFX[k0zero];
-   //   k0s   = neighborFY[k0zero];
-   //   k0b   = neighborFZ[k0zero];
-   //   k0sw  = neighborFY[k0w];
-   //   k0bw  = neighborFZ[k0w];
-   //   k0bs  = neighborFZ[k0s];
-   //   k0bsw = neighborFZ[k0sw];
-   //   //////////////////////////////////////////////////////////////////////////
-   //   //index 
-   //   kzero= k0zero;
-   //   kw   = k0w;   
-   //   ks   = k0s;   
-   //   kb   = k0b;   
-   //   ksw  = k0sw;  
-   //   kbw  = k0bw;  
-   //   kbs  = k0bs;  
-   //   kbsw = k0bsw; 
-   //   ////////////////////////////////////////////////////////////////////////////////
-   //   f_E    = fwF[kw];
-   //   f_W    = feF[kzero];
-   //   f_N    = fsF[ks];
-   //   f_S    = fnF[kzero];
-   //   f_T    = fbF[kb];
-   //   f_B    = ftF[kzero];
-   //   f_NE   = fswF[ksw];
-   //   f_SW   = fneF[kzero];
-   //   f_SE   = fnwF[kw];
-   //   f_NW   = fseF[ks];
-   //   f_TE   = fbwF[kbw];
-   //   f_BW   = fteF[kzero];
-   //   f_BE   = ftwF[kw];
-   //   f_TW   = fbeF[kb];
-   //   f_TN   = fbsF[kbs];
-   //   f_BS   = ftnF[kzero];
-   //   f_BN   = ftsF[ks];
-   //   f_TS   = fbnF[kb];
-   //   f_ZERO = fzeroF[kzero];
-   //   f_TNE  = fbswF[kbsw];
-   //   f_TSW  = fbneF[kb];
-   //   f_TSE  = fbnwF[kbw];
-   //   f_TNW  = fbseF[kbs];
-   //   f_BNE  = ftswF[ksw];
-   //   f_BSW  = ftneF[kzero];
-   //   f_BSE  = ftnwF[kw];
-   //   f_BNW  = ftseF[ks];
-
-   //   //drho_SWB = f_E+f_W+f_N+f_S+f_T+f_B+f_NE+f_SW+f_SE+f_NW+f_TE+f_BW+f_BE+f_TW+f_TN+f_BS+f_BN+f_TS+f_ZERO+f_TNE+f_TSW+f_TSE+f_TNW+f_BNE+f_BSW+f_BSE+f_BNW;
-   //   //vx1_SWB  = (((f_TNE-f_BSW)+(f_TSE-f_BNW)+(f_BNE-f_TSW)+(f_BSE-f_TNW)) + (((f_NE-f_SW)+(f_TE-f_BW))+((f_SE-f_NW)+(f_BE-f_TW))) + (f_E-f_W))/(one + drho_SWB);
-	  ////vx2_SWB  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_BNE-f_TSW)+(f_BNW-f_TSE)) + (((f_NE-f_SW)+(f_TN-f_BS))+((f_BN-f_TS)+(f_NW-f_SE))) + (f_N-f_S))/(one + drho_SWB);
-	  ////vx3_SWB  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_TSE-f_BNW)+(f_TSW-f_BNE)) + (((f_TE-f_BW)+(f_TN-f_BS))+((f_TW-f_BE)+(f_TS-f_BN))) + (f_T-f_B))/(one + drho_SWB);
-
-   //   //kxyFromfcNEQ_SWB    = -three*omegaS/(one-omegaS)*((f_SW+f_BSW+f_TSW-f_NW-f_BNW-f_TNW-f_SE-f_BSE-f_TSE+f_NE+f_BNE+f_TNE ) / (one + drho_SWB) - ((vx1_SWB*vx2_SWB)));
-   //   //kyzFromfcNEQ_SWB    = -three*omegaS/(one-omegaS)*((f_BS+f_BSE+f_BSW-f_TS-f_TSE-f_TSW-f_BN-f_BNE-f_BNW+f_TN+f_TNE+f_TNW ) / (one + drho_SWB) - ((vx2_SWB*vx3_SWB)));
-   //   //kxzFromfcNEQ_SWB    = -three*omegaS/(one-omegaS)*((f_BW+f_BSW+f_BNW-f_TW-f_TSW-f_TNW-f_BE-f_BSE-f_BNE+f_TE+f_TSE+f_TNE ) / (one + drho_SWB) - ((vx1_SWB*vx3_SWB)));
-   //   //kxxMyyFromfcNEQ_SWB = -c3o2*omegaS/(one-omegaS) *((f_BW+f_W+f_TW-f_BS-f_S-f_TS-f_BN-f_N-f_TN+f_BE+f_E+f_TE             ) / (one + drho_SWB) - ((vx1_SWB*vx1_SWB-vx2_SWB*vx2_SWB)));
-   //   //kxxMzzFromfcNEQ_SWB = -c3o2*omegaS/(one-omegaS) *((f_SW+f_W+f_NW-f_BS-f_TS-f_B-f_T-f_BN-f_TN+f_SE+f_E+f_NE             ) / (one + drho_SWB) - ((vx1_SWB*vx1_SWB-vx3_SWB*vx3_SWB)));
-
-	  //rho_tmp = f_E+f_W+f_N+f_S+f_T+f_B+f_NE+f_SW+f_SE+f_NW+f_TE+f_BW+f_BE+f_TW+f_TN+f_BS+f_BN+f_TS+f_ZERO+f_TNE+f_TSW+f_TSE+f_TNW+f_BNE+f_BSW+f_BSE+f_BNW;
-	  //
-	  //drho_SWB += rho_tmp;
-
-	  //vx1_tmp  = (((f_TNE-f_BSW)+(f_TSE-f_BNW)+(f_BNE-f_TSW)+(f_BSE-f_TNW)) + (((f_NE-f_SW)+(f_TE-f_BW))+((f_SE-f_NW)+(f_BE-f_TW))) + (f_E-f_W))/(one + rho_tmp);
-	  //vx2_tmp  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_BNE-f_TSW)+(f_BNW-f_TSE)) + (((f_NE-f_SW)+(f_TN-f_BS))+((f_BN-f_TS)+(f_NW-f_SE))) + (f_N-f_S))/(one + rho_tmp);
-	  //vx3_tmp  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_TSE-f_BNW)+(f_TSW-f_BNE)) + (((f_TE-f_BW)+(f_TN-f_BS))+((f_TW-f_BE)+(f_TS-f_BN))) + (f_T-f_B))/(one + rho_tmp);
-
-   //   vx1_SWB  += vx1_tmp;
-	  //vx2_SWB  += vx2_tmp;
-	  //vx3_SWB  += vx3_tmp;
-
-   //   drho_SWB *= c1o2;
-   //   vx1_SWB  *= c1o2;
-	  //vx2_SWB  *= c1o2;
-	  //vx3_SWB  *= c1o2;
-
-   //   kxyFromfcNEQ_SWB    += -three*omegaS/(one-omegaS)*((f_SW+f_BSW+f_TSW-f_NW-f_BNW-f_TNW-f_SE-f_BSE-f_TSE+f_NE+f_BNE+f_TNE ) / (one + rho_tmp) - ((vx1_tmp*vx2_tmp)));
-   //   kyzFromfcNEQ_SWB    += -three*omegaS/(one-omegaS)*((f_BS+f_BSE+f_BSW-f_TS-f_TSE-f_TSW-f_BN-f_BNE-f_BNW+f_TN+f_TNE+f_TNW ) / (one + rho_tmp) - ((vx2_tmp*vx3_tmp)));
-   //   kxzFromfcNEQ_SWB    += -three*omegaS/(one-omegaS)*((f_BW+f_BSW+f_BNW-f_TW-f_TSW-f_TNW-f_BE-f_BSE-f_BNE+f_TE+f_TSE+f_TNE ) / (one + rho_tmp) - ((vx1_tmp*vx3_tmp)));
-   //   kxxMyyFromfcNEQ_SWB += -c3o2*omegaS/(one-omegaS) *((f_BW+f_W+f_TW-f_BS-f_S-f_TS-f_BN-f_N-f_TN+f_BE+f_E+f_TE             ) / (one + rho_tmp) - ((vx1_tmp*vx1_tmp-vx2_tmp*vx2_tmp)));
-   //   kxxMzzFromfcNEQ_SWB += -c3o2*omegaS/(one-omegaS) *((f_SW+f_W+f_NW-f_BS-f_TS-f_B-f_T-f_BN-f_TN+f_SE+f_E+f_NE             ) / (one + rho_tmp) - ((vx1_tmp*vx1_tmp-vx3_tmp*vx3_tmp)));
-
-	  //kxyFromfcNEQ_SWB    *= c1o2;
-	  //kyzFromfcNEQ_SWB    *= c1o2;
-	  //kxzFromfcNEQ_SWB    *= c1o2;
-	  //kxxMyyFromfcNEQ_SWB *= c1o2;
-	  //kxxMzzFromfcNEQ_SWB *= c1o2;
-
-   //   //////////////////////////////////////////////////////////////////////////
-   //   //SWT//
-   //   //////////////////////////////////////////////////////////////////////////
-   //   //index 
-   //   kzero= kb;
-   //   kw   = kbw;   
-   //   ks   = kbs;   
-   //   kb   = neighborFZ[kb];   
-   //   ksw  = kbsw;  
-   //   kbw  = neighborFZ[kbw];  
-   //   kbs  = neighborFZ[kbs];  
-   //   kbsw = neighborFZ[kbsw]; 
-   //   ////////////////////////////////////////////////////////////////////////////////
-   //   f_E    = fwF[kw];
-   //   f_W    = feF[kzero];
-   //   f_N    = fsF[ks];
-   //   f_S    = fnF[kzero];
-   //   f_T    = fbF[kb];
-   //   f_B    = ftF[kzero];
-   //   f_NE   = fswF[ksw];
-   //   f_SW   = fneF[kzero];
-   //   f_SE   = fnwF[kw];
-   //   f_NW   = fseF[ks];
-   //   f_TE   = fbwF[kbw];
-   //   f_BW   = fteF[kzero];
-   //   f_BE   = ftwF[kw];
-   //   f_TW   = fbeF[kb];
-   //   f_TN   = fbsF[kbs];
-   //   f_BS   = ftnF[kzero];
-   //   f_BN   = ftsF[ks];
-   //   f_TS   = fbnF[kb];
-   //   f_ZERO = fzeroF[kzero];
-   //   f_TNE  = fbswF[kbsw];
-   //   f_TSW  = fbneF[kb];
-   //   f_TSE  = fbnwF[kbw];
-   //   f_TNW  = fbseF[kbs];
-   //   f_BNE  = ftswF[ksw];
-   //   f_BSW  = ftneF[kzero];
-   //   f_BSE  = ftnwF[kw];
-   //   f_BNW  = ftseF[ks];
-
-   //   //drho_SWT = f_E+f_W+f_N+f_S+f_T+f_B+f_NE+f_SW+f_SE+f_NW+f_TE+f_BW+f_BE+f_TW+f_TN+f_BS+f_BN+f_TS+f_ZERO+f_TNE+f_TSW+f_TSE+f_TNW+f_BNE+f_BSW+f_BSE+f_BNW;
-   //   //vx1_SWT  = (((f_TNE-f_BSW)+(f_TSE-f_BNW)+(f_BNE-f_TSW)+(f_BSE-f_TNW)) + (((f_NE-f_SW)+(f_TE-f_BW))+((f_SE-f_NW)+(f_BE-f_TW))) + (f_E-f_W))/(one + drho_SWT);
-	  ////vx2_SWT  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_BNE-f_TSW)+(f_BNW-f_TSE)) + (((f_NE-f_SW)+(f_TN-f_BS))+((f_BN-f_TS)+(f_NW-f_SE))) + (f_N-f_S))/(one + drho_SWT);
-	  ////vx3_SWT  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_TSE-f_BNW)+(f_TSW-f_BNE)) + (((f_TE-f_BW)+(f_TN-f_BS))+((f_TW-f_BE)+(f_TS-f_BN))) + (f_T-f_B))/(one + drho_SWT);
-
-   //   //kxyFromfcNEQ_SWT    = -three*omegaS/(one-omegaS)*((f_SW+f_BSW+f_TSW-f_NW-f_BNW-f_TNW-f_SE-f_BSE-f_TSE+f_NE+f_BNE+f_TNE ) / (one + drho_SWT) - ((vx1_SWT*vx2_SWT)));
-   //   //kyzFromfcNEQ_SWT    = -three*omegaS/(one-omegaS)*((f_BS+f_BSE+f_BSW-f_TS-f_TSE-f_TSW-f_BN-f_BNE-f_BNW+f_TN+f_TNE+f_TNW ) / (one + drho_SWT) - ((vx2_SWT*vx3_SWT)));
-   //   //kxzFromfcNEQ_SWT    = -three*omegaS/(one-omegaS)*((f_BW+f_BSW+f_BNW-f_TW-f_TSW-f_TNW-f_BE-f_BSE-f_BNE+f_TE+f_TSE+f_TNE ) / (one + drho_SWT) - ((vx1_SWT*vx3_SWT)));
-   //   //kxxMyyFromfcNEQ_SWT = -c3o2*omegaS/(one-omegaS) *((f_BW+f_W+f_TW-f_BS-f_S-f_TS-f_BN-f_N-f_TN+f_BE+f_E+f_TE             ) / (one + drho_SWT) - ((vx1_SWT*vx1_SWT-vx2_SWT*vx2_SWT)));
-   //   //kxxMzzFromfcNEQ_SWT = -c3o2*omegaS/(one-omegaS) *((f_SW+f_W+f_NW-f_BS-f_TS-f_B-f_T-f_BN-f_TN+f_SE+f_E+f_NE             ) / (one + drho_SWT) - ((vx1_SWT*vx1_SWT-vx3_SWT*vx3_SWT)));
-
-	  //rho_tmp = f_E+f_W+f_N+f_S+f_T+f_B+f_NE+f_SW+f_SE+f_NW+f_TE+f_BW+f_BE+f_TW+f_TN+f_BS+f_BN+f_TS+f_ZERO+f_TNE+f_TSW+f_TSE+f_TNW+f_BNE+f_BSW+f_BSE+f_BNW;
-	  //
-	  //drho_SWT += rho_tmp;
-
-	  //vx1_tmp  = (((f_TNE-f_BSW)+(f_TSE-f_BNW)+(f_BNE-f_TSW)+(f_BSE-f_TNW)) + (((f_NE-f_SW)+(f_TE-f_BW))+((f_SE-f_NW)+(f_BE-f_TW))) + (f_E-f_W))/(one + rho_tmp);
-	  //vx2_tmp  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_BNE-f_TSW)+(f_BNW-f_TSE)) + (((f_NE-f_SW)+(f_TN-f_BS))+((f_BN-f_TS)+(f_NW-f_SE))) + (f_N-f_S))/(one + rho_tmp);
-	  //vx3_tmp  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_TSE-f_BNW)+(f_TSW-f_BNE)) + (((f_TE-f_BW)+(f_TN-f_BS))+((f_TW-f_BE)+(f_TS-f_BN))) + (f_T-f_B))/(one + rho_tmp);
-
-   //   vx1_SWT  += vx1_tmp;
-	  //vx2_SWT  += vx2_tmp;
-	  //vx3_SWT  += vx3_tmp;
-
-   //   drho_SWT *= c1o2;
-   //   vx1_SWT  *= c1o2;
-	  //vx2_SWT  *= c1o2;
-	  //vx3_SWT  *= c1o2;
-
-   //   kxyFromfcNEQ_SWT    += -three*omegaS/(one-omegaS)*((f_SW+f_BSW+f_TSW-f_NW-f_BNW-f_TNW-f_SE-f_BSE-f_TSE+f_NE+f_BNE+f_TNE ) / (one + rho_tmp) - ((vx1_tmp*vx2_tmp)));
-   //   kyzFromfcNEQ_SWT    += -three*omegaS/(one-omegaS)*((f_BS+f_BSE+f_BSW-f_TS-f_TSE-f_TSW-f_BN-f_BNE-f_BNW+f_TN+f_TNE+f_TNW ) / (one + rho_tmp) - ((vx2_tmp*vx3_tmp)));
-   //   kxzFromfcNEQ_SWT    += -three*omegaS/(one-omegaS)*((f_BW+f_BSW+f_BNW-f_TW-f_TSW-f_TNW-f_BE-f_BSE-f_BNE+f_TE+f_TSE+f_TNE ) / (one + rho_tmp) - ((vx1_tmp*vx3_tmp)));
-   //   kxxMyyFromfcNEQ_SWT += -c3o2*omegaS/(one-omegaS) *((f_BW+f_W+f_TW-f_BS-f_S-f_TS-f_BN-f_N-f_TN+f_BE+f_E+f_TE             ) / (one + rho_tmp) - ((vx1_tmp*vx1_tmp-vx2_tmp*vx2_tmp)));
-   //   kxxMzzFromfcNEQ_SWT += -c3o2*omegaS/(one-omegaS) *((f_SW+f_W+f_NW-f_BS-f_TS-f_B-f_T-f_BN-f_TN+f_SE+f_E+f_NE             ) / (one + rho_tmp) - ((vx1_tmp*vx1_tmp-vx3_tmp*vx3_tmp)));
-
-	  //kxyFromfcNEQ_SWT    *= c1o2;
-	  //kyzFromfcNEQ_SWT    *= c1o2;
-	  //kxzFromfcNEQ_SWT    *= c1o2;
-	  //kxxMyyFromfcNEQ_SWT *= c1o2;
-	  //kxxMzzFromfcNEQ_SWT *= c1o2;
-
-   //   //////////////////////////////////////////////////////////////////////////
-   //   //SET//
-   //   //////////////////////////////////////////////////////////////////////////
-   //   //index 
-   //   kzero= kw;
-   //   kw   = neighborFX[kw];   
-   //   ks   = ksw;   
-   //   kb   = kbw;   
-   //   ksw  = neighborFX[ksw];  
-   //   kbw  = neighborFX[kbw];  
-   //   kbs  = kbsw;  
-   //   kbsw = neighborFX[kbsw]; 
-   //   ////////////////////////////////////////////////////////////////////////////////
-   //   f_E    = fwF[kw];
-   //   f_W    = feF[kzero];
-   //   f_N    = fsF[ks];
-   //   f_S    = fnF[kzero];
-   //   f_T    = fbF[kb];
-   //   f_B    = ftF[kzero];
-   //   f_NE   = fswF[ksw];
-   //   f_SW   = fneF[kzero];
-   //   f_SE   = fnwF[kw];
-   //   f_NW   = fseF[ks];
-   //   f_TE   = fbwF[kbw];
-   //   f_BW   = fteF[kzero];
-   //   f_BE   = ftwF[kw];
-   //   f_TW   = fbeF[kb];
-   //   f_TN   = fbsF[kbs];
-   //   f_BS   = ftnF[kzero];
-   //   f_BN   = ftsF[ks];
-   //   f_TS   = fbnF[kb];
-   //   f_ZERO = fzeroF[kzero];
-   //   f_TNE  = fbswF[kbsw];
-   //   f_TSW  = fbneF[kb];
-   //   f_TSE  = fbnwF[kbw];
-   //   f_TNW  = fbseF[kbs];
-   //   f_BNE  = ftswF[ksw];
-   //   f_BSW  = ftneF[kzero];
-   //   f_BSE  = ftnwF[kw];
-   //   f_BNW  = ftseF[ks];
-
-   //   //drho_SET = f_E+f_W+f_N+f_S+f_T+f_B+f_NE+f_SW+f_SE+f_NW+f_TE+f_BW+f_BE+f_TW+f_TN+f_BS+f_BN+f_TS+f_ZERO+f_TNE+f_TSW+f_TSE+f_TNW+f_BNE+f_BSW+f_BSE+f_BNW;
-   //   //vx1_SET  = (((f_TNE-f_BSW)+(f_TSE-f_BNW)+(f_BNE-f_TSW)+(f_BSE-f_TNW)) + (((f_NE-f_SW)+(f_TE-f_BW))+((f_SE-f_NW)+(f_BE-f_TW))) + (f_E-f_W))/(one + drho_SET);
-	  ////vx2_SET  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_BNE-f_TSW)+(f_BNW-f_TSE)) + (((f_NE-f_SW)+(f_TN-f_BS))+((f_BN-f_TS)+(f_NW-f_SE))) + (f_N-f_S))/(one + drho_SET);
-	  ////vx3_SET  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_TSE-f_BNW)+(f_TSW-f_BNE)) + (((f_TE-f_BW)+(f_TN-f_BS))+((f_TW-f_BE)+(f_TS-f_BN))) + (f_T-f_B))/(one + drho_SET);
-
-   //   //kxyFromfcNEQ_SET    = -three*omegaS/(one-omegaS)*((f_SW+f_BSW+f_TSW-f_NW-f_BNW-f_TNW-f_SE-f_BSE-f_TSE+f_NE+f_BNE+f_TNE ) / (one + drho_SET) - ((vx1_SET*vx2_SET)));
-   //   //kyzFromfcNEQ_SET    = -three*omegaS/(one-omegaS)*((f_BS+f_BSE+f_BSW-f_TS-f_TSE-f_TSW-f_BN-f_BNE-f_BNW+f_TN+f_TNE+f_TNW ) / (one + drho_SET) - ((vx2_SET*vx3_SET)));
-   //   //kxzFromfcNEQ_SET    = -three*omegaS/(one-omegaS)*((f_BW+f_BSW+f_BNW-f_TW-f_TSW-f_TNW-f_BE-f_BSE-f_BNE+f_TE+f_TSE+f_TNE ) / (one + drho_SET) - ((vx1_SET*vx3_SET)));
-   //   //kxxMyyFromfcNEQ_SET = -c3o2*omegaS/(one-omegaS) *((f_BW+f_W+f_TW-f_BS-f_S-f_TS-f_BN-f_N-f_TN+f_BE+f_E+f_TE             ) / (one + drho_SET) - ((vx1_SET*vx1_SET-vx2_SET*vx2_SET)));
-   //   //kxxMzzFromfcNEQ_SET = -c3o2*omegaS/(one-omegaS) *((f_SW+f_W+f_NW-f_BS-f_TS-f_B-f_T-f_BN-f_TN+f_SE+f_E+f_NE             ) / (one + drho_SET) - ((vx1_SET*vx1_SET-vx3_SET*vx3_SET)));
-
-	  //rho_tmp = f_E+f_W+f_N+f_S+f_T+f_B+f_NE+f_SW+f_SE+f_NW+f_TE+f_BW+f_BE+f_TW+f_TN+f_BS+f_BN+f_TS+f_ZERO+f_TNE+f_TSW+f_TSE+f_TNW+f_BNE+f_BSW+f_BSE+f_BNW;
-	  //
-	  //drho_SET += rho_tmp;
-
-	  //vx1_tmp  = (((f_TNE-f_BSW)+(f_TSE-f_BNW)+(f_BNE-f_TSW)+(f_BSE-f_TNW)) + (((f_NE-f_SW)+(f_TE-f_BW))+((f_SE-f_NW)+(f_BE-f_TW))) + (f_E-f_W))/(one + rho_tmp);
-	  //vx2_tmp  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_BNE-f_TSW)+(f_BNW-f_TSE)) + (((f_NE-f_SW)+(f_TN-f_BS))+((f_BN-f_TS)+(f_NW-f_SE))) + (f_N-f_S))/(one + rho_tmp);
-	  //vx3_tmp  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_TSE-f_BNW)+(f_TSW-f_BNE)) + (((f_TE-f_BW)+(f_TN-f_BS))+((f_TW-f_BE)+(f_TS-f_BN))) + (f_T-f_B))/(one + rho_tmp);
-
-   //   vx1_SET  += vx1_tmp;
-	  //vx2_SET  += vx2_tmp;
-	  //vx3_SET  += vx3_tmp;
-
-   //   drho_SET *= c1o2;
-   //   vx1_SET  *= c1o2;
-	  //vx2_SET  *= c1o2;
-	  //vx3_SET  *= c1o2;
-
-   //   kxyFromfcNEQ_SET    += -three*omegaS/(one-omegaS)*((f_SW+f_BSW+f_TSW-f_NW-f_BNW-f_TNW-f_SE-f_BSE-f_TSE+f_NE+f_BNE+f_TNE ) / (one + rho_tmp) - ((vx1_tmp*vx2_tmp)));
-   //   kyzFromfcNEQ_SET    += -three*omegaS/(one-omegaS)*((f_BS+f_BSE+f_BSW-f_TS-f_TSE-f_TSW-f_BN-f_BNE-f_BNW+f_TN+f_TNE+f_TNW ) / (one + rho_tmp) - ((vx2_tmp*vx3_tmp)));
-   //   kxzFromfcNEQ_SET    += -three*omegaS/(one-omegaS)*((f_BW+f_BSW+f_BNW-f_TW-f_TSW-f_TNW-f_BE-f_BSE-f_BNE+f_TE+f_TSE+f_TNE ) / (one + rho_tmp) - ((vx1_tmp*vx3_tmp)));
-   //   kxxMyyFromfcNEQ_SET += -c3o2*omegaS/(one-omegaS) *((f_BW+f_W+f_TW-f_BS-f_S-f_TS-f_BN-f_N-f_TN+f_BE+f_E+f_TE             ) / (one + rho_tmp) - ((vx1_tmp*vx1_tmp-vx2_tmp*vx2_tmp)));
-   //   kxxMzzFromfcNEQ_SET += -c3o2*omegaS/(one-omegaS) *((f_SW+f_W+f_NW-f_BS-f_TS-f_B-f_T-f_BN-f_TN+f_SE+f_E+f_NE             ) / (one + rho_tmp) - ((vx1_tmp*vx1_tmp-vx3_tmp*vx3_tmp)));
-
-	  //kxyFromfcNEQ_SET    *= c1o2;
-	  //kyzFromfcNEQ_SET    *= c1o2;
-	  //kxzFromfcNEQ_SET    *= c1o2;
-	  //kxxMyyFromfcNEQ_SET *= c1o2;
-	  //kxxMzzFromfcNEQ_SET *= c1o2;
-
-   //   //////////////////////////////////////////////////////////////////////////
-   //   //SEB//
-   //   //////////////////////////////////////////////////////////////////////////
-   //   //index 
-   //   kb   = kzero;   
-   //   kbw  = kw;  
-   //   kbs  = ks;  
-   //   kbsw = ksw; 
-   //   kzero= k0w;
-   //   kw   = neighborFX[k0w];   
-   //   ks   = k0sw;   
-   //   ksw  = neighborFX[k0sw];  
-   //   ////////////////////////////////////////////////////////////////////////////////
-   //   f_E    = fwF[kw];
-   //   f_W    = feF[kzero];
-   //   f_N    = fsF[ks];
-   //   f_S    = fnF[kzero];
-   //   f_T    = fbF[kb];
-   //   f_B    = ftF[kzero];
-   //   f_NE   = fswF[ksw];
-   //   f_SW   = fneF[kzero];
-   //   f_SE   = fnwF[kw];
-   //   f_NW   = fseF[ks];
-   //   f_TE   = fbwF[kbw];
-   //   f_BW   = fteF[kzero];
-   //   f_BE   = ftwF[kw];
-   //   f_TW   = fbeF[kb];
-   //   f_TN   = fbsF[kbs];
-   //   f_BS   = ftnF[kzero];
-   //   f_BN   = ftsF[ks];
-   //   f_TS   = fbnF[kb];
-   //   f_ZERO = fzeroF[kzero];
-   //   f_TNE  = fbswF[kbsw];
-   //   f_TSW  = fbneF[kb];
-   //   f_TSE  = fbnwF[kbw];
-   //   f_TNW  = fbseF[kbs];
-   //   f_BNE  = ftswF[ksw];
-   //   f_BSW  = ftneF[kzero];
-   //   f_BSE  = ftnwF[kw];
-   //   f_BNW  = ftseF[ks];
-
-   //   //drho_SEB = f_E+f_W+f_N+f_S+f_T+f_B+f_NE+f_SW+f_SE+f_NW+f_TE+f_BW+f_BE+f_TW+f_TN+f_BS+f_BN+f_TS+f_ZERO+f_TNE+f_TSW+f_TSE+f_TNW+f_BNE+f_BSW+f_BSE+f_BNW;
-   //   //vx1_SEB  = (((f_TNE-f_BSW)+(f_TSE-f_BNW)+(f_BNE-f_TSW)+(f_BSE-f_TNW)) + (((f_NE-f_SW)+(f_TE-f_BW))+((f_SE-f_NW)+(f_BE-f_TW))) + (f_E-f_W))/(one + drho_SEB);
-	  ////vx2_SEB  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_BNE-f_TSW)+(f_BNW-f_TSE)) + (((f_NE-f_SW)+(f_TN-f_BS))+((f_BN-f_TS)+(f_NW-f_SE))) + (f_N-f_S))/(one + drho_SEB);
-	  ////vx3_SEB  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_TSE-f_BNW)+(f_TSW-f_BNE)) + (((f_TE-f_BW)+(f_TN-f_BS))+((f_TW-f_BE)+(f_TS-f_BN))) + (f_T-f_B))/(one + drho_SEB);
-
-   //   //kxyFromfcNEQ_SEB    = -three*omegaS/(one-omegaS)*((f_SW+f_BSW+f_TSW-f_NW-f_BNW-f_TNW-f_SE-f_BSE-f_TSE+f_NE+f_BNE+f_TNE ) / (one + drho_SEB) - ((vx1_SEB*vx2_SEB)));
-   //   //kyzFromfcNEQ_SEB    = -three*omegaS/(one-omegaS)*((f_BS+f_BSE+f_BSW-f_TS-f_TSE-f_TSW-f_BN-f_BNE-f_BNW+f_TN+f_TNE+f_TNW ) / (one + drho_SEB) - ((vx2_SEB*vx3_SEB)));
-   //   //kxzFromfcNEQ_SEB    = -three*omegaS/(one-omegaS)*((f_BW+f_BSW+f_BNW-f_TW-f_TSW-f_TNW-f_BE-f_BSE-f_BNE+f_TE+f_TSE+f_TNE ) / (one + drho_SEB) - ((vx1_SEB*vx3_SEB)));
-   //   //kxxMyyFromfcNEQ_SEB = -c3o2*omegaS/(one-omegaS) *((f_BW+f_W+f_TW-f_BS-f_S-f_TS-f_BN-f_N-f_TN+f_BE+f_E+f_TE             ) / (one + drho_SEB) - ((vx1_SEB*vx1_SEB-vx2_SEB*vx2_SEB)));
-   //   //kxxMzzFromfcNEQ_SEB = -c3o2*omegaS/(one-omegaS) *((f_SW+f_W+f_NW-f_BS-f_TS-f_B-f_T-f_BN-f_TN+f_SE+f_E+f_NE             ) / (one + drho_SEB) - ((vx1_SEB*vx1_SEB-vx3_SEB*vx3_SEB)));
-
-	  //rho_tmp = f_E+f_W+f_N+f_S+f_T+f_B+f_NE+f_SW+f_SE+f_NW+f_TE+f_BW+f_BE+f_TW+f_TN+f_BS+f_BN+f_TS+f_ZERO+f_TNE+f_TSW+f_TSE+f_TNW+f_BNE+f_BSW+f_BSE+f_BNW;
-	  //
-	  //drho_SEB += rho_tmp;
-
-	  //vx1_tmp  = (((f_TNE-f_BSW)+(f_TSE-f_BNW)+(f_BNE-f_TSW)+(f_BSE-f_TNW)) + (((f_NE-f_SW)+(f_TE-f_BW))+((f_SE-f_NW)+(f_BE-f_TW))) + (f_E-f_W))/(one + rho_tmp);
-	  //vx2_tmp  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_BNE-f_TSW)+(f_BNW-f_TSE)) + (((f_NE-f_SW)+(f_TN-f_BS))+((f_BN-f_TS)+(f_NW-f_SE))) + (f_N-f_S))/(one + rho_tmp);
-	  //vx3_tmp  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_TSE-f_BNW)+(f_TSW-f_BNE)) + (((f_TE-f_BW)+(f_TN-f_BS))+((f_TW-f_BE)+(f_TS-f_BN))) + (f_T-f_B))/(one + rho_tmp);
-
-   //   vx1_SEB  += vx1_tmp;
-	  //vx2_SEB  += vx2_tmp;
-	  //vx3_SEB  += vx3_tmp;
-
-   //   drho_SEB *= c1o2;
-   //   vx1_SEB  *= c1o2;
-	  //vx2_SEB  *= c1o2;
-	  //vx3_SEB  *= c1o2;
-
-   //   kxyFromfcNEQ_SEB    += -three*omegaS/(one-omegaS)*((f_SW+f_BSW+f_TSW-f_NW-f_BNW-f_TNW-f_SE-f_BSE-f_TSE+f_NE+f_BNE+f_TNE ) / (one + rho_tmp) - ((vx1_tmp*vx2_tmp)));
-   //   kyzFromfcNEQ_SEB    += -three*omegaS/(one-omegaS)*((f_BS+f_BSE+f_BSW-f_TS-f_TSE-f_TSW-f_BN-f_BNE-f_BNW+f_TN+f_TNE+f_TNW ) / (one + rho_tmp) - ((vx2_tmp*vx3_tmp)));
-   //   kxzFromfcNEQ_SEB    += -three*omegaS/(one-omegaS)*((f_BW+f_BSW+f_BNW-f_TW-f_TSW-f_TNW-f_BE-f_BSE-f_BNE+f_TE+f_TSE+f_TNE ) / (one + rho_tmp) - ((vx1_tmp*vx3_tmp)));
-   //   kxxMyyFromfcNEQ_SEB += -c3o2*omegaS/(one-omegaS) *((f_BW+f_W+f_TW-f_BS-f_S-f_TS-f_BN-f_N-f_TN+f_BE+f_E+f_TE             ) / (one + rho_tmp) - ((vx1_tmp*vx1_tmp-vx2_tmp*vx2_tmp)));
-   //   kxxMzzFromfcNEQ_SEB += -c3o2*omegaS/(one-omegaS) *((f_SW+f_W+f_NW-f_BS-f_TS-f_B-f_T-f_BN-f_TN+f_SE+f_E+f_NE             ) / (one + rho_tmp) - ((vx1_tmp*vx1_tmp-vx3_tmp*vx3_tmp)));
-
-	  //kxyFromfcNEQ_SEB    *= c1o2;
-	  //kyzFromfcNEQ_SEB    *= c1o2;
-	  //kxzFromfcNEQ_SEB    *= c1o2;
-	  //kxxMyyFromfcNEQ_SEB *= c1o2;
-	  //kxxMzzFromfcNEQ_SEB *= c1o2;
-
-   //   //////////////////////////////////////////////////////////////////////////
-   //   //NWB//
-   //   //////////////////////////////////////////////////////////////////////////
-   //   //index 0
-   //   k0zero= k0s;
-   //   k0w   = k0sw;
-   //   k0s   = neighborFY[k0s];
-   //   k0b   = k0bs;
-   //   k0sw  = neighborFY[k0sw];
-   //   k0bw  = k0bsw;
-   //   k0bs  = neighborFY[k0bs];
-   //   k0bsw = neighborFY[k0bsw];
-   //   //////////////////////////////////////////////////////////////////////////
-   //   //index 
-   //   kzero= k0zero;
-   //   kw   = k0w;   
-   //   ks   = k0s;   
-   //   kb   = k0b;   
-   //   ksw  = k0sw;  
-   //   kbw  = k0bw;  
-   //   kbs  = k0bs;  
-   //   kbsw = k0bsw; 
-   //   ////////////////////////////////////////////////////////////////////////////////
-   //   f_E    = fwF[kw];
-   //   f_W    = feF[kzero];
-   //   f_N    = fsF[ks];
-   //   f_S    = fnF[kzero];
-   //   f_T    = fbF[kb];
-   //   f_B    = ftF[kzero];
-   //   f_NE   = fswF[ksw];
-   //   f_SW   = fneF[kzero];
-   //   f_SE   = fnwF[kw];
-   //   f_NW   = fseF[ks];
-   //   f_TE   = fbwF[kbw];
-   //   f_BW   = fteF[kzero];
-   //   f_BE   = ftwF[kw];
-   //   f_TW   = fbeF[kb];
-   //   f_TN   = fbsF[kbs];
-   //   f_BS   = ftnF[kzero];
-   //   f_BN   = ftsF[ks];
-   //   f_TS   = fbnF[kb];
-   //   f_ZERO = fzeroF[kzero];
-   //   f_TNE  = fbswF[kbsw];
-   //   f_TSW  = fbneF[kb];
-   //   f_TSE  = fbnwF[kbw];
-   //   f_TNW  = fbseF[kbs];
-   //   f_BNE  = ftswF[ksw];
-   //   f_BSW  = ftneF[kzero];
-   //   f_BSE  = ftnwF[kw];
-   //   f_BNW  = ftseF[ks];
-
-   //   //drho_NWB = f_E+f_W+f_N+f_S+f_T+f_B+f_NE+f_SW+f_SE+f_NW+f_TE+f_BW+f_BE+f_TW+f_TN+f_BS+f_BN+f_TS+f_ZERO+f_TNE+f_TSW+f_TSE+f_TNW+f_BNE+f_BSW+f_BSE+f_BNW;
-   //   //vx1_NWB  = (((f_TNE-f_BSW)+(f_TSE-f_BNW)+(f_BNE-f_TSW)+(f_BSE-f_TNW)) + (((f_NE-f_SW)+(f_TE-f_BW))+((f_SE-f_NW)+(f_BE-f_TW))) + (f_E-f_W))/(one + drho_NWB);
-	  ////vx2_NWB  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_BNE-f_TSW)+(f_BNW-f_TSE)) + (((f_NE-f_SW)+(f_TN-f_BS))+((f_BN-f_TS)+(f_NW-f_SE))) + (f_N-f_S))/(one + drho_NWB);
-	  ////vx3_NWB  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_TSE-f_BNW)+(f_TSW-f_BNE)) + (((f_TE-f_BW)+(f_TN-f_BS))+((f_TW-f_BE)+(f_TS-f_BN))) + (f_T-f_B))/(one + drho_NWB);
-
-   //   //kxyFromfcNEQ_NWB    = -three*omegaS/(one-omegaS)*((f_SW+f_BSW+f_TSW-f_NW-f_BNW-f_TNW-f_SE-f_BSE-f_TSE+f_NE+f_BNE+f_TNE ) / (one + drho_NWB) - ((vx1_NWB*vx2_NWB)));
-   //   //kyzFromfcNEQ_NWB    = -three*omegaS/(one-omegaS)*((f_BS+f_BSE+f_BSW-f_TS-f_TSE-f_TSW-f_BN-f_BNE-f_BNW+f_TN+f_TNE+f_TNW ) / (one + drho_NWB) - ((vx2_NWB*vx3_NWB)));
-   //   //kxzFromfcNEQ_NWB    = -three*omegaS/(one-omegaS)*((f_BW+f_BSW+f_BNW-f_TW-f_TSW-f_TNW-f_BE-f_BSE-f_BNE+f_TE+f_TSE+f_TNE ) / (one + drho_NWB) - ((vx1_NWB*vx3_NWB)));
-   //   //kxxMyyFromfcNEQ_NWB = -c3o2*omegaS/(one-omegaS) *((f_BW+f_W+f_TW-f_BS-f_S-f_TS-f_BN-f_N-f_TN+f_BE+f_E+f_TE             ) / (one + drho_NWB) - ((vx1_NWB*vx1_NWB-vx2_NWB*vx2_NWB)));
-   //   //kxxMzzFromfcNEQ_NWB = -c3o2*omegaS/(one-omegaS) *((f_SW+f_W+f_NW-f_BS-f_TS-f_B-f_T-f_BN-f_TN+f_SE+f_E+f_NE             ) / (one + drho_NWB) - ((vx1_NWB*vx1_NWB-vx3_NWB*vx3_NWB)));
-
-	  //rho_tmp = f_E+f_W+f_N+f_S+f_T+f_B+f_NE+f_SW+f_SE+f_NW+f_TE+f_BW+f_BE+f_TW+f_TN+f_BS+f_BN+f_TS+f_ZERO+f_TNE+f_TSW+f_TSE+f_TNW+f_BNE+f_BSW+f_BSE+f_BNW;
-	  //
-	  //drho_NWB += rho_tmp;
-
-	  //vx1_tmp  = (((f_TNE-f_BSW)+(f_TSE-f_BNW)+(f_BNE-f_TSW)+(f_BSE-f_TNW)) + (((f_NE-f_SW)+(f_TE-f_BW))+((f_SE-f_NW)+(f_BE-f_TW))) + (f_E-f_W))/(one + rho_tmp);
-	  //vx2_tmp  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_BNE-f_TSW)+(f_BNW-f_TSE)) + (((f_NE-f_SW)+(f_TN-f_BS))+((f_BN-f_TS)+(f_NW-f_SE))) + (f_N-f_S))/(one + rho_tmp);
-	  //vx3_tmp  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_TSE-f_BNW)+(f_TSW-f_BNE)) + (((f_TE-f_BW)+(f_TN-f_BS))+((f_TW-f_BE)+(f_TS-f_BN))) + (f_T-f_B))/(one + rho_tmp);
-
-   //   vx1_NWB  += vx1_tmp;
-	  //vx2_NWB  += vx2_tmp;
-	  //vx3_NWB  += vx3_tmp;
-
-   //   drho_NWB *= c1o2;
-   //   vx1_NWB  *= c1o2;
-	  //vx2_NWB  *= c1o2;
-	  //vx3_NWB  *= c1o2;
-
-   //   kxyFromfcNEQ_NWB    += -three*omegaS/(one-omegaS)*((f_SW+f_BSW+f_TSW-f_NW-f_BNW-f_TNW-f_SE-f_BSE-f_TSE+f_NE+f_BNE+f_TNE ) / (one + rho_tmp) - ((vx1_tmp*vx2_tmp)));
-   //   kyzFromfcNEQ_NWB    += -three*omegaS/(one-omegaS)*((f_BS+f_BSE+f_BSW-f_TS-f_TSE-f_TSW-f_BN-f_BNE-f_BNW+f_TN+f_TNE+f_TNW ) / (one + rho_tmp) - ((vx2_tmp*vx3_tmp)));
-   //   kxzFromfcNEQ_NWB    += -three*omegaS/(one-omegaS)*((f_BW+f_BSW+f_BNW-f_TW-f_TSW-f_TNW-f_BE-f_BSE-f_BNE+f_TE+f_TSE+f_TNE ) / (one + rho_tmp) - ((vx1_tmp*vx3_tmp)));
-   //   kxxMyyFromfcNEQ_NWB += -c3o2*omegaS/(one-omegaS) *((f_BW+f_W+f_TW-f_BS-f_S-f_TS-f_BN-f_N-f_TN+f_BE+f_E+f_TE             ) / (one + rho_tmp) - ((vx1_tmp*vx1_tmp-vx2_tmp*vx2_tmp)));
-   //   kxxMzzFromfcNEQ_NWB += -c3o2*omegaS/(one-omegaS) *((f_SW+f_W+f_NW-f_BS-f_TS-f_B-f_T-f_BN-f_TN+f_SE+f_E+f_NE             ) / (one + rho_tmp) - ((vx1_tmp*vx1_tmp-vx3_tmp*vx3_tmp)));
-
-	  //kxyFromfcNEQ_NWB    *= c1o2;
-	  //kyzFromfcNEQ_NWB    *= c1o2;
-	  //kxzFromfcNEQ_NWB    *= c1o2;
-	  //kxxMyyFromfcNEQ_NWB *= c1o2;
-	  //kxxMzzFromfcNEQ_NWB *= c1o2;
-
-   //   //////////////////////////////////////////////////////////////////////////
-   //   //NWT//
-   //   //////////////////////////////////////////////////////////////////////////
-   //   //index 
-   //   kzero= kb;
-   //   kw   = kbw;   
-   //   ks   = kbs;   
-   //   kb   = neighborFZ[kb];   
-   //   ksw  = kbsw;  
-   //   kbw  = neighborFZ[kbw];  
-   //   kbs  = neighborFZ[kbs];  
-   //   kbsw = neighborFZ[kbsw]; 
-   //   ////////////////////////////////////////////////////////////////////////////////
-   //   f_E    = fwF[kw];
-   //   f_W    = feF[kzero];
-   //   f_N    = fsF[ks];
-   //   f_S    = fnF[kzero];
-   //   f_T    = fbF[kb];
-   //   f_B    = ftF[kzero];
-   //   f_NE   = fswF[ksw];
-   //   f_SW   = fneF[kzero];
-   //   f_SE   = fnwF[kw];
-   //   f_NW   = fseF[ks];
-   //   f_TE   = fbwF[kbw];
-   //   f_BW   = fteF[kzero];
-   //   f_BE   = ftwF[kw];
-   //   f_TW   = fbeF[kb];
-   //   f_TN   = fbsF[kbs];
-   //   f_BS   = ftnF[kzero];
-   //   f_BN   = ftsF[ks];
-   //   f_TS   = fbnF[kb];
-   //   f_ZERO = fzeroF[kzero];
-   //   f_TNE  = fbswF[kbsw];
-   //   f_TSW  = fbneF[kb];
-   //   f_TSE  = fbnwF[kbw];
-   //   f_TNW  = fbseF[kbs];
-   //   f_BNE  = ftswF[ksw];
-   //   f_BSW  = ftneF[kzero];
-   //   f_BSE  = ftnwF[kw];
-   //   f_BNW  = ftseF[ks];
-
-   //   //drho_NWT = f_E+f_W+f_N+f_S+f_T+f_B+f_NE+f_SW+f_SE+f_NW+f_TE+f_BW+f_BE+f_TW+f_TN+f_BS+f_BN+f_TS+f_ZERO+f_TNE+f_TSW+f_TSE+f_TNW+f_BNE+f_BSW+f_BSE+f_BNW;
-   //   //vx1_NWT  = (((f_TNE-f_BSW)+(f_TSE-f_BNW)+(f_BNE-f_TSW)+(f_BSE-f_TNW)) + (((f_NE-f_SW)+(f_TE-f_BW))+((f_SE-f_NW)+(f_BE-f_TW))) + (f_E-f_W))/(one + drho_NWT);
-	  ////vx2_NWT  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_BNE-f_TSW)+(f_BNW-f_TSE)) + (((f_NE-f_SW)+(f_TN-f_BS))+((f_BN-f_TS)+(f_NW-f_SE))) + (f_N-f_S))/(one + drho_NWT);
-	  ////vx3_NWT  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_TSE-f_BNW)+(f_TSW-f_BNE)) + (((f_TE-f_BW)+(f_TN-f_BS))+((f_TW-f_BE)+(f_TS-f_BN))) + (f_T-f_B))/(one + drho_NWT);
-
-   //   //kxyFromfcNEQ_NWT    = -three*omegaS/(one-omegaS)*((f_SW+f_BSW+f_TSW-f_NW-f_BNW-f_TNW-f_SE-f_BSE-f_TSE+f_NE+f_BNE+f_TNE ) / (one + drho_NWT) - ((vx1_NWT*vx2_NWT)));
-   //   //kyzFromfcNEQ_NWT    = -three*omegaS/(one-omegaS)*((f_BS+f_BSE+f_BSW-f_TS-f_TSE-f_TSW-f_BN-f_BNE-f_BNW+f_TN+f_TNE+f_TNW ) / (one + drho_NWT) - ((vx2_NWT*vx3_NWT)));
-   //   //kxzFromfcNEQ_NWT    = -three*omegaS/(one-omegaS)*((f_BW+f_BSW+f_BNW-f_TW-f_TSW-f_TNW-f_BE-f_BSE-f_BNE+f_TE+f_TSE+f_TNE ) / (one + drho_NWT) - ((vx1_NWT*vx3_NWT)));
-   //   //kxxMyyFromfcNEQ_NWT = -c3o2*omegaS/(one-omegaS) *((f_BW+f_W+f_TW-f_BS-f_S-f_TS-f_BN-f_N-f_TN+f_BE+f_E+f_TE             ) / (one + drho_NWT) - ((vx1_NWT*vx1_NWT-vx2_NWT*vx2_NWT)));
-   //   //kxxMzzFromfcNEQ_NWT = -c3o2*omegaS/(one-omegaS) *((f_SW+f_W+f_NW-f_BS-f_TS-f_B-f_T-f_BN-f_TN+f_SE+f_E+f_NE             ) / (one + drho_NWT) - ((vx1_NWT*vx1_NWT-vx3_NWT*vx3_NWT)));
-
-	  //rho_tmp = f_E+f_W+f_N+f_S+f_T+f_B+f_NE+f_SW+f_SE+f_NW+f_TE+f_BW+f_BE+f_TW+f_TN+f_BS+f_BN+f_TS+f_ZERO+f_TNE+f_TSW+f_TSE+f_TNW+f_BNE+f_BSW+f_BSE+f_BNW;
-	  //
-	  //drho_NWT += rho_tmp;
-
-	  //vx1_tmp  = (((f_TNE-f_BSW)+(f_TSE-f_BNW)+(f_BNE-f_TSW)+(f_BSE-f_TNW)) + (((f_NE-f_SW)+(f_TE-f_BW))+((f_SE-f_NW)+(f_BE-f_TW))) + (f_E-f_W))/(one + rho_tmp);
-	  //vx2_tmp  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_BNE-f_TSW)+(f_BNW-f_TSE)) + (((f_NE-f_SW)+(f_TN-f_BS))+((f_BN-f_TS)+(f_NW-f_SE))) + (f_N-f_S))/(one + rho_tmp);
-	  //vx3_tmp  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_TSE-f_BNW)+(f_TSW-f_BNE)) + (((f_TE-f_BW)+(f_TN-f_BS))+((f_TW-f_BE)+(f_TS-f_BN))) + (f_T-f_B))/(one + rho_tmp);
-
-   //   vx1_NWT  += vx1_tmp;
-	  //vx2_NWT  += vx2_tmp;
-	  //vx3_NWT  += vx3_tmp;
-
-   //   drho_NWT *= c1o2;
-   //   vx1_NWT  *= c1o2;
-	  //vx2_NWT  *= c1o2;
-	  //vx3_NWT  *= c1o2;
-
-   //   kxyFromfcNEQ_NWT    += -three*omegaS/(one-omegaS)*((f_SW+f_BSW+f_TSW-f_NW-f_BNW-f_TNW-f_SE-f_BSE-f_TSE+f_NE+f_BNE+f_TNE ) / (one + rho_tmp) - ((vx1_tmp*vx2_tmp)));
-   //   kyzFromfcNEQ_NWT    += -three*omegaS/(one-omegaS)*((f_BS+f_BSE+f_BSW-f_TS-f_TSE-f_TSW-f_BN-f_BNE-f_BNW+f_TN+f_TNE+f_TNW ) / (one + rho_tmp) - ((vx2_tmp*vx3_tmp)));
-   //   kxzFromfcNEQ_NWT    += -three*omegaS/(one-omegaS)*((f_BW+f_BSW+f_BNW-f_TW-f_TSW-f_TNW-f_BE-f_BSE-f_BNE+f_TE+f_TSE+f_TNE ) / (one + rho_tmp) - ((vx1_tmp*vx3_tmp)));
-   //   kxxMyyFromfcNEQ_NWT += -c3o2*omegaS/(one-omegaS) *((f_BW+f_W+f_TW-f_BS-f_S-f_TS-f_BN-f_N-f_TN+f_BE+f_E+f_TE             ) / (one + rho_tmp) - ((vx1_tmp*vx1_tmp-vx2_tmp*vx2_tmp)));
-   //   kxxMzzFromfcNEQ_NWT += -c3o2*omegaS/(one-omegaS) *((f_SW+f_W+f_NW-f_BS-f_TS-f_B-f_T-f_BN-f_TN+f_SE+f_E+f_NE             ) / (one + rho_tmp) - ((vx1_tmp*vx1_tmp-vx3_tmp*vx3_tmp)));
-
-	  //kxyFromfcNEQ_NWT    *= c1o2;
-	  //kyzFromfcNEQ_NWT    *= c1o2;
-	  //kxzFromfcNEQ_NWT    *= c1o2;
-	  //kxxMyyFromfcNEQ_NWT *= c1o2;
-	  //kxxMzzFromfcNEQ_NWT *= c1o2;
-
-   //   //////////////////////////////////////////////////////////////////////////
-   //   //NET//
-   //   //////////////////////////////////////////////////////////////////////////
-   //   //index 
-   //   kzero= kw;
-   //   kw   = neighborFX[kw];   
-   //   ks   = ksw;   
-   //   kb   = kbw;   
-   //   ksw  = neighborFX[ksw];  
-   //   kbw  = neighborFX[kbw];  
-   //   kbs  = kbsw;  
-   //   kbsw = neighborFX[kbsw]; 
-   //   ////////////////////////////////////////////////////////////////////////////////
-   //   f_E    = fwF[kw];
-   //   f_W    = feF[kzero];
-   //   f_N    = fsF[ks];
-   //   f_S    = fnF[kzero];
-   //   f_T    = fbF[kb];
-   //   f_B    = ftF[kzero];
-   //   f_NE   = fswF[ksw];
-   //   f_SW   = fneF[kzero];
-   //   f_SE   = fnwF[kw];
-   //   f_NW   = fseF[ks];
-   //   f_TE   = fbwF[kbw];
-   //   f_BW   = fteF[kzero];
-   //   f_BE   = ftwF[kw];
-   //   f_TW   = fbeF[kb];
-   //   f_TN   = fbsF[kbs];
-   //   f_BS   = ftnF[kzero];
-   //   f_BN   = ftsF[ks];
-   //   f_TS   = fbnF[kb];
-   //   f_ZERO = fzeroF[kzero];
-   //   f_TNE  = fbswF[kbsw];
-   //   f_TSW  = fbneF[kb];
-   //   f_TSE  = fbnwF[kbw];
-   //   f_TNW  = fbseF[kbs];
-   //   f_BNE  = ftswF[ksw];
-   //   f_BSW  = ftneF[kzero];
-   //   f_BSE  = ftnwF[kw];
-   //   f_BNW  = ftseF[ks];
-
-   //   //drho_NET = f_E+f_W+f_N+f_S+f_T+f_B+f_NE+f_SW+f_SE+f_NW+f_TE+f_BW+f_BE+f_TW+f_TN+f_BS+f_BN+f_TS+f_ZERO+f_TNE+f_TSW+f_TSE+f_TNW+f_BNE+f_BSW+f_BSE+f_BNW;
-   //   //vx1_NET  = (((f_TNE-f_BSW)+(f_TSE-f_BNW)+(f_BNE-f_TSW)+(f_BSE-f_TNW)) + (((f_NE-f_SW)+(f_TE-f_BW))+((f_SE-f_NW)+(f_BE-f_TW))) + (f_E-f_W))/(one + drho_NET);
-	  ////vx2_NET  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_BNE-f_TSW)+(f_BNW-f_TSE)) + (((f_NE-f_SW)+(f_TN-f_BS))+((f_BN-f_TS)+(f_NW-f_SE))) + (f_N-f_S))/(one + drho_NET);
-	  ////vx3_NET  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_TSE-f_BNW)+(f_TSW-f_BNE)) + (((f_TE-f_BW)+(f_TN-f_BS))+((f_TW-f_BE)+(f_TS-f_BN))) + (f_T-f_B))/(one + drho_NET);
-
-   //   //kxyFromfcNEQ_NET    = -three*omegaS/(one-omegaS)*((f_SW+f_BSW+f_TSW-f_NW-f_BNW-f_TNW-f_SE-f_BSE-f_TSE+f_NE+f_BNE+f_TNE ) / (one + drho_NET) - ((vx1_NET*vx2_NET)));
-   //   //kyzFromfcNEQ_NET    = -three*omegaS/(one-omegaS)*((f_BS+f_BSE+f_BSW-f_TS-f_TSE-f_TSW-f_BN-f_BNE-f_BNW+f_TN+f_TNE+f_TNW ) / (one + drho_NET) - ((vx2_NET*vx3_NET)));
-   //   //kxzFromfcNEQ_NET    = -three*omegaS/(one-omegaS)*((f_BW+f_BSW+f_BNW-f_TW-f_TSW-f_TNW-f_BE-f_BSE-f_BNE+f_TE+f_TSE+f_TNE ) / (one + drho_NET) - ((vx1_NET*vx3_NET)));
-   //   //kxxMyyFromfcNEQ_NET = -c3o2*omegaS/(one-omegaS) *((f_BW+f_W+f_TW-f_BS-f_S-f_TS-f_BN-f_N-f_TN+f_BE+f_E+f_TE             ) / (one + drho_NET) - ((vx1_NET*vx1_NET-vx2_NET*vx2_NET)));
-   //   //kxxMzzFromfcNEQ_NET = -c3o2*omegaS/(one-omegaS) *((f_SW+f_W+f_NW-f_BS-f_TS-f_B-f_T-f_BN-f_TN+f_SE+f_E+f_NE             ) / (one + drho_NET) - ((vx1_NET*vx1_NET-vx3_NET*vx3_NET)));
-
-	  //rho_tmp = f_E+f_W+f_N+f_S+f_T+f_B+f_NE+f_SW+f_SE+f_NW+f_TE+f_BW+f_BE+f_TW+f_TN+f_BS+f_BN+f_TS+f_ZERO+f_TNE+f_TSW+f_TSE+f_TNW+f_BNE+f_BSW+f_BSE+f_BNW;
-	  //
-	  //drho_NET += rho_tmp;
-
-	  //vx1_tmp  = (((f_TNE-f_BSW)+(f_TSE-f_BNW)+(f_BNE-f_TSW)+(f_BSE-f_TNW)) + (((f_NE-f_SW)+(f_TE-f_BW))+((f_SE-f_NW)+(f_BE-f_TW))) + (f_E-f_W))/(one + rho_tmp);
-	  //vx2_tmp  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_BNE-f_TSW)+(f_BNW-f_TSE)) + (((f_NE-f_SW)+(f_TN-f_BS))+((f_BN-f_TS)+(f_NW-f_SE))) + (f_N-f_S))/(one + rho_tmp);
-	  //vx3_tmp  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_TSE-f_BNW)+(f_TSW-f_BNE)) + (((f_TE-f_BW)+(f_TN-f_BS))+((f_TW-f_BE)+(f_TS-f_BN))) + (f_T-f_B))/(one + rho_tmp);
-
-   //   vx1_NET  += vx1_tmp;
-	  //vx2_NET  += vx2_tmp;
-	  //vx3_NET  += vx3_tmp;
-
-   //   drho_NET *= c1o2;
-   //   vx1_NET  *= c1o2;
-	  //vx2_NET  *= c1o2;
-	  //vx3_NET  *= c1o2;
-
-   //   kxyFromfcNEQ_NET    += -three*omegaS/(one-omegaS)*((f_SW+f_BSW+f_TSW-f_NW-f_BNW-f_TNW-f_SE-f_BSE-f_TSE+f_NE+f_BNE+f_TNE ) / (one + rho_tmp) - ((vx1_tmp*vx2_tmp)));
-   //   kyzFromfcNEQ_NET    += -three*omegaS/(one-omegaS)*((f_BS+f_BSE+f_BSW-f_TS-f_TSE-f_TSW-f_BN-f_BNE-f_BNW+f_TN+f_TNE+f_TNW ) / (one + rho_tmp) - ((vx2_tmp*vx3_tmp)));
-   //   kxzFromfcNEQ_NET    += -three*omegaS/(one-omegaS)*((f_BW+f_BSW+f_BNW-f_TW-f_TSW-f_TNW-f_BE-f_BSE-f_BNE+f_TE+f_TSE+f_TNE ) / (one + rho_tmp) - ((vx1_tmp*vx3_tmp)));
-   //   kxxMyyFromfcNEQ_NET += -c3o2*omegaS/(one-omegaS) *((f_BW+f_W+f_TW-f_BS-f_S-f_TS-f_BN-f_N-f_TN+f_BE+f_E+f_TE             ) / (one + rho_tmp) - ((vx1_tmp*vx1_tmp-vx2_tmp*vx2_tmp)));
-   //   kxxMzzFromfcNEQ_NET += -c3o2*omegaS/(one-omegaS) *((f_SW+f_W+f_NW-f_BS-f_TS-f_B-f_T-f_BN-f_TN+f_SE+f_E+f_NE             ) / (one + rho_tmp) - ((vx1_tmp*vx1_tmp-vx3_tmp*vx3_tmp)));
-
-	  //kxyFromfcNEQ_NET    *= c1o2;
-	  //kyzFromfcNEQ_NET    *= c1o2;
-	  //kxzFromfcNEQ_NET    *= c1o2;
-	  //kxxMyyFromfcNEQ_NET *= c1o2;
-	  //kxxMzzFromfcNEQ_NET *= c1o2;
-
-   //   //////////////////////////////////////////////////////////////////////////
-   //   //NEB//
-   //   //////////////////////////////////////////////////////////////////////////
-   //   //index 
-   //   kb   = kzero;   
-   //   kbw  = kw;  
-   //   kbs  = ks;  
-   //   kbsw = ksw; 
-   //   kzero= k0w;
-   //   kw   = neighborFX[k0w];   
-   //   ks   = k0sw;   
-   //   ksw  = neighborFX[k0sw];  
-   //   ////////////////////////////////////////////////////////////////////////////////
-   //   f_E    = fwF[kw];
-   //   f_W    = feF[kzero];
-   //   f_N    = fsF[ks];
-   //   f_S    = fnF[kzero];
-   //   f_T    = fbF[kb];
-   //   f_B    = ftF[kzero];
-   //   f_NE   = fswF[ksw];
-   //   f_SW   = fneF[kzero];
-   //   f_SE   = fnwF[kw];
-   //   f_NW   = fseF[ks];
-   //   f_TE   = fbwF[kbw];
-   //   f_BW   = fteF[kzero];
-   //   f_BE   = ftwF[kw];
-   //   f_TW   = fbeF[kb];
-   //   f_TN   = fbsF[kbs];
-   //   f_BS   = ftnF[kzero];
-   //   f_BN   = ftsF[ks];
-   //   f_TS   = fbnF[kb];
-   //   f_ZERO = fzeroF[kzero];
-   //   f_TNE  = fbswF[kbsw];
-   //   f_TSW  = fbneF[kb];
-   //   f_TSE  = fbnwF[kbw];
-   //   f_TNW  = fbseF[kbs];
-   //   f_BNE  = ftswF[ksw];
-   //   f_BSW  = ftneF[kzero];
-   //   f_BSE  = ftnwF[kw];
-   //   f_BNW  = ftseF[ks];
-
-   //   //drho_NEB = f_E+f_W+f_N+f_S+f_T+f_B+f_NE+f_SW+f_SE+f_NW+f_TE+f_BW+f_BE+f_TW+f_TN+f_BS+f_BN+f_TS+f_ZERO+f_TNE+f_TSW+f_TSE+f_TNW+f_BNE+f_BSW+f_BSE+f_BNW;
-   //   //vx1_NEB  = (((f_TNE-f_BSW)+(f_TSE-f_BNW)+(f_BNE-f_TSW)+(f_BSE-f_TNW)) + (((f_NE-f_SW)+(f_TE-f_BW))+((f_SE-f_NW)+(f_BE-f_TW))) + (f_E-f_W))/(one + drho_NEB);
-	  ////vx2_NEB  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_BNE-f_TSW)+(f_BNW-f_TSE)) + (((f_NE-f_SW)+(f_TN-f_BS))+((f_BN-f_TS)+(f_NW-f_SE))) + (f_N-f_S))/(one + drho_NEB);
-	  ////vx3_NEB  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_TSE-f_BNW)+(f_TSW-f_BNE)) + (((f_TE-f_BW)+(f_TN-f_BS))+((f_TW-f_BE)+(f_TS-f_BN))) + (f_T-f_B))/(one + drho_NEB);
-
-   //   //kxyFromfcNEQ_NEB    = -three*omegaS/(one-omegaS)*((f_SW+f_BSW+f_TSW-f_NW-f_BNW-f_TNW-f_SE-f_BSE-f_TSE+f_NE+f_BNE+f_TNE ) / (one + drho_NEB) - ((vx1_NEB*vx2_NEB)));
-   //   //kyzFromfcNEQ_NEB    = -three*omegaS/(one-omegaS)*((f_BS+f_BSE+f_BSW-f_TS-f_TSE-f_TSW-f_BN-f_BNE-f_BNW+f_TN+f_TNE+f_TNW ) / (one + drho_NEB) - ((vx2_NEB*vx3_NEB)));
-   //   //kxzFromfcNEQ_NEB    = -three*omegaS/(one-omegaS)*((f_BW+f_BSW+f_BNW-f_TW-f_TSW-f_TNW-f_BE-f_BSE-f_BNE+f_TE+f_TSE+f_TNE ) / (one + drho_NEB) - ((vx1_NEB*vx3_NEB)));
-   //   //kxxMyyFromfcNEQ_NEB = -c3o2*omegaS/(one-omegaS) *((f_BW+f_W+f_TW-f_BS-f_S-f_TS-f_BN-f_N-f_TN+f_BE+f_E+f_TE             ) / (one + drho_NEB) - ((vx1_NEB*vx1_NEB-vx2_NEB*vx2_NEB)));
-   //   //kxxMzzFromfcNEQ_NEB = -c3o2*omegaS/(one-omegaS) *((f_SW+f_W+f_NW-f_BS-f_TS-f_B-f_T-f_BN-f_TN+f_SE+f_E+f_NE             ) / (one + drho_NEB) - ((vx1_NEB*vx1_NEB-vx3_NEB*vx3_NEB)));
-
-	  //rho_tmp = f_E+f_W+f_N+f_S+f_T+f_B+f_NE+f_SW+f_SE+f_NW+f_TE+f_BW+f_BE+f_TW+f_TN+f_BS+f_BN+f_TS+f_ZERO+f_TNE+f_TSW+f_TSE+f_TNW+f_BNE+f_BSW+f_BSE+f_BNW;
-	  //
-	  //drho_NEB += rho_tmp;
-
-	  //vx1_tmp  = (((f_TNE-f_BSW)+(f_TSE-f_BNW)+(f_BNE-f_TSW)+(f_BSE-f_TNW)) + (((f_NE-f_SW)+(f_TE-f_BW))+((f_SE-f_NW)+(f_BE-f_TW))) + (f_E-f_W))/(one + rho_tmp);
-	  //vx2_tmp  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_BNE-f_TSW)+(f_BNW-f_TSE)) + (((f_NE-f_SW)+(f_TN-f_BS))+((f_BN-f_TS)+(f_NW-f_SE))) + (f_N-f_S))/(one + rho_tmp);
-	  //vx3_tmp  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_TSE-f_BNW)+(f_TSW-f_BNE)) + (((f_TE-f_BW)+(f_TN-f_BS))+((f_TW-f_BE)+(f_TS-f_BN))) + (f_T-f_B))/(one + rho_tmp);
-
-   //   vx1_NEB  += vx1_tmp;
-	  //vx2_NEB  += vx2_tmp;
-	  //vx3_NEB  += vx3_tmp;
-
-   //   drho_NEB *= c1o2;
-   //   vx1_NEB  *= c1o2;
-	  //vx2_NEB  *= c1o2;
-	  //vx3_NEB  *= c1o2;
-
-   //   kxyFromfcNEQ_NEB    += -three*omegaS/(one-omegaS)*((f_SW+f_BSW+f_TSW-f_NW-f_BNW-f_TNW-f_SE-f_BSE-f_TSE+f_NE+f_BNE+f_TNE ) / (one + rho_tmp) - ((vx1_tmp*vx2_tmp)));
-   //   kyzFromfcNEQ_NEB    += -three*omegaS/(one-omegaS)*((f_BS+f_BSE+f_BSW-f_TS-f_TSE-f_TSW-f_BN-f_BNE-f_BNW+f_TN+f_TNE+f_TNW ) / (one + rho_tmp) - ((vx2_tmp*vx3_tmp)));
-   //   kxzFromfcNEQ_NEB    += -three*omegaS/(one-omegaS)*((f_BW+f_BSW+f_BNW-f_TW-f_TSW-f_TNW-f_BE-f_BSE-f_BNE+f_TE+f_TSE+f_TNE ) / (one + rho_tmp) - ((vx1_tmp*vx3_tmp)));
-   //   kxxMyyFromfcNEQ_NEB += -c3o2*omegaS/(one-omegaS) *((f_BW+f_W+f_TW-f_BS-f_S-f_TS-f_BN-f_N-f_TN+f_BE+f_E+f_TE             ) / (one + rho_tmp) - ((vx1_tmp*vx1_tmp-vx2_tmp*vx2_tmp)));
-   //   kxxMzzFromfcNEQ_NEB += -c3o2*omegaS/(one-omegaS) *((f_SW+f_W+f_NW-f_BS-f_TS-f_B-f_T-f_BN-f_TN+f_SE+f_E+f_NE             ) / (one + rho_tmp) - ((vx1_tmp*vx1_tmp-vx3_tmp*vx3_tmp)));
-
-	  //kxyFromfcNEQ_NEB    *= c1o2;
-	  //kyzFromfcNEQ_NEB    *= c1o2;
-	  //kxzFromfcNEQ_NEB    *= c1o2;
-	  //kxxMyyFromfcNEQ_NEB *= c1o2;
-	  //kxxMzzFromfcNEQ_NEB *= c1o2;
-	  //
-	  //
-	  
-	  
-	  
-	  
-	  
-	  
-	  
-	  
-	  
-	  
-	  
-	  
-	  
-	  
-	  
-	  
-	  
-	  
-	  
-	  
-	  
-	  
-	  
-	  
-	  
-	  
-	  
-	  
-	  
-	  
-	  
-	  
-	  
-	  
-	  
-	  
-	  //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	  //kxyFromfcNEQ_SWB    = zero;
-	  //kyzFromfcNEQ_SWB    = zero;
-	  //kxzFromfcNEQ_SWB    = zero;
-	  //kxxMyyFromfcNEQ_SWB = zero;
-	  //kxxMzzFromfcNEQ_SWB = zero;
-	  //kxyFromfcNEQ_SWT    = zero;
-	  //kyzFromfcNEQ_SWT    = zero;
-	  //kxzFromfcNEQ_SWT    = zero;
-	  //kxxMyyFromfcNEQ_SWT = zero;
-	  //kxxMzzFromfcNEQ_SWT = zero;
-	  //kxyFromfcNEQ_SET    = zero;
-	  //kyzFromfcNEQ_SET    = zero;
-	  //kxzFromfcNEQ_SET    = zero;
-	  //kxxMyyFromfcNEQ_SET = zero;
-	  //kxxMzzFromfcNEQ_SET = zero;
-	  //kxyFromfcNEQ_SEB    = zero;
-	  //kyzFromfcNEQ_SEB    = zero;
-	  //kxzFromfcNEQ_SEB    = zero;
-	  //kxxMyyFromfcNEQ_SEB = zero;
-	  //kxxMzzFromfcNEQ_SEB = zero;
-	  //kxyFromfcNEQ_NWB    = zero;
-	  //kyzFromfcNEQ_NWB    = zero;
-	  //kxzFromfcNEQ_NWB    = zero;
-	  //kxxMyyFromfcNEQ_NWB = zero;
-	  //kxxMzzFromfcNEQ_NWB = zero;
-	  //kxyFromfcNEQ_NWT    = zero;
-	  //kyzFromfcNEQ_NWT    = zero;
-	  //kxzFromfcNEQ_NWT    = zero;
-	  //kxxMyyFromfcNEQ_NWT = zero;
-	  //kxxMzzFromfcNEQ_NWT = zero;
-	  //kxyFromfcNEQ_NET    = zero;
-	  //kyzFromfcNEQ_NET    = zero;
-	  //kxzFromfcNEQ_NET    = zero;
-	  //kxxMyyFromfcNEQ_NET = zero;
-	  //kxxMzzFromfcNEQ_NET = zero;
-	  //kxyFromfcNEQ_NEB    = zero;
-	  //kyzFromfcNEQ_NEB    = zero;
-	  //kxzFromfcNEQ_NEB    = zero;
-	  //kxxMyyFromfcNEQ_NEB = zero;
-	  //kxxMzzFromfcNEQ_NEB = zero;
       //////////////////////////////////////////////////////////////////////////
       //3
       //////////////////////////////////////////////////////////////////////////
@@ -13278,8 +12474,8 @@ __global__ void scaleFC_Fix_comp_27(  real* DC,
 												 unsigned int* neighborFX,
 												 unsigned int* neighborFY,
 												 unsigned int* neighborFZ,
-												 unsigned int size_MatC, 
-												 unsigned int size_MatF, 
+												 unsigned long long numberOfLBnodesCoarse, 
+												 unsigned long long numberOfLBnodesFine, 
 												 bool isEvenTimestep,
 												 unsigned int* posC, 
 												 unsigned int* posFSWB, 
@@ -13296,96 +12492,96 @@ __global__ void scaleFC_Fix_comp_27(  real* DC,
    real *feF, *fwF, *fnF, *fsF, *ftF, *fbF, *fneF, *fswF, *fseF, *fnwF, *fteF, *fbwF, *fbeF, *ftwF, *ftnF, *fbsF, *fbnF, *ftsF, *fzeroF, 
       *ftneF, *ftswF, *ftseF, *ftnwF, *fbneF, *fbswF, *fbseF, *fbnwF;
 
-   feF    = &DF[DIR_P00   *size_MatF];
-   fwF    = &DF[DIR_M00   *size_MatF];
-   fnF    = &DF[DIR_0P0   *size_MatF];
-   fsF    = &DF[DIR_0M0   *size_MatF];
-   ftF    = &DF[DIR_00P   *size_MatF];
-   fbF    = &DF[DIR_00M   *size_MatF];
-   fneF   = &DF[DIR_PP0  *size_MatF];
-   fswF   = &DF[DIR_MM0  *size_MatF];
-   fseF   = &DF[DIR_PM0  *size_MatF];
-   fnwF   = &DF[DIR_MP0  *size_MatF];
-   fteF   = &DF[DIR_P0P  *size_MatF];
-   fbwF   = &DF[DIR_M0M  *size_MatF];
-   fbeF   = &DF[DIR_P0M  *size_MatF];
-   ftwF   = &DF[DIR_M0P  *size_MatF];
-   ftnF   = &DF[DIR_0PP  *size_MatF];
-   fbsF   = &DF[DIR_0MM  *size_MatF];
-   fbnF   = &DF[DIR_0PM  *size_MatF];
-   ftsF   = &DF[DIR_0MP  *size_MatF];
-   fzeroF = &DF[DIR_000*size_MatF];
-   ftneF  = &DF[DIR_PPP *size_MatF];
-   ftswF  = &DF[DIR_MMP *size_MatF];
-   ftseF  = &DF[DIR_PMP *size_MatF];
-   ftnwF  = &DF[DIR_MPP *size_MatF];
-   fbneF  = &DF[DIR_PPM *size_MatF];
-   fbswF  = &DF[DIR_MMM *size_MatF];
-   fbseF  = &DF[DIR_PMM *size_MatF];
-   fbnwF  = &DF[DIR_MPM *size_MatF];
+   feF    = &DF[DIR_P00 * numberOfLBnodesFine];
+   fwF    = &DF[DIR_M00 * numberOfLBnodesFine];
+   fnF    = &DF[DIR_0P0 * numberOfLBnodesFine];
+   fsF    = &DF[DIR_0M0 * numberOfLBnodesFine];
+   ftF    = &DF[DIR_00P * numberOfLBnodesFine];
+   fbF    = &DF[DIR_00M * numberOfLBnodesFine];
+   fneF   = &DF[DIR_PP0 * numberOfLBnodesFine];
+   fswF   = &DF[DIR_MM0 * numberOfLBnodesFine];
+   fseF   = &DF[DIR_PM0 * numberOfLBnodesFine];
+   fnwF   = &DF[DIR_MP0 * numberOfLBnodesFine];
+   fteF   = &DF[DIR_P0P * numberOfLBnodesFine];
+   fbwF   = &DF[DIR_M0M * numberOfLBnodesFine];
+   fbeF   = &DF[DIR_P0M * numberOfLBnodesFine];
+   ftwF   = &DF[DIR_M0P * numberOfLBnodesFine];
+   ftnF   = &DF[DIR_0PP * numberOfLBnodesFine];
+   fbsF   = &DF[DIR_0MM * numberOfLBnodesFine];
+   fbnF   = &DF[DIR_0PM * numberOfLBnodesFine];
+   ftsF   = &DF[DIR_0MP * numberOfLBnodesFine];
+   fzeroF = &DF[DIR_000 * numberOfLBnodesFine];
+   ftneF  = &DF[DIR_PPP * numberOfLBnodesFine];
+   ftswF  = &DF[DIR_MMP * numberOfLBnodesFine];
+   ftseF  = &DF[DIR_PMP * numberOfLBnodesFine];
+   ftnwF  = &DF[DIR_MPP * numberOfLBnodesFine];
+   fbneF  = &DF[DIR_PPM * numberOfLBnodesFine];
+   fbswF  = &DF[DIR_MMM * numberOfLBnodesFine];
+   fbseF  = &DF[DIR_PMM * numberOfLBnodesFine];
+   fbnwF  = &DF[DIR_MPM * numberOfLBnodesFine];
 
    real *feC, *fwC, *fnC, *fsC, *ftC, *fbC, *fneC, *fswC, *fseC, *fnwC, *fteC, *fbwC, *fbeC, *ftwC, *ftnC, *fbsC, *fbnC, *ftsC, *fzeroC,
       *ftneC, *ftswC, *ftseC, *ftnwC, *fbneC, *fbswC, *fbseC, *fbnwC;
 
    if (isEvenTimestep==true)
    {
-      feC    = &DC[DIR_P00   *size_MatC];
-      fwC    = &DC[DIR_M00   *size_MatC];
-      fnC    = &DC[DIR_0P0   *size_MatC];
-      fsC    = &DC[DIR_0M0   *size_MatC];
-      ftC    = &DC[DIR_00P   *size_MatC];
-      fbC    = &DC[DIR_00M   *size_MatC];
-      fneC   = &DC[DIR_PP0  *size_MatC];
-      fswC   = &DC[DIR_MM0  *size_MatC];
-      fseC   = &DC[DIR_PM0  *size_MatC];
-      fnwC   = &DC[DIR_MP0  *size_MatC];
-      fteC   = &DC[DIR_P0P  *size_MatC];
-      fbwC   = &DC[DIR_M0M  *size_MatC];
-      fbeC   = &DC[DIR_P0M  *size_MatC];
-      ftwC   = &DC[DIR_M0P  *size_MatC];
-      ftnC   = &DC[DIR_0PP  *size_MatC];
-      fbsC   = &DC[DIR_0MM  *size_MatC];
-      fbnC   = &DC[DIR_0PM  *size_MatC];
-      ftsC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      ftneC  = &DC[DIR_PPP *size_MatC];
-      ftswC  = &DC[DIR_MMP *size_MatC];
-      ftseC  = &DC[DIR_PMP *size_MatC];
-      ftnwC  = &DC[DIR_MPP *size_MatC];
-      fbneC  = &DC[DIR_PPM *size_MatC];
-      fbswC  = &DC[DIR_MMM *size_MatC];
-      fbseC  = &DC[DIR_PMM *size_MatC];
-      fbnwC  = &DC[DIR_MPM *size_MatC];
+      feC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      fwC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    } 
    else
    {
-      fwC    = &DC[DIR_P00   *size_MatC];
-      feC    = &DC[DIR_M00   *size_MatC];
-      fsC    = &DC[DIR_0P0   *size_MatC];
-      fnC    = &DC[DIR_0M0   *size_MatC];
-      fbC    = &DC[DIR_00P   *size_MatC];
-      ftC    = &DC[DIR_00M   *size_MatC];
-      fswC   = &DC[DIR_PP0  *size_MatC];
-      fneC   = &DC[DIR_MM0  *size_MatC];
-      fnwC   = &DC[DIR_PM0  *size_MatC];
-      fseC   = &DC[DIR_MP0  *size_MatC];
-      fbwC   = &DC[DIR_P0P  *size_MatC];
-      fteC   = &DC[DIR_M0M  *size_MatC];
-      ftwC   = &DC[DIR_P0M  *size_MatC];
-      fbeC   = &DC[DIR_M0P  *size_MatC];
-      fbsC   = &DC[DIR_0PP  *size_MatC];
-      ftnC   = &DC[DIR_0MM  *size_MatC];
-      ftsC   = &DC[DIR_0PM  *size_MatC];
-      fbnC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      fbswC  = &DC[DIR_PPP *size_MatC];
-      fbneC  = &DC[DIR_MMP *size_MatC];
-      fbnwC  = &DC[DIR_PMP *size_MatC];
-      fbseC  = &DC[DIR_MPP *size_MatC];
-      ftswC  = &DC[DIR_PPM *size_MatC];
-      ftneC  = &DC[DIR_MMM *size_MatC];
-      ftnwC  = &DC[DIR_PMM *size_MatC];
-      ftseC  = &DC[DIR_MPM *size_MatC];
+      fwC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      feC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    }
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  ix = threadIdx.x;  // Globaler x-Index 
@@ -15138,8 +14334,8 @@ __global__ void scaleFC_NSPress_27(   real* DC,
 												 unsigned int* neighborFX,
 												 unsigned int* neighborFY,
 												 unsigned int* neighborFZ,
-												 unsigned int size_MatC, 
-												 unsigned int size_MatF, 
+												 unsigned long long numberOfLBnodesCoarse, 
+												 unsigned long long numberOfLBnodesFine, 
 												 bool isEvenTimestep,
 												 unsigned int* posC, 
 												 unsigned int* posFSWB, 
@@ -15156,96 +14352,96 @@ __global__ void scaleFC_NSPress_27(   real* DC,
    real *feF, *fwF, *fnF, *fsF, *ftF, *fbF, *fneF, *fswF, *fseF, *fnwF, *fteF, *fbwF, *fbeF, *ftwF, *ftnF, *fbsF, *fbnF, *ftsF, *fzeroF, 
       *ftneF, *ftswF, *ftseF, *ftnwF, *fbneF, *fbswF, *fbseF, *fbnwF;
 
-   feF    = &DF[DIR_P00   *size_MatF];
-   fwF    = &DF[DIR_M00   *size_MatF];
-   fnF    = &DF[DIR_0P0   *size_MatF];
-   fsF    = &DF[DIR_0M0   *size_MatF];
-   ftF    = &DF[DIR_00P   *size_MatF];
-   fbF    = &DF[DIR_00M   *size_MatF];
-   fneF   = &DF[DIR_PP0  *size_MatF];
-   fswF   = &DF[DIR_MM0  *size_MatF];
-   fseF   = &DF[DIR_PM0  *size_MatF];
-   fnwF   = &DF[DIR_MP0  *size_MatF];
-   fteF   = &DF[DIR_P0P  *size_MatF];
-   fbwF   = &DF[DIR_M0M  *size_MatF];
-   fbeF   = &DF[DIR_P0M  *size_MatF];
-   ftwF   = &DF[DIR_M0P  *size_MatF];
-   ftnF   = &DF[DIR_0PP  *size_MatF];
-   fbsF   = &DF[DIR_0MM  *size_MatF];
-   fbnF   = &DF[DIR_0PM  *size_MatF];
-   ftsF   = &DF[DIR_0MP  *size_MatF];
-   fzeroF = &DF[DIR_000*size_MatF];
-   ftneF  = &DF[DIR_PPP *size_MatF];
-   ftswF  = &DF[DIR_MMP *size_MatF];
-   ftseF  = &DF[DIR_PMP *size_MatF];
-   ftnwF  = &DF[DIR_MPP *size_MatF];
-   fbneF  = &DF[DIR_PPM *size_MatF];
-   fbswF  = &DF[DIR_MMM *size_MatF];
-   fbseF  = &DF[DIR_PMM *size_MatF];
-   fbnwF  = &DF[DIR_MPM *size_MatF];
+   feF    = &DF[DIR_P00 * numberOfLBnodesFine];
+   fwF    = &DF[DIR_M00 * numberOfLBnodesFine];
+   fnF    = &DF[DIR_0P0 * numberOfLBnodesFine];
+   fsF    = &DF[DIR_0M0 * numberOfLBnodesFine];
+   ftF    = &DF[DIR_00P * numberOfLBnodesFine];
+   fbF    = &DF[DIR_00M * numberOfLBnodesFine];
+   fneF   = &DF[DIR_PP0 * numberOfLBnodesFine];
+   fswF   = &DF[DIR_MM0 * numberOfLBnodesFine];
+   fseF   = &DF[DIR_PM0 * numberOfLBnodesFine];
+   fnwF   = &DF[DIR_MP0 * numberOfLBnodesFine];
+   fteF   = &DF[DIR_P0P * numberOfLBnodesFine];
+   fbwF   = &DF[DIR_M0M * numberOfLBnodesFine];
+   fbeF   = &DF[DIR_P0M * numberOfLBnodesFine];
+   ftwF   = &DF[DIR_M0P * numberOfLBnodesFine];
+   ftnF   = &DF[DIR_0PP * numberOfLBnodesFine];
+   fbsF   = &DF[DIR_0MM * numberOfLBnodesFine];
+   fbnF   = &DF[DIR_0PM * numberOfLBnodesFine];
+   ftsF   = &DF[DIR_0MP * numberOfLBnodesFine];
+   fzeroF = &DF[DIR_000 * numberOfLBnodesFine];
+   ftneF  = &DF[DIR_PPP * numberOfLBnodesFine];
+   ftswF  = &DF[DIR_MMP * numberOfLBnodesFine];
+   ftseF  = &DF[DIR_PMP * numberOfLBnodesFine];
+   ftnwF  = &DF[DIR_MPP * numberOfLBnodesFine];
+   fbneF  = &DF[DIR_PPM * numberOfLBnodesFine];
+   fbswF  = &DF[DIR_MMM * numberOfLBnodesFine];
+   fbseF  = &DF[DIR_PMM * numberOfLBnodesFine];
+   fbnwF  = &DF[DIR_MPM * numberOfLBnodesFine];
 
    real *feC, *fwC, *fnC, *fsC, *ftC, *fbC, *fneC, *fswC, *fseC, *fnwC, *fteC, *fbwC, *fbeC, *ftwC, *ftnC, *fbsC, *fbnC, *ftsC, *fzeroC,
       *ftneC, *ftswC, *ftseC, *ftnwC, *fbneC, *fbswC, *fbseC, *fbnwC;
 
    if (isEvenTimestep==true)
    {
-      feC    = &DC[DIR_P00   *size_MatC];
-      fwC    = &DC[DIR_M00   *size_MatC];
-      fnC    = &DC[DIR_0P0   *size_MatC];
-      fsC    = &DC[DIR_0M0   *size_MatC];
-      ftC    = &DC[DIR_00P   *size_MatC];
-      fbC    = &DC[DIR_00M   *size_MatC];
-      fneC   = &DC[DIR_PP0  *size_MatC];
-      fswC   = &DC[DIR_MM0  *size_MatC];
-      fseC   = &DC[DIR_PM0  *size_MatC];
-      fnwC   = &DC[DIR_MP0  *size_MatC];
-      fteC   = &DC[DIR_P0P  *size_MatC];
-      fbwC   = &DC[DIR_M0M  *size_MatC];
-      fbeC   = &DC[DIR_P0M  *size_MatC];
-      ftwC   = &DC[DIR_M0P  *size_MatC];
-      ftnC   = &DC[DIR_0PP  *size_MatC];
-      fbsC   = &DC[DIR_0MM  *size_MatC];
-      fbnC   = &DC[DIR_0PM  *size_MatC];
-      ftsC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      ftneC  = &DC[DIR_PPP *size_MatC];
-      ftswC  = &DC[DIR_MMP *size_MatC];
-      ftseC  = &DC[DIR_PMP *size_MatC];
-      ftnwC  = &DC[DIR_MPP *size_MatC];
-      fbneC  = &DC[DIR_PPM *size_MatC];
-      fbswC  = &DC[DIR_MMM *size_MatC];
-      fbseC  = &DC[DIR_PMM *size_MatC];
-      fbnwC  = &DC[DIR_MPM *size_MatC];
+      feC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      fwC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    } 
    else
    {
-      fwC    = &DC[DIR_P00   *size_MatC];
-      feC    = &DC[DIR_M00   *size_MatC];
-      fsC    = &DC[DIR_0P0   *size_MatC];
-      fnC    = &DC[DIR_0M0   *size_MatC];
-      fbC    = &DC[DIR_00P   *size_MatC];
-      ftC    = &DC[DIR_00M   *size_MatC];
-      fswC   = &DC[DIR_PP0  *size_MatC];
-      fneC   = &DC[DIR_MM0  *size_MatC];
-      fnwC   = &DC[DIR_PM0  *size_MatC];
-      fseC   = &DC[DIR_MP0  *size_MatC];
-      fbwC   = &DC[DIR_P0P  *size_MatC];
-      fteC   = &DC[DIR_M0M  *size_MatC];
-      ftwC   = &DC[DIR_P0M  *size_MatC];
-      fbeC   = &DC[DIR_M0P  *size_MatC];
-      fbsC   = &DC[DIR_0PP  *size_MatC];
-      ftnC   = &DC[DIR_0MM  *size_MatC];
-      ftsC   = &DC[DIR_0PM  *size_MatC];
-      fbnC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      fbswC  = &DC[DIR_PPP *size_MatC];
-      fbneC  = &DC[DIR_MMP *size_MatC];
-      fbnwC  = &DC[DIR_PMP *size_MatC];
-      fbseC  = &DC[DIR_MPP *size_MatC];
-      ftswC  = &DC[DIR_PPM *size_MatC];
-      ftneC  = &DC[DIR_MMM *size_MatC];
-      ftnwC  = &DC[DIR_PMM *size_MatC];
-      ftseC  = &DC[DIR_MPM *size_MatC];
+      fwC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      feC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    }
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  ix = threadIdx.x;  // Globaler x-Index 
@@ -16344,8 +15540,8 @@ __global__ void scaleFC_Fix_27(   real* DC,
                                              unsigned int* neighborFX,
                                              unsigned int* neighborFY,
                                              unsigned int* neighborFZ,
-                                             unsigned int size_MatC, 
-                                             unsigned int size_MatF, 
+                                             unsigned long long numberOfLBnodesCoarse, 
+                                             unsigned long long numberOfLBnodesFine, 
                                              bool isEvenTimestep,
                                              unsigned int* posC, 
                                              unsigned int* posFSWB, 
@@ -16362,96 +15558,96 @@ __global__ void scaleFC_Fix_27(   real* DC,
    real *feF, *fwF, *fnF, *fsF, *ftF, *fbF, *fneF, *fswF, *fseF, *fnwF, *fteF, *fbwF, *fbeF, *ftwF, *ftnF, *fbsF, *fbnF, *ftsF, *fzeroF, 
       *ftneF, *ftswF, *ftseF, *ftnwF, *fbneF, *fbswF, *fbseF, *fbnwF;
 
-   feF    = &DF[DIR_P00   *size_MatF];
-   fwF    = &DF[DIR_M00   *size_MatF];
-   fnF    = &DF[DIR_0P0   *size_MatF];
-   fsF    = &DF[DIR_0M0   *size_MatF];
-   ftF    = &DF[DIR_00P   *size_MatF];
-   fbF    = &DF[DIR_00M   *size_MatF];
-   fneF   = &DF[DIR_PP0  *size_MatF];
-   fswF   = &DF[DIR_MM0  *size_MatF];
-   fseF   = &DF[DIR_PM0  *size_MatF];
-   fnwF   = &DF[DIR_MP0  *size_MatF];
-   fteF   = &DF[DIR_P0P  *size_MatF];
-   fbwF   = &DF[DIR_M0M  *size_MatF];
-   fbeF   = &DF[DIR_P0M  *size_MatF];
-   ftwF   = &DF[DIR_M0P  *size_MatF];
-   ftnF   = &DF[DIR_0PP  *size_MatF];
-   fbsF   = &DF[DIR_0MM  *size_MatF];
-   fbnF   = &DF[DIR_0PM  *size_MatF];
-   ftsF   = &DF[DIR_0MP  *size_MatF];
-   fzeroF = &DF[DIR_000*size_MatF];
-   ftneF  = &DF[DIR_PPP *size_MatF];
-   ftswF  = &DF[DIR_MMP *size_MatF];
-   ftseF  = &DF[DIR_PMP *size_MatF];
-   ftnwF  = &DF[DIR_MPP *size_MatF];
-   fbneF  = &DF[DIR_PPM *size_MatF];
-   fbswF  = &DF[DIR_MMM *size_MatF];
-   fbseF  = &DF[DIR_PMM *size_MatF];
-   fbnwF  = &DF[DIR_MPM *size_MatF];
+   feF    = &DF[DIR_P00 * numberOfLBnodesFine];
+   fwF    = &DF[DIR_M00 * numberOfLBnodesFine];
+   fnF    = &DF[DIR_0P0 * numberOfLBnodesFine];
+   fsF    = &DF[DIR_0M0 * numberOfLBnodesFine];
+   ftF    = &DF[DIR_00P * numberOfLBnodesFine];
+   fbF    = &DF[DIR_00M * numberOfLBnodesFine];
+   fneF   = &DF[DIR_PP0 * numberOfLBnodesFine];
+   fswF   = &DF[DIR_MM0 * numberOfLBnodesFine];
+   fseF   = &DF[DIR_PM0 * numberOfLBnodesFine];
+   fnwF   = &DF[DIR_MP0 * numberOfLBnodesFine];
+   fteF   = &DF[DIR_P0P * numberOfLBnodesFine];
+   fbwF   = &DF[DIR_M0M * numberOfLBnodesFine];
+   fbeF   = &DF[DIR_P0M * numberOfLBnodesFine];
+   ftwF   = &DF[DIR_M0P * numberOfLBnodesFine];
+   ftnF   = &DF[DIR_0PP * numberOfLBnodesFine];
+   fbsF   = &DF[DIR_0MM * numberOfLBnodesFine];
+   fbnF   = &DF[DIR_0PM * numberOfLBnodesFine];
+   ftsF   = &DF[DIR_0MP * numberOfLBnodesFine];
+   fzeroF = &DF[DIR_000 * numberOfLBnodesFine];
+   ftneF  = &DF[DIR_PPP * numberOfLBnodesFine];
+   ftswF  = &DF[DIR_MMP * numberOfLBnodesFine];
+   ftseF  = &DF[DIR_PMP * numberOfLBnodesFine];
+   ftnwF  = &DF[DIR_MPP * numberOfLBnodesFine];
+   fbneF  = &DF[DIR_PPM * numberOfLBnodesFine];
+   fbswF  = &DF[DIR_MMM * numberOfLBnodesFine];
+   fbseF  = &DF[DIR_PMM * numberOfLBnodesFine];
+   fbnwF  = &DF[DIR_MPM * numberOfLBnodesFine];
 
    real *feC, *fwC, *fnC, *fsC, *ftC, *fbC, *fneC, *fswC, *fseC, *fnwC, *fteC, *fbwC, *fbeC, *ftwC, *ftnC, *fbsC, *fbnC, *ftsC, *fzeroC,
       *ftneC, *ftswC, *ftseC, *ftnwC, *fbneC, *fbswC, *fbseC, *fbnwC;
 
    if (isEvenTimestep==true)
    {
-      feC    = &DC[DIR_P00   *size_MatC];
-      fwC    = &DC[DIR_M00   *size_MatC];
-      fnC    = &DC[DIR_0P0   *size_MatC];
-      fsC    = &DC[DIR_0M0   *size_MatC];
-      ftC    = &DC[DIR_00P   *size_MatC];
-      fbC    = &DC[DIR_00M   *size_MatC];
-      fneC   = &DC[DIR_PP0  *size_MatC];
-      fswC   = &DC[DIR_MM0  *size_MatC];
-      fseC   = &DC[DIR_PM0  *size_MatC];
-      fnwC   = &DC[DIR_MP0  *size_MatC];
-      fteC   = &DC[DIR_P0P  *size_MatC];
-      fbwC   = &DC[DIR_M0M  *size_MatC];
-      fbeC   = &DC[DIR_P0M  *size_MatC];
-      ftwC   = &DC[DIR_M0P  *size_MatC];
-      ftnC   = &DC[DIR_0PP  *size_MatC];
-      fbsC   = &DC[DIR_0MM  *size_MatC];
-      fbnC   = &DC[DIR_0PM  *size_MatC];
-      ftsC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      ftneC  = &DC[DIR_PPP *size_MatC];
-      ftswC  = &DC[DIR_MMP *size_MatC];
-      ftseC  = &DC[DIR_PMP *size_MatC];
-      ftnwC  = &DC[DIR_MPP *size_MatC];
-      fbneC  = &DC[DIR_PPM *size_MatC];
-      fbswC  = &DC[DIR_MMM *size_MatC];
-      fbseC  = &DC[DIR_PMM *size_MatC];
-      fbnwC  = &DC[DIR_MPM *size_MatC];
+      feC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      fwC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    } 
    else
    {
-      fwC    = &DC[DIR_P00   *size_MatC];
-      feC    = &DC[DIR_M00   *size_MatC];
-      fsC    = &DC[DIR_0P0   *size_MatC];
-      fnC    = &DC[DIR_0M0   *size_MatC];
-      fbC    = &DC[DIR_00P   *size_MatC];
-      ftC    = &DC[DIR_00M   *size_MatC];
-      fswC   = &DC[DIR_PP0  *size_MatC];
-      fneC   = &DC[DIR_MM0  *size_MatC];
-      fnwC   = &DC[DIR_PM0  *size_MatC];
-      fseC   = &DC[DIR_MP0  *size_MatC];
-      fbwC   = &DC[DIR_P0P  *size_MatC];
-      fteC   = &DC[DIR_M0M  *size_MatC];
-      ftwC   = &DC[DIR_P0M  *size_MatC];
-      fbeC   = &DC[DIR_M0P  *size_MatC];
-      fbsC   = &DC[DIR_0PP  *size_MatC];
-      ftnC   = &DC[DIR_0MM  *size_MatC];
-      ftsC   = &DC[DIR_0PM  *size_MatC];
-      fbnC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      fbswC  = &DC[DIR_PPP *size_MatC];
-      fbneC  = &DC[DIR_MMP *size_MatC];
-      fbnwC  = &DC[DIR_PMP *size_MatC];
-      fbseC  = &DC[DIR_MPP *size_MatC];
-      ftswC  = &DC[DIR_PPM *size_MatC];
-      ftneC  = &DC[DIR_MMM *size_MatC];
-      ftnwC  = &DC[DIR_PMM *size_MatC];
-      ftseC  = &DC[DIR_MPM *size_MatC];
+      fwC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      feC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    }
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  ix = threadIdx.x;  // Globaler x-Index 
@@ -17704,8 +16900,8 @@ __global__ void scaleFCpress27(real* DC,
                                           unsigned int* neighborFX,
                                           unsigned int* neighborFY,
                                           unsigned int* neighborFZ,
-                                          unsigned int size_MatC, 
-                                          unsigned int size_MatF, 
+                                          unsigned long long numberOfLBnodesCoarse, 
+                                          unsigned long long numberOfLBnodesFine, 
                                           bool isEvenTimestep,
                                           unsigned int* posC, 
                                           unsigned int* posFSWB, 
@@ -17722,96 +16918,96 @@ __global__ void scaleFCpress27(real* DC,
    real *feF, *fwF, *fnF, *fsF, *ftF, *fbF, *fneF, *fswF, *fseF, *fnwF, *fteF, *fbwF, *fbeF, *ftwF, *ftnF, *fbsF, *fbnF, *ftsF, *fzeroF, 
       *ftneF, *ftswF, *ftseF, *ftnwF, *fbneF, *fbswF, *fbseF, *fbnwF;
 
-   feF    = &DF[DIR_P00   *size_MatF];
-   fwF    = &DF[DIR_M00   *size_MatF];
-   fnF    = &DF[DIR_0P0   *size_MatF];
-   fsF    = &DF[DIR_0M0   *size_MatF];
-   ftF    = &DF[DIR_00P   *size_MatF];
-   fbF    = &DF[DIR_00M   *size_MatF];
-   fneF   = &DF[DIR_PP0  *size_MatF];
-   fswF   = &DF[DIR_MM0  *size_MatF];
-   fseF   = &DF[DIR_PM0  *size_MatF];
-   fnwF   = &DF[DIR_MP0  *size_MatF];
-   fteF   = &DF[DIR_P0P  *size_MatF];
-   fbwF   = &DF[DIR_M0M  *size_MatF];
-   fbeF   = &DF[DIR_P0M  *size_MatF];
-   ftwF   = &DF[DIR_M0P  *size_MatF];
-   ftnF   = &DF[DIR_0PP  *size_MatF];
-   fbsF   = &DF[DIR_0MM  *size_MatF];
-   fbnF   = &DF[DIR_0PM  *size_MatF];
-   ftsF   = &DF[DIR_0MP  *size_MatF];
-   fzeroF = &DF[DIR_000*size_MatF];
-   ftneF  = &DF[DIR_PPP *size_MatF];
-   ftswF  = &DF[DIR_MMP *size_MatF];
-   ftseF  = &DF[DIR_PMP *size_MatF];
-   ftnwF  = &DF[DIR_MPP *size_MatF];
-   fbneF  = &DF[DIR_PPM *size_MatF];
-   fbswF  = &DF[DIR_MMM *size_MatF];
-   fbseF  = &DF[DIR_PMM *size_MatF];
-   fbnwF  = &DF[DIR_MPM *size_MatF];
+   feF    = &DF[DIR_P00 * numberOfLBnodesFine];
+   fwF    = &DF[DIR_M00 * numberOfLBnodesFine];
+   fnF    = &DF[DIR_0P0 * numberOfLBnodesFine];
+   fsF    = &DF[DIR_0M0 * numberOfLBnodesFine];
+   ftF    = &DF[DIR_00P * numberOfLBnodesFine];
+   fbF    = &DF[DIR_00M * numberOfLBnodesFine];
+   fneF   = &DF[DIR_PP0 * numberOfLBnodesFine];
+   fswF   = &DF[DIR_MM0 * numberOfLBnodesFine];
+   fseF   = &DF[DIR_PM0 * numberOfLBnodesFine];
+   fnwF   = &DF[DIR_MP0 * numberOfLBnodesFine];
+   fteF   = &DF[DIR_P0P * numberOfLBnodesFine];
+   fbwF   = &DF[DIR_M0M * numberOfLBnodesFine];
+   fbeF   = &DF[DIR_P0M * numberOfLBnodesFine];
+   ftwF   = &DF[DIR_M0P * numberOfLBnodesFine];
+   ftnF   = &DF[DIR_0PP * numberOfLBnodesFine];
+   fbsF   = &DF[DIR_0MM * numberOfLBnodesFine];
+   fbnF   = &DF[DIR_0PM * numberOfLBnodesFine];
+   ftsF   = &DF[DIR_0MP * numberOfLBnodesFine];
+   fzeroF = &DF[DIR_000 * numberOfLBnodesFine];
+   ftneF  = &DF[DIR_PPP * numberOfLBnodesFine];
+   ftswF  = &DF[DIR_MMP * numberOfLBnodesFine];
+   ftseF  = &DF[DIR_PMP * numberOfLBnodesFine];
+   ftnwF  = &DF[DIR_MPP * numberOfLBnodesFine];
+   fbneF  = &DF[DIR_PPM * numberOfLBnodesFine];
+   fbswF  = &DF[DIR_MMM * numberOfLBnodesFine];
+   fbseF  = &DF[DIR_PMM * numberOfLBnodesFine];
+   fbnwF  = &DF[DIR_MPM * numberOfLBnodesFine];
 
    real *feC, *fwC, *fnC, *fsC, *ftC, *fbC, *fneC, *fswC, *fseC, *fnwC, *fteC, *fbwC, *fbeC, *ftwC, *ftnC, *fbsC, *fbnC, *ftsC, *fzeroC,
       *ftneC, *ftswC, *ftseC, *ftnwC, *fbneC, *fbswC, *fbseC, *fbnwC;
 
    if (isEvenTimestep==true)
    {
-      feC    = &DC[DIR_P00   *size_MatC];
-      fwC    = &DC[DIR_M00   *size_MatC];
-      fnC    = &DC[DIR_0P0   *size_MatC];
-      fsC    = &DC[DIR_0M0   *size_MatC];
-      ftC    = &DC[DIR_00P   *size_MatC];
-      fbC    = &DC[DIR_00M   *size_MatC];
-      fneC   = &DC[DIR_PP0  *size_MatC];
-      fswC   = &DC[DIR_MM0  *size_MatC];
-      fseC   = &DC[DIR_PM0  *size_MatC];
-      fnwC   = &DC[DIR_MP0  *size_MatC];
-      fteC   = &DC[DIR_P0P  *size_MatC];
-      fbwC   = &DC[DIR_M0M  *size_MatC];
-      fbeC   = &DC[DIR_P0M  *size_MatC];
-      ftwC   = &DC[DIR_M0P  *size_MatC];
-      ftnC   = &DC[DIR_0PP  *size_MatC];
-      fbsC   = &DC[DIR_0MM  *size_MatC];
-      fbnC   = &DC[DIR_0PM  *size_MatC];
-      ftsC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      ftneC  = &DC[DIR_PPP *size_MatC];
-      ftswC  = &DC[DIR_MMP *size_MatC];
-      ftseC  = &DC[DIR_PMP *size_MatC];
-      ftnwC  = &DC[DIR_MPP *size_MatC];
-      fbneC  = &DC[DIR_PPM *size_MatC];
-      fbswC  = &DC[DIR_MMM *size_MatC];
-      fbseC  = &DC[DIR_PMM *size_MatC];
-      fbnwC  = &DC[DIR_MPM *size_MatC];
+      feC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      fwC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    } 
    else
    {
-      fwC    = &DC[DIR_P00   *size_MatC];
-      feC    = &DC[DIR_M00   *size_MatC];
-      fsC    = &DC[DIR_0P0   *size_MatC];
-      fnC    = &DC[DIR_0M0   *size_MatC];
-      fbC    = &DC[DIR_00P   *size_MatC];
-      ftC    = &DC[DIR_00M   *size_MatC];
-      fswC   = &DC[DIR_PP0  *size_MatC];
-      fneC   = &DC[DIR_MM0  *size_MatC];
-      fnwC   = &DC[DIR_PM0  *size_MatC];
-      fseC   = &DC[DIR_MP0  *size_MatC];
-      fbwC   = &DC[DIR_P0P  *size_MatC];
-      fteC   = &DC[DIR_M0M  *size_MatC];
-      ftwC   = &DC[DIR_P0M  *size_MatC];
-      fbeC   = &DC[DIR_M0P  *size_MatC];
-      fbsC   = &DC[DIR_0PP  *size_MatC];
-      ftnC   = &DC[DIR_0MM  *size_MatC];
-      ftsC   = &DC[DIR_0PM  *size_MatC];
-      fbnC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      fbswC  = &DC[DIR_PPP *size_MatC];
-      fbneC  = &DC[DIR_MMP *size_MatC];
-      fbnwC  = &DC[DIR_PMP *size_MatC];
-      fbseC  = &DC[DIR_MPP *size_MatC];
-      ftswC  = &DC[DIR_PPM *size_MatC];
-      ftneC  = &DC[DIR_MMM *size_MatC];
-      ftnwC  = &DC[DIR_PMM *size_MatC];
-      ftseC  = &DC[DIR_MPM *size_MatC];
+      fwC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      feC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    }
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  ix = threadIdx.x;  // Globaler x-Index 
@@ -18629,8 +17825,8 @@ __global__ void scaleFCLast27( real* DC,
                                           unsigned int* neighborFX,
                                           unsigned int* neighborFY,
                                           unsigned int* neighborFZ,
-                                          unsigned int size_MatC, 
-                                          unsigned int size_MatF, 
+                                          unsigned long long numberOfLBnodesCoarse, 
+                                          unsigned long long numberOfLBnodesFine, 
                                           bool isEvenTimestep,
                                           unsigned int* posC, 
                                           unsigned int* posFSWB, 
@@ -18647,96 +17843,96 @@ __global__ void scaleFCLast27( real* DC,
    real *feF, *fwF, *fnF, *fsF, *ftF, *fbF, *fneF, *fswF, *fseF, *fnwF, *fteF, *fbwF, *fbeF, *ftwF, *ftnF, *fbsF, *fbnF, *ftsF, *fzeroF, 
       *ftneF, *ftswF, *ftseF, *ftnwF, *fbneF, *fbswF, *fbseF, *fbnwF;
 
-   feF    = &DF[DIR_P00   *size_MatF];
-   fwF    = &DF[DIR_M00   *size_MatF];
-   fnF    = &DF[DIR_0P0   *size_MatF];
-   fsF    = &DF[DIR_0M0   *size_MatF];
-   ftF    = &DF[DIR_00P   *size_MatF];
-   fbF    = &DF[DIR_00M   *size_MatF];
-   fneF   = &DF[DIR_PP0  *size_MatF];
-   fswF   = &DF[DIR_MM0  *size_MatF];
-   fseF   = &DF[DIR_PM0  *size_MatF];
-   fnwF   = &DF[DIR_MP0  *size_MatF];
-   fteF   = &DF[DIR_P0P  *size_MatF];
-   fbwF   = &DF[DIR_M0M  *size_MatF];
-   fbeF   = &DF[DIR_P0M  *size_MatF];
-   ftwF   = &DF[DIR_M0P  *size_MatF];
-   ftnF   = &DF[DIR_0PP  *size_MatF];
-   fbsF   = &DF[DIR_0MM  *size_MatF];
-   fbnF   = &DF[DIR_0PM  *size_MatF];
-   ftsF   = &DF[DIR_0MP  *size_MatF];
-   fzeroF = &DF[DIR_000*size_MatF];
-   ftneF  = &DF[DIR_PPP *size_MatF];
-   ftswF  = &DF[DIR_MMP *size_MatF];
-   ftseF  = &DF[DIR_PMP *size_MatF];
-   ftnwF  = &DF[DIR_MPP *size_MatF];
-   fbneF  = &DF[DIR_PPM *size_MatF];
-   fbswF  = &DF[DIR_MMM *size_MatF];
-   fbseF  = &DF[DIR_PMM *size_MatF];
-   fbnwF  = &DF[DIR_MPM *size_MatF];
+   feF    = &DF[DIR_P00 * numberOfLBnodesFine];
+   fwF    = &DF[DIR_M00 * numberOfLBnodesFine];
+   fnF    = &DF[DIR_0P0 * numberOfLBnodesFine];
+   fsF    = &DF[DIR_0M0 * numberOfLBnodesFine];
+   ftF    = &DF[DIR_00P * numberOfLBnodesFine];
+   fbF    = &DF[DIR_00M * numberOfLBnodesFine];
+   fneF   = &DF[DIR_PP0 * numberOfLBnodesFine];
+   fswF   = &DF[DIR_MM0 * numberOfLBnodesFine];
+   fseF   = &DF[DIR_PM0 * numberOfLBnodesFine];
+   fnwF   = &DF[DIR_MP0 * numberOfLBnodesFine];
+   fteF   = &DF[DIR_P0P * numberOfLBnodesFine];
+   fbwF   = &DF[DIR_M0M * numberOfLBnodesFine];
+   fbeF   = &DF[DIR_P0M * numberOfLBnodesFine];
+   ftwF   = &DF[DIR_M0P * numberOfLBnodesFine];
+   ftnF   = &DF[DIR_0PP * numberOfLBnodesFine];
+   fbsF   = &DF[DIR_0MM * numberOfLBnodesFine];
+   fbnF   = &DF[DIR_0PM * numberOfLBnodesFine];
+   ftsF   = &DF[DIR_0MP * numberOfLBnodesFine];
+   fzeroF = &DF[DIR_000 * numberOfLBnodesFine];
+   ftneF  = &DF[DIR_PPP * numberOfLBnodesFine];
+   ftswF  = &DF[DIR_MMP * numberOfLBnodesFine];
+   ftseF  = &DF[DIR_PMP * numberOfLBnodesFine];
+   ftnwF  = &DF[DIR_MPP * numberOfLBnodesFine];
+   fbneF  = &DF[DIR_PPM * numberOfLBnodesFine];
+   fbswF  = &DF[DIR_MMM * numberOfLBnodesFine];
+   fbseF  = &DF[DIR_PMM * numberOfLBnodesFine];
+   fbnwF  = &DF[DIR_MPM * numberOfLBnodesFine];
 
    real *feC, *fwC, *fnC, *fsC, *ftC, *fbC, *fneC, *fswC, *fseC, *fnwC, *fteC, *fbwC, *fbeC, *ftwC, *ftnC, *fbsC, *fbnC, *ftsC, *fzeroC,
       *ftneC, *ftswC, *ftseC, *ftnwC, *fbneC, *fbswC, *fbseC, *fbnwC;
 
    if (isEvenTimestep==true)
    {
-      feC    = &DC[DIR_P00   *size_MatC];
-      fwC    = &DC[DIR_M00   *size_MatC];
-      fnC    = &DC[DIR_0P0   *size_MatC];
-      fsC    = &DC[DIR_0M0   *size_MatC];
-      ftC    = &DC[DIR_00P   *size_MatC];
-      fbC    = &DC[DIR_00M   *size_MatC];
-      fneC   = &DC[DIR_PP0  *size_MatC];
-      fswC   = &DC[DIR_MM0  *size_MatC];
-      fseC   = &DC[DIR_PM0  *size_MatC];
-      fnwC   = &DC[DIR_MP0  *size_MatC];
-      fteC   = &DC[DIR_P0P  *size_MatC];
-      fbwC   = &DC[DIR_M0M  *size_MatC];
-      fbeC   = &DC[DIR_P0M  *size_MatC];
-      ftwC   = &DC[DIR_M0P  *size_MatC];
-      ftnC   = &DC[DIR_0PP  *size_MatC];
-      fbsC   = &DC[DIR_0MM  *size_MatC];
-      fbnC   = &DC[DIR_0PM  *size_MatC];
-      ftsC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      ftneC  = &DC[DIR_PPP *size_MatC];
-      ftswC  = &DC[DIR_MMP *size_MatC];
-      ftseC  = &DC[DIR_PMP *size_MatC];
-      ftnwC  = &DC[DIR_MPP *size_MatC];
-      fbneC  = &DC[DIR_PPM *size_MatC];
-      fbswC  = &DC[DIR_MMM *size_MatC];
-      fbseC  = &DC[DIR_PMM *size_MatC];
-      fbnwC  = &DC[DIR_MPM *size_MatC];
+      feC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      fwC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    } 
    else
    {
-      fwC    = &DC[DIR_P00   *size_MatC];
-      feC    = &DC[DIR_M00   *size_MatC];
-      fsC    = &DC[DIR_0P0   *size_MatC];
-      fnC    = &DC[DIR_0M0   *size_MatC];
-      fbC    = &DC[DIR_00P   *size_MatC];
-      ftC    = &DC[DIR_00M   *size_MatC];
-      fswC   = &DC[DIR_PP0  *size_MatC];
-      fneC   = &DC[DIR_MM0  *size_MatC];
-      fnwC   = &DC[DIR_PM0  *size_MatC];
-      fseC   = &DC[DIR_MP0  *size_MatC];
-      fbwC   = &DC[DIR_P0P  *size_MatC];
-      fteC   = &DC[DIR_M0M  *size_MatC];
-      ftwC   = &DC[DIR_P0M  *size_MatC];
-      fbeC   = &DC[DIR_M0P  *size_MatC];
-      fbsC   = &DC[DIR_0PP  *size_MatC];
-      ftnC   = &DC[DIR_0MM  *size_MatC];
-      ftsC   = &DC[DIR_0PM  *size_MatC];
-      fbnC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      fbswC  = &DC[DIR_PPP *size_MatC];
-      fbneC  = &DC[DIR_MMP *size_MatC];
-      fbnwC  = &DC[DIR_PMP *size_MatC];
-      fbseC  = &DC[DIR_MPP *size_MatC];
-      ftswC  = &DC[DIR_PPM *size_MatC];
-      ftneC  = &DC[DIR_MMM *size_MatC];
-      ftnwC  = &DC[DIR_PMM *size_MatC];
-      ftseC  = &DC[DIR_MPM *size_MatC];
+      fwC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      feC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    }
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  ix = threadIdx.x;  // Globaler x-Index 
@@ -20027,8 +19223,8 @@ __global__ void scaleFCThSMG7(    real* DC,
                                              unsigned int* neighborFX,
                                              unsigned int* neighborFY,
                                              unsigned int* neighborFZ,
-                                             unsigned int size_MatC, 
-                                             unsigned int size_MatF, 
+                                             unsigned long long numberOfLBnodesCoarse, 
+                                             unsigned long long numberOfLBnodesFine, 
                                              bool isEvenTimestep,
                                              unsigned int* posC, 
                                              unsigned int* posFSWB, 
@@ -20040,127 +19236,124 @@ __global__ void scaleFCThSMG7(    real* DC,
    real *feF, *fwF, *fnF, *fsF, *ftF, *fbF, *fneF, *fswF, *fseF, *fnwF, *fteF, *fbwF, *fbeF, *ftwF, *ftnF, *fbsF, *fbnF, *ftsF, //*fzeroF, 
       *ftneF, *ftswF, *ftseF, *ftnwF, *fbneF, *fbswF, *fbseF, *fbnwF;
 
-   feF    = &DF[DIR_P00   *size_MatF];
-   fwF    = &DF[DIR_M00   *size_MatF];
-   fnF    = &DF[DIR_0P0   *size_MatF];
-   fsF    = &DF[DIR_0M0   *size_MatF];
-   ftF    = &DF[DIR_00P   *size_MatF];
-   fbF    = &DF[DIR_00M   *size_MatF];
-   fneF   = &DF[DIR_PP0  *size_MatF];
-   fswF   = &DF[DIR_MM0  *size_MatF];
-   fseF   = &DF[DIR_PM0  *size_MatF];
-   fnwF   = &DF[DIR_MP0  *size_MatF];
-   fteF   = &DF[DIR_P0P  *size_MatF];
-   fbwF   = &DF[DIR_M0M  *size_MatF];
-   fbeF   = &DF[DIR_P0M  *size_MatF];
-   ftwF   = &DF[DIR_M0P  *size_MatF];
-   ftnF   = &DF[DIR_0PP  *size_MatF];
-   fbsF   = &DF[DIR_0MM  *size_MatF];
-   fbnF   = &DF[DIR_0PM  *size_MatF];
-   ftsF   = &DF[DIR_0MP  *size_MatF];
-   //fzeroF = &DF[DIR_000*size_MatF];
-   ftneF  = &DF[DIR_PPP *size_MatF];
-   ftswF  = &DF[DIR_MMP *size_MatF];
-   ftseF  = &DF[DIR_PMP *size_MatF];
-   ftnwF  = &DF[DIR_MPP *size_MatF];
-   fbneF  = &DF[DIR_PPM *size_MatF];
-   fbswF  = &DF[DIR_MMM *size_MatF];
-   fbseF  = &DF[DIR_PMM *size_MatF];
-   fbnwF  = &DF[DIR_MPM *size_MatF];
+   feF    = &DF[DIR_P00 * numberOfLBnodesFine];
+   fwF    = &DF[DIR_M00 * numberOfLBnodesFine];
+   fnF    = &DF[DIR_0P0 * numberOfLBnodesFine];
+   fsF    = &DF[DIR_0M0 * numberOfLBnodesFine];
+   ftF    = &DF[DIR_00P * numberOfLBnodesFine];
+   fbF    = &DF[DIR_00M * numberOfLBnodesFine];
+   fneF   = &DF[DIR_PP0 * numberOfLBnodesFine];
+   fswF   = &DF[DIR_MM0 * numberOfLBnodesFine];
+   fseF   = &DF[DIR_PM0 * numberOfLBnodesFine];
+   fnwF   = &DF[DIR_MP0 * numberOfLBnodesFine];
+   fteF   = &DF[DIR_P0P * numberOfLBnodesFine];
+   fbwF   = &DF[DIR_M0M * numberOfLBnodesFine];
+   fbeF   = &DF[DIR_P0M * numberOfLBnodesFine];
+   ftwF   = &DF[DIR_M0P * numberOfLBnodesFine];
+   ftnF   = &DF[DIR_0PP * numberOfLBnodesFine];
+   fbsF   = &DF[DIR_0MM * numberOfLBnodesFine];
+   fbnF   = &DF[DIR_0PM * numberOfLBnodesFine];
+   ftsF   = &DF[DIR_0MP * numberOfLBnodesFine];
+   ftneF  = &DF[DIR_PPP * numberOfLBnodesFine];
+   ftswF  = &DF[DIR_MMP * numberOfLBnodesFine];
+   ftseF  = &DF[DIR_PMP * numberOfLBnodesFine];
+   ftnwF  = &DF[DIR_MPP * numberOfLBnodesFine];
+   fbneF  = &DF[DIR_PPM * numberOfLBnodesFine];
+   fbswF  = &DF[DIR_MMM * numberOfLBnodesFine];
+   fbseF  = &DF[DIR_PMM * numberOfLBnodesFine];
+   fbnwF  = &DF[DIR_MPM * numberOfLBnodesFine];
 
    real *feC, *fwC, *fnC, *fsC, *ftC, *fbC, *fneC, *fswC, *fseC, *fnwC, *fteC, *fbwC, *fbeC, *ftwC, *ftnC, *fbsC, *fbnC, *ftsC, //*fzeroC,
       *ftneC, *ftswC, *ftseC, *ftnwC, *fbneC, *fbswC, *fbseC, *fbnwC;
 
    if (isEvenTimestep==true)
    {
-      feC    = &DC[DIR_P00   *size_MatC];
-      fwC    = &DC[DIR_M00   *size_MatC];
-      fnC    = &DC[DIR_0P0   *size_MatC];
-      fsC    = &DC[DIR_0M0   *size_MatC];
-      ftC    = &DC[DIR_00P   *size_MatC];
-      fbC    = &DC[DIR_00M   *size_MatC];
-      fneC   = &DC[DIR_PP0  *size_MatC];
-      fswC   = &DC[DIR_MM0  *size_MatC];
-      fseC   = &DC[DIR_PM0  *size_MatC];
-      fnwC   = &DC[DIR_MP0  *size_MatC];
-      fteC   = &DC[DIR_P0P  *size_MatC];
-      fbwC   = &DC[DIR_M0M  *size_MatC];
-      fbeC   = &DC[DIR_P0M  *size_MatC];
-      ftwC   = &DC[DIR_M0P  *size_MatC];
-      ftnC   = &DC[DIR_0PP  *size_MatC];
-      fbsC   = &DC[DIR_0MM  *size_MatC];
-      fbnC   = &DC[DIR_0PM  *size_MatC];
-      ftsC   = &DC[DIR_0MP  *size_MatC];
-      //fzeroC = &DC[DIR_000*size_MatC];
-      ftneC  = &DC[DIR_PPP *size_MatC];
-      ftswC  = &DC[DIR_MMP *size_MatC];
-      ftseC  = &DC[DIR_PMP *size_MatC];
-      ftnwC  = &DC[DIR_MPP *size_MatC];
-      fbneC  = &DC[DIR_PPM *size_MatC];
-      fbswC  = &DC[DIR_MMM *size_MatC];
-      fbseC  = &DC[DIR_PMM *size_MatC];
-      fbnwC  = &DC[DIR_MPM *size_MatC];
+      feC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      fwC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    } 
    else
    {
-      fwC    = &DC[DIR_P00   *size_MatC];
-      feC    = &DC[DIR_M00   *size_MatC];
-      fsC    = &DC[DIR_0P0   *size_MatC];
-      fnC    = &DC[DIR_0M0   *size_MatC];
-      fbC    = &DC[DIR_00P   *size_MatC];
-      ftC    = &DC[DIR_00M   *size_MatC];
-      fswC   = &DC[DIR_PP0  *size_MatC];
-      fneC   = &DC[DIR_MM0  *size_MatC];
-      fnwC   = &DC[DIR_PM0  *size_MatC];
-      fseC   = &DC[DIR_MP0  *size_MatC];
-      fbwC   = &DC[DIR_P0P  *size_MatC];
-      fteC   = &DC[DIR_M0M  *size_MatC];
-      ftwC   = &DC[DIR_P0M  *size_MatC];
-      fbeC   = &DC[DIR_M0P  *size_MatC];
-      fbsC   = &DC[DIR_0PP  *size_MatC];
-      ftnC   = &DC[DIR_0MM  *size_MatC];
-      ftsC   = &DC[DIR_0PM  *size_MatC];
-      fbnC   = &DC[DIR_0MP  *size_MatC];
-      //fzeroC = &DC[DIR_000*size_MatC];
-      fbswC  = &DC[DIR_PPP *size_MatC];
-      fbneC  = &DC[DIR_MMP *size_MatC];
-      fbnwC  = &DC[DIR_PMP *size_MatC];
-      fbseC  = &DC[DIR_MPP *size_MatC];
-      ftswC  = &DC[DIR_PPM *size_MatC];
-      ftneC  = &DC[DIR_MMM *size_MatC];
-      ftnwC  = &DC[DIR_PMM *size_MatC];
-      ftseC  = &DC[DIR_MPM *size_MatC];
+      fwC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      feC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    }
 
    Distributions7 D7F;
-   D7F.f[0] = &DD7F[0*size_MatF];
-   D7F.f[1] = &DD7F[1*size_MatF];
-   D7F.f[2] = &DD7F[2*size_MatF];
-   D7F.f[3] = &DD7F[3*size_MatF];
-   D7F.f[4] = &DD7F[4*size_MatF];
-   D7F.f[5] = &DD7F[5*size_MatF];
-   D7F.f[6] = &DD7F[6*size_MatF];
+   D7F.f[0] = &DD7F[0*numberOfLBnodesFine];
+   D7F.f[1] = &DD7F[1*numberOfLBnodesFine];
+   D7F.f[2] = &DD7F[2*numberOfLBnodesFine];
+   D7F.f[3] = &DD7F[3*numberOfLBnodesFine];
+   D7F.f[4] = &DD7F[4*numberOfLBnodesFine];
+   D7F.f[5] = &DD7F[5*numberOfLBnodesFine];
+   D7F.f[6] = &DD7F[6*numberOfLBnodesFine];
 
    Distributions7 D7C;
    if (isEvenTimestep==true)
    {
-      D7C.f[0] = &DD7C[0*size_MatC];
-      D7C.f[1] = &DD7C[1*size_MatC];
-      D7C.f[2] = &DD7C[2*size_MatC];
-      D7C.f[3] = &DD7C[3*size_MatC];
-      D7C.f[4] = &DD7C[4*size_MatC];
-      D7C.f[5] = &DD7C[5*size_MatC];
-      D7C.f[6] = &DD7C[6*size_MatC];
+      D7C.f[0] = &DD7C[0*numberOfLBnodesCoarse];
+      D7C.f[1] = &DD7C[1*numberOfLBnodesCoarse];
+      D7C.f[2] = &DD7C[2*numberOfLBnodesCoarse];
+      D7C.f[3] = &DD7C[3*numberOfLBnodesCoarse];
+      D7C.f[4] = &DD7C[4*numberOfLBnodesCoarse];
+      D7C.f[5] = &DD7C[5*numberOfLBnodesCoarse];
+      D7C.f[6] = &DD7C[6*numberOfLBnodesCoarse];
    }
    else
    {
-      D7C.f[0] = &DD7C[0*size_MatC];
-      D7C.f[2] = &DD7C[1*size_MatC];
-      D7C.f[1] = &DD7C[2*size_MatC];
-      D7C.f[4] = &DD7C[3*size_MatC];
-      D7C.f[3] = &DD7C[4*size_MatC];
-      D7C.f[6] = &DD7C[5*size_MatC];
-      D7C.f[5] = &DD7C[6*size_MatC];
+      D7C.f[0] = &DD7C[0*numberOfLBnodesCoarse];
+      D7C.f[2] = &DD7C[1*numberOfLBnodesCoarse];
+      D7C.f[1] = &DD7C[2*numberOfLBnodesCoarse];
+      D7C.f[4] = &DD7C[3*numberOfLBnodesCoarse];
+      D7C.f[3] = &DD7C[4*numberOfLBnodesCoarse];
+      D7C.f[6] = &DD7C[5*numberOfLBnodesCoarse];
+      D7C.f[5] = &DD7C[6*numberOfLBnodesCoarse];
    }
 
    ////////////////////////////////////////////////////////////////////////////////
@@ -20900,8 +20093,8 @@ __global__ void scaleFCThS7(   real* DC,
                                           unsigned int* neighborFX,
                                           unsigned int* neighborFY,
                                           unsigned int* neighborFZ,
-                                          unsigned int size_MatC, 
-                                          unsigned int size_MatF, 
+                                          unsigned long long numberOfLBnodesCoarse, 
+                                          unsigned long long numberOfLBnodesFine, 
                                           bool isEvenTimestep,
                                           unsigned int* posC, 
                                           unsigned int* posFSWB, 
@@ -20912,127 +20105,124 @@ __global__ void scaleFCThS7(   real* DC,
    real *feF, *fwF, *fnF, *fsF, *ftF, *fbF, *fneF, *fswF, *fseF, *fnwF, *fteF, *fbwF, *fbeF, *ftwF, *ftnF, *fbsF, *fbnF, *ftsF, //*fzeroF, 
       *ftneF, *ftswF, *ftseF, *ftnwF, *fbneF, *fbswF, *fbseF, *fbnwF;
 
-   feF    = &DF[DIR_P00   *size_MatF];
-   fwF    = &DF[DIR_M00   *size_MatF];
-   fnF    = &DF[DIR_0P0   *size_MatF];
-   fsF    = &DF[DIR_0M0   *size_MatF];
-   ftF    = &DF[DIR_00P   *size_MatF];
-   fbF    = &DF[DIR_00M   *size_MatF];
-   fneF   = &DF[DIR_PP0  *size_MatF];
-   fswF   = &DF[DIR_MM0  *size_MatF];
-   fseF   = &DF[DIR_PM0  *size_MatF];
-   fnwF   = &DF[DIR_MP0  *size_MatF];
-   fteF   = &DF[DIR_P0P  *size_MatF];
-   fbwF   = &DF[DIR_M0M  *size_MatF];
-   fbeF   = &DF[DIR_P0M  *size_MatF];
-   ftwF   = &DF[DIR_M0P  *size_MatF];
-   ftnF   = &DF[DIR_0PP  *size_MatF];
-   fbsF   = &DF[DIR_0MM  *size_MatF];
-   fbnF   = &DF[DIR_0PM  *size_MatF];
-   ftsF   = &DF[DIR_0MP  *size_MatF];
-   //fzeroF = &DF[DIR_000*size_MatF];
-   ftneF  = &DF[DIR_PPP *size_MatF];
-   ftswF  = &DF[DIR_MMP *size_MatF];
-   ftseF  = &DF[DIR_PMP *size_MatF];
-   ftnwF  = &DF[DIR_MPP *size_MatF];
-   fbneF  = &DF[DIR_PPM *size_MatF];
-   fbswF  = &DF[DIR_MMM *size_MatF];
-   fbseF  = &DF[DIR_PMM *size_MatF];
-   fbnwF  = &DF[DIR_MPM *size_MatF];
+   feF    = &DF[DIR_P00 * numberOfLBnodesFine];
+   fwF    = &DF[DIR_M00 * numberOfLBnodesFine];
+   fnF    = &DF[DIR_0P0 * numberOfLBnodesFine];
+   fsF    = &DF[DIR_0M0 * numberOfLBnodesFine];
+   ftF    = &DF[DIR_00P * numberOfLBnodesFine];
+   fbF    = &DF[DIR_00M * numberOfLBnodesFine];
+   fneF   = &DF[DIR_PP0 * numberOfLBnodesFine];
+   fswF   = &DF[DIR_MM0 * numberOfLBnodesFine];
+   fseF   = &DF[DIR_PM0 * numberOfLBnodesFine];
+   fnwF   = &DF[DIR_MP0 * numberOfLBnodesFine];
+   fteF   = &DF[DIR_P0P * numberOfLBnodesFine];
+   fbwF   = &DF[DIR_M0M * numberOfLBnodesFine];
+   fbeF   = &DF[DIR_P0M * numberOfLBnodesFine];
+   ftwF   = &DF[DIR_M0P * numberOfLBnodesFine];
+   ftnF   = &DF[DIR_0PP * numberOfLBnodesFine];
+   fbsF   = &DF[DIR_0MM * numberOfLBnodesFine];
+   fbnF   = &DF[DIR_0PM * numberOfLBnodesFine];
+   ftsF   = &DF[DIR_0MP * numberOfLBnodesFine];
+   ftneF  = &DF[DIR_PPP * numberOfLBnodesFine];
+   ftswF  = &DF[DIR_MMP * numberOfLBnodesFine];
+   ftseF  = &DF[DIR_PMP * numberOfLBnodesFine];
+   ftnwF  = &DF[DIR_MPP * numberOfLBnodesFine];
+   fbneF  = &DF[DIR_PPM * numberOfLBnodesFine];
+   fbswF  = &DF[DIR_MMM * numberOfLBnodesFine];
+   fbseF  = &DF[DIR_PMM * numberOfLBnodesFine];
+   fbnwF  = &DF[DIR_MPM * numberOfLBnodesFine];
 
    real *feC, *fwC, *fnC, *fsC, *ftC, *fbC, *fneC, *fswC, *fseC, *fnwC, *fteC, *fbwC, *fbeC, *ftwC, *ftnC, *fbsC, *fbnC, *ftsC, //*fzeroC,
       *ftneC, *ftswC, *ftseC, *ftnwC, *fbneC, *fbswC, *fbseC, *fbnwC;
 
    if (isEvenTimestep==true)
    {
-      feC    = &DC[DIR_P00   *size_MatC];
-      fwC    = &DC[DIR_M00   *size_MatC];
-      fnC    = &DC[DIR_0P0   *size_MatC];
-      fsC    = &DC[DIR_0M0   *size_MatC];
-      ftC    = &DC[DIR_00P   *size_MatC];
-      fbC    = &DC[DIR_00M   *size_MatC];
-      fneC   = &DC[DIR_PP0  *size_MatC];
-      fswC   = &DC[DIR_MM0  *size_MatC];
-      fseC   = &DC[DIR_PM0  *size_MatC];
-      fnwC   = &DC[DIR_MP0  *size_MatC];
-      fteC   = &DC[DIR_P0P  *size_MatC];
-      fbwC   = &DC[DIR_M0M  *size_MatC];
-      fbeC   = &DC[DIR_P0M  *size_MatC];
-      ftwC   = &DC[DIR_M0P  *size_MatC];
-      ftnC   = &DC[DIR_0PP  *size_MatC];
-      fbsC   = &DC[DIR_0MM  *size_MatC];
-      fbnC   = &DC[DIR_0PM  *size_MatC];
-      ftsC   = &DC[DIR_0MP  *size_MatC];
-      //fzeroC = &DC[DIR_000*size_MatC];
-      ftneC  = &DC[DIR_PPP *size_MatC];
-      ftswC  = &DC[DIR_MMP *size_MatC];
-      ftseC  = &DC[DIR_PMP *size_MatC];
-      ftnwC  = &DC[DIR_MPP *size_MatC];
-      fbneC  = &DC[DIR_PPM *size_MatC];
-      fbswC  = &DC[DIR_MMM *size_MatC];
-      fbseC  = &DC[DIR_PMM *size_MatC];
-      fbnwC  = &DC[DIR_MPM *size_MatC];
+      feC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      fwC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    } 
    else
    {
-      fwC    = &DC[DIR_P00   *size_MatC];
-      feC    = &DC[DIR_M00   *size_MatC];
-      fsC    = &DC[DIR_0P0   *size_MatC];
-      fnC    = &DC[DIR_0M0   *size_MatC];
-      fbC    = &DC[DIR_00P   *size_MatC];
-      ftC    = &DC[DIR_00M   *size_MatC];
-      fswC   = &DC[DIR_PP0  *size_MatC];
-      fneC   = &DC[DIR_MM0  *size_MatC];
-      fnwC   = &DC[DIR_PM0  *size_MatC];
-      fseC   = &DC[DIR_MP0  *size_MatC];
-      fbwC   = &DC[DIR_P0P  *size_MatC];
-      fteC   = &DC[DIR_M0M  *size_MatC];
-      ftwC   = &DC[DIR_P0M  *size_MatC];
-      fbeC   = &DC[DIR_M0P  *size_MatC];
-      fbsC   = &DC[DIR_0PP  *size_MatC];
-      ftnC   = &DC[DIR_0MM  *size_MatC];
-      ftsC   = &DC[DIR_0PM  *size_MatC];
-      fbnC   = &DC[DIR_0MP  *size_MatC];
-      //fzeroC = &DC[DIR_000*size_MatC];
-      fbswC  = &DC[DIR_PPP *size_MatC];
-      fbneC  = &DC[DIR_MMP *size_MatC];
-      fbnwC  = &DC[DIR_PMP *size_MatC];
-      fbseC  = &DC[DIR_MPP *size_MatC];
-      ftswC  = &DC[DIR_PPM *size_MatC];
-      ftneC  = &DC[DIR_MMM *size_MatC];
-      ftnwC  = &DC[DIR_PMM *size_MatC];
-      ftseC  = &DC[DIR_MPM *size_MatC];
+      fwC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      feC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    }
 
    Distributions7 D7F;
-   D7F.f[0] = &DD7F[0*size_MatF];
-   D7F.f[1] = &DD7F[1*size_MatF];
-   D7F.f[2] = &DD7F[2*size_MatF];
-   D7F.f[3] = &DD7F[3*size_MatF];
-   D7F.f[4] = &DD7F[4*size_MatF];
-   D7F.f[5] = &DD7F[5*size_MatF];
-   D7F.f[6] = &DD7F[6*size_MatF];
+   D7F.f[0] = &DD7F[0*numberOfLBnodesFine];
+   D7F.f[1] = &DD7F[1*numberOfLBnodesFine];
+   D7F.f[2] = &DD7F[2*numberOfLBnodesFine];
+   D7F.f[3] = &DD7F[3*numberOfLBnodesFine];
+   D7F.f[4] = &DD7F[4*numberOfLBnodesFine];
+   D7F.f[5] = &DD7F[5*numberOfLBnodesFine];
+   D7F.f[6] = &DD7F[6*numberOfLBnodesFine];
 
    Distributions7 D7C;
    if (isEvenTimestep==true)
    {
-      D7C.f[0] = &DD7C[0*size_MatC];
-      D7C.f[1] = &DD7C[1*size_MatC];
-      D7C.f[2] = &DD7C[2*size_MatC];
-      D7C.f[3] = &DD7C[3*size_MatC];
-      D7C.f[4] = &DD7C[4*size_MatC];
-      D7C.f[5] = &DD7C[5*size_MatC];
-      D7C.f[6] = &DD7C[6*size_MatC];
+      D7C.f[0] = &DD7C[0*numberOfLBnodesCoarse];
+      D7C.f[1] = &DD7C[1*numberOfLBnodesCoarse];
+      D7C.f[2] = &DD7C[2*numberOfLBnodesCoarse];
+      D7C.f[3] = &DD7C[3*numberOfLBnodesCoarse];
+      D7C.f[4] = &DD7C[4*numberOfLBnodesCoarse];
+      D7C.f[5] = &DD7C[5*numberOfLBnodesCoarse];
+      D7C.f[6] = &DD7C[6*numberOfLBnodesCoarse];
    }
    else
    {
-      D7C.f[0] = &DD7C[0*size_MatC];
-      D7C.f[2] = &DD7C[1*size_MatC];
-      D7C.f[1] = &DD7C[2*size_MatC];
-      D7C.f[4] = &DD7C[3*size_MatC];
-      D7C.f[3] = &DD7C[4*size_MatC];
-      D7C.f[6] = &DD7C[5*size_MatC];
-      D7C.f[5] = &DD7C[6*size_MatC];
+      D7C.f[0] = &DD7C[0*numberOfLBnodesCoarse];
+      D7C.f[2] = &DD7C[1*numberOfLBnodesCoarse];
+      D7C.f[1] = &DD7C[2*numberOfLBnodesCoarse];
+      D7C.f[4] = &DD7C[3*numberOfLBnodesCoarse];
+      D7C.f[3] = &DD7C[4*numberOfLBnodesCoarse];
+      D7C.f[6] = &DD7C[5*numberOfLBnodesCoarse];
+      D7C.f[5] = &DD7C[6*numberOfLBnodesCoarse];
    }
 
    ////////////////////////////////////////////////////////////////////////////////
@@ -21691,8 +20881,8 @@ __global__ void scaleFCThS27(     real* DC,
                                              unsigned int* neighborFX,
                                              unsigned int* neighborFY,
                                              unsigned int* neighborFZ,
-                                             unsigned int size_MatC, 
-                                             unsigned int size_MatF, 
+                                             unsigned long long numberOfLBnodesCoarse, 
+                                             unsigned long long numberOfLBnodesFine, 
                                              bool isEvenTimestep,
                                              unsigned int* posC, 
                                              unsigned int* posFSWB, 
@@ -21704,187 +20894,184 @@ __global__ void scaleFCThS27(     real* DC,
    real *feF, *fwF, *fnF, *fsF, *ftF, *fbF, *fneF, *fswF, *fseF, *fnwF, *fteF, *fbwF, *fbeF, *ftwF, *ftnF, *fbsF, *fbnF, *ftsF, //*fzeroF, 
       *ftneF, *ftswF, *ftseF, *ftnwF, *fbneF, *fbswF, *fbseF, *fbnwF;
 
-   feF    = &DF[DIR_P00   *size_MatF];
-   fwF    = &DF[DIR_M00   *size_MatF];
-   fnF    = &DF[DIR_0P0   *size_MatF];
-   fsF    = &DF[DIR_0M0   *size_MatF];
-   ftF    = &DF[DIR_00P   *size_MatF];
-   fbF    = &DF[DIR_00M   *size_MatF];
-   fneF   = &DF[DIR_PP0  *size_MatF];
-   fswF   = &DF[DIR_MM0  *size_MatF];
-   fseF   = &DF[DIR_PM0  *size_MatF];
-   fnwF   = &DF[DIR_MP0  *size_MatF];
-   fteF   = &DF[DIR_P0P  *size_MatF];
-   fbwF   = &DF[DIR_M0M  *size_MatF];
-   fbeF   = &DF[DIR_P0M  *size_MatF];
-   ftwF   = &DF[DIR_M0P  *size_MatF];
-   ftnF   = &DF[DIR_0PP  *size_MatF];
-   fbsF   = &DF[DIR_0MM  *size_MatF];
-   fbnF   = &DF[DIR_0PM  *size_MatF];
-   ftsF   = &DF[DIR_0MP  *size_MatF];
-   //fzeroF = &DF[DIR_000*size_MatF];
-   ftneF  = &DF[DIR_PPP *size_MatF];
-   ftswF  = &DF[DIR_MMP *size_MatF];
-   ftseF  = &DF[DIR_PMP *size_MatF];
-   ftnwF  = &DF[DIR_MPP *size_MatF];
-   fbneF  = &DF[DIR_PPM *size_MatF];
-   fbswF  = &DF[DIR_MMM *size_MatF];
-   fbseF  = &DF[DIR_PMM *size_MatF];
-   fbnwF  = &DF[DIR_MPM *size_MatF];
+   feF    = &DF[DIR_P00 * numberOfLBnodesFine];
+   fwF    = &DF[DIR_M00 * numberOfLBnodesFine];
+   fnF    = &DF[DIR_0P0 * numberOfLBnodesFine];
+   fsF    = &DF[DIR_0M0 * numberOfLBnodesFine];
+   ftF    = &DF[DIR_00P * numberOfLBnodesFine];
+   fbF    = &DF[DIR_00M * numberOfLBnodesFine];
+   fneF   = &DF[DIR_PP0 * numberOfLBnodesFine];
+   fswF   = &DF[DIR_MM0 * numberOfLBnodesFine];
+   fseF   = &DF[DIR_PM0 * numberOfLBnodesFine];
+   fnwF   = &DF[DIR_MP0 * numberOfLBnodesFine];
+   fteF   = &DF[DIR_P0P * numberOfLBnodesFine];
+   fbwF   = &DF[DIR_M0M * numberOfLBnodesFine];
+   fbeF   = &DF[DIR_P0M * numberOfLBnodesFine];
+   ftwF   = &DF[DIR_M0P * numberOfLBnodesFine];
+   ftnF   = &DF[DIR_0PP * numberOfLBnodesFine];
+   fbsF   = &DF[DIR_0MM * numberOfLBnodesFine];
+   fbnF   = &DF[DIR_0PM * numberOfLBnodesFine];
+   ftsF   = &DF[DIR_0MP * numberOfLBnodesFine];
+   ftneF  = &DF[DIR_PPP * numberOfLBnodesFine];
+   ftswF  = &DF[DIR_MMP * numberOfLBnodesFine];
+   ftseF  = &DF[DIR_PMP * numberOfLBnodesFine];
+   ftnwF  = &DF[DIR_MPP * numberOfLBnodesFine];
+   fbneF  = &DF[DIR_PPM * numberOfLBnodesFine];
+   fbswF  = &DF[DIR_MMM * numberOfLBnodesFine];
+   fbseF  = &DF[DIR_PMM * numberOfLBnodesFine];
+   fbnwF  = &DF[DIR_MPM * numberOfLBnodesFine];
 
    real *feC, *fwC, *fnC, *fsC, *ftC, *fbC, *fneC, *fswC, *fseC, *fnwC, *fteC, *fbwC, *fbeC, *ftwC, *ftnC, *fbsC, *fbnC, *ftsC, //*fzeroC,
       *ftneC, *ftswC, *ftseC, *ftnwC, *fbneC, *fbswC, *fbseC, *fbnwC;
 
    if (isEvenTimestep==true)
    {
-      feC    = &DC[DIR_P00   *size_MatC];
-      fwC    = &DC[DIR_M00   *size_MatC];
-      fnC    = &DC[DIR_0P0   *size_MatC];
-      fsC    = &DC[DIR_0M0   *size_MatC];
-      ftC    = &DC[DIR_00P   *size_MatC];
-      fbC    = &DC[DIR_00M   *size_MatC];
-      fneC   = &DC[DIR_PP0  *size_MatC];
-      fswC   = &DC[DIR_MM0  *size_MatC];
-      fseC   = &DC[DIR_PM0  *size_MatC];
-      fnwC   = &DC[DIR_MP0  *size_MatC];
-      fteC   = &DC[DIR_P0P  *size_MatC];
-      fbwC   = &DC[DIR_M0M  *size_MatC];
-      fbeC   = &DC[DIR_P0M  *size_MatC];
-      ftwC   = &DC[DIR_M0P  *size_MatC];
-      ftnC   = &DC[DIR_0PP  *size_MatC];
-      fbsC   = &DC[DIR_0MM  *size_MatC];
-      fbnC   = &DC[DIR_0PM  *size_MatC];
-      ftsC   = &DC[DIR_0MP  *size_MatC];
-      //fzeroC = &DC[DIR_000*size_MatC];
-      ftneC  = &DC[DIR_PPP *size_MatC];
-      ftswC  = &DC[DIR_MMP *size_MatC];
-      ftseC  = &DC[DIR_PMP *size_MatC];
-      ftnwC  = &DC[DIR_MPP *size_MatC];
-      fbneC  = &DC[DIR_PPM *size_MatC];
-      fbswC  = &DC[DIR_MMM *size_MatC];
-      fbseC  = &DC[DIR_PMM *size_MatC];
-      fbnwC  = &DC[DIR_MPM *size_MatC];
+      feC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      fwC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    } 
    else
    {
-      fwC    = &DC[DIR_P00   *size_MatC];
-      feC    = &DC[DIR_M00   *size_MatC];
-      fsC    = &DC[DIR_0P0   *size_MatC];
-      fnC    = &DC[DIR_0M0   *size_MatC];
-      fbC    = &DC[DIR_00P   *size_MatC];
-      ftC    = &DC[DIR_00M   *size_MatC];
-      fswC   = &DC[DIR_PP0  *size_MatC];
-      fneC   = &DC[DIR_MM0  *size_MatC];
-      fnwC   = &DC[DIR_PM0  *size_MatC];
-      fseC   = &DC[DIR_MP0  *size_MatC];
-      fbwC   = &DC[DIR_P0P  *size_MatC];
-      fteC   = &DC[DIR_M0M  *size_MatC];
-      ftwC   = &DC[DIR_P0M  *size_MatC];
-      fbeC   = &DC[DIR_M0P  *size_MatC];
-      fbsC   = &DC[DIR_0PP  *size_MatC];
-      ftnC   = &DC[DIR_0MM  *size_MatC];
-      ftsC   = &DC[DIR_0PM  *size_MatC];
-      fbnC   = &DC[DIR_0MP  *size_MatC];
-      //fzeroC = &DC[DIR_000*size_MatC];
-      fbswC  = &DC[DIR_PPP *size_MatC];
-      fbneC  = &DC[DIR_MMP *size_MatC];
-      fbnwC  = &DC[DIR_PMP *size_MatC];
-      fbseC  = &DC[DIR_MPP *size_MatC];
-      ftswC  = &DC[DIR_PPM *size_MatC];
-      ftneC  = &DC[DIR_MMM *size_MatC];
-      ftnwC  = &DC[DIR_PMM *size_MatC];
-      ftseC  = &DC[DIR_MPM *size_MatC];
+      fwC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      feC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    }
 
    Distributions27 D27F;
-   D27F.f[DIR_P00   ] = &DD27F[DIR_P00   *size_MatF];
-   D27F.f[DIR_M00   ] = &DD27F[DIR_M00   *size_MatF];
-   D27F.f[DIR_0P0   ] = &DD27F[DIR_0P0   *size_MatF];
-   D27F.f[DIR_0M0   ] = &DD27F[DIR_0M0   *size_MatF];
-   D27F.f[DIR_00P   ] = &DD27F[DIR_00P   *size_MatF];
-   D27F.f[DIR_00M   ] = &DD27F[DIR_00M   *size_MatF];
-   D27F.f[DIR_PP0  ] = &DD27F[DIR_PP0  *size_MatF];
-   D27F.f[DIR_MM0  ] = &DD27F[DIR_MM0  *size_MatF];
-   D27F.f[DIR_PM0  ] = &DD27F[DIR_PM0  *size_MatF];
-   D27F.f[DIR_MP0  ] = &DD27F[DIR_MP0  *size_MatF];
-   D27F.f[DIR_P0P  ] = &DD27F[DIR_P0P  *size_MatF];
-   D27F.f[DIR_M0M  ] = &DD27F[DIR_M0M  *size_MatF];
-   D27F.f[DIR_P0M  ] = &DD27F[DIR_P0M  *size_MatF];
-   D27F.f[DIR_M0P  ] = &DD27F[DIR_M0P  *size_MatF];
-   D27F.f[DIR_0PP  ] = &DD27F[DIR_0PP  *size_MatF];
-   D27F.f[DIR_0MM  ] = &DD27F[DIR_0MM  *size_MatF];
-   D27F.f[DIR_0PM  ] = &DD27F[DIR_0PM  *size_MatF];
-   D27F.f[DIR_0MP  ] = &DD27F[DIR_0MP  *size_MatF];
-   D27F.f[DIR_000] = &DD27F[DIR_000*size_MatF];
-   D27F.f[DIR_PPP ] = &DD27F[DIR_PPP *size_MatF];
-   D27F.f[DIR_MMP ] = &DD27F[DIR_MMP *size_MatF];
-   D27F.f[DIR_PMP ] = &DD27F[DIR_PMP *size_MatF];
-   D27F.f[DIR_MPP ] = &DD27F[DIR_MPP *size_MatF];
-   D27F.f[DIR_PPM ] = &DD27F[DIR_PPM *size_MatF];
-   D27F.f[DIR_MMM ] = &DD27F[DIR_MMM *size_MatF];
-   D27F.f[DIR_PMM ] = &DD27F[DIR_PMM *size_MatF];
-   D27F.f[DIR_MPM ] = &DD27F[DIR_MPM *size_MatF];
+   D27F.f[DIR_P00] = &DD27F[DIR_P00 * numberOfLBnodesFine];
+   D27F.f[DIR_M00] = &DD27F[DIR_M00 * numberOfLBnodesFine];
+   D27F.f[DIR_0P0] = &DD27F[DIR_0P0 * numberOfLBnodesFine];
+   D27F.f[DIR_0M0] = &DD27F[DIR_0M0 * numberOfLBnodesFine];
+   D27F.f[DIR_00P] = &DD27F[DIR_00P * numberOfLBnodesFine];
+   D27F.f[DIR_00M] = &DD27F[DIR_00M * numberOfLBnodesFine];
+   D27F.f[DIR_PP0] = &DD27F[DIR_PP0 * numberOfLBnodesFine];
+   D27F.f[DIR_MM0] = &DD27F[DIR_MM0 * numberOfLBnodesFine];
+   D27F.f[DIR_PM0] = &DD27F[DIR_PM0 * numberOfLBnodesFine];
+   D27F.f[DIR_MP0] = &DD27F[DIR_MP0 * numberOfLBnodesFine];
+   D27F.f[DIR_P0P] = &DD27F[DIR_P0P * numberOfLBnodesFine];
+   D27F.f[DIR_M0M] = &DD27F[DIR_M0M * numberOfLBnodesFine];
+   D27F.f[DIR_P0M] = &DD27F[DIR_P0M * numberOfLBnodesFine];
+   D27F.f[DIR_M0P] = &DD27F[DIR_M0P * numberOfLBnodesFine];
+   D27F.f[DIR_0PP] = &DD27F[DIR_0PP * numberOfLBnodesFine];
+   D27F.f[DIR_0MM] = &DD27F[DIR_0MM * numberOfLBnodesFine];
+   D27F.f[DIR_0PM] = &DD27F[DIR_0PM * numberOfLBnodesFine];
+   D27F.f[DIR_0MP] = &DD27F[DIR_0MP * numberOfLBnodesFine];
+   D27F.f[DIR_000] = &DD27F[DIR_000 * numberOfLBnodesFine];
+   D27F.f[DIR_PPP] = &DD27F[DIR_PPP * numberOfLBnodesFine];
+   D27F.f[DIR_MMP] = &DD27F[DIR_MMP * numberOfLBnodesFine];
+   D27F.f[DIR_PMP] = &DD27F[DIR_PMP * numberOfLBnodesFine];
+   D27F.f[DIR_MPP] = &DD27F[DIR_MPP * numberOfLBnodesFine];
+   D27F.f[DIR_PPM] = &DD27F[DIR_PPM * numberOfLBnodesFine];
+   D27F.f[DIR_MMM] = &DD27F[DIR_MMM * numberOfLBnodesFine];
+   D27F.f[DIR_PMM] = &DD27F[DIR_PMM * numberOfLBnodesFine];
+   D27F.f[DIR_MPM] = &DD27F[DIR_MPM * numberOfLBnodesFine];
 
    Distributions27 D27C;
    if (isEvenTimestep==true)
    {
-      D27C.f[DIR_P00   ] = &DD27C[DIR_P00   *size_MatC];
-      D27C.f[DIR_M00   ] = &DD27C[DIR_M00   *size_MatC];
-      D27C.f[DIR_0P0   ] = &DD27C[DIR_0P0   *size_MatC];
-      D27C.f[DIR_0M0   ] = &DD27C[DIR_0M0   *size_MatC];
-      D27C.f[DIR_00P   ] = &DD27C[DIR_00P   *size_MatC];
-      D27C.f[DIR_00M   ] = &DD27C[DIR_00M   *size_MatC];
-      D27C.f[DIR_PP0  ] = &DD27C[DIR_PP0  *size_MatC];
-      D27C.f[DIR_MM0  ] = &DD27C[DIR_MM0  *size_MatC];
-      D27C.f[DIR_PM0  ] = &DD27C[DIR_PM0  *size_MatC];
-      D27C.f[DIR_MP0  ] = &DD27C[DIR_MP0  *size_MatC];
-      D27C.f[DIR_P0P  ] = &DD27C[DIR_P0P  *size_MatC];
-      D27C.f[DIR_M0M  ] = &DD27C[DIR_M0M  *size_MatC];
-      D27C.f[DIR_P0M  ] = &DD27C[DIR_P0M  *size_MatC];
-      D27C.f[DIR_M0P  ] = &DD27C[DIR_M0P  *size_MatC];
-      D27C.f[DIR_0PP  ] = &DD27C[DIR_0PP  *size_MatC];
-      D27C.f[DIR_0MM  ] = &DD27C[DIR_0MM  *size_MatC];
-      D27C.f[DIR_0PM  ] = &DD27C[DIR_0PM  *size_MatC];
-      D27C.f[DIR_0MP  ] = &DD27C[DIR_0MP  *size_MatC];
-      D27C.f[DIR_000] = &DD27C[DIR_000*size_MatC];
-      D27C.f[DIR_PPP ] = &DD27C[DIR_PPP *size_MatC];
-      D27C.f[DIR_MMP ] = &DD27C[DIR_MMP *size_MatC];
-      D27C.f[DIR_PMP ] = &DD27C[DIR_PMP *size_MatC];
-      D27C.f[DIR_MPP ] = &DD27C[DIR_MPP *size_MatC];
-      D27C.f[DIR_PPM ] = &DD27C[DIR_PPM *size_MatC];
-      D27C.f[DIR_MMM ] = &DD27C[DIR_MMM *size_MatC];
-      D27C.f[DIR_PMM ] = &DD27C[DIR_PMM *size_MatC];
-      D27C.f[DIR_MPM ] = &DD27C[DIR_MPM *size_MatC];
+      D27C.f[DIR_P00] = &DD27C[DIR_P00 * numberOfLBnodesCoarse];
+      D27C.f[DIR_M00] = &DD27C[DIR_M00 * numberOfLBnodesCoarse];
+      D27C.f[DIR_0P0] = &DD27C[DIR_0P0 * numberOfLBnodesCoarse];
+      D27C.f[DIR_0M0] = &DD27C[DIR_0M0 * numberOfLBnodesCoarse];
+      D27C.f[DIR_00P] = &DD27C[DIR_00P * numberOfLBnodesCoarse];
+      D27C.f[DIR_00M] = &DD27C[DIR_00M * numberOfLBnodesCoarse];
+      D27C.f[DIR_PP0] = &DD27C[DIR_PP0 * numberOfLBnodesCoarse];
+      D27C.f[DIR_MM0] = &DD27C[DIR_MM0 * numberOfLBnodesCoarse];
+      D27C.f[DIR_PM0] = &DD27C[DIR_PM0 * numberOfLBnodesCoarse];
+      D27C.f[DIR_MP0] = &DD27C[DIR_MP0 * numberOfLBnodesCoarse];
+      D27C.f[DIR_P0P] = &DD27C[DIR_P0P * numberOfLBnodesCoarse];
+      D27C.f[DIR_M0M] = &DD27C[DIR_M0M * numberOfLBnodesCoarse];
+      D27C.f[DIR_P0M] = &DD27C[DIR_P0M * numberOfLBnodesCoarse];
+      D27C.f[DIR_M0P] = &DD27C[DIR_M0P * numberOfLBnodesCoarse];
+      D27C.f[DIR_0PP] = &DD27C[DIR_0PP * numberOfLBnodesCoarse];
+      D27C.f[DIR_0MM] = &DD27C[DIR_0MM * numberOfLBnodesCoarse];
+      D27C.f[DIR_0PM] = &DD27C[DIR_0PM * numberOfLBnodesCoarse];
+      D27C.f[DIR_0MP] = &DD27C[DIR_0MP * numberOfLBnodesCoarse];
+      D27C.f[DIR_000] = &DD27C[DIR_000 * numberOfLBnodesCoarse];
+      D27C.f[DIR_PPP] = &DD27C[DIR_PPP * numberOfLBnodesCoarse];
+      D27C.f[DIR_MMP] = &DD27C[DIR_MMP * numberOfLBnodesCoarse];
+      D27C.f[DIR_PMP] = &DD27C[DIR_PMP * numberOfLBnodesCoarse];
+      D27C.f[DIR_MPP] = &DD27C[DIR_MPP * numberOfLBnodesCoarse];
+      D27C.f[DIR_PPM] = &DD27C[DIR_PPM * numberOfLBnodesCoarse];
+      D27C.f[DIR_MMM] = &DD27C[DIR_MMM * numberOfLBnodesCoarse];
+      D27C.f[DIR_PMM] = &DD27C[DIR_PMM * numberOfLBnodesCoarse];
+      D27C.f[DIR_MPM] = &DD27C[DIR_MPM * numberOfLBnodesCoarse];
    }
    else
    {
-      D27C.f[DIR_M00   ] = &DD27C[DIR_P00   *size_MatC];
-      D27C.f[DIR_P00   ] = &DD27C[DIR_M00   *size_MatC];
-      D27C.f[DIR_0M0   ] = &DD27C[DIR_0P0   *size_MatC];
-      D27C.f[DIR_0P0   ] = &DD27C[DIR_0M0   *size_MatC];
-      D27C.f[DIR_00M   ] = &DD27C[DIR_00P   *size_MatC];
-      D27C.f[DIR_00P   ] = &DD27C[DIR_00M   *size_MatC];
-      D27C.f[DIR_MM0  ] = &DD27C[DIR_PP0  *size_MatC];
-      D27C.f[DIR_PP0  ] = &DD27C[DIR_MM0  *size_MatC];
-      D27C.f[DIR_MP0  ] = &DD27C[DIR_PM0  *size_MatC];
-      D27C.f[DIR_PM0  ] = &DD27C[DIR_MP0  *size_MatC];
-      D27C.f[DIR_M0M  ] = &DD27C[DIR_P0P  *size_MatC];
-      D27C.f[DIR_P0P  ] = &DD27C[DIR_M0M  *size_MatC];
-      D27C.f[DIR_M0P  ] = &DD27C[DIR_P0M  *size_MatC];
-      D27C.f[DIR_P0M  ] = &DD27C[DIR_M0P  *size_MatC];
-      D27C.f[DIR_0MM  ] = &DD27C[DIR_0PP  *size_MatC];
-      D27C.f[DIR_0PP  ] = &DD27C[DIR_0MM  *size_MatC];
-      D27C.f[DIR_0MP  ] = &DD27C[DIR_0PM  *size_MatC];
-      D27C.f[DIR_0PM  ] = &DD27C[DIR_0MP  *size_MatC];
-      D27C.f[DIR_000] = &DD27C[DIR_000*size_MatC];
-      D27C.f[DIR_MMM ] = &DD27C[DIR_PPP *size_MatC];
-      D27C.f[DIR_PPM ] = &DD27C[DIR_MMP *size_MatC];
-      D27C.f[DIR_MPM ] = &DD27C[DIR_PMP *size_MatC];
-      D27C.f[DIR_PMM ] = &DD27C[DIR_MPP *size_MatC];
-      D27C.f[DIR_MMP ] = &DD27C[DIR_PPM *size_MatC];
-      D27C.f[DIR_PPP ] = &DD27C[DIR_MMM *size_MatC];
-      D27C.f[DIR_MPP ] = &DD27C[DIR_PMM *size_MatC];
-      D27C.f[DIR_PMP ] = &DD27C[DIR_MPM *size_MatC];
+      D27C.f[DIR_M00] = &DD27C[DIR_P00 * numberOfLBnodesCoarse];
+      D27C.f[DIR_P00] = &DD27C[DIR_M00 * numberOfLBnodesCoarse];
+      D27C.f[DIR_0M0] = &DD27C[DIR_0P0 * numberOfLBnodesCoarse];
+      D27C.f[DIR_0P0] = &DD27C[DIR_0M0 * numberOfLBnodesCoarse];
+      D27C.f[DIR_00M] = &DD27C[DIR_00P * numberOfLBnodesCoarse];
+      D27C.f[DIR_00P] = &DD27C[DIR_00M * numberOfLBnodesCoarse];
+      D27C.f[DIR_MM0] = &DD27C[DIR_PP0 * numberOfLBnodesCoarse];
+      D27C.f[DIR_PP0] = &DD27C[DIR_MM0 * numberOfLBnodesCoarse];
+      D27C.f[DIR_MP0] = &DD27C[DIR_PM0 * numberOfLBnodesCoarse];
+      D27C.f[DIR_PM0] = &DD27C[DIR_MP0 * numberOfLBnodesCoarse];
+      D27C.f[DIR_M0M] = &DD27C[DIR_P0P * numberOfLBnodesCoarse];
+      D27C.f[DIR_P0P] = &DD27C[DIR_M0M * numberOfLBnodesCoarse];
+      D27C.f[DIR_M0P] = &DD27C[DIR_P0M * numberOfLBnodesCoarse];
+      D27C.f[DIR_P0M] = &DD27C[DIR_M0P * numberOfLBnodesCoarse];
+      D27C.f[DIR_0MM] = &DD27C[DIR_0PP * numberOfLBnodesCoarse];
+      D27C.f[DIR_0PP] = &DD27C[DIR_0MM * numberOfLBnodesCoarse];
+      D27C.f[DIR_0MP] = &DD27C[DIR_0PM * numberOfLBnodesCoarse];
+      D27C.f[DIR_0PM] = &DD27C[DIR_0MP * numberOfLBnodesCoarse];
+      D27C.f[DIR_000] = &DD27C[DIR_000 * numberOfLBnodesCoarse];
+      D27C.f[DIR_MMM] = &DD27C[DIR_PPP * numberOfLBnodesCoarse];
+      D27C.f[DIR_PPM] = &DD27C[DIR_MMP * numberOfLBnodesCoarse];
+      D27C.f[DIR_MPM] = &DD27C[DIR_PMP * numberOfLBnodesCoarse];
+      D27C.f[DIR_PMM] = &DD27C[DIR_MPP * numberOfLBnodesCoarse];
+      D27C.f[DIR_MMP] = &DD27C[DIR_PPM * numberOfLBnodesCoarse];
+      D27C.f[DIR_PPP] = &DD27C[DIR_MMM * numberOfLBnodesCoarse];
+      D27C.f[DIR_MPP] = &DD27C[DIR_PMM * numberOfLBnodesCoarse];
+      D27C.f[DIR_PMP] = &DD27C[DIR_MPM * numberOfLBnodesCoarse];
    }
 
    ////////////////////////////////////////////////////////////////////////////////
@@ -21980,33 +21167,33 @@ __global__ void scaleFCThS27(     real* DC,
       f_BSE  = fbseF[kbs];
       f_BNW  = fbnwF[kbw];
       //////////////////////////////////////////////////////////////////////////////////
-      f27E    =  (D27F.f[DIR_P00   ])[kzero];//ke
-      f27W    =  (D27F.f[DIR_M00   ])[kw   ];
-      f27N    =  (D27F.f[DIR_0P0   ])[kzero];//kn
-      f27S    =  (D27F.f[DIR_0M0   ])[ks   ];
-      f27T    =  (D27F.f[DIR_00P   ])[kzero];//kt
-      f27B    =  (D27F.f[DIR_00M   ])[kb   ];
-      f27NE   =  (D27F.f[DIR_PP0  ])[kzero];//kne
-      f27SW   =  (D27F.f[DIR_MM0  ])[ksw  ];
-      f27SE   =  (D27F.f[DIR_PM0  ])[ks   ];//kse
-      f27NW   =  (D27F.f[DIR_MP0  ])[kw   ];//knw
-      f27TE   =  (D27F.f[DIR_P0P  ])[kzero];//kte
-      f27BW   =  (D27F.f[DIR_M0M  ])[kbw  ];
-      f27BE   =  (D27F.f[DIR_P0M  ])[kb   ];//kbe
-      f27TW   =  (D27F.f[DIR_M0P  ])[kw   ];//ktw
-      f27TN   =  (D27F.f[DIR_0PP  ])[kzero];//ktn
-      f27BS   =  (D27F.f[DIR_0MM  ])[kbs  ];
-      f27BN   =  (D27F.f[DIR_0PM  ])[kb   ];//kbn
-      f27TS   =  (D27F.f[DIR_0MP  ])[ks   ];//kts
+      f27E    =  (D27F.f[DIR_P00])[kzero];//ke
+      f27W    =  (D27F.f[DIR_M00])[kw   ];
+      f27N    =  (D27F.f[DIR_0P0])[kzero];//kn
+      f27S    =  (D27F.f[DIR_0M0])[ks   ];
+      f27T    =  (D27F.f[DIR_00P])[kzero];//kt
+      f27B    =  (D27F.f[DIR_00M])[kb   ];
+      f27NE   =  (D27F.f[DIR_PP0])[kzero];//kne
+      f27SW   =  (D27F.f[DIR_MM0])[ksw  ];
+      f27SE   =  (D27F.f[DIR_PM0])[ks   ];//kse
+      f27NW   =  (D27F.f[DIR_MP0])[kw   ];//knw
+      f27TE   =  (D27F.f[DIR_P0P])[kzero];//kte
+      f27BW   =  (D27F.f[DIR_M0M])[kbw  ];
+      f27BE   =  (D27F.f[DIR_P0M])[kb   ];//kbe
+      f27TW   =  (D27F.f[DIR_M0P])[kw   ];//ktw
+      f27TN   =  (D27F.f[DIR_0PP])[kzero];//ktn
+      f27BS   =  (D27F.f[DIR_0MM])[kbs  ];
+      f27BN   =  (D27F.f[DIR_0PM])[kb   ];//kbn
+      f27TS   =  (D27F.f[DIR_0MP])[ks   ];//kts
       f27ZERO =  (D27F.f[DIR_000])[kzero];//kzero
-      f27TNE   = (D27F.f[DIR_PPP ])[kzero];//ktne
-      f27TSW   = (D27F.f[DIR_MMP ])[ksw  ];//ktsw
-      f27TSE   = (D27F.f[DIR_PMP ])[ks   ];//ktse
-      f27TNW   = (D27F.f[DIR_MPP ])[kw   ];//ktnw
-      f27BNE   = (D27F.f[DIR_PPM ])[kb   ];//kbne
-      f27BSW   = (D27F.f[DIR_MMM ])[kbsw ];
-      f27BSE   = (D27F.f[DIR_PMM ])[kbs  ];//kbse
-      f27BNW   = (D27F.f[DIR_MPM ])[kbw  ];//kbnw
+      f27TNE   = (D27F.f[DIR_PPP])[kzero];//ktne
+      f27TSW   = (D27F.f[DIR_MMP])[ksw  ];//ktsw
+      f27TSE   = (D27F.f[DIR_PMP])[ks   ];//ktse
+      f27TNW   = (D27F.f[DIR_MPP])[kw   ];//ktnw
+      f27BNE   = (D27F.f[DIR_PPM])[kb   ];//kbne
+      f27BSW   = (D27F.f[DIR_MMM])[kbsw ];
+      f27BSE   = (D27F.f[DIR_PMM])[kbs  ];//kbse
+      f27BNW   = (D27F.f[DIR_MPM])[kbw  ];//kbnw
 
       Conc_F_SWB = f27E + f27W + f27N + f27S + f27T + f27B + f27NE + f27SW + f27SE + f27NW + 
                    f27TE + f27BW + f27BE + f27TW + f27TN + f27BS + f27BN + f27TS + f27ZERO + 
@@ -22067,33 +21254,33 @@ __global__ void scaleFCThS27(     real* DC,
       f_BSE  = fbseF[kbs];
       f_BNW  = fbnwF[kbw];
       //////////////////////////////////////////////////////////////////////////////////
-      f27E    =  (D27F.f[DIR_P00   ])[kzero];//ke
-      f27W    =  (D27F.f[DIR_M00   ])[kw   ];
-      f27N    =  (D27F.f[DIR_0P0   ])[kzero];//kn
-      f27S    =  (D27F.f[DIR_0M0   ])[ks   ];
-      f27T    =  (D27F.f[DIR_00P   ])[kzero];//kt
-      f27B    =  (D27F.f[DIR_00M   ])[kb   ];
-      f27NE   =  (D27F.f[DIR_PP0  ])[kzero];//kne
-      f27SW   =  (D27F.f[DIR_MM0  ])[ksw  ];
-      f27SE   =  (D27F.f[DIR_PM0  ])[ks   ];//kse
-      f27NW   =  (D27F.f[DIR_MP0  ])[kw   ];//knw
-      f27TE   =  (D27F.f[DIR_P0P  ])[kzero];//kte
-      f27BW   =  (D27F.f[DIR_M0M  ])[kbw  ];
-      f27BE   =  (D27F.f[DIR_P0M  ])[kb   ];//kbe
-      f27TW   =  (D27F.f[DIR_M0P  ])[kw   ];//ktw
-      f27TN   =  (D27F.f[DIR_0PP  ])[kzero];//ktn
-      f27BS   =  (D27F.f[DIR_0MM  ])[kbs  ];
-      f27BN   =  (D27F.f[DIR_0PM  ])[kb   ];//kbn
-      f27TS   =  (D27F.f[DIR_0MP  ])[ks   ];//kts
+      f27E    =  (D27F.f[DIR_P00])[kzero];//ke
+      f27W    =  (D27F.f[DIR_M00])[kw   ];
+      f27N    =  (D27F.f[DIR_0P0])[kzero];//kn
+      f27S    =  (D27F.f[DIR_0M0])[ks   ];
+      f27T    =  (D27F.f[DIR_00P])[kzero];//kt
+      f27B    =  (D27F.f[DIR_00M])[kb   ];
+      f27NE   =  (D27F.f[DIR_PP0])[kzero];//kne
+      f27SW   =  (D27F.f[DIR_MM0])[ksw  ];
+      f27SE   =  (D27F.f[DIR_PM0])[ks   ];//kse
+      f27NW   =  (D27F.f[DIR_MP0])[kw   ];//knw
+      f27TE   =  (D27F.f[DIR_P0P])[kzero];//kte
+      f27BW   =  (D27F.f[DIR_M0M])[kbw  ];
+      f27BE   =  (D27F.f[DIR_P0M])[kb   ];//kbe
+      f27TW   =  (D27F.f[DIR_M0P])[kw   ];//ktw
+      f27TN   =  (D27F.f[DIR_0PP])[kzero];//ktn
+      f27BS   =  (D27F.f[DIR_0MM])[kbs  ];
+      f27BN   =  (D27F.f[DIR_0PM])[kb   ];//kbn
+      f27TS   =  (D27F.f[DIR_0MP])[ks   ];//kts
       f27ZERO =  (D27F.f[DIR_000])[kzero];//kzero
-      f27TNE   = (D27F.f[DIR_PPP ])[kzero];//ktne
-      f27TSW   = (D27F.f[DIR_MMP ])[ksw  ];//ktsw
-      f27TSE   = (D27F.f[DIR_PMP ])[ks   ];//ktse
-      f27TNW   = (D27F.f[DIR_MPP ])[kw   ];//ktnw
-      f27BNE   = (D27F.f[DIR_PPM ])[kb   ];//kbne
-      f27BSW   = (D27F.f[DIR_MMM ])[kbsw ];
-      f27BSE   = (D27F.f[DIR_PMM ])[kbs  ];//kbse
-      f27BNW   = (D27F.f[DIR_MPM ])[kbw  ];//kbnw
+      f27TNE   = (D27F.f[DIR_PPP])[kzero];//ktne
+      f27TSW   = (D27F.f[DIR_MMP])[ksw  ];//ktsw
+      f27TSE   = (D27F.f[DIR_PMP])[ks   ];//ktse
+      f27TNW   = (D27F.f[DIR_MPP])[kw   ];//ktnw
+      f27BNE   = (D27F.f[DIR_PPM])[kb   ];//kbne
+      f27BSW   = (D27F.f[DIR_MMM])[kbsw ];
+      f27BSE   = (D27F.f[DIR_PMM])[kbs  ];//kbse
+      f27BNW   = (D27F.f[DIR_MPM])[kbw  ];//kbnw
 
       Conc_F_SWT = f27E + f27W + f27N + f27S + f27T + f27B + f27NE + f27SW + f27SE + f27NW + 
                    f27TE + f27BW + f27BE + f27TW + f27TN + f27BS + f27BN + f27TS + f27ZERO + 
@@ -22154,33 +21341,33 @@ __global__ void scaleFCThS27(     real* DC,
       f_BSE  = fbseF[kbs];
       f_BNW  = fbnwF[kbw];
       //////////////////////////////////////////////////////////////////////////////////
-      f27E    =  (D27F.f[DIR_P00   ])[kzero];//ke
-      f27W    =  (D27F.f[DIR_M00   ])[kw   ];
-      f27N    =  (D27F.f[DIR_0P0   ])[kzero];//kn
-      f27S    =  (D27F.f[DIR_0M0   ])[ks   ];
-      f27T    =  (D27F.f[DIR_00P   ])[kzero];//kt
-      f27B    =  (D27F.f[DIR_00M   ])[kb   ];
-      f27NE   =  (D27F.f[DIR_PP0  ])[kzero];//kne
-      f27SW   =  (D27F.f[DIR_MM0  ])[ksw  ];
-      f27SE   =  (D27F.f[DIR_PM0  ])[ks   ];//kse
-      f27NW   =  (D27F.f[DIR_MP0  ])[kw   ];//knw
-      f27TE   =  (D27F.f[DIR_P0P  ])[kzero];//kte
-      f27BW   =  (D27F.f[DIR_M0M  ])[kbw  ];
-      f27BE   =  (D27F.f[DIR_P0M  ])[kb   ];//kbe
-      f27TW   =  (D27F.f[DIR_M0P  ])[kw   ];//ktw
-      f27TN   =  (D27F.f[DIR_0PP  ])[kzero];//ktn
-      f27BS   =  (D27F.f[DIR_0MM  ])[kbs  ];
-      f27BN   =  (D27F.f[DIR_0PM  ])[kb   ];//kbn
-      f27TS   =  (D27F.f[DIR_0MP  ])[ks   ];//kts
+      f27E    =  (D27F.f[DIR_P00])[kzero];//ke
+      f27W    =  (D27F.f[DIR_M00])[kw   ];
+      f27N    =  (D27F.f[DIR_0P0])[kzero];//kn
+      f27S    =  (D27F.f[DIR_0M0])[ks   ];
+      f27T    =  (D27F.f[DIR_00P])[kzero];//kt
+      f27B    =  (D27F.f[DIR_00M])[kb   ];
+      f27NE   =  (D27F.f[DIR_PP0])[kzero];//kne
+      f27SW   =  (D27F.f[DIR_MM0])[ksw  ];
+      f27SE   =  (D27F.f[DIR_PM0])[ks   ];//kse
+      f27NW   =  (D27F.f[DIR_MP0])[kw   ];//knw
+      f27TE   =  (D27F.f[DIR_P0P])[kzero];//kte
+      f27BW   =  (D27F.f[DIR_M0M])[kbw  ];
+      f27BE   =  (D27F.f[DIR_P0M])[kb   ];//kbe
+      f27TW   =  (D27F.f[DIR_M0P])[kw   ];//ktw
+      f27TN   =  (D27F.f[DIR_0PP])[kzero];//ktn
+      f27BS   =  (D27F.f[DIR_0MM])[kbs  ];
+      f27BN   =  (D27F.f[DIR_0PM])[kb   ];//kbn
+      f27TS   =  (D27F.f[DIR_0MP])[ks   ];//kts
       f27ZERO =  (D27F.f[DIR_000])[kzero];//kzero
-      f27TNE   = (D27F.f[DIR_PPP ])[kzero];//ktne
-      f27TSW   = (D27F.f[DIR_MMP ])[ksw  ];//ktsw
-      f27TSE   = (D27F.f[DIR_PMP ])[ks   ];//ktse
-      f27TNW   = (D27F.f[DIR_MPP ])[kw   ];//ktnw
-      f27BNE   = (D27F.f[DIR_PPM ])[kb   ];//kbne
-      f27BSW   = (D27F.f[DIR_MMM ])[kbsw ];
-      f27BSE   = (D27F.f[DIR_PMM ])[kbs  ];//kbse
-      f27BNW   = (D27F.f[DIR_MPM ])[kbw  ];//kbnw
+      f27TNE   = (D27F.f[DIR_PPP])[kzero];//ktne
+      f27TSW   = (D27F.f[DIR_MMP])[ksw  ];//ktsw
+      f27TSE   = (D27F.f[DIR_PMP])[ks   ];//ktse
+      f27TNW   = (D27F.f[DIR_MPP])[kw   ];//ktnw
+      f27BNE   = (D27F.f[DIR_PPM])[kb   ];//kbne
+      f27BSW   = (D27F.f[DIR_MMM])[kbsw ];
+      f27BSE   = (D27F.f[DIR_PMM])[kbs  ];//kbse
+      f27BNW   = (D27F.f[DIR_MPM])[kbw  ];//kbnw
 
       Conc_F_SET = f27E + f27W + f27N + f27S + f27T + f27B + f27NE + f27SW + f27SE + f27NW + 
                    f27TE + f27BW + f27BE + f27TW + f27TN + f27BS + f27BN + f27TS + f27ZERO + 
@@ -22241,33 +21428,33 @@ __global__ void scaleFCThS27(     real* DC,
       f_BSE  = fbseF[kbs];
       f_BNW  = fbnwF[kbw];
       //////////////////////////////////////////////////////////////////////////////////
-      f27E    =  (D27F.f[DIR_P00   ])[kzero];//ke
-      f27W    =  (D27F.f[DIR_M00   ])[kw   ];
-      f27N    =  (D27F.f[DIR_0P0   ])[kzero];//kn
-      f27S    =  (D27F.f[DIR_0M0   ])[ks   ];
-      f27T    =  (D27F.f[DIR_00P   ])[kzero];//kt
-      f27B    =  (D27F.f[DIR_00M   ])[kb   ];
-      f27NE   =  (D27F.f[DIR_PP0  ])[kzero];//kne
-      f27SW   =  (D27F.f[DIR_MM0  ])[ksw  ];
-      f27SE   =  (D27F.f[DIR_PM0  ])[ks   ];//kse
-      f27NW   =  (D27F.f[DIR_MP0  ])[kw   ];//knw
-      f27TE   =  (D27F.f[DIR_P0P  ])[kzero];//kte
-      f27BW   =  (D27F.f[DIR_M0M  ])[kbw  ];
-      f27BE   =  (D27F.f[DIR_P0M  ])[kb   ];//kbe
-      f27TW   =  (D27F.f[DIR_M0P  ])[kw   ];//ktw
-      f27TN   =  (D27F.f[DIR_0PP  ])[kzero];//ktn
-      f27BS   =  (D27F.f[DIR_0MM  ])[kbs  ];
-      f27BN   =  (D27F.f[DIR_0PM  ])[kb   ];//kbn
-      f27TS   =  (D27F.f[DIR_0MP  ])[ks   ];//kts
+      f27E    =  (D27F.f[DIR_P00])[kzero];//ke
+      f27W    =  (D27F.f[DIR_M00])[kw   ];
+      f27N    =  (D27F.f[DIR_0P0])[kzero];//kn
+      f27S    =  (D27F.f[DIR_0M0])[ks   ];
+      f27T    =  (D27F.f[DIR_00P])[kzero];//kt
+      f27B    =  (D27F.f[DIR_00M])[kb   ];
+      f27NE   =  (D27F.f[DIR_PP0])[kzero];//kne
+      f27SW   =  (D27F.f[DIR_MM0])[ksw  ];
+      f27SE   =  (D27F.f[DIR_PM0])[ks   ];//kse
+      f27NW   =  (D27F.f[DIR_MP0])[kw   ];//knw
+      f27TE   =  (D27F.f[DIR_P0P])[kzero];//kte
+      f27BW   =  (D27F.f[DIR_M0M])[kbw  ];
+      f27BE   =  (D27F.f[DIR_P0M])[kb   ];//kbe
+      f27TW   =  (D27F.f[DIR_M0P])[kw   ];//ktw
+      f27TN   =  (D27F.f[DIR_0PP])[kzero];//ktn
+      f27BS   =  (D27F.f[DIR_0MM])[kbs  ];
+      f27BN   =  (D27F.f[DIR_0PM])[kb   ];//kbn
+      f27TS   =  (D27F.f[DIR_0MP])[ks   ];//kts
       f27ZERO =  (D27F.f[DIR_000])[kzero];//kzero
-      f27TNE   = (D27F.f[DIR_PPP ])[kzero];//ktne
-      f27TSW   = (D27F.f[DIR_MMP ])[ksw  ];//ktsw
-      f27TSE   = (D27F.f[DIR_PMP ])[ks   ];//ktse
-      f27TNW   = (D27F.f[DIR_MPP ])[kw   ];//ktnw
-      f27BNE   = (D27F.f[DIR_PPM ])[kb   ];//kbne
-      f27BSW   = (D27F.f[DIR_MMM ])[kbsw ];
-      f27BSE   = (D27F.f[DIR_PMM ])[kbs  ];//kbse
-      f27BNW   = (D27F.f[DIR_MPM ])[kbw  ];//kbnw
+      f27TNE   = (D27F.f[DIR_PPP])[kzero];//ktne
+      f27TSW   = (D27F.f[DIR_MMP])[ksw  ];//ktsw
+      f27TSE   = (D27F.f[DIR_PMP])[ks   ];//ktse
+      f27TNW   = (D27F.f[DIR_MPP])[kw   ];//ktnw
+      f27BNE   = (D27F.f[DIR_PPM])[kb   ];//kbne
+      f27BSW   = (D27F.f[DIR_MMM])[kbsw ];
+      f27BSE   = (D27F.f[DIR_PMM])[kbs  ];//kbse
+      f27BNW   = (D27F.f[DIR_MPM])[kbw  ];//kbnw
 
       Conc_F_SEB = f27E + f27W + f27N + f27S + f27T + f27B + f27NE + f27SW + f27SE + f27NW + 
                    f27TE + f27BW + f27BE + f27TW + f27TN + f27BS + f27BN + f27TS + f27ZERO + 
@@ -22338,33 +21525,33 @@ __global__ void scaleFCThS27(     real* DC,
       f_BSE  = fbseF[kbs];
       f_BNW  = fbnwF[kbw];
       //////////////////////////////////////////////////////////////////////////////////
-      f27E    =  (D27F.f[DIR_P00   ])[kzero];//ke
-      f27W    =  (D27F.f[DIR_M00   ])[kw   ];
-      f27N    =  (D27F.f[DIR_0P0   ])[kzero];//kn
-      f27S    =  (D27F.f[DIR_0M0   ])[ks   ];
-      f27T    =  (D27F.f[DIR_00P   ])[kzero];//kt
-      f27B    =  (D27F.f[DIR_00M   ])[kb   ];
-      f27NE   =  (D27F.f[DIR_PP0  ])[kzero];//kne
-      f27SW   =  (D27F.f[DIR_MM0  ])[ksw  ];
-      f27SE   =  (D27F.f[DIR_PM0  ])[ks   ];//kse
-      f27NW   =  (D27F.f[DIR_MP0  ])[kw   ];//knw
-      f27TE   =  (D27F.f[DIR_P0P  ])[kzero];//kte
-      f27BW   =  (D27F.f[DIR_M0M  ])[kbw  ];
-      f27BE   =  (D27F.f[DIR_P0M  ])[kb   ];//kbe
-      f27TW   =  (D27F.f[DIR_M0P  ])[kw   ];//ktw
-      f27TN   =  (D27F.f[DIR_0PP  ])[kzero];//ktn
-      f27BS   =  (D27F.f[DIR_0MM  ])[kbs  ];
-      f27BN   =  (D27F.f[DIR_0PM  ])[kb   ];//kbn
-      f27TS   =  (D27F.f[DIR_0MP  ])[ks   ];//kts
+      f27E    =  (D27F.f[DIR_P00])[kzero];//ke
+      f27W    =  (D27F.f[DIR_M00])[kw   ];
+      f27N    =  (D27F.f[DIR_0P0])[kzero];//kn
+      f27S    =  (D27F.f[DIR_0M0])[ks   ];
+      f27T    =  (D27F.f[DIR_00P])[kzero];//kt
+      f27B    =  (D27F.f[DIR_00M])[kb   ];
+      f27NE   =  (D27F.f[DIR_PP0])[kzero];//kne
+      f27SW   =  (D27F.f[DIR_MM0])[ksw  ];
+      f27SE   =  (D27F.f[DIR_PM0])[ks   ];//kse
+      f27NW   =  (D27F.f[DIR_MP0])[kw   ];//knw
+      f27TE   =  (D27F.f[DIR_P0P])[kzero];//kte
+      f27BW   =  (D27F.f[DIR_M0M])[kbw  ];
+      f27BE   =  (D27F.f[DIR_P0M])[kb   ];//kbe
+      f27TW   =  (D27F.f[DIR_M0P])[kw   ];//ktw
+      f27TN   =  (D27F.f[DIR_0PP])[kzero];//ktn
+      f27BS   =  (D27F.f[DIR_0MM])[kbs  ];
+      f27BN   =  (D27F.f[DIR_0PM])[kb   ];//kbn
+      f27TS   =  (D27F.f[DIR_0MP])[ks   ];//kts
       f27ZERO =  (D27F.f[DIR_000])[kzero];//kzero
-      f27TNE   = (D27F.f[DIR_PPP ])[kzero];//ktne
-      f27TSW   = (D27F.f[DIR_MMP ])[ksw  ];//ktsw
-      f27TSE   = (D27F.f[DIR_PMP ])[ks   ];//ktse
-      f27TNW   = (D27F.f[DIR_MPP ])[kw   ];//ktnw
-      f27BNE   = (D27F.f[DIR_PPM ])[kb   ];//kbne
-      f27BSW   = (D27F.f[DIR_MMM ])[kbsw ];
-      f27BSE   = (D27F.f[DIR_PMM ])[kbs  ];//kbse
-      f27BNW   = (D27F.f[DIR_MPM ])[kbw  ];//kbnw
+      f27TNE   = (D27F.f[DIR_PPP])[kzero];//ktne
+      f27TSW   = (D27F.f[DIR_MMP])[ksw  ];//ktsw
+      f27TSE   = (D27F.f[DIR_PMP])[ks   ];//ktse
+      f27TNW   = (D27F.f[DIR_MPP])[kw   ];//ktnw
+      f27BNE   = (D27F.f[DIR_PPM])[kb   ];//kbne
+      f27BSW   = (D27F.f[DIR_MMM])[kbsw ];
+      f27BSE   = (D27F.f[DIR_PMM])[kbs  ];//kbse
+      f27BNW   = (D27F.f[DIR_MPM])[kbw  ];//kbnw
 
       Conc_F_NWB = f27E + f27W + f27N + f27S + f27T + f27B + f27NE + f27SW + f27SE + f27NW + 
                    f27TE + f27BW + f27BE + f27TW + f27TN + f27BS + f27BN + f27TS + f27ZERO + 
@@ -22425,33 +21612,33 @@ __global__ void scaleFCThS27(     real* DC,
       f_BSE  = fbseF[kbs];
       f_BNW  = fbnwF[kbw];
       //////////////////////////////////////////////////////////////////////////////////
-      f27E    =  (D27F.f[DIR_P00   ])[kzero];//ke
-      f27W    =  (D27F.f[DIR_M00   ])[kw   ];
-      f27N    =  (D27F.f[DIR_0P0   ])[kzero];//kn
-      f27S    =  (D27F.f[DIR_0M0   ])[ks   ];
-      f27T    =  (D27F.f[DIR_00P   ])[kzero];//kt
-      f27B    =  (D27F.f[DIR_00M   ])[kb   ];
-      f27NE   =  (D27F.f[DIR_PP0  ])[kzero];//kne
-      f27SW   =  (D27F.f[DIR_MM0  ])[ksw  ];
-      f27SE   =  (D27F.f[DIR_PM0  ])[ks   ];//kse
-      f27NW   =  (D27F.f[DIR_MP0  ])[kw   ];//knw
-      f27TE   =  (D27F.f[DIR_P0P  ])[kzero];//kte
-      f27BW   =  (D27F.f[DIR_M0M  ])[kbw  ];
-      f27BE   =  (D27F.f[DIR_P0M  ])[kb   ];//kbe
-      f27TW   =  (D27F.f[DIR_M0P  ])[kw   ];//ktw
-      f27TN   =  (D27F.f[DIR_0PP  ])[kzero];//ktn
-      f27BS   =  (D27F.f[DIR_0MM  ])[kbs  ];
-      f27BN   =  (D27F.f[DIR_0PM  ])[kb   ];//kbn
-      f27TS   =  (D27F.f[DIR_0MP  ])[ks   ];//kts
+      f27E    =  (D27F.f[DIR_P00])[kzero];//ke
+      f27W    =  (D27F.f[DIR_M00])[kw   ];
+      f27N    =  (D27F.f[DIR_0P0])[kzero];//kn
+      f27S    =  (D27F.f[DIR_0M0])[ks   ];
+      f27T    =  (D27F.f[DIR_00P])[kzero];//kt
+      f27B    =  (D27F.f[DIR_00M])[kb   ];
+      f27NE   =  (D27F.f[DIR_PP0])[kzero];//kne
+      f27SW   =  (D27F.f[DIR_MM0])[ksw  ];
+      f27SE   =  (D27F.f[DIR_PM0])[ks   ];//kse
+      f27NW   =  (D27F.f[DIR_MP0])[kw   ];//knw
+      f27TE   =  (D27F.f[DIR_P0P])[kzero];//kte
+      f27BW   =  (D27F.f[DIR_M0M])[kbw  ];
+      f27BE   =  (D27F.f[DIR_P0M])[kb   ];//kbe
+      f27TW   =  (D27F.f[DIR_M0P])[kw   ];//ktw
+      f27TN   =  (D27F.f[DIR_0PP])[kzero];//ktn
+      f27BS   =  (D27F.f[DIR_0MM])[kbs  ];
+      f27BN   =  (D27F.f[DIR_0PM])[kb   ];//kbn
+      f27TS   =  (D27F.f[DIR_0MP])[ks   ];//kts
       f27ZERO =  (D27F.f[DIR_000])[kzero];//kzero
-      f27TNE   = (D27F.f[DIR_PPP ])[kzero];//ktne
-      f27TSW   = (D27F.f[DIR_MMP ])[ksw  ];//ktsw
-      f27TSE   = (D27F.f[DIR_PMP ])[ks   ];//ktse
-      f27TNW   = (D27F.f[DIR_MPP ])[kw   ];//ktnw
-      f27BNE   = (D27F.f[DIR_PPM ])[kb   ];//kbne
-      f27BSW   = (D27F.f[DIR_MMM ])[kbsw ];
-      f27BSE   = (D27F.f[DIR_PMM ])[kbs  ];//kbse
-      f27BNW   = (D27F.f[DIR_MPM ])[kbw  ];//kbnw
+      f27TNE   = (D27F.f[DIR_PPP])[kzero];//ktne
+      f27TSW   = (D27F.f[DIR_MMP])[ksw  ];//ktsw
+      f27TSE   = (D27F.f[DIR_PMP])[ks   ];//ktse
+      f27TNW   = (D27F.f[DIR_MPP])[kw   ];//ktnw
+      f27BNE   = (D27F.f[DIR_PPM])[kb   ];//kbne
+      f27BSW   = (D27F.f[DIR_MMM])[kbsw ];
+      f27BSE   = (D27F.f[DIR_PMM])[kbs  ];//kbse
+      f27BNW   = (D27F.f[DIR_MPM])[kbw  ];//kbnw
 
       Conc_F_NWT = f27E + f27W + f27N + f27S + f27T + f27B + f27NE + f27SW + f27SE + f27NW + 
                    f27TE + f27BW + f27BE + f27TW + f27TN + f27BS + f27BN + f27TS + f27ZERO + 
@@ -22512,33 +21699,33 @@ __global__ void scaleFCThS27(     real* DC,
       f_BSE  = fbseF[kbs];
       f_BNW  = fbnwF[kbw];
       //////////////////////////////////////////////////////////////////////////////////
-      f27E    =  (D27F.f[DIR_P00   ])[kzero];//ke
-      f27W    =  (D27F.f[DIR_M00   ])[kw   ];
-      f27N    =  (D27F.f[DIR_0P0   ])[kzero];//kn
-      f27S    =  (D27F.f[DIR_0M0   ])[ks   ];
-      f27T    =  (D27F.f[DIR_00P   ])[kzero];//kt
-      f27B    =  (D27F.f[DIR_00M   ])[kb   ];
-      f27NE   =  (D27F.f[DIR_PP0  ])[kzero];//kne
-      f27SW   =  (D27F.f[DIR_MM0  ])[ksw  ];
-      f27SE   =  (D27F.f[DIR_PM0  ])[ks   ];//kse
-      f27NW   =  (D27F.f[DIR_MP0  ])[kw   ];//knw
-      f27TE   =  (D27F.f[DIR_P0P  ])[kzero];//kte
-      f27BW   =  (D27F.f[DIR_M0M  ])[kbw  ];
-      f27BE   =  (D27F.f[DIR_P0M  ])[kb   ];//kbe
-      f27TW   =  (D27F.f[DIR_M0P  ])[kw   ];//ktw
-      f27TN   =  (D27F.f[DIR_0PP  ])[kzero];//ktn
-      f27BS   =  (D27F.f[DIR_0MM  ])[kbs  ];
-      f27BN   =  (D27F.f[DIR_0PM  ])[kb   ];//kbn
-      f27TS   =  (D27F.f[DIR_0MP  ])[ks   ];//kts
+      f27E    =  (D27F.f[DIR_P00])[kzero];//ke
+      f27W    =  (D27F.f[DIR_M00])[kw   ];
+      f27N    =  (D27F.f[DIR_0P0])[kzero];//kn
+      f27S    =  (D27F.f[DIR_0M0])[ks   ];
+      f27T    =  (D27F.f[DIR_00P])[kzero];//kt
+      f27B    =  (D27F.f[DIR_00M])[kb   ];
+      f27NE   =  (D27F.f[DIR_PP0])[kzero];//kne
+      f27SW   =  (D27F.f[DIR_MM0])[ksw  ];
+      f27SE   =  (D27F.f[DIR_PM0])[ks   ];//kse
+      f27NW   =  (D27F.f[DIR_MP0])[kw   ];//knw
+      f27TE   =  (D27F.f[DIR_P0P])[kzero];//kte
+      f27BW   =  (D27F.f[DIR_M0M])[kbw  ];
+      f27BE   =  (D27F.f[DIR_P0M])[kb   ];//kbe
+      f27TW   =  (D27F.f[DIR_M0P])[kw   ];//ktw
+      f27TN   =  (D27F.f[DIR_0PP])[kzero];//ktn
+      f27BS   =  (D27F.f[DIR_0MM])[kbs  ];
+      f27BN   =  (D27F.f[DIR_0PM])[kb   ];//kbn
+      f27TS   =  (D27F.f[DIR_0MP])[ks   ];//kts
       f27ZERO =  (D27F.f[DIR_000])[kzero];//kzero
-      f27TNE   = (D27F.f[DIR_PPP ])[kzero];//ktne
-      f27TSW   = (D27F.f[DIR_MMP ])[ksw  ];//ktsw
-      f27TSE   = (D27F.f[DIR_PMP ])[ks   ];//ktse
-      f27TNW   = (D27F.f[DIR_MPP ])[kw   ];//ktnw
-      f27BNE   = (D27F.f[DIR_PPM ])[kb   ];//kbne
-      f27BSW   = (D27F.f[DIR_MMM ])[kbsw ];
-      f27BSE   = (D27F.f[DIR_PMM ])[kbs  ];//kbse
-      f27BNW   = (D27F.f[DIR_MPM ])[kbw  ];//kbnw
+      f27TNE   = (D27F.f[DIR_PPP])[kzero];//ktne
+      f27TSW   = (D27F.f[DIR_MMP])[ksw  ];//ktsw
+      f27TSE   = (D27F.f[DIR_PMP])[ks   ];//ktse
+      f27TNW   = (D27F.f[DIR_MPP])[kw   ];//ktnw
+      f27BNE   = (D27F.f[DIR_PPM])[kb   ];//kbne
+      f27BSW   = (D27F.f[DIR_MMM])[kbsw ];
+      f27BSE   = (D27F.f[DIR_PMM])[kbs  ];//kbse
+      f27BNW   = (D27F.f[DIR_MPM])[kbw  ];//kbnw
 
       Conc_F_NET = f27E + f27W + f27N + f27S + f27T + f27B + f27NE + f27SW + f27SE + f27NW + 
                    f27TE + f27BW + f27BE + f27TW + f27TN + f27BS + f27BN + f27TS + f27ZERO + 
@@ -22599,33 +21786,33 @@ __global__ void scaleFCThS27(     real* DC,
       f_BSE  = fbseF[kbs];
       f_BNW  = fbnwF[kbw];
       //////////////////////////////////////////////////////////////////////////////////
-      f27E    =  (D27F.f[DIR_P00   ])[kzero];//ke
-      f27W    =  (D27F.f[DIR_M00   ])[kw   ];
-      f27N    =  (D27F.f[DIR_0P0   ])[kzero];//kn
-      f27S    =  (D27F.f[DIR_0M0   ])[ks   ];
-      f27T    =  (D27F.f[DIR_00P   ])[kzero];//kt
-      f27B    =  (D27F.f[DIR_00M   ])[kb   ];
-      f27NE   =  (D27F.f[DIR_PP0  ])[kzero];//kne
-      f27SW   =  (D27F.f[DIR_MM0  ])[ksw  ];
-      f27SE   =  (D27F.f[DIR_PM0  ])[ks   ];//kse
-      f27NW   =  (D27F.f[DIR_MP0  ])[kw   ];//knw
-      f27TE   =  (D27F.f[DIR_P0P  ])[kzero];//kte
-      f27BW   =  (D27F.f[DIR_M0M  ])[kbw  ];
-      f27BE   =  (D27F.f[DIR_P0M  ])[kb   ];//kbe
-      f27TW   =  (D27F.f[DIR_M0P  ])[kw   ];//ktw
-      f27TN   =  (D27F.f[DIR_0PP  ])[kzero];//ktn
-      f27BS   =  (D27F.f[DIR_0MM  ])[kbs  ];
-      f27BN   =  (D27F.f[DIR_0PM  ])[kb   ];//kbn
-      f27TS   =  (D27F.f[DIR_0MP  ])[ks   ];//kts
+      f27E    =  (D27F.f[DIR_P00])[kzero];//ke
+      f27W    =  (D27F.f[DIR_M00])[kw   ];
+      f27N    =  (D27F.f[DIR_0P0])[kzero];//kn
+      f27S    =  (D27F.f[DIR_0M0])[ks   ];
+      f27T    =  (D27F.f[DIR_00P])[kzero];//kt
+      f27B    =  (D27F.f[DIR_00M])[kb   ];
+      f27NE   =  (D27F.f[DIR_PP0])[kzero];//kne
+      f27SW   =  (D27F.f[DIR_MM0])[ksw  ];
+      f27SE   =  (D27F.f[DIR_PM0])[ks   ];//kse
+      f27NW   =  (D27F.f[DIR_MP0])[kw   ];//knw
+      f27TE   =  (D27F.f[DIR_P0P])[kzero];//kte
+      f27BW   =  (D27F.f[DIR_M0M])[kbw  ];
+      f27BE   =  (D27F.f[DIR_P0M])[kb   ];//kbe
+      f27TW   =  (D27F.f[DIR_M0P])[kw   ];//ktw
+      f27TN   =  (D27F.f[DIR_0PP])[kzero];//ktn
+      f27BS   =  (D27F.f[DIR_0MM])[kbs  ];
+      f27BN   =  (D27F.f[DIR_0PM])[kb   ];//kbn
+      f27TS   =  (D27F.f[DIR_0MP])[ks   ];//kts
       f27ZERO =  (D27F.f[DIR_000])[kzero];//kzero
-      f27TNE   = (D27F.f[DIR_PPP ])[kzero];//ktne
-      f27TSW   = (D27F.f[DIR_MMP ])[ksw  ];//ktsw
-      f27TSE   = (D27F.f[DIR_PMP ])[ks   ];//ktse
-      f27TNW   = (D27F.f[DIR_MPP ])[kw   ];//ktnw
-      f27BNE   = (D27F.f[DIR_PPM ])[kb   ];//kbne
-      f27BSW   = (D27F.f[DIR_MMM ])[kbsw ];
-      f27BSE   = (D27F.f[DIR_PMM ])[kbs  ];//kbse
-      f27BNW   = (D27F.f[DIR_MPM ])[kbw  ];//kbnw
+      f27TNE   = (D27F.f[DIR_PPP])[kzero];//ktne
+      f27TSW   = (D27F.f[DIR_MMP])[ksw  ];//ktsw
+      f27TSE   = (D27F.f[DIR_PMP])[ks   ];//ktse
+      f27TNW   = (D27F.f[DIR_MPP])[kw   ];//ktnw
+      f27BNE   = (D27F.f[DIR_PPM])[kb   ];//kbne
+      f27BSW   = (D27F.f[DIR_MMM])[kbsw ];
+      f27BSE   = (D27F.f[DIR_PMM])[kbs  ];//kbse
+      f27BNW   = (D27F.f[DIR_MPM])[kbw  ];//kbnw
 
       Conc_F_NEB = f27E + f27W + f27N + f27S + f27T + f27B + f27NE + f27SW + f27SE + f27NW + 
                    f27TE + f27BW + f27BE + f27TW + f27TN + f27BS + f27BN + f27TS + f27ZERO + 
@@ -22739,32 +21926,32 @@ __global__ void scaleFCThS27(     real* DC,
       cu_sq=c3o2*(vx1*vx1+vx2*vx2+vx3*vx3);
 
       (D27C.f[DIR_000])[kzero] =   c8o27* Conc_C*(c1o1-cu_sq);
-      (D27C.f[DIR_P00   ])[kzero] =   c2o27* (c3o1*( Mx        )+Conc_C*(c1o1+c9o2*( vx1        )*( vx1        )-cu_sq));
-      (D27C.f[DIR_M00   ])[kw   ] =   c2o27* (c3o1*(-Mx        )+Conc_C*(c1o1+c9o2*(-vx1        )*(-vx1        )-cu_sq));
-      (D27C.f[DIR_0P0   ])[kzero] =   c2o27* (c3o1*(     My    )+Conc_C*(c1o1+c9o2*(     vx2    )*(     vx2    )-cu_sq));
-      (D27C.f[DIR_0M0   ])[ks   ] =   c2o27* (c3o1*(    -My    )+Conc_C*(c1o1+c9o2*(    -vx2    )*(    -vx2    )-cu_sq));
-      (D27C.f[DIR_00P   ])[kzero] =   c2o27* (c3o1*(         Mz)+Conc_C*(c1o1+c9o2*(         vx3)*(         vx3)-cu_sq));
-      (D27C.f[DIR_00M   ])[kb   ] =   c2o27* (c3o1*(        -Mz)+Conc_C*(c1o1+c9o2*(        -vx3)*(        -vx3)-cu_sq));
-      (D27C.f[DIR_PP0  ])[kzero] =   c1o54* (c3o1*( Mx +My    )+Conc_C*(c1o1+c9o2*( vx1+vx2    )*( vx1+vx2    )-cu_sq));
-      (D27C.f[DIR_MM0  ])[ksw  ] =   c1o54* (c3o1*(-Mx -My    )+Conc_C*(c1o1+c9o2*(-vx1-vx2    )*(-vx1-vx2    )-cu_sq));
-      (D27C.f[DIR_PM0  ])[ks   ] =   c1o54* (c3o1*( Mx -My    )+Conc_C*(c1o1+c9o2*( vx1-vx2    )*( vx1-vx2    )-cu_sq));
-      (D27C.f[DIR_MP0  ])[kw   ] =   c1o54* (c3o1*(-Mx +My    )+Conc_C*(c1o1+c9o2*(-vx1+vx2    )*(-vx1+vx2    )-cu_sq));
-      (D27C.f[DIR_P0P  ])[kzero] =   c1o54* (c3o1*( Mx     +Mz)+Conc_C*(c1o1+c9o2*( vx1    +vx3)*( vx1    +vx3)-cu_sq));
-      (D27C.f[DIR_M0M  ])[kbw  ] =   c1o54* (c3o1*(-Mx     -Mz)+Conc_C*(c1o1+c9o2*(-vx1    -vx3)*(-vx1    -vx3)-cu_sq));
-      (D27C.f[DIR_P0M  ])[kb   ] =   c1o54* (c3o1*( Mx     -Mz)+Conc_C*(c1o1+c9o2*( vx1    -vx3)*( vx1    -vx3)-cu_sq));
-      (D27C.f[DIR_M0P  ])[kw   ] =   c1o54* (c3o1*(-Mx     +Mz)+Conc_C*(c1o1+c9o2*(-vx1    +vx3)*(-vx1    +vx3)-cu_sq));
-      (D27C.f[DIR_0PP  ])[kzero] =   c1o54* (c3o1*(     My +Mz)+Conc_C*(c1o1+c9o2*(     vx2+vx3)*(     vx2+vx3)-cu_sq));
-      (D27C.f[DIR_0MM  ])[kbs  ] =   c1o54* (c3o1*(    -My -Mz)+Conc_C*(c1o1+c9o2*(    -vx2-vx3)*(    -vx2-vx3)-cu_sq));
-      (D27C.f[DIR_0PM  ])[kb   ] =   c1o54* (c3o1*(     My -Mz)+Conc_C*(c1o1+c9o2*(     vx2-vx3)*(     vx2-vx3)-cu_sq));
-      (D27C.f[DIR_0MP  ])[ks   ] =   c1o54* (c3o1*(    -My +Mz)+Conc_C*(c1o1+c9o2*(    -vx2+vx3)*(    -vx2+vx3)-cu_sq));
-      (D27C.f[DIR_PPP ])[kzero] =   c1o216*(c3o1*( Mx +My +Mz)+Conc_C*(c1o1+c9o2*( vx1+vx2+vx3)*( vx1+vx2+vx3)-cu_sq));
-      (D27C.f[DIR_MMM ])[kbsw ] =   c1o216*(c3o1*(-Mx -My -Mz)+Conc_C*(c1o1+c9o2*(-vx1-vx2-vx3)*(-vx1-vx2-vx3)-cu_sq));
-      (D27C.f[DIR_PPM ])[kb   ] =   c1o216*(c3o1*( Mx +My -Mz)+Conc_C*(c1o1+c9o2*( vx1+vx2-vx3)*( vx1+vx2-vx3)-cu_sq));
-      (D27C.f[DIR_MMP ])[ksw  ] =   c1o216*(c3o1*(-Mx -My +Mz)+Conc_C*(c1o1+c9o2*(-vx1-vx2+vx3)*(-vx1-vx2+vx3)-cu_sq));
-      (D27C.f[DIR_PMP ])[ks   ] =   c1o216*(c3o1*( Mx -My +Mz)+Conc_C*(c1o1+c9o2*( vx1-vx2+vx3)*( vx1-vx2+vx3)-cu_sq));
-      (D27C.f[DIR_MPM ])[kbw  ] =   c1o216*(c3o1*(-Mx +My -Mz)+Conc_C*(c1o1+c9o2*(-vx1+vx2-vx3)*(-vx1+vx2-vx3)-cu_sq));
-      (D27C.f[DIR_PMM ])[kbs  ] =   c1o216*(c3o1*( Mx -My -Mz)+Conc_C*(c1o1+c9o2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cu_sq));
-      (D27C.f[DIR_MPP ])[kw   ] =   c1o216*(c3o1*(-Mx +My +Mz)+Conc_C*(c1o1+c9o2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cu_sq));
+      (D27C.f[DIR_P00])[kzero] =   c2o27* (c3o1*( Mx        )+Conc_C*(c1o1+c9o2*( vx1        )*( vx1        )-cu_sq));
+      (D27C.f[DIR_M00])[kw   ] =   c2o27* (c3o1*(-Mx        )+Conc_C*(c1o1+c9o2*(-vx1        )*(-vx1        )-cu_sq));
+      (D27C.f[DIR_0P0])[kzero] =   c2o27* (c3o1*(     My    )+Conc_C*(c1o1+c9o2*(     vx2    )*(     vx2    )-cu_sq));
+      (D27C.f[DIR_0M0])[ks   ] =   c2o27* (c3o1*(    -My    )+Conc_C*(c1o1+c9o2*(    -vx2    )*(    -vx2    )-cu_sq));
+      (D27C.f[DIR_00P])[kzero] =   c2o27* (c3o1*(         Mz)+Conc_C*(c1o1+c9o2*(         vx3)*(         vx3)-cu_sq));
+      (D27C.f[DIR_00M])[kb   ] =   c2o27* (c3o1*(        -Mz)+Conc_C*(c1o1+c9o2*(        -vx3)*(        -vx3)-cu_sq));
+      (D27C.f[DIR_PP0])[kzero] =   c1o54* (c3o1*( Mx +My    )+Conc_C*(c1o1+c9o2*( vx1+vx2    )*( vx1+vx2    )-cu_sq));
+      (D27C.f[DIR_MM0])[ksw  ] =   c1o54* (c3o1*(-Mx -My    )+Conc_C*(c1o1+c9o2*(-vx1-vx2    )*(-vx1-vx2    )-cu_sq));
+      (D27C.f[DIR_PM0])[ks   ] =   c1o54* (c3o1*( Mx -My    )+Conc_C*(c1o1+c9o2*( vx1-vx2    )*( vx1-vx2    )-cu_sq));
+      (D27C.f[DIR_MP0])[kw   ] =   c1o54* (c3o1*(-Mx +My    )+Conc_C*(c1o1+c9o2*(-vx1+vx2    )*(-vx1+vx2    )-cu_sq));
+      (D27C.f[DIR_P0P])[kzero] =   c1o54* (c3o1*( Mx     +Mz)+Conc_C*(c1o1+c9o2*( vx1    +vx3)*( vx1    +vx3)-cu_sq));
+      (D27C.f[DIR_M0M])[kbw  ] =   c1o54* (c3o1*(-Mx     -Mz)+Conc_C*(c1o1+c9o2*(-vx1    -vx3)*(-vx1    -vx3)-cu_sq));
+      (D27C.f[DIR_P0M])[kb   ] =   c1o54* (c3o1*( Mx     -Mz)+Conc_C*(c1o1+c9o2*( vx1    -vx3)*( vx1    -vx3)-cu_sq));
+      (D27C.f[DIR_M0P])[kw   ] =   c1o54* (c3o1*(-Mx     +Mz)+Conc_C*(c1o1+c9o2*(-vx1    +vx3)*(-vx1    +vx3)-cu_sq));
+      (D27C.f[DIR_0PP])[kzero] =   c1o54* (c3o1*(     My +Mz)+Conc_C*(c1o1+c9o2*(     vx2+vx3)*(     vx2+vx3)-cu_sq));
+      (D27C.f[DIR_0MM])[kbs  ] =   c1o54* (c3o1*(    -My -Mz)+Conc_C*(c1o1+c9o2*(    -vx2-vx3)*(    -vx2-vx3)-cu_sq));
+      (D27C.f[DIR_0PM])[kb   ] =   c1o54* (c3o1*(     My -Mz)+Conc_C*(c1o1+c9o2*(     vx2-vx3)*(     vx2-vx3)-cu_sq));
+      (D27C.f[DIR_0MP])[ks   ] =   c1o54* (c3o1*(    -My +Mz)+Conc_C*(c1o1+c9o2*(    -vx2+vx3)*(    -vx2+vx3)-cu_sq));
+      (D27C.f[DIR_PPP])[kzero] =   c1o216*(c3o1*( Mx +My +Mz)+Conc_C*(c1o1+c9o2*( vx1+vx2+vx3)*( vx1+vx2+vx3)-cu_sq));
+      (D27C.f[DIR_MMM])[kbsw ] =   c1o216*(c3o1*(-Mx -My -Mz)+Conc_C*(c1o1+c9o2*(-vx1-vx2-vx3)*(-vx1-vx2-vx3)-cu_sq));
+      (D27C.f[DIR_PPM])[kb   ] =   c1o216*(c3o1*( Mx +My -Mz)+Conc_C*(c1o1+c9o2*( vx1+vx2-vx3)*( vx1+vx2-vx3)-cu_sq));
+      (D27C.f[DIR_MMP])[ksw  ] =   c1o216*(c3o1*(-Mx -My +Mz)+Conc_C*(c1o1+c9o2*(-vx1-vx2+vx3)*(-vx1-vx2+vx3)-cu_sq));
+      (D27C.f[DIR_PMP])[ks   ] =   c1o216*(c3o1*( Mx -My +Mz)+Conc_C*(c1o1+c9o2*( vx1-vx2+vx3)*( vx1-vx2+vx3)-cu_sq));
+      (D27C.f[DIR_MPM])[kbw  ] =   c1o216*(c3o1*(-Mx +My -Mz)+Conc_C*(c1o1+c9o2*(-vx1+vx2-vx3)*(-vx1+vx2-vx3)-cu_sq));
+      (D27C.f[DIR_PMM])[kbs  ] =   c1o216*(c3o1*( Mx -My -Mz)+Conc_C*(c1o1+c9o2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cu_sq));
+      (D27C.f[DIR_MPP])[kw   ] =   c1o216*(c3o1*(-Mx +My +Mz)+Conc_C*(c1o1+c9o2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cu_sq));
 
    }
 }
@@ -22812,8 +21999,8 @@ __global__ void scaleFCEff27(real* DC,
                                         unsigned int* neighborFX,
                                         unsigned int* neighborFY,
                                         unsigned int* neighborFZ,
-                                        unsigned int size_MatC, 
-                                        unsigned int size_MatF, 
+                                        unsigned long long numberOfLBnodesCoarse, 
+                                        unsigned long long numberOfLBnodesFine, 
                                         bool isEvenTimestep,
                                         unsigned int* posC, 
                                         unsigned int* posFSWB, 
@@ -22830,96 +22017,96 @@ __global__ void scaleFCEff27(real* DC,
    real *feF, *fwF, *fnF, *fsF, *ftF, *fbF, *fneF, *fswF, *fseF, *fnwF, *fteF, *fbwF, *fbeF, *ftwF, *ftnF, *fbsF, *fbnF, *ftsF, *fzeroF, 
       *ftneF, *ftswF, *ftseF, *ftnwF, *fbneF, *fbswF, *fbseF, *fbnwF;
 
-   feF    = &DF[DIR_P00   *size_MatF];
-   fwF    = &DF[DIR_M00   *size_MatF];
-   fnF    = &DF[DIR_0P0   *size_MatF];
-   fsF    = &DF[DIR_0M0   *size_MatF];
-   ftF    = &DF[DIR_00P   *size_MatF];
-   fbF    = &DF[DIR_00M   *size_MatF];
-   fneF   = &DF[DIR_PP0  *size_MatF];
-   fswF   = &DF[DIR_MM0  *size_MatF];
-   fseF   = &DF[DIR_PM0  *size_MatF];
-   fnwF   = &DF[DIR_MP0  *size_MatF];
-   fteF   = &DF[DIR_P0P  *size_MatF];
-   fbwF   = &DF[DIR_M0M  *size_MatF];
-   fbeF   = &DF[DIR_P0M  *size_MatF];
-   ftwF   = &DF[DIR_M0P  *size_MatF];
-   ftnF   = &DF[DIR_0PP  *size_MatF];
-   fbsF   = &DF[DIR_0MM  *size_MatF];
-   fbnF   = &DF[DIR_0PM  *size_MatF];
-   ftsF   = &DF[DIR_0MP  *size_MatF];
-   fzeroF = &DF[DIR_000*size_MatF];
-   ftneF  = &DF[DIR_PPP *size_MatF];
-   ftswF  = &DF[DIR_MMP *size_MatF];
-   ftseF  = &DF[DIR_PMP *size_MatF];
-   ftnwF  = &DF[DIR_MPP *size_MatF];
-   fbneF  = &DF[DIR_PPM *size_MatF];
-   fbswF  = &DF[DIR_MMM *size_MatF];
-   fbseF  = &DF[DIR_PMM *size_MatF];
-   fbnwF  = &DF[DIR_MPM *size_MatF];
+   feF    = &DF[DIR_P00 * numberOfLBnodesFine];
+   fwF    = &DF[DIR_M00 * numberOfLBnodesFine];
+   fnF    = &DF[DIR_0P0 * numberOfLBnodesFine];
+   fsF    = &DF[DIR_0M0 * numberOfLBnodesFine];
+   ftF    = &DF[DIR_00P * numberOfLBnodesFine];
+   fbF    = &DF[DIR_00M * numberOfLBnodesFine];
+   fneF   = &DF[DIR_PP0 * numberOfLBnodesFine];
+   fswF   = &DF[DIR_MM0 * numberOfLBnodesFine];
+   fseF   = &DF[DIR_PM0 * numberOfLBnodesFine];
+   fnwF   = &DF[DIR_MP0 * numberOfLBnodesFine];
+   fteF   = &DF[DIR_P0P * numberOfLBnodesFine];
+   fbwF   = &DF[DIR_M0M * numberOfLBnodesFine];
+   fbeF   = &DF[DIR_P0M * numberOfLBnodesFine];
+   ftwF   = &DF[DIR_M0P * numberOfLBnodesFine];
+   ftnF   = &DF[DIR_0PP * numberOfLBnodesFine];
+   fbsF   = &DF[DIR_0MM * numberOfLBnodesFine];
+   fbnF   = &DF[DIR_0PM * numberOfLBnodesFine];
+   ftsF   = &DF[DIR_0MP * numberOfLBnodesFine];
+   fzeroF = &DF[DIR_000 * numberOfLBnodesFine];
+   ftneF  = &DF[DIR_PPP * numberOfLBnodesFine];
+   ftswF  = &DF[DIR_MMP * numberOfLBnodesFine];
+   ftseF  = &DF[DIR_PMP * numberOfLBnodesFine];
+   ftnwF  = &DF[DIR_MPP * numberOfLBnodesFine];
+   fbneF  = &DF[DIR_PPM * numberOfLBnodesFine];
+   fbswF  = &DF[DIR_MMM * numberOfLBnodesFine];
+   fbseF  = &DF[DIR_PMM * numberOfLBnodesFine];
+   fbnwF  = &DF[DIR_MPM * numberOfLBnodesFine];
 
    real *feC, *fwC, *fnC, *fsC, *ftC, *fbC, *fneC, *fswC, *fseC, *fnwC, *fteC, *fbwC, *fbeC, *ftwC, *ftnC, *fbsC, *fbnC, *ftsC, *fzeroC,
       *ftneC, *ftswC, *ftseC, *ftnwC, *fbneC, *fbswC, *fbseC, *fbnwC;
 
    if (isEvenTimestep==true)
    {
-      feC    = &DC[DIR_P00   *size_MatC];
-      fwC    = &DC[DIR_M00   *size_MatC];
-      fnC    = &DC[DIR_0P0   *size_MatC];
-      fsC    = &DC[DIR_0M0   *size_MatC];
-      ftC    = &DC[DIR_00P   *size_MatC];
-      fbC    = &DC[DIR_00M   *size_MatC];
-      fneC   = &DC[DIR_PP0  *size_MatC];
-      fswC   = &DC[DIR_MM0  *size_MatC];
-      fseC   = &DC[DIR_PM0  *size_MatC];
-      fnwC   = &DC[DIR_MP0  *size_MatC];
-      fteC   = &DC[DIR_P0P  *size_MatC];
-      fbwC   = &DC[DIR_M0M  *size_MatC];
-      fbeC   = &DC[DIR_P0M  *size_MatC];
-      ftwC   = &DC[DIR_M0P  *size_MatC];
-      ftnC   = &DC[DIR_0PP  *size_MatC];
-      fbsC   = &DC[DIR_0MM  *size_MatC];
-      fbnC   = &DC[DIR_0PM  *size_MatC];
-      ftsC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      ftneC  = &DC[DIR_PPP *size_MatC];
-      ftswC  = &DC[DIR_MMP *size_MatC];
-      ftseC  = &DC[DIR_PMP *size_MatC];
-      ftnwC  = &DC[DIR_MPP *size_MatC];
-      fbneC  = &DC[DIR_PPM *size_MatC];
-      fbswC  = &DC[DIR_MMM *size_MatC];
-      fbseC  = &DC[DIR_PMM *size_MatC];
-      fbnwC  = &DC[DIR_MPM *size_MatC];
+      feC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      fwC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    } 
    else
    {
-      fwC    = &DC[DIR_P00   *size_MatC];
-      feC    = &DC[DIR_M00   *size_MatC];
-      fsC    = &DC[DIR_0P0   *size_MatC];
-      fnC    = &DC[DIR_0M0   *size_MatC];
-      fbC    = &DC[DIR_00P   *size_MatC];
-      ftC    = &DC[DIR_00M   *size_MatC];
-      fswC   = &DC[DIR_PP0  *size_MatC];
-      fneC   = &DC[DIR_MM0  *size_MatC];
-      fnwC   = &DC[DIR_PM0  *size_MatC];
-      fseC   = &DC[DIR_MP0  *size_MatC];
-      fbwC   = &DC[DIR_P0P  *size_MatC];
-      fteC   = &DC[DIR_M0M  *size_MatC];
-      ftwC   = &DC[DIR_P0M  *size_MatC];
-      fbeC   = &DC[DIR_M0P  *size_MatC];
-      fbsC   = &DC[DIR_0PP  *size_MatC];
-      ftnC   = &DC[DIR_0MM  *size_MatC];
-      ftsC   = &DC[DIR_0PM  *size_MatC];
-      fbnC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      fbswC  = &DC[DIR_PPP *size_MatC];
-      fbneC  = &DC[DIR_MMP *size_MatC];
-      fbnwC  = &DC[DIR_PMP *size_MatC];
-      fbseC  = &DC[DIR_MPP *size_MatC];
-      ftswC  = &DC[DIR_PPM *size_MatC];
-      ftneC  = &DC[DIR_MMM *size_MatC];
-      ftnwC  = &DC[DIR_PMM *size_MatC];
-      ftseC  = &DC[DIR_MPM *size_MatC];
+      fwC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      feC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    }
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  ix = threadIdx.x;  // Globaler x-Index 
@@ -23791,8 +22978,8 @@ __global__ void scaleFC27(real* DC,
                                      unsigned int* neighborFX,
                                      unsigned int* neighborFY,
                                      unsigned int* neighborFZ,
-										       unsigned int size_MatC, 
-										       unsigned int size_MatF, 
+										       unsigned long long numberOfLBnodesCoarse, 
+										       unsigned long long numberOfLBnodesFine, 
 										       bool isEvenTimestep,
                                      unsigned int* posC, 
                                      unsigned int* posFSWB, 
@@ -23808,96 +22995,96 @@ __global__ void scaleFC27(real* DC,
    real *feF, *fwF, *fnF, *fsF, *ftF, *fbF, *fneF, *fswF, *fseF, *fnwF, *fteF, *fbwF, *fbeF, *ftwF, *ftnF, *fbsF, *fbnF, *ftsF, *fzeroF, 
          *ftneF, *ftswF, *ftseF, *ftnwF, *fbneF, *fbswF, *fbseF, *fbnwF;
 
-   feF    = &DF[DIR_P00   *size_MatF];
-   fwF    = &DF[DIR_M00   *size_MatF];
-   fnF    = &DF[DIR_0P0   *size_MatF];
-   fsF    = &DF[DIR_0M0   *size_MatF];
-   ftF    = &DF[DIR_00P   *size_MatF];
-   fbF    = &DF[DIR_00M   *size_MatF];
-   fneF   = &DF[DIR_PP0  *size_MatF];
-   fswF   = &DF[DIR_MM0  *size_MatF];
-   fseF   = &DF[DIR_PM0  *size_MatF];
-   fnwF   = &DF[DIR_MP0  *size_MatF];
-   fteF   = &DF[DIR_P0P  *size_MatF];
-   fbwF   = &DF[DIR_M0M  *size_MatF];
-   fbeF   = &DF[DIR_P0M  *size_MatF];
-   ftwF   = &DF[DIR_M0P  *size_MatF];
-   ftnF   = &DF[DIR_0PP  *size_MatF];
-   fbsF   = &DF[DIR_0MM  *size_MatF];
-   fbnF   = &DF[DIR_0PM  *size_MatF];
-   ftsF   = &DF[DIR_0MP  *size_MatF];
-   fzeroF = &DF[DIR_000*size_MatF];
-   ftneF  = &DF[DIR_PPP *size_MatF];
-   ftswF  = &DF[DIR_MMP *size_MatF];
-   ftseF  = &DF[DIR_PMP *size_MatF];
-   ftnwF  = &DF[DIR_MPP *size_MatF];
-   fbneF  = &DF[DIR_PPM *size_MatF];
-   fbswF  = &DF[DIR_MMM *size_MatF];
-   fbseF  = &DF[DIR_PMM *size_MatF];
-   fbnwF  = &DF[DIR_MPM *size_MatF];
+   feF    = &DF[DIR_P00 * numberOfLBnodesFine];
+   fwF    = &DF[DIR_M00 * numberOfLBnodesFine];
+   fnF    = &DF[DIR_0P0 * numberOfLBnodesFine];
+   fsF    = &DF[DIR_0M0 * numberOfLBnodesFine];
+   ftF    = &DF[DIR_00P * numberOfLBnodesFine];
+   fbF    = &DF[DIR_00M * numberOfLBnodesFine];
+   fneF   = &DF[DIR_PP0 * numberOfLBnodesFine];
+   fswF   = &DF[DIR_MM0 * numberOfLBnodesFine];
+   fseF   = &DF[DIR_PM0 * numberOfLBnodesFine];
+   fnwF   = &DF[DIR_MP0 * numberOfLBnodesFine];
+   fteF   = &DF[DIR_P0P * numberOfLBnodesFine];
+   fbwF   = &DF[DIR_M0M * numberOfLBnodesFine];
+   fbeF   = &DF[DIR_P0M * numberOfLBnodesFine];
+   ftwF   = &DF[DIR_M0P * numberOfLBnodesFine];
+   ftnF   = &DF[DIR_0PP * numberOfLBnodesFine];
+   fbsF   = &DF[DIR_0MM * numberOfLBnodesFine];
+   fbnF   = &DF[DIR_0PM * numberOfLBnodesFine];
+   ftsF   = &DF[DIR_0MP * numberOfLBnodesFine];
+   fzeroF = &DF[DIR_000 * numberOfLBnodesFine];
+   ftneF  = &DF[DIR_PPP * numberOfLBnodesFine];
+   ftswF  = &DF[DIR_MMP * numberOfLBnodesFine];
+   ftseF  = &DF[DIR_PMP * numberOfLBnodesFine];
+   ftnwF  = &DF[DIR_MPP * numberOfLBnodesFine];
+   fbneF  = &DF[DIR_PPM * numberOfLBnodesFine];
+   fbswF  = &DF[DIR_MMM * numberOfLBnodesFine];
+   fbseF  = &DF[DIR_PMM * numberOfLBnodesFine];
+   fbnwF  = &DF[DIR_MPM * numberOfLBnodesFine];
 
    real *feC, *fwC, *fnC, *fsC, *ftC, *fbC, *fneC, *fswC, *fseC, *fnwC, *fteC, *fbwC, *fbeC, *ftwC, *ftnC, *fbsC, *fbnC, *ftsC, *fzeroC,
          *ftneC, *ftswC, *ftseC, *ftnwC, *fbneC, *fbswC, *fbseC, *fbnwC;
 
    if (isEvenTimestep==true)
    {
-      feC    = &DC[DIR_P00   *size_MatC];
-      fwC    = &DC[DIR_M00   *size_MatC];
-      fnC    = &DC[DIR_0P0   *size_MatC];
-      fsC    = &DC[DIR_0M0   *size_MatC];
-      ftC    = &DC[DIR_00P   *size_MatC];
-      fbC    = &DC[DIR_00M   *size_MatC];
-      fneC   = &DC[DIR_PP0  *size_MatC];
-      fswC   = &DC[DIR_MM0  *size_MatC];
-      fseC   = &DC[DIR_PM0  *size_MatC];
-      fnwC   = &DC[DIR_MP0  *size_MatC];
-      fteC   = &DC[DIR_P0P  *size_MatC];
-      fbwC   = &DC[DIR_M0M  *size_MatC];
-      fbeC   = &DC[DIR_P0M  *size_MatC];
-      ftwC   = &DC[DIR_M0P  *size_MatC];
-      ftnC   = &DC[DIR_0PP  *size_MatC];
-      fbsC   = &DC[DIR_0MM  *size_MatC];
-      fbnC   = &DC[DIR_0PM  *size_MatC];
-      ftsC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      ftneC  = &DC[DIR_PPP *size_MatC];
-      ftswC  = &DC[DIR_MMP *size_MatC];
-      ftseC  = &DC[DIR_PMP *size_MatC];
-      ftnwC  = &DC[DIR_MPP *size_MatC];
-      fbneC  = &DC[DIR_PPM *size_MatC];
-      fbswC  = &DC[DIR_MMM *size_MatC];
-      fbseC  = &DC[DIR_PMM *size_MatC];
-      fbnwC  = &DC[DIR_MPM *size_MatC];
+      feC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      fwC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    } 
    else
    {
-      fwC    = &DC[DIR_P00   *size_MatC];
-      feC    = &DC[DIR_M00   *size_MatC];
-      fsC    = &DC[DIR_0P0   *size_MatC];
-      fnC    = &DC[DIR_0M0   *size_MatC];
-      fbC    = &DC[DIR_00P   *size_MatC];
-      ftC    = &DC[DIR_00M   *size_MatC];
-      fswC   = &DC[DIR_PP0  *size_MatC];
-      fneC   = &DC[DIR_MM0  *size_MatC];
-      fnwC   = &DC[DIR_PM0  *size_MatC];
-      fseC   = &DC[DIR_MP0  *size_MatC];
-      fbwC   = &DC[DIR_P0P  *size_MatC];
-      fteC   = &DC[DIR_M0M  *size_MatC];
-      ftwC   = &DC[DIR_P0M  *size_MatC];
-      fbeC   = &DC[DIR_M0P  *size_MatC];
-      fbsC   = &DC[DIR_0PP  *size_MatC];
-      ftnC   = &DC[DIR_0MM  *size_MatC];
-      ftsC   = &DC[DIR_0PM  *size_MatC];
-      fbnC   = &DC[DIR_0MP  *size_MatC];
-      fzeroC = &DC[DIR_000*size_MatC];
-      fbswC  = &DC[DIR_PPP *size_MatC];
-      fbneC  = &DC[DIR_MMP *size_MatC];
-      fbnwC  = &DC[DIR_PMP *size_MatC];
-      fbseC  = &DC[DIR_MPP *size_MatC];
-      ftswC  = &DC[DIR_PPM *size_MatC];
-      ftneC  = &DC[DIR_MMM *size_MatC];
-      ftnwC  = &DC[DIR_PMM *size_MatC];
-      ftseC  = &DC[DIR_MPM *size_MatC];
+      fwC    = &DC[DIR_P00 * numberOfLBnodesCoarse];
+      feC    = &DC[DIR_M00 * numberOfLBnodesCoarse];
+      fsC    = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+      fnC    = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+      fbC    = &DC[DIR_00P * numberOfLBnodesCoarse];
+      ftC    = &DC[DIR_00M * numberOfLBnodesCoarse];
+      fswC   = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+      fneC   = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+      fnwC   = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+      fseC   = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+      fbwC   = &DC[DIR_P0P * numberOfLBnodesCoarse];
+      fteC   = &DC[DIR_M0M * numberOfLBnodesCoarse];
+      ftwC   = &DC[DIR_P0M * numberOfLBnodesCoarse];
+      fbeC   = &DC[DIR_M0P * numberOfLBnodesCoarse];
+      fbsC   = &DC[DIR_0PP * numberOfLBnodesCoarse];
+      ftnC   = &DC[DIR_0MM * numberOfLBnodesCoarse];
+      ftsC   = &DC[DIR_0PM * numberOfLBnodesCoarse];
+      fbnC   = &DC[DIR_0MP * numberOfLBnodesCoarse];
+      fzeroC = &DC[DIR_000 * numberOfLBnodesCoarse];
+      fbswC  = &DC[DIR_PPP * numberOfLBnodesCoarse];
+      fbneC  = &DC[DIR_MMP * numberOfLBnodesCoarse];
+      fbnwC  = &DC[DIR_PMP * numberOfLBnodesCoarse];
+      fbseC  = &DC[DIR_MPP * numberOfLBnodesCoarse];
+      ftswC  = &DC[DIR_PPM * numberOfLBnodesCoarse];
+      ftneC  = &DC[DIR_MMM * numberOfLBnodesCoarse];
+      ftnwC  = &DC[DIR_PMM * numberOfLBnodesCoarse];
+      ftseC  = &DC[DIR_MPM * numberOfLBnodesCoarse];
    }
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  ix = threadIdx.x;  // Globaler x-Index 
diff --git a/src/gpu/VirtualFluids_GPU/GPU/GridScaling/ScaleFC_F3_27.cu b/src/gpu/VirtualFluids_GPU/GPU/GridScaling/ScaleFC_F3_27.cu
index e7fe8b50637e97b9c8cc34025216f4d02e684c55..3b108ad4ae43bd63698f3516a207630214695797 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/GridScaling/ScaleFC_F3_27.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/GridScaling/ScaleFC_F3_27.cu
@@ -23,8 +23,8 @@ __global__ void scaleFC_comp_D3Q27F3_2018(real* DC,
 													 unsigned int* neighborFX,
 													 unsigned int* neighborFY,
 													 unsigned int* neighborFZ,
-													 unsigned int size_MatC, 
-													 unsigned int size_MatF, 
+													 unsigned long long numberOfLBnodesCoarse, 
+													 unsigned long long numberOfLBnodesFine, 
 													 bool isEvenTimestep,
 													 unsigned int* posC, 
 													 unsigned int* posFSWB, 
@@ -44,33 +44,33 @@ __global__ void scaleFC_comp_D3Q27F3_2018(real* DC,
 	   *f000source, *fMMMsource, *fMMPsource, *fMPPsource, *fMPMsource, *fPPMsource, *fPPPsource, *fPMPsource, *fPMMsource;
 
 
-   fP00source = &DF[DIR_P00   *size_MatF];
-   fM00source = &DF[DIR_M00   *size_MatF];
-   f0P0source = &DF[DIR_0P0   *size_MatF];
-   f0M0source = &DF[DIR_0M0   *size_MatF];
-   f00Psource = &DF[DIR_00P   *size_MatF];
-   f00Msource = &DF[DIR_00M   *size_MatF];
-   fPP0source = &DF[DIR_PP0  *size_MatF];
-   fMM0source = &DF[DIR_MM0  *size_MatF];
-   fPM0source = &DF[DIR_PM0  *size_MatF];
-   fMP0source = &DF[DIR_MP0  *size_MatF];
-   fP0Psource = &DF[DIR_P0P  *size_MatF];
-   fM0Msource = &DF[DIR_M0M  *size_MatF];
-   fP0Msource = &DF[DIR_P0M  *size_MatF];
-   fM0Psource = &DF[DIR_M0P  *size_MatF];
-   f0PPsource = &DF[DIR_0PP  *size_MatF];
-   f0MMsource = &DF[DIR_0MM  *size_MatF];
-   f0PMsource = &DF[DIR_0PM  *size_MatF];
-   f0MPsource = &DF[DIR_0MP  *size_MatF];
-   f000source = &DF[DIR_000*size_MatF];
-   fMMMsource = &DF[DIR_MMM *size_MatF];
-   fMMPsource = &DF[DIR_MMP *size_MatF];
-   fMPPsource = &DF[DIR_MPP *size_MatF];
-   fMPMsource = &DF[DIR_MPM *size_MatF];
-   fPPMsource = &DF[DIR_PPM *size_MatF];
-   fPPPsource = &DF[DIR_PPP *size_MatF];
-   fPMPsource = &DF[DIR_PMP *size_MatF];
-   fPMMsource = &DF[DIR_PMM *size_MatF];
+   fP00source = &DF[DIR_P00 * numberOfLBnodesFine];
+   fM00source = &DF[DIR_M00 * numberOfLBnodesFine];
+   f0P0source = &DF[DIR_0P0 * numberOfLBnodesFine];
+   f0M0source = &DF[DIR_0M0 * numberOfLBnodesFine];
+   f00Psource = &DF[DIR_00P * numberOfLBnodesFine];
+   f00Msource = &DF[DIR_00M * numberOfLBnodesFine];
+   fPP0source = &DF[DIR_PP0 * numberOfLBnodesFine];
+   fMM0source = &DF[DIR_MM0 * numberOfLBnodesFine];
+   fPM0source = &DF[DIR_PM0 * numberOfLBnodesFine];
+   fMP0source = &DF[DIR_MP0 * numberOfLBnodesFine];
+   fP0Psource = &DF[DIR_P0P * numberOfLBnodesFine];
+   fM0Msource = &DF[DIR_M0M * numberOfLBnodesFine];
+   fP0Msource = &DF[DIR_P0M * numberOfLBnodesFine];
+   fM0Psource = &DF[DIR_M0P * numberOfLBnodesFine];
+   f0PPsource = &DF[DIR_0PP * numberOfLBnodesFine];
+   f0MMsource = &DF[DIR_0MM * numberOfLBnodesFine];
+   f0PMsource = &DF[DIR_0PM * numberOfLBnodesFine];
+   f0MPsource = &DF[DIR_0MP * numberOfLBnodesFine];
+   f000source = &DF[DIR_000 * numberOfLBnodesFine];
+   fMMMsource = &DF[DIR_MMM * numberOfLBnodesFine];
+   fMMPsource = &DF[DIR_MMP * numberOfLBnodesFine];
+   fMPPsource = &DF[DIR_MPP * numberOfLBnodesFine];
+   fMPMsource = &DF[DIR_MPM * numberOfLBnodesFine];
+   fPPMsource = &DF[DIR_PPM * numberOfLBnodesFine];
+   fPPPsource = &DF[DIR_PPP * numberOfLBnodesFine];
+   fPMPsource = &DF[DIR_PMP * numberOfLBnodesFine];
+   fPMMsource = &DF[DIR_PMM * numberOfLBnodesFine];
 
    real
 	   *fP00dest, *fM00dest, *f0P0dest, *f0M0dest, *f00Pdest, *f00Mdest, *fPP0dest, *fMM0dest, *fPM0dest,
@@ -79,83 +79,83 @@ __global__ void scaleFC_comp_D3Q27F3_2018(real* DC,
 
    if (isEvenTimestep==true)
    {
-	   fP00dest = &DC[DIR_P00   *size_MatC];
-	   fM00dest = &DC[DIR_M00   *size_MatC];
-	   f0P0dest = &DC[DIR_0P0   *size_MatC];
-	   f0M0dest = &DC[DIR_0M0   *size_MatC];
-	   f00Pdest = &DC[DIR_00P   *size_MatC];
-	   f00Mdest = &DC[DIR_00M   *size_MatC];
-	   fPP0dest = &DC[DIR_PP0  *size_MatC];
-	   fMM0dest = &DC[DIR_MM0  *size_MatC];
-	   fPM0dest = &DC[DIR_PM0  *size_MatC];
-	   fMP0dest = &DC[DIR_MP0  *size_MatC];
-	   fP0Pdest = &DC[DIR_P0P  *size_MatC];
-	   fM0Mdest = &DC[DIR_M0M  *size_MatC];
-	   fP0Mdest = &DC[DIR_P0M  *size_MatC];
-	   fM0Pdest = &DC[DIR_M0P  *size_MatC];
-	   f0PPdest = &DC[DIR_0PP  *size_MatC];
-	   f0MMdest = &DC[DIR_0MM  *size_MatC];
-	   f0PMdest = &DC[DIR_0PM  *size_MatC];
-	   f0MPdest = &DC[DIR_0MP  *size_MatC];
-	   f000dest = &DC[DIR_000*size_MatC];
-	   fMMMdest = &DC[DIR_MMM *size_MatC];
-	   fMMPdest = &DC[DIR_MMP *size_MatC];
-	   fMPPdest = &DC[DIR_MPP *size_MatC];
-	   fMPMdest = &DC[DIR_MPM *size_MatC];
-	   fPPMdest = &DC[DIR_PPM *size_MatC];
-	   fPPPdest = &DC[DIR_PPP *size_MatC];
-	   fPMPdest = &DC[DIR_PMP *size_MatC];
-	   fPMMdest = &DC[DIR_PMM *size_MatC];
+	   fP00dest = &DC[DIR_P00 * numberOfLBnodesCoarse];
+	   fM00dest = &DC[DIR_M00 * numberOfLBnodesCoarse];
+	   f0P0dest = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+	   f0M0dest = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+	   f00Pdest = &DC[DIR_00P * numberOfLBnodesCoarse];
+	   f00Mdest = &DC[DIR_00M * numberOfLBnodesCoarse];
+	   fPP0dest = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+	   fMM0dest = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+	   fPM0dest = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+	   fMP0dest = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+	   fP0Pdest = &DC[DIR_P0P * numberOfLBnodesCoarse];
+	   fM0Mdest = &DC[DIR_M0M * numberOfLBnodesCoarse];
+	   fP0Mdest = &DC[DIR_P0M * numberOfLBnodesCoarse];
+	   fM0Pdest = &DC[DIR_M0P * numberOfLBnodesCoarse];
+	   f0PPdest = &DC[DIR_0PP * numberOfLBnodesCoarse];
+	   f0MMdest = &DC[DIR_0MM * numberOfLBnodesCoarse];
+	   f0PMdest = &DC[DIR_0PM * numberOfLBnodesCoarse];
+	   f0MPdest = &DC[DIR_0MP * numberOfLBnodesCoarse];
+	   f000dest = &DC[DIR_000 * numberOfLBnodesCoarse];
+	   fMMMdest = &DC[DIR_MMM * numberOfLBnodesCoarse];
+	   fMMPdest = &DC[DIR_MMP * numberOfLBnodesCoarse];
+	   fMPPdest = &DC[DIR_MPP * numberOfLBnodesCoarse];
+	   fMPMdest = &DC[DIR_MPM * numberOfLBnodesCoarse];
+	   fPPMdest = &DC[DIR_PPM * numberOfLBnodesCoarse];
+	   fPPPdest = &DC[DIR_PPP * numberOfLBnodesCoarse];
+	   fPMPdest = &DC[DIR_PMP * numberOfLBnodesCoarse];
+	   fPMMdest = &DC[DIR_PMM * numberOfLBnodesCoarse];
    } 
    else
    {
-	   fP00dest = &DC[DIR_M00   *size_MatC];
-	   fM00dest = &DC[DIR_P00   *size_MatC];
-	   f0P0dest = &DC[DIR_0M0   *size_MatC];
-	   f0M0dest = &DC[DIR_0P0   *size_MatC];
-	   f00Pdest = &DC[DIR_00M   *size_MatC];
-	   f00Mdest = &DC[DIR_00P   *size_MatC];
-	   fPP0dest = &DC[DIR_MM0  *size_MatC];
-	   fMM0dest = &DC[DIR_PP0  *size_MatC];
-	   fPM0dest = &DC[DIR_MP0  *size_MatC];
-	   fMP0dest = &DC[DIR_PM0  *size_MatC];
-	   fP0Pdest = &DC[DIR_M0M  *size_MatC];
-	   fM0Mdest = &DC[DIR_P0P  *size_MatC];
-	   fP0Mdest = &DC[DIR_M0P  *size_MatC];
-	   fM0Pdest = &DC[DIR_P0M  *size_MatC];
-	   f0PPdest = &DC[DIR_0MM  *size_MatC];
-	   f0MMdest = &DC[DIR_0PP  *size_MatC];
-	   f0PMdest = &DC[DIR_0MP  *size_MatC];
-	   f0MPdest = &DC[DIR_0PM  *size_MatC];
-	   f000dest = &DC[DIR_000*size_MatC];
-	   fMMMdest = &DC[DIR_PPP *size_MatC];
-	   fMMPdest = &DC[DIR_PPM *size_MatC];
-	   fMPPdest = &DC[DIR_PMM *size_MatC];
-	   fMPMdest = &DC[DIR_PMP *size_MatC];
-	   fPPMdest = &DC[DIR_MMP *size_MatC];
-	   fPPPdest = &DC[DIR_MMM *size_MatC];
-	   fPMPdest = &DC[DIR_MPM *size_MatC];
-	   fPMMdest = &DC[DIR_MPP *size_MatC];
+	   fP00dest = &DC[DIR_M00 * numberOfLBnodesCoarse];
+	   fM00dest = &DC[DIR_P00 * numberOfLBnodesCoarse];
+	   f0P0dest = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+	   f0M0dest = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+	   f00Pdest = &DC[DIR_00M * numberOfLBnodesCoarse];
+	   f00Mdest = &DC[DIR_00P * numberOfLBnodesCoarse];
+	   fPP0dest = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+	   fMM0dest = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+	   fPM0dest = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+	   fMP0dest = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+	   fP0Pdest = &DC[DIR_M0M * numberOfLBnodesCoarse];
+	   fM0Mdest = &DC[DIR_P0P * numberOfLBnodesCoarse];
+	   fP0Mdest = &DC[DIR_M0P * numberOfLBnodesCoarse];
+	   fM0Pdest = &DC[DIR_P0M * numberOfLBnodesCoarse];
+	   f0PPdest = &DC[DIR_0MM * numberOfLBnodesCoarse];
+	   f0MMdest = &DC[DIR_0PP * numberOfLBnodesCoarse];
+	   f0PMdest = &DC[DIR_0MP * numberOfLBnodesCoarse];
+	   f0MPdest = &DC[DIR_0PM * numberOfLBnodesCoarse];
+	   f000dest = &DC[DIR_000 * numberOfLBnodesCoarse];
+	   fMMMdest = &DC[DIR_PPP * numberOfLBnodesCoarse];
+	   fMMPdest = &DC[DIR_PPM * numberOfLBnodesCoarse];
+	   fMPPdest = &DC[DIR_PMM * numberOfLBnodesCoarse];
+	   fMPMdest = &DC[DIR_PMP * numberOfLBnodesCoarse];
+	   fPPMdest = &DC[DIR_MMP * numberOfLBnodesCoarse];
+	   fPPPdest = &DC[DIR_MMM * numberOfLBnodesCoarse];
+	   fPMPdest = &DC[DIR_MPM * numberOfLBnodesCoarse];
+	   fPMMdest = &DC[DIR_MPP * numberOfLBnodesCoarse];
    }
 
    Distributions6 G;
    if (isEvenTimestep == true)
    {
-	   G.g[DIR_P00] = &G6[DIR_P00   *size_MatC];
-	   G.g[DIR_M00] = &G6[DIR_M00   *size_MatC];
-	   G.g[DIR_0P0] = &G6[DIR_0P0   *size_MatC];
-	   G.g[DIR_0M0] = &G6[DIR_0M0   *size_MatC];
-	   G.g[DIR_00P] = &G6[DIR_00P   *size_MatC];
-	   G.g[DIR_00M] = &G6[DIR_00M   *size_MatC];
+	   G.g[DIR_P00] = &G6[DIR_P00 * numberOfLBnodesCoarse];
+	   G.g[DIR_M00] = &G6[DIR_M00 * numberOfLBnodesCoarse];
+	   G.g[DIR_0P0] = &G6[DIR_0P0 * numberOfLBnodesCoarse];
+	   G.g[DIR_0M0] = &G6[DIR_0M0 * numberOfLBnodesCoarse];
+	   G.g[DIR_00P] = &G6[DIR_00P * numberOfLBnodesCoarse];
+	   G.g[DIR_00M] = &G6[DIR_00M * numberOfLBnodesCoarse];
    }
    else
    {
-	   G.g[DIR_M00] = &G6[DIR_P00   *size_MatC];
-	   G.g[DIR_P00] = &G6[DIR_M00   *size_MatC];
-	   G.g[DIR_0M0] = &G6[DIR_0P0   *size_MatC];
-	   G.g[DIR_0P0] = &G6[DIR_0M0   *size_MatC];
-	   G.g[DIR_00M] = &G6[DIR_00P   *size_MatC];
-	   G.g[DIR_00P] = &G6[DIR_00M   *size_MatC];
+	   G.g[DIR_M00] = &G6[DIR_P00 * numberOfLBnodesCoarse];
+	   G.g[DIR_P00] = &G6[DIR_M00 * numberOfLBnodesCoarse];
+	   G.g[DIR_0M0] = &G6[DIR_0P0 * numberOfLBnodesCoarse];
+	   G.g[DIR_0P0] = &G6[DIR_0M0 * numberOfLBnodesCoarse];
+	   G.g[DIR_00M] = &G6[DIR_00P * numberOfLBnodesCoarse];
+	   G.g[DIR_00P] = &G6[DIR_00M * numberOfLBnodesCoarse];
    }
 
    ////////////////////////////////////////////////////////////////////////////////
@@ -1270,8 +1270,8 @@ __global__ void scaleFC_comp_D3Q27F3( real* DC,
 												 unsigned int* neighborFX,
 												 unsigned int* neighborFY,
 												 unsigned int* neighborFZ,
-												 unsigned int size_MatC, 
-												 unsigned int size_MatF, 
+												 unsigned long long numberOfLBnodesCoarse, 
+												 unsigned long long numberOfLBnodesFine, 
 												 bool isEvenTimestep,
 												 unsigned int* posC, 
 												 unsigned int* posFSWB, 
@@ -1291,33 +1291,33 @@ __global__ void scaleFC_comp_D3Q27F3( real* DC,
 	   *f000source, *fMMMsource, *fMMPsource, *fMPPsource, *fMPMsource, *fPPMsource, *fPPPsource, *fPMPsource, *fPMMsource;
 
 
-   fP00source = &DF[DIR_P00   *size_MatF];
-   fM00source = &DF[DIR_M00   *size_MatF];
-   f0P0source = &DF[DIR_0P0   *size_MatF];
-   f0M0source = &DF[DIR_0M0   *size_MatF];
-   f00Psource = &DF[DIR_00P   *size_MatF];
-   f00Msource = &DF[DIR_00M   *size_MatF];
-   fPP0source = &DF[DIR_PP0  *size_MatF];
-   fMM0source = &DF[DIR_MM0  *size_MatF];
-   fPM0source = &DF[DIR_PM0  *size_MatF];
-   fMP0source = &DF[DIR_MP0  *size_MatF];
-   fP0Psource = &DF[DIR_P0P  *size_MatF];
-   fM0Msource = &DF[DIR_M0M  *size_MatF];
-   fP0Msource = &DF[DIR_P0M  *size_MatF];
-   fM0Psource = &DF[DIR_M0P  *size_MatF];
-   f0PPsource = &DF[DIR_0PP  *size_MatF];
-   f0MMsource = &DF[DIR_0MM  *size_MatF];
-   f0PMsource = &DF[DIR_0PM  *size_MatF];
-   f0MPsource = &DF[DIR_0MP  *size_MatF];
-   f000source = &DF[DIR_000*size_MatF];
-   fMMMsource = &DF[DIR_MMM *size_MatF];
-   fMMPsource = &DF[DIR_MMP *size_MatF];
-   fMPPsource = &DF[DIR_MPP *size_MatF];
-   fMPMsource = &DF[DIR_MPM *size_MatF];
-   fPPMsource = &DF[DIR_PPM *size_MatF];
-   fPPPsource = &DF[DIR_PPP *size_MatF];
-   fPMPsource = &DF[DIR_PMP *size_MatF];
-   fPMMsource = &DF[DIR_PMM *size_MatF];
+   fP00source = &DF[DIR_P00 * numberOfLBnodesFine];
+   fM00source = &DF[DIR_M00 * numberOfLBnodesFine];
+   f0P0source = &DF[DIR_0P0 * numberOfLBnodesFine];
+   f0M0source = &DF[DIR_0M0 * numberOfLBnodesFine];
+   f00Psource = &DF[DIR_00P * numberOfLBnodesFine];
+   f00Msource = &DF[DIR_00M * numberOfLBnodesFine];
+   fPP0source = &DF[DIR_PP0 * numberOfLBnodesFine];
+   fMM0source = &DF[DIR_MM0 * numberOfLBnodesFine];
+   fPM0source = &DF[DIR_PM0 * numberOfLBnodesFine];
+   fMP0source = &DF[DIR_MP0 * numberOfLBnodesFine];
+   fP0Psource = &DF[DIR_P0P * numberOfLBnodesFine];
+   fM0Msource = &DF[DIR_M0M * numberOfLBnodesFine];
+   fP0Msource = &DF[DIR_P0M * numberOfLBnodesFine];
+   fM0Psource = &DF[DIR_M0P * numberOfLBnodesFine];
+   f0PPsource = &DF[DIR_0PP * numberOfLBnodesFine];
+   f0MMsource = &DF[DIR_0MM * numberOfLBnodesFine];
+   f0PMsource = &DF[DIR_0PM * numberOfLBnodesFine];
+   f0MPsource = &DF[DIR_0MP * numberOfLBnodesFine];
+   f000source = &DF[DIR_000 * numberOfLBnodesFine];
+   fMMMsource = &DF[DIR_MMM * numberOfLBnodesFine];
+   fMMPsource = &DF[DIR_MMP * numberOfLBnodesFine];
+   fMPPsource = &DF[DIR_MPP * numberOfLBnodesFine];
+   fMPMsource = &DF[DIR_MPM * numberOfLBnodesFine];
+   fPPMsource = &DF[DIR_PPM * numberOfLBnodesFine];
+   fPPPsource = &DF[DIR_PPP * numberOfLBnodesFine];
+   fPMPsource = &DF[DIR_PMP * numberOfLBnodesFine];
+   fPMMsource = &DF[DIR_PMM * numberOfLBnodesFine];
 
    real
 	   *fP00dest, *fM00dest, *f0P0dest, *f0M0dest, *f00Pdest, *f00Mdest, *fPP0dest, *fMM0dest, *fPM0dest,
@@ -1326,83 +1326,83 @@ __global__ void scaleFC_comp_D3Q27F3( real* DC,
 
    if (isEvenTimestep==true)
    {
-	   fP00dest = &DC[DIR_P00   *size_MatC];
-	   fM00dest = &DC[DIR_M00   *size_MatC];
-	   f0P0dest = &DC[DIR_0P0   *size_MatC];
-	   f0M0dest = &DC[DIR_0M0   *size_MatC];
-	   f00Pdest = &DC[DIR_00P   *size_MatC];
-	   f00Mdest = &DC[DIR_00M   *size_MatC];
-	   fPP0dest = &DC[DIR_PP0  *size_MatC];
-	   fMM0dest = &DC[DIR_MM0  *size_MatC];
-	   fPM0dest = &DC[DIR_PM0  *size_MatC];
-	   fMP0dest = &DC[DIR_MP0  *size_MatC];
-	   fP0Pdest = &DC[DIR_P0P  *size_MatC];
-	   fM0Mdest = &DC[DIR_M0M  *size_MatC];
-	   fP0Mdest = &DC[DIR_P0M  *size_MatC];
-	   fM0Pdest = &DC[DIR_M0P  *size_MatC];
-	   f0PPdest = &DC[DIR_0PP  *size_MatC];
-	   f0MMdest = &DC[DIR_0MM  *size_MatC];
-	   f0PMdest = &DC[DIR_0PM  *size_MatC];
-	   f0MPdest = &DC[DIR_0MP  *size_MatC];
-	   f000dest = &DC[DIR_000*size_MatC];
-	   fMMMdest = &DC[DIR_MMM *size_MatC];
-	   fMMPdest = &DC[DIR_MMP *size_MatC];
-	   fMPPdest = &DC[DIR_MPP *size_MatC];
-	   fMPMdest = &DC[DIR_MPM *size_MatC];
-	   fPPMdest = &DC[DIR_PPM *size_MatC];
-	   fPPPdest = &DC[DIR_PPP *size_MatC];
-	   fPMPdest = &DC[DIR_PMP *size_MatC];
-	   fPMMdest = &DC[DIR_PMM *size_MatC];
+	   fP00dest = &DC[DIR_P00 * numberOfLBnodesCoarse];
+	   fM00dest = &DC[DIR_M00 * numberOfLBnodesCoarse];
+	   f0P0dest = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+	   f0M0dest = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+	   f00Pdest = &DC[DIR_00P * numberOfLBnodesCoarse];
+	   f00Mdest = &DC[DIR_00M * numberOfLBnodesCoarse];
+	   fPP0dest = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+	   fMM0dest = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+	   fPM0dest = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+	   fMP0dest = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+	   fP0Pdest = &DC[DIR_P0P * numberOfLBnodesCoarse];
+	   fM0Mdest = &DC[DIR_M0M * numberOfLBnodesCoarse];
+	   fP0Mdest = &DC[DIR_P0M * numberOfLBnodesCoarse];
+	   fM0Pdest = &DC[DIR_M0P * numberOfLBnodesCoarse];
+	   f0PPdest = &DC[DIR_0PP * numberOfLBnodesCoarse];
+	   f0MMdest = &DC[DIR_0MM * numberOfLBnodesCoarse];
+	   f0PMdest = &DC[DIR_0PM * numberOfLBnodesCoarse];
+	   f0MPdest = &DC[DIR_0MP * numberOfLBnodesCoarse];
+	   f000dest = &DC[DIR_000 * numberOfLBnodesCoarse];
+	   fMMMdest = &DC[DIR_MMM * numberOfLBnodesCoarse];
+	   fMMPdest = &DC[DIR_MMP * numberOfLBnodesCoarse];
+	   fMPPdest = &DC[DIR_MPP * numberOfLBnodesCoarse];
+	   fMPMdest = &DC[DIR_MPM * numberOfLBnodesCoarse];
+	   fPPMdest = &DC[DIR_PPM * numberOfLBnodesCoarse];
+	   fPPPdest = &DC[DIR_PPP * numberOfLBnodesCoarse];
+	   fPMPdest = &DC[DIR_PMP * numberOfLBnodesCoarse];
+	   fPMMdest = &DC[DIR_PMM * numberOfLBnodesCoarse];
    } 
    else
    {
-	   fP00dest = &DC[DIR_M00   *size_MatC];
-	   fM00dest = &DC[DIR_P00   *size_MatC];
-	   f0P0dest = &DC[DIR_0M0   *size_MatC];
-	   f0M0dest = &DC[DIR_0P0   *size_MatC];
-	   f00Pdest = &DC[DIR_00M   *size_MatC];
-	   f00Mdest = &DC[DIR_00P   *size_MatC];
-	   fPP0dest = &DC[DIR_MM0  *size_MatC];
-	   fMM0dest = &DC[DIR_PP0  *size_MatC];
-	   fPM0dest = &DC[DIR_MP0  *size_MatC];
-	   fMP0dest = &DC[DIR_PM0  *size_MatC];
-	   fP0Pdest = &DC[DIR_M0M  *size_MatC];
-	   fM0Mdest = &DC[DIR_P0P  *size_MatC];
-	   fP0Mdest = &DC[DIR_M0P  *size_MatC];
-	   fM0Pdest = &DC[DIR_P0M  *size_MatC];
-	   f0PPdest = &DC[DIR_0MM  *size_MatC];
-	   f0MMdest = &DC[DIR_0PP  *size_MatC];
-	   f0PMdest = &DC[DIR_0MP  *size_MatC];
-	   f0MPdest = &DC[DIR_0PM  *size_MatC];
-	   f000dest = &DC[DIR_000*size_MatC];
-	   fMMMdest = &DC[DIR_PPP *size_MatC];
-	   fMMPdest = &DC[DIR_PPM *size_MatC];
-	   fMPPdest = &DC[DIR_PMM *size_MatC];
-	   fMPMdest = &DC[DIR_PMP *size_MatC];
-	   fPPMdest = &DC[DIR_MMP *size_MatC];
-	   fPPPdest = &DC[DIR_MMM *size_MatC];
-	   fPMPdest = &DC[DIR_MPM *size_MatC];
-	   fPMMdest = &DC[DIR_MPP *size_MatC];
+	   fP00dest = &DC[DIR_M00 * numberOfLBnodesCoarse];
+	   fM00dest = &DC[DIR_P00 * numberOfLBnodesCoarse];
+	   f0P0dest = &DC[DIR_0M0 * numberOfLBnodesCoarse];
+	   f0M0dest = &DC[DIR_0P0 * numberOfLBnodesCoarse];
+	   f00Pdest = &DC[DIR_00M * numberOfLBnodesCoarse];
+	   f00Mdest = &DC[DIR_00P * numberOfLBnodesCoarse];
+	   fPP0dest = &DC[DIR_MM0 * numberOfLBnodesCoarse];
+	   fMM0dest = &DC[DIR_PP0 * numberOfLBnodesCoarse];
+	   fPM0dest = &DC[DIR_MP0 * numberOfLBnodesCoarse];
+	   fMP0dest = &DC[DIR_PM0 * numberOfLBnodesCoarse];
+	   fP0Pdest = &DC[DIR_M0M * numberOfLBnodesCoarse];
+	   fM0Mdest = &DC[DIR_P0P * numberOfLBnodesCoarse];
+	   fP0Mdest = &DC[DIR_M0P * numberOfLBnodesCoarse];
+	   fM0Pdest = &DC[DIR_P0M * numberOfLBnodesCoarse];
+	   f0PPdest = &DC[DIR_0MM * numberOfLBnodesCoarse];
+	   f0MMdest = &DC[DIR_0PP * numberOfLBnodesCoarse];
+	   f0PMdest = &DC[DIR_0MP * numberOfLBnodesCoarse];
+	   f0MPdest = &DC[DIR_0PM * numberOfLBnodesCoarse];
+	   f000dest = &DC[DIR_000 * numberOfLBnodesCoarse];
+	   fMMMdest = &DC[DIR_PPP * numberOfLBnodesCoarse];
+	   fMMPdest = &DC[DIR_PPM * numberOfLBnodesCoarse];
+	   fMPPdest = &DC[DIR_PMM * numberOfLBnodesCoarse];
+	   fMPMdest = &DC[DIR_PMP * numberOfLBnodesCoarse];
+	   fPPMdest = &DC[DIR_MMP * numberOfLBnodesCoarse];
+	   fPPPdest = &DC[DIR_MMM * numberOfLBnodesCoarse];
+	   fPMPdest = &DC[DIR_MPM * numberOfLBnodesCoarse];
+	   fPMMdest = &DC[DIR_MPP * numberOfLBnodesCoarse];
    }
 
    Distributions6 G;
    if (isEvenTimestep == true)
    {
-	   G.g[DIR_P00] = &G6[DIR_P00   *size_MatC];
-	   G.g[DIR_M00] = &G6[DIR_M00   *size_MatC];
-	   G.g[DIR_0P0] = &G6[DIR_0P0   *size_MatC];
-	   G.g[DIR_0M0] = &G6[DIR_0M0   *size_MatC];
-	   G.g[DIR_00P] = &G6[DIR_00P   *size_MatC];
-	   G.g[DIR_00M] = &G6[DIR_00M   *size_MatC];
+	   G.g[DIR_P00] = &G6[DIR_P00 * numberOfLBnodesCoarse];
+	   G.g[DIR_M00] = &G6[DIR_M00 * numberOfLBnodesCoarse];
+	   G.g[DIR_0P0] = &G6[DIR_0P0 * numberOfLBnodesCoarse];
+	   G.g[DIR_0M0] = &G6[DIR_0M0 * numberOfLBnodesCoarse];
+	   G.g[DIR_00P] = &G6[DIR_00P * numberOfLBnodesCoarse];
+	   G.g[DIR_00M] = &G6[DIR_00M * numberOfLBnodesCoarse];
    }
    else
    {
-	   G.g[DIR_M00] = &G6[DIR_P00   *size_MatC];
-	   G.g[DIR_P00] = &G6[DIR_M00   *size_MatC];
-	   G.g[DIR_0M0] = &G6[DIR_0P0   *size_MatC];
-	   G.g[DIR_0P0] = &G6[DIR_0M0   *size_MatC];
-	   G.g[DIR_00M] = &G6[DIR_00P   *size_MatC];
-	   G.g[DIR_00P] = &G6[DIR_00M   *size_MatC];
+	   G.g[DIR_M00] = &G6[DIR_P00 * numberOfLBnodesCoarse];
+	   G.g[DIR_P00] = &G6[DIR_M00 * numberOfLBnodesCoarse];
+	   G.g[DIR_0M0] = &G6[DIR_0P0 * numberOfLBnodesCoarse];
+	   G.g[DIR_0P0] = &G6[DIR_0M0 * numberOfLBnodesCoarse];
+	   G.g[DIR_00M] = &G6[DIR_00P * numberOfLBnodesCoarse];
+	   G.g[DIR_00P] = &G6[DIR_00M * numberOfLBnodesCoarse];
    }
 
    ////////////////////////////////////////////////////////////////////////////////
diff --git a/src/gpu/VirtualFluids_GPU/GPU/GridScaling/scaleCF_compressible.cu b/src/gpu/VirtualFluids_GPU/GPU/GridScaling/scaleCF_compressible.cu
index f4160b89c047a7e6244a5579baae03d30b3c89cb..0724002cffa3a47820664851ffefd1c35dbe0235 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/GridScaling/scaleCF_compressible.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/GridScaling/scaleCF_compressible.cu
@@ -32,12 +32,13 @@
 //=======================================================================================
 
 #include "DataTypes.h"
-#include "Kernel/Utilities/DistributionHelper.cuh"
-#include "Kernel/Utilities/ChimeraTransformation.h"
-#include "Kernel/Utilities/ScalingHelperFunctions.h"
+#include "LBM/GPUHelperFunctions/KernelUtilities.h"
+#include "LBM/GPUHelperFunctions/ChimeraTransformation.h"
+#include "LBM/GPUHelperFunctions/ScalingUtilities.h"
 
 using namespace vf::lbm::constant;
 using namespace vf::lbm::dir;
+using namespace vf::gpu;
 
 //////////////////////////////////////////////////////////////////////////
 //! \brief Calculate the interpolated distributions on the fine destination nodes
@@ -226,8 +227,8 @@ __global__ void scaleCF_compressible(
     unsigned int* neighborXfine,
     unsigned int* neighborYfine,
     unsigned int* neighborZfine,
-    unsigned int numberOfLBnodesCoarse, 
-    unsigned int numberOfLBnodesFine, 
+    unsigned long long numberOfLBnodesCoarse, 
+    unsigned long long numberOfLBnodesFine, 
     bool isEvenTimestep,
     unsigned int* indicesCoarseMMM, 
     unsigned int* indicesFineMMM, 
@@ -237,13 +238,13 @@ __global__ void scaleCF_compressible(
     OffCF offsetCF)
 {
     ////////////////////////////////////////////////////////////////////////////////
-    //! - Get the thread index coordinates from threadId_100, blockId_100, blockDim and gridDim.
+    //! - Get the node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
     //!
-    const unsigned k_thread = vf::gpu::getNodeIndex();
+    const unsigned nodeIndex = getNodeIndex();
 
     //////////////////////////////////////////////////////////////////////////
     //! - Return for non-interface node
-    if (k_thread >= numberOfInterfaceNodes)
+    if (nodeIndex >= numberOfInterfaceNodes)
         return;
 
     //////////////////////////////////////////////////////////////////////////
@@ -252,8 +253,9 @@ __global__ void scaleCF_compressible(
     //! href="https://doi.org/10.3390/computation5020019"><b>[ M. Geier et al. (2017),
     //! DOI:10.3390/computation5020019 ]</b></a>
     //!
-    Distributions27 distFine   = vf::gpu::getDistributionReferences27(distributionsFine,   numberOfLBnodesFine,   true);
-    Distributions27 distCoarse = vf::gpu::getDistributionReferences27(distributionsCoarse, numberOfLBnodesCoarse, isEvenTimestep);
+    Distributions27 distFine, distCoarse;
+    getPointersToDistributions(distFine, distributionsFine, numberOfLBnodesFine, true);
+    getPointersToDistributions(distCoarse, distributionsCoarse, numberOfLBnodesCoarse, isEvenTimestep);
 
     ////////////////////////////////////////////////////////////////////////////////
     //! - declare local variables for source nodes
@@ -289,7 +291,7 @@ __global__ void scaleCF_compressible(
     // source node BSW = MMM
     ////////////////////////////////////////////////////////////////////////////////
     // index of the base node and its neighbors
-    unsigned int k_base_000 = indicesCoarseMMM[k_thread];
+    unsigned int k_base_000 = indicesCoarseMMM[nodeIndex];
     unsigned int k_base_M00 = neighborXcoarse [k_base_000];
     unsigned int k_base_0M0 = neighborYcoarse [k_base_000];
     unsigned int k_base_00M = neighborZcoarse [k_base_000];
@@ -452,119 +454,240 @@ __global__ void scaleCF_compressible(
     real c_000, c_100, c_010, c_001, c_200, c_020, c_002, c_110, c_101, c_011, c_111;
     real d_000, d_100, d_010, d_001, d_110, d_101, d_011, d_111;
 
-    a_000 = (-kxxMyyFromfcNEQ_PPM - kxxMyyFromfcNEQ_PPP + kxxMyyFromfcNEQ_MPM + kxxMyyFromfcNEQ_MPP -
-            kxxMyyFromfcNEQ_PMM - kxxMyyFromfcNEQ_PMP + kxxMyyFromfcNEQ_MMM + kxxMyyFromfcNEQ_MMP -
-            kxxMzzFromfcNEQ_PPM - kxxMzzFromfcNEQ_PPP + kxxMzzFromfcNEQ_MPM + kxxMzzFromfcNEQ_MPP -
-            kxxMzzFromfcNEQ_PMM - kxxMzzFromfcNEQ_PMP + kxxMzzFromfcNEQ_MMM + kxxMzzFromfcNEQ_MMP -
-            c2o1 * kxyFromfcNEQ_PPM - c2o1 * kxyFromfcNEQ_PPP - c2o1 * kxyFromfcNEQ_MPM - c2o1 * kxyFromfcNEQ_MPP +
-            c2o1 * kxyFromfcNEQ_PMM + c2o1 * kxyFromfcNEQ_PMP + c2o1 * kxyFromfcNEQ_MMM + c2o1 * kxyFromfcNEQ_MMP +
-            c2o1 * kxzFromfcNEQ_PPM - c2o1 * kxzFromfcNEQ_PPP + c2o1 * kxzFromfcNEQ_MPM - c2o1 * kxzFromfcNEQ_MPP +
-            c2o1 * kxzFromfcNEQ_PMM - c2o1 * kxzFromfcNEQ_PMP + c2o1 * kxzFromfcNEQ_MMM - c2o1 * kxzFromfcNEQ_MMP +
-            c8o1 * vx1_PPM + c8o1 * vx1_PPP + c8o1 * vx1_MPM + c8o1 * vx1_MPP + c8o1 * vx1_PMM + c8o1 * vx1_PMP +
-            c8o1 * vx1_MMM + c8o1 * vx1_MMP + c2o1 * vx2_PPM + c2o1 * vx2_PPP - c2o1 * vx2_MPM - c2o1 * vx2_MPP -
-            c2o1 * vx2_PMM - c2o1 * vx2_PMP + c2o1 * vx2_MMM + c2o1 * vx2_MMP - c2o1 * vx3_PPM + c2o1 * vx3_PPP +
-            c2o1 * vx3_MPM - c2o1 * vx3_MPP - c2o1 * vx3_PMM + c2o1 * vx3_PMP + c2o1 * vx3_MMM - c2o1 * vx3_MMP) /
-            c64o1;
-    b_000 = (c2o1 * kxxMyyFromfcNEQ_PPM + c2o1 * kxxMyyFromfcNEQ_PPP + c2o1 * kxxMyyFromfcNEQ_MPM +
-            c2o1 * kxxMyyFromfcNEQ_MPP - c2o1 * kxxMyyFromfcNEQ_PMM - c2o1 * kxxMyyFromfcNEQ_PMP -
-            c2o1 * kxxMyyFromfcNEQ_MMM - c2o1 * kxxMyyFromfcNEQ_MMP - kxxMzzFromfcNEQ_PPM - kxxMzzFromfcNEQ_PPP -
-            kxxMzzFromfcNEQ_MPM - kxxMzzFromfcNEQ_MPP + kxxMzzFromfcNEQ_PMM + kxxMzzFromfcNEQ_PMP +
-            kxxMzzFromfcNEQ_MMM + kxxMzzFromfcNEQ_MMP - c2o1 * kxyFromfcNEQ_PPM - c2o1 * kxyFromfcNEQ_PPP +
-            c2o1 * kxyFromfcNEQ_MPM + c2o1 * kxyFromfcNEQ_MPP - c2o1 * kxyFromfcNEQ_PMM - c2o1 * kxyFromfcNEQ_PMP +
-            c2o1 * kxyFromfcNEQ_MMM + c2o1 * kxyFromfcNEQ_MMP + c2o1 * kyzFromfcNEQ_PPM - c2o1 * kyzFromfcNEQ_PPP +
-            c2o1 * kyzFromfcNEQ_MPM - c2o1 * kyzFromfcNEQ_MPP + c2o1 * kyzFromfcNEQ_PMM - c2o1 * kyzFromfcNEQ_PMP +
-            c2o1 * kyzFromfcNEQ_MMM - c2o1 * kyzFromfcNEQ_MMP + c2o1 * vx1_PPM + c2o1 * vx1_PPP - c2o1 * vx1_MPM -
-            c2o1 * vx1_MPP - c2o1 * vx1_PMM - c2o1 * vx1_PMP + c2o1 * vx1_MMM + c2o1 * vx1_MMP + c8o1 * vx2_PPM +
-            c8o1 * vx2_PPP + c8o1 * vx2_MPM + c8o1 * vx2_MPP + c8o1 * vx2_PMM + c8o1 * vx2_PMP + c8o1 * vx2_MMM +
-            c8o1 * vx2_MMP - c2o1 * vx3_PPM + c2o1 * vx3_PPP - c2o1 * vx3_MPM + c2o1 * vx3_MPP + c2o1 * vx3_PMM -
-            c2o1 * vx3_PMP + c2o1 * vx3_MMM - c2o1 * vx3_MMP) /
-            c64o1;
-    c_000 = (kxxMyyFromfcNEQ_PPM - kxxMyyFromfcNEQ_PPP + kxxMyyFromfcNEQ_MPM - kxxMyyFromfcNEQ_MPP +
-            kxxMyyFromfcNEQ_PMM - kxxMyyFromfcNEQ_PMP + kxxMyyFromfcNEQ_MMM - kxxMyyFromfcNEQ_MMP -
-            c2o1 * kxxMzzFromfcNEQ_PPM + c2o1 * kxxMzzFromfcNEQ_PPP - c2o1 * kxxMzzFromfcNEQ_MPM +
-            c2o1 * kxxMzzFromfcNEQ_MPP - c2o1 * kxxMzzFromfcNEQ_PMM + c2o1 * kxxMzzFromfcNEQ_PMP -
-            c2o1 * kxxMzzFromfcNEQ_MMM + c2o1 * kxxMzzFromfcNEQ_MMP - c2o1 * kxzFromfcNEQ_PPM -
-            c2o1 * kxzFromfcNEQ_PPP + c2o1 * kxzFromfcNEQ_MPM + c2o1 * kxzFromfcNEQ_MPP - c2o1 * kxzFromfcNEQ_PMM -
-            c2o1 * kxzFromfcNEQ_PMP + c2o1 * kxzFromfcNEQ_MMM + c2o1 * kxzFromfcNEQ_MMP - c2o1 * kyzFromfcNEQ_PPM -
-            c2o1 * kyzFromfcNEQ_PPP - c2o1 * kyzFromfcNEQ_MPM - c2o1 * kyzFromfcNEQ_MPP + c2o1 * kyzFromfcNEQ_PMM +
-            c2o1 * kyzFromfcNEQ_PMP + c2o1 * kyzFromfcNEQ_MMM + c2o1 * kyzFromfcNEQ_MMP - c2o1 * vx1_PPM +
-            c2o1 * vx1_PPP + c2o1 * vx1_MPM - c2o1 * vx1_MPP - c2o1 * vx1_PMM + c2o1 * vx1_PMP + c2o1 * vx1_MMM -
-            c2o1 * vx1_MMP - c2o1 * vx2_PPM + c2o1 * vx2_PPP - c2o1 * vx2_MPM + c2o1 * vx2_MPP + c2o1 * vx2_PMM -
-            c2o1 * vx2_PMP + c2o1 * vx2_MMM - c2o1 * vx2_MMP + c8o1 * vx3_PPM + c8o1 * vx3_PPP + c8o1 * vx3_MPM +
-            c8o1 * vx3_MPP + c8o1 * vx3_PMM + c8o1 * vx3_PMP + c8o1 * vx3_MMM + c8o1 * vx3_MMP) /
-            c64o1;
-    a_100  = (vx1_PPM + vx1_PPP - vx1_MPM - vx1_MPP + vx1_PMM + vx1_PMP - vx1_MMM - vx1_MMP) / c4o1;
-    b_100  = (vx2_PPM + vx2_PPP - vx2_MPM - vx2_MPP + vx2_PMM + vx2_PMP - vx2_MMM - vx2_MMP) / c4o1;
-    c_100  = (vx3_PPM + vx3_PPP - vx3_MPM - vx3_MPP + vx3_PMM + vx3_PMP - vx3_MMM - vx3_MMP) / c4o1;
-    a_200 = (kxxMyyFromfcNEQ_PPM + kxxMyyFromfcNEQ_PPP - kxxMyyFromfcNEQ_MPM - kxxMyyFromfcNEQ_MPP +
-            kxxMyyFromfcNEQ_PMM + kxxMyyFromfcNEQ_PMP - kxxMyyFromfcNEQ_MMM - kxxMyyFromfcNEQ_MMP +
-            kxxMzzFromfcNEQ_PPM + kxxMzzFromfcNEQ_PPP - kxxMzzFromfcNEQ_MPM - kxxMzzFromfcNEQ_MPP +
-            kxxMzzFromfcNEQ_PMM + kxxMzzFromfcNEQ_PMP - kxxMzzFromfcNEQ_MMM - kxxMzzFromfcNEQ_MMP + c2o1 * vx2_PPM +
-            c2o1 * vx2_PPP - c2o1 * vx2_MPM - c2o1 * vx2_MPP - c2o1 * vx2_PMM - c2o1 * vx2_PMP + c2o1 * vx2_MMM +
-            c2o1 * vx2_MMP - c2o1 * vx3_PPM + c2o1 * vx3_PPP + c2o1 * vx3_MPM - c2o1 * vx3_MPP - c2o1 * vx3_PMM +
-            c2o1 * vx3_PMP + c2o1 * vx3_MMM - c2o1 * vx3_MMP) /
-            c16o1;
-    b_200 = (kxyFromfcNEQ_PPM + kxyFromfcNEQ_PPP - kxyFromfcNEQ_MPM - kxyFromfcNEQ_MPP + kxyFromfcNEQ_PMM +
-            kxyFromfcNEQ_PMP - kxyFromfcNEQ_MMM - kxyFromfcNEQ_MMP - c2o1 * vx1_PPM - c2o1 * vx1_PPP +
-            c2o1 * vx1_MPM + c2o1 * vx1_MPP + c2o1 * vx1_PMM + c2o1 * vx1_PMP - c2o1 * vx1_MMM - c2o1 * vx1_MMP) /
-            c8o1;
-    c_200 = (kxzFromfcNEQ_PPM + kxzFromfcNEQ_PPP - kxzFromfcNEQ_MPM - kxzFromfcNEQ_MPP + kxzFromfcNEQ_PMM +
-            kxzFromfcNEQ_PMP - kxzFromfcNEQ_MMM - kxzFromfcNEQ_MMP + c2o1 * vx1_PPM - c2o1 * vx1_PPP -
-            c2o1 * vx1_MPM + c2o1 * vx1_MPP + c2o1 * vx1_PMM - c2o1 * vx1_PMP - c2o1 * vx1_MMM + c2o1 * vx1_MMP) /
-            c8o1;
-    a_010  = (vx1_PPM + vx1_PPP + vx1_MPM + vx1_MPP - vx1_PMM - vx1_PMP - vx1_MMM - vx1_MMP) / c4o1;
-    b_010  = (vx2_PPM + vx2_PPP + vx2_MPM + vx2_MPP - vx2_PMM - vx2_PMP - vx2_MMM - vx2_MMP) / c4o1;
-    c_010  = (vx3_PPM + vx3_PPP + vx3_MPM + vx3_MPP - vx3_PMM - vx3_PMP - vx3_MMM - vx3_MMP) / c4o1;
-    a_020 = (kxyFromfcNEQ_PPM + kxyFromfcNEQ_PPP + kxyFromfcNEQ_MPM + kxyFromfcNEQ_MPP - kxyFromfcNEQ_PMM -
-            kxyFromfcNEQ_PMP - kxyFromfcNEQ_MMM - kxyFromfcNEQ_MMP - c2o1 * vx2_PPM - c2o1 * vx2_PPP +
-            c2o1 * vx2_MPM + c2o1 * vx2_MPP + c2o1 * vx2_PMM + c2o1 * vx2_PMP - c2o1 * vx2_MMM - c2o1 * vx2_MMP) /
-            c8o1;
-    b_020 = (-c2o1 * kxxMyyFromfcNEQ_PPM - c2o1 * kxxMyyFromfcNEQ_PPP - c2o1 * kxxMyyFromfcNEQ_MPM -
-            c2o1 * kxxMyyFromfcNEQ_MPP + c2o1 * kxxMyyFromfcNEQ_PMM + c2o1 * kxxMyyFromfcNEQ_PMP +
-            c2o1 * kxxMyyFromfcNEQ_MMM + c2o1 * kxxMyyFromfcNEQ_MMP + kxxMzzFromfcNEQ_PPM + kxxMzzFromfcNEQ_PPP +
-            kxxMzzFromfcNEQ_MPM + kxxMzzFromfcNEQ_MPP - kxxMzzFromfcNEQ_PMM - kxxMzzFromfcNEQ_PMP -
-            kxxMzzFromfcNEQ_MMM - kxxMzzFromfcNEQ_MMP + c2o1 * vx1_PPM + c2o1 * vx1_PPP - c2o1 * vx1_MPM -
-            c2o1 * vx1_MPP - c2o1 * vx1_PMM - c2o1 * vx1_PMP + c2o1 * vx1_MMM + c2o1 * vx1_MMP - c2o1 * vx3_PPM +
-            c2o1 * vx3_PPP - c2o1 * vx3_MPM + c2o1 * vx3_MPP + c2o1 * vx3_PMM - c2o1 * vx3_PMP + c2o1 * vx3_MMM -
-            c2o1 * vx3_MMP) /
-            c16o1;
-    c_020 = (kyzFromfcNEQ_PPM + kyzFromfcNEQ_PPP + kyzFromfcNEQ_MPM + kyzFromfcNEQ_MPP - kyzFromfcNEQ_PMM -
-            kyzFromfcNEQ_PMP - kyzFromfcNEQ_MMM - kyzFromfcNEQ_MMP + c2o1 * vx2_PPM - c2o1 * vx2_PPP +
-            c2o1 * vx2_MPM - c2o1 * vx2_MPP - c2o1 * vx2_PMM + c2o1 * vx2_PMP - c2o1 * vx2_MMM + c2o1 * vx2_MMP) /
-            c8o1;
-    a_001  = (-vx1_PPM + vx1_PPP - vx1_MPM + vx1_MPP - vx1_PMM + vx1_PMP - vx1_MMM + vx1_MMP) / c4o1;
-    b_001  = (-vx2_PPM + vx2_PPP - vx2_MPM + vx2_MPP - vx2_PMM + vx2_PMP - vx2_MMM + vx2_MMP) / c4o1;
-    c_001  = (-vx3_PPM + vx3_PPP - vx3_MPM + vx3_MPP - vx3_PMM + vx3_PMP - vx3_MMM + vx3_MMP) / c4o1;
-    a_002 = (-kxzFromfcNEQ_PPM + kxzFromfcNEQ_PPP - kxzFromfcNEQ_MPM + kxzFromfcNEQ_MPP - kxzFromfcNEQ_PMM +
-            kxzFromfcNEQ_PMP - kxzFromfcNEQ_MMM + kxzFromfcNEQ_MMP + c2o1 * vx3_PPM - c2o1 * vx3_PPP -
-            c2o1 * vx3_MPM + c2o1 * vx3_MPP + c2o1 * vx3_PMM - c2o1 * vx3_PMP - c2o1 * vx3_MMM + c2o1 * vx3_MMP) /
-            c8o1;
-    b_002 = (-kyzFromfcNEQ_PPM + kyzFromfcNEQ_PPP - kyzFromfcNEQ_MPM + kyzFromfcNEQ_MPP - kyzFromfcNEQ_PMM +
-            kyzFromfcNEQ_PMP - kyzFromfcNEQ_MMM + kyzFromfcNEQ_MMP + c2o1 * vx3_PPM - c2o1 * vx3_PPP +
-            c2o1 * vx3_MPM - c2o1 * vx3_MPP - c2o1 * vx3_PMM + c2o1 * vx3_PMP - c2o1 * vx3_MMM + c2o1 * vx3_MMP) /
-            c8o1;
-    c_002 = (-kxxMyyFromfcNEQ_PPM + kxxMyyFromfcNEQ_PPP - kxxMyyFromfcNEQ_MPM + kxxMyyFromfcNEQ_MPP -
-            kxxMyyFromfcNEQ_PMM + kxxMyyFromfcNEQ_PMP - kxxMyyFromfcNEQ_MMM + kxxMyyFromfcNEQ_MMP +
-            c2o1 * kxxMzzFromfcNEQ_PPM - c2o1 * kxxMzzFromfcNEQ_PPP + c2o1 * kxxMzzFromfcNEQ_MPM -
-            c2o1 * kxxMzzFromfcNEQ_MPP + c2o1 * kxxMzzFromfcNEQ_PMM - c2o1 * kxxMzzFromfcNEQ_PMP +
-            c2o1 * kxxMzzFromfcNEQ_MMM - c2o1 * kxxMzzFromfcNEQ_MMP - c2o1 * vx1_PPM + c2o1 * vx1_PPP +
-            c2o1 * vx1_MPM - c2o1 * vx1_MPP - c2o1 * vx1_PMM + c2o1 * vx1_PMP + c2o1 * vx1_MMM - c2o1 * vx1_MMP -
-            c2o1 * vx2_PPM + c2o1 * vx2_PPP - c2o1 * vx2_MPM + c2o1 * vx2_MPP + c2o1 * vx2_PMM - c2o1 * vx2_PMP +
-            c2o1 * vx2_MMM - c2o1 * vx2_MMP) /
-            c16o1;
-    a_110 = (vx1_PPM + vx1_PPP - vx1_MPM - vx1_MPP - vx1_PMM - vx1_PMP + vx1_MMM + vx1_MMP) / c2o1;
-    b_110 = (vx2_PPM + vx2_PPP - vx2_MPM - vx2_MPP - vx2_PMM - vx2_PMP + vx2_MMM + vx2_MMP) / c2o1;
-    c_110 = (vx3_PPM + vx3_PPP - vx3_MPM - vx3_MPP - vx3_PMM - vx3_PMP + vx3_MMM + vx3_MMP) / c2o1;
-    a_101 = (-vx1_PPM + vx1_PPP + vx1_MPM - vx1_MPP - vx1_PMM + vx1_PMP + vx1_MMM - vx1_MMP) / c2o1;
-    b_101 = (-vx2_PPM + vx2_PPP + vx2_MPM - vx2_MPP - vx2_PMM + vx2_PMP + vx2_MMM - vx2_MMP) / c2o1;
-    c_101 = (-vx3_PPM + vx3_PPP + vx3_MPM - vx3_MPP - vx3_PMM + vx3_PMP + vx3_MMM - vx3_MMP) / c2o1;
-    a_011 = (-vx1_PPM + vx1_PPP - vx1_MPM + vx1_MPP + vx1_PMM - vx1_PMP + vx1_MMM - vx1_MMP) / c2o1;
-    b_011 = (-vx2_PPM + vx2_PPP - vx2_MPM + vx2_MPP + vx2_PMM - vx2_PMP + vx2_MMM - vx2_MMP) / c2o1;
-    c_011 = (-vx3_PPM + vx3_PPP - vx3_MPM + vx3_MPP + vx3_PMM - vx3_PMP + vx3_MMM - vx3_MMP) / c2o1;
-
-    a_111 = -vx1_PPM + vx1_PPP + vx1_MPM - vx1_MPP + vx1_PMM - vx1_PMP - vx1_MMM + vx1_MMP;
-    b_111 = -vx2_PPM + vx2_PPP + vx2_MPM - vx2_MPP + vx2_PMM - vx2_PMP - vx2_MMM + vx2_MMP;
-    c_111 = -vx3_PPM + vx3_PPP + vx3_MPM - vx3_MPP + vx3_PMM - vx3_PMP - vx3_MMM + vx3_MMP;
+    // a_000 = (-kxxMyyFromfcNEQ_PPM - kxxMyyFromfcNEQ_PPP + kxxMyyFromfcNEQ_MPM + kxxMyyFromfcNEQ_MPP -
+    //         kxxMyyFromfcNEQ_PMM - kxxMyyFromfcNEQ_PMP + kxxMyyFromfcNEQ_MMM + kxxMyyFromfcNEQ_MMP -
+    //         kxxMzzFromfcNEQ_PPM - kxxMzzFromfcNEQ_PPP + kxxMzzFromfcNEQ_MPM + kxxMzzFromfcNEQ_MPP -
+    //         kxxMzzFromfcNEQ_PMM - kxxMzzFromfcNEQ_PMP + kxxMzzFromfcNEQ_MMM + kxxMzzFromfcNEQ_MMP -
+    //         c2o1 * kxyFromfcNEQ_PPM - c2o1 * kxyFromfcNEQ_PPP - c2o1 * kxyFromfcNEQ_MPM - c2o1 * kxyFromfcNEQ_MPP +
+    //         c2o1 * kxyFromfcNEQ_PMM + c2o1 * kxyFromfcNEQ_PMP + c2o1 * kxyFromfcNEQ_MMM + c2o1 * kxyFromfcNEQ_MMP +
+    //         c2o1 * kxzFromfcNEQ_PPM - c2o1 * kxzFromfcNEQ_PPP + c2o1 * kxzFromfcNEQ_MPM - c2o1 * kxzFromfcNEQ_MPP +
+    //         c2o1 * kxzFromfcNEQ_PMM - c2o1 * kxzFromfcNEQ_PMP + c2o1 * kxzFromfcNEQ_MMM - c2o1 * kxzFromfcNEQ_MMP +
+    //         c8o1 * vx1_PPM + c8o1 * vx1_PPP + c8o1 * vx1_MPM + c8o1 * vx1_MPP + c8o1 * vx1_PMM + c8o1 * vx1_PMP +
+    //         c8o1 * vx1_MMM + c8o1 * vx1_MMP + c2o1 * vx2_PPM + c2o1 * vx2_PPP - c2o1 * vx2_MPM - c2o1 * vx2_MPP -
+    //         c2o1 * vx2_PMM - c2o1 * vx2_PMP + c2o1 * vx2_MMM + c2o1 * vx2_MMP - c2o1 * vx3_PPM + c2o1 * vx3_PPP +
+    //         c2o1 * vx3_MPM - c2o1 * vx3_MPP - c2o1 * vx3_PMM + c2o1 * vx3_PMP + c2o1 * vx3_MMM - c2o1 * vx3_MMP) /
+    //         c64o1;
+    a_000 =
+        c1o64 * (c2o1 * (((kxyFromfcNEQ_MMM - kxyFromfcNEQ_PPP) + (kxyFromfcNEQ_MMP - kxyFromfcNEQ_PPM)) +
+                         ((kxyFromfcNEQ_PMM - kxyFromfcNEQ_MPP) + (kxyFromfcNEQ_PMP - kxyFromfcNEQ_MPM)) +
+                         ((kxzFromfcNEQ_MMM - kxzFromfcNEQ_PPP) + (kxzFromfcNEQ_PPM - kxzFromfcNEQ_MMP)) +
+                         ((kxzFromfcNEQ_PMM - kxzFromfcNEQ_MPP) + (kxzFromfcNEQ_MPM - kxzFromfcNEQ_PMP)) +
+                         ((vx2_PPP + vx2_MMM) + (vx2_PPM + vx2_MMP)) - ((vx2_MPP + vx2_PMM) + (vx2_MPM + vx2_PMP)) +
+                         ((vx3_PPP + vx3_MMM) - (vx3_PPM + vx3_MMP)) + ((vx3_PMP + vx3_MPM) - (vx3_MPP + vx3_PMM))) +
+                 c8o1 * (((vx1_PPP + vx1_MMM) + (vx1_PPM + vx1_MMP)) + ((vx1_MPP + vx1_PMM) + (vx1_PMP + vx1_MPM))) +
+                 ((kxxMyyFromfcNEQ_MMM - kxxMyyFromfcNEQ_PPP) + (kxxMyyFromfcNEQ_MMP - kxxMyyFromfcNEQ_PPM)) +
+                 ((kxxMyyFromfcNEQ_MPP - kxxMyyFromfcNEQ_PMM) + (kxxMyyFromfcNEQ_MPM - kxxMyyFromfcNEQ_PMP)) +
+                 ((kxxMzzFromfcNEQ_MMM - kxxMzzFromfcNEQ_PPP) + (kxxMzzFromfcNEQ_MMP - kxxMzzFromfcNEQ_PPM)) +
+                 ((kxxMzzFromfcNEQ_MPP - kxxMzzFromfcNEQ_PMM) + (kxxMzzFromfcNEQ_MPM - kxxMzzFromfcNEQ_PMP)));
+
+    // b_000 = (c2o1 * kxxMyyFromfcNEQ_PPM + c2o1 * kxxMyyFromfcNEQ_PPP + c2o1 * kxxMyyFromfcNEQ_MPM +
+    //         c2o1 * kxxMyyFromfcNEQ_MPP - c2o1 * kxxMyyFromfcNEQ_PMM - c2o1 * kxxMyyFromfcNEQ_PMP -
+    //         c2o1 * kxxMyyFromfcNEQ_MMM - c2o1 * kxxMyyFromfcNEQ_MMP - kxxMzzFromfcNEQ_PPM - kxxMzzFromfcNEQ_PPP -
+    //         kxxMzzFromfcNEQ_MPM - kxxMzzFromfcNEQ_MPP + kxxMzzFromfcNEQ_PMM + kxxMzzFromfcNEQ_PMP +
+    //         kxxMzzFromfcNEQ_MMM + kxxMzzFromfcNEQ_MMP - c2o1 * kxyFromfcNEQ_PPM - c2o1 * kxyFromfcNEQ_PPP +
+    //         c2o1 * kxyFromfcNEQ_MPM + c2o1 * kxyFromfcNEQ_MPP - c2o1 * kxyFromfcNEQ_PMM - c2o1 * kxyFromfcNEQ_PMP +
+    //         c2o1 * kxyFromfcNEQ_MMM + c2o1 * kxyFromfcNEQ_MMP + c2o1 * kyzFromfcNEQ_PPM - c2o1 * kyzFromfcNEQ_PPP +
+    //         c2o1 * kyzFromfcNEQ_MPM - c2o1 * kyzFromfcNEQ_MPP + c2o1 * kyzFromfcNEQ_PMM - c2o1 * kyzFromfcNEQ_PMP +
+    //         c2o1 * kyzFromfcNEQ_MMM - c2o1 * kyzFromfcNEQ_MMP + c2o1 * vx1_PPM + c2o1 * vx1_PPP - c2o1 * vx1_MPM -
+    //         c2o1 * vx1_MPP - c2o1 * vx1_PMM - c2o1 * vx1_PMP + c2o1 * vx1_MMM + c2o1 * vx1_MMP + c8o1 * vx2_PPM +
+    //         c8o1 * vx2_PPP + c8o1 * vx2_MPM + c8o1 * vx2_MPP + c8o1 * vx2_PMM + c8o1 * vx2_PMP + c8o1 * vx2_MMM +
+    //         c8o1 * vx2_MMP - c2o1 * vx3_PPM + c2o1 * vx3_PPP - c2o1 * vx3_MPM + c2o1 * vx3_MPP + c2o1 * vx3_PMM -
+    //         c2o1 * vx3_PMP + c2o1 * vx3_MMM - c2o1 * vx3_MMP) /
+    //         c64o1;
+    b_000 =
+        c1o64 * (c2o1 * (((kxxMyyFromfcNEQ_PPP - kxxMyyFromfcNEQ_MMM) + (kxxMyyFromfcNEQ_PPM - kxxMyyFromfcNEQ_MMP)) +
+                         ((kxxMyyFromfcNEQ_MPP - kxxMyyFromfcNEQ_PMM) + (kxxMyyFromfcNEQ_MPM - kxxMyyFromfcNEQ_PMP)) +
+                         ((kxyFromfcNEQ_MMM - kxyFromfcNEQ_PPP) + (kxyFromfcNEQ_MMP - kxyFromfcNEQ_PPM)) +
+                         ((kxyFromfcNEQ_MPP - kxyFromfcNEQ_PMM) + (kxyFromfcNEQ_MPM - kxyFromfcNEQ_PMP)) +
+                         ((kyzFromfcNEQ_MMM - kyzFromfcNEQ_PPP) + (kyzFromfcNEQ_PPM - kyzFromfcNEQ_MMP)) +
+                         ((kyzFromfcNEQ_PMM - kyzFromfcNEQ_MPP) + (kyzFromfcNEQ_MPM - kyzFromfcNEQ_PMP)) +
+                         ((vx1_PPP + vx1_MMM) + (vx1_PPM + vx1_MMP)) - ((vx1_MPM + vx1_MPP) + (vx1_PMM + vx1_PMP)) +
+                         ((vx3_PPP + vx3_MMM) - (vx3_PPM + vx3_MMP)) + ((vx3_MPP + vx3_PMM) - (vx3_MPM + vx3_PMP))) +
+                 c8o1 * (((vx2_PPP + vx2_MMM) + (vx2_PPM + vx2_MMP)) + ((vx2_MPP + vx2_PMM) + (vx2_MPM + vx2_PMP))) +
+                 ((kxxMzzFromfcNEQ_MMM - kxxMzzFromfcNEQ_PPP) + (kxxMzzFromfcNEQ_MMP - kxxMzzFromfcNEQ_PPM)) +
+                 ((kxxMzzFromfcNEQ_PMM - kxxMzzFromfcNEQ_MPP) + (kxxMzzFromfcNEQ_PMP - kxxMzzFromfcNEQ_MPM)));
+
+    // c_000 = (kxxMyyFromfcNEQ_PPM - kxxMyyFromfcNEQ_PPP + kxxMyyFromfcNEQ_MPM - kxxMyyFromfcNEQ_MPP +
+    //         kxxMyyFromfcNEQ_PMM - kxxMyyFromfcNEQ_PMP + kxxMyyFromfcNEQ_MMM - kxxMyyFromfcNEQ_MMP -
+    //         c2o1 * kxxMzzFromfcNEQ_PPM + c2o1 * kxxMzzFromfcNEQ_PPP - c2o1 * kxxMzzFromfcNEQ_MPM +
+    //         c2o1 * kxxMzzFromfcNEQ_MPP - c2o1 * kxxMzzFromfcNEQ_PMM + c2o1 * kxxMzzFromfcNEQ_PMP -
+    //         c2o1 * kxxMzzFromfcNEQ_MMM + c2o1 * kxxMzzFromfcNEQ_MMP - c2o1 * kxzFromfcNEQ_PPM -
+    //         c2o1 * kxzFromfcNEQ_PPP + c2o1 * kxzFromfcNEQ_MPM + c2o1 * kxzFromfcNEQ_MPP - c2o1 * kxzFromfcNEQ_PMM -
+    //         c2o1 * kxzFromfcNEQ_PMP + c2o1 * kxzFromfcNEQ_MMM + c2o1 * kxzFromfcNEQ_MMP - c2o1 * kyzFromfcNEQ_PPM -
+    //         c2o1 * kyzFromfcNEQ_PPP - c2o1 * kyzFromfcNEQ_MPM - c2o1 * kyzFromfcNEQ_MPP + c2o1 * kyzFromfcNEQ_PMM +
+    //         c2o1 * kyzFromfcNEQ_PMP + c2o1 * kyzFromfcNEQ_MMM + c2o1 * kyzFromfcNEQ_MMP - c2o1 * vx1_PPM +
+    //         c2o1 * vx1_PPP + c2o1 * vx1_MPM - c2o1 * vx1_MPP - c2o1 * vx1_PMM + c2o1 * vx1_PMP + c2o1 * vx1_MMM -
+    //         c2o1 * vx1_MMP - c2o1 * vx2_PPM + c2o1 * vx2_PPP - c2o1 * vx2_MPM + c2o1 * vx2_MPP + c2o1 * vx2_PMM -
+    //         c2o1 * vx2_PMP + c2o1 * vx2_MMM - c2o1 * vx2_MMP + c8o1 * vx3_PPM + c8o1 * vx3_PPP + c8o1 * vx3_MPM +
+    //         c8o1 * vx3_MPP + c8o1 * vx3_PMM + c8o1 * vx3_PMP + c8o1 * vx3_MMM + c8o1 * vx3_MMP) /
+    //         c64o1;
+    c_000 =
+        c1o64 * (c2o1 * (((kxxMzzFromfcNEQ_PPP - kxxMzzFromfcNEQ_MMM) + (kxxMzzFromfcNEQ_MMP - kxxMzzFromfcNEQ_PPM)) +
+                         ((kxxMzzFromfcNEQ_MPP - kxxMzzFromfcNEQ_PMM) + (kxxMzzFromfcNEQ_PMP - kxxMzzFromfcNEQ_MPM)) +
+                         ((kxzFromfcNEQ_MMM - kxzFromfcNEQ_PPP) + (kxzFromfcNEQ_MMP - kxzFromfcNEQ_PPM)) +
+                         ((kxzFromfcNEQ_MPP - kxzFromfcNEQ_PMM) + (kxzFromfcNEQ_MPM - kxzFromfcNEQ_PMP)) +
+                         ((kyzFromfcNEQ_MMM - kyzFromfcNEQ_PPP) + (kyzFromfcNEQ_MMP - kyzFromfcNEQ_PPM)) +
+                         ((kyzFromfcNEQ_PMM - kyzFromfcNEQ_MPP) + (kyzFromfcNEQ_PMP - kyzFromfcNEQ_MPM)) +
+                         ((vx1_PPP + vx1_MMM) - (vx1_MMP + vx1_PPM)) + ((vx1_MPM + vx1_PMP) - (vx1_MPP + vx1_PMM)) +
+                         ((vx2_PPP + vx2_MMM) - (vx2_MMP + vx2_PPM)) + ((vx2_MPP + vx2_PMM) - (vx2_MPM + vx2_PMP))) +
+                 c8o1 * (((vx3_PPP + vx3_MMM) + (vx3_PPM + vx3_MMP)) + ((vx3_PMM + vx3_MPP) + (vx3_PMP + vx3_MPM))) +
+                 ((kxxMyyFromfcNEQ_MMM - kxxMyyFromfcNEQ_PPP) + (kxxMyyFromfcNEQ_PPM - kxxMyyFromfcNEQ_MMP)) +
+                 ((kxxMyyFromfcNEQ_PMM - kxxMyyFromfcNEQ_MPP) + (kxxMyyFromfcNEQ_MPM - kxxMyyFromfcNEQ_PMP)));
+
+    // a_100  = (vx1_PPM + vx1_PPP - vx1_MPM - vx1_MPP + vx1_PMM + vx1_PMP - vx1_MMM - vx1_MMP) / c4o1;
+    a_100 = c1o4 * (((vx1_PPP - vx1_MMM) + (vx1_PPM - vx1_MMP)) + ((vx1_PMM - vx1_MPP) + (vx1_PMP - vx1_MPM)));
+
+    // b_100  = (vx2_PPM + vx2_PPP - vx2_MPM - vx2_MPP + vx2_PMM + vx2_PMP - vx2_MMM - vx2_MMP) / c4o1;
+    b_100 = c1o4 * (((vx2_PPP - vx2_MMM) + (vx2_PPM - vx2_MMP)) + ((vx2_PMM - vx2_MPP) + (vx2_PMP - vx2_MPM)));
+
+    // c_100  = (vx3_PPM + vx3_PPP - vx3_MPM - vx3_MPP + vx3_PMM + vx3_PMP - vx3_MMM - vx3_MMP) / c4o1;
+    c_100 = c1o4 * (((vx3_PPP - vx3_MMM) + (vx3_PPM - vx3_MMP)) + ((vx3_PMM - vx3_MPP) + (vx3_PMP - vx3_MPM)));
+
+    // a_200 = (kxxMyyFromfcNEQ_PPM + kxxMyyFromfcNEQ_PPP - kxxMyyFromfcNEQ_MPM - kxxMyyFromfcNEQ_MPP +
+    //         kxxMyyFromfcNEQ_PMM + kxxMyyFromfcNEQ_PMP - kxxMyyFromfcNEQ_MMM - kxxMyyFromfcNEQ_MMP +
+    //         kxxMzzFromfcNEQ_PPM + kxxMzzFromfcNEQ_PPP - kxxMzzFromfcNEQ_MPM - kxxMzzFromfcNEQ_MPP +
+    //         kxxMzzFromfcNEQ_PMM + kxxMzzFromfcNEQ_PMP - kxxMzzFromfcNEQ_MMM - kxxMzzFromfcNEQ_MMP + c2o1 * vx2_PPM +
+    //         c2o1 * vx2_PPP - c2o1 * vx2_MPM - c2o1 * vx2_MPP - c2o1 * vx2_PMM - c2o1 * vx2_PMP + c2o1 * vx2_MMM +
+    //         c2o1 * vx2_MMP - c2o1 * vx3_PPM + c2o1 * vx3_PPP + c2o1 * vx3_MPM - c2o1 * vx3_MPP - c2o1 * vx3_PMM +
+    //         c2o1 * vx3_PMP + c2o1 * vx3_MMM - c2o1 * vx3_MMP) /
+    //         c16o1;
+    a_200 =
+        c1o16 * (c2o1 * (((vx2_PPP + vx2_MMM) + (vx2_PPM - vx2_MPP)) + ((vx2_MMP - vx2_PMM) - (vx2_MPM + vx2_PMP)) +
+                         ((vx3_PPP + vx3_MMM) - (vx3_PPM + vx3_MPP)) + ((vx3_MPM + vx3_PMP) - (vx3_MMP + vx3_PMM))) +
+                 ((kxxMyyFromfcNEQ_PPP - kxxMyyFromfcNEQ_MMM) + (kxxMyyFromfcNEQ_PPM - kxxMyyFromfcNEQ_MMP)) +
+                 ((kxxMyyFromfcNEQ_PMM - kxxMyyFromfcNEQ_MPP) + (kxxMyyFromfcNEQ_PMP - kxxMyyFromfcNEQ_MPM)) +
+                 ((kxxMzzFromfcNEQ_PPP - kxxMzzFromfcNEQ_MMM) + (kxxMzzFromfcNEQ_PPM - kxxMzzFromfcNEQ_MMP)) +
+                 ((kxxMzzFromfcNEQ_PMM - kxxMzzFromfcNEQ_MPP) + (kxxMzzFromfcNEQ_PMP - kxxMzzFromfcNEQ_MPM)));
+
+    // b_200 = (kxyFromfcNEQ_PPM + kxyFromfcNEQ_PPP - kxyFromfcNEQ_MPM - kxyFromfcNEQ_MPP + kxyFromfcNEQ_PMM +
+    //         kxyFromfcNEQ_PMP - kxyFromfcNEQ_MMM - kxyFromfcNEQ_MMP - c2o1 * vx1_PPM - c2o1 * vx1_PPP +
+    //         c2o1 * vx1_MPM + c2o1 * vx1_MPP + c2o1 * vx1_PMM + c2o1 * vx1_PMP - c2o1 * vx1_MMM - c2o1 * vx1_MMP) /
+    //         c8o1;
+    b_200 =
+        c1o8 * (c2o1 * (-((vx1_PPP + vx1_MMM) + (vx1_PPM + vx1_MMP)) + ((vx1_MPP + vx1_PMM) + (vx1_MPM + vx1_PMP))) +
+                ((kxyFromfcNEQ_PPP - kxyFromfcNEQ_MMM) + (kxyFromfcNEQ_PPM - kxyFromfcNEQ_MMP)) +
+                ((kxyFromfcNEQ_PMM - kxyFromfcNEQ_MPP) + (kxyFromfcNEQ_PMP - kxyFromfcNEQ_MPM)));
+
+    // c_200 = (kxzFromfcNEQ_PPM + kxzFromfcNEQ_PPP - kxzFromfcNEQ_MPM - kxzFromfcNEQ_MPP + kxzFromfcNEQ_PMM +
+    //          kxzFromfcNEQ_PMP - kxzFromfcNEQ_MMM - kxzFromfcNEQ_MMP + c2o1 * vx1_PPM - c2o1 * vx1_PPP - c2o1 *
+    //          vx1_MPM + c2o1 * vx1_MPP + c2o1 * vx1_PMM - c2o1 * vx1_PMP - c2o1 * vx1_MMM + c2o1 * vx1_MMP) /
+    //         c8o1;
+    c_200 = c1o8 * (c2o1 * (((vx1_PPM + vx1_MMP) - (vx1_PPP + vx1_MMM)) + ((vx1_MPP + vx1_PMM) - (vx1_MPM + vx1_PMP))) +
+                    ((kxzFromfcNEQ_PPP - kxzFromfcNEQ_MMM) + (kxzFromfcNEQ_PPM - kxzFromfcNEQ_MMP)) +
+                    ((kxzFromfcNEQ_PMM - kxzFromfcNEQ_MPP) + (kxzFromfcNEQ_PMP - kxzFromfcNEQ_MPM)));
+
+    // a_010 = (vx1_PPM + vx1_PPP + vx1_MPM + vx1_MPP - vx1_PMM - vx1_PMP - vx1_MMM - vx1_MMP) / c4o1;
+    a_010 = c1o4 * (((vx1_PPP - vx1_MMM) + (vx1_PPM - vx1_MMP)) + ((vx1_MPP - vx1_PMM) + (vx1_MPM - vx1_PMP)));
+
+    // b_010 = (vx2_PPM + vx2_PPP + vx2_MPM + vx2_MPP - vx2_PMM - vx2_PMP - vx2_MMM - vx2_MMP) / c4o1;
+    b_010 = c1o4 * (((vx2_PPP - vx2_MMM) + (vx2_PPM - vx2_MMP)) + ((vx2_MPP - vx2_PMM) + (vx2_MPM - vx2_PMP)));
+
+    // c_010 = (vx3_PPM + vx3_PPP + vx3_MPM + vx3_MPP - vx3_PMM - vx3_PMP - vx3_MMM - vx3_MMP) / c4o1;
+    c_010 = c1o4 * (((vx3_PPP - vx3_MMM) + (vx3_PPM - vx3_MMP)) + ((vx3_MPP - vx3_PMM) + (vx3_MPM - vx3_PMP)));
+
+    // a_020 = (kxyFromfcNEQ_PPM + kxyFromfcNEQ_PPP + kxyFromfcNEQ_MPM + kxyFromfcNEQ_MPP - kxyFromfcNEQ_PMM -
+    //         kxyFromfcNEQ_PMP - kxyFromfcNEQ_MMM - kxyFromfcNEQ_MMP - c2o1 * vx2_PPM - c2o1 * vx2_PPP +
+    //         c2o1 * vx2_MPM + c2o1 * vx2_MPP + c2o1 * vx2_PMM + c2o1 * vx2_PMP - c2o1 * vx2_MMM - c2o1 * vx2_MMP) /
+    //         c8o1;
+    a_020 =
+        c1o8 * (c2o1 * (-((vx2_PPP + vx2_MMM) + (vx2_MMP + vx2_PPM)) + ((vx2_MPP + vx2_PMM) + (vx2_MPM + vx2_PMP))) +
+                ((kxyFromfcNEQ_PPP - kxyFromfcNEQ_MMM) + (kxyFromfcNEQ_PPM - kxyFromfcNEQ_MMP)) +
+                ((kxyFromfcNEQ_MPP - kxyFromfcNEQ_PMM) + (kxyFromfcNEQ_MPM - kxyFromfcNEQ_PMP)));
+
+    // b_020 = (-c2o1 * kxxMyyFromfcNEQ_PPM - c2o1 * kxxMyyFromfcNEQ_PPP - c2o1 * kxxMyyFromfcNEQ_MPM -
+    //         c2o1 * kxxMyyFromfcNEQ_MPP + c2o1 * kxxMyyFromfcNEQ_PMM + c2o1 * kxxMyyFromfcNEQ_PMP +
+    //         c2o1 * kxxMyyFromfcNEQ_MMM + c2o1 * kxxMyyFromfcNEQ_MMP + kxxMzzFromfcNEQ_PPM + kxxMzzFromfcNEQ_PPP +
+    //         kxxMzzFromfcNEQ_MPM + kxxMzzFromfcNEQ_MPP - kxxMzzFromfcNEQ_PMM - kxxMzzFromfcNEQ_PMP -
+    //         kxxMzzFromfcNEQ_MMM - kxxMzzFromfcNEQ_MMP + c2o1 * vx1_PPM + c2o1 * vx1_PPP - c2o1 * vx1_MPM -
+    //         c2o1 * vx1_MPP - c2o1 * vx1_PMM - c2o1 * vx1_PMP + c2o1 * vx1_MMM + c2o1 * vx1_MMP - c2o1 * vx3_PPM +
+    //         c2o1 * vx3_PPP - c2o1 * vx3_MPM + c2o1 * vx3_MPP + c2o1 * vx3_PMM - c2o1 * vx3_PMP + c2o1 * vx3_MMM -
+    //         c2o1 * vx3_MMP) /
+    //         c16o1;
+    b_020 =
+        c1o16 * (c2o1 * (((kxxMyyFromfcNEQ_MMM - kxxMyyFromfcNEQ_PPP) + (kxxMyyFromfcNEQ_MMP - kxxMyyFromfcNEQ_PPM)) +
+                         ((kxxMyyFromfcNEQ_PMM - kxxMyyFromfcNEQ_MPP) + (kxxMyyFromfcNEQ_PMP - kxxMyyFromfcNEQ_MPM)) +
+                         ((vx1_PPP + vx1_MMM) + (vx1_PPM + vx1_MMP)) - ((vx1_MPP + vx1_PMM) + (vx1_PMP + vx1_MPM)) +
+                         ((vx3_PPP + vx3_MMM) - (vx3_PPM + vx3_MMP)) + ((vx3_MPP + vx3_PMM) - (vx3_MPM + vx3_PMP))) +
+                 ((kxxMzzFromfcNEQ_PPP - kxxMzzFromfcNEQ_MMM) + (kxxMzzFromfcNEQ_PPM - kxxMzzFromfcNEQ_MMP)) +
+                 ((kxxMzzFromfcNEQ_MPP - kxxMzzFromfcNEQ_PMM) + (kxxMzzFromfcNEQ_MPM - kxxMzzFromfcNEQ_PMP)));
+
+    // c_020 = (kyzFromfcNEQ_PPM + kyzFromfcNEQ_PPP + kyzFromfcNEQ_MPM + kyzFromfcNEQ_MPP - kyzFromfcNEQ_PMM -
+    //          kyzFromfcNEQ_PMP - kyzFromfcNEQ_MMM - kyzFromfcNEQ_MMP + c2o1 * vx2_PPM - c2o1 * vx2_PPP + c2o1 *
+    //          vx2_MPM - c2o1 * vx2_MPP - c2o1 * vx2_PMM + c2o1 * vx2_PMP - c2o1 * vx2_MMM + c2o1 * vx2_MMP) /
+    //         c8o1;
+    c_020 = c1o8 * (c2o1 * (((vx2_MMP + vx2_PPM) - (vx2_PPP + vx2_MMM)) + ((vx2_PMP + vx2_MPM) - (vx2_MPP + vx2_PMM))) +
+                    ((kyzFromfcNEQ_PPP - kyzFromfcNEQ_MMM) + (kyzFromfcNEQ_PPM - kyzFromfcNEQ_MMP)) +
+                    ((kyzFromfcNEQ_MPP - kyzFromfcNEQ_PMM) + (kyzFromfcNEQ_MPM - kyzFromfcNEQ_PMP)));
+
+    // a_001  = (-vx1_PPM + vx1_PPP - vx1_MPM + vx1_MPP - vx1_PMM + vx1_PMP - vx1_MMM + vx1_MMP) / c4o1;
+    a_001 = c1o4 * (((vx1_PPP - vx1_MMM) + (vx1_MMP - vx1_PPM)) + ((vx1_MPP - vx1_PMM) + (vx1_PMP - vx1_MPM)));
+
+    // b_001  = (-vx2_PPM + vx2_PPP - vx2_MPM + vx2_MPP - vx2_PMM + vx2_PMP - vx2_MMM + vx2_MMP) / c4o1;
+    b_001 = c1o4 * (((vx2_PPP - vx2_MMM) + (vx2_MMP - vx2_PPM)) + ((vx2_MPP - vx2_PMM) + (vx2_PMP - vx2_MPM)));
+
+    // c_001  = (-vx3_PPM + vx3_PPP - vx3_MPM + vx3_MPP - vx3_PMM + vx3_PMP - vx3_MMM + vx3_MMP) / c4o1;
+    c_001 = c1o4 * (((vx3_PPP - vx3_MMM) + (vx3_MMP - vx3_PPM)) + ((vx3_MPP - vx3_PMM) + (vx3_PMP - vx3_MPM)));
+
+    // a_002 = (-kxzFromfcNEQ_PPM + kxzFromfcNEQ_PPP - kxzFromfcNEQ_MPM + kxzFromfcNEQ_MPP - kxzFromfcNEQ_PMM +
+    //         kxzFromfcNEQ_PMP - kxzFromfcNEQ_MMM + kxzFromfcNEQ_MMP + c2o1 * vx3_PPM - c2o1 * vx3_PPP -
+    //         c2o1 * vx3_MPM + c2o1 * vx3_MPP + c2o1 * vx3_PMM - c2o1 * vx3_PMP - c2o1 * vx3_MMM + c2o1 * vx3_MMP) /
+    //         c8o1;
+    a_002 = c1o8 * (c2o1 * (((vx3_PPM + vx3_MMP) - (vx3_PPP + vx3_MMM)) + ((vx3_MPP + vx3_PMM) - (vx3_PMP + vx3_MPM))) +
+                    ((kxzFromfcNEQ_PPP - kxzFromfcNEQ_MMM) + (kxzFromfcNEQ_MMP - kxzFromfcNEQ_PPM)) +
+                    ((kxzFromfcNEQ_PMP - kxzFromfcNEQ_MPM) + (kxzFromfcNEQ_MPP - kxzFromfcNEQ_PMM)));
+
+    // b_002 = (-kyzFromfcNEQ_PPM + kyzFromfcNEQ_PPP - kyzFromfcNEQ_MPM + kyzFromfcNEQ_MPP - kyzFromfcNEQ_PMM +
+    //          kyzFromfcNEQ_PMP - kyzFromfcNEQ_MMM + kyzFromfcNEQ_MMP + c2o1 * vx3_PPM - c2o1 * vx3_PPP + c2o1 *
+    //          vx3_MPM - c2o1 * vx3_MPP - c2o1 * vx3_PMM + c2o1 * vx3_PMP - c2o1 * vx3_MMM + c2o1 * vx3_MMP) /
+    //         c8o1;
+    b_002 = c1o8 * (c2o1 * (((vx3_PPM + vx3_MMP) - (vx3_PPP + vx3_MMM)) + ((vx3_MPM + vx3_PMP) - (vx3_PMM + vx3_MPP))) +
+                    ((kyzFromfcNEQ_PPP - kyzFromfcNEQ_MMM) + (kyzFromfcNEQ_MMP - kyzFromfcNEQ_PPM)) +
+                    ((kyzFromfcNEQ_PMP - kyzFromfcNEQ_MPM) + (kyzFromfcNEQ_MPP - kyzFromfcNEQ_PMM)));
+
+    // c_002 = (-kxxMyyFromfcNEQ_PPM + kxxMyyFromfcNEQ_PPP - kxxMyyFromfcNEQ_MPM + kxxMyyFromfcNEQ_MPP -
+    //         kxxMyyFromfcNEQ_PMM + kxxMyyFromfcNEQ_PMP - kxxMyyFromfcNEQ_MMM + kxxMyyFromfcNEQ_MMP +
+    //         c2o1 * kxxMzzFromfcNEQ_PPM - c2o1 * kxxMzzFromfcNEQ_PPP + c2o1 * kxxMzzFromfcNEQ_MPM -
+    //         c2o1 * kxxMzzFromfcNEQ_MPP + c2o1 * kxxMzzFromfcNEQ_PMM - c2o1 * kxxMzzFromfcNEQ_PMP +
+    //         c2o1 * kxxMzzFromfcNEQ_MMM - c2o1 * kxxMzzFromfcNEQ_MMP - c2o1 * vx1_PPM + c2o1 * vx1_PPP +
+    //         c2o1 * vx1_MPM - c2o1 * vx1_MPP - c2o1 * vx1_PMM + c2o1 * vx1_PMP + c2o1 * vx1_MMM - c2o1 * vx1_MMP -
+    //         c2o1 * vx2_PPM + c2o1 * vx2_PPP - c2o1 * vx2_MPM + c2o1 * vx2_MPP + c2o1 * vx2_PMM - c2o1 * vx2_PMP +
+    //         c2o1 * vx2_MMM - c2o1 * vx2_MMP) /
+    //         c16o1;
+    c_002 =
+        c1o16 * (c2o1 * (((kxxMzzFromfcNEQ_MMM - kxxMzzFromfcNEQ_PPP) + (kxxMzzFromfcNEQ_PPM - kxxMzzFromfcNEQ_MMP)) +
+                         ((kxxMzzFromfcNEQ_MPM - kxxMzzFromfcNEQ_PMP) + (kxxMzzFromfcNEQ_PMM - kxxMzzFromfcNEQ_MPP)) +
+                         ((vx1_PPP + vx1_MMM) - (vx1_MMP + vx1_PPM)) + ((vx1_MPM + vx1_PMP) - (vx1_PMM + vx1_MPP)) +
+                         ((vx2_PPP + vx2_MMM) - (vx2_MMP + vx2_PPM)) + ((vx2_PMM + vx2_MPP) - (vx2_MPM + vx2_PMP))) +
+                 ((kxxMyyFromfcNEQ_PPP - kxxMyyFromfcNEQ_MMM) + (kxxMyyFromfcNEQ_MMP - kxxMyyFromfcNEQ_PPM)) +
+                 ((kxxMyyFromfcNEQ_PMP - kxxMyyFromfcNEQ_MPM) + (kxxMyyFromfcNEQ_MPP - kxxMyyFromfcNEQ_PMM)));
+
+    // a_110 = (vx1_PPM + vx1_PPP - vx1_MPM - vx1_MPP - vx1_PMM - vx1_PMP + vx1_MMM + vx1_MMP) / c2o1;
+    // b_110 = (vx2_PPM + vx2_PPP - vx2_MPM - vx2_MPP - vx2_PMM - vx2_PMP + vx2_MMM + vx2_MMP) / c2o1;
+    // c_110 = (vx3_PPM + vx3_PPP - vx3_MPM - vx3_MPP - vx3_PMM - vx3_PMP + vx3_MMM + vx3_MMP) / c2o1;
+    a_110 = c1o2 * (((vx1_PPP + vx1_MMM) + (vx1_MMP + vx1_PPM)) - ((vx1_MPM + vx1_PMP) + (vx1_PMM + vx1_MPP)));
+    b_110 = c1o2 * (((vx2_PPP + vx2_MMM) + (vx2_MMP + vx2_PPM)) - ((vx2_MPM + vx2_PMP) + (vx2_PMM + vx2_MPP)));
+    c_110 = c1o2 * (((vx3_PPP + vx3_MMM) + (vx3_MMP + vx3_PPM)) - ((vx3_MPM + vx3_PMP) + (vx3_PMM + vx3_MPP)));
+
+    // a_101 = (-vx1_PPM + vx1_PPP + vx1_MPM - vx1_MPP - vx1_PMM + vx1_PMP + vx1_MMM - vx1_MMP) / c2o1;
+    // b_101 = (-vx2_PPM + vx2_PPP + vx2_MPM - vx2_MPP - vx2_PMM + vx2_PMP + vx2_MMM - vx2_MMP) / c2o1;
+    // c_101 = (-vx3_PPM + vx3_PPP + vx3_MPM - vx3_MPP - vx3_PMM + vx3_PMP + vx3_MMM - vx3_MMP) / c2o1;
+    a_101 = c1o2 * (((vx1_PPP + vx1_MMM) - (vx1_MMP + vx1_PPM)) + ((vx1_MPM + vx1_PMP) - (vx1_PMM + vx1_MPP)));
+    b_101 = c1o2 * (((vx2_PPP + vx2_MMM) - (vx2_MMP + vx2_PPM)) + ((vx2_MPM + vx2_PMP) - (vx2_PMM + vx2_MPP)));
+    c_101 = c1o2 * (((vx3_PPP + vx3_MMM) - (vx3_MMP + vx3_PPM)) + ((vx3_MPM + vx3_PMP) - (vx3_PMM + vx3_MPP)));
+
+    // a_011 = (-vx1_PPM + vx1_PPP - vx1_MPM + vx1_MPP + vx1_PMM - vx1_PMP + vx1_MMM - vx1_MMP) / c2o1;
+    // b_011 = (-vx2_PPM + vx2_PPP - vx2_MPM + vx2_MPP + vx2_PMM - vx2_PMP + vx2_MMM - vx2_MMP) / c2o1;
+    // c_011 = (-vx3_PPM + vx3_PPP - vx3_MPM + vx3_MPP + vx3_PMM - vx3_PMP + vx3_MMM - vx3_MMP) / c2o1;
+    a_011 = c1o2 * (((vx1_PPP + vx1_MMM) - (vx1_MMP + vx1_PPM)) + ((vx1_PMM + vx1_MPP) - (vx1_MPM + vx1_PMP)));
+    b_011 = c1o2 * (((vx2_PPP + vx2_MMM) - (vx2_MMP + vx2_PPM)) + ((vx2_PMM + vx2_MPP) - (vx2_MPM + vx2_PMP)));
+    c_011 = c1o2 * (((vx3_PPP + vx3_MMM) - (vx3_MMP + vx3_PPM)) + ((vx3_PMM + vx3_MPP) - (vx3_MPM + vx3_PMP)));
+
+    // a_111 = -vx1_PPM + vx1_PPP + vx1_MPM - vx1_MPP + vx1_PMM - vx1_PMP - vx1_MMM + vx1_MMP;
+    // b_111 = -vx2_PPM + vx2_PPP + vx2_MPM - vx2_MPP + vx2_PMM - vx2_PMP - vx2_MMM + vx2_MMP;
+    // c_111 = -vx3_PPM + vx3_PPP + vx3_MPM - vx3_MPP + vx3_PMM - vx3_PMP - vx3_MMM + vx3_MMP;
+    a_111 = ((vx1_PPP - vx1_MMM) + (vx1_MMP - vx1_PPM)) + ((vx1_MPM - vx1_PMP) + (vx1_PMM - vx1_MPP));
+    b_111 = ((vx2_PPP - vx2_MMM) + (vx2_MMP - vx2_PPM)) + ((vx2_MPM - vx2_PMP) + (vx2_PMM - vx2_MPP));
+    c_111 = ((vx3_PPP - vx3_MMM) + (vx3_MMP - vx3_PPM)) + ((vx3_MPM - vx3_PMP) + (vx3_PMM - vx3_MPP));
 
     //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -618,9 +741,9 @@ __global__ void scaleCF_compressible(
     ////////////////////////////////////////////////////////////////////////////////
     //! - Set the relative position of the offset cell {-1, 0, 1}
     //!
-    real xoff    = offsetCF.xOffCF[k_thread];
-    real yoff    = offsetCF.yOffCF[k_thread];
-    real zoff    = offsetCF.zOffCF[k_thread];
+    real xoff    = offsetCF.xOffCF[nodeIndex];
+    real yoff    = offsetCF.yOffCF[nodeIndex];
+    real zoff    = offsetCF.zOffCF[nodeIndex];
 
     real xoff_sq = xoff * xoff;
     real yoff_sq = yoff * yoff;
@@ -632,14 +755,29 @@ __global__ void scaleCF_compressible(
         ((xoff != c0o1) || (yoff != c0o1) || (zoff != c0o1))
         ? c0o1
         : -c3o1 * (a_100 * a_100 + b_010 * b_010 + c_001 * c_001) - c6o1 * (b_100 * a_010 + c_100 * a_001 + c_010 * b_001);
-    d_000 = ( drho_PPM + drho_PPP + drho_MPM + drho_MPP + drho_PMM + drho_PMP + drho_MMM + drho_MMP) * c1o8;
-    d_100 = ( drho_PPM + drho_PPP - drho_MPM - drho_MPP + drho_PMM + drho_PMP - drho_MMM - drho_MMP) * c1o4;
-    d_010 = ( drho_PPM + drho_PPP + drho_MPM + drho_MPP - drho_PMM - drho_PMP - drho_MMM - drho_MMP) * c1o4;
-    d_001 = (-drho_PPM + drho_PPP - drho_MPM + drho_MPP - drho_PMM + drho_PMP - drho_MMM + drho_MMP) * c1o4;
-    d_110 = ( drho_PPM + drho_PPP - drho_MPM - drho_MPP - drho_PMM - drho_PMP + drho_MMM + drho_MMP) * c1o2;
-    d_101 = (-drho_PPM + drho_PPP + drho_MPM - drho_MPP - drho_PMM + drho_PMP + drho_MMM - drho_MMP) * c1o2;
-    d_011 = (-drho_PPM + drho_PPP - drho_MPM + drho_MPP + drho_PMM - drho_PMP + drho_MMM - drho_MMP) * c1o2;
-    d_111 =  -drho_PPM + drho_PPP + drho_MPM - drho_MPP + drho_PMM - drho_PMP - drho_MMM + drho_MMP;
+    // d_000 = ( drho_PPM + drho_PPP + drho_MPM + drho_MPP + drho_PMM + drho_PMP + drho_MMM + drho_MMP) * c1o8;
+    d_000 = c1o8 * (((drho_PPP + drho_MMM) + (drho_PPM + drho_MMP)) + ((drho_PMM + drho_MPP) + (drho_PMP + drho_MPM)));
+
+    // d_100 = ( drho_PPM + drho_PPP - drho_MPM - drho_MPP + drho_PMM + drho_PMP - drho_MMM - drho_MMP) * c1o4;
+    d_100 = c1o4 * (((drho_PPP - drho_MMM) + (drho_PPM - drho_MMP)) + ((drho_PMM - drho_MPP) + (drho_PMP - drho_MPM)));
+
+    // d_010 = ( drho_PPM + drho_PPP + drho_MPM + drho_MPP - drho_PMM - drho_PMP - drho_MMM - drho_MMP) * c1o4;
+    d_010 = c1o4 * (((drho_PPP - drho_MMM) + (drho_PPM - drho_MMP)) + ((drho_MPP - drho_PMM) + (drho_MPM - drho_PMP)));
+
+    // d_001 = (-drho_PPM + drho_PPP - drho_MPM + drho_MPP - drho_PMM + drho_PMP - drho_MMM + drho_MMP) * c1o4;
+    d_001 = c1o4 * (((drho_PPP - drho_MMM) + (drho_MMP - drho_PPM)) + ((drho_MPP - drho_PMM) + (drho_PMP - drho_MPM)));
+
+    // d_110 = ( drho_PPM + drho_PPP - drho_MPM - drho_MPP - drho_PMM - drho_PMP + drho_MMM + drho_MMP) * c1o2;
+    d_110 = c1o2 * (((drho_PPP + drho_MMM) + (drho_PPM + drho_MMP)) - ((drho_PMM + drho_MPP) + (drho_PMP + drho_MPM)));
+
+    // d_101 = (-drho_PPM + drho_PPP + drho_MPM - drho_MPP - drho_PMM + drho_PMP + drho_MMM - drho_MMP) * c1o2;
+    d_101 = c1o2 * (((drho_PPP + drho_MMM) - (drho_PPM + drho_MMP)) + ((drho_PMP + drho_MPM) - (drho_PMM + drho_MPP)));
+
+    // d_011 = (-drho_PPM + drho_PPP - drho_MPM + drho_MPP + drho_PMM - drho_PMP + drho_MMM - drho_MMP) * c1o2;
+    d_011 = c1o2 * (((drho_PPP + drho_MMM) - (drho_PPM + drho_MMP)) + ((drho_PMM + drho_MPP) - (drho_PMP + drho_MPM)));
+
+    // d_111 =  -drho_PPM + drho_PPP + drho_MPM - drho_MPP + drho_PMM - drho_PMP - drho_MMM + drho_MMP;
+    d_111 = (((drho_PPP - drho_MMM) + (drho_MMP - drho_PPM)) + ((drho_PMM - drho_MPP) + (drho_MPM - drho_PMP)));
 
     //////////////////////////////////////////////////////////////////////////
     //! - Extrapolation for refinement in to the wall (polynomial coefficients)
@@ -768,7 +906,7 @@ __global__ void scaleCF_compressible(
 
     //////////////////////////////////////////////////////////////////////////
     // index of the base node and its neighbors
-    k_base_000 = indicesFineMMM[k_thread];
+    k_base_000 = indicesFineMMM[nodeIndex];
     k_base_M00 = neighborXfine [k_base_000];
     k_base_0M0 = neighborYfine [k_base_000];
     k_base_00M = neighborZfine [k_base_000];
diff --git a/src/gpu/VirtualFluids_GPU/GPU/GridScaling/scaleFC_compressible.cu b/src/gpu/VirtualFluids_GPU/GPU/GridScaling/scaleFC_compressible.cu
index 3ab8b9d20279eff341ca42d20cee9fe7550a2039..e7d999d108e59bca98bf87b813f9479f1c601266 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/GridScaling/scaleFC_compressible.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/GridScaling/scaleFC_compressible.cu
@@ -31,12 +31,13 @@
 //! \author Martin Schoenherr, Anna Wellmann
 //=======================================================================================
 
-#include "Kernel/Utilities/DistributionHelper.cuh"
-#include "Kernel/Utilities/ChimeraTransformation.h"
-#include "Kernel/Utilities/ScalingHelperFunctions.h"
+#include "LBM/GPUHelperFunctions/ChimeraTransformation.h"
+#include "LBM/GPUHelperFunctions/KernelUtilities.h"
+#include "LBM/GPUHelperFunctions/ScalingUtilities.h"
 
 using namespace vf::lbm::constant;
 using namespace vf::lbm::dir;
+using namespace vf::gpu;
 
 //////////////////////////////////////////////////////////////////////////
 //! \brief Interpolate from fine to coarse
@@ -54,8 +55,8 @@ __global__ void scaleFC_compressible(
     unsigned int *neighborXfine,
     unsigned int *neighborYfine,
     unsigned int *neighborZfine,
-    unsigned int numberOfLBnodesCoarse,
-    unsigned int numberOfLBnodesFine,
+    unsigned long long numberOfLBnodesCoarse,
+    unsigned long long numberOfLBnodesFine,
     bool isEvenTimestep,
     unsigned int *indicesCoarse000,
     unsigned int *indicesFineMMM,
@@ -65,13 +66,13 @@ __global__ void scaleFC_compressible(
     OffFC offsetFC)
 {
     ////////////////////////////////////////////////////////////////////////////////
-    //! - Get the thread index coordinates from threadIdx, blockIdx, blockDim and gridDim.
+    //! - Get the node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
     //!
-    const unsigned k_thread = vf::gpu::getNodeIndex();
+    const unsigned nodeIndex = getNodeIndex();
 
     //////////////////////////////////////////////////////////////////////////
     //! - Return for non-interface node
-    if (k_thread >= numberOfInterfaceNodes)
+    if (nodeIndex >= numberOfInterfaceNodes)
         return;
 
     //////////////////////////////////////////////////////////////////////////
@@ -80,8 +81,9 @@ __global__ void scaleFC_compressible(
     //! href="https://doi.org/10.3390/computation5020019"><b>[ M. Geier et al. (2017),
     //! DOI:10.3390/computation5020019 ]</b></a>
     //!
-    Distributions27 distFine   = vf::gpu::getDistributionReferences27(distributionsFine,   numberOfLBnodesFine,   true);
-    Distributions27 distCoarse = vf::gpu::getDistributionReferences27(distributionsCoarse, numberOfLBnodesCoarse, isEvenTimestep);
+    Distributions27 distFine, distCoarse;
+    getPointersToDistributions(distFine, distributionsFine, numberOfLBnodesFine, true);
+    getPointersToDistributions(distCoarse, distributionsCoarse, numberOfLBnodesCoarse, isEvenTimestep);
 
     ////////////////////////////////////////////////////////////////////////////////
     //! - declare local variables for source nodes
@@ -117,7 +119,7 @@ __global__ void scaleFC_compressible(
     // source node BSW = MMM
     //////////////////////////////////////////////////////////////////////////
     // index of the base node and its neighbors
-    unsigned int k_base_000 = indicesFineMMM[k_thread];
+    unsigned int k_base_000 = indicesFineMMM[nodeIndex];
     unsigned int k_base_M00 = neighborXfine [k_base_000];
     unsigned int k_base_0M0 = neighborYfine [k_base_000];
     unsigned int k_base_00M = neighborZfine [k_base_000];
@@ -278,115 +280,120 @@ __global__ void scaleFC_compressible(
     real c_000, c_100, c_010, c_001, c_200, c_020, c_002, c_110, c_101, c_011;
     real d_000, d_100, d_010, d_001, d_110, d_101, d_011;
 
-    a_000 = (-kxxMyyFromfcNEQ_PPM - kxxMyyFromfcNEQ_PPP + kxxMyyFromfcNEQ_MPM + kxxMyyFromfcNEQ_MPP -
-            kxxMyyFromfcNEQ_PMM - kxxMyyFromfcNEQ_PMP + kxxMyyFromfcNEQ_MMM + kxxMyyFromfcNEQ_MMP -
-            kxxMzzFromfcNEQ_PPM - kxxMzzFromfcNEQ_PPP + kxxMzzFromfcNEQ_MPM + kxxMzzFromfcNEQ_MPP -
-            kxxMzzFromfcNEQ_PMM - kxxMzzFromfcNEQ_PMP + kxxMzzFromfcNEQ_MMM + kxxMzzFromfcNEQ_MMP -
-            c2o1 * kxyFromfcNEQ_PPM - c2o1 * kxyFromfcNEQ_PPP - c2o1 * kxyFromfcNEQ_MPM - c2o1 * kxyFromfcNEQ_MPP +
-            c2o1 * kxyFromfcNEQ_PMM + c2o1 * kxyFromfcNEQ_PMP + c2o1 * kxyFromfcNEQ_MMM + c2o1 * kxyFromfcNEQ_MMP +
-            c2o1 * kxzFromfcNEQ_PPM - c2o1 * kxzFromfcNEQ_PPP + c2o1 * kxzFromfcNEQ_MPM - c2o1 * kxzFromfcNEQ_MPP +
-            c2o1 * kxzFromfcNEQ_PMM - c2o1 * kxzFromfcNEQ_PMP + c2o1 * kxzFromfcNEQ_MMM - c2o1 * kxzFromfcNEQ_MMP +
-            c8o1 * vx1_PPM + c8o1 * vx1_PPP + c8o1 * vx1_MPM + c8o1 * vx1_MPP + c8o1 * vx1_PMM + c8o1 * vx1_PMP +
-            c8o1 * vx1_MMM + c8o1 * vx1_MMP + c2o1 * vx2_PPM + c2o1 * vx2_PPP - c2o1 * vx2_MPM - c2o1 * vx2_MPP -
-            c2o1 * vx2_PMM - c2o1 * vx2_PMP + c2o1 * vx2_MMM + c2o1 * vx2_MMP - c2o1 * vx3_PPM + c2o1 * vx3_PPP +
-            c2o1 * vx3_MPM - c2o1 * vx3_MPP - c2o1 * vx3_PMM + c2o1 * vx3_PMP + c2o1 * vx3_MMM - c2o1 * vx3_MMP) /
-            c64o1;
-    b_000 = (c2o1 * kxxMyyFromfcNEQ_PPM + c2o1 * kxxMyyFromfcNEQ_PPP + c2o1 * kxxMyyFromfcNEQ_MPM +
-            c2o1 * kxxMyyFromfcNEQ_MPP - c2o1 * kxxMyyFromfcNEQ_PMM - c2o1 * kxxMyyFromfcNEQ_PMP -
-            c2o1 * kxxMyyFromfcNEQ_MMM - c2o1 * kxxMyyFromfcNEQ_MMP - kxxMzzFromfcNEQ_PPM - kxxMzzFromfcNEQ_PPP -
-            kxxMzzFromfcNEQ_MPM - kxxMzzFromfcNEQ_MPP + kxxMzzFromfcNEQ_PMM + kxxMzzFromfcNEQ_PMP +
-            kxxMzzFromfcNEQ_MMM + kxxMzzFromfcNEQ_MMP - c2o1 * kxyFromfcNEQ_PPM - c2o1 * kxyFromfcNEQ_PPP +
-            c2o1 * kxyFromfcNEQ_MPM + c2o1 * kxyFromfcNEQ_MPP - c2o1 * kxyFromfcNEQ_PMM - c2o1 * kxyFromfcNEQ_PMP +
-            c2o1 * kxyFromfcNEQ_MMM + c2o1 * kxyFromfcNEQ_MMP + c2o1 * kyzFromfcNEQ_PPM - c2o1 * kyzFromfcNEQ_PPP +
-            c2o1 * kyzFromfcNEQ_MPM - c2o1 * kyzFromfcNEQ_MPP + c2o1 * kyzFromfcNEQ_PMM - c2o1 * kyzFromfcNEQ_PMP +
-            c2o1 * kyzFromfcNEQ_MMM - c2o1 * kyzFromfcNEQ_MMP + c2o1 * vx1_PPM + c2o1 * vx1_PPP - c2o1 * vx1_MPM -
-            c2o1 * vx1_MPP - c2o1 * vx1_PMM - c2o1 * vx1_PMP + c2o1 * vx1_MMM + c2o1 * vx1_MMP + c8o1 * vx2_PPM +
-            c8o1 * vx2_PPP + c8o1 * vx2_MPM + c8o1 * vx2_MPP + c8o1 * vx2_PMM + c8o1 * vx2_PMP + c8o1 * vx2_MMM +
-            c8o1 * vx2_MMP - c2o1 * vx3_PPM + c2o1 * vx3_PPP - c2o1 * vx3_MPM + c2o1 * vx3_MPP + c2o1 * vx3_PMM -
-            c2o1 * vx3_PMP + c2o1 * vx3_MMM - c2o1 * vx3_MMP) /
-            c64o1;
-    c_000 = (kxxMyyFromfcNEQ_PPM - kxxMyyFromfcNEQ_PPP + kxxMyyFromfcNEQ_MPM - kxxMyyFromfcNEQ_MPP +
-            kxxMyyFromfcNEQ_PMM - kxxMyyFromfcNEQ_PMP + kxxMyyFromfcNEQ_MMM - kxxMyyFromfcNEQ_MMP -
-            c2o1 * kxxMzzFromfcNEQ_PPM + c2o1 * kxxMzzFromfcNEQ_PPP - c2o1 * kxxMzzFromfcNEQ_MPM +
-            c2o1 * kxxMzzFromfcNEQ_MPP - c2o1 * kxxMzzFromfcNEQ_PMM + c2o1 * kxxMzzFromfcNEQ_PMP -
-            c2o1 * kxxMzzFromfcNEQ_MMM + c2o1 * kxxMzzFromfcNEQ_MMP - c2o1 * kxzFromfcNEQ_PPM -
-            c2o1 * kxzFromfcNEQ_PPP + c2o1 * kxzFromfcNEQ_MPM + c2o1 * kxzFromfcNEQ_MPP - c2o1 * kxzFromfcNEQ_PMM -
-            c2o1 * kxzFromfcNEQ_PMP + c2o1 * kxzFromfcNEQ_MMM + c2o1 * kxzFromfcNEQ_MMP - c2o1 * kyzFromfcNEQ_PPM -
-            c2o1 * kyzFromfcNEQ_PPP - c2o1 * kyzFromfcNEQ_MPM - c2o1 * kyzFromfcNEQ_MPP + c2o1 * kyzFromfcNEQ_PMM +
-            c2o1 * kyzFromfcNEQ_PMP + c2o1 * kyzFromfcNEQ_MMM + c2o1 * kyzFromfcNEQ_MMP - c2o1 * vx1_PPM +
-            c2o1 * vx1_PPP + c2o1 * vx1_MPM - c2o1 * vx1_MPP - c2o1 * vx1_PMM + c2o1 * vx1_PMP + c2o1 * vx1_MMM -
-            c2o1 * vx1_MMP - c2o1 * vx2_PPM + c2o1 * vx2_PPP - c2o1 * vx2_MPM + c2o1 * vx2_MPP + c2o1 * vx2_PMM -
-            c2o1 * vx2_PMP + c2o1 * vx2_MMM - c2o1 * vx2_MMP + c8o1 * vx3_PPM + c8o1 * vx3_PPP + c8o1 * vx3_MPM +
-            c8o1 * vx3_MPP + c8o1 * vx3_PMM + c8o1 * vx3_PMP + c8o1 * vx3_MMM + c8o1 * vx3_MMP) /
-            c64o1;
-    a_100  = (vx1_PPM + vx1_PPP - vx1_MPM - vx1_MPP + vx1_PMM + vx1_PMP - vx1_MMM - vx1_MMP) / c4o1;
-    b_100  = (vx2_PPM + vx2_PPP - vx2_MPM - vx2_MPP + vx2_PMM + vx2_PMP - vx2_MMM - vx2_MMP) / c4o1;
-    c_100  = (vx3_PPM + vx3_PPP - vx3_MPM - vx3_MPP + vx3_PMM + vx3_PMP - vx3_MMM - vx3_MMP) / c4o1;
-    a_200 = (kxxMyyFromfcNEQ_PPM + kxxMyyFromfcNEQ_PPP - kxxMyyFromfcNEQ_MPM - kxxMyyFromfcNEQ_MPP +
-            kxxMyyFromfcNEQ_PMM + kxxMyyFromfcNEQ_PMP - kxxMyyFromfcNEQ_MMM - kxxMyyFromfcNEQ_MMP +
-            kxxMzzFromfcNEQ_PPM + kxxMzzFromfcNEQ_PPP - kxxMzzFromfcNEQ_MPM - kxxMzzFromfcNEQ_MPP +
-            kxxMzzFromfcNEQ_PMM + kxxMzzFromfcNEQ_PMP - kxxMzzFromfcNEQ_MMM - kxxMzzFromfcNEQ_MMP + c2o1 * vx2_PPM +
-            c2o1 * vx2_PPP - c2o1 * vx2_MPM - c2o1 * vx2_MPP - c2o1 * vx2_PMM - c2o1 * vx2_PMP + c2o1 * vx2_MMM +
-            c2o1 * vx2_MMP - c2o1 * vx3_PPM + c2o1 * vx3_PPP + c2o1 * vx3_MPM - c2o1 * vx3_MPP - c2o1 * vx3_PMM +
-            c2o1 * vx3_PMP + c2o1 * vx3_MMM - c2o1 * vx3_MMP) /
-            c16o1;
-    b_200 = (kxyFromfcNEQ_PPM + kxyFromfcNEQ_PPP - kxyFromfcNEQ_MPM - kxyFromfcNEQ_MPP + kxyFromfcNEQ_PMM +
-            kxyFromfcNEQ_PMP - kxyFromfcNEQ_MMM - kxyFromfcNEQ_MMP - c2o1 * vx1_PPM - c2o1 * vx1_PPP +
-            c2o1 * vx1_MPM + c2o1 * vx1_MPP + c2o1 * vx1_PMM + c2o1 * vx1_PMP - c2o1 * vx1_MMM - c2o1 * vx1_MMP) /
-            c8o1;
-    c_200 = (kxzFromfcNEQ_PPM + kxzFromfcNEQ_PPP - kxzFromfcNEQ_MPM - kxzFromfcNEQ_MPP + kxzFromfcNEQ_PMM +
-            kxzFromfcNEQ_PMP - kxzFromfcNEQ_MMM - kxzFromfcNEQ_MMP + c2o1 * vx1_PPM - c2o1 * vx1_PPP -
-            c2o1 * vx1_MPM + c2o1 * vx1_MPP + c2o1 * vx1_PMM - c2o1 * vx1_PMP - c2o1 * vx1_MMM + c2o1 * vx1_MMP) /
-            c8o1;
-    a_010  = (vx1_PPM + vx1_PPP + vx1_MPM + vx1_MPP - vx1_PMM - vx1_PMP - vx1_MMM - vx1_MMP) / c4o1;
-    b_010  = (vx2_PPM + vx2_PPP + vx2_MPM + vx2_MPP - vx2_PMM - vx2_PMP - vx2_MMM - vx2_MMP) / c4o1;
-    c_010  = (vx3_PPM + vx3_PPP + vx3_MPM + vx3_MPP - vx3_PMM - vx3_PMP - vx3_MMM - vx3_MMP) / c4o1;
-    a_020 = (kxyFromfcNEQ_PPM + kxyFromfcNEQ_PPP + kxyFromfcNEQ_MPM + kxyFromfcNEQ_MPP - kxyFromfcNEQ_PMM -
-            kxyFromfcNEQ_PMP - kxyFromfcNEQ_MMM - kxyFromfcNEQ_MMP - c2o1 * vx2_PPM - c2o1 * vx2_PPP +
-            c2o1 * vx2_MPM + c2o1 * vx2_MPP + c2o1 * vx2_PMM + c2o1 * vx2_PMP - c2o1 * vx2_MMM - c2o1 * vx2_MMP) /
-            c8o1;
-    b_020 = (-c2o1 * kxxMyyFromfcNEQ_PPM - c2o1 * kxxMyyFromfcNEQ_PPP - c2o1 * kxxMyyFromfcNEQ_MPM -
-            c2o1 * kxxMyyFromfcNEQ_MPP + c2o1 * kxxMyyFromfcNEQ_PMM + c2o1 * kxxMyyFromfcNEQ_PMP +
-            c2o1 * kxxMyyFromfcNEQ_MMM + c2o1 * kxxMyyFromfcNEQ_MMP + kxxMzzFromfcNEQ_PPM + kxxMzzFromfcNEQ_PPP +
-            kxxMzzFromfcNEQ_MPM + kxxMzzFromfcNEQ_MPP - kxxMzzFromfcNEQ_PMM - kxxMzzFromfcNEQ_PMP -
-            kxxMzzFromfcNEQ_MMM - kxxMzzFromfcNEQ_MMP + c2o1 * vx1_PPM + c2o1 * vx1_PPP - c2o1 * vx1_MPM -
-            c2o1 * vx1_MPP - c2o1 * vx1_PMM - c2o1 * vx1_PMP + c2o1 * vx1_MMM + c2o1 * vx1_MMP - c2o1 * vx3_PPM +
-            c2o1 * vx3_PPP - c2o1 * vx3_MPM + c2o1 * vx3_MPP + c2o1 * vx3_PMM - c2o1 * vx3_PMP + c2o1 * vx3_MMM -
-            c2o1 * vx3_MMP) /
-            c16o1;
-    c_020 = (kyzFromfcNEQ_PPM + kyzFromfcNEQ_PPP + kyzFromfcNEQ_MPM + kyzFromfcNEQ_MPP - kyzFromfcNEQ_PMM -
-            kyzFromfcNEQ_PMP - kyzFromfcNEQ_MMM - kyzFromfcNEQ_MMP + c2o1 * vx2_PPM - c2o1 * vx2_PPP +
-            c2o1 * vx2_MPM - c2o1 * vx2_MPP - c2o1 * vx2_PMM + c2o1 * vx2_PMP - c2o1 * vx2_MMM + c2o1 * vx2_MMP) /
-            c8o1;
-    a_001  = (-vx1_PPM + vx1_PPP - vx1_MPM + vx1_MPP - vx1_PMM + vx1_PMP - vx1_MMM + vx1_MMP) / c4o1;
-    b_001  = (-vx2_PPM + vx2_PPP - vx2_MPM + vx2_MPP - vx2_PMM + vx2_PMP - vx2_MMM + vx2_MMP) / c4o1;
-    c_001  = (-vx3_PPM + vx3_PPP - vx3_MPM + vx3_MPP - vx3_PMM + vx3_PMP - vx3_MMM + vx3_MMP) / c4o1;
-    a_002 = (-kxzFromfcNEQ_PPM + kxzFromfcNEQ_PPP - kxzFromfcNEQ_MPM + kxzFromfcNEQ_MPP - kxzFromfcNEQ_PMM +
-            kxzFromfcNEQ_PMP - kxzFromfcNEQ_MMM + kxzFromfcNEQ_MMP + c2o1 * vx3_PPM - c2o1 * vx3_PPP -
-            c2o1 * vx3_MPM + c2o1 * vx3_MPP + c2o1 * vx3_PMM - c2o1 * vx3_PMP - c2o1 * vx3_MMM + c2o1 * vx3_MMP) /
-            c8o1;
-    b_002 = (-kyzFromfcNEQ_PPM + kyzFromfcNEQ_PPP - kyzFromfcNEQ_MPM + kyzFromfcNEQ_MPP - kyzFromfcNEQ_PMM +
-            kyzFromfcNEQ_PMP - kyzFromfcNEQ_MMM + kyzFromfcNEQ_MMP + c2o1 * vx3_PPM - c2o1 * vx3_PPP +
-            c2o1 * vx3_MPM - c2o1 * vx3_MPP - c2o1 * vx3_PMM + c2o1 * vx3_PMP - c2o1 * vx3_MMM + c2o1 * vx3_MMP) /
-            c8o1;
-    c_002 = (-kxxMyyFromfcNEQ_PPM + kxxMyyFromfcNEQ_PPP - kxxMyyFromfcNEQ_MPM + kxxMyyFromfcNEQ_MPP -
-            kxxMyyFromfcNEQ_PMM + kxxMyyFromfcNEQ_PMP - kxxMyyFromfcNEQ_MMM + kxxMyyFromfcNEQ_MMP +
-            c2o1 * kxxMzzFromfcNEQ_PPM - c2o1 * kxxMzzFromfcNEQ_PPP + c2o1 * kxxMzzFromfcNEQ_MPM -
-            c2o1 * kxxMzzFromfcNEQ_MPP + c2o1 * kxxMzzFromfcNEQ_PMM - c2o1 * kxxMzzFromfcNEQ_PMP +
-            c2o1 * kxxMzzFromfcNEQ_MMM - c2o1 * kxxMzzFromfcNEQ_MMP - c2o1 * vx1_PPM + c2o1 * vx1_PPP +
-            c2o1 * vx1_MPM - c2o1 * vx1_MPP - c2o1 * vx1_PMM + c2o1 * vx1_PMP + c2o1 * vx1_MMM - c2o1 * vx1_MMP -
-            c2o1 * vx2_PPM + c2o1 * vx2_PPP - c2o1 * vx2_MPM + c2o1 * vx2_MPP + c2o1 * vx2_PMM - c2o1 * vx2_PMP +
-            c2o1 * vx2_MMM - c2o1 * vx2_MMP) /
-            c16o1;
-    a_110 = (vx1_PPM + vx1_PPP - vx1_MPM - vx1_MPP - vx1_PMM - vx1_PMP + vx1_MMM + vx1_MMP) / c2o1;
-    b_110 = (vx2_PPM + vx2_PPP - vx2_MPM - vx2_MPP - vx2_PMM - vx2_PMP + vx2_MMM + vx2_MMP) / c2o1;
-    c_110 = (vx3_PPM + vx3_PPP - vx3_MPM - vx3_MPP - vx3_PMM - vx3_PMP + vx3_MMM + vx3_MMP) / c2o1;
-    a_101 = (-vx1_PPM + vx1_PPP + vx1_MPM - vx1_MPP - vx1_PMM + vx1_PMP + vx1_MMM - vx1_MMP) / c2o1;
-    b_101 = (-vx2_PPM + vx2_PPP + vx2_MPM - vx2_MPP - vx2_PMM + vx2_PMP + vx2_MMM - vx2_MMP) / c2o1;
-    c_101 = (-vx3_PPM + vx3_PPP + vx3_MPM - vx3_MPP - vx3_PMM + vx3_PMP + vx3_MMM - vx3_MMP) / c2o1;
-    a_011 = (-vx1_PPM + vx1_PPP - vx1_MPM + vx1_MPP + vx1_PMM - vx1_PMP + vx1_MMM - vx1_MMP) / c2o1;
-    b_011 = (-vx2_PPM + vx2_PPP - vx2_MPM + vx2_MPP + vx2_PMM - vx2_PMP + vx2_MMM - vx2_MMP) / c2o1;
-    c_011 = (-vx3_PPM + vx3_PPP - vx3_MPM + vx3_MPP + vx3_PMM - vx3_PMP + vx3_MMM - vx3_MMP) / c2o1;
+    a_000 = c1o64 * (
+            c2o1 * (
+            ((kxyFromfcNEQ_MMM - kxyFromfcNEQ_PPP) + (kxyFromfcNEQ_MMP - kxyFromfcNEQ_PPM)) + ((kxyFromfcNEQ_PMM - kxyFromfcNEQ_MPP) + (kxyFromfcNEQ_PMP - kxyFromfcNEQ_MPM)) + 
+            ((kxzFromfcNEQ_MMM - kxzFromfcNEQ_PPP) + (kxzFromfcNEQ_PPM - kxzFromfcNEQ_MMP)) + ((kxzFromfcNEQ_PMM - kxzFromfcNEQ_MPP) + (kxzFromfcNEQ_MPM - kxzFromfcNEQ_PMP)) + 
+            ((vx2_PPP + vx2_MMM) + (vx2_PPM + vx2_MMP)) - ((vx2_MPP + vx2_PMM) + (vx2_MPM + vx2_PMP)) + 
+            ((vx3_PPP + vx3_MMM) - (vx3_PPM + vx3_MMP)) + ((vx3_PMP + vx3_MPM) - (vx3_MPP + vx3_PMM))) + 
+            c8o1 * (((vx1_PPP + vx1_MMM) + (vx1_PPM + vx1_MMP)) + ((vx1_MPP + vx1_PMM) + (vx1_PMP + vx1_MPM))) +
+            ((kxxMyyFromfcNEQ_MMM - kxxMyyFromfcNEQ_PPP) + (kxxMyyFromfcNEQ_MMP - kxxMyyFromfcNEQ_PPM)) + 
+            ((kxxMyyFromfcNEQ_MPP - kxxMyyFromfcNEQ_PMM) + (kxxMyyFromfcNEQ_MPM - kxxMyyFromfcNEQ_PMP)) +
+            ((kxxMzzFromfcNEQ_MMM - kxxMzzFromfcNEQ_PPP) + (kxxMzzFromfcNEQ_MMP - kxxMzzFromfcNEQ_PPM)) + 
+            ((kxxMzzFromfcNEQ_MPP - kxxMzzFromfcNEQ_PMM) + (kxxMzzFromfcNEQ_MPM - kxxMzzFromfcNEQ_PMP)));
+    b_000 = c1o64 * (
+            c2o1 * (
+            ((kxxMyyFromfcNEQ_PPP - kxxMyyFromfcNEQ_MMM) + (kxxMyyFromfcNEQ_PPM - kxxMyyFromfcNEQ_MMP)) + 
+            ((kxxMyyFromfcNEQ_MPP - kxxMyyFromfcNEQ_PMM) + (kxxMyyFromfcNEQ_MPM - kxxMyyFromfcNEQ_PMP)) + 
+            ((kxyFromfcNEQ_MMM - kxyFromfcNEQ_PPP) + (kxyFromfcNEQ_MMP - kxyFromfcNEQ_PPM)) + 
+            ((kxyFromfcNEQ_MPP - kxyFromfcNEQ_PMM) + (kxyFromfcNEQ_MPM - kxyFromfcNEQ_PMP)) + 
+            ((kyzFromfcNEQ_MMM - kyzFromfcNEQ_PPP) + (kyzFromfcNEQ_PPM - kyzFromfcNEQ_MMP)) + 
+            ((kyzFromfcNEQ_PMM - kyzFromfcNEQ_MPP) + (kyzFromfcNEQ_MPM - kyzFromfcNEQ_PMP)) + 
+            ((vx1_PPP + vx1_MMM) + (vx1_PPM + vx1_MMP)) - ((vx1_MPM + vx1_MPP) + (vx1_PMM + vx1_PMP)) + 
+            ((vx3_PPP + vx3_MMM) - (vx3_PPM + vx3_MMP)) + ((vx3_MPP + vx3_PMM) - (vx3_MPM + vx3_PMP))) + 
+            c8o1 * (((vx2_PPP + vx2_MMM) + (vx2_PPM + vx2_MMP)) + ((vx2_MPP + vx2_PMM) + (vx2_MPM + vx2_PMP))) + 
+            ((kxxMzzFromfcNEQ_MMM - kxxMzzFromfcNEQ_PPP) + (kxxMzzFromfcNEQ_MMP - kxxMzzFromfcNEQ_PPM)) +
+            ((kxxMzzFromfcNEQ_PMM - kxxMzzFromfcNEQ_MPP) + (kxxMzzFromfcNEQ_PMP - kxxMzzFromfcNEQ_MPM)));
+    c_000 = c1o64 * ( 
+            c2o1 * (
+            ((kxxMzzFromfcNEQ_PPP - kxxMzzFromfcNEQ_MMM) + (kxxMzzFromfcNEQ_MMP - kxxMzzFromfcNEQ_PPM)) + 
+            ((kxxMzzFromfcNEQ_MPP - kxxMzzFromfcNEQ_PMM) + (kxxMzzFromfcNEQ_PMP - kxxMzzFromfcNEQ_MPM)) + 
+            ((kxzFromfcNEQ_MMM - kxzFromfcNEQ_PPP) + (kxzFromfcNEQ_MMP - kxzFromfcNEQ_PPM)) + 
+            ((kxzFromfcNEQ_MPP - kxzFromfcNEQ_PMM) + (kxzFromfcNEQ_MPM - kxzFromfcNEQ_PMP)) + 
+            ((kyzFromfcNEQ_MMM - kyzFromfcNEQ_PPP) + (kyzFromfcNEQ_MMP - kyzFromfcNEQ_PPM)) + 
+            ((kyzFromfcNEQ_PMM - kyzFromfcNEQ_MPP) + (kyzFromfcNEQ_PMP - kyzFromfcNEQ_MPM)) + 
+            ((vx1_PPP + vx1_MMM) - (vx1_MMP + vx1_PPM)) + ((vx1_MPM + vx1_PMP) - (vx1_MPP + vx1_PMM)) + 
+            ((vx2_PPP + vx2_MMM) - (vx2_MMP + vx2_PPM)) + ((vx2_MPP + vx2_PMM) - (vx2_MPM + vx2_PMP))) + 
+            c8o1 * (((vx3_PPP + vx3_MMM) + (vx3_PPM + vx3_MMP)) + ((vx3_PMM + vx3_MPP) + (vx3_PMP + vx3_MPM))) +
+            ((kxxMyyFromfcNEQ_MMM - kxxMyyFromfcNEQ_PPP) + (kxxMyyFromfcNEQ_PPM - kxxMyyFromfcNEQ_MMP)) + 
+            ((kxxMyyFromfcNEQ_PMM - kxxMyyFromfcNEQ_MPP) + (kxxMyyFromfcNEQ_MPM - kxxMyyFromfcNEQ_PMP)));
+
+    a_100 = c1o4 * (((vx1_PPP - vx1_MMM) + (vx1_PPM - vx1_MMP)) + ((vx1_PMM - vx1_MPP) + (vx1_PMP - vx1_MPM)));
+    b_100 = c1o4 * (((vx2_PPP - vx2_MMM) + (vx2_PPM - vx2_MMP)) + ((vx2_PMM - vx2_MPP) + (vx2_PMP - vx2_MPM)));
+    c_100 = c1o4 * (((vx3_PPP - vx3_MMM) + (vx3_PPM - vx3_MMP)) + ((vx3_PMM - vx3_MPP) + (vx3_PMP - vx3_MPM)));
+
+    a_200 = c1o16 * ( 
+            c2o1 * (
+            ((vx2_PPP + vx2_MMM) + (vx2_PPM - vx2_MPP)) + ((vx2_MMP - vx2_PMM) - (vx2_MPM + vx2_PMP)) + 
+            ((vx3_PPP + vx3_MMM) - (vx3_PPM + vx3_MPP)) + ((vx3_MPM + vx3_PMP) - (vx3_MMP + vx3_PMM))) + 
+            ((kxxMyyFromfcNEQ_PPP - kxxMyyFromfcNEQ_MMM) + (kxxMyyFromfcNEQ_PPM - kxxMyyFromfcNEQ_MMP)) + 
+            ((kxxMyyFromfcNEQ_PMM - kxxMyyFromfcNEQ_MPP) + (kxxMyyFromfcNEQ_PMP - kxxMyyFromfcNEQ_MPM)) + 
+            ((kxxMzzFromfcNEQ_PPP - kxxMzzFromfcNEQ_MMM) + (kxxMzzFromfcNEQ_PPM - kxxMzzFromfcNEQ_MMP)) + 
+            ((kxxMzzFromfcNEQ_PMM - kxxMzzFromfcNEQ_MPP) + (kxxMzzFromfcNEQ_PMP - kxxMzzFromfcNEQ_MPM)));
+    b_200 = c1o8 * (
+            c2o1 * (
+            -((vx1_PPP + vx1_MMM) + (vx1_PPM + vx1_MMP)) + ((vx1_MPP + vx1_PMM) + (vx1_MPM + vx1_PMP))) +
+            ((kxyFromfcNEQ_PPP - kxyFromfcNEQ_MMM) + (kxyFromfcNEQ_PPM - kxyFromfcNEQ_MMP)) + 
+            ((kxyFromfcNEQ_PMM - kxyFromfcNEQ_MPP) + (kxyFromfcNEQ_PMP - kxyFromfcNEQ_MPM)));
+    c_200 = c1o8 * (
+            c2o1 * (
+            ((vx1_PPM + vx1_MMP) - (vx1_PPP + vx1_MMM)) + ((vx1_MPP + vx1_PMM) - (vx1_MPM + vx1_PMP))) +
+            ((kxzFromfcNEQ_PPP - kxzFromfcNEQ_MMM) + (kxzFromfcNEQ_PPM - kxzFromfcNEQ_MMP)) + 
+            ((kxzFromfcNEQ_PMM - kxzFromfcNEQ_MPP) + (kxzFromfcNEQ_PMP - kxzFromfcNEQ_MPM)));
+
+    a_010 = c1o4 * (((vx1_PPP - vx1_MMM) + (vx1_PPM - vx1_MMP)) + ((vx1_MPP - vx1_PMM) + (vx1_MPM - vx1_PMP)));
+    b_010 = c1o4 * (((vx2_PPP - vx2_MMM) + (vx2_PPM - vx2_MMP)) + ((vx2_MPP - vx2_PMM) + (vx2_MPM - vx2_PMP)));
+    c_010 = c1o4 * (((vx3_PPP - vx3_MMM) + (vx3_PPM - vx3_MMP)) + ((vx3_MPP - vx3_PMM) + (vx3_MPM - vx3_PMP)));
+
+    a_020 = c1o8 * (
+            c2o1 * (-((vx2_PPP + vx2_MMM) + (vx2_MMP + vx2_PPM)) + ((vx2_MPP + vx2_PMM) + (vx2_MPM + vx2_PMP))) +
+            ((kxyFromfcNEQ_PPP - kxyFromfcNEQ_MMM) + (kxyFromfcNEQ_PPM - kxyFromfcNEQ_MMP)) + 
+            ((kxyFromfcNEQ_MPP - kxyFromfcNEQ_PMM) + (kxyFromfcNEQ_MPM - kxyFromfcNEQ_PMP)));
+    b_020 = c1o16 * (
+            c2o1 * (
+            ((kxxMyyFromfcNEQ_MMM - kxxMyyFromfcNEQ_PPP) + (kxxMyyFromfcNEQ_MMP - kxxMyyFromfcNEQ_PPM)) +
+            ((kxxMyyFromfcNEQ_PMM - kxxMyyFromfcNEQ_MPP) + (kxxMyyFromfcNEQ_PMP - kxxMyyFromfcNEQ_MPM)) +
+            ((vx1_PPP + vx1_MMM) + (vx1_PPM + vx1_MMP)) - ((vx1_MPP + vx1_PMM) + (vx1_PMP + vx1_MPM)) + 
+            ((vx3_PPP + vx3_MMM) - (vx3_PPM + vx3_MMP)) + ((vx3_MPP + vx3_PMM) - (vx3_MPM + vx3_PMP))) +
+            ((kxxMzzFromfcNEQ_PPP - kxxMzzFromfcNEQ_MMM) + (kxxMzzFromfcNEQ_PPM - kxxMzzFromfcNEQ_MMP)) + 
+            ((kxxMzzFromfcNEQ_MPP - kxxMzzFromfcNEQ_PMM) + (kxxMzzFromfcNEQ_MPM - kxxMzzFromfcNEQ_PMP)));
+    c_020 = c1o8 * (
+            c2o1 * (((vx2_MMP + vx2_PPM) - (vx2_PPP + vx2_MMM)) + ((vx2_PMP + vx2_MPM) - (vx2_MPP + vx2_PMM))) +
+            ((kyzFromfcNEQ_PPP - kyzFromfcNEQ_MMM) + (kyzFromfcNEQ_PPM - kyzFromfcNEQ_MMP)) +
+            ((kyzFromfcNEQ_MPP - kyzFromfcNEQ_PMM) + (kyzFromfcNEQ_MPM - kyzFromfcNEQ_PMP)));
+
+    a_001 = c1o4 * (((vx1_PPP - vx1_MMM) + (vx1_MMP - vx1_PPM)) + ((vx1_MPP - vx1_PMM) + (vx1_PMP - vx1_MPM)));
+    b_001 = c1o4 * (((vx2_PPP - vx2_MMM) + (vx2_MMP - vx2_PPM)) + ((vx2_MPP - vx2_PMM) + (vx2_PMP - vx2_MPM)));
+    c_001 = c1o4 * (((vx3_PPP - vx3_MMM) + (vx3_MMP - vx3_PPM)) + ((vx3_MPP - vx3_PMM) + (vx3_PMP - vx3_MPM)));
+
+    a_002 = c1o8 * (
+            c2o1 * (((vx3_PPM + vx3_MMP) - (vx3_PPP + vx3_MMM)) + ((vx3_MPP + vx3_PMM) - (vx3_PMP + vx3_MPM))) +
+                    ((kxzFromfcNEQ_PPP - kxzFromfcNEQ_MMM) + (kxzFromfcNEQ_MMP - kxzFromfcNEQ_PPM)) +
+                    ((kxzFromfcNEQ_PMP - kxzFromfcNEQ_MPM) + (kxzFromfcNEQ_MPP - kxzFromfcNEQ_PMM)));
+    b_002 = c1o8 * (
+            c2o1 * (((vx3_PPM + vx3_MMP) - (vx3_PPP + vx3_MMM)) + ((vx3_MPM + vx3_PMP) - (vx3_PMM + vx3_MPP))) + 
+                    ((kyzFromfcNEQ_PPP - kyzFromfcNEQ_MMM) + (kyzFromfcNEQ_MMP - kyzFromfcNEQ_PPM)) + 
+                    ((kyzFromfcNEQ_PMP - kyzFromfcNEQ_MPM) + (kyzFromfcNEQ_MPP - kyzFromfcNEQ_PMM)));
+    c_002 = c1o16 * (
+            c2o1 * (
+            ((kxxMzzFromfcNEQ_MMM - kxxMzzFromfcNEQ_PPP) + (kxxMzzFromfcNEQ_PPM - kxxMzzFromfcNEQ_MMP)) + 
+            ((kxxMzzFromfcNEQ_MPM - kxxMzzFromfcNEQ_PMP) + (kxxMzzFromfcNEQ_PMM - kxxMzzFromfcNEQ_MPP)) + 
+            ((vx1_PPP + vx1_MMM) - (vx1_MMP + vx1_PPM)) + ((vx1_MPM + vx1_PMP) - (vx1_PMM + vx1_MPP)) + 
+            ((vx2_PPP + vx2_MMM) - (vx2_MMP + vx2_PPM)) + ((vx2_PMM + vx2_MPP) - (vx2_MPM + vx2_PMP))) + 
+            ((kxxMyyFromfcNEQ_PPP - kxxMyyFromfcNEQ_MMM) + (kxxMyyFromfcNEQ_MMP - kxxMyyFromfcNEQ_PPM)) +
+            ((kxxMyyFromfcNEQ_PMP - kxxMyyFromfcNEQ_MPM) + (kxxMyyFromfcNEQ_MPP - kxxMyyFromfcNEQ_PMM)));
+
+    a_110 = c1o2 * (((vx1_PPP + vx1_MMM) + (vx1_MMP + vx1_PPM)) - ((vx1_MPM + vx1_PMP) + (vx1_PMM + vx1_MPP)));
+    b_110 = c1o2 * (((vx2_PPP + vx2_MMM) + (vx2_MMP + vx2_PPM)) - ((vx2_MPM + vx2_PMP) + (vx2_PMM + vx2_MPP)));
+    c_110 = c1o2 * (((vx3_PPP + vx3_MMM) + (vx3_MMP + vx3_PPM)) - ((vx3_MPM + vx3_PMP) + (vx3_PMM + vx3_MPP)));
+
+    a_101 = c1o2 * (((vx1_PPP + vx1_MMM) - (vx1_MMP + vx1_PPM)) + ((vx1_MPM + vx1_PMP) - (vx1_PMM + vx1_MPP)));
+    b_101 = c1o2 * (((vx2_PPP + vx2_MMM) - (vx2_MMP + vx2_PPM)) + ((vx2_MPM + vx2_PMP) - (vx2_PMM + vx2_MPP)));
+    c_101 = c1o2 * (((vx3_PPP + vx3_MMM) - (vx3_MMP + vx3_PPM)) + ((vx3_MPM + vx3_PMP) - (vx3_PMM + vx3_MPP)));
+    
+    a_011 = c1o2 * (((vx1_PPP + vx1_MMM) - (vx1_MMP + vx1_PPM)) + ((vx1_PMM + vx1_MPP) - (vx1_MPM + vx1_PMP)));
+    b_011 = c1o2 * (((vx2_PPP + vx2_MMM) - (vx2_MMP + vx2_PPM)) + ((vx2_PMM + vx2_MPP) - (vx2_MPM + vx2_PMP)));
+    c_011 = c1o2 * (((vx3_PPP + vx3_MMM) - (vx3_MMP + vx3_PPM)) + ((vx3_PMM + vx3_MPP) - (vx3_MPM + vx3_PMP)));
 
     //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     
@@ -399,9 +406,9 @@ __global__ void scaleFC_compressible(
     ////////////////////////////////////////////////////////////////////////////////
     //! - Set the relative position of the offset cell {-1, 0, 1}
     //!
-    real xoff    = offsetFC.xOffFC[k_thread];
-    real yoff    = offsetFC.yOffFC[k_thread];
-    real zoff    = offsetFC.zOffFC[k_thread];
+    real xoff    = offsetFC.xOffFC[nodeIndex];
+    real yoff    = offsetFC.yOffFC[nodeIndex];
+    real zoff    = offsetFC.zOffFC[nodeIndex];
      
     real xoff_sq = xoff * xoff;
     real yoff_sq = yoff * yoff;
@@ -412,15 +419,14 @@ __global__ void scaleFC_compressible(
     //! 
     real LaplaceRho = 
         ((xoff != c0o1) || (yoff != c0o1) || (zoff != c0o1))
-        ? c0o1
-        : -c3o1 * (a_100 * a_100 + b_010 * b_010 + c_001 * c_001) - c6o1 * (b_100 * a_010 + c_100 * a_001 + c_010 * b_001);
-    d_000 = ( drho_PPM + drho_PPP + drho_MPM + drho_MPP + drho_PMM + drho_PMP + drho_MMM + drho_MMP - c2o1 * LaplaceRho) * c1o8;
-    d_100 = ( drho_PPM + drho_PPP - drho_MPM - drho_MPP + drho_PMM + drho_PMP - drho_MMM - drho_MMP) * c1o4;
-    d_010 = ( drho_PPM + drho_PPP + drho_MPM + drho_MPP - drho_PMM - drho_PMP - drho_MMM - drho_MMP) * c1o4;
-    d_001 = (-drho_PPM + drho_PPP - drho_MPM + drho_MPP - drho_PMM + drho_PMP - drho_MMM + drho_MMP) * c1o4;
-    d_110 = ( drho_PPM + drho_PPP - drho_MPM - drho_MPP - drho_PMM - drho_PMP + drho_MMM + drho_MMP) * c1o2;
-    d_101 = (-drho_PPM + drho_PPP + drho_MPM - drho_MPP - drho_PMM + drho_PMP + drho_MMM - drho_MMP) * c1o2;
-    d_011 = (-drho_PPM + drho_PPP - drho_MPM + drho_MPP + drho_PMM - drho_PMP + drho_MMM - drho_MMP) * c1o2;
+        ? c0o1 : -c3o1 * (a_100 * a_100 + b_010 * b_010 + c_001 * c_001) - c6o1 * (b_100 * a_010 + c_100 * a_001 + c_010 * b_001);
+    d_000 =  c1o8 * ((((drho_PPP + drho_MMM) + (drho_PPM + drho_MMP)) + ((drho_PMM + drho_MPP) + (drho_PMP + drho_MPM))) - c2o1 * LaplaceRho);
+    d_100 = c1o4 * (((drho_PPP - drho_MMM) + (drho_PPM - drho_MMP)) + ((drho_PMM - drho_MPP) + (drho_PMP - drho_MPM)));
+    d_010 = c1o4 * (((drho_PPP - drho_MMM) + (drho_PPM - drho_MMP)) + ((drho_MPP - drho_PMM) + (drho_MPM - drho_PMP)));
+    d_001 = c1o4 * (((drho_PPP - drho_MMM) + (drho_MMP - drho_PPM)) + ((drho_MPP - drho_PMM) + (drho_PMP - drho_MPM)));
+    d_110 = c1o2 * (((drho_PPP + drho_MMM) + (drho_PPM + drho_MMP)) - ((drho_PMM + drho_MPP) + (drho_PMP + drho_MPM)));
+    d_101 = c1o2 * (((drho_PPP + drho_MMM) - (drho_PPM + drho_MMP)) + ((drho_PMP + drho_MPM) - (drho_PMM + drho_MPP)));
+    d_011 = c1o2 * (((drho_PPP + drho_MMM) - (drho_PPM + drho_MMP)) + ((drho_PMM + drho_MPP) - (drho_PMP + drho_MPM)));
 
 
     //////////////////////////////////////////////////////////////////////////
@@ -639,7 +645,7 @@ __global__ void scaleFC_compressible(
 
     ////////////////////////////////////////////////////////////////////////////////////
     // index of the destination node and its neighbors
-    k_000 = indicesCoarse000[k_thread];
+    k_000 = indicesCoarse000[nodeIndex];
     k_M00 = neighborXcoarse [k_000];
     k_0M0 = neighborYcoarse [k_000];
     k_00M = neighborZcoarse [k_000];
diff --git a/src/gpu/VirtualFluids_GPU/GPU/Init27.cu b/src/gpu/VirtualFluids_GPU/GPU/Init27.cu
index 6d497d2a1ab7ec305bec4f1ad1ed2e2d63c4dc27..23666fdcf6714d30b40b4750c52f129cc472761c 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/Init27.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/Init27.cu
@@ -15,7 +15,7 @@ __global__ void LBInit27( int myid,
                                      unsigned int* neighborY,
                                      unsigned int* neighborZ,
                                      real* vParabel,
-                                     unsigned int size_Mat,
+                                     unsigned long long numberOfLBnodes,
                                      unsigned int grid_nx, 
                                      unsigned int grid_ny, 
                                      unsigned int grid_nz, 
@@ -24,33 +24,33 @@ __global__ void LBInit27( int myid,
                                      int maxlev)
 {
    Distributions27 D;
-   D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-   D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-   D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-   D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-   D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-   D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-   D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-   D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-   D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-   D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-   D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-   D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-   D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-   D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-   D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-   D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-   D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-   D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-   D.f[DIR_000] = &DD[DIR_000*size_Mat];
-   D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-   D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-   D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-   D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-   D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-   D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-   D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-   D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+   D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+   D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+   D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+   D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+   D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+   D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+   D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+   D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+   D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+   D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+   D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+   D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+   D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+   D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+   D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+   D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+   D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+   D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+   D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+   D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+   D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+   D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+   D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+   D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+   D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+   D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+   D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
    ////////////////////////////////////////////////////////////////////////////////
    unsigned int  k;                   // Zugriff auf arrays im device
    //
@@ -142,32 +142,32 @@ __global__ void LBInit27( int myid,
    real cu_sq=c3o2*(vx1*vx1+vx2*vx2+vx3*vx3);
 
    (D.f[DIR_000])[kzero] =   c8o27* (drho-cu_sq);
-   (D.f[DIR_P00   ])[ke   ] =   c2o27* (drho+c3o1*( vx1        )+c9o2*( vx1        )*( vx1        )-cu_sq);
-   (D.f[DIR_M00   ])[kw   ] =   c2o27* (drho+c3o1*(-vx1        )+c9o2*(-vx1        )*(-vx1        )-cu_sq);
-   (D.f[DIR_0P0   ])[kn   ] =   c2o27* (drho+c3o1*(    vx2     )+c9o2*(     vx2    )*(     vx2    )-cu_sq);
-   (D.f[DIR_0M0   ])[ks   ] =   c2o27* (drho+c3o1*(   -vx2     )+c9o2*(    -vx2    )*(    -vx2    )-cu_sq);
-   (D.f[DIR_00P   ])[kt   ] =   c2o27* (drho+c3o1*(         vx3)+c9o2*(         vx3)*(         vx3)-cu_sq);
-   (D.f[DIR_00M   ])[kb   ] =   c2o27* (drho+c3o1*(        -vx3)+c9o2*(        -vx3)*(        -vx3)-cu_sq);
-   (D.f[DIR_PP0  ])[kne  ] =   c1o54* (drho+c3o1*( vx1+vx2    )+c9o2*( vx1+vx2    )*( vx1+vx2    )-cu_sq);
-   (D.f[DIR_MM0  ])[ksw  ] =   c1o54* (drho+c3o1*(-vx1-vx2    )+c9o2*(-vx1-vx2    )*(-vx1-vx2    )-cu_sq);
-   (D.f[DIR_PM0  ])[kse  ] =   c1o54* (drho+c3o1*( vx1-vx2    )+c9o2*( vx1-vx2    )*( vx1-vx2    )-cu_sq);
-   (D.f[DIR_MP0  ])[knw  ] =   c1o54* (drho+c3o1*(-vx1+vx2    )+c9o2*(-vx1+vx2    )*(-vx1+vx2    )-cu_sq);
-   (D.f[DIR_P0P  ])[kte  ] =   c1o54* (drho+c3o1*( vx1    +vx3)+c9o2*( vx1    +vx3)*( vx1    +vx3)-cu_sq);
-   (D.f[DIR_M0M  ])[kbw  ] =   c1o54* (drho+c3o1*(-vx1    -vx3)+c9o2*(-vx1    -vx3)*(-vx1    -vx3)-cu_sq);
-   (D.f[DIR_P0M  ])[kbe  ] =   c1o54* (drho+c3o1*( vx1    -vx3)+c9o2*( vx1    -vx3)*( vx1    -vx3)-cu_sq);
-   (D.f[DIR_M0P  ])[ktw  ] =   c1o54* (drho+c3o1*(-vx1    +vx3)+c9o2*(-vx1    +vx3)*(-vx1    +vx3)-cu_sq);
-   (D.f[DIR_0PP  ])[ktn  ] =   c1o54* (drho+c3o1*(     vx2+vx3)+c9o2*(     vx2+vx3)*(     vx2+vx3)-cu_sq);
-   (D.f[DIR_0MM  ])[kbs  ] =   c1o54* (drho+c3o1*(    -vx2-vx3)+c9o2*(    -vx2-vx3)*(    -vx2-vx3)-cu_sq);
-   (D.f[DIR_0PM  ])[kbn  ] =   c1o54* (drho+c3o1*(     vx2-vx3)+c9o2*(     vx2-vx3)*(     vx2-vx3)-cu_sq);
-   (D.f[DIR_0MP  ])[kts  ] =   c1o54* (drho+c3o1*(    -vx2+vx3)+c9o2*(    -vx2+vx3)*(    -vx2+vx3)-cu_sq);
-   (D.f[DIR_PPP ])[ktne ] =   c1o216*(drho+c3o1*( vx1+vx2+vx3)+c9o2*( vx1+vx2+vx3)*( vx1+vx2+vx3)-cu_sq);
-   (D.f[DIR_MMM ])[kbsw ] =   c1o216*(drho+c3o1*(-vx1-vx2-vx3)+c9o2*(-vx1-vx2-vx3)*(-vx1-vx2-vx3)-cu_sq);
-   (D.f[DIR_PPM ])[kbne ] =   c1o216*(drho+c3o1*( vx1+vx2-vx3)+c9o2*( vx1+vx2-vx3)*( vx1+vx2-vx3)-cu_sq);
-   (D.f[DIR_MMP ])[ktsw ] =   c1o216*(drho+c3o1*(-vx1-vx2+vx3)+c9o2*(-vx1-vx2+vx3)*(-vx1-vx2+vx3)-cu_sq);
-   (D.f[DIR_PMP ])[ktse ] =   c1o216*(drho+c3o1*( vx1-vx2+vx3)+c9o2*( vx1-vx2+vx3)*( vx1-vx2+vx3)-cu_sq);
-   (D.f[DIR_MPM ])[kbnw ] =   c1o216*(drho+c3o1*(-vx1+vx2-vx3)+c9o2*(-vx1+vx2-vx3)*(-vx1+vx2-vx3)-cu_sq);
-   (D.f[DIR_PMM ])[kbse ] =   c1o216*(drho+c3o1*( vx1-vx2-vx3)+c9o2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cu_sq);
-   (D.f[DIR_MPP ])[ktnw ] =   c1o216*(drho+c3o1*(-vx1+vx2+vx3)+c9o2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cu_sq);
+   (D.f[DIR_P00])[ke   ] =   c2o27* (drho+c3o1*( vx1        )+c9o2*( vx1        )*( vx1        )-cu_sq);
+   (D.f[DIR_M00])[kw   ] =   c2o27* (drho+c3o1*(-vx1        )+c9o2*(-vx1        )*(-vx1        )-cu_sq);
+   (D.f[DIR_0P0])[kn   ] =   c2o27* (drho+c3o1*(    vx2     )+c9o2*(     vx2    )*(     vx2    )-cu_sq);
+   (D.f[DIR_0M0])[ks   ] =   c2o27* (drho+c3o1*(   -vx2     )+c9o2*(    -vx2    )*(    -vx2    )-cu_sq);
+   (D.f[DIR_00P])[kt   ] =   c2o27* (drho+c3o1*(         vx3)+c9o2*(         vx3)*(         vx3)-cu_sq);
+   (D.f[DIR_00M])[kb   ] =   c2o27* (drho+c3o1*(        -vx3)+c9o2*(        -vx3)*(        -vx3)-cu_sq);
+   (D.f[DIR_PP0])[kne  ] =   c1o54* (drho+c3o1*( vx1+vx2    )+c9o2*( vx1+vx2    )*( vx1+vx2    )-cu_sq);
+   (D.f[DIR_MM0])[ksw  ] =   c1o54* (drho+c3o1*(-vx1-vx2    )+c9o2*(-vx1-vx2    )*(-vx1-vx2    )-cu_sq);
+   (D.f[DIR_PM0])[kse  ] =   c1o54* (drho+c3o1*( vx1-vx2    )+c9o2*( vx1-vx2    )*( vx1-vx2    )-cu_sq);
+   (D.f[DIR_MP0])[knw  ] =   c1o54* (drho+c3o1*(-vx1+vx2    )+c9o2*(-vx1+vx2    )*(-vx1+vx2    )-cu_sq);
+   (D.f[DIR_P0P])[kte  ] =   c1o54* (drho+c3o1*( vx1    +vx3)+c9o2*( vx1    +vx3)*( vx1    +vx3)-cu_sq);
+   (D.f[DIR_M0M])[kbw  ] =   c1o54* (drho+c3o1*(-vx1    -vx3)+c9o2*(-vx1    -vx3)*(-vx1    -vx3)-cu_sq);
+   (D.f[DIR_P0M])[kbe  ] =   c1o54* (drho+c3o1*( vx1    -vx3)+c9o2*( vx1    -vx3)*( vx1    -vx3)-cu_sq);
+   (D.f[DIR_M0P])[ktw  ] =   c1o54* (drho+c3o1*(-vx1    +vx3)+c9o2*(-vx1    +vx3)*(-vx1    +vx3)-cu_sq);
+   (D.f[DIR_0PP])[ktn  ] =   c1o54* (drho+c3o1*(     vx2+vx3)+c9o2*(     vx2+vx3)*(     vx2+vx3)-cu_sq);
+   (D.f[DIR_0MM])[kbs  ] =   c1o54* (drho+c3o1*(    -vx2-vx3)+c9o2*(    -vx2-vx3)*(    -vx2-vx3)-cu_sq);
+   (D.f[DIR_0PM])[kbn  ] =   c1o54* (drho+c3o1*(     vx2-vx3)+c9o2*(     vx2-vx3)*(     vx2-vx3)-cu_sq);
+   (D.f[DIR_0MP])[kts  ] =   c1o54* (drho+c3o1*(    -vx2+vx3)+c9o2*(    -vx2+vx3)*(    -vx2+vx3)-cu_sq);
+   (D.f[DIR_PPP])[ktne ] =   c1o216*(drho+c3o1*( vx1+vx2+vx3)+c9o2*( vx1+vx2+vx3)*( vx1+vx2+vx3)-cu_sq);
+   (D.f[DIR_MMM])[kbsw ] =   c1o216*(drho+c3o1*(-vx1-vx2-vx3)+c9o2*(-vx1-vx2-vx3)*(-vx1-vx2-vx3)-cu_sq);
+   (D.f[DIR_PPM])[kbne ] =   c1o216*(drho+c3o1*( vx1+vx2-vx3)+c9o2*( vx1+vx2-vx3)*( vx1+vx2-vx3)-cu_sq);
+   (D.f[DIR_MMP])[ktsw ] =   c1o216*(drho+c3o1*(-vx1-vx2+vx3)+c9o2*(-vx1-vx2+vx3)*(-vx1-vx2+vx3)-cu_sq);
+   (D.f[DIR_PMP])[ktse ] =   c1o216*(drho+c3o1*( vx1-vx2+vx3)+c9o2*( vx1-vx2+vx3)*( vx1-vx2+vx3)-cu_sq);
+   (D.f[DIR_MPM])[kbnw ] =   c1o216*(drho+c3o1*(-vx1+vx2-vx3)+c9o2*(-vx1+vx2-vx3)*(-vx1+vx2-vx3)-cu_sq);
+   (D.f[DIR_PMM])[kbse ] =   c1o216*(drho+c3o1*( vx1-vx2-vx3)+c9o2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cu_sq);
+   (D.f[DIR_MPP])[ktnw ] =   c1o216*(drho+c3o1*(-vx1+vx2+vx3)+c9o2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cu_sq);
 
 }
 ////////////////////////////////////////////////////////////////////////////////
@@ -191,7 +191,7 @@ __global__ void LBInitNonEqPartSP27( unsigned int* neighborX,
                                                 real* ux,
                                                 real* uy,
                                                 real* uz,
-                                                unsigned int size_Mat,
+                                                unsigned long long numberOfLBnodes,
                                                 real* DD,
                                                 real omega,
                                                 bool EvenOrOdd)
@@ -207,7 +207,7 @@ __global__ void LBInitNonEqPartSP27( unsigned int* neighborX,
     const unsigned k = nx*(ny*z + y) + x;
     //////////////////////////////////////////////////////////////////////////
     
-    if(k<size_Mat)
+    if(k<numberOfLBnodes)
     {
         ////////////////////////////////////////////////////////////////////////////////
         unsigned int BC;
@@ -218,63 +218,63 @@ __global__ void LBInitNonEqPartSP27( unsigned int* neighborX,
             Distributions27 D;
             if (EvenOrOdd==true)
             {
-                D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-                D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-                D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-                D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-                D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-                D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-                D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-                D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-                D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-                D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-                D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-                D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-                D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-                D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-                D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-                D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-                D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-                D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-                D.f[DIR_000] = &DD[DIR_000*size_Mat];
-                D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-                D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-                D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-                D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-                D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-                D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-                D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-                D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+                D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+                D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+                D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+                D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+                D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+                D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+                D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+                D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+                D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+                D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+                D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+                D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+                D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+                D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+                D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+                D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+                D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+                D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+                D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+                D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+                D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+                D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+                D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+                D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+                D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+                D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+                D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
             }
             else
             {
-                D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-                D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-                D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-                D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-                D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-                D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-                D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-                D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-                D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-                D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-                D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-                D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-                D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-                D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-                D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-                D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-                D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-                D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-                D.f[DIR_000] = &DD[DIR_000*size_Mat];
-                D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-                D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-                D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
-                D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-                D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-                D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-                D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-                D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
+                D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+                D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+                D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+                D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+                D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+                D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+                D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+                D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+                D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+                D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+                D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+                D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+                D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+                D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+                D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+                D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+                D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+                D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+                D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+                D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+                D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+                D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
+                D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+                D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+                D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+                D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+                D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
             }
             //////////////////////////////////////////////////////////////////////////
             real drho = rho[k];//0.0f;//
@@ -397,62 +397,62 @@ __global__ void LBInitNonEqPartSP27( unsigned int* neighborX,
             real cu_sq=c3o2*(vx1*vx1+vx2*vx2+vx3*vx3);
             
             (D.f[DIR_000])[kzero] =   c8o27* (drho-cu_sq);
-            (D.f[DIR_P00   ])[ke   ] =   c2o27* (drho+c3o1*( vx1        )+c9o2*( vx1        )*( vx1        )-cu_sq);
-            (D.f[DIR_M00   ])[kw   ] =   c2o27* (drho+c3o1*(-vx1        )+c9o2*(-vx1        )*(-vx1        )-cu_sq);
-            (D.f[DIR_0P0   ])[kn   ] =   c2o27* (drho+c3o1*(    vx2     )+c9o2*(     vx2    )*(     vx2    )-cu_sq);
-            (D.f[DIR_0M0   ])[ks   ] =   c2o27* (drho+c3o1*(   -vx2     )+c9o2*(    -vx2    )*(    -vx2    )-cu_sq);
-            (D.f[DIR_00P   ])[kt   ] =   c2o27* (drho+c3o1*(         vx3)+c9o2*(         vx3)*(         vx3)-cu_sq);
-            (D.f[DIR_00M   ])[kb   ] =   c2o27* (drho+c3o1*(        -vx3)+c9o2*(        -vx3)*(        -vx3)-cu_sq);
-            (D.f[DIR_PP0  ])[kne  ] =   c1o54* (drho+c3o1*( vx1+vx2    )+c9o2*( vx1+vx2    )*( vx1+vx2    )-cu_sq);
-            (D.f[DIR_MM0  ])[ksw  ] =   c1o54* (drho+c3o1*(-vx1-vx2    )+c9o2*(-vx1-vx2    )*(-vx1-vx2    )-cu_sq);
-            (D.f[DIR_PM0  ])[kse  ] =   c1o54* (drho+c3o1*( vx1-vx2    )+c9o2*( vx1-vx2    )*( vx1-vx2    )-cu_sq);
-            (D.f[DIR_MP0  ])[knw  ] =   c1o54* (drho+c3o1*(-vx1+vx2    )+c9o2*(-vx1+vx2    )*(-vx1+vx2    )-cu_sq);
-            (D.f[DIR_P0P  ])[kte  ] =   c1o54* (drho+c3o1*( vx1    +vx3)+c9o2*( vx1    +vx3)*( vx1    +vx3)-cu_sq);
-            (D.f[DIR_M0M  ])[kbw  ] =   c1o54* (drho+c3o1*(-vx1    -vx3)+c9o2*(-vx1    -vx3)*(-vx1    -vx3)-cu_sq);
-            (D.f[DIR_P0M  ])[kbe  ] =   c1o54* (drho+c3o1*( vx1    -vx3)+c9o2*( vx1    -vx3)*( vx1    -vx3)-cu_sq);
-            (D.f[DIR_M0P  ])[ktw  ] =   c1o54* (drho+c3o1*(-vx1    +vx3)+c9o2*(-vx1    +vx3)*(-vx1    +vx3)-cu_sq);
-            (D.f[DIR_0PP  ])[ktn  ] =   c1o54* (drho+c3o1*(     vx2+vx3)+c9o2*(     vx2+vx3)*(     vx2+vx3)-cu_sq);
-            (D.f[DIR_0MM  ])[kbs  ] =   c1o54* (drho+c3o1*(    -vx2-vx3)+c9o2*(    -vx2-vx3)*(    -vx2-vx3)-cu_sq);
-            (D.f[DIR_0PM  ])[kbn  ] =   c1o54* (drho+c3o1*(     vx2-vx3)+c9o2*(     vx2-vx3)*(     vx2-vx3)-cu_sq);
-            (D.f[DIR_0MP  ])[kts  ] =   c1o54* (drho+c3o1*(    -vx2+vx3)+c9o2*(    -vx2+vx3)*(    -vx2+vx3)-cu_sq);
-            (D.f[DIR_PPP ])[ktne ] =   c1o216*(drho+c3o1*( vx1+vx2+vx3)+c9o2*( vx1+vx2+vx3)*( vx1+vx2+vx3)-cu_sq);
-            (D.f[DIR_MMM ])[kbsw ] =   c1o216*(drho+c3o1*(-vx1-vx2-vx3)+c9o2*(-vx1-vx2-vx3)*(-vx1-vx2-vx3)-cu_sq);
-            (D.f[DIR_PPM ])[kbne ] =   c1o216*(drho+c3o1*( vx1+vx2-vx3)+c9o2*( vx1+vx2-vx3)*( vx1+vx2-vx3)-cu_sq);
-            (D.f[DIR_MMP ])[ktsw ] =   c1o216*(drho+c3o1*(-vx1-vx2+vx3)+c9o2*(-vx1-vx2+vx3)*(-vx1-vx2+vx3)-cu_sq);
-            (D.f[DIR_PMP ])[ktse ] =   c1o216*(drho+c3o1*( vx1-vx2+vx3)+c9o2*( vx1-vx2+vx3)*( vx1-vx2+vx3)-cu_sq);
-            (D.f[DIR_MPM ])[kbnw ] =   c1o216*(drho+c3o1*(-vx1+vx2-vx3)+c9o2*(-vx1+vx2-vx3)*(-vx1+vx2-vx3)-cu_sq);
-            (D.f[DIR_PMM ])[kbse ] =   c1o216*(drho+c3o1*( vx1-vx2-vx3)+c9o2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cu_sq);
-            (D.f[DIR_MPP ])[ktnw ] =   c1o216*(drho+c3o1*(-vx1+vx2+vx3)+c9o2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cu_sq);
+            (D.f[DIR_P00])[ke   ] =   c2o27* (drho+c3o1*( vx1        )+c9o2*( vx1        )*( vx1        )-cu_sq);
+            (D.f[DIR_M00])[kw   ] =   c2o27* (drho+c3o1*(-vx1        )+c9o2*(-vx1        )*(-vx1        )-cu_sq);
+            (D.f[DIR_0P0])[kn   ] =   c2o27* (drho+c3o1*(    vx2     )+c9o2*(     vx2    )*(     vx2    )-cu_sq);
+            (D.f[DIR_0M0])[ks   ] =   c2o27* (drho+c3o1*(   -vx2     )+c9o2*(    -vx2    )*(    -vx2    )-cu_sq);
+            (D.f[DIR_00P])[kt   ] =   c2o27* (drho+c3o1*(         vx3)+c9o2*(         vx3)*(         vx3)-cu_sq);
+            (D.f[DIR_00M])[kb   ] =   c2o27* (drho+c3o1*(        -vx3)+c9o2*(        -vx3)*(        -vx3)-cu_sq);
+            (D.f[DIR_PP0])[kne  ] =   c1o54* (drho+c3o1*( vx1+vx2    )+c9o2*( vx1+vx2    )*( vx1+vx2    )-cu_sq);
+            (D.f[DIR_MM0])[ksw  ] =   c1o54* (drho+c3o1*(-vx1-vx2    )+c9o2*(-vx1-vx2    )*(-vx1-vx2    )-cu_sq);
+            (D.f[DIR_PM0])[kse  ] =   c1o54* (drho+c3o1*( vx1-vx2    )+c9o2*( vx1-vx2    )*( vx1-vx2    )-cu_sq);
+            (D.f[DIR_MP0])[knw  ] =   c1o54* (drho+c3o1*(-vx1+vx2    )+c9o2*(-vx1+vx2    )*(-vx1+vx2    )-cu_sq);
+            (D.f[DIR_P0P])[kte  ] =   c1o54* (drho+c3o1*( vx1    +vx3)+c9o2*( vx1    +vx3)*( vx1    +vx3)-cu_sq);
+            (D.f[DIR_M0M])[kbw  ] =   c1o54* (drho+c3o1*(-vx1    -vx3)+c9o2*(-vx1    -vx3)*(-vx1    -vx3)-cu_sq);
+            (D.f[DIR_P0M])[kbe  ] =   c1o54* (drho+c3o1*( vx1    -vx3)+c9o2*( vx1    -vx3)*( vx1    -vx3)-cu_sq);
+            (D.f[DIR_M0P])[ktw  ] =   c1o54* (drho+c3o1*(-vx1    +vx3)+c9o2*(-vx1    +vx3)*(-vx1    +vx3)-cu_sq);
+            (D.f[DIR_0PP])[ktn  ] =   c1o54* (drho+c3o1*(     vx2+vx3)+c9o2*(     vx2+vx3)*(     vx2+vx3)-cu_sq);
+            (D.f[DIR_0MM])[kbs  ] =   c1o54* (drho+c3o1*(    -vx2-vx3)+c9o2*(    -vx2-vx3)*(    -vx2-vx3)-cu_sq);
+            (D.f[DIR_0PM])[kbn  ] =   c1o54* (drho+c3o1*(     vx2-vx3)+c9o2*(     vx2-vx3)*(     vx2-vx3)-cu_sq);
+            (D.f[DIR_0MP])[kts  ] =   c1o54* (drho+c3o1*(    -vx2+vx3)+c9o2*(    -vx2+vx3)*(    -vx2+vx3)-cu_sq);
+            (D.f[DIR_PPP])[ktne ] =   c1o216*(drho+c3o1*( vx1+vx2+vx3)+c9o2*( vx1+vx2+vx3)*( vx1+vx2+vx3)-cu_sq);
+            (D.f[DIR_MMM])[kbsw ] =   c1o216*(drho+c3o1*(-vx1-vx2-vx3)+c9o2*(-vx1-vx2-vx3)*(-vx1-vx2-vx3)-cu_sq);
+            (D.f[DIR_PPM])[kbne ] =   c1o216*(drho+c3o1*( vx1+vx2-vx3)+c9o2*( vx1+vx2-vx3)*( vx1+vx2-vx3)-cu_sq);
+            (D.f[DIR_MMP])[ktsw ] =   c1o216*(drho+c3o1*(-vx1-vx2+vx3)+c9o2*(-vx1-vx2+vx3)*(-vx1-vx2+vx3)-cu_sq);
+            (D.f[DIR_PMP])[ktse ] =   c1o216*(drho+c3o1*( vx1-vx2+vx3)+c9o2*( vx1-vx2+vx3)*( vx1-vx2+vx3)-cu_sq);
+            (D.f[DIR_MPM])[kbnw ] =   c1o216*(drho+c3o1*(-vx1+vx2-vx3)+c9o2*(-vx1+vx2-vx3)*(-vx1+vx2-vx3)-cu_sq);
+            (D.f[DIR_PMM])[kbse ] =   c1o216*(drho+c3o1*( vx1-vx2-vx3)+c9o2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cu_sq);
+            (D.f[DIR_MPP])[ktnw ] =   c1o216*(drho+c3o1*(-vx1+vx2+vx3)+c9o2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cu_sq);
 
             //////////////////////////////////////////////////////////////////////////
 
             (D.f[DIR_000])[kzero] += f_ZERO;
-            (D.f[DIR_P00   ])[ke   ] += f_E   ;
-            (D.f[DIR_M00   ])[kw   ] += f_E   ;
-            (D.f[DIR_0P0   ])[kn   ] += f_N   ;
-            (D.f[DIR_0M0   ])[ks   ] += f_N   ;
-            (D.f[DIR_00P   ])[kt   ] += f_T   ;
-            (D.f[DIR_00M   ])[kb   ] += f_T   ;
-            (D.f[DIR_PP0  ])[kne  ] += f_NE  ;
-            (D.f[DIR_MM0  ])[ksw  ] += f_NE  ;
-            (D.f[DIR_PM0  ])[kse  ] += f_SE  ;
-            (D.f[DIR_MP0  ])[knw  ] += f_SE  ;
-            (D.f[DIR_P0P  ])[kte  ] += f_TE  ;
-            (D.f[DIR_M0M  ])[kbw  ] += f_TE  ;
-            (D.f[DIR_P0M  ])[kbe  ] += f_BE  ;
-            (D.f[DIR_M0P  ])[ktw  ] += f_BE  ;
-            (D.f[DIR_0PP  ])[ktn  ] += f_TN  ;
-            (D.f[DIR_0MM  ])[kbs  ] += f_TN  ;
-            (D.f[DIR_0PM  ])[kbn  ] += f_BN  ;
-            (D.f[DIR_0MP  ])[kts  ] += f_BN  ;
-            (D.f[DIR_PPP ])[ktne ] += f_TNE ;
-            (D.f[DIR_MMM ])[kbsw ] += f_TNE ;
-            (D.f[DIR_PPM ])[kbne ] += f_TSW ;
-            (D.f[DIR_MMP ])[ktsw ] += f_TSW ;
-            (D.f[DIR_PMP ])[ktse ] += f_TSE ;
-            (D.f[DIR_MPM ])[kbnw ] += f_TSE ;
-            (D.f[DIR_PMM ])[kbse ] += f_TNW ;
-            (D.f[DIR_MPP ])[ktnw ] += f_TNW ;
+            (D.f[DIR_P00])[ke   ] += f_E   ;
+            (D.f[DIR_M00])[kw   ] += f_E   ;
+            (D.f[DIR_0P0])[kn   ] += f_N   ;
+            (D.f[DIR_0M0])[ks   ] += f_N   ;
+            (D.f[DIR_00P])[kt   ] += f_T   ;
+            (D.f[DIR_00M])[kb   ] += f_T   ;
+            (D.f[DIR_PP0])[kne  ] += f_NE  ;
+            (D.f[DIR_MM0])[ksw  ] += f_NE  ;
+            (D.f[DIR_PM0])[kse  ] += f_SE  ;
+            (D.f[DIR_MP0])[knw  ] += f_SE  ;
+            (D.f[DIR_P0P])[kte  ] += f_TE  ;
+            (D.f[DIR_M0M])[kbw  ] += f_TE  ;
+            (D.f[DIR_P0M])[kbe  ] += f_BE  ;
+            (D.f[DIR_M0P])[ktw  ] += f_BE  ;
+            (D.f[DIR_0PP])[ktn  ] += f_TN  ;
+            (D.f[DIR_0MM])[kbs  ] += f_TN  ;
+            (D.f[DIR_0PM])[kbn  ] += f_BN  ;
+            (D.f[DIR_0MP])[kts  ] += f_BN  ;
+            (D.f[DIR_PPP])[ktne ] += f_TNE ;
+            (D.f[DIR_MMM])[kbsw ] += f_TNE ;
+            (D.f[DIR_PPM])[kbne ] += f_TSW ;
+            (D.f[DIR_MMP])[ktsw ] += f_TSW ;
+            (D.f[DIR_PMP])[ktse ] += f_TSE ;
+            (D.f[DIR_MPM])[kbnw ] += f_TSE ;
+            (D.f[DIR_PMM])[kbse ] += f_TNW ;
+            (D.f[DIR_MPP])[ktnw ] += f_TNW ;
 
             //////////////////////////////////////////////////////////////////////////
         }
@@ -460,7 +460,7 @@ __global__ void LBInitNonEqPartSP27( unsigned int* neighborX,
 	    {
 		    //////////////////////////////////////////////////////////////////////////
 		    Distributions27 D;
-		    D.f[DIR_000] = &DD[DIR_000*size_Mat];
+		    D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
 		    //////////////////////////////////////////////////////////////////////////
 		    (D.f[DIR_000])[k] = c96o1;
 		    //////////////////////////////////////////////////////////////////////////
diff --git a/src/gpu/VirtualFluids_GPU/GPU/InitAdvectionDiffusion27.cu b/src/gpu/VirtualFluids_GPU/GPU/InitAdvectionDiffusion27.cu
index c091aa8b9a29017ddc0f6ea6584e805d7afc4859..7f67d1692f7e136a6537be6780fe8625adc33e22 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/InitAdvectionDiffusion27.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/InitAdvectionDiffusion27.cu
@@ -47,7 +47,7 @@ __global__ void InitAD27(
 	real* velocityX,
 	real* velocityY,
 	real* velocityZ,
-	uint size_Mat,
+	unsigned long long numberOfLBnodes,
 	real* distributionsAD,
 	bool isEvenTimestep)
 {
@@ -68,7 +68,7 @@ __global__ void InitAD27(
 
 	//////////////////////////////////////////////////////////////////////////
-	// run for all indices in size_Mat and fluid nodes
+	// run for all indices below numberOfLBnodes that are fluid nodes
-	if ((k < size_Mat) && (typeOfGridNode[k] == GEO_FLUID))
+	if ((k < numberOfLBnodes) && (typeOfGridNode[k] == GEO_FLUID))
 	{
 		//////////////////////////////////////////////////////////////////////////
 		//! - Read distributions: style of reading and writing the distributions from/to stored arrays dependent on timestep is based on the esoteric twist algorithm \ref
@@ -77,63 +77,63 @@ __global__ void InitAD27(
 		Distributions27 distAD;
 		if (isEvenTimestep)
 		{
-			distAD.f[DIR_P00   ] = &distributionsAD[DIR_P00   *size_Mat];
-			distAD.f[DIR_M00   ] = &distributionsAD[DIR_M00   *size_Mat];
-			distAD.f[DIR_0P0   ] = &distributionsAD[DIR_0P0   *size_Mat];
-			distAD.f[DIR_0M0   ] = &distributionsAD[DIR_0M0   *size_Mat];
-			distAD.f[DIR_00P   ] = &distributionsAD[DIR_00P   *size_Mat];
-			distAD.f[DIR_00M   ] = &distributionsAD[DIR_00M   *size_Mat];
-			distAD.f[DIR_PP0  ] = &distributionsAD[DIR_PP0  *size_Mat];
-			distAD.f[DIR_MM0  ] = &distributionsAD[DIR_MM0  *size_Mat];
-			distAD.f[DIR_PM0  ] = &distributionsAD[DIR_PM0  *size_Mat];
-			distAD.f[DIR_MP0  ] = &distributionsAD[DIR_MP0  *size_Mat];
-			distAD.f[DIR_P0P  ] = &distributionsAD[DIR_P0P  *size_Mat];
-			distAD.f[DIR_M0M  ] = &distributionsAD[DIR_M0M  *size_Mat];
-			distAD.f[DIR_P0M  ] = &distributionsAD[DIR_P0M  *size_Mat];
-			distAD.f[DIR_M0P  ] = &distributionsAD[DIR_M0P  *size_Mat];
-			distAD.f[DIR_0PP  ] = &distributionsAD[DIR_0PP  *size_Mat];
-			distAD.f[DIR_0MM  ] = &distributionsAD[DIR_0MM  *size_Mat];
-			distAD.f[DIR_0PM  ] = &distributionsAD[DIR_0PM  *size_Mat];
-			distAD.f[DIR_0MP  ] = &distributionsAD[DIR_0MP  *size_Mat];
-			distAD.f[DIR_000] = &distributionsAD[DIR_000*size_Mat];
-			distAD.f[DIR_PPP ] = &distributionsAD[DIR_PPP *size_Mat];
-			distAD.f[DIR_MMP ] = &distributionsAD[DIR_MMP *size_Mat];
-			distAD.f[DIR_PMP ] = &distributionsAD[DIR_PMP *size_Mat];
-			distAD.f[DIR_MPP ] = &distributionsAD[DIR_MPP *size_Mat];
-			distAD.f[DIR_PPM ] = &distributionsAD[DIR_PPM *size_Mat];
-			distAD.f[DIR_MMM ] = &distributionsAD[DIR_MMM *size_Mat];
-			distAD.f[DIR_PMM ] = &distributionsAD[DIR_PMM *size_Mat];
-			distAD.f[DIR_MPM ] = &distributionsAD[DIR_MPM *size_Mat];
+			distAD.f[DIR_P00] = &distributionsAD[DIR_P00 * numberOfLBnodes];
+			distAD.f[DIR_M00] = &distributionsAD[DIR_M00 * numberOfLBnodes];
+			distAD.f[DIR_0P0] = &distributionsAD[DIR_0P0 * numberOfLBnodes];
+			distAD.f[DIR_0M0] = &distributionsAD[DIR_0M0 * numberOfLBnodes];
+			distAD.f[DIR_00P] = &distributionsAD[DIR_00P * numberOfLBnodes];
+			distAD.f[DIR_00M] = &distributionsAD[DIR_00M * numberOfLBnodes];
+			distAD.f[DIR_PP0] = &distributionsAD[DIR_PP0 * numberOfLBnodes];
+			distAD.f[DIR_MM0] = &distributionsAD[DIR_MM0 * numberOfLBnodes];
+			distAD.f[DIR_PM0] = &distributionsAD[DIR_PM0 * numberOfLBnodes];
+			distAD.f[DIR_MP0] = &distributionsAD[DIR_MP0 * numberOfLBnodes];
+			distAD.f[DIR_P0P] = &distributionsAD[DIR_P0P * numberOfLBnodes];
+			distAD.f[DIR_M0M] = &distributionsAD[DIR_M0M * numberOfLBnodes];
+			distAD.f[DIR_P0M] = &distributionsAD[DIR_P0M * numberOfLBnodes];
+			distAD.f[DIR_M0P] = &distributionsAD[DIR_M0P * numberOfLBnodes];
+			distAD.f[DIR_0PP] = &distributionsAD[DIR_0PP * numberOfLBnodes];
+			distAD.f[DIR_0MM] = &distributionsAD[DIR_0MM * numberOfLBnodes];
+			distAD.f[DIR_0PM] = &distributionsAD[DIR_0PM * numberOfLBnodes];
+			distAD.f[DIR_0MP] = &distributionsAD[DIR_0MP * numberOfLBnodes];
+			distAD.f[DIR_000] = &distributionsAD[DIR_000 * numberOfLBnodes];
+			distAD.f[DIR_PPP] = &distributionsAD[DIR_PPP * numberOfLBnodes];
+			distAD.f[DIR_MMP] = &distributionsAD[DIR_MMP * numberOfLBnodes];
+			distAD.f[DIR_PMP] = &distributionsAD[DIR_PMP * numberOfLBnodes];
+			distAD.f[DIR_MPP] = &distributionsAD[DIR_MPP * numberOfLBnodes];
+			distAD.f[DIR_PPM] = &distributionsAD[DIR_PPM * numberOfLBnodes];
+			distAD.f[DIR_MMM] = &distributionsAD[DIR_MMM * numberOfLBnodes];
+			distAD.f[DIR_PMM] = &distributionsAD[DIR_PMM * numberOfLBnodes];
+			distAD.f[DIR_MPM] = &distributionsAD[DIR_MPM * numberOfLBnodes];
 		}
 		else
 		{
-			distAD.f[DIR_M00   ] = &distributionsAD[DIR_P00   *size_Mat];
-			distAD.f[DIR_P00   ] = &distributionsAD[DIR_M00   *size_Mat];
-			distAD.f[DIR_0M0   ] = &distributionsAD[DIR_0P0   *size_Mat];
-			distAD.f[DIR_0P0   ] = &distributionsAD[DIR_0M0   *size_Mat];
-			distAD.f[DIR_00M   ] = &distributionsAD[DIR_00P   *size_Mat];
-			distAD.f[DIR_00P   ] = &distributionsAD[DIR_00M   *size_Mat];
-			distAD.f[DIR_MM0  ] = &distributionsAD[DIR_PP0  *size_Mat];
-			distAD.f[DIR_PP0  ] = &distributionsAD[DIR_MM0  *size_Mat];
-			distAD.f[DIR_MP0  ] = &distributionsAD[DIR_PM0  *size_Mat];
-			distAD.f[DIR_PM0  ] = &distributionsAD[DIR_MP0  *size_Mat];
-			distAD.f[DIR_M0M  ] = &distributionsAD[DIR_P0P  *size_Mat];
-			distAD.f[DIR_P0P  ] = &distributionsAD[DIR_M0M  *size_Mat];
-			distAD.f[DIR_M0P  ] = &distributionsAD[DIR_P0M  *size_Mat];
-			distAD.f[DIR_P0M  ] = &distributionsAD[DIR_M0P  *size_Mat];
-			distAD.f[DIR_0MM  ] = &distributionsAD[DIR_0PP  *size_Mat];
-			distAD.f[DIR_0PP  ] = &distributionsAD[DIR_0MM  *size_Mat];
-			distAD.f[DIR_0MP  ] = &distributionsAD[DIR_0PM  *size_Mat];
-			distAD.f[DIR_0PM  ] = &distributionsAD[DIR_0MP  *size_Mat];
-			distAD.f[DIR_000] = &distributionsAD[DIR_000*size_Mat];
-			distAD.f[DIR_MMM ] = &distributionsAD[DIR_PPP *size_Mat];
-			distAD.f[DIR_PPM ] = &distributionsAD[DIR_MMP *size_Mat];
-			distAD.f[DIR_MPM ] = &distributionsAD[DIR_PMP *size_Mat];
-			distAD.f[DIR_PMM ] = &distributionsAD[DIR_MPP *size_Mat];
-			distAD.f[DIR_MMP ] = &distributionsAD[DIR_PPM *size_Mat];
-			distAD.f[DIR_PPP ] = &distributionsAD[DIR_MMM *size_Mat];
-			distAD.f[DIR_MPP ] = &distributionsAD[DIR_PMM *size_Mat];
-			distAD.f[DIR_PMP ] = &distributionsAD[DIR_MPM *size_Mat];
+			distAD.f[DIR_M00] = &distributionsAD[DIR_P00 * numberOfLBnodes];
+			distAD.f[DIR_P00] = &distributionsAD[DIR_M00 * numberOfLBnodes];
+			distAD.f[DIR_0M0] = &distributionsAD[DIR_0P0 * numberOfLBnodes];
+			distAD.f[DIR_0P0] = &distributionsAD[DIR_0M0 * numberOfLBnodes];
+			distAD.f[DIR_00M] = &distributionsAD[DIR_00P * numberOfLBnodes];
+			distAD.f[DIR_00P] = &distributionsAD[DIR_00M * numberOfLBnodes];
+			distAD.f[DIR_MM0] = &distributionsAD[DIR_PP0 * numberOfLBnodes];
+			distAD.f[DIR_PP0] = &distributionsAD[DIR_MM0 * numberOfLBnodes];
+			distAD.f[DIR_MP0] = &distributionsAD[DIR_PM0 * numberOfLBnodes];
+			distAD.f[DIR_PM0] = &distributionsAD[DIR_MP0 * numberOfLBnodes];
+			distAD.f[DIR_M0M] = &distributionsAD[DIR_P0P * numberOfLBnodes];
+			distAD.f[DIR_P0P] = &distributionsAD[DIR_M0M * numberOfLBnodes];
+			distAD.f[DIR_M0P] = &distributionsAD[DIR_P0M * numberOfLBnodes];
+			distAD.f[DIR_P0M] = &distributionsAD[DIR_M0P * numberOfLBnodes];
+			distAD.f[DIR_0MM] = &distributionsAD[DIR_0PP * numberOfLBnodes];
+			distAD.f[DIR_0PP] = &distributionsAD[DIR_0MM * numberOfLBnodes];
+			distAD.f[DIR_0MP] = &distributionsAD[DIR_0PM * numberOfLBnodes];
+			distAD.f[DIR_0PM] = &distributionsAD[DIR_0MP * numberOfLBnodes];
+			distAD.f[DIR_000] = &distributionsAD[DIR_000 * numberOfLBnodes];
+			distAD.f[DIR_MMM] = &distributionsAD[DIR_PPP * numberOfLBnodes];
+			distAD.f[DIR_PPM] = &distributionsAD[DIR_MMP * numberOfLBnodes];
+			distAD.f[DIR_MPM] = &distributionsAD[DIR_PMP * numberOfLBnodes];
+			distAD.f[DIR_PMM] = &distributionsAD[DIR_MPP * numberOfLBnodes];
+			distAD.f[DIR_MMP] = &distributionsAD[DIR_PPM * numberOfLBnodes];
+			distAD.f[DIR_PPP] = &distributionsAD[DIR_MMM * numberOfLBnodes];
+			distAD.f[DIR_MPP] = &distributionsAD[DIR_PMM * numberOfLBnodes];
+			distAD.f[DIR_PMP] = &distributionsAD[DIR_MPM * numberOfLBnodes];
 		}
 		//////////////////////////////////////////////////////////////////////////
-		//! - Set local velocities and concetration
+		//! - Set local velocities and concentration
@@ -178,32 +178,32 @@ __global__ void InitAD27(
 		real cu_sq = c3o2*(vx1*vx1 + vx2*vx2 + vx3*vx3);
 
 		(distAD.f[DIR_000])[kzero] = c8o27  * conc * (c1o1 - cu_sq);
-		(distAD.f[DIR_P00   ])[ke   ] = c2o27  * conc * (c1o1 + c3o1 * ( vx1            ) + c9o2 * ( vx1            ) * ( vx1            ) - cu_sq);
-		(distAD.f[DIR_M00   ])[kw   ] = c2o27  * conc * (c1o1 + c3o1 * (-vx1            ) + c9o2 * (-vx1            ) * (-vx1            ) - cu_sq);
-		(distAD.f[DIR_0P0   ])[kn   ] = c2o27  * conc * (c1o1 + c3o1 * (       vx2      ) + c9o2 * (       vx2      ) * (       vx2      ) - cu_sq);
-		(distAD.f[DIR_0M0   ])[ks   ] = c2o27  * conc * (c1o1 + c3o1 * (     - vx2      ) + c9o2 * (     - vx2      ) * (     - vx2      ) - cu_sq);
-		(distAD.f[DIR_00P   ])[kt   ] = c2o27  * conc * (c1o1 + c3o1 * (             vx3) + c9o2 * (             vx3) * (             vx3) - cu_sq);
-		(distAD.f[DIR_00M   ])[kb   ] = c2o27  * conc * (c1o1 + c3o1 * (           - vx3) + c9o2 * (           - vx3) * (           - vx3) - cu_sq);
-		(distAD.f[DIR_PP0  ])[kne  ] = c1o54  * conc * (c1o1 + c3o1 * ( vx1 + vx2      ) + c9o2 * ( vx1 + vx2      ) * ( vx1 + vx2      ) - cu_sq);
-		(distAD.f[DIR_MM0  ])[ksw  ] = c1o54  * conc * (c1o1 + c3o1 * (-vx1 - vx2      ) + c9o2 * (-vx1 - vx2      ) * (-vx1 - vx2      ) - cu_sq);
-		(distAD.f[DIR_PM0  ])[kse  ] = c1o54  * conc * (c1o1 + c3o1 * ( vx1 - vx2      ) + c9o2 * ( vx1 - vx2      ) * ( vx1 - vx2      ) - cu_sq);
-		(distAD.f[DIR_MP0  ])[knw  ] = c1o54  * conc * (c1o1 + c3o1 * (-vx1 + vx2      ) + c9o2 * (-vx1 + vx2      ) * (-vx1 + vx2      ) - cu_sq);
-		(distAD.f[DIR_P0P  ])[kte  ] = c1o54  * conc * (c1o1 + c3o1 * ( vx1       + vx3) + c9o2 * ( vx1       + vx3) * ( vx1       + vx3) - cu_sq);
-		(distAD.f[DIR_M0M  ])[kbw  ] = c1o54  * conc * (c1o1 + c3o1 * (-vx1       - vx3) + c9o2 * (-vx1       - vx3) * (-vx1       - vx3) - cu_sq);
-		(distAD.f[DIR_P0M  ])[kbe  ] = c1o54  * conc * (c1o1 + c3o1 * ( vx1       - vx3) + c9o2 * ( vx1       - vx3) * ( vx1       - vx3) - cu_sq);
-		(distAD.f[DIR_M0P  ])[ktw  ] = c1o54  * conc * (c1o1 + c3o1 * (-vx1       + vx3) + c9o2 * (-vx1       + vx3) * (-vx1       + vx3) - cu_sq);
-		(distAD.f[DIR_0PP  ])[ktn  ] = c1o54  * conc * (c1o1 + c3o1 * (       vx2 + vx3) + c9o2 * (       vx2 + vx3) * (       vx2 + vx3) - cu_sq);
-		(distAD.f[DIR_0MM  ])[kbs  ] = c1o54  * conc * (c1o1 + c3o1 * (     - vx2 - vx3) + c9o2 * (     - vx2 - vx3) * (     - vx2 - vx3) - cu_sq);
-		(distAD.f[DIR_0PM  ])[kbn  ] = c1o54  * conc * (c1o1 + c3o1 * (       vx2 - vx3) + c9o2 * (       vx2 - vx3) * (       vx2 - vx3) - cu_sq);
-		(distAD.f[DIR_0MP  ])[kts  ] = c1o54  * conc * (c1o1 + c3o1 * (     - vx2 + vx3) + c9o2 * (     - vx2 + vx3) * (     - vx2 + vx3) - cu_sq);
-		(distAD.f[DIR_PPP ])[ktne ] = c1o216 * conc * (c1o1 + c3o1 * ( vx1 + vx2 + vx3) + c9o2 * ( vx1 + vx2 + vx3) * ( vx1 + vx2 + vx3) - cu_sq);
-		(distAD.f[DIR_MMM ])[kbsw ] = c1o216 * conc * (c1o1 + c3o1 * (-vx1 - vx2 - vx3) + c9o2 * (-vx1 - vx2 - vx3) * (-vx1 - vx2 - vx3) - cu_sq);
-		(distAD.f[DIR_PPM ])[kbne ] = c1o216 * conc * (c1o1 + c3o1 * ( vx1 + vx2 - vx3) + c9o2 * ( vx1 + vx2 - vx3) * ( vx1 + vx2 - vx3) - cu_sq);
-		(distAD.f[DIR_MMP ])[ktsw ] = c1o216 * conc * (c1o1 + c3o1 * (-vx1 - vx2 + vx3) + c9o2 * (-vx1 - vx2 + vx3) * (-vx1 - vx2 + vx3) - cu_sq);
-		(distAD.f[DIR_PMP ])[ktse ] = c1o216 * conc * (c1o1 + c3o1 * ( vx1 - vx2 + vx3) + c9o2 * ( vx1 - vx2 + vx3) * ( vx1 - vx2 + vx3) - cu_sq);
-		(distAD.f[DIR_MPM ])[kbnw ] = c1o216 * conc * (c1o1 + c3o1 * (-vx1 + vx2 - vx3) + c9o2 * (-vx1 + vx2 - vx3) * (-vx1 + vx2 - vx3) - cu_sq);
-		(distAD.f[DIR_PMM ])[kbse ] = c1o216 * conc * (c1o1 + c3o1 * ( vx1 - vx2 - vx3) + c9o2 * ( vx1 - vx2 - vx3) * ( vx1 - vx2 - vx3) - cu_sq);
-		(distAD.f[DIR_MPP ])[ktnw ] = c1o216 * conc * (c1o1 + c3o1 * (-vx1 + vx2 + vx3) + c9o2 * (-vx1 + vx2 + vx3) * (-vx1 + vx2 + vx3) - cu_sq);
+		(distAD.f[DIR_P00])[ke   ] = c2o27  * conc * (c1o1 + c3o1 * ( vx1            ) + c9o2 * ( vx1            ) * ( vx1            ) - cu_sq);
+		(distAD.f[DIR_M00])[kw   ] = c2o27  * conc * (c1o1 + c3o1 * (-vx1            ) + c9o2 * (-vx1            ) * (-vx1            ) - cu_sq);
+		(distAD.f[DIR_0P0])[kn   ] = c2o27  * conc * (c1o1 + c3o1 * (       vx2      ) + c9o2 * (       vx2      ) * (       vx2      ) - cu_sq);
+		(distAD.f[DIR_0M0])[ks   ] = c2o27  * conc * (c1o1 + c3o1 * (     - vx2      ) + c9o2 * (     - vx2      ) * (     - vx2      ) - cu_sq);
+		(distAD.f[DIR_00P])[kt   ] = c2o27  * conc * (c1o1 + c3o1 * (             vx3) + c9o2 * (             vx3) * (             vx3) - cu_sq);
+		(distAD.f[DIR_00M])[kb   ] = c2o27  * conc * (c1o1 + c3o1 * (           - vx3) + c9o2 * (           - vx3) * (           - vx3) - cu_sq);
+		(distAD.f[DIR_PP0])[kne  ] = c1o54  * conc * (c1o1 + c3o1 * ( vx1 + vx2      ) + c9o2 * ( vx1 + vx2      ) * ( vx1 + vx2      ) - cu_sq);
+		(distAD.f[DIR_MM0])[ksw  ] = c1o54  * conc * (c1o1 + c3o1 * (-vx1 - vx2      ) + c9o2 * (-vx1 - vx2      ) * (-vx1 - vx2      ) - cu_sq);
+		(distAD.f[DIR_PM0])[kse  ] = c1o54  * conc * (c1o1 + c3o1 * ( vx1 - vx2      ) + c9o2 * ( vx1 - vx2      ) * ( vx1 - vx2      ) - cu_sq);
+		(distAD.f[DIR_MP0])[knw  ] = c1o54  * conc * (c1o1 + c3o1 * (-vx1 + vx2      ) + c9o2 * (-vx1 + vx2      ) * (-vx1 + vx2      ) - cu_sq);
+		(distAD.f[DIR_P0P])[kte  ] = c1o54  * conc * (c1o1 + c3o1 * ( vx1       + vx3) + c9o2 * ( vx1       + vx3) * ( vx1       + vx3) - cu_sq);
+		(distAD.f[DIR_M0M])[kbw  ] = c1o54  * conc * (c1o1 + c3o1 * (-vx1       - vx3) + c9o2 * (-vx1       - vx3) * (-vx1       - vx3) - cu_sq);
+		(distAD.f[DIR_P0M])[kbe  ] = c1o54  * conc * (c1o1 + c3o1 * ( vx1       - vx3) + c9o2 * ( vx1       - vx3) * ( vx1       - vx3) - cu_sq);
+		(distAD.f[DIR_M0P])[ktw  ] = c1o54  * conc * (c1o1 + c3o1 * (-vx1       + vx3) + c9o2 * (-vx1       + vx3) * (-vx1       + vx3) - cu_sq);
+		(distAD.f[DIR_0PP])[ktn  ] = c1o54  * conc * (c1o1 + c3o1 * (       vx2 + vx3) + c9o2 * (       vx2 + vx3) * (       vx2 + vx3) - cu_sq);
+		(distAD.f[DIR_0MM])[kbs  ] = c1o54  * conc * (c1o1 + c3o1 * (     - vx2 - vx3) + c9o2 * (     - vx2 - vx3) * (     - vx2 - vx3) - cu_sq);
+		(distAD.f[DIR_0PM])[kbn  ] = c1o54  * conc * (c1o1 + c3o1 * (       vx2 - vx3) + c9o2 * (       vx2 - vx3) * (       vx2 - vx3) - cu_sq);
+		(distAD.f[DIR_0MP])[kts  ] = c1o54  * conc * (c1o1 + c3o1 * (     - vx2 + vx3) + c9o2 * (     - vx2 + vx3) * (     - vx2 + vx3) - cu_sq);
+		(distAD.f[DIR_PPP])[ktne ] = c1o216 * conc * (c1o1 + c3o1 * ( vx1 + vx2 + vx3) + c9o2 * ( vx1 + vx2 + vx3) * ( vx1 + vx2 + vx3) - cu_sq);
+		(distAD.f[DIR_MMM])[kbsw ] = c1o216 * conc * (c1o1 + c3o1 * (-vx1 - vx2 - vx3) + c9o2 * (-vx1 - vx2 - vx3) * (-vx1 - vx2 - vx3) - cu_sq);
+		(distAD.f[DIR_PPM])[kbne ] = c1o216 * conc * (c1o1 + c3o1 * ( vx1 + vx2 - vx3) + c9o2 * ( vx1 + vx2 - vx3) * ( vx1 + vx2 - vx3) - cu_sq);
+		(distAD.f[DIR_MMP])[ktsw ] = c1o216 * conc * (c1o1 + c3o1 * (-vx1 - vx2 + vx3) + c9o2 * (-vx1 - vx2 + vx3) * (-vx1 - vx2 + vx3) - cu_sq);
+		(distAD.f[DIR_PMP])[ktse ] = c1o216 * conc * (c1o1 + c3o1 * ( vx1 - vx2 + vx3) + c9o2 * ( vx1 - vx2 + vx3) * ( vx1 - vx2 + vx3) - cu_sq);
+		(distAD.f[DIR_MPM])[kbnw ] = c1o216 * conc * (c1o1 + c3o1 * (-vx1 + vx2 - vx3) + c9o2 * (-vx1 + vx2 - vx3) * (-vx1 + vx2 - vx3) - cu_sq);
+		(distAD.f[DIR_PMM])[kbse ] = c1o216 * conc * (c1o1 + c3o1 * ( vx1 - vx2 - vx3) + c9o2 * ( vx1 - vx2 - vx3) * ( vx1 - vx2 - vx3) - cu_sq);
+		(distAD.f[DIR_MPP])[ktnw ] = c1o216 * conc * (c1o1 + c3o1 * (-vx1 + vx2 + vx3) + c9o2 * (-vx1 + vx2 + vx3) * (-vx1 + vx2 + vx3) - cu_sq);
 	}
 }
 
@@ -263,63 +263,63 @@ __global__ void InitAD27(
 //          Distributions27 D27;
 //          if (EvenOrOdd==true)
 //          {
-//             D27.f[DIR_P00   ] = &DD27[DIR_P00   *size_Mat];
-//             D27.f[DIR_M00   ] = &DD27[DIR_M00   *size_Mat];
-//             D27.f[DIR_0P0   ] = &DD27[DIR_0P0   *size_Mat];
-//             D27.f[DIR_0M0   ] = &DD27[DIR_0M0   *size_Mat];
-//             D27.f[DIR_00P   ] = &DD27[DIR_00P   *size_Mat];
-//             D27.f[DIR_00M   ] = &DD27[DIR_00M   *size_Mat];
-//             D27.f[DIR_PP0  ] = &DD27[DIR_PP0  *size_Mat];
-//             D27.f[DIR_MM0  ] = &DD27[DIR_MM0  *size_Mat];
-//             D27.f[DIR_PM0  ] = &DD27[DIR_PM0  *size_Mat];
-//             D27.f[DIR_MP0  ] = &DD27[DIR_MP0  *size_Mat];
-//             D27.f[DIR_P0P  ] = &DD27[DIR_P0P  *size_Mat];
-//             D27.f[DIR_M0M  ] = &DD27[DIR_M0M  *size_Mat];
-//             D27.f[DIR_P0M  ] = &DD27[DIR_P0M  *size_Mat];
-//             D27.f[DIR_M0P  ] = &DD27[DIR_M0P  *size_Mat];
-//             D27.f[DIR_0PP  ] = &DD27[DIR_0PP  *size_Mat];
-//             D27.f[DIR_0MM  ] = &DD27[DIR_0MM  *size_Mat];
-//             D27.f[DIR_0PM  ] = &DD27[DIR_0PM  *size_Mat];
-//             D27.f[DIR_0MP  ] = &DD27[DIR_0MP  *size_Mat];
-//             D27.f[DIR_000] = &DD27[DIR_000*size_Mat];
-//             D27.f[DIR_PPP ] = &DD27[DIR_PPP *size_Mat];
-//             D27.f[DIR_MMP ] = &DD27[DIR_MMP *size_Mat];
-//             D27.f[DIR_PMP ] = &DD27[DIR_PMP *size_Mat];
-//             D27.f[DIR_MPP ] = &DD27[DIR_MPP *size_Mat];
-//             D27.f[DIR_PPM ] = &DD27[DIR_PPM *size_Mat];
-//             D27.f[DIR_MMM ] = &DD27[DIR_MMM *size_Mat];
-//             D27.f[DIR_PMM ] = &DD27[DIR_PMM *size_Mat];
-//             D27.f[DIR_MPM ] = &DD27[DIR_MPM *size_Mat];
+//             D27.f[DIR_P00] = &DD27[DIR_P00 * size_Mat];
+//             D27.f[DIR_M00] = &DD27[DIR_M00 * size_Mat];
+//             D27.f[DIR_0P0] = &DD27[DIR_0P0 * size_Mat];
+//             D27.f[DIR_0M0] = &DD27[DIR_0M0 * size_Mat];
+//             D27.f[DIR_00P] = &DD27[DIR_00P * size_Mat];
+//             D27.f[DIR_00M] = &DD27[DIR_00M * size_Mat];
+//             D27.f[DIR_PP0] = &DD27[DIR_PP0 * size_Mat];
+//             D27.f[DIR_MM0] = &DD27[DIR_MM0 * size_Mat];
+//             D27.f[DIR_PM0] = &DD27[DIR_PM0 * size_Mat];
+//             D27.f[DIR_MP0] = &DD27[DIR_MP0 * size_Mat];
+//             D27.f[DIR_P0P] = &DD27[DIR_P0P * size_Mat];
+//             D27.f[DIR_M0M] = &DD27[DIR_M0M * size_Mat];
+//             D27.f[DIR_P0M] = &DD27[DIR_P0M * size_Mat];
+//             D27.f[DIR_M0P] = &DD27[DIR_M0P * size_Mat];
+//             D27.f[DIR_0PP] = &DD27[DIR_0PP * size_Mat];
+//             D27.f[DIR_0MM] = &DD27[DIR_0MM * size_Mat];
+//             D27.f[DIR_0PM] = &DD27[DIR_0PM * size_Mat];
+//             D27.f[DIR_0MP] = &DD27[DIR_0MP * size_Mat];
+//             D27.f[DIR_000] = &DD27[DIR_000 * size_Mat];
+//             D27.f[DIR_PPP] = &DD27[DIR_PPP * size_Mat];
+//             D27.f[DIR_MMP] = &DD27[DIR_MMP * size_Mat];
+//             D27.f[DIR_PMP] = &DD27[DIR_PMP * size_Mat];
+//             D27.f[DIR_MPP] = &DD27[DIR_MPP * size_Mat];
+//             D27.f[DIR_PPM] = &DD27[DIR_PPM * size_Mat];
+//             D27.f[DIR_MMM] = &DD27[DIR_MMM * size_Mat];
+//             D27.f[DIR_PMM] = &DD27[DIR_PMM * size_Mat];
+//             D27.f[DIR_MPM] = &DD27[DIR_MPM * size_Mat];
 //          }
 //          else
 //          {
-//             D27.f[DIR_M00   ] = &DD27[DIR_P00   *size_Mat];
-//             D27.f[DIR_P00   ] = &DD27[DIR_M00   *size_Mat];
-//             D27.f[DIR_0M0   ] = &DD27[DIR_0P0   *size_Mat];
-//             D27.f[DIR_0P0   ] = &DD27[DIR_0M0   *size_Mat];
-//             D27.f[DIR_00M   ] = &DD27[DIR_00P   *size_Mat];
-//             D27.f[DIR_00P   ] = &DD27[DIR_00M   *size_Mat];
-//             D27.f[DIR_MM0  ] = &DD27[DIR_PP0  *size_Mat];
-//             D27.f[DIR_PP0  ] = &DD27[DIR_MM0  *size_Mat];
-//             D27.f[DIR_MP0  ] = &DD27[DIR_PM0  *size_Mat];
-//             D27.f[DIR_PM0  ] = &DD27[DIR_MP0  *size_Mat];
-//             D27.f[DIR_M0M  ] = &DD27[DIR_P0P  *size_Mat];
-//             D27.f[DIR_P0P  ] = &DD27[DIR_M0M  *size_Mat];
-//             D27.f[DIR_M0P  ] = &DD27[DIR_P0M  *size_Mat];
-//             D27.f[DIR_P0M  ] = &DD27[DIR_M0P  *size_Mat];
-//             D27.f[DIR_0MM  ] = &DD27[DIR_0PP  *size_Mat];
-//             D27.f[DIR_0PP  ] = &DD27[DIR_0MM  *size_Mat];
-//             D27.f[DIR_0MP  ] = &DD27[DIR_0PM  *size_Mat];
-//             D27.f[DIR_0PM  ] = &DD27[DIR_0MP  *size_Mat];
-//             D27.f[DIR_000] = &DD27[DIR_000*size_Mat];
-//             D27.f[DIR_MMM ] = &DD27[DIR_PPP *size_Mat];
-//             D27.f[DIR_PPM ] = &DD27[DIR_MMP *size_Mat];
-//             D27.f[DIR_MPM ] = &DD27[DIR_PMP *size_Mat];
-//             D27.f[DIR_PMM ] = &DD27[DIR_MPP *size_Mat];
-//             D27.f[DIR_MMP ] = &DD27[DIR_PPM *size_Mat];
-//             D27.f[DIR_PPP ] = &DD27[DIR_MMM *size_Mat];
-//             D27.f[DIR_MPP ] = &DD27[DIR_PMM *size_Mat];
-//             D27.f[DIR_PMP ] = &DD27[DIR_MPM *size_Mat];
+//             D27.f[DIR_M00] = &DD27[DIR_P00 * size_Mat];
+//             D27.f[DIR_P00] = &DD27[DIR_M00 * size_Mat];
+//             D27.f[DIR_0M0] = &DD27[DIR_0P0 * size_Mat];
+//             D27.f[DIR_0P0] = &DD27[DIR_0M0 * size_Mat];
+//             D27.f[DIR_00M] = &DD27[DIR_00P * size_Mat];
+//             D27.f[DIR_00P] = &DD27[DIR_00M * size_Mat];
+//             D27.f[DIR_MM0] = &DD27[DIR_PP0 * size_Mat];
+//             D27.f[DIR_PP0] = &DD27[DIR_MM0 * size_Mat];
+//             D27.f[DIR_MP0] = &DD27[DIR_PM0 * size_Mat];
+//             D27.f[DIR_PM0] = &DD27[DIR_MP0 * size_Mat];
+//             D27.f[DIR_M0M] = &DD27[DIR_P0P * size_Mat];
+//             D27.f[DIR_P0P] = &DD27[DIR_M0M * size_Mat];
+//             D27.f[DIR_M0P] = &DD27[DIR_P0M * size_Mat];
+//             D27.f[DIR_P0M] = &DD27[DIR_M0P * size_Mat];
+//             D27.f[DIR_0MM] = &DD27[DIR_0PP * size_Mat];
+//             D27.f[DIR_0PP] = &DD27[DIR_0MM * size_Mat];
+//             D27.f[DIR_0MP] = &DD27[DIR_0PM * size_Mat];
+//             D27.f[DIR_0PM] = &DD27[DIR_0MP * size_Mat];
+//             D27.f[DIR_000] = &DD27[DIR_000 * size_Mat];
+//             D27.f[DIR_MMM] = &DD27[DIR_PPP * size_Mat];
+//             D27.f[DIR_PPM] = &DD27[DIR_MMP * size_Mat];
+//             D27.f[DIR_MPM] = &DD27[DIR_PMP * size_Mat];
+//             D27.f[DIR_PMM] = &DD27[DIR_MPP * size_Mat];
+//             D27.f[DIR_MMP] = &DD27[DIR_PPM * size_Mat];
+//             D27.f[DIR_PPP] = &DD27[DIR_MMM * size_Mat];
+//             D27.f[DIR_MPP] = &DD27[DIR_PMM * size_Mat];
+//             D27.f[DIR_PMP] = &DD27[DIR_MPM * size_Mat];
 //          }
 //          //////////////////////////////////////////////////////////////////////////
 //          real ConcD = Conc[k];
@@ -391,32 +391,32 @@ __global__ void InitAD27(
 //          real cu_sq=c3o2*(vx1*vx1+vx2*vx2+vx3*vx3);
 
 //          (D27.f[DIR_000])[kzero] =   c8o27* ConcD*(c1o1-cu_sq);
-//          (D27.f[DIR_P00   ])[ke   ] =   c2o27* ConcD*(c1o1+c3o1*( vx1        )+c9o2*( vx1        )*( vx1        )-cu_sq);
-//          (D27.f[DIR_M00   ])[kw   ] =   c2o27* ConcD*(c1o1+c3o1*(-vx1        )+c9o2*(-vx1        )*(-vx1        )-cu_sq);
-//          (D27.f[DIR_0P0   ])[kn   ] =   c2o27* ConcD*(c1o1+c3o1*(    vx2     )+c9o2*(     vx2    )*(     vx2    )-cu_sq);
-//          (D27.f[DIR_0M0   ])[ks   ] =   c2o27* ConcD*(c1o1+c3o1*(   -vx2     )+c9o2*(    -vx2    )*(    -vx2    )-cu_sq);
-//          (D27.f[DIR_00P   ])[kt   ] =   c2o27* ConcD*(c1o1+c3o1*(         vx3)+c9o2*(         vx3)*(         vx3)-cu_sq);
-//          (D27.f[DIR_00M   ])[kb   ] =   c2o27* ConcD*(c1o1+c3o1*(        -vx3)+c9o2*(        -vx3)*(        -vx3)-cu_sq);
-//          (D27.f[DIR_PP0  ])[kne  ] =   c1o54* ConcD*(c1o1+c3o1*( vx1+vx2    )+c9o2*( vx1+vx2    )*( vx1+vx2    )-cu_sq);
-//          (D27.f[DIR_MM0  ])[ksw  ] =   c1o54* ConcD*(c1o1+c3o1*(-vx1-vx2    )+c9o2*(-vx1-vx2    )*(-vx1-vx2    )-cu_sq);
-//          (D27.f[DIR_PM0  ])[kse  ] =   c1o54* ConcD*(c1o1+c3o1*( vx1-vx2    )+c9o2*( vx1-vx2    )*( vx1-vx2    )-cu_sq);
-//          (D27.f[DIR_MP0  ])[knw  ] =   c1o54* ConcD*(c1o1+c3o1*(-vx1+vx2    )+c9o2*(-vx1+vx2    )*(-vx1+vx2    )-cu_sq);
-//          (D27.f[DIR_P0P  ])[kte  ] =   c1o54* ConcD*(c1o1+c3o1*( vx1    +vx3)+c9o2*( vx1    +vx3)*( vx1    +vx3)-cu_sq);
-//          (D27.f[DIR_M0M  ])[kbw  ] =   c1o54* ConcD*(c1o1+c3o1*(-vx1    -vx3)+c9o2*(-vx1    -vx3)*(-vx1    -vx3)-cu_sq);
-//          (D27.f[DIR_P0M  ])[kbe  ] =   c1o54* ConcD*(c1o1+c3o1*( vx1    -vx3)+c9o2*( vx1    -vx3)*( vx1    -vx3)-cu_sq);
-//          (D27.f[DIR_M0P  ])[ktw  ] =   c1o54* ConcD*(c1o1+c3o1*(-vx1    +vx3)+c9o2*(-vx1    +vx3)*(-vx1    +vx3)-cu_sq);
-//          (D27.f[DIR_0PP  ])[ktn  ] =   c1o54* ConcD*(c1o1+c3o1*(     vx2+vx3)+c9o2*(     vx2+vx3)*(     vx2+vx3)-cu_sq);
-//          (D27.f[DIR_0MM  ])[kbs  ] =   c1o54* ConcD*(c1o1+c3o1*(    -vx2-vx3)+c9o2*(    -vx2-vx3)*(    -vx2-vx3)-cu_sq);
-//          (D27.f[DIR_0PM  ])[kbn  ] =   c1o54* ConcD*(c1o1+c3o1*(     vx2-vx3)+c9o2*(     vx2-vx3)*(     vx2-vx3)-cu_sq);
-//          (D27.f[DIR_0MP  ])[kts  ] =   c1o54* ConcD*(c1o1+c3o1*(    -vx2+vx3)+c9o2*(    -vx2+vx3)*(    -vx2+vx3)-cu_sq);
-//          (D27.f[DIR_PPP ])[ktne ] =   c1o216*ConcD*(c1o1+c3o1*( vx1+vx2+vx3)+c9o2*( vx1+vx2+vx3)*( vx1+vx2+vx3)-cu_sq);
-//          (D27.f[DIR_MMM ])[kbsw ] =   c1o216*ConcD*(c1o1+c3o1*(-vx1-vx2-vx3)+c9o2*(-vx1-vx2-vx3)*(-vx1-vx2-vx3)-cu_sq);
-//          (D27.f[DIR_PPM ])[kbne ] =   c1o216*ConcD*(c1o1+c3o1*( vx1+vx2-vx3)+c9o2*( vx1+vx2-vx3)*( vx1+vx2-vx3)-cu_sq);
-//          (D27.f[DIR_MMP ])[ktsw ] =   c1o216*ConcD*(c1o1+c3o1*(-vx1-vx2+vx3)+c9o2*(-vx1-vx2+vx3)*(-vx1-vx2+vx3)-cu_sq);
-//          (D27.f[DIR_PMP ])[ktse ] =   c1o216*ConcD*(c1o1+c3o1*( vx1-vx2+vx3)+c9o2*( vx1-vx2+vx3)*( vx1-vx2+vx3)-cu_sq);
-//          (D27.f[DIR_MPM ])[kbnw ] =   c1o216*ConcD*(c1o1+c3o1*(-vx1+vx2-vx3)+c9o2*(-vx1+vx2-vx3)*(-vx1+vx2-vx3)-cu_sq);
-//          (D27.f[DIR_PMM ])[kbse ] =   c1o216*ConcD*(c1o1+c3o1*( vx1-vx2-vx3)+c9o2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cu_sq);
-//          (D27.f[DIR_MPP ])[ktnw ] =   c1o216*ConcD*(c1o1+c3o1*(-vx1+vx2+vx3)+c9o2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cu_sq);
+//          (D27.f[DIR_P00])[ke   ] =   c2o27* ConcD*(c1o1+c3o1*( vx1        )+c9o2*( vx1        )*( vx1        )-cu_sq);
+//          (D27.f[DIR_M00])[kw   ] =   c2o27* ConcD*(c1o1+c3o1*(-vx1        )+c9o2*(-vx1        )*(-vx1        )-cu_sq);
+//          (D27.f[DIR_0P0])[kn   ] =   c2o27* ConcD*(c1o1+c3o1*(    vx2     )+c9o2*(     vx2    )*(     vx2    )-cu_sq);
+//          (D27.f[DIR_0M0])[ks   ] =   c2o27* ConcD*(c1o1+c3o1*(   -vx2     )+c9o2*(    -vx2    )*(    -vx2    )-cu_sq);
+//          (D27.f[DIR_00P])[kt   ] =   c2o27* ConcD*(c1o1+c3o1*(         vx3)+c9o2*(         vx3)*(         vx3)-cu_sq);
+//          (D27.f[DIR_00M])[kb   ] =   c2o27* ConcD*(c1o1+c3o1*(        -vx3)+c9o2*(        -vx3)*(        -vx3)-cu_sq);
+//          (D27.f[DIR_PP0])[kne  ] =   c1o54* ConcD*(c1o1+c3o1*( vx1+vx2    )+c9o2*( vx1+vx2    )*( vx1+vx2    )-cu_sq);
+//          (D27.f[DIR_MM0])[ksw  ] =   c1o54* ConcD*(c1o1+c3o1*(-vx1-vx2    )+c9o2*(-vx1-vx2    )*(-vx1-vx2    )-cu_sq);
+//          (D27.f[DIR_PM0])[kse  ] =   c1o54* ConcD*(c1o1+c3o1*( vx1-vx2    )+c9o2*( vx1-vx2    )*( vx1-vx2    )-cu_sq);
+//          (D27.f[DIR_MP0])[knw  ] =   c1o54* ConcD*(c1o1+c3o1*(-vx1+vx2    )+c9o2*(-vx1+vx2    )*(-vx1+vx2    )-cu_sq);
+//          (D27.f[DIR_P0P])[kte  ] =   c1o54* ConcD*(c1o1+c3o1*( vx1    +vx3)+c9o2*( vx1    +vx3)*( vx1    +vx3)-cu_sq);
+//          (D27.f[DIR_M0M])[kbw  ] =   c1o54* ConcD*(c1o1+c3o1*(-vx1    -vx3)+c9o2*(-vx1    -vx3)*(-vx1    -vx3)-cu_sq);
+//          (D27.f[DIR_P0M])[kbe  ] =   c1o54* ConcD*(c1o1+c3o1*( vx1    -vx3)+c9o2*( vx1    -vx3)*( vx1    -vx3)-cu_sq);
+//          (D27.f[DIR_M0P])[ktw  ] =   c1o54* ConcD*(c1o1+c3o1*(-vx1    +vx3)+c9o2*(-vx1    +vx3)*(-vx1    +vx3)-cu_sq);
+//          (D27.f[DIR_0PP])[ktn  ] =   c1o54* ConcD*(c1o1+c3o1*(     vx2+vx3)+c9o2*(     vx2+vx3)*(     vx2+vx3)-cu_sq);
+//          (D27.f[DIR_0MM])[kbs  ] =   c1o54* ConcD*(c1o1+c3o1*(    -vx2-vx3)+c9o2*(    -vx2-vx3)*(    -vx2-vx3)-cu_sq);
+//          (D27.f[DIR_0PM])[kbn  ] =   c1o54* ConcD*(c1o1+c3o1*(     vx2-vx3)+c9o2*(     vx2-vx3)*(     vx2-vx3)-cu_sq);
+//          (D27.f[DIR_0MP])[kts  ] =   c1o54* ConcD*(c1o1+c3o1*(    -vx2+vx3)+c9o2*(    -vx2+vx3)*(    -vx2+vx3)-cu_sq);
+//          (D27.f[DIR_PPP])[ktne ] =   c1o216*ConcD*(c1o1+c3o1*( vx1+vx2+vx3)+c9o2*( vx1+vx2+vx3)*( vx1+vx2+vx3)-cu_sq);
+//          (D27.f[DIR_MMM])[kbsw ] =   c1o216*ConcD*(c1o1+c3o1*(-vx1-vx2-vx3)+c9o2*(-vx1-vx2-vx3)*(-vx1-vx2-vx3)-cu_sq);
+//          (D27.f[DIR_PPM])[kbne ] =   c1o216*ConcD*(c1o1+c3o1*( vx1+vx2-vx3)+c9o2*( vx1+vx2-vx3)*( vx1+vx2-vx3)-cu_sq);
+//          (D27.f[DIR_MMP])[ktsw ] =   c1o216*ConcD*(c1o1+c3o1*(-vx1-vx2+vx3)+c9o2*(-vx1-vx2+vx3)*(-vx1-vx2+vx3)-cu_sq);
+//          (D27.f[DIR_PMP])[ktse ] =   c1o216*ConcD*(c1o1+c3o1*( vx1-vx2+vx3)+c9o2*( vx1-vx2+vx3)*( vx1-vx2+vx3)-cu_sq);
+//          (D27.f[DIR_MPM])[kbnw ] =   c1o216*ConcD*(c1o1+c3o1*(-vx1+vx2-vx3)+c9o2*(-vx1+vx2-vx3)*(-vx1+vx2-vx3)-cu_sq);
+//          (D27.f[DIR_PMM])[kbse ] =   c1o216*ConcD*(c1o1+c3o1*( vx1-vx2-vx3)+c9o2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cu_sq);
+//          (D27.f[DIR_MPP])[ktnw ] =   c1o216*ConcD*(c1o1+c3o1*(-vx1+vx2+vx3)+c9o2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cu_sq);
 //          ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 //       }
 //    }
@@ -448,7 +448,7 @@ __global__ void InitAD7( unsigned int* neighborX,
                                     real* ux,
                                     real* uy,
                                     real* uz,
-                                    unsigned int size_Mat,
+                                    unsigned long long numberOfLBnodes,
                                     real* DD7,
                                     bool EvenOrOdd)
 {
@@ -463,7 +463,7 @@ __global__ void InitAD7( unsigned int* neighborX,
    const unsigned k = nx*(ny*z + y) + x;
    //////////////////////////////////////////////////////////////////////////
 
-   if(k<size_Mat)
+   if(k<numberOfLBnodes)
    {
       ////////////////////////////////////////////////////////////////////////////////
       unsigned int BC;
@@ -474,23 +474,23 @@ __global__ void InitAD7( unsigned int* neighborX,
          Distributions7 D7;
          if (EvenOrOdd==true)
          {
-            D7.f[0] = &DD7[0*size_Mat];
-            D7.f[1] = &DD7[1*size_Mat];
-            D7.f[2] = &DD7[2*size_Mat];
-            D7.f[3] = &DD7[3*size_Mat];
-            D7.f[4] = &DD7[4*size_Mat];
-            D7.f[5] = &DD7[5*size_Mat];
-            D7.f[6] = &DD7[6*size_Mat];
+            D7.f[0] = &DD7[0*numberOfLBnodes];
+            D7.f[1] = &DD7[1*numberOfLBnodes];
+            D7.f[2] = &DD7[2*numberOfLBnodes];
+            D7.f[3] = &DD7[3*numberOfLBnodes];
+            D7.f[4] = &DD7[4*numberOfLBnodes];
+            D7.f[5] = &DD7[5*numberOfLBnodes];
+            D7.f[6] = &DD7[6*numberOfLBnodes];
          }
          else
          {
-            D7.f[0] = &DD7[0*size_Mat];
-            D7.f[2] = &DD7[1*size_Mat];
-            D7.f[1] = &DD7[2*size_Mat];
-            D7.f[4] = &DD7[3*size_Mat];
-            D7.f[3] = &DD7[4*size_Mat];
-            D7.f[6] = &DD7[5*size_Mat];
-            D7.f[5] = &DD7[6*size_Mat];
+            D7.f[0] = &DD7[0*numberOfLBnodes];
+            D7.f[2] = &DD7[1*numberOfLBnodes];
+            D7.f[1] = &DD7[2*numberOfLBnodes];
+            D7.f[4] = &DD7[3*numberOfLBnodes];
+            D7.f[3] = &DD7[4*numberOfLBnodes];
+            D7.f[6] = &DD7[5*numberOfLBnodes];
+            D7.f[5] = &DD7[6*numberOfLBnodes];
          }
          //////////////////////////////////////////////////////////////////////////
          real ConcD = Conc[k];
diff --git a/src/gpu/VirtualFluids_GPU/GPU/KernelUtilities.h b/src/gpu/VirtualFluids_GPU/GPU/KernelUtilities.h
deleted file mode 100644
index 2f6a11aa17398b65858508c3f94b241c16551b37..0000000000000000000000000000000000000000
--- a/src/gpu/VirtualFluids_GPU/GPU/KernelUtilities.h
+++ /dev/null
@@ -1,177 +0,0 @@
-//=======================================================================================
-// ____          ____    __    ______     __________   __      __       __        __         
-// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |        
-//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |        
-//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |        
-//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____    
-//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|   
-//      \    \  |    |   ________________________________________________________________    
-//       \    \ |    |  |  ______________________________________________________________|   
-//        \    \|    |  |  |         __          __     __     __     ______      _______    
-//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)   
-//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______    
-//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
-//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/   
-//
-//  This file is part of VirtualFluids. VirtualFluids is free software: you can 
-//  redistribute it and/or modify it under the terms of the GNU General Public
-//  License as published by the Free Software Foundation, either version 3 of 
-//  the License, or (at your option) any later version.
-//  
-//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT 
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
-//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
-//  for more details.
-//  
-//  You should have received a copy of the GNU General Public License along
-//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
-//
-//! \file KernelUtilities.h
-//! \ingroup GPU
-//! \author Martin Schoenherr, Anna Wellmann
-//======================================================================================
-#ifndef KERNELUTILS_H
-#define KERNELUTILS_H
-
-#include "LBM/LB.h"
-#include "lbm/constants/D3Q27.h"
-#include "lbm/constants/NumericConstants.h"
-
-using namespace vf::lbm::constant;
-using namespace vf::lbm::dir;
-
-__inline__ __device__ void getPointersToDistributions(Distributions27 &dist, real *distributionArray, const uint numberOfLBnodes, const bool isEvenTimestep)
-{
-    if (isEvenTimestep)
-    {
-        dist.f[DIR_P00   ] = &distributionArray[DIR_P00   *numberOfLBnodes];
-        dist.f[DIR_M00   ] = &distributionArray[DIR_M00   *numberOfLBnodes];
-        dist.f[DIR_0P0   ] = &distributionArray[DIR_0P0   *numberOfLBnodes];
-        dist.f[DIR_0M0   ] = &distributionArray[DIR_0M0   *numberOfLBnodes];
-        dist.f[DIR_00P   ] = &distributionArray[DIR_00P   *numberOfLBnodes];
-        dist.f[DIR_00M   ] = &distributionArray[DIR_00M   *numberOfLBnodes];
-        dist.f[DIR_PP0  ] = &distributionArray[DIR_PP0  *numberOfLBnodes];
-        dist.f[DIR_MM0  ] = &distributionArray[DIR_MM0  *numberOfLBnodes];
-        dist.f[DIR_PM0  ] = &distributionArray[DIR_PM0  *numberOfLBnodes];
-        dist.f[DIR_MP0  ] = &distributionArray[DIR_MP0  *numberOfLBnodes];
-        dist.f[DIR_P0P  ] = &distributionArray[DIR_P0P  *numberOfLBnodes];
-        dist.f[DIR_M0M  ] = &distributionArray[DIR_M0M  *numberOfLBnodes];
-        dist.f[DIR_P0M  ] = &distributionArray[DIR_P0M  *numberOfLBnodes];
-        dist.f[DIR_M0P  ] = &distributionArray[DIR_M0P  *numberOfLBnodes];
-        dist.f[DIR_0PP  ] = &distributionArray[DIR_0PP  *numberOfLBnodes];
-        dist.f[DIR_0MM  ] = &distributionArray[DIR_0MM  *numberOfLBnodes];
-        dist.f[DIR_0PM  ] = &distributionArray[DIR_0PM  *numberOfLBnodes];
-        dist.f[DIR_0MP  ] = &distributionArray[DIR_0MP  *numberOfLBnodes];
-        dist.f[DIR_000] = &distributionArray[DIR_000*numberOfLBnodes];
-        dist.f[DIR_PPP ] = &distributionArray[DIR_PPP *numberOfLBnodes];
-        dist.f[DIR_MMP ] = &distributionArray[DIR_MMP *numberOfLBnodes];
-        dist.f[DIR_PMP ] = &distributionArray[DIR_PMP *numberOfLBnodes];
-        dist.f[DIR_MPP ] = &distributionArray[DIR_MPP *numberOfLBnodes];
-        dist.f[DIR_PPM ] = &distributionArray[DIR_PPM *numberOfLBnodes];
-        dist.f[DIR_MMM ] = &distributionArray[DIR_MMM *numberOfLBnodes];
-        dist.f[DIR_PMM ] = &distributionArray[DIR_PMM *numberOfLBnodes];
-        dist.f[DIR_MPM ] = &distributionArray[DIR_MPM *numberOfLBnodes];
-    }
-    else
-    {
-         dist.f[DIR_M00   ] = &distributionArray[DIR_P00   *numberOfLBnodes];
-         dist.f[DIR_P00   ] = &distributionArray[DIR_M00   *numberOfLBnodes];
-         dist.f[DIR_0M0   ] = &distributionArray[DIR_0P0   *numberOfLBnodes];
-         dist.f[DIR_0P0   ] = &distributionArray[DIR_0M0   *numberOfLBnodes];
-         dist.f[DIR_00M   ] = &distributionArray[DIR_00P   *numberOfLBnodes];
-         dist.f[DIR_00P   ] = &distributionArray[DIR_00M   *numberOfLBnodes];
-         dist.f[DIR_MM0  ] = &distributionArray[DIR_PP0  *numberOfLBnodes];
-         dist.f[DIR_PP0  ] = &distributionArray[DIR_MM0  *numberOfLBnodes];
-         dist.f[DIR_MP0  ] = &distributionArray[DIR_PM0  *numberOfLBnodes];
-         dist.f[DIR_PM0  ] = &distributionArray[DIR_MP0  *numberOfLBnodes];
-         dist.f[DIR_M0M  ] = &distributionArray[DIR_P0P  *numberOfLBnodes];
-         dist.f[DIR_P0P  ] = &distributionArray[DIR_M0M  *numberOfLBnodes];
-         dist.f[DIR_M0P  ] = &distributionArray[DIR_P0M  *numberOfLBnodes];
-         dist.f[DIR_P0M  ] = &distributionArray[DIR_M0P  *numberOfLBnodes];
-         dist.f[DIR_0MM  ] = &distributionArray[DIR_0PP  *numberOfLBnodes];
-         dist.f[DIR_0PP  ] = &distributionArray[DIR_0MM  *numberOfLBnodes];
-         dist.f[DIR_0MP  ] = &distributionArray[DIR_0PM  *numberOfLBnodes];
-         dist.f[DIR_0PM  ] = &distributionArray[DIR_0MP  *numberOfLBnodes];
-         dist.f[DIR_000] = &distributionArray[DIR_000*numberOfLBnodes];
-         dist.f[DIR_PPP ] = &distributionArray[DIR_MMM *numberOfLBnodes];
-         dist.f[DIR_MMP ] = &distributionArray[DIR_PPM *numberOfLBnodes];
-         dist.f[DIR_PMP ] = &distributionArray[DIR_MPM *numberOfLBnodes];
-         dist.f[DIR_MPP ] = &distributionArray[DIR_PMM *numberOfLBnodes];
-         dist.f[DIR_PPM ] = &distributionArray[DIR_MMP *numberOfLBnodes];
-         dist.f[DIR_MMM ] = &distributionArray[DIR_PPP *numberOfLBnodes];
-         dist.f[DIR_PMM ] = &distributionArray[DIR_MPP *numberOfLBnodes];
-         dist.f[DIR_MPM ] = &distributionArray[DIR_PMP *numberOfLBnodes];
-    }
-}
-
-__inline__ __device__ void getPointersToSubgridDistances(SubgridDistances27& subgridD, real* subgridDistances, const unsigned int numberOfSubgridIndices)
-{
-    subgridD.q[DIR_P00   ] = &subgridDistances[DIR_P00    *numberOfSubgridIndices];
-    subgridD.q[DIR_M00   ] = &subgridDistances[DIR_M00    *numberOfSubgridIndices];
-    subgridD.q[DIR_0P0   ] = &subgridDistances[DIR_0P0    *numberOfSubgridIndices];
-    subgridD.q[DIR_0M0   ] = &subgridDistances[DIR_0M0    *numberOfSubgridIndices];
-    subgridD.q[DIR_00P   ] = &subgridDistances[DIR_00P    *numberOfSubgridIndices];
-    subgridD.q[DIR_00M   ] = &subgridDistances[DIR_00M    *numberOfSubgridIndices];
-    subgridD.q[DIR_PP0  ] = &subgridDistances[DIR_PP0   *numberOfSubgridIndices];
-    subgridD.q[DIR_MM0  ] = &subgridDistances[DIR_MM0   *numberOfSubgridIndices];
-    subgridD.q[DIR_PM0  ] = &subgridDistances[DIR_PM0   *numberOfSubgridIndices];
-    subgridD.q[DIR_MP0  ] = &subgridDistances[DIR_MP0   *numberOfSubgridIndices];
-    subgridD.q[DIR_P0P  ] = &subgridDistances[DIR_P0P   *numberOfSubgridIndices];
-    subgridD.q[DIR_M0M  ] = &subgridDistances[DIR_M0M   *numberOfSubgridIndices];
-    subgridD.q[DIR_P0M  ] = &subgridDistances[DIR_P0M   *numberOfSubgridIndices];
-    subgridD.q[DIR_M0P  ] = &subgridDistances[DIR_M0P   *numberOfSubgridIndices];
-    subgridD.q[DIR_0PP  ] = &subgridDistances[DIR_0PP   *numberOfSubgridIndices];
-    subgridD.q[DIR_0MM  ] = &subgridDistances[DIR_0MM   *numberOfSubgridIndices];
-    subgridD.q[DIR_0PM  ] = &subgridDistances[DIR_0PM   *numberOfSubgridIndices];
-    subgridD.q[DIR_0MP  ] = &subgridDistances[DIR_0MP   *numberOfSubgridIndices];
-    subgridD.q[DIR_000] = &subgridDistances[DIR_000 *numberOfSubgridIndices];
-    subgridD.q[DIR_PPP ] = &subgridDistances[DIR_PPP  *numberOfSubgridIndices];
-    subgridD.q[DIR_MMP ] = &subgridDistances[DIR_MMP  *numberOfSubgridIndices];
-    subgridD.q[DIR_PMP ] = &subgridDistances[DIR_PMP  *numberOfSubgridIndices];
-    subgridD.q[DIR_MPP ] = &subgridDistances[DIR_MPP  *numberOfSubgridIndices];
-    subgridD.q[DIR_PPM ] = &subgridDistances[DIR_PPM  *numberOfSubgridIndices];
-    subgridD.q[DIR_MMM ] = &subgridDistances[DIR_MMM  *numberOfSubgridIndices];
-    subgridD.q[DIR_PMM ] = &subgridDistances[DIR_PMM  *numberOfSubgridIndices];
-    subgridD.q[DIR_MPM ] = &subgridDistances[DIR_MPM  *numberOfSubgridIndices];
-}
-
-__inline__ __device__ real getEquilibriumForBC(const real& drho, const real& velocity, const real& cu_sq, const real weight)
-{
-    return weight * (drho + c9o2 * velocity * velocity * (c1o1 + drho) - cu_sq);
-}
-
-__inline__ __device__ real getInterpolatedDistributionForVeloBC(const real& q, const real& f, const real& fInverse, const real& feq, 
-                                                                const real& omega, const real& velocity, const real weight)
-{
-
-    return (c1o1-q) / (c1o1+q) * (f - fInverse + (f + fInverse - c2o1 * feq * omega) / (c1o1 - omega)) * c1o2 
-           + (q * (f + fInverse) - c6o1 * weight * velocity) / (c1o1 + q);
-}
-
-__inline__ __device__ real getBounceBackDistributionForVeloBC(  const real& f, 
-                                                                const real& velocity, const real weight)
-{
-
-    return f - (c6o1 * weight * velocity);
-}
-
-__inline__ __device__ real getInterpolatedDistributionForNoSlipBC(const real& q, const real& f, const real& fInverse, const real& feq, 
-                                                                  const real& omega)
-{
-
-    return (c1o1-q) / (c1o1+q) * (f - fInverse + (f + fInverse - c2o1 * feq * omega) / (c1o1 - omega)) * c1o2 
-           + (q * (f + fInverse)) / (c1o1 + q);
-}
-
-
-__inline__ __device__ real getInterpolatedDistributionForVeloWithPressureBC(const real& q, const real& f, const real& fInverse, const real& feq, 
-                                                                            const real& omega, const real& drho, const real& velocity, const real weight)
-{
-
-    return (c1o1-q) / (c1o1+q) * (f - fInverse + (f + fInverse - c2o1 * feq * omega) / (c1o1 - omega)) * c1o2 
-           + (q * (f + fInverse) - c6o1 * weight * velocity) / (c1o1 + q) - weight * drho;
-}
-
-
-
-#endif
diff --git a/src/gpu/VirtualFluids_GPU/GPU/KineticEnergyAnalyzer.cu b/src/gpu/VirtualFluids_GPU/GPU/KineticEnergyAnalyzer.cu
index 51368bbe09e6fc43a7a1ff6b8b15387417774964..b05cb9201ce30038bd6edf52e2e95a13c6f6d7d4 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/KineticEnergyAnalyzer.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/KineticEnergyAnalyzer.cu
@@ -14,7 +14,7 @@
 
 #include <iomanip>
 
-//#include "Core/Logger/Logger.h"
+#include "cuda/CudaGrid.h"
 
 #include "Parameter/Parameter.h"
 // includes, kernels
@@ -24,7 +24,7 @@
 using namespace vf::lbm::constant;
 using namespace vf::lbm::dir;
 
-__global__                 void kineticEnergyKernel  (real* vx, real* vy, real* vz, real* rho, uint* neighborX, uint* neighborY, uint* neighborZ, uint* neighborWSB, uint* geo, real* kineticEnergy, uint* isFluid, uint size_Mat);
+__global__                 void kineticEnergyKernel  (real* vx, real* vy, real* vz, real* rho, uint* neighborX, uint* neighborY, uint* neighborZ, uint* neighborWSB, uint* geo, real* kineticEnergy, uint* isFluid, unsigned long long numberOfLBnodes);
 
 __host__ __device__ inline void kineticEnergyFunction(real* vx, real* vy, real* vz, real* rho, uint* neighborX, uint* neighborY, uint* neighborZ, uint* neighborWSB, uint* geo, real* kineticEnergy, uint* isFluid, uint index);
 
@@ -35,56 +35,42 @@ bool KineticEnergyAnalyzer::run(uint iter)
     if( iter % this->analyzeIter != 0 ) return false;
 
 	int lev = 0;
-	int size_Mat = this->para->getParD(lev)->numberOfNodes;
-
-    thrust::device_vector<real> kineticEnergy(size_Mat, c0o1);
-    thrust::device_vector<uint> isFluid      (size_Mat, 0);
-
-	unsigned int numberOfThreads = 128;
-    int Grid = (size_Mat / numberOfThreads)+1;
-    int Grid1, Grid2;
-    if (Grid>512)
-    {
-       Grid1 = 512;
-       Grid2 = (Grid/Grid1)+1;
-    } 
-    else
-    {
-       Grid1 = 1;
-       Grid2 = Grid;
-    }
-    dim3 grid(Grid1, Grid2);
-    dim3 threads(numberOfThreads, 1, 1 );
-
-    LBCalcMacCompSP27<<< grid, threads >>> (para->getParD(lev)->velocityX,
-											para->getParD(lev)->velocityY,
-											para->getParD(lev)->velocityZ,
-											para->getParD(lev)->rho,
-											para->getParD(lev)->pressure,
-											para->getParD(lev)->typeOfGridNode,
-											para->getParD(lev)->neighborX,
-											para->getParD(lev)->neighborY,
-											para->getParD(lev)->neighborZ,
-											para->getParD(lev)->numberOfNodes,
-											para->getParD(lev)->distributions.f[0],
-											para->getParD(lev)->isEvenTimestep); 
-    getLastCudaError("LBCalcMacSP27 execution failed"); 
-
-	kineticEnergyKernel <<< grid, threads >>> ( para->getParD(lev)->velocityX, 
-											    para->getParD(lev)->velocityY, 
-												para->getParD(lev)->velocityZ, 
-												para->getParD(lev)->rho, 
-											    para->getParD(lev)->neighborX,
-											    para->getParD(lev)->neighborY,
-											    para->getParD(lev)->neighborZ,
-											    para->getParD(lev)->neighborInverse,
-											    para->getParD(lev)->typeOfGridNode,
-												kineticEnergy.data().get(), 
-                                                isFluid.data().get(),
-												size_Mat);
-	cudaDeviceSynchronize();
-
-	 getLastCudaError("kineticEnergyKernel execution failed");
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(para->getParD(lev)->numberofthreads, para->getParD(lev)->numberOfNodes);
+
+    thrust::device_vector<real> kineticEnergy( this->para->getParD(lev)->numberOfNodes, c0o1);
+    thrust::device_vector<uint> isFluid      ( this->para->getParD(lev)->numberOfNodes, 0);
+
+    LBCalcMacCompSP27<<< grid.grid, grid.threads >>>(
+        para->getParD(lev)->velocityX,
+        para->getParD(lev)->velocityY,
+        para->getParD(lev)->velocityZ,
+        para->getParD(lev)->rho,
+        para->getParD(lev)->pressure,
+        para->getParD(lev)->typeOfGridNode,
+        para->getParD(lev)->neighborX,
+        para->getParD(lev)->neighborY,
+        para->getParD(lev)->neighborZ,
+        para->getParD(lev)->numberOfNodes,
+        para->getParD(lev)->distributions.f[0],
+        para->getParD(lev)->isEvenTimestep); 
+    getLastCudaError("LBCalcMacCompSP27 execution failed"); 
+
+    kineticEnergyKernel<<< grid.grid, grid.threads >>>(
+        para->getParD(lev)->velocityX, 
+        para->getParD(lev)->velocityY, 
+        para->getParD(lev)->velocityZ, 
+        para->getParD(lev)->rho, 
+        para->getParD(lev)->neighborX,
+        para->getParD(lev)->neighborY,
+        para->getParD(lev)->neighborZ,
+        para->getParD(lev)->neighborInverse,
+        para->getParD(lev)->typeOfGridNode,
+        kineticEnergy.data().get(), 
+        isFluid.data().get(),
+        para->getParD(lev)->numberOfNodes);
+    cudaDeviceSynchronize();
+
+    getLastCudaError("kineticEnergyKernel execution failed");
 
 	 real EKin               = thrust::reduce(kineticEnergy.begin(), kineticEnergy.end(), c0o1, thrust::plus<real>());
      uint numberOfFluidNodes = thrust::reduce(isFluid.begin(),       isFluid.end(),       0,    thrust::plus<uint>());
@@ -99,7 +85,7 @@ bool KineticEnergyAnalyzer::run(uint iter)
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-__global__ void kineticEnergyKernel(real* vx, real* vy, real* vz, real* rho, uint* neighborX, uint* neighborY, uint* neighborZ, uint* neighborWSB, uint* geo, real* kineticEnergy, uint* isFluid, uint size_Mat)
+__global__ void kineticEnergyKernel(real* vx, real* vy, real* vz, real* rho, uint* neighborX, uint* neighborY, uint* neighborZ, uint* neighborWSB, uint* geo, real* kineticEnergy, uint* isFluid, unsigned long long numberOfLBnodes)
 {
     //////////////////////////////////////////////////////////////////////////
     const uint x = threadIdx.x;  // Globaler x-Index 
@@ -115,7 +101,7 @@ __global__ void kineticEnergyKernel(real* vx, real* vy, real* vz, real* rho, uin
 
     //if( index % 34 == 0 || index % 34 == 33 ) return;
 
-    if( index >= size_Mat) return;
+    if( index >= (uint)numberOfLBnodes) return;
 
 	unsigned int BC;
 	BC = geo[index];
diff --git a/src/gpu/VirtualFluids_GPU/GPU/LBMKernel.cu b/src/gpu/VirtualFluids_GPU/GPU/LBMKernel.cu
index 63fc5be0ebe5d4a26d4662ee8c0dddbc3098247a..4faea21102b6a68dd9a0aa30e9cecc7eba6051b0 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/LBMKernel.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/LBMKernel.cu
@@ -18,2176 +18,1644 @@
 
 #include "Parameter/Parameter.h"
 //////////////////////////////////////////////////////////////////////////
-void KernelCas27( unsigned int grid_nx,
-                             unsigned int grid_ny,
-                             unsigned int grid_nz,
-                             real s9,
-                             unsigned int* bcMatD,
-                             unsigned int* neighborX,
-                             unsigned int* neighborY,
-                             unsigned int* neighborZ,
-                             real* DD,
-                             int size_Mat,
-                             bool EvenOrOdd)
-{
-   dim3 threads       ( grid_nx, 1, 1 );
-   dim3 grid          ( grid_ny, grid_nz );   // Gitter fuer Kollision und Propagation
-
-      LB_Kernel_Casc27<<< grid, threads >>>( s9,
-                                             bcMatD,
-                                             neighborX,
-                                             neighborY,
-                                             neighborZ,
-                                             DD,
-                                             size_Mat,
-                                             EvenOrOdd);
-     getLastCudaError("LB_Kernel_Casc27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void KernelCasSP27( unsigned int numberOfThreads,
-                               real s9,
-                               unsigned int* bcMatD,
-                               unsigned int* neighborX,
-                               unsigned int* neighborY,
-                               unsigned int* neighborZ,
-                               real* DD,
-                               int size_Mat,
-                               bool EvenOrOdd)
-{
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      LB_Kernel_Casc_SP_27<<< grid, threads >>>(s9,
-                                                bcMatD,
-                                                neighborX,
-                                                neighborY,
-                                                neighborZ,
-                                                DD,
-                                                size_Mat,
-                                                EvenOrOdd);
-      getLastCudaError("LB_Kernel_Casc_SP_27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void KernelCasSPMS27( unsigned int numberOfThreads,
-                                 real s9,
-                                 unsigned int* bcMatD,
-                                 unsigned int* neighborX,
-                                 unsigned int* neighborY,
-                                 unsigned int* neighborZ,
-                                 real* DD,
-                                 int size_Mat,
-                                 bool EvenOrOdd)
-{
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      LB_Kernel_Casc_SP_MS_27<<< grid, threads >>>(s9,
-                                                   bcMatD,
-                                                   neighborX,
-                                                   neighborY,
-                                                   neighborZ,
-                                                   DD,
-                                                   size_Mat,
-                                                   EvenOrOdd);
-      getLastCudaError("LB_Kernel_Casc_SP_MS_27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void KernelCasSPMSOHM27( unsigned int numberOfThreads,
-                                    real s9,
-                                    unsigned int* bcMatD,
-                                    unsigned int* neighborX,
-                                    unsigned int* neighborY,
-                                    unsigned int* neighborZ,
-                                    real* DD,
-                                    int size_Mat,
-                                    bool EvenOrOdd)
-{
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      LB_Kernel_Casc_SP_MS_OHM_27<<< grid, threads >>>(  s9,
-                                                         bcMatD,
-                                                         neighborX,
-                                                         neighborY,
-                                                         neighborZ,
-                                                         DD,
-                                                         size_Mat,
-                                                         EvenOrOdd);
-      getLastCudaError("LB_Kernel_Casc_SP_MS_OHM_27 execution failed");
+void KernelCas27(
+    unsigned int grid_nx,   // used as the block size in x
+    unsigned int grid_ny,   // used as the grid size in x
+    unsigned int grid_nz,   // used as the grid size in y
+    real s9,
+    unsigned int* bcMatD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    real* DD,
+    unsigned long long numberOfLBnodes,
+    bool EvenOrOdd)
+{
+    // Host-side launcher: the grid for collision and propagation is (grid_ny x grid_nz) blocks of grid_nx threads.
+    const dim3 blocksPerGrid(grid_ny, grid_nz);
+    const dim3 threadsPerBlock(grid_nx, 1, 1);
+    LB_Kernel_Casc27<<< blocksPerGrid, threadsPerBlock >>>(
+        s9,
+        bcMatD,
+        neighborX,
+        neighborY,
+        neighborZ,
+        DD,
+        numberOfLBnodes,
+        EvenOrOdd);
+    getLastCudaError("LB_Kernel_Casc27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+void KernelCasSP27(
+    unsigned int numberOfThreads,
+    real s9,
+    unsigned int* bcMatD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    real* DD,
+    unsigned long long numberOfLBnodes,
+    bool EvenOrOdd)
+{
+    // Host-side launcher: build the launch dimensions, start the kernel, then check for errors.
+    vf::cuda::CudaGrid cudaGrid(numberOfThreads, numberOfLBnodes);
+    LB_Kernel_Casc_SP_27<<< cudaGrid.grid, cudaGrid.threads >>>(
+        s9,
+        bcMatD,
+        neighborX,
+        neighborY,
+        neighborZ,
+        DD,
+        numberOfLBnodes,
+        EvenOrOdd);
+    getLastCudaError("LB_Kernel_Casc_SP_27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+void KernelCasSPMS27(
+    unsigned int numberOfThreads,
+    real s9,
+    unsigned int* bcMatD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    real* DD,
+    unsigned long long numberOfLBnodes,
+    bool EvenOrOdd)
+{
+    // Host-side launcher: build the launch dimensions, start the kernel, then check for errors.
+    vf::cuda::CudaGrid cudaGrid(numberOfThreads, numberOfLBnodes);
+    LB_Kernel_Casc_SP_MS_27<<< cudaGrid.grid, cudaGrid.threads >>>(
+        s9,
+        bcMatD,
+        neighborX,
+        neighborY,
+        neighborZ,
+        DD,
+        numberOfLBnodes,
+        EvenOrOdd);
+    getLastCudaError("LB_Kernel_Casc_SP_MS_27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+void KernelCasSPMSOHM27(
+    unsigned int numberOfThreads,
+    real s9,
+    unsigned int* bcMatD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    real* DD,
+    unsigned long long numberOfLBnodes,
+    bool EvenOrOdd)
+{
+    // Host-side launcher: build the launch dimensions, start the kernel, then check for errors.
+    vf::cuda::CudaGrid cudaGrid(numberOfThreads, numberOfLBnodes);
+    LB_Kernel_Casc_SP_MS_OHM_27<<< cudaGrid.grid, cudaGrid.threads >>>(
+        s9,
+        bcMatD,
+        neighborX,
+        neighborY,
+        neighborZ,
+        DD,
+        numberOfLBnodes,
+        EvenOrOdd);
+    getLastCudaError("LB_Kernel_Casc_SP_MS_OHM_27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void KernelKumCompSRTSP27(
-	unsigned int numberOfThreads,
-	real omega,
-	unsigned int* bcMatD,
-	unsigned int* neighborX,
-	unsigned int* neighborY,
-	unsigned int* neighborZ,
-	real* DDStart,
-	int size_Mat,
-	int level,
-	real* forces,
-	bool EvenOrOdd)
-{
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-   LB_Kernel_Kum_New_Comp_SRT_SP_27 <<< grid, threads >>>(
-	   omega,
-	   bcMatD,
-	   neighborX,
-	   neighborY,
-	   neighborZ,
-	   DDStart,
-	   size_Mat,
-	   level,
-	   forces,
-	   EvenOrOdd);
-      getLastCudaError("LB_Kernel_Kum_New_Comp_SRT_SP_27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void KernelKum1hSP27(    unsigned int numberOfThreads,
-									real omega,
-									real deltaPhi,
-									real angularVelocity,
-									unsigned int* bcMatD,
-									unsigned int* neighborX,
-									unsigned int* neighborY,
-									unsigned int* neighborZ,
-									real* coordX,
-									real* coordY,
-									real* coordZ,
-									real* DDStart,
-									int size_Mat,
-									bool EvenOrOdd)
-{
-	int Grid = (size_Mat / numberOfThreads)+1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid/Grid1)+1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1 );
-
-		LB_Kernel_Kum_1h_SP_27<<< grid, threads >>>(omega,
-													deltaPhi,
-													angularVelocity,
-													bcMatD,
-													neighborX,
-													neighborY,
-													neighborZ,
-													coordX,
-													coordY,
-													coordZ,
-													DDStart,
-													size_Mat,
-													EvenOrOdd);
-		getLastCudaError("LB_Kernel_Kum_New_SP_27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void KernelCascadeSP27(  unsigned int numberOfThreads,
-									real s9,
-									unsigned int* bcMatD,
-									unsigned int* neighborX,
-									unsigned int* neighborY,
-									unsigned int* neighborZ,
-									real* DD,
-									int size_Mat,
-									bool EvenOrOdd)
-{
-	int Grid = (size_Mat / numberOfThreads)+1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid/Grid1)+1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1 );
-
-		LB_Kernel_Cascade_SP_27<<< grid, threads >>>(s9,
-													bcMatD,
-													neighborX,
-													neighborY,
-													neighborZ,
-													DD,
-													size_Mat,
-													EvenOrOdd);
-		getLastCudaError("LB_Kernel_Cascade_SP_27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void KernelKumNewSP27(   unsigned int numberOfThreads,
-									real s9,
-									unsigned int* bcMatD,
-									unsigned int* neighborX,
-									unsigned int* neighborY,
-									unsigned int* neighborZ,
-									real* DD,
-									int size_Mat,
-									bool EvenOrOdd)
-{
-	int Grid = (size_Mat / numberOfThreads)+1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid/Grid1)+1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1 );
-
-		LB_Kernel_Kum_New_SP_27<<< grid, threads >>>(s9,
-													bcMatD,
-													neighborX,
-													neighborY,
-													neighborZ,
-													DD,
-													size_Mat,
-													EvenOrOdd);
-		getLastCudaError("LB_Kernel_Kum_New_SP_27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void KernelKumNewCompSP27(unsigned int numberOfThreads,
-									real s9,
-									unsigned int* bcMatD,
-									unsigned int* neighborX,
-									unsigned int* neighborY,
-									unsigned int* neighborZ,
-									real* DD,
-									int size_Mat,
-									int size_Array,
-									int level,
-									real* forces,
-									bool EvenOrOdd)
-{
-	//int Grid = size_Array / numberOfThreads;
-	//dim3 grid(Grid, 1, 1);
-	//dim3 threads(numberOfThreads, 1, 1 );
-
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid > 512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2, 1);
-	dim3 threads(numberOfThreads, 1, 1);
-
-		//LB_Kernel_Kum_New_Comp_SP_27<<< grid, threads >>>(	s9,
-		//													bcMatD,
-		//													neighborX,
-		//													neighborY,
-		//													neighborZ,
-		//													DD,
-		//													size_Mat,
-		//													level,
-		//													forces,
-		//													EvenOrOdd);
-		//getLastCudaError("LB_Kernel_Kum_New_Comp_SP_27 execution failed");
-}
-
-//////////////////////////////////////////////////////////////////////////
-void CumulantOnePreconditionedErrorDiffusionChimCompSP27(unsigned int numberOfThreads,
-																	real s9,
-																	unsigned int* bcMatD,
-																	unsigned int* neighborX,
-																	unsigned int* neighborY,
-																	unsigned int* neighborZ,
-																	real* DD,
-																	int size_Mat,
-																	int size_Array,
-																	int level,
-																	real* forces,
-																	bool EvenOrOdd)
-{
-	//int Grid = size_Array / numberOfThreads;
-	//dim3 grid(Grid, 1, 1);
-	//dim3 threads(numberOfThreads, 1, 1 );
-
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid > 512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2, 1);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	Cumulant_One_preconditioned_errorDiffusion_chim_Comp_SP_27 <<< grid, threads >>>(	s9,
-																						bcMatD,
-																						neighborX,
-																						neighborY,
-																						neighborZ,
-																						DD,
-																						size_Mat,
-																						level,
-																						forces,
-																						EvenOrOdd);
-		getLastCudaError("Cumulant_One_preconditioned_chim_Comp_SP_27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void CumulantOnePreconditionedChimCompSP27(  unsigned int numberOfThreads,
-														real s9,
-														unsigned int* bcMatD,
-														unsigned int* neighborX,
-														unsigned int* neighborY,
-														unsigned int* neighborZ,
-														real* DD,
-														int size_Mat,
-														int size_Array,
-														int level,
-														real* forces,
-														bool EvenOrOdd)
-{
-	//int Grid = size_Array / numberOfThreads;
-	//dim3 grid(Grid, 1, 1);
-	//dim3 threads(numberOfThreads, 1, 1 );
-
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid > 512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2, 1);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	Cumulant_One_preconditioned_chim_Comp_SP_27 <<< grid, threads >>>(	s9,
-																		bcMatD,
-																		neighborX,
-																		neighborY,
-																		neighborZ,
-																		DD,
-																		size_Mat,
-																		level,
-																		forces,
-																		EvenOrOdd);
-		getLastCudaError("Cumulant_One_preconditioned_chim_Comp_SP_27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void CumulantOneChimCompSP27(unsigned int numberOfThreads,
-										real s9,
-										unsigned int* bcMatD,
-										unsigned int* neighborX,
-										unsigned int* neighborY,
-										unsigned int* neighborZ,
-										real* DD,
-										int size_Mat,
-										int size_Array,
-										int level,
-										real* forces,
-										bool EvenOrOdd)
-{
-	//int Grid = size_Array / numberOfThreads;
-	//dim3 grid(Grid, 1, 1);
-	//dim3 threads(numberOfThreads, 1, 1 );
-
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid > 512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2, 1);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	Cumulant_One_chim_Comp_SP_27 <<< grid, threads >>>(	s9,
-														bcMatD,
-														neighborX,
-														neighborY,
-														neighborZ,
-														DD,
-														size_Mat,
-														level,
-														forces,
-														EvenOrOdd);
-		getLastCudaError("Cumulant_One_chim_Comp_SP_27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void KernelKumIsoTestSP27(unsigned int numberOfThreads,
-									 real s9,
-									 unsigned int* bcMatD,
-									 unsigned int* neighborX,
-									 unsigned int* neighborY,
-									 unsigned int* neighborZ,
-									 real* DD,
-									 real* dxxUx,
-									 real* dyyUy,
-									 real* dzzUz,
-									 int size_Mat,
-									 bool EvenOrOdd)
-{
-	int Grid = (size_Mat / numberOfThreads)+1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid/Grid1)+1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1 );
-
-	LB_Kernel_Kum_IsoTest_SP_27<<< grid, threads >>>(s9,
-													bcMatD,
-													neighborX,
-													neighborY,
-													neighborZ,
-													DD,
-													dxxUx,
-													dyyUy,
-													dzzUz,
-													size_Mat,
-													EvenOrOdd);
-	getLastCudaError("LB_Kernel_Kum_IsoTest_SP_27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void KernelKumCompSP27(  unsigned int numberOfThreads,
-									real s9,
-									unsigned int* bcMatD,
-									unsigned int* neighborX,
-									unsigned int* neighborY,
-									unsigned int* neighborZ,
-									real* DD,
-									int size_Mat,
-									bool EvenOrOdd)
-{
-	int Grid = (size_Mat / numberOfThreads)+1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid/Grid1)+1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1 );
-
-		LB_Kernel_Kum_Comp_SP_27<<< grid, threads >>>(s9,
-													bcMatD,
-													neighborX,
-													neighborY,
-													neighborZ,
-													DD,
-													size_Mat,
-													EvenOrOdd);
-		getLastCudaError("LB_Kernel_Kum_Comp_SP_27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void KernelPMCumOneCompSP27(unsigned int numberOfThreads,
-									   real omega,
-									   unsigned int* neighborX,
-									   unsigned int* neighborY,
-									   unsigned int* neighborZ,
-									   real* DD,
-									   int size_Mat,
-									   int level,
-									   real* forces,
-									   real porosity,
-									   real darcy,
-									   real forchheimer,
-									   unsigned int sizeOfPorousMedia,
-									   unsigned int* nodeIdsPorousMedia,
-									   bool EvenOrOdd)
-{
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid > 512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2, 1);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	LB_Kernel_PM_Cum_One_Comp_SP_27 <<< grid, threads >>>(omega,
-														  neighborX,
-														  neighborY,
-														  neighborZ,
-														  DD,
-														  size_Mat,
-														  level,
-														  forces,
-														  porosity,
-														  darcy,
-														  forchheimer,
-														  sizeOfPorousMedia,
-														  nodeIdsPorousMedia,
-														  EvenOrOdd);
-	getLastCudaError("LB_Kernel_PM_Cum_One_Comp_SP_27 execution failed");
+    unsigned int numberOfThreads,
+    real omega,
+    unsigned int* bcMatD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    real* DDStart,
+    unsigned long long numberOfLBnodes,
+    int level,
+    real* forces,
+    bool EvenOrOdd)
+{
+    // Host-side launcher: build the launch dimensions, start the kernel, then check for errors.
+    vf::cuda::CudaGrid cudaGrid(numberOfThreads, numberOfLBnodes);
+    LB_Kernel_Kum_New_Comp_SRT_SP_27 <<< cudaGrid.grid, cudaGrid.threads >>>(
+        omega,
+        bcMatD,
+        neighborX,
+        neighborY,
+        neighborZ,
+        DDStart,
+        numberOfLBnodes,
+        level,
+        forces,
+        EvenOrOdd);
+    getLastCudaError("LB_Kernel_Kum_New_Comp_SRT_SP_27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+void KernelKum1hSP27(
+    unsigned int numberOfThreads,
+    real omega,
+    real deltaPhi,
+    real angularVelocity,
+    unsigned int* bcMatD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    real* coordX,
+    real* coordY,
+    real* coordZ,
+    real* DDStart,
+    unsigned long long numberOfLBnodes,
+    bool EvenOrOdd)
+{
+    // Host-side launcher: build the launch dimensions, start the kernel, then check for errors.
+    vf::cuda::CudaGrid cudaGrid(numberOfThreads, numberOfLBnodes);
+    LB_Kernel_Kum_1h_SP_27<<< cudaGrid.grid, cudaGrid.threads >>>(
+        omega,
+        deltaPhi,
+        angularVelocity,
+        bcMatD,
+        neighborX,
+        neighborY,
+        neighborZ,
+        coordX,
+        coordY,
+        coordZ,
+        DDStart,
+        numberOfLBnodes,
+        EvenOrOdd);
+    getLastCudaError("LB_Kernel_Kum_1h_SP_27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+void KernelCascadeSP27(
+    unsigned int numberOfThreads,
+    real s9,
+    unsigned int* bcMatD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    real* DD,
+    unsigned long long numberOfLBnodes,
+    bool EvenOrOdd)
+{
+    // Host-side launcher: build the launch dimensions, start the kernel, then check for errors.
+    vf::cuda::CudaGrid cudaGrid(numberOfThreads, numberOfLBnodes);
+    LB_Kernel_Cascade_SP_27<<< cudaGrid.grid, cudaGrid.threads >>>(
+        s9,
+        bcMatD,
+        neighborX,
+        neighborY,
+        neighborZ,
+        DD,
+        numberOfLBnodes,
+        EvenOrOdd);
+    getLastCudaError("LB_Kernel_Cascade_SP_27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+void KernelKumNewSP27(
+    unsigned int numberOfThreads,
+    real s9,
+    unsigned int* bcMatD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    real* DD,
+    unsigned long long numberOfLBnodes,
+    bool EvenOrOdd)
+{
+    // Host-side launcher for LB_Kernel_Kum_New_SP_27.
+    // Build the launch dimensions, start the kernel, then check for errors.
+    vf::cuda::CudaGrid cudaGrid(numberOfThreads, numberOfLBnodes);
+    LB_Kernel_Kum_New_SP_27<<< cudaGrid.grid, cudaGrid.threads >>>(
+        s9,
+        bcMatD,
+        neighborX,
+        neighborY,
+        neighborZ,
+        DD,
+        numberOfLBnodes,
+        EvenOrOdd);
+    getLastCudaError("LB_Kernel_Kum_New_SP_27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+void KernelKumNewCompSP27(
+    unsigned int numberOfThreads,
+    real s9,
+    unsigned int* bcMatD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    real* DD,
+    unsigned long long numberOfLBnodes,
+    int size_Array,
+    int level,
+    real* forces,
+    bool EvenOrOdd)
+{
+    // NOTE: the kernel launch below is commented out, so this function only constructs the launch grid.
+    vf::cuda::CudaGrid cudaGrid(numberOfThreads, numberOfLBnodes);
+    //LB_Kernel_Kum_New_Comp_SP_27<<< cudaGrid.grid, cudaGrid.threads >>>(
+    //    s9,
+    //    bcMatD,
+    //    neighborX,
+    //    neighborY,
+    //    neighborZ,
+    //    DD,
+    //    numberOfLBnodes,
+    //    level,
+    //    forces, EvenOrOdd);
+    //getLastCudaError("LB_Kernel_Kum_New_Comp_SP_27 execution failed");
+}
+
+//////////////////////////////////////////////////////////////////////////
+void CumulantOnePreconditionedErrorDiffusionChimCompSP27(
+    unsigned int numberOfThreads,
+    real s9,
+    unsigned int* bcMatD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    real* DD,
+    unsigned long long numberOfLBnodes,
+    int size_Array, // not used by this launcher
+    int level,
+    real* forces,
+    bool EvenOrOdd)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfLBnodes);
+    // Host-side launcher; the error text below must name the kernel that is actually launched here.
+    Cumulant_One_preconditioned_errorDiffusion_chim_Comp_SP_27 <<< grid.grid, grid.threads >>>(
+        s9,
+        bcMatD,
+        neighborX,
+        neighborY,
+        neighborZ,
+        DD,
+        numberOfLBnodes,
+        level,
+        forces,
+        EvenOrOdd);
+    getLastCudaError("Cumulant_One_preconditioned_errorDiffusion_chim_Comp_SP_27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+void CumulantOnePreconditionedChimCompSP27(
+    unsigned int numberOfThreads,
+    real s9,
+    unsigned int* bcMatD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    real* DD,
+    unsigned long long numberOfLBnodes,
+    int size_Array, // not used by this launcher
+    int level,
+    real* forces,
+    bool EvenOrOdd)
+{
+    // Host-side launcher: build the launch dimensions, start the kernel, then check for errors.
+    vf::cuda::CudaGrid cudaGrid(numberOfThreads, numberOfLBnodes);
+    Cumulant_One_preconditioned_chim_Comp_SP_27 <<< cudaGrid.grid, cudaGrid.threads >>>(
+        s9,
+        bcMatD,
+        neighborX,
+        neighborY,
+        neighborZ,
+        DD,
+        numberOfLBnodes,
+        level,
+        forces,
+        EvenOrOdd);
+    getLastCudaError("Cumulant_One_preconditioned_chim_Comp_SP_27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+void CumulantOneChimCompSP27(
+    unsigned int numberOfThreads,
+    real s9,
+    unsigned int* bcMatD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    real* DD,
+    unsigned long long numberOfLBnodes,
+    int size_Array, // not used by this launcher
+    int level,
+    real* forces,
+    bool EvenOrOdd)
+{
+    // Host-side launcher: build the launch dimensions, start the kernel, then check for errors.
+    vf::cuda::CudaGrid cudaGrid(numberOfThreads, numberOfLBnodes);
+    Cumulant_One_chim_Comp_SP_27 <<< cudaGrid.grid, cudaGrid.threads >>>(
+        s9,
+        bcMatD,
+        neighborX,
+        neighborY,
+        neighborZ,
+        DD,
+        numberOfLBnodes,
+        level,
+        forces,
+        EvenOrOdd);
+    getLastCudaError("Cumulant_One_chim_Comp_SP_27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+void KernelKumIsoTestSP27(
+    unsigned int numberOfThreads,
+    real s9,
+    unsigned int* bcMatD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    real* DD,
+    real* dxxUx,
+    real* dyyUy,
+    real* dzzUz,
+    unsigned long long numberOfLBnodes,
+    bool EvenOrOdd)
+{
+    // Host-side launcher: build the launch dimensions, start the kernel, then check for errors.
+    vf::cuda::CudaGrid cudaGrid(numberOfThreads, numberOfLBnodes);
+    LB_Kernel_Kum_IsoTest_SP_27<<< cudaGrid.grid, cudaGrid.threads >>>(
+        s9,
+        bcMatD,
+        neighborX,
+        neighborY,
+        neighborZ,
+        DD,
+        dxxUx,
+        dyyUy,
+        dzzUz,
+        numberOfLBnodes,
+        EvenOrOdd);
+    getLastCudaError("LB_Kernel_Kum_IsoTest_SP_27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+void KernelKumCompSP27(
+    unsigned int numberOfThreads,
+    real s9,
+    unsigned int* bcMatD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    real* DD,
+    unsigned long long numberOfLBnodes,
+    bool EvenOrOdd)
+{
+    // Host-side launcher for LB_Kernel_Kum_Comp_SP_27.
+    // Build the launch dimensions, start the kernel, then check for errors.
+    vf::cuda::CudaGrid cudaGrid(numberOfThreads, numberOfLBnodes);
+    LB_Kernel_Kum_Comp_SP_27<<< cudaGrid.grid, cudaGrid.threads >>>(
+        s9,
+        bcMatD,
+        neighborX,
+        neighborY,
+        neighborZ,
+        DD,
+        numberOfLBnodes,
+        EvenOrOdd);
+    getLastCudaError("LB_Kernel_Kum_Comp_SP_27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+void KernelPMCumOneCompSP27(
+    unsigned int numberOfThreads,
+    real omega,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    real* DD,
+    unsigned long long numberOfLBnodes,
+    int level,
+    real* forces,
+    real porosity,
+    real darcy,
+    real forchheimer,
+    unsigned int sizeOfPorousMedia,
+    unsigned int* nodeIdsPorousMedia,
+    bool EvenOrOdd)
+{
+    // Host-side launcher: build the launch dimensions, start the kernel, then check for errors.
+    vf::cuda::CudaGrid cudaGrid(numberOfThreads, numberOfLBnodes);
+    LB_Kernel_PM_Cum_One_Comp_SP_27 <<< cudaGrid.grid, cudaGrid.threads >>>(
+        omega,
+        neighborX,
+        neighborY,
+        neighborZ,
+        DD,
+        numberOfLBnodes,
+        level,
+        forces,
+        porosity,
+        darcy,
+        forchheimer,
+        sizeOfPorousMedia,
+        nodeIdsPorousMedia,
+        EvenOrOdd);
+    getLastCudaError("LB_Kernel_PM_Cum_One_Comp_SP_27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void KernelWaleBySoniMalavCumAA2016CompSP27(
-	unsigned int numberOfThreads,
-	real s9,
-	unsigned int* bcMatD,
-	unsigned int* neighborX,
-	unsigned int* neighborY,
-	unsigned int* neighborZ,
-	unsigned int* neighborWSB,
-	real* veloX,
-	real* veloY,
-	real* veloZ,
-	real* DD,
-	real* turbulentViscosity,
-	int size_Mat,
-	int size_Array,
-	int level,
-	real* forces,
-	bool EvenOrOdd)
-{
-	//int Grid = size_Array / numberOfThreads;
-	//dim3 grid(Grid, 1, 1);
-	//dim3 threads(numberOfThreads, 1, 1 );
-
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid > 512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2, 1);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	LB_Kernel_WaleBySoniMalav_Cum_AA2016_Comp_SP_27 << < grid, threads >> >(
-		s9,
-		bcMatD,
-		neighborX,
-		neighborY,
-		neighborZ,
-		neighborWSB,
-		veloX,
-		veloY,
-		veloZ,
-		DD,
-		turbulentViscosity,
-		size_Mat,
-		level,
-		forces,
-		EvenOrOdd);
-	getLastCudaError("LB_Kernel_WaleBySoniMalav_Cum_AA2016_Comp_SP_27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void KernelADincomp7(   unsigned int numberOfThreads,
-								   real diffusivity,
-								   unsigned int* bcMatD,
-								   unsigned int* neighborX,
-								   unsigned int* neighborY,
-								   unsigned int* neighborZ,
-								   real* DD,
-								   real* DD7,
-								   int size_Mat,
-								   bool EvenOrOdd)
-{
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      LB_Kernel_AD_Incomp_7<<< grid, threads >>>( diffusivity,
-												  bcMatD,
-												  neighborX,
-												  neighborY,
-												  neighborZ,
-												  DD,
-												  DD7,
-												  size_Mat,
-												  EvenOrOdd);
-      getLastCudaError("LB_Kernel_AD_Incomp_7 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void KernelADincomp27( unsigned int numberOfThreads,
-								  real diffusivity,
-								  unsigned int* bcMatD,
-								  unsigned int* neighborX,
-								  unsigned int* neighborY,
-								  unsigned int* neighborZ,
-								  real* DD,
-								  real* DD27,
-								  int size_Mat,
-								  bool EvenOrOdd)
-{
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      LB_Kernel_AD_Incomp_27<<< grid, threads >>>( diffusivity,
-													bcMatD,
-													neighborX,
-													neighborY,
-													neighborZ,
-													DD,
-													DD27,
-													size_Mat,
-													EvenOrOdd);
-      getLastCudaError("LB_Kernel_AD_Incomp_27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void Init27( int myid,
-                        int numprocs,
-                        real u0,
-                        unsigned int* geoD,
-                        unsigned int* neighborX,
-                        unsigned int* neighborY,
-                        unsigned int* neighborZ,
-                        real* vParab,
-                        unsigned int size_Mat,
-                        unsigned int grid_nx,
-                        unsigned int grid_ny,
-                        unsigned int grid_nz,
-                        real* DD,
-                        int level,
-                        int maxlevel)
-{
-   dim3 threads       ( grid_nx, 1, 1 );
-   dim3 grid          ( grid_ny, grid_nz );   // Gitter fuer Kollision und Propagation
-
-      LBInit27<<< grid, threads >>> (  myid,
-                                       numprocs,
-                                       u0,
-                                       geoD,
-                                       neighborX,
-                                       neighborY,
-                                       neighborZ,
-                                       vParab,
-                                       size_Mat,
-                                       grid_nx,
-                                       grid_ny,
-                                       grid_nz,
-                                       DD,
-                                       level,
-                                       maxlevel);
-      getLastCudaError("LBInit27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void InitNonEqPartSP27( unsigned int numberOfThreads,
-                                   unsigned int* neighborX,
-                                   unsigned int* neighborY,
-                                   unsigned int* neighborZ,
-                                   unsigned int* neighborWSB,
-                                   unsigned int* geoD,
-                                   real* rho,
-                                   real* ux,
-                                   real* uy,
-                                   real* uz,
-                                   unsigned int size_Mat,
-                                   real* DD,
-                                   real omega,
-                                   bool EvenOrOdd)
-{
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      LBInitNonEqPartSP27<<< grid, threads >>>( neighborX,
-                                                neighborY,
-                                                neighborZ,
-                                                neighborWSB,
-                                                geoD,
-                                                rho,
-                                                ux,
-                                                uy,
-                                                uz,
-                                                size_Mat,
-                                                DD,
-                                                omega,
-                                                EvenOrOdd);
-      getLastCudaError("LBInitNonEqPartSP27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void InitThS7(     unsigned int numberOfThreads,
-                              unsigned int* neighborX,
-                              unsigned int* neighborY,
-                              unsigned int* neighborZ,
-                              unsigned int* geoD,
-                              real* Conc,
-                              real* ux,
-                              real* uy,
-                              real* uz,
-                              unsigned int size_Mat,
-                              real* DD7,
-                              bool EvenOrOdd)
-{
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      InitAD7<<< grid, threads >>>( neighborX,
-                                       neighborY,
-                                       neighborZ,
-                                       geoD,
-                                       Conc,
-                                       ux,
-                                       uy,
-                                       uz,
-                                       size_Mat,
-                                       DD7,
-                                       EvenOrOdd);
-      getLastCudaError("InitAD7 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void InitADDev27( unsigned int numberOfThreads,
-                           unsigned int* neighborX,
-                           unsigned int* neighborY,
-                           unsigned int* neighborZ,
-                           unsigned int* geoD,
-                           real* Conc,
-                           real* ux,
-                           real* uy,
-                           real* uz,
-                           unsigned int size_Mat,
-                           real* DD27,
-                           bool EvenOrOdd)
-{
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      InitAD27<<< grid, threads >>>(neighborX,
-                                       neighborY,
-                                       neighborZ,
-                                       geoD,
-                                       Conc,
-                                       ux,
-                                       uy,
-                                       uz,
-                                       size_Mat,
-                                       DD27,
-                                       EvenOrOdd);
-      getLastCudaError("InitAD27 execution failed");
+    unsigned int numberOfThreads,
+    real s9,
+    unsigned int* bcMatD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned int* neighborWSB,
+    real* veloX,
+    real* veloY,
+    real* veloZ,
+    real* DD,
+    real* turbulentViscosity,
+    unsigned long long numberOfLBnodes,
+    int size_Array, // not read in this function: neither the launch setup nor the kernel call uses it
+    int level,
+    real* forces,
+    bool EvenOrOdd)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfLBnodes);
+    // Host-side launcher: run LB_Kernel_WaleBySoniMalav_Cum_AA2016_Comp_SP_27 with the grid/block dimensions held by the CudaGrid built above.
+    LB_Kernel_WaleBySoniMalav_Cum_AA2016_Comp_SP_27 <<< grid.grid, grid.threads >>>(
+        s9,
+        bcMatD,
+        neighborX,
+        neighborY,
+        neighborZ,
+        neighborWSB,
+        veloX,
+        veloY,
+        veloZ,
+        DD,
+        turbulentViscosity,
+        numberOfLBnodes,
+        level,
+        forces,
+        EvenOrOdd);
+    getLastCudaError("LB_Kernel_WaleBySoniMalav_Cum_AA2016_Comp_SP_27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+void KernelADincomp7(
+    unsigned int numberOfThreads,
+    real diffusivity,
+    unsigned int* bcMatD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    real* DD,
+    real* DD7,
+    unsigned long long numberOfLBnodes,
+    bool EvenOrOdd)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfLBnodes);
+    // Host-side launcher: run LB_Kernel_AD_Incomp_7 with the grid/block dimensions held by the CudaGrid built above; all other arguments are forwarded unchanged.
+    LB_Kernel_AD_Incomp_7<<< grid.grid, grid.threads >>>(
+        diffusivity,
+        bcMatD,
+        neighborX,
+        neighborY,
+        neighborZ,
+        DD,
+        DD7,
+        numberOfLBnodes,
+        EvenOrOdd);
+    getLastCudaError("LB_Kernel_AD_Incomp_7 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+void KernelADincomp27(
+    unsigned int numberOfThreads,
+    real diffusivity,
+    unsigned int* bcMatD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    real* DD,
+    real* DD27,
+    unsigned long long numberOfLBnodes,
+    bool EvenOrOdd)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfLBnodes);
+    // Host-side launcher: run LB_Kernel_AD_Incomp_27 with the grid/block dimensions held by the CudaGrid built above; all other arguments are forwarded unchanged.
+    LB_Kernel_AD_Incomp_27<<< grid.grid, grid.threads >>>(
+        diffusivity,
+        bcMatD,
+        neighborX,
+        neighborY,
+        neighborZ,
+        DD,
+        DD27,
+        numberOfLBnodes,
+        EvenOrOdd);
+    getLastCudaError("LB_Kernel_AD_Incomp_27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+void Init27(
+    int myid,
+    int numprocs,
+    real u0,
+    unsigned int* geoD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    real* vParab,
+    unsigned long long numberOfLBnodes,
+    unsigned int grid_nx,
+    unsigned int grid_ny,
+    unsigned int grid_nz,
+    real* DD,
+    int level,
+    int maxlevel)
+{
+    dim3 threads       ( grid_nx, 1, 1 );
+    dim3 grid          ( grid_ny, grid_nz );
+    // Host-side launcher: run LBInit27 with grid_nx threads per block on a grid_ny x grid_nz block grid; numberOfLBnodes is forwarded to the kernel but does not size the launch.
+    LBInit27<<< grid, threads >>> (
+        myid,
+        numprocs,
+        u0,
+        geoD,
+        neighborX,
+        neighborY,
+        neighborZ,
+        vParab,
+        numberOfLBnodes,
+        grid_nx,
+        grid_ny,
+        grid_nz,
+        DD,
+        level,
+        maxlevel);
+    getLastCudaError("LBInit27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+void InitNonEqPartSP27(
+    unsigned int numberOfThreads,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned int* neighborWSB,
+    unsigned int* geoD,
+    real* rho,
+    real* ux,
+    real* uy,
+    real* uz,
+    unsigned long long numberOfLBnodes,
+    real* DD,
+    real omega,
+    bool EvenOrOdd)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfLBnodes);
+    // Host-side launcher: run LBInitNonEqPartSP27 with the grid/block dimensions held by the CudaGrid built above; all other arguments are forwarded unchanged.
+    LBInitNonEqPartSP27<<< grid.grid, grid.threads >>>(
+        neighborX,
+        neighborY,
+        neighborZ,
+        neighborWSB,
+        geoD,
+        rho,
+        ux,
+        uy,
+        uz,
+        numberOfLBnodes,
+        DD,
+        omega,
+        EvenOrOdd);
+    getLastCudaError("LBInitNonEqPartSP27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+void InitThS7(
+    unsigned int numberOfThreads,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned int* geoD,
+    real* Conc,
+    real* ux,
+    real* uy,
+    real* uz,
+    unsigned long long numberOfLBnodes,
+    real* DD7,
+    bool EvenOrOdd)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfLBnodes);
+    // Host-side launcher: run the InitAD7 kernel (note: kernel name differs from this wrapper's) with the grid/block dimensions held by the CudaGrid built above.
+    InitAD7<<< grid.grid, grid.threads >>>(
+        neighborX,
+        neighborY,
+        neighborZ,
+        geoD,
+        Conc,
+        ux,
+        uy,
+        uz,
+        numberOfLBnodes,
+        DD7,
+        EvenOrOdd);
+    getLastCudaError("InitAD7 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+void InitADDev27(
+    unsigned int numberOfThreads,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned int* geoD,
+    real* Conc,
+    real* ux,
+    real* uy,
+    real* uz,
+    unsigned long long numberOfLBnodes,
+    real* DD27,
+    bool EvenOrOdd)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfLBnodes);
+    // Host-side launcher: run the InitAD27 kernel (note: kernel name differs from this wrapper's) with the grid/block dimensions held by the CudaGrid built above.
+    InitAD27<<< grid.grid, grid.threads >>>(
+        neighborX,
+        neighborY,
+        neighborZ,
+        geoD,
+        Conc,
+        ux,
+        uy,
+        uz,
+        numberOfLBnodes,
+        DD27,
+        EvenOrOdd);
+    getLastCudaError("InitAD27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void PostProcessorF3_2018Fehlberg(
-	unsigned int numberOfThreads,
-	real omega,
-	unsigned int* bcMatD,
-	unsigned int* neighborX,
-	unsigned int* neighborY,
-	unsigned int* neighborZ,
-	real* rhoOut,
-	real* vxOut,
-	real* vyOut,
-	real* vzOut,
-	real* DDStart,
-	real* G6,
-	int size_Mat,
-	int level,
-	real* forces,
-	bool EvenOrOdd)
-{
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	  LB_PostProcessor_F3_2018_Fehlberg <<< grid, threads >>> (   omega,
-																  bcMatD,
-																  neighborX,
-																  neighborY,
-																  neighborZ,
-																  rhoOut,
-																  vxOut,
-																  vyOut,
-																  vzOut,
-																  DDStart,
-																  G6,
-																  size_Mat,
-																  level,
-																  forces,
-																  EvenOrOdd);
-      getLastCudaError("LB_PostProcessor_F3_2018_Fehlberg execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void CalcMac27( real* vxD,
-                           real* vyD,
-                           real* vzD,
-                           real* rhoD,
-                           unsigned int* geoD,
-                           unsigned int* neighborX,
-                           unsigned int* neighborY,
-                           unsigned int* neighborZ,
-                           unsigned int size_Mat,
-                           unsigned int grid_nx,
-                           unsigned int grid_ny,
-                           unsigned int grid_nz,
-                           real* DD,
-                           bool isEvenTimestep)
+    unsigned int numberOfThreads,
+    real omega,
+    unsigned int* bcMatD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    real* rhoOut,
+    real* vxOut,
+    real* vyOut,
+    real* vzOut,
+    real* DDStart,
+    real* G6,
+    unsigned long long numberOfLBnodes,
+    int level,
+    real* forces,
+    bool EvenOrOdd)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfLBnodes);
+    // Host-side launcher: run LB_PostProcessor_F3_2018_Fehlberg with the grid/block dimensions held by the CudaGrid built above; all other arguments are forwarded unchanged.
+    LB_PostProcessor_F3_2018_Fehlberg <<< grid.grid, grid.threads >>> (
+        omega,
+        bcMatD,
+        neighborX,
+        neighborY,
+        neighborZ,
+        rhoOut,
+        vxOut,
+        vyOut,
+        vzOut,
+        DDStart,
+        G6,
+        numberOfLBnodes,
+        level,
+        forces,
+        EvenOrOdd);
+    getLastCudaError("LB_PostProcessor_F3_2018_Fehlberg execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+void CalcMac27(
+    real* vxD,
+    real* vyD,
+    real* vzD,
+    real* rhoD,
+    unsigned int* geoD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    unsigned int grid_nx,
+    unsigned int grid_ny,
+    unsigned int grid_nz,
+    real* DD,
+    bool isEvenTimestep)
 {
    dim3 threads       ( grid_nx, 1, 1 );
    dim3 grid          ( grid_ny, grid_nz );
 
-      LBCalcMac27<<< grid, threads >>> (  vxD,
-                                          vyD,
-                                          vzD,
-                                          rhoD,
-                                          geoD,
-                                          neighborX,
-                                          neighborY,
-                                          neighborZ,
-                                          size_Mat,
-                                          DD,
-                                          isEvenTimestep);
-      getLastCudaError("LBCalcMac27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void CalcMacSP27( real* vxD,
-                             real* vyD,
-                             real* vzD,
-                             real* rhoD,
-                             real* pressD,
-                             unsigned int* geoD,
-                             unsigned int* neighborX,
-                             unsigned int* neighborY,
-                             unsigned int* neighborZ,
-                             unsigned int size_Mat,
-                             unsigned int numberOfThreads,
-                             real* DD,
-                             bool isEvenTimestep)
-{
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      LBCalcMacSP27<<< grid, threads >>> (   vxD,
-                                             vyD,
-                                             vzD,
-                                             rhoD,
-                                             pressD,
-                                             geoD,
-                                             neighborX,
-                                             neighborY,
-                                             neighborZ,
-                                             size_Mat,
-                                             DD,
-                                             isEvenTimestep);
-      getLastCudaError("LBCalcMacSP27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void CalcMacCompSP27( real* vxD,
-								 real* vyD,
-								 real* vzD,
-								 real* rhoD,
-								 real* pressD,
-								 unsigned int* geoD,
-								 unsigned int* neighborX,
-								 unsigned int* neighborY,
-								 unsigned int* neighborZ,
-								 unsigned int size_Mat,
-								 unsigned int numberOfThreads,
-								 real* DD,
-								 bool isEvenTimestep)
-{
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      LBCalcMacCompSP27<<< grid, threads >>> (   vxD,
-												 vyD,
-												 vzD,
-												 rhoD,
-												 pressD,
-												 geoD,
-												 neighborX,
-												 neighborY,
-												 neighborZ,
-												 size_Mat,
-												 DD,
-												 isEvenTimestep);
-      getLastCudaError("LBCalcMacSP27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void CalcMacThS7(  real* Conc,
-                              unsigned int* geoD,
-                              unsigned int* neighborX,
-                              unsigned int* neighborY,
-                              unsigned int* neighborZ,
-                              unsigned int size_Mat,
-                              unsigned int numberOfThreads,
-                              real* DD7,
-                              bool isEvenTimestep)
-{
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      CalcConc7<<< grid, threads >>> (Conc,
-                                          geoD,
-                                          neighborX,
-                                          neighborY,
-                                          neighborZ,
-                                          size_Mat,
-                                          DD7,
-                                          isEvenTimestep);
-      getLastCudaError("CalcConc7 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void PlaneConcThS7(real* Conc,
-							  int* kPC,
-							  unsigned int numberOfPointskPC,
-							  unsigned int* geoD,
-							  unsigned int* neighborX,
-							  unsigned int* neighborY,
-							  unsigned int* neighborZ,
-							  unsigned int size_Mat,
-                              unsigned int numberOfThreads,
-							  real* DD7,
-							  bool isEvenTimestep)
-{
-   int Grid = (numberOfPointskPC / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      GetPlaneConc7<<< grid, threads >>> (	Conc,
-												kPC,
-												numberOfPointskPC,
-												geoD,
-												neighborX,
-												neighborY,
-												neighborZ,
-												size_Mat,
-												DD7,
-												isEvenTimestep);
-      getLastCudaError("GetPlaneConc7 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void PlaneConcThS27(real* Conc,
-							   int* kPC,
-							   unsigned int numberOfPointskPC,
-							   unsigned int* geoD,
-							   unsigned int* neighborX,
-							   unsigned int* neighborY,
-							   unsigned int* neighborZ,
-							   unsigned int size_Mat,
-                               unsigned int numberOfThreads,
-							   real* DD27,
-							   bool isEvenTimestep)
-{
-   int Grid = (numberOfPointskPC / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      GetPlaneConc27<<< grid, threads >>> (	Conc,
-												kPC,
-												numberOfPointskPC,
-												geoD,
-												neighborX,
-												neighborY,
-												neighborZ,
-												size_Mat,
-												DD27,
-												isEvenTimestep);
-      getLastCudaError("GetPlaneConc27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void CalcConcentration27( unsigned int numberOfThreads,
-                                     real* Conc,
-                                     unsigned int* geoD,
-                                     unsigned int* neighborX,
-                                     unsigned int* neighborY,
-                                     unsigned int* neighborZ,
-                                     unsigned int size_Mat,
-                                     real* DD27,
-                                     bool isEvenTimestep)
-{
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      CalcConc27<<< grid, threads >>> (  Conc,
-                                             geoD,
-                                             neighborX,
-                                             neighborY,
-                                             neighborZ,
-                                             size_Mat,
-                                             DD27,
-                                             isEvenTimestep);
-      getLastCudaError("CalcConc27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void CalcMedSP27(  real* vxD,
-                              real* vyD,
-                              real* vzD,
-                              real* rhoD,
-                              real* pressD,
-                              unsigned int* geoD,
-                              unsigned int* neighborX,
-                              unsigned int* neighborY,
-                              unsigned int* neighborZ,
-                              unsigned int size_Mat,
-                              unsigned int numberOfThreads,
-                              real* DD,
-                              bool isEvenTimestep)
-{
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      LBCalcMedSP27<<< grid, threads >>> (   vxD,
-                                             vyD,
-                                             vzD,
-                                             rhoD,
-                                             pressD,
-                                             geoD,
-                                             neighborX,
-                                             neighborY,
-                                             neighborZ,
-                                             size_Mat,
-                                             DD,
-                                             isEvenTimestep);
-      getLastCudaError("LBCalcMedSP27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void CalcMedCompSP27(  real* vxD,
-								  real* vyD,
-								  real* vzD,
-								  real* rhoD,
-								  real* pressD,
-								  unsigned int* geoD,
-								  unsigned int* neighborX,
-								  unsigned int* neighborY,
-								  unsigned int* neighborZ,
-								  unsigned int size_Mat,
-								  unsigned int numberOfThreads,
-								  real* DD,
-								  bool isEvenTimestep)
-{
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      LBCalcMedCompSP27<<< grid, threads >>> (   vxD,
-												 vyD,
-												 vzD,
-												 rhoD,
-												 pressD,
-												 geoD,
-												 neighborX,
-												 neighborY,
-												 neighborZ,
-												 size_Mat,
-												 DD,
-												 isEvenTimestep);
-      getLastCudaError("LBCalcMedSP27 execution failed");
+    LBCalcMac27<<< grid, threads >>> ( // launch shape comes from the dim3 pair above (grid_nx threads per block, grid_ny x grid_nz blocks); grid_n* are not passed to the kernel
+        vxD,
+        vyD,
+        vzD,
+        rhoD,
+        geoD,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        DD,
+        isEvenTimestep);
+    getLastCudaError("LBCalcMac27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+void CalcMacSP27(
+    real* vxD,
+    real* vyD,
+    real* vzD,
+    real* rhoD,
+    real* pressD,
+    unsigned int* geoD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    unsigned int numberOfThreads,
+    real* DD,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfLBnodes);
+    // Host-side launcher: run LBCalcMacSP27 with the grid/block dimensions held by the CudaGrid built above; numberOfThreads is used only for that setup.
+    LBCalcMacSP27<<< grid.grid, grid.threads >>> (
+        vxD,
+        vyD,
+        vzD,
+        rhoD,
+        pressD,
+        geoD,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        DD,
+        isEvenTimestep);
+    getLastCudaError("LBCalcMacSP27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+void CalcMacCompSP27(
+    real* vxD,
+    real* vyD,
+    real* vzD,
+    real* rhoD,
+    real* pressD,
+    unsigned int* geoD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    unsigned int numberOfThreads,
+    real* DD,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfLBnodes);
+    // Host-side launcher: run LBCalcMacCompSP27 with the grid/block dimensions held by the CudaGrid built above; numberOfThreads is used only for that setup.
+    LBCalcMacCompSP27<<< grid.grid, grid.threads >>> (
+        vxD,
+        vyD,
+        vzD,
+        rhoD,
+        pressD,
+        geoD,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        DD,
+        isEvenTimestep);
+    getLastCudaError("LBCalcMacCompSP27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+void CalcMacThS7(
+    real* Conc,
+    unsigned int* geoD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    unsigned int numberOfThreads,
+    real* DD7,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfLBnodes);
+    // Host-side launcher: run the CalcConc7 kernel (note: kernel name differs from this wrapper's) with the grid/block dimensions held by the CudaGrid built above.
+    CalcConc7<<< grid.grid, grid.threads >>> (
+        Conc,
+        geoD,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        DD7,
+        isEvenTimestep);
+    getLastCudaError("CalcConc7 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+void PlaneConcThS7(
+    real* Conc,
+    int* kPC,
+    unsigned int numberOfPointskPC,
+    unsigned int* geoD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    unsigned int numberOfThreads,
+    real* DD7,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfPointskPC);
+    // Host-side launcher for GetPlaneConc7: the launch is sized from numberOfPointskPC (not numberOfLBnodes); numberOfLBnodes is only forwarded to the kernel.
+    GetPlaneConc7<<< grid.grid, grid.threads >>> (
+        Conc,
+        kPC,
+        numberOfPointskPC,
+        geoD,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        DD7,
+        isEvenTimestep);
+    getLastCudaError("GetPlaneConc7 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+void PlaneConcThS27( // Wrapper that launches the GetPlaneConc27 kernel; every argument except numberOfThreads is forwarded to it.
+    real* Conc,
+    int* kPC,
+    unsigned int numberOfPointskPC, // forwarded to the kernel and also used to size the launch
+    unsigned int* geoD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes, // forwarded to the kernel only; not used for the launch size
+    unsigned int numberOfThreads, // used only for the launch configuration
+    real* DD27,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfPointskPC); // sized by numberOfPointskPC, not numberOfLBnodes
+
+    GetPlaneConc27<<< grid.grid, grid.threads >>> ( // launch configuration gives no shared-memory size or stream
+        Conc,
+        kPC,
+        numberOfPointskPC,
+        geoD,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        DD27,
+        isEvenTimestep);
+    getLastCudaError("GetPlaneConc27 execution failed"); // helper defined elsewhere; presumably reports a pending CUDA error with this message
+}
+//////////////////////////////////////////////////////////////////////////
+void CalcConcentration27( // Wrapper that launches the CalcConc27 kernel; every argument except numberOfThreads is forwarded to it.
+    unsigned int numberOfThreads, // first parameter here, unlike the neighbouring wrappers; used only for the launch configuration
+    real* Conc,
+    unsigned int* geoD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes, // forwarded to the kernel and also used to size the launch
+    real* DD27,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfLBnodes); // launch dimensions; presumably threads per block and element count - confirm in CudaGrid
+
+    CalcConc27<<< grid.grid, grid.threads >>> ( // launch configuration gives no shared-memory size or stream
+        Conc,
+        geoD,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        DD27,
+        isEvenTimestep);
+    getLastCudaError("CalcConc27 execution failed"); // helper defined elsewhere; presumably reports a pending CUDA error with this message
+}
+//////////////////////////////////////////////////////////////////////////
+void CalcMedSP27( // Wrapper that launches the LBCalcMedSP27 kernel; every argument except numberOfThreads is forwarded to it.
+    real* vxD,
+    real* vyD,
+    real* vzD,
+    real* rhoD,
+    real* pressD,
+    unsigned int* geoD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes, // forwarded to the kernel and also used to size the launch
+    unsigned int numberOfThreads, // used only for the launch configuration
+    real* DD,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfLBnodes); // launch dimensions; presumably threads per block and element count - confirm in CudaGrid
+
+    LBCalcMedSP27<<< grid.grid, grid.threads >>> ( // launch configuration gives no shared-memory size or stream
+        vxD,
+        vyD,
+        vzD,
+        rhoD,
+        pressD,
+        geoD,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        DD,
+        isEvenTimestep);
+    getLastCudaError("LBCalcMedSP27 execution failed"); // helper defined elsewhere; presumably reports a pending CUDA error with this message
+}
+//////////////////////////////////////////////////////////////////////////
+void CalcMedCompSP27( // Wrapper that launches the LBCalcMedCompSP27 kernel; every argument except numberOfThreads is forwarded to it.
+    real* vxD,
+    real* vyD,
+    real* vzD,
+    real* rhoD,
+    real* pressD,
+    unsigned int* geoD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes, // forwarded to the kernel and also used to size the launch
+    unsigned int numberOfThreads, // used only for the launch configuration
+    real* DD,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfLBnodes); // launch dimensions; presumably threads per block and element count - confirm in CudaGrid
+
+    LBCalcMedCompSP27<<< grid.grid, grid.threads >>> ( // launch configuration gives no shared-memory size or stream
+        vxD,
+        vyD,
+        vzD,
+        rhoD,
+        pressD,
+        geoD,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        DD,
+        isEvenTimestep);
+    getLastCudaError("LBCalcMedCompSP27 execution failed"); // helper defined elsewhere; presumably reports a pending CUDA error with this message
 }
 //////////////////////////////////////////////////////////////////////////
 void CalcMedCompAD27(
-	real* vxD,
-	real* vyD,
-	real* vzD,
-	real* rhoD,
-	real* pressD,
-	real* concD,
-	unsigned int* geoD,
-	unsigned int* neighborX,
-	unsigned int* neighborY,
-	unsigned int* neighborZ,
-	unsigned int size_Mat,
-	unsigned int numberOfThreads,
-	real* DD,
-	real* DD_AD,
-	bool isEvenTimestep)
-{
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid > 512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	LBCalcMedCompAD27 <<< grid, threads >>> (
-		vxD,
-		vyD,
-		vzD,
-		rhoD,
-		pressD,
-		concD,
-		geoD,
-		neighborX,
-		neighborY,
-		neighborZ,
-		size_Mat,
-		DD,
-		DD_AD,
-		isEvenTimestep);
-	getLastCudaError("LBCalcMedAD27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void CalcMacMedSP27(  real* vxD,
-                                 real* vyD,
-                                 real* vzD,
-                                 real* rhoD,
-                                 real* pressD,
-                                 unsigned int* geoD,
-                                 unsigned int* neighborX,
-                                 unsigned int* neighborY,
-                                 unsigned int* neighborZ,
-                                 unsigned int tdiff,
-                                 unsigned int size_Mat,
-                                 unsigned int numberOfThreads,
-                                 bool isEvenTimestep)
-{
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      LBCalcMacMedSP27<<< grid, threads >>> (   vxD,
-                                                vyD,
-                                                vzD,
-                                                rhoD,
-                                                pressD,
-                                                geoD,
-                                                neighborX,
-                                                neighborY,
-                                                neighborZ,
-                                                tdiff,
-                                                size_Mat,
-                                                isEvenTimestep);
-      getLastCudaError("LBCalcMacMedSP27 execution failed");
+    real* vxD,
+    real* vyD,
+    real* vzD,
+    real* rhoD,
+    real* pressD,
+    real* concD,
+    unsigned int* geoD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes, // forwarded to the kernel and also used to size the launch
+    unsigned int numberOfThreads, // used only for the launch configuration
+    real* DD,
+    real* DD_AD,
+    bool isEvenTimestep)
+{ // Launches the LBCalcMedCompAD27 kernel; every argument except numberOfThreads is forwarded to it.
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfLBnodes); // launch dimensions; presumably threads per block and element count - confirm in CudaGrid
+
+    LBCalcMedCompAD27 <<< grid.grid, grid.threads >>> ( // launch configuration gives no shared-memory size or stream
+        vxD,
+        vyD,
+        vzD,
+        rhoD,
+        pressD,
+        concD,
+        geoD,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        DD,
+        DD_AD,
+        isEvenTimestep);
+    getLastCudaError("LBCalcMedCompAD27 execution failed"); // helper defined elsewhere; presumably reports a pending CUDA error with this message
+}
+//////////////////////////////////////////////////////////////////////////
+void CalcMacMedSP27( // Wrapper that launches the LBCalcMacMedSP27 kernel; every argument except numberOfThreads is forwarded to it.
+    real* vxD,
+    real* vyD,
+    real* vzD,
+    real* rhoD,
+    real* pressD,
+    unsigned int* geoD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned int tdiff, // forwarded to the kernel unchanged; its meaning is defined by the kernel
+    unsigned long long numberOfLBnodes, // forwarded to the kernel and also used to size the launch
+    unsigned int numberOfThreads, // used only for the launch configuration
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfLBnodes); // launch dimensions; presumably threads per block and element count - confirm in CudaGrid
+
+    LBCalcMacMedSP27<<< grid.grid, grid.threads >>> ( // launch configuration gives no shared-memory size or stream
+        vxD,
+        vyD,
+        vzD,
+        rhoD,
+        pressD,
+        geoD,
+        neighborX,
+        neighborY,
+        neighborZ,
+        tdiff,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("LBCalcMacMedSP27 execution failed"); // helper defined elsewhere; presumably reports a pending CUDA error with this message
 }
 //////////////////////////////////////////////////////////////////////////
 void ResetMedianValuesSP27(
-	real* vxD,
-	real* vyD,
-	real* vzD,
-	real* rhoD,
-	real* pressD,
-	unsigned int size_Mat,
-	unsigned int numberOfThreads,
-	bool isEvenTimestep)
-{
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	LBResetMedianValuesSP27 << < grid, threads >> > (
-		vxD,
-		vyD,
-		vzD,
-		rhoD,
-		pressD,
-		size_Mat,
-		isEvenTimestep);
-	getLastCudaError("LBResetMedianValuesSP27 execution failed");
+    real* vxD,
+    real* vyD,
+    real* vzD,
+    real* rhoD,
+    real* pressD,
+    unsigned long long numberOfLBnodes, // forwarded to the kernel and also used to size the launch
+    unsigned int numberOfThreads, // used only for the launch configuration
+    bool isEvenTimestep)
+{ // Launches the LBResetMedianValuesSP27 kernel; every argument except numberOfThreads is forwarded to it.
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfLBnodes); // launch dimensions; presumably threads per block and element count - confirm in CudaGrid
+
+    LBResetMedianValuesSP27 <<< grid.grid, grid.threads >>> ( // launch configuration gives no shared-memory size or stream
+        vxD,
+        vyD,
+        vzD,
+        rhoD,
+        pressD,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("LBResetMedianValuesSP27 execution failed"); // helper defined elsewhere; presumably reports a pending CUDA error with this message
 }
 //////////////////////////////////////////////////////////////////////////
 void ResetMedianValuesAD27(
-	real* vxD,
-	real* vyD,
-	real* vzD,
-	real* rhoD,
-	real* pressD,
-	real* concD,
-	unsigned int size_Mat,
-	unsigned int numberOfThreads,
-	bool isEvenTimestep)
-{
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid > 512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	LBResetMedianValuesAD27 << < grid, threads >> > (
-		vxD,
-		vyD,
-		vzD,
-		rhoD,
-		pressD,
-		concD,
-		size_Mat,
-		isEvenTimestep);
-	getLastCudaError("LBResetMedianValuesAD27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void Calc2ndMomentsIncompSP27(real* kxyFromfcNEQ,
-										 real* kyzFromfcNEQ,
-										 real* kxzFromfcNEQ,
-										 real* kxxMyyFromfcNEQ,
-										 real* kxxMzzFromfcNEQ,
-										 unsigned int* geoD,
-										 unsigned int* neighborX,
-										 unsigned int* neighborY,
-										 unsigned int* neighborZ,
-										 unsigned int size_Mat,
-										 unsigned int numberOfThreads,
-										 real* DD,
-										 bool isEvenTimestep)
-{
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      LBCalc2ndMomentsIncompSP27<<< grid, threads >>> (  kxyFromfcNEQ,
-														 kyzFromfcNEQ,
-														 kxzFromfcNEQ,
-														 kxxMyyFromfcNEQ,
-														 kxxMzzFromfcNEQ,
-														 geoD,
-														 neighborX,
-														 neighborY,
-														 neighborZ,
-														 size_Mat,
-														 DD,
-														 isEvenTimestep);
-      getLastCudaError("LBCalc2ndMomentsIncompSP27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void Calc2ndMomentsCompSP27( real* kxyFromfcNEQ,
-										real* kyzFromfcNEQ,
-										real* kxzFromfcNEQ,
-										real* kxxMyyFromfcNEQ,
-										real* kxxMzzFromfcNEQ,
-										unsigned int* geoD,
-										unsigned int* neighborX,
-										unsigned int* neighborY,
-										unsigned int* neighborZ,
-										unsigned int size_Mat,
-										unsigned int numberOfThreads,
-										real* DD,
-										bool isEvenTimestep)
-{
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      LBCalc2ndMomentsCompSP27<<< grid, threads >>> (kxyFromfcNEQ,
-													 kyzFromfcNEQ,
-													 kxzFromfcNEQ,
-													 kxxMyyFromfcNEQ,
-													 kxxMzzFromfcNEQ,
-													 geoD,
-													 neighborX,
-													 neighborY,
-													 neighborZ,
-													 size_Mat,
-													 DD,
-													 isEvenTimestep);
-      getLastCudaError("LBCalc2ndMomentsCompSP27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void Calc3rdMomentsIncompSP27(real* CUMbbb,
-										 real* CUMabc,
-										 real* CUMbac,
-										 real* CUMbca,
-										 real* CUMcba,
-										 real* CUMacb,
-										 real* CUMcab,
-										 unsigned int* geoD,
-										 unsigned int* neighborX,
-										 unsigned int* neighborY,
-										 unsigned int* neighborZ,
-										 unsigned int size_Mat,
-										 unsigned int numberOfThreads,
-										 real* DD,
-										 bool isEvenTimestep)
-{
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      LBCalc3rdMomentsIncompSP27<<< grid, threads >>> (  CUMbbb,
-														 CUMabc,
-														 CUMbac,
-														 CUMbca,
-														 CUMcba,
-														 CUMacb,
-														 CUMcab,
-														 geoD,
-														 neighborX,
-														 neighborY,
-														 neighborZ,
-														 DD,
-														 size_Mat,
-														 isEvenTimestep);
-      getLastCudaError("LBCalc3rdMomentsIncompSP27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void Calc3rdMomentsCompSP27( real* CUMbbb,
-										real* CUMabc,
-										real* CUMbac,
-										real* CUMbca,
-										real* CUMcba,
-										real* CUMacb,
-										real* CUMcab,
-										unsigned int* geoD,
-										unsigned int* neighborX,
-										unsigned int* neighborY,
-										unsigned int* neighborZ,
-										unsigned int size_Mat,
-										unsigned int numberOfThreads,
-										real* DD,
-										bool isEvenTimestep)
-{
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      LBCalc3rdMomentsCompSP27<<< grid, threads >>> (CUMbbb,
-													 CUMabc,
-													 CUMbac,
-													 CUMbca,
-													 CUMcba,
-													 CUMacb,
-													 CUMcab,
-													 geoD,
-													 neighborX,
-													 neighborY,
-													 neighborZ,
-													 DD,
-													 size_Mat,
-													 isEvenTimestep);
-      getLastCudaError("LBCalc3rdMomentsCompSP27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void CalcHigherMomentsIncompSP27(real* CUMcbb,
-											real* CUMbcb,
-											real* CUMbbc,
-											real* CUMcca,
-											real* CUMcac,
-											real* CUMacc,
-											real* CUMbcc,
-											real* CUMcbc,
-											real* CUMccb,
-											real* CUMccc,
-											unsigned int* geoD,
-											unsigned int* neighborX,
-											unsigned int* neighborY,
-											unsigned int* neighborZ,
-											unsigned int size_Mat,
-											unsigned int numberOfThreads,
-											real* DD,
-											bool isEvenTimestep)
-{
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      LBCalcHigherMomentsIncompSP27<<< grid, threads >>> (CUMcbb,
-														  CUMbcb,
-														  CUMbbc,
-														  CUMcca,
-														  CUMcac,
-														  CUMacc,
-														  CUMbcc,
-														  CUMcbc,
-														  CUMccb,
-														  CUMccc,
-														  geoD,
-														  neighborX,
-														  neighborY,
-														  neighborZ,
-														  DD,
-														  size_Mat,
-														  isEvenTimestep);
-      getLastCudaError("LBCalcHigherMomentsIncompSP27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void CalcHigherMomentsCompSP27(  real* CUMcbb,
-											real* CUMbcb,
-											real* CUMbbc,
-											real* CUMcca,
-											real* CUMcac,
-											real* CUMacc,
-											real* CUMbcc,
-											real* CUMcbc,
-											real* CUMccb,
-											real* CUMccc,
-											unsigned int* geoD,
-											unsigned int* neighborX,
-											unsigned int* neighborY,
-											unsigned int* neighborZ,
-											unsigned int size_Mat,
-											unsigned int numberOfThreads,
-											real* DD,
-											bool isEvenTimestep)
-{
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      LBCalcHigherMomentsCompSP27<<< grid, threads >>> (  CUMcbb,
-														  CUMbcb,
-														  CUMbbc,
-														  CUMcca,
-														  CUMcac,
-														  CUMacc,
-														  CUMbcc,
-														  CUMcbc,
-														  CUMccb,
-														  CUMccc,
-														  geoD,
-														  neighborX,
-														  neighborY,
-														  neighborZ,
-														  DD,
-														  size_Mat,
-														  isEvenTimestep);
-      getLastCudaError("LBCalcHigherMomentsCompSP27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void LBCalcMeasurePoints27(real* vxMP,
-                                      real* vyMP,
-                                      real* vzMP,
-                                      real* rhoMP,
-                                      unsigned int* kMP,
-                                      unsigned int numberOfPointskMP,
-                                      unsigned int MPClockCycle,
-                                      unsigned int t,
-                                      unsigned int* geoD,
-                                      unsigned int* neighborX,
-                                      unsigned int* neighborY,
-                                      unsigned int* neighborZ,
-                                      unsigned int size_Mat,
-                                      real* DD,
-                                      unsigned int numberOfThreads,
-                                      bool isEvenTimestep)
-{
-   int Grid = (numberOfPointskMP / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      LBCalcMeasurePoints<<< grid, threads >>> (vxMP,
-                                                vyMP,
-                                                vzMP,
-                                                rhoMP,
-                                                kMP,
-                                                numberOfPointskMP,
-                                                MPClockCycle,
-                                                t,
-                                                geoD,
-                                                neighborX,
-                                                neighborY,
-                                                neighborZ,
-                                                size_Mat,
-                                                DD,
-                                                isEvenTimestep);
-      getLastCudaError("LBCalcMeasurePoints execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void BcPress27( int nx,
-                           int ny,
-                           int tz,
-                           unsigned int grid_nx,
-                           unsigned int grid_ny,
-                           unsigned int* bcMatD,
-                           unsigned int* neighborX,
-                           unsigned int* neighborY,
-                           unsigned int* neighborZ,
-                           real* DD,
-                           unsigned int size_Mat,
-                           bool isEvenTimestep)
-{
-   dim3 threads       ( grid_nx, 1, 1 );
-   dim3 grid          ( grid_ny, 1 );
-
-      LB_BC_Press_East27<<< grid, threads >>> ( nx,
-                                                ny,
-                                                tz,
-                                                bcMatD,
-                                                neighborX,
-                                                neighborY,
-                                                neighborZ,
-                                                DD,
-                                                size_Mat,
-                                                isEvenTimestep);
-      getLastCudaError("LB_BC_Press_East27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void BcVel27(int nx,
-                        int ny,
-                        int nz,
-                        int itz,
-                        unsigned int grid_nx,
-                        unsigned int grid_ny,
-                        unsigned int* bcMatD,
-                        unsigned int* neighborX,
-                        unsigned int* neighborY,
-                        unsigned int* neighborZ,
-                        real* DD,
-                        unsigned int size_Mat,
-                        bool isEvenTimestep,
-                        real u0x,
-                        real om)
-{
-   dim3 threads       ( grid_nx, 1, 1 );
-   dim3 grid          ( grid_ny, 1 );
-
-      LB_BC_Vel_West_27<<< grid, threads >>> (  nx,
-                                                ny,
-                                                nz,
-                                                itz,
-                                                bcMatD,
-                                                neighborX,
-                                                neighborY,
-                                                neighborZ,
-                                                DD,
-                                                size_Mat,
-                                                isEvenTimestep,
-                                                u0x,
-                                                grid_nx,
-                                                grid_ny,
-                                                om);
-      getLastCudaError("LB_BC_Vel_West_27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void QADPressDev7( unsigned int numberOfThreads,
-                              real* DD,
-                              real* DD7,
-                              real* temp,
-                              real* velo,
-                              real diffusivity,
-                              int* k_Q,
-                              real* QQ,
-                              unsigned int numberOfBCnodes,
-                              real om1,
-                              unsigned int* neighborX,
-                              unsigned int* neighborY,
-                              unsigned int* neighborZ,
-                              unsigned int size_Mat,
-                              bool isEvenTimestep)
-{
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QADPress7<<< gridQ, threads >>>( DD,
-                                       DD7,
-                                       temp,
-                                       velo,
-                                       diffusivity,
-                                       k_Q,
-                                       QQ,
-                                       numberOfBCnodes,
-                                       om1,
-                                       neighborX,
-                                       neighborY,
-                                       neighborZ,
-                                       size_Mat,
-                                       isEvenTimestep);
-      getLastCudaError("QADPress7 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void QADPressDev27(unsigned int numberOfThreads,
-                              real* DD,
-                              real* DD27,
-                              real* temp,
-                              real* velo,
-                              real diffusivity,
-                              int* k_Q,
-                              real* QQ,
-                              unsigned int numberOfBCnodes,
-                              real om1,
-                              unsigned int* neighborX,
-                              unsigned int* neighborY,
-                              unsigned int* neighborZ,
-                              unsigned int size_Mat,
-                              bool isEvenTimestep)
-{
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QADPress27<<< gridQ, threads >>>(   DD,
-                                          DD27,
-                                          temp,
-                                          velo,
-                                          diffusivity,
-                                          k_Q,
-                                          QQ,
-                                          numberOfBCnodes,
-                                          om1,
-                                          neighborX,
-                                          neighborY,
-                                          neighborZ,
-                                          size_Mat,
-                                          isEvenTimestep);
-      getLastCudaError("QADPress27 execution failed");
+    real* vxD,
+    real* vyD,
+    real* vzD,
+    real* rhoD,
+    real* pressD,
+    real* concD,
+    unsigned long long numberOfLBnodes, // forwarded to the kernel and also used to size the launch
+    unsigned int numberOfThreads, // used only for the launch configuration
+    bool isEvenTimestep)
+{ // Launches the LBResetMedianValuesAD27 kernel; every argument except numberOfThreads is forwarded to it.
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfLBnodes); // launch dimensions; presumably threads per block and element count - confirm in CudaGrid
+
+    LBResetMedianValuesAD27 <<< grid.grid, grid.threads >>> ( // launch configuration gives no shared-memory size or stream
+        vxD,
+        vyD,
+        vzD,
+        rhoD,
+        pressD,
+        concD,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("LBResetMedianValuesAD27 execution failed"); // helper defined elsewhere; presumably reports a pending CUDA error with this message
+}
+//////////////////////////////////////////////////////////////////////////
+void Calc2ndMomentsIncompSP27( // Wrapper that launches the LBCalc2ndMomentsIncompSP27 kernel; every argument except numberOfThreads is forwarded to it.
+    real* kxyFromfcNEQ,
+    real* kyzFromfcNEQ,
+    real* kxzFromfcNEQ,
+    real* kxxMyyFromfcNEQ,
+    real* kxxMzzFromfcNEQ,
+    unsigned int* geoD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes, // forwarded to the kernel and also used to size the launch
+    unsigned int numberOfThreads, // used only for the launch configuration
+    real* DD,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfLBnodes); // launch dimensions; presumably threads per block and element count - confirm in CudaGrid
+
+    LBCalc2ndMomentsIncompSP27<<< grid.grid, grid.threads >>> ( // launch configuration gives no shared-memory size or stream
+        kxyFromfcNEQ,
+        kyzFromfcNEQ,
+        kxzFromfcNEQ,
+        kxxMyyFromfcNEQ,
+        kxxMzzFromfcNEQ,
+        geoD,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        DD,
+        isEvenTimestep);
+    getLastCudaError("LBCalc2ndMomentsIncompSP27 execution failed"); // helper defined elsewhere; presumably reports a pending CUDA error with this message
+}
+//////////////////////////////////////////////////////////////////////////
+void Calc2ndMomentsCompSP27( // Wrapper that launches the LBCalc2ndMomentsCompSP27 kernel; every argument except numberOfThreads is forwarded to it.
+    real* kxyFromfcNEQ,
+    real* kyzFromfcNEQ,
+    real* kxzFromfcNEQ,
+    real* kxxMyyFromfcNEQ,
+    real* kxxMzzFromfcNEQ,
+    unsigned int* geoD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes, // forwarded to the kernel and also used to size the launch
+    unsigned int numberOfThreads, // used only for the launch configuration
+    real* DD,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfLBnodes); // launch dimensions; presumably threads per block and element count - confirm in CudaGrid
+
+    LBCalc2ndMomentsCompSP27<<< grid.grid, grid.threads >>> ( // launch configuration gives no shared-memory size or stream
+        kxyFromfcNEQ,
+        kyzFromfcNEQ,
+        kxzFromfcNEQ,
+        kxxMyyFromfcNEQ,
+        kxxMzzFromfcNEQ,
+        geoD,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        DD,
+        isEvenTimestep);
+    getLastCudaError("LBCalc2ndMomentsCompSP27 execution failed"); // helper defined elsewhere; presumably reports a pending CUDA error with this message
+}
+//////////////////////////////////////////////////////////////////////////
+void Calc3rdMomentsIncompSP27( // Wrapper that launches the LBCalc3rdMomentsIncompSP27 kernel; every argument except numberOfThreads is forwarded to it.
+    real* CUMbbb,
+    real* CUMabc,
+    real* CUMbac,
+    real* CUMbca,
+    real* CUMcba,
+    real* CUMacb,
+    real* CUMcab,
+    unsigned int* geoD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes, // forwarded to the kernel and also used to size the launch
+    unsigned int numberOfThreads, // used only for the launch configuration
+    real* DD,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfLBnodes); // launch dimensions; presumably threads per block and element count - confirm in CudaGrid
+
+    LBCalc3rdMomentsIncompSP27<<< grid.grid, grid.threads >>> ( // launch configuration gives no shared-memory size or stream
+        CUMbbb,
+        CUMabc,
+        CUMbac,
+        CUMbca,
+        CUMcba,
+        CUMacb,
+        CUMcab,
+        geoD,
+        neighborX,
+        neighborY,
+        neighborZ,
+        DD, // kernel argument order: DD comes before the node count, unlike this wrapper's parameter order
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("LBCalc3rdMomentsIncompSP27 execution failed"); // helper defined elsewhere; presumably reports a pending CUDA error with this message
+}
+//////////////////////////////////////////////////////////////////////////
+void Calc3rdMomentsCompSP27( // Wrapper that launches the LBCalc3rdMomentsCompSP27 kernel; every argument except numberOfThreads is forwarded to it.
+    real* CUMbbb,
+    real* CUMabc,
+    real* CUMbac,
+    real* CUMbca,
+    real* CUMcba,
+    real* CUMacb,
+    real* CUMcab,
+    unsigned int* geoD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes, // forwarded to the kernel and also used to size the launch
+    unsigned int numberOfThreads, // used only for the launch configuration
+    real* DD,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfLBnodes); // launch dimensions; presumably threads per block and element count - confirm in CudaGrid
+
+    LBCalc3rdMomentsCompSP27<<< grid.grid, grid.threads >>> ( // launch configuration gives no shared-memory size or stream
+        CUMbbb,
+        CUMabc,
+        CUMbac,
+        CUMbca,
+        CUMcba,
+        CUMacb,
+        CUMcab,
+        geoD,
+        neighborX,
+        neighborY,
+        neighborZ,
+        DD, // kernel argument order: DD comes before the node count, unlike this wrapper's parameter order
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("LBCalc3rdMomentsCompSP27 execution failed"); // helper defined elsewhere; presumably reports a pending CUDA error with this message
+}
+//////////////////////////////////////////////////////////////////////////
+void CalcHigherMomentsIncompSP27( // Wrapper that launches the LBCalcHigherMomentsIncompSP27 kernel; every argument except numberOfThreads is forwarded to it.
+    real* CUMcbb,
+    real* CUMbcb,
+    real* CUMbbc,
+    real* CUMcca,
+    real* CUMcac,
+    real* CUMacc,
+    real* CUMbcc,
+    real* CUMcbc,
+    real* CUMccb,
+    real* CUMccc,
+    unsigned int* geoD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes, // forwarded to the kernel and also used to size the launch
+    unsigned int numberOfThreads, // used only for the launch configuration
+    real* DD,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfLBnodes); // launch dimensions; presumably threads per block and element count - confirm in CudaGrid
+
+    LBCalcHigherMomentsIncompSP27<<< grid.grid, grid.threads >>> ( // launch configuration gives no shared-memory size or stream
+        CUMcbb,
+        CUMbcb,
+        CUMbbc,
+        CUMcca,
+        CUMcac,
+        CUMacc,
+        CUMbcc,
+        CUMcbc,
+        CUMccb,
+        CUMccc,
+        geoD,
+        neighborX,
+        neighborY,
+        neighborZ,
+        DD, // kernel argument order: DD comes before the node count, unlike this wrapper's parameter order
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("LBCalcHigherMomentsIncompSP27 execution failed"); // helper defined elsewhere; presumably reports a pending CUDA error with this message
+}
+//////////////////////////////////////////////////////////////////////////
+// Host-side launcher for the LBCalcHigherMomentsCompSP27 kernel.
+//
+// CUMcbb ... CUMccc    ten real* buffers, forwarded to the kernel as-is
+// geoD, neighborX/Y/Z  unsigned int* buffers, forwarded to the kernel as-is
+// numberOfLBnodes      passed to vf::cuda::CudaGrid and to the kernel
+// numberOfThreads      passed to vf::cuda::CudaGrid only (not forwarded)
+// DD                   real* buffer, forwarded to the kernel as-is
+// isEvenTimestep       bool, forwarded to the kernel as-is
+//
+// The kernel takes DD before numberOfLBnodes, so the argument order below
+// differs from this signature.
+void CalcHigherMomentsCompSP27(
+    real* CUMcbb,
+    real* CUMbcb,
+    real* CUMbbc,
+    real* CUMcca,
+    real* CUMcac,
+    real* CUMacc,
+    real* CUMbcc,
+    real* CUMcbc,
+    real* CUMccb,
+    real* CUMccc,
+    unsigned int* geoD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    unsigned int numberOfThreads,
+    real* DD,
+    bool isEvenTimestep)
+{
+    // Launch configuration built from numberOfThreads and numberOfLBnodes.
+    vf::cuda::CudaGrid launchConfig(numberOfThreads, numberOfLBnodes);
+
+    LBCalcHigherMomentsCompSP27<<<launchConfig.grid, launchConfig.threads>>>(
+        CUMcbb, CUMbcb, CUMbbc, CUMcca, CUMcac,
+        CUMacc, CUMbcc, CUMcbc, CUMccb, CUMccc,
+        geoD, neighborX, neighborY, neighborZ,
+        DD, numberOfLBnodes, isEvenTimestep);
+    // Called right after the launch with a message naming the kernel.
+    getLastCudaError("LBCalcHigherMomentsCompSP27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+// Host-side launcher for the LBCalcMeasurePoints kernel.
+//
+// vxMP, vyMP, vzMP, rhoMP  real* buffers, forwarded to the kernel as-is
+// kMP                      unsigned int* buffer, forwarded as-is
+// numberOfPointskMP        passed to vf::cuda::CudaGrid and to the kernel
+// MPClockCycle, t          unsigned int values, forwarded as-is
+// geoD, neighborX/Y/Z, DD, numberOfLBnodes, isEvenTimestep  forwarded as-is
+// numberOfThreads          passed to vf::cuda::CudaGrid only (not forwarded)
+// The launch configuration uses numberOfPointskMP, not numberOfLBnodes.
+void LBCalcMeasurePoints27(
+    real* vxMP,
+    real* vyMP,
+    real* vzMP,
+    real* rhoMP,
+    unsigned int* kMP,
+    unsigned int numberOfPointskMP,
+    unsigned int MPClockCycle,
+    unsigned int t,
+    unsigned int* geoD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    real* DD,
+    unsigned int numberOfThreads,
+    bool isEvenTimestep)
+{
+    // Launch configuration built from numberOfThreads and numberOfPointskMP.
+    vf::cuda::CudaGrid launchConfig(numberOfThreads, numberOfPointskMP);
+
+    LBCalcMeasurePoints<<<launchConfig.grid, launchConfig.threads>>>(
+        vxMP, vyMP, vzMP, rhoMP,
+        kMP, numberOfPointskMP, MPClockCycle, t,
+        geoD, neighborX, neighborY, neighborZ,
+        numberOfLBnodes, DD, isEvenTimestep);
+    // Called right after the launch with a message naming the kernel.
+    getLastCudaError("LBCalcMeasurePoints execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+// Host-side launcher for the LB_BC_Press_East27 kernel.
+// grid_nx becomes the x extent of the thread block and grid_ny the x extent
+// of the grid; neither is forwarded to the kernel. All remaining arguments
+// (nx, ny, tz, bcMatD, neighborX/Y/Z, DD, numberOfLBnodes, isEvenTimestep)
+// are passed to the kernel unchanged and in signature order.
+void BcPress27(
+    int nx,
+    int ny,
+    int tz,
+    unsigned int grid_nx,
+    unsigned int grid_ny,
+    unsigned int* bcMatD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    real* DD,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    // One-dimensional block of grid_nx threads, one-dimensional grid of grid_ny blocks.
+    dim3 blockShape(grid_nx, 1, 1);
+    dim3 gridShape(grid_ny, 1, 1);
+
+    LB_BC_Press_East27<<<gridShape, blockShape>>>(
+        nx, ny, tz,
+        bcMatD, neighborX, neighborY, neighborZ,
+        DD, numberOfLBnodes, isEvenTimestep);
+    // Called right after the launch with a message naming the kernel.
+    getLastCudaError("LB_BC_Press_East27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+// Host-side launcher for the LB_BC_Vel_West_27 kernel.
+//
+// grid_nx   x extent of the thread block; also forwarded to the kernel
+// grid_ny   x extent of the grid; also forwarded to the kernel
+// u0x, om   real values, forwarded to the kernel as-is
+// All other arguments are forwarded to the kernel as-is.
+//
+// The kernel's argument order differs from this signature: u0x, grid_nx,
+// grid_ny and om come last, after isEvenTimestep.
+void BcVel27(
+    int nx,
+    int ny,
+    int nz,
+    int itz,
+    unsigned int grid_nx,
+    unsigned int grid_ny,
+    unsigned int* bcMatD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    real* DD,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep,
+    real u0x,
+    real om)
+{
+    // One-dimensional block of grid_nx threads, one-dimensional grid of grid_ny blocks.
+    dim3 blockShape(grid_nx, 1, 1);
+    dim3 gridShape(grid_ny, 1, 1);
+
+    LB_BC_Vel_West_27<<<gridShape, blockShape>>>(
+        nx, ny, nz, itz,
+        bcMatD, neighborX, neighborY, neighborZ,
+        DD, numberOfLBnodes, isEvenTimestep,
+        u0x, grid_nx, grid_ny, om);
+    // Called right after the launch with a message naming the kernel.
+    getLastCudaError("LB_BC_Vel_West_27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+// Host-side launcher for the QADPress7 kernel.
+//
+// numberOfThreads   passed to vf::cuda::CudaGrid only (not forwarded)
+// numberOfBCnodes   passed to vf::cuda::CudaGrid and to the kernel
+// all others        forwarded to the kernel as-is, in signature order
+//
+// The launch configuration is built from numberOfBCnodes; numberOfLBnodes
+// is only handed through to the kernel.
+void QADPressDev7(
+    unsigned int numberOfThreads,
+    real* DD,
+    real* DD7,
+    real* temp,
+    real* velo,
+    real diffusivity,
+    int* k_Q,
+    real* QQ,
+    unsigned int numberOfBCnodes,
+    real om1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    // Launch configuration built from numberOfThreads and numberOfBCnodes.
+    vf::cuda::CudaGrid launchConfig(numberOfThreads, numberOfBCnodes);
+
+    QADPress7<<<launchConfig.grid, launchConfig.threads>>>(
+        DD, DD7, temp, velo, diffusivity,
+        k_Q, QQ, numberOfBCnodes, om1,
+        neighborX, neighborY, neighborZ,
+        numberOfLBnodes, isEvenTimestep);
+    // Called right after the launch with a message naming the kernel.
+    getLastCudaError("QADPress7 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+// Host-side launcher for the QADPress27 kernel.
+//
+// numberOfThreads   passed to vf::cuda::CudaGrid only (not forwarded)
+// numberOfBCnodes   passed to vf::cuda::CudaGrid and to the kernel
+// all others        forwarded to the kernel as-is, in signature order
+//
+// The launch configuration is built from numberOfBCnodes; numberOfLBnodes
+// is only handed through to the kernel.
+void QADPressDev27(
+    unsigned int numberOfThreads,
+    real* DD,
+    real* DD27,
+    real* temp,
+    real* velo,
+    real diffusivity,
+    int* k_Q,
+    real* QQ,
+    unsigned int numberOfBCnodes,
+    real om1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    // Launch configuration built from numberOfThreads and numberOfBCnodes.
+    vf::cuda::CudaGrid launchConfig(numberOfThreads, numberOfBCnodes);
+
+    QADPress27<<<launchConfig.grid, launchConfig.threads>>>(
+        DD, DD27, temp, velo, diffusivity,
+        k_Q, QQ, numberOfBCnodes, om1,
+        neighborX, neighborY, neighborZ,
+        numberOfLBnodes, isEvenTimestep);
+    // Called right after the launch with a message naming the kernel.
+    getLastCudaError("QADPress27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void QADPressNEQNeighborDev27(
-											unsigned int numberOfThreads,
-											real* DD,
-											real* DD27,
-											int* k_Q,
-											int* k_N,
-											int numberOfBCnodes,
-											unsigned int* neighborX,
-											unsigned int* neighborY,
-											unsigned int* neighborZ,
-											unsigned int size_Mat,
-											bool isEvenTimestep
-										)
-{
-
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-   QADPressNEQNeighbor27<<< gridQ, threads >>>(
-												DD,
-												DD27,
-												k_Q,
-												k_N,
-												numberOfBCnodes,
-												neighborX,
-												neighborY,
-												neighborZ,
-												size_Mat,
-												isEvenTimestep
-											  );
-   getLastCudaError("QADPressNEQNeighbor27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void QADVelDev7(unsigned int numberOfThreads,
-                           real* DD,
-                           real* DD7,
-                           real* temp,
-                           real* velo,
-                           real diffusivity,
-                           int* k_Q,
-                           real* QQ,
-                           unsigned int numberOfBCnodes,
-                           real om1,
-                           unsigned int* neighborX,
-                           unsigned int* neighborY,
-                           unsigned int* neighborZ,
-                           unsigned int size_Mat,
-                           bool isEvenTimestep)
-{
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QADVel7<<< gridQ, threads >>> (  
-                                       DD,
-                                       DD7,
-                                       temp,
-                                       velo,
-                                       diffusivity,
-                                       k_Q,
-                                       QQ,
-                                       numberOfBCnodes,
-                                       om1,
-                                       neighborX,
-                                       neighborY,
-                                       neighborZ,
-                                       size_Mat,
-                                       isEvenTimestep);
-      getLastCudaError("QADVel7 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void QADVelDev27(  unsigned int numberOfThreads,
-                              real* DD,
-                              real* DD27,
-                              real* temp,
-                              real* velo,
-                              real diffusivity,
-                              int* k_Q,
-                              real* QQ,
-                              unsigned int numberOfBCnodes,
-                              real om1,
-                              unsigned int* neighborX,
-                              unsigned int* neighborY,
-                              unsigned int* neighborZ,
-                              unsigned int size_Mat,
-                              bool isEvenTimestep)
-{
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QADVel27<<< gridQ, threads >>> ( DD,
-                                      DD27,
-                                      temp,
-                                      velo,
-                                      diffusivity,
-                                      k_Q,
-                                      QQ,
-                                      numberOfBCnodes,
-                                      om1,
-                                      neighborX,
-                                      neighborY,
-                                      neighborZ,
-                                      size_Mat,
-                                      isEvenTimestep);
-      getLastCudaError("QADVel27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void QADDev7(unsigned int numberOfThreads,
-                        real* DD,
-                        real* DD7,
-                        real* temp,
-                        real diffusivity,
-                        int* k_Q,
-                        real* QQ,
-                        unsigned int numberOfBCnodes,
-                        real om1,
-                        unsigned int* neighborX,
-                        unsigned int* neighborY,
-                        unsigned int* neighborZ,
-                        unsigned int size_Mat,
-                        bool isEvenTimestep)
-{
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QAD7<<< gridQ, threads >>> (     DD,
-                                       DD7,
-                                       temp,
-                                       diffusivity,
-                                       k_Q,
-                                       QQ,
-                                       numberOfBCnodes,
-                                       om1,
-                                       neighborX,
-                                       neighborY,
-                                       neighborZ,
-                                       size_Mat,
-                                       isEvenTimestep);
-      getLastCudaError("QAD7 execution failed");
+    unsigned int numberOfThreads,
+    real* DD,
+    real* DD27,
+    int* k_Q,
+    int* k_N,
+    int numberOfBCnodes,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    // Host-side launcher for the QADPressNEQNeighbor27 kernel.
+    // numberOfThreads and numberOfBCnodes build the vf::cuda::CudaGrid launch
+    // configuration; numberOfThreads is not forwarded to the kernel. Every
+    // other argument is passed through as-is, in signature order.
+    // NOTE(review): numberOfBCnodes is a signed int in this wrapper, unlike
+    // the sibling wrappers - confirm callers never pass a negative count.
+    vf::cuda::CudaGrid launchConfig(numberOfThreads, numberOfBCnodes);
+
+    QADPressNEQNeighbor27<<<launchConfig.grid, launchConfig.threads>>>(
+        DD, DD27, k_Q, k_N, numberOfBCnodes,
+        neighborX, neighborY, neighborZ,
+        numberOfLBnodes, isEvenTimestep);
+    // Called right after the launch with a message naming the kernel.
+    getLastCudaError("QADPressNEQNeighbor27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+// Host-side launcher for the QADVel7 kernel.
+//
+// numberOfThreads   passed to vf::cuda::CudaGrid only (not forwarded)
+// numberOfBCnodes   passed to vf::cuda::CudaGrid and to the kernel
+// all others        forwarded to the kernel as-is, in signature order
+//
+// The launch configuration is built from numberOfBCnodes; numberOfLBnodes
+// is only handed through to the kernel.
+void QADVelDev7(
+    unsigned int numberOfThreads,
+    real* DD,
+    real* DD7,
+    real* temp,
+    real* velo,
+    real diffusivity,
+    int* k_Q,
+    real* QQ,
+    unsigned int numberOfBCnodes,
+    real om1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    // Launch configuration built from numberOfThreads and numberOfBCnodes.
+    vf::cuda::CudaGrid launchConfig(numberOfThreads, numberOfBCnodes);
+
+    QADVel7<<<launchConfig.grid, launchConfig.threads>>>(
+        DD, DD7, temp, velo, diffusivity,
+        k_Q, QQ, numberOfBCnodes, om1,
+        neighborX, neighborY, neighborZ,
+        numberOfLBnodes, isEvenTimestep);
+    // Called right after the launch with a message naming the kernel.
+    getLastCudaError("QADVel7 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+// Host-side launcher for the QADVel27 kernel.
+//
+// numberOfThreads   passed to vf::cuda::CudaGrid only (not forwarded)
+// numberOfBCnodes   passed to vf::cuda::CudaGrid and to the kernel
+// all others        forwarded to the kernel as-is, in signature order
+//
+// The launch configuration is built from numberOfBCnodes; numberOfLBnodes
+// is only handed through to the kernel.
+void QADVelDev27(
+    unsigned int numberOfThreads,
+    real* DD,
+    real* DD27,
+    real* temp,
+    real* velo,
+    real diffusivity,
+    int* k_Q,
+    real* QQ,
+    unsigned int numberOfBCnodes,
+    real om1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    // Launch configuration built from numberOfThreads and numberOfBCnodes.
+    vf::cuda::CudaGrid launchConfig(numberOfThreads, numberOfBCnodes);
+
+    QADVel27<<<launchConfig.grid, launchConfig.threads>>>(
+        DD, DD27, temp, velo, diffusivity,
+        k_Q, QQ, numberOfBCnodes, om1,
+        neighborX, neighborY, neighborZ,
+        numberOfLBnodes, isEvenTimestep);
+    // Called right after the launch with a message naming the kernel.
+    getLastCudaError("QADVel27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+// Host-side launcher for the QAD7 kernel.
+//
+// numberOfThreads   passed to vf::cuda::CudaGrid only (not forwarded)
+// numberOfBCnodes   passed to vf::cuda::CudaGrid and to the kernel
+// all others        forwarded to the kernel as-is, in signature order
+// The launch configuration is built from numberOfBCnodes; numberOfLBnodes
+// is only handed through to the kernel.
+void QADDev7(
+    unsigned int numberOfThreads,
+    real* DD,
+    real* DD7,
+    real* temp,
+    real diffusivity,
+    int* k_Q,
+    real* QQ,
+    unsigned int numberOfBCnodes,
+    real om1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    // Launch configuration built from numberOfThreads and numberOfBCnodes.
+    vf::cuda::CudaGrid launchConfig(numberOfThreads, numberOfBCnodes);
+
+    QAD7<<<launchConfig.grid, launchConfig.threads>>>(
+        DD, DD7, temp, diffusivity,
+        k_Q, QQ, numberOfBCnodes, om1,
+        neighborX, neighborY, neighborZ,
+        numberOfLBnodes, isEvenTimestep);
+    // Called right after the launch with a message naming the kernel.
+    getLastCudaError("QAD7 execution failed");
 }
 
 
@@ -2202,1700 +1670,1430 @@ void FactorizedCentralMomentsAdvectionDiffusionDeviceKernel(
    uint* neighborZ,
    real* distributions,
    real* distributionsAD,
-   int size_Mat,
+   unsigned long long numberOfLBnodes,
    real* forces,
    bool isEvenTimestep)
 {
-   int Grid = (size_Mat / numberOfThreads) + 1;
-   dim3 grid(Grid, 1, 1);
-   dim3 threads(numberOfThreads, 1, 1);
+    int Grid = (numberOfLBnodes / numberOfThreads) + 1;
+    dim3 grid(Grid, 1, 1);
+    dim3 threads(numberOfThreads, 1, 1);
 
-   Factorized_Central_Moments_Advection_Diffusion_Device_Kernel <<< grid, threads >>> (
-      omegaDiffusivity,
-      typeOfGridNode,
-      neighborX,
-      neighborY,
-      neighborZ,
-      distributions,
-      distributionsAD,
-      size_Mat,
-      forces,
-      isEvenTimestep);
-   getLastCudaError("Factorized_Central_Moments_Advection_Diffusion_Device_Kernel execution failed");
+    Factorized_Central_Moments_Advection_Diffusion_Device_Kernel <<< grid, threads >>> (
+        omegaDiffusivity,
+        typeOfGridNode,
+        neighborX,
+        neighborY,
+        neighborZ,
+        distributions,
+        distributionsAD,
+        numberOfLBnodes,
+        forces,
+        isEvenTimestep);
+    getLastCudaError("Factorized_Central_Moments_Advection_Diffusion_Device_Kernel execution failed");
 }
 
 //////////////////////////////////////////////////////////////////////////
 void ADSlipVelDevComp(
-	uint numberOfThreads,
-	real * normalX,
-	real * normalY,
-	real * normalZ,
-	real * distributions,
-	real * distributionsAD,
-	int* QindexArray,
-	real * Qarrays,
-	uint numberOfBCnodes,
-	real omegaDiffusivity,
-	uint * neighborX,
-	uint * neighborY,
-	uint * neighborZ,
-	uint size_Mat,
-	bool isEvenTimestep)
-{
-	int Grid = (numberOfBCnodes / numberOfThreads) + 1;
-	dim3 gridQ(Grid, 1, 1);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	AD_SlipVelDeviceComp << < gridQ, threads >> > (
-		normalX,
-		normalY,
-		normalZ,
-		distributions,
-		distributionsAD,
-		QindexArray,
-		Qarrays,
-		numberOfBCnodes,
-		omegaDiffusivity,
-		neighborX,
-		neighborY,
-		neighborZ,
-		size_Mat,
-		isEvenTimestep);
-	getLastCudaError("AD_SlipVelDeviceComp execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-
-void QADDirichletDev27( unsigned int numberOfThreads,
-								   real* DD,
-								   real* DD27,
-								   real* temp,
-								   real diffusivity,
-								   int* k_Q,
-								   real* QQ,
-								   unsigned int numberOfBCnodes,
-								   real om1,
-								   unsigned int* neighborX,
-								   unsigned int* neighborY,
-								   unsigned int* neighborZ,
-								   unsigned int size_Mat,
-								   bool isEvenTimestep)
-{
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QADDirichlet27<<< gridQ, threads >>> (
-											   DD,
-											   DD27,
-											   temp,
-											   diffusivity,
-											   k_Q,
-											   QQ,
-											   numberOfBCnodes,
-											   om1,
-											   neighborX,
-											   neighborY,
-											   neighborZ,
-											   size_Mat,
-											   isEvenTimestep);
-      getLastCudaError("QADDirichletDev27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void QADBBDev27(unsigned int numberOfThreads,
-                           real* DD,
-                           real* DD27,
-                           real* temp,
-                           real diffusivity,
-                           int* k_Q,
-                           real* QQ,
-                           unsigned int numberOfBCnodes,
-                           real om1,
-                           unsigned int* neighborX,
-                           unsigned int* neighborY,
-                           unsigned int* neighborZ,
-                           unsigned int size_Mat,
-                           bool isEvenTimestep)
-{
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QADBB27<<< gridQ, threads >>> (  DD,
-                                       DD27,
-                                       temp,
-                                       diffusivity,
-                                       k_Q,
-                                       QQ,
-                                       numberOfBCnodes,
-                                       om1,
-                                       neighborX,
-                                       neighborY,
-                                       neighborZ,
-                                       size_Mat,
-                                       isEvenTimestep);
-      getLastCudaError("QADBB27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void QNoSlipADincompDev7(unsigned int numberOfThreads,
-									real* DD,
-									real* DD7,
-									real* temp,
-									real diffusivity,
-									int* k_Q,
-									real* QQ,
-									unsigned int numberOfBCnodes,
-									real om1,
-									unsigned int* neighborX,
-									unsigned int* neighborY,
-									unsigned int* neighborZ,
-									unsigned int size_Mat,
-									bool isEvenTimestep)
-{
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QNoSlipADincomp7<<< gridQ, threads >>> (
-											   DD,
-											   DD7,
-											   temp,
-											   diffusivity,
-											   k_Q,
-											   QQ,
-											   numberOfBCnodes,
-											   om1,
-											   neighborX,
-											   neighborY,
-											   neighborZ,
-											   size_Mat,
-											   isEvenTimestep);
-      getLastCudaError("QNoSlipADincomp7 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void QNoSlipADincompDev27(  unsigned int numberOfThreads,
-									   real* DD,
-									   real* DD27,
-									   real* temp,
-									   real diffusivity,
-									   int* k_Q,
-									   real* QQ,
-									   unsigned int numberOfBCnodes,
-									   real om1,
-									   unsigned int* neighborX,
-									   unsigned int* neighborY,
-									   unsigned int* neighborZ,
-									   unsigned int size_Mat,
-									   bool isEvenTimestep)
-{
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QNoSlipADincomp27<<< gridQ, threads >>> (
-											   DD,
-											   DD27,
-											   temp,
-											   diffusivity,
-											   k_Q,
-											   QQ,
-											   numberOfBCnodes,
-											   om1,
-											   neighborX,
-											   neighborY,
-											   neighborZ,
-											   size_Mat,
-											   isEvenTimestep);
-      getLastCudaError("QNoSlipADincomp27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void QADVeloIncompDev7( unsigned int numberOfThreads,
-								   real* DD,
-								   real* DD7,
-								   real* temp,
-								   real* velo,
-								   real diffusivity,
-								   int* k_Q,
-								   real* QQ,
-								   unsigned int numberOfBCnodes,
-								   real om1,
-								   unsigned int* neighborX,
-								   unsigned int* neighborY,
-								   unsigned int* neighborZ,
-								   unsigned int size_Mat,
-								   bool isEvenTimestep)
-{
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QADVeloIncomp7<<< gridQ, threads >>> ( 
-											   DD,
-											   DD7,
-											   temp,
-											   velo,
-											   diffusivity,
-											   k_Q,
-											   QQ,
-											   numberOfBCnodes,
-											   om1,
-											   neighborX,
-											   neighborY,
-											   neighborZ,
-											   size_Mat,
-											   isEvenTimestep);
-      getLastCudaError("QADVeloIncomp7 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void QADVeloIncompDev27(   unsigned int numberOfThreads,
-									  real* DD,
-									  real* DD27,
-									  real* temp,
-									  real* velo,
-									  real diffusivity,
-									  int* k_Q,
-									  real* QQ,
-									  unsigned int numberOfBCnodes,
-									  real om1,
-									  unsigned int* neighborX,
-									  unsigned int* neighborY,
-									  unsigned int* neighborZ,
-									  unsigned int size_Mat,
-									  bool isEvenTimestep)
-{
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QADVeloIncomp27<<< gridQ, threads >>> (
-											  DD,
-											  DD27,
-											  temp,
-											  velo,
-											  diffusivity,
-											  k_Q,
-											  QQ,
-											  numberOfBCnodes,
-											  om1,
-											  neighborX,
-											  neighborY,
-											  neighborZ,
-											  size_Mat,
-											  isEvenTimestep);
-      getLastCudaError("QADVeloIncomp27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void QADPressIncompDev7( unsigned int numberOfThreads,
-									  real* DD,
-									  real* DD7,
-									  real* temp,
-									  real* velo,
-									  real diffusivity,
-									  int* k_Q,
-									  real* QQ,
-									  unsigned int numberOfBCnodes,
-									  real om1,
-									  unsigned int* neighborX,
-									  unsigned int* neighborY,
-									  unsigned int* neighborZ,
-									  unsigned int size_Mat,
-									  bool isEvenTimestep)
-{
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QADPressIncomp7<<< gridQ, threads >>>(
-											   DD,
-											   DD7,
-											   temp,
-											   velo,
-											   diffusivity,
-											   k_Q,
-											   QQ,
-											   numberOfBCnodes,
-											   om1,
-											   neighborX,
-											   neighborY,
-											   neighborZ,
-											   size_Mat,
-											   isEvenTimestep);
-      getLastCudaError("QADPressIncomp7 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void QADPressIncompDev27(  unsigned int numberOfThreads,
-									  real* DD,
-									  real* DD27,
-									  real* temp,
-									  real* velo,
-									  real diffusivity,
-									  int* k_Q,
-									  real* QQ,
-									  unsigned int numberOfBCnodes,
-									  real om1,
-									  unsigned int* neighborX,
-									  unsigned int* neighborY,
-									  unsigned int* neighborZ,
-									  unsigned int size_Mat,
-									  bool isEvenTimestep)
-{
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QADPressIncomp27<<< gridQ, threads >>>(
-											  DD,
-											  DD27,
-											  temp,
-											  velo,
-											  diffusivity,
-											  k_Q,
-											  QQ,
-											  numberOfBCnodes,
-											  om1,
-											  neighborX,
-											  neighborY,
-											  neighborZ,
-											  size_Mat,
-											  isEvenTimestep);
-      getLastCudaError("QADPressIncomp27 execution failed");
+    uint numberOfThreads,
+    real * normalX,
+    real * normalY,
+    real * normalZ,
+    real * distributions,
+    real * distributionsAD,
+    int* QindexArray,
+    real * Qarrays,
+    uint numberOfBCnodes,
+    real omegaDiffusivity,
+    uint * neighborX,
+    uint * neighborY,
+    uint * neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    // Host-side launcher for the AD_SlipVelDeviceComp kernel.
+    // numberOfThreads and numberOfBCnodes build the vf::cuda::CudaGrid launch
+    // configuration; numberOfThreads is not forwarded to the kernel. Every
+    // other argument is passed through as-is, in signature order.
+    //
+    // NOTE(review): the launch dimensions come from CudaGrid rather than a
+    // hand-built dim3(Grid, 1, 1) - confirm the kernel's node-index
+    // computation is valid for whatever grid shape CudaGrid produces.
+    vf::cuda::CudaGrid launchConfig(numberOfThreads, numberOfBCnodes);
+
+    AD_SlipVelDeviceComp<<<launchConfig.grid, launchConfig.threads>>>(
+        normalX, normalY, normalZ,
+        distributions, distributionsAD,
+        QindexArray, Qarrays, numberOfBCnodes, omegaDiffusivity,
+        neighborX, neighborY, neighborZ,
+        numberOfLBnodes, isEvenTimestep);
+    // Called right after the launch with a message naming the kernel.
+    getLastCudaError("AD_SlipVelDeviceComp execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+
+// Host-side launcher for the QADDirichlet27 kernel.
+//
+// numberOfThreads   passed to vf::cuda::CudaGrid only (not forwarded)
+// numberOfBCnodes   passed to vf::cuda::CudaGrid and to the kernel
+// all others        forwarded to the kernel as-is, in signature order
+// The launch configuration is built from numberOfBCnodes; numberOfLBnodes
+// is only handed through to the kernel.
+void QADDirichletDev27(
+    unsigned int numberOfThreads,
+    real* DD,
+    real* DD27,
+    real* temp,
+    real diffusivity,
+    int* k_Q,
+    real* QQ,
+    unsigned int numberOfBCnodes,
+    real om1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    // Launch configuration built from numberOfThreads and numberOfBCnodes.
+    vf::cuda::CudaGrid launchConfig(numberOfThreads, numberOfBCnodes);
+
+    QADDirichlet27<<<launchConfig.grid, launchConfig.threads>>>(
+        DD, DD27, temp, diffusivity,
+        k_Q, QQ, numberOfBCnodes, om1,
+        neighborX, neighborY, neighborZ,
+        numberOfLBnodes, isEvenTimestep);
+    // The message names the launched kernel (QADDirichlet27), not this wrapper.
+    getLastCudaError("QADDirichlet27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+// Host-side launcher for the QADBB27 kernel.
+//
+// numberOfThreads   passed to vf::cuda::CudaGrid only (not forwarded)
+// numberOfBCnodes   passed to vf::cuda::CudaGrid and to the kernel
+// all others        forwarded to the kernel as-is, in signature order
+// The launch configuration is built from numberOfBCnodes; numberOfLBnodes
+// is only handed through to the kernel.
+void QADBBDev27(
+    unsigned int numberOfThreads,
+    real* DD,
+    real* DD27,
+    real* temp,
+    real diffusivity,
+    int* k_Q,
+    real* QQ,
+    unsigned int numberOfBCnodes,
+    real om1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    // Launch configuration built from numberOfThreads and numberOfBCnodes.
+    vf::cuda::CudaGrid launchConfig(numberOfThreads, numberOfBCnodes);
+
+    QADBB27<<<launchConfig.grid, launchConfig.threads>>>(
+        DD, DD27, temp, diffusivity,
+        k_Q, QQ, numberOfBCnodes, om1,
+        neighborX, neighborY, neighborZ,
+        numberOfLBnodes, isEvenTimestep);
+    // Called right after the launch with a message naming the kernel.
+    getLastCudaError("QADBB27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+// Host-side launcher for the QNoSlipADincomp7 kernel.
+//
+// numberOfThreads   passed to vf::cuda::CudaGrid only (not forwarded)
+// numberOfBCnodes   passed to vf::cuda::CudaGrid and to the kernel
+// all others        forwarded to the kernel as-is, in signature order
+// The launch configuration is built from numberOfBCnodes; numberOfLBnodes
+// is only handed through to the kernel.
+void QNoSlipADincompDev7(
+    unsigned int numberOfThreads,
+    real* DD,
+    real* DD7,
+    real* temp,
+    real diffusivity,
+    int* k_Q,
+    real* QQ,
+    unsigned int numberOfBCnodes,
+    real om1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    // Launch configuration built from numberOfThreads and numberOfBCnodes.
+    vf::cuda::CudaGrid launchConfig(numberOfThreads, numberOfBCnodes);
+
+    QNoSlipADincomp7<<<launchConfig.grid, launchConfig.threads>>>(
+        DD, DD7, temp, diffusivity,
+        k_Q, QQ, numberOfBCnodes, om1,
+        neighborX, neighborY, neighborZ,
+        numberOfLBnodes, isEvenTimestep);
+    // Called right after the launch with a message naming the kernel.
+    getLastCudaError("QNoSlipADincomp7 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+void QNoSlipADincompDev27( // Host-side wrapper: builds a launch configuration and starts the QNoSlipADincomp27 kernel.
+    unsigned int numberOfThreads, // only used for the launch configuration; not forwarded to the kernel
+    real* DD,
+    real* DD27,
+    real* temp,
+    real diffusivity,
+    int* k_Q,
+    real* QQ,
+    unsigned int numberOfBCnodes, // used for the launch configuration and also forwarded to the kernel
+    real om1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes); // provides grid.grid and grid.threads for the launch below
+
+    QNoSlipADincomp27<<< grid.grid, grid.threads >>> (
+        DD,
+        DD27,
+        temp,
+        diffusivity,
+        k_Q,
+        QQ,
+        numberOfBCnodes,
+        om1,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("QNoSlipADincomp27 execution failed"); // NOTE(review): presumably reports a failed launch with this message; helper is defined elsewhere
+}
+//////////////////////////////////////////////////////////////////////////
+void QADVeloIncompDev7( // Host-side wrapper: builds a launch configuration and starts the QADVeloIncomp7 kernel.
+    unsigned int numberOfThreads, // only used for the launch configuration; not forwarded to the kernel
+    real* DD,
+    real* DD7,
+    real* temp,
+    real* velo, // extra pointer argument compared with the no-slip AD wrappers; forwarded as-is
+    real diffusivity,
+    int* k_Q,
+    real* QQ,
+    unsigned int numberOfBCnodes, // used for the launch configuration and also forwarded to the kernel
+    real om1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes); // provides grid.grid and grid.threads for the launch below
+
+    QADVeloIncomp7<<< grid.grid, grid.threads >>> (
+        DD,
+        DD7,
+        temp,
+        velo,
+        diffusivity,
+        k_Q,
+        QQ,
+        numberOfBCnodes,
+        om1,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("QADVeloIncomp7 execution failed"); // NOTE(review): presumably reports a failed launch with this message; helper is defined elsewhere
+}
+//////////////////////////////////////////////////////////////////////////
+void QADVeloIncompDev27( // Host-side wrapper: builds a launch configuration and starts the QADVeloIncomp27 kernel.
+    unsigned int numberOfThreads, // only used for the launch configuration; not forwarded to the kernel
+    real* DD,
+    real* DD27,
+    real* temp,
+    real* velo,
+    real diffusivity,
+    int* k_Q,
+    real* QQ,
+    unsigned int numberOfBCnodes, // used for the launch configuration and also forwarded to the kernel
+    real om1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes); // provides grid.grid and grid.threads for the launch below
+
+    QADVeloIncomp27<<< grid.grid, grid.threads >>> (
+        DD,
+        DD27,
+        temp,
+        velo,
+        diffusivity,
+        k_Q,
+        QQ,
+        numberOfBCnodes,
+        om1,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("QADVeloIncomp27 execution failed"); // NOTE(review): presumably reports a failed launch with this message; helper is defined elsewhere
+}
+//////////////////////////////////////////////////////////////////////////
+void QADPressIncompDev7( // Host-side wrapper: builds a launch configuration and starts the QADPressIncomp7 kernel.
+    unsigned int numberOfThreads, // only used for the launch configuration; not forwarded to the kernel
+    real* DD,
+    real* DD7,
+    real* temp,
+    real* velo,
+    real diffusivity,
+    int* k_Q,
+    real* QQ,
+    unsigned int numberOfBCnodes, // used for the launch configuration and also forwarded to the kernel
+    real om1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes); // provides grid.grid and grid.threads for the launch below
+
+    QADPressIncomp7<<< grid.grid, grid.threads >>>(
+        DD,
+        DD7,
+        temp,
+        velo,
+        diffusivity,
+        k_Q,
+        QQ,
+        numberOfBCnodes,
+        om1,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("QADPressIncomp7 execution failed"); // NOTE(review): presumably reports a failed launch with this message; helper is defined elsewhere
+}
+//////////////////////////////////////////////////////////////////////////
+void QADPressIncompDev27( // Host-side wrapper: builds a launch configuration and starts the QADPressIncomp27 kernel.
+    unsigned int numberOfThreads, // only used for the launch configuration; not forwarded to the kernel
+    real* DD,
+    real* DD27,
+    real* temp,
+    real* velo,
+    real diffusivity,
+    int* k_Q,
+    real* QQ,
+    unsigned int numberOfBCnodes, // used for the launch configuration and also forwarded to the kernel
+    real om1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes); // provides grid.grid and grid.threads for the launch below
+
+    QADPressIncomp27<<< grid.grid, grid.threads >>>(
+        DD,
+        DD27,
+        temp,
+        velo,
+        diffusivity,
+        k_Q,
+        QQ,
+        numberOfBCnodes,
+        om1,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("QADPressIncomp27 execution failed"); // NOTE(review): presumably reports a failed launch with this message; helper is defined elsewhere
 }
 //////////////////////////////////////////////////////////////////////////
 void QDev27(LBMSimulationParameter* parameterDevice, QforBoundaryConditions* boundaryCondition)
 {
-   dim3 grid = vf::cuda::getCudaGrid( parameterDevice->numberofthreads,  boundaryCondition->numberOfBCnodes);
-   dim3 threads(parameterDevice->numberofthreads, 1, 1 );
-
-      QDevice27<<< grid, threads >>> (
-            parameterDevice->distributions.f[0],
-            boundaryCondition->k,
-            boundaryCondition->q27[0],
-            boundaryCondition->numberOfBCnodes,
-            parameterDevice->omega,
-            parameterDevice->neighborX,
-            parameterDevice->neighborY,
-            parameterDevice->neighborZ,
-            parameterDevice->numberOfNodes,
-            parameterDevice->isEvenTimestep);
+    dim3 grid = vf::cuda::getCudaGrid( parameterDevice->numberofthreads,  boundaryCondition->numberOfBCnodes); // grid dimensions come from the helper, given thread count and numberOfBCnodes
+    dim3 threads(parameterDevice->numberofthreads, 1, 1 ); // one-dimensional thread block of numberofthreads threads
 
+    QDevice27<<< grid, threads >>> ( // QDev27 launches QDevice27; every kernel argument is read from the two parameter structs
+        parameterDevice->distributions.f[0],
+        boundaryCondition->k,
+        boundaryCondition->q27[0],
+        boundaryCondition->numberOfBCnodes,
+        parameterDevice->omega,
+        parameterDevice->neighborX,
+        parameterDevice->neighborY,
+        parameterDevice->neighborZ,
+        parameterDevice->numberOfNodes,
+        parameterDevice->isEvenTimestep);
       getLastCudaError("QDevice27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void QDevComp27(LBMSimulationParameter* parameterDevice, QforBoundaryConditions* boundaryCondition)
 {
-   dim3 grid = vf::cuda::getCudaGrid( parameterDevice->numberofthreads,  boundaryCondition->numberOfBCnodes);
-   dim3 threads(parameterDevice->numberofthreads, 1, 1 );
-
-      QDeviceComp27<<< grid, threads >>> (
-           parameterDevice->distributions.f[0],
-           boundaryCondition->k,
-           boundaryCondition->q27[0],
-           boundaryCondition->numberOfBCnodes,
-           parameterDevice->omega,
-           parameterDevice->neighborX,
-           parameterDevice->neighborY,
-           parameterDevice->neighborZ,
-           parameterDevice->numberOfNodes,
-           parameterDevice->isEvenTimestep);
-      getLastCudaError("QDeviceComp27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void QDevCompThinWalls27(unsigned int numberOfThreads,
-									real* DD,
-									int* k_Q,
-									real* QQ,
-									unsigned int numberOfBCnodes,
-									real om1,
-									unsigned int* geom,
-									unsigned int* neighborX,
-									unsigned int* neighborY,
-									unsigned int* neighborZ,
-									unsigned int* neighborWSB,
-									unsigned int size_Mat,
-									bool isEvenTimestep)
-{
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-   QDeviceCompThinWallsPartOne27 <<< gridQ, threads >>> (DD,
-														 k_Q,
-														 QQ,
-														 numberOfBCnodes,
-														 om1,
-														 neighborX,
-														 neighborY,
-														 neighborZ,
-														 size_Mat,
-														 isEvenTimestep);
-   getLastCudaError("QDeviceCompThinWallsPartOne27 execution failed");
-
-   QThinWallsPartTwo27 <<< gridQ, threads >>> ( DD,
-												k_Q,
-												QQ,
-												numberOfBCnodes,
-												geom,
-												neighborX,
-												neighborY,
-												neighborZ,
-												neighborWSB,
-												size_Mat,
-												isEvenTimestep);
-   getLastCudaError("QThinWallsPartTwo27 execution failed");
+    dim3 grid = vf::cuda::getCudaGrid( parameterDevice->numberofthreads,  boundaryCondition->numberOfBCnodes); // grid dimensions come from the helper, given thread count and numberOfBCnodes
+    dim3 threads(parameterDevice->numberofthreads, 1, 1 ); // one-dimensional thread block of numberofthreads threads
 
+    QDeviceComp27<<< grid, threads >>> ( // QDevComp27 launches QDeviceComp27; every kernel argument is read from the two parameter structs
+        parameterDevice->distributions.f[0],
+        boundaryCondition->k,
+        boundaryCondition->q27[0],
+        boundaryCondition->numberOfBCnodes,
+        parameterDevice->omega,
+        parameterDevice->neighborX,
+        parameterDevice->neighborY,
+        parameterDevice->neighborZ,
+        parameterDevice->numberOfNodes,
+        parameterDevice->isEvenTimestep);
+    getLastCudaError("QDeviceComp27 execution failed"); // NOTE(review): presumably reports a failed launch with this message; helper is defined elsewhere
+}
+//////////////////////////////////////////////////////////////////////////
+void QDevCompThinWalls27( // Host-side wrapper: launches QDeviceCompThinWallsPartOne27 and then QThinWallsPartTwo27 with one shared launch configuration.
+    unsigned int numberOfThreads, // only used for the launch configuration; not forwarded to either kernel
+    real* DD,
+    int* k_Q,
+    real* QQ,
+    unsigned int numberOfBCnodes, // used for the launch configuration and also forwarded to both kernels
+    real om1, // forwarded to part one only
+    unsigned int* geom, // forwarded to part two only
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned int* neighborWSB, // forwarded to part two only
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes); // provides grid.grid and grid.threads for both launches below
+
+    QDeviceCompThinWallsPartOne27 <<< grid.grid, grid.threads >>> (
+        DD,
+        k_Q,
+        QQ,
+        numberOfBCnodes,
+        om1,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("QDeviceCompThinWallsPartOne27 execution failed"); // NOTE(review): presumably reports a failed launch with this message; helper is defined elsewhere
+
+    QThinWallsPartTwo27 <<< grid.grid, grid.threads >>> ( // no stream argument on either launch, so part two is issued to the same stream after part one
+        DD,
+        k_Q,
+        QQ,
+        numberOfBCnodes,
+        geom,
+        neighborX,
+        neighborY,
+        neighborZ,
+        neighborWSB,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("QThinWallsPartTwo27 execution failed"); // same check as above, for the second launch
 }
 //////////////////////////////////////////////////////////////////////////
 void QDev3rdMomentsComp27(LBMSimulationParameter* parameterDevice, QforBoundaryConditions* boundaryCondition)
 {
-   dim3 grid = vf::cuda::getCudaGrid( parameterDevice->numberofthreads,  boundaryCondition->numberOfBCnodes);
-   dim3 threads(parameterDevice->numberofthreads, 1, 1);
-
-   QDevice3rdMomentsComp27<<< grid, threads >>> (
-         parameterDevice->distributions.f[0],
-         boundaryCondition->k,
-         boundaryCondition->q27[0],
-         boundaryCondition->numberOfBCnodes,
-         parameterDevice->omega,
-         parameterDevice->neighborX,
-         parameterDevice->neighborY,
-         parameterDevice->neighborZ,
-         parameterDevice->numberOfNodes,
-         parameterDevice->isEvenTimestep);
+    dim3 grid = vf::cuda::getCudaGrid( parameterDevice->numberofthreads,  boundaryCondition->numberOfBCnodes); // grid dimensions come from the helper, given thread count and numberOfBCnodes
+    dim3 threads(parameterDevice->numberofthreads, 1, 1); // one-dimensional thread block of numberofthreads threads
+
+    QDevice3rdMomentsComp27<<< grid, threads >>> ( // QDev3rdMomentsComp27 launches this kernel; every argument is read from the two parameter structs
+        parameterDevice->distributions.f[0],
+        boundaryCondition->k,
+        boundaryCondition->q27[0],
+        boundaryCondition->numberOfBCnodes,
+        parameterDevice->omega,
+        parameterDevice->neighborX,
+        parameterDevice->neighborY,
+        parameterDevice->neighborZ,
+        parameterDevice->numberOfNodes,
+        parameterDevice->isEvenTimestep);
    getLastCudaError("QDevice3rdMomentsComp27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
-void QDevIncompHighNu27( unsigned int numberOfThreads,
-									real* DD,
-									int* k_Q,
-									real* QQ,
-									unsigned int numberOfBCnodes,
-									real om1,
-									unsigned int* neighborX,
-									unsigned int* neighborY,
-									unsigned int* neighborZ,
-									unsigned int size_Mat,
-									bool isEvenTimestep)
-{
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QDeviceIncompHighNu27<<< gridQ, threads >>> (
-												   DD,
-												   k_Q,
-												   QQ,
-												   numberOfBCnodes,
-												   om1,
-												   neighborX,
-												   neighborY,
-												   neighborZ,
-												   size_Mat,
-												   isEvenTimestep);
-      getLastCudaError("QDeviceIncompHighNu27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void QDevCompHighNu27(   unsigned int numberOfThreads,
-									real* DD,
-									int* k_Q,
-									real* QQ,
-									unsigned int numberOfBCnodes,
-									real om1,
-									unsigned int* neighborX,
-									unsigned int* neighborY,
-									unsigned int* neighborZ,
-									unsigned int size_Mat,
-									bool isEvenTimestep)
-{
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QDeviceCompHighNu27<<< gridQ, threads >>> (
-												   DD,
-												   k_Q,
-												   QQ,
-												   numberOfBCnodes,
-												   om1,
-												   neighborX,
-												   neighborY,
-												   neighborZ,
-												   size_Mat,
-												   isEvenTimestep);
-      getLastCudaError("QDevice27 execution failed");
+void QDevIncompHighNu27( // Host-side wrapper: builds a launch configuration and starts the QDeviceIncompHighNu27 kernel.
+    unsigned int numberOfThreads, // only used for the launch configuration; not forwarded to the kernel
+    real* DD,
+    int* k_Q,
+    real* QQ,
+    unsigned int numberOfBCnodes, // used for the launch configuration and also forwarded to the kernel
+    real om1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes); // provides grid.grid and grid.threads for the launch below
+
+    QDeviceIncompHighNu27<<< grid.grid, grid.threads >>> (
+        DD,
+        k_Q,
+        QQ,
+        numberOfBCnodes,
+        om1,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("QDeviceIncompHighNu27 execution failed"); // NOTE(review): presumably reports a failed launch with this message; helper is defined elsewhere
+}
+//////////////////////////////////////////////////////////////////////////
+void QDevCompHighNu27( // Host-side wrapper: builds a launch configuration and starts the QDeviceCompHighNu27 kernel.
+    unsigned int numberOfThreads, // only used for the launch configuration; not forwarded to the kernel
+    real* DD,
+    int* k_Q,
+    real* QQ,
+    unsigned int numberOfBCnodes, // used for the launch configuration and also forwarded to the kernel
+    real om1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes); // provides grid.grid and grid.threads for the launch below
+
+    QDeviceCompHighNu27<<< grid.grid, grid.threads >>> (
+        DD,
+        k_Q,
+        QQ,
+        numberOfBCnodes,
+        om1,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("QDeviceCompHighNu27 execution failed"); // message names the kernel launched above; helper is defined elsewhere
 }
 //////////////////////////////////////////////////////////////////////////
 void QVelDevicePlainBB27(LBMSimulationParameter* parameterDevice, QforBoundaryConditions* boundaryCondition)
 {
-   dim3 grid = vf::cuda::getCudaGrid( parameterDevice->numberofthreads,  boundaryCondition->numberOfBCnodes);
-   dim3 threads(parameterDevice->numberofthreads, 1, 1 );
+    dim3 grid = vf::cuda::getCudaGrid( parameterDevice->numberofthreads,  boundaryCondition->numberOfBCnodes);
+    dim3 threads(parameterDevice->numberofthreads, 1, 1 );
 
-   QVelDevPlainBB27<<< grid, threads >>> (
-         boundaryCondition->Vx,
-         boundaryCondition->Vy,
-         boundaryCondition->Vz,
-         parameterDevice->distributions.f[0],
-         boundaryCondition->k,
-         boundaryCondition->q27[0],
-         boundaryCondition->numberOfBCnodes,
-         parameterDevice->neighborX,
-         parameterDevice->neighborY,
-         parameterDevice->neighborZ,
-         parameterDevice->numberOfNodes,
-         parameterDevice->isEvenTimestep);
-   getLastCudaError("QVelDevicePlainBB27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void QVelDeviceCouette27(unsigned int numberOfThreads,
-									real* vx,
-									real* vy,
-									real* vz,
-									real* DD,
-									int* k_Q,
-									real* QQ,
-									unsigned int numberOfBCnodes,
-									real om1,
-									unsigned int* neighborX,
-									unsigned int* neighborY,
-									unsigned int* neighborZ,
-									unsigned int size_Mat,
-									bool isEvenTimestep)
-{
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QVelDevCouette27<<< gridQ, threads >>> ( vx,
-												vy,
-												vz,
-												DD,
-												k_Q,
-												QQ,
-												numberOfBCnodes,
-												om1,
-												neighborX,
-												neighborY,
-												neighborZ,
-												size_Mat,
-												isEvenTimestep);
-      getLastCudaError("QVelDevicePlainBB27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void QVelDevice1h27(   unsigned int numberOfThreads,
-								  int nx,
-								  int ny,
-								  real* vx,
-								  real* vy,
-								  real* vz,
-								  real* DD,
-								  int* k_Q,
-								  real* QQ,
-								  unsigned int numberOfBCnodes,
-								  real om1,
-								  real Phi,
-								  real angularVelocity,
-								  unsigned int* neighborX,
-								  unsigned int* neighborY,
-								  unsigned int* neighborZ,
-								  real* coordX,
-								  real* coordY,
-								  real* coordZ,
-								  unsigned int size_Mat,
-								  bool isEvenTimestep)
-{
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QVelDev1h27<<< gridQ, threads >>> (nx,
-                                          ny,
-                                          vx,
-                                          vy,
-                                          vz,
-                                          DD,
-                                          k_Q,
-                                          QQ,
-                                          numberOfBCnodes,
-                                          om1,
-										  Phi,
-										  angularVelocity,
-                                          neighborX,
-                                          neighborY,
-                                          neighborZ,
-										  coordX,
-										  coordY,
-										  coordZ,
-                                          size_Mat,
-                                          isEvenTimestep);
-      getLastCudaError("QVelDevice27 execution failed");
+    QVelDevPlainBB27<<< grid, threads >>> ( // launched with the dim3 grid/threads set up at the top of QVelDevicePlainBB27
+        boundaryCondition->Vx,
+        boundaryCondition->Vy,
+        boundaryCondition->Vz,
+        parameterDevice->distributions.f[0],
+        boundaryCondition->k,
+        boundaryCondition->q27[0],
+        boundaryCondition->numberOfBCnodes, // note: unlike the neighbouring velocity wrappers, no omega value is passed to this kernel
+        parameterDevice->neighborX,
+        parameterDevice->neighborY,
+        parameterDevice->neighborZ,
+        parameterDevice->numberOfNodes,
+        parameterDevice->isEvenTimestep);
+    getLastCudaError("QVelDevicePlainBB27 execution failed"); // message carries the wrapper name, not the kernel name QVelDevPlainBB27
+}
+//////////////////////////////////////////////////////////////////////////
+void QVelDeviceCouette27( // Host-side wrapper: builds a launch configuration and starts the QVelDevCouette27 kernel.
+    unsigned int numberOfThreads, // only used for the launch configuration; not forwarded to the kernel
+    real* vx,
+    real* vy,
+    real* vz,
+    real* DD,
+    int* k_Q,
+    real* QQ,
+    unsigned int numberOfBCnodes, // used for the launch configuration and also forwarded to the kernel
+    real om1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes); // provides grid.grid and grid.threads for the launch below
+
+    QVelDevCouette27<<< grid.grid, grid.threads >>> (
+        vx,
+        vy,
+        vz,
+        DD,
+        k_Q,
+        QQ,
+        numberOfBCnodes,
+        om1,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("QVelDevCouette27 execution failed"); // message names the kernel launched above; helper is defined elsewhere
+}
+//////////////////////////////////////////////////////////////////////////
+void QVelDevice1h27( // Host-side wrapper: builds a launch configuration and starts the QVelDev1h27 kernel.
+    unsigned int numberOfThreads, // only used for the launch configuration; not forwarded to the kernel
+    int nx,
+    int ny,
+    real* vx,
+    real* vy,
+    real* vz,
+    real* DD,
+    int* k_Q,
+    real* QQ,
+    unsigned int numberOfBCnodes, // used for the launch configuration and also forwarded to the kernel
+    real om1,
+    real Phi,
+    real angularVelocity,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    real* coordX,
+    real* coordY,
+    real* coordZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes); // provides grid.grid and grid.threads for the launch below
+
+    QVelDev1h27<<< grid.grid, grid.threads >>> ( // arguments are passed in the same order as the wrapper's parameters, minus numberOfThreads
+        nx,
+        ny,
+        vx,
+        vy,
+        vz,
+        DD,
+        k_Q,
+        QQ,
+        numberOfBCnodes,
+        om1,
+        Phi,
+        angularVelocity,
+        neighborX,
+        neighborY,
+        neighborZ,
+        coordX,
+        coordY,
+        coordZ,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("QVelDev1h27 execution failed"); // message names the kernel launched above; helper is defined elsewhere
 }
 //////////////////////////////////////////////////////////////////////////
 void QVelDev27(LBMSimulationParameter* parameterDevice, QforBoundaryConditions* boundaryCondition)
 {
-   dim3 grid = vf::cuda::getCudaGrid( parameterDevice->numberofthreads,  boundaryCondition->numberOfBCnodes);
-   dim3 threads(parameterDevice->numberofthreads, 1, 1 );
+    dim3 grid = vf::cuda::getCudaGrid( parameterDevice->numberofthreads,  boundaryCondition->numberOfBCnodes);
+    dim3 threads(parameterDevice->numberofthreads, 1, 1 );
 
-      QVelDevice27<<< grid, threads >>> (
-            parameterDevice->nx,
-            parameterDevice->ny,
-            boundaryCondition->Vx,
-            boundaryCondition->Vy,
-            boundaryCondition->Vz,
-            parameterDevice->distributions.f[0],
-            boundaryCondition->k,
-            boundaryCondition->q27[0],
-            boundaryCondition->numberOfBCnodes,
-            parameterDevice->omega,
-            parameterDevice->neighborX,
-            parameterDevice->neighborY,
-            parameterDevice->neighborZ,
-            parameterDevice->numberOfNodes,
-            parameterDevice->isEvenTimestep);
-      getLastCudaError("QVelDevice27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void QVelDevCompPlusSlip27(unsigned int numberOfThreads,
-									  real* vx,
-									  real* vy,
-									  real* vz,
-									  real* DD,
-									  int* k_Q,
-									  real* QQ,
-									  unsigned int numberOfBCnodes,
-									  real om1,
-									  unsigned int* neighborX,
-									  unsigned int* neighborY,
-									  unsigned int* neighborZ,
-									  unsigned int size_Mat,
-									  bool isEvenTimestep)
-{
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QVelDeviceCompPlusSlip27<<< gridQ, threads >>> (
-													  vx,
-													  vy,
-													  vz,
-													  DD,
-													  k_Q,
-													  QQ,
-													  numberOfBCnodes,
-													  om1,
-													  neighborX,
-													  neighborY,
-													  neighborZ,
-													  size_Mat,
-													  isEvenTimestep);
-      getLastCudaError("QVelDeviceCompPlusSlip27 execution failed");
+    QVelDevice27<<< grid, threads >>> ( // launched with the dim3 grid/threads set up at the top of QVelDev27
+        parameterDevice->nx,
+        parameterDevice->ny,
+        boundaryCondition->Vx,
+        boundaryCondition->Vy,
+        boundaryCondition->Vz,
+        parameterDevice->distributions.f[0],
+        boundaryCondition->k,
+        boundaryCondition->q27[0],
+        boundaryCondition->numberOfBCnodes,
+        parameterDevice->omega,
+        parameterDevice->neighborX,
+        parameterDevice->neighborY,
+        parameterDevice->neighborZ,
+        parameterDevice->numberOfNodes,
+        parameterDevice->isEvenTimestep);
+    getLastCudaError("QVelDevice27 execution failed"); // NOTE(review): presumably reports a failed launch with this message; helper is defined elsewhere
+}
+//////////////////////////////////////////////////////////////////////////
+void QVelDevCompPlusSlip27( // Host-side wrapper: builds a launch configuration and starts the QVelDeviceCompPlusSlip27 kernel.
+    unsigned int numberOfThreads, // only used for the launch configuration; not forwarded to the kernel
+    real* vx,
+    real* vy,
+    real* vz,
+    real* DD,
+    int* k_Q,
+    real* QQ,
+    unsigned int numberOfBCnodes, // used for the launch configuration and also forwarded to the kernel
+    real om1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes); // provides grid.grid and grid.threads for the launch below
+
+    QVelDeviceCompPlusSlip27<<< grid.grid, grid.threads >>> (
+        vx,
+        vy,
+        vz,
+        DD,
+        k_Q,
+        QQ,
+        numberOfBCnodes,
+        om1,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("QVelDeviceCompPlusSlip27 execution failed"); // NOTE(review): presumably reports a failed launch with this message; helper is defined elsewhere
 }
 //////////////////////////////////////////////////////////////////////////
 void QVelDevComp27(LBMSimulationParameter* parameterDevice, QforBoundaryConditions* boundaryCondition)
 {
-   dim3 grid = vf::cuda::getCudaGrid(parameterDevice->numberofthreads,  boundaryCondition->numberOfBCnodes);
-   dim3 threads(parameterDevice->numberofthreads, 1, 1 );
+    dim3 grid = vf::cuda::getCudaGrid(parameterDevice->numberofthreads,  boundaryCondition->numberOfBCnodes);
+    dim3 threads(parameterDevice->numberofthreads, 1, 1 );
 
-   QVelDeviceComp27<<< grid, threads >>> (
-            boundaryCondition->Vx,
-            boundaryCondition->Vy,
-            boundaryCondition->Vz,
-            parameterDevice->distributions.f[0],
-            boundaryCondition->k,        
-            boundaryCondition->q27[0],
-            boundaryCondition->numberOfBCnodes,
-            parameterDevice->omega,
-            parameterDevice->neighborX,
-            parameterDevice->neighborY,
-            parameterDevice->neighborZ,
-            parameterDevice->numberOfNodes,
-            parameterDevice->isEvenTimestep);
+    QVelDeviceComp27<<< grid, threads >>> ( // launched with the dim3 grid/threads set up at the top of QVelDevComp27
+        boundaryCondition->Vx,
+        boundaryCondition->Vy,
+        boundaryCondition->Vz,
+        parameterDevice->distributions.f[0],
+        boundaryCondition->k,
+        boundaryCondition->q27[0],
+        boundaryCondition->numberOfBCnodes,
+        parameterDevice->omega,
+        parameterDevice->neighborX,
+        parameterDevice->neighborY,
+        parameterDevice->neighborZ,
+        parameterDevice->numberOfNodes,
+        parameterDevice->isEvenTimestep); // every kernel argument is read from the two parameter structs
    getLastCudaError("QVelDeviceComp27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
-void QVelDevCompThinWalls27(unsigned int numberOfThreads,
-							           real* vx,
-							           real* vy,
-							           real* vz,
-							           real* DD,
-							           int* k_Q,
-							           real* QQ,
-							           unsigned int numberOfBCnodes,
-							           real om1,
-									     unsigned int* geom,
-							           unsigned int* neighborX,
-							           unsigned int* neighborY,
-							           unsigned int* neighborZ,
-									     unsigned int* neighborWSB,
-							           unsigned int size_Mat,
-							           bool isEvenTimestep)
-{
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-   QVelDeviceCompThinWallsPartOne27<<< gridQ, threads >>> (vx,
-											                  vy,
-											                  vz,
-											                  DD,
-											                  k_Q,
-											                  QQ,
-											                  numberOfBCnodes,
-											                  om1,
-											                  neighborX,
-											                  neighborY,
-											                  neighborZ,
-											                  size_Mat,
-											                  isEvenTimestep);
-   getLastCudaError("QVelDeviceCompThinWallsPartOne27 execution failed");
-
-	QThinWallsPartTwo27 <<< gridQ, threads >>> (
-       DD,
-       k_Q,
-       QQ,
-       numberOfBCnodes,
-       geom,
-       neighborX,
-       neighborY,
-       neighborZ,
-       neighborWSB,
-       size_Mat,
-       isEvenTimestep);
-   getLastCudaError("QThinWallsPartTwo27 execution failed");
-}
-
-void QVelDevCompZeroPress27(LBMSimulationParameter* parameterDevice, QforBoundaryConditions* boundaryCondition)
+void QVelDevCompThinWalls27( // Host-side wrapper: launches QVelDeviceCompThinWallsPartOne27 and then QThinWallsPartTwo27 with one shared launch configuration.
+    unsigned int numberOfThreads, // only used for the launch configuration; not forwarded to either kernel
+    real* vx, // vx, vy, vz are forwarded to part one only
+    real* vy,
+    real* vz,
+    real* DD,
+    int* k_Q,
+    real* QQ,
+    unsigned int numberOfBCnodes, // used for the launch configuration and also forwarded to both kernels
+    real om1, // forwarded to part one only
+    unsigned int* geom, // forwarded to part two only
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned int* neighborWSB, // forwarded to part two only
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes); // provides grid.grid and grid.threads for both launches below
+
+    QVelDeviceCompThinWallsPartOne27<<< grid.grid, grid.threads >>> (
+        vx,
+        vy,
+        vz,
+        DD,
+        k_Q,
+        QQ,
+        numberOfBCnodes,
+        om1,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("QVelDeviceCompThinWallsPartOne27 execution failed"); // NOTE(review): presumably reports a failed launch with this message; helper is defined elsewhere
+
+    QThinWallsPartTwo27 <<< grid.grid, grid.threads >>> ( // no stream argument on either launch, so part two is issued to the same stream after part one
+        DD,
+        k_Q,
+        QQ,
+        numberOfBCnodes,
+        geom,
+        neighborX,
+        neighborY,
+        neighborZ,
+        neighborWSB,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("QThinWallsPartTwo27 execution failed"); // same check as above, for the second launch
+}
+//////////////////////////////////////////////////////////////////////////
+void QVelDevCompZeroPress27(LBMSimulationParameter *parameterDevice, QforBoundaryConditions *boundaryCondition)
 {
    dim3 grid = vf::cuda::getCudaGrid( parameterDevice->numberofthreads,  boundaryCondition->numberOfBCnodes);
    dim3 threads(parameterDevice->numberofthreads, 1, 1 );
 
-   QVelDeviceCompZeroPress27<<< grid, threads >>> (
-            boundaryCondition->Vx,
-            boundaryCondition->Vy,
-            boundaryCondition->Vz,
-            parameterDevice->distributions.f[0],
-            boundaryCondition->k,
-            boundaryCondition->q27[0],
-            boundaryCondition->numberOfBCnodes,
-            parameterDevice->omega,
-            parameterDevice->neighborX,
-            parameterDevice->neighborY,
-            parameterDevice->neighborZ,
-            parameterDevice->numberOfNodes,
-            parameterDevice->isEvenTimestep);
-   getLastCudaError("QVelDeviceCompZeroPress27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void QVelDevIncompHighNu27(unsigned int numberOfThreads,
-									  real* vx,
-									  real* vy,
-									  real* vz,
-									  real* DD,
-									  int* k_Q,
-									  real* QQ,
-									  unsigned int numberOfBCnodes,
-									  real om1,
-									  unsigned int* neighborX,
-									  unsigned int* neighborY,
-									  unsigned int* neighborZ,
-									  unsigned int size_Mat,
-									  bool isEvenTimestep)
-{
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QVelDeviceIncompHighNu27<<< gridQ, threads >>> (
-													  vx,
-													  vy,
-													  vz,
-													  DD,
-													  k_Q,
-													  QQ,
-													  numberOfBCnodes,
-													  om1,
-													  neighborX,
-													  neighborY,
-													  neighborZ,
-													  size_Mat,
-													  isEvenTimestep);
-      getLastCudaError("QVelDeviceIncompHighNu27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void QVelDevCompHighNu27(  unsigned int numberOfThreads,
-									  real* vx,
-									  real* vy,
-									  real* vz,
-									  real* DD,
-									  int* k_Q,
-									  real* QQ,
-									  unsigned int numberOfBCnodes,
-									  real om1,
-									  unsigned int* neighborX,
-									  unsigned int* neighborY,
-									  unsigned int* neighborZ,
-									  unsigned int size_Mat,
-									  bool isEvenTimestep)
-{
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QVelDeviceCompHighNu27<<< gridQ, threads >>> (
-													  vx,
-													  vy,
-													  vz,
-													  DD,
-													  k_Q,
-													  QQ,
-													  numberOfBCnodes,
-													  om1,
-													  neighborX,
-													  neighborY,
-													  neighborZ,
-													  size_Mat,
-													  isEvenTimestep);
-      getLastCudaError("QVelDeviceComp27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void QVeloDevEQ27(unsigned int numberOfThreads,
-							 real* VeloX,
-							 real* VeloY,
-							 real* VeloZ,
-							 real* DD,
-							 int* k_Q,
-							 int numberOfBCnodes,
-							 real om1,
-							 unsigned int* neighborX,
-							 unsigned int* neighborY,
-							 unsigned int* neighborZ,
-							 unsigned int size_Mat,
-							 bool isEvenTimestep)
-{
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QVeloDeviceEQ27<<< gridQ, threads >>> (VeloX,
-											 VeloY,
-											 VeloZ,
-											 DD,
-											 k_Q,
-											 numberOfBCnodes,
-											 om1,
-											 neighborX,
-											 neighborY,
-											 neighborZ,
-											 size_Mat,
-											 isEvenTimestep);
-      getLastCudaError("QVeloDeviceEQ27 execution failed");
+    QVelDeviceCompZeroPress27<<< grid, threads >>> (
+        boundaryCondition->Vx,
+        boundaryCondition->Vy,
+        boundaryCondition->Vz,
+        parameterDevice->distributions.f[0],
+        boundaryCondition->k,
+        boundaryCondition->q27[0],
+        boundaryCondition->numberOfBCnodes,
+        parameterDevice->omega,
+        parameterDevice->neighborX,
+        parameterDevice->neighborY,
+        parameterDevice->neighborZ,
+        parameterDevice->numberOfNodes,
+        parameterDevice->isEvenTimestep);
+    getLastCudaError("QVelDeviceCompZeroPress27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+void QVelDevIncompHighNu27(
+    unsigned int numberOfThreads,
+    real* vx,
+    real* vy,
+    real* vz,
+    real* DD,
+    int* k_Q,
+    real* QQ,
+    unsigned int numberOfBCnodes,
+    real om1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes);
+
+    QVelDeviceIncompHighNu27<<< grid.grid, grid.threads >>> (
+        vx,
+        vy,
+        vz,
+        DD,
+        k_Q,
+        QQ,
+        numberOfBCnodes,
+        om1,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("QVelDeviceIncompHighNu27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+void QVelDevCompHighNu27( // host-side launcher for the QVelDeviceCompHighNu27 kernel
+    unsigned int numberOfThreads,
+    real* vx,
+    real* vy,
+    real* vz,
+    real* DD,
+    int* k_Q,
+    real* QQ,
+    unsigned int numberOfBCnodes,
+    real om1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes); // launch configuration built from thread count and number of BC nodes
+
+    QVelDeviceCompHighNu27<<< grid.grid, grid.threads >>> (
+        vx,
+        vy,
+        vz,
+        DD,
+        k_Q,
+        QQ,
+        numberOfBCnodes,
+        om1,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("QVelDeviceCompHighNu27 execution failed"); // message names the kernel actually launched above
+}
+//////////////////////////////////////////////////////////////////////////
+void QVeloDevEQ27(
+    unsigned int numberOfThreads,
+    real* VeloX,
+    real* VeloY,
+    real* VeloZ,
+    real* DD,
+    int* k_Q,
+    int numberOfBCnodes,
+    real om1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes);
+
+    QVeloDeviceEQ27<<< grid.grid, grid.threads >>> (
+        VeloX,
+        VeloY,
+        VeloZ,
+        DD,
+        k_Q,
+        numberOfBCnodes,
+        om1,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("QVeloDeviceEQ27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void QVeloStreetDevEQ27(
-	uint  numberOfThreads,
-	real* veloXfraction,
-	real* veloYfraction,
-	int*  naschVelo,
-	real* DD,
-	int*  naschIndex,
-	int   numberOfStreetNodes,
-	real  velocityRatio,
-	uint* neighborX,
-	uint* neighborY,
-	uint* neighborZ,
-	uint  size_Mat,
-	bool  isEvenTimestep)
-{
-	int Grid = (numberOfStreetNodes / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid > 512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 gridQ(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	QVeloStreetDeviceEQ27 << < gridQ, threads >> > (
-		veloXfraction,
-		veloYfraction,
-		naschVelo,
-		DD,
-		naschIndex,
-		numberOfStreetNodes,
-		velocityRatio,
-		neighborX,
-		neighborY,
-		neighborZ,
-		size_Mat,
-		isEvenTimestep);
-	getLastCudaError("QVeloStreetDeviceEQ27 execution failed");
+    uint  numberOfThreads,
+    real* veloXfraction,
+    real* veloYfraction,
+    int*  naschVelo,
+    real* DD,
+    int*  naschIndex,
+    int   numberOfStreetNodes,
+    real  velocityRatio,
+    uint* neighborX,
+    uint* neighborY,
+    uint* neighborZ,
+    uint  numberOfLBnodes,
+    bool  isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfStreetNodes);
+
+    QVeloStreetDeviceEQ27 << < grid.grid, grid.threads >> > (
+        veloXfraction,
+        veloYfraction,
+        naschVelo,
+        DD,
+        naschIndex,
+        numberOfStreetNodes,
+        velocityRatio,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("QVeloStreetDeviceEQ27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void QSlipDev27(LBMSimulationParameter* parameterDevice, QforBoundaryConditions* boundaryCondition)
 {
-   dim3 grid = vf::cuda::getCudaGrid( parameterDevice->numberofthreads, boundaryCondition->numberOfBCnodes);
-   dim3 threads(parameterDevice->numberofthreads, 1, 1 );
+    dim3 grid = vf::cuda::getCudaGrid( parameterDevice->numberofthreads, boundaryCondition->numberOfBCnodes);
+    dim3 threads(parameterDevice->numberofthreads, 1, 1 );
 
-   QSlipDevice27<<< grid, threads >>> (
-         parameterDevice->distributions.f[0],
-         boundaryCondition->k,
-         boundaryCondition->q27[0],
-         boundaryCondition->numberOfBCnodes,
-         parameterDevice->omega,
-         parameterDevice->neighborX,
-         parameterDevice->neighborY,
-         parameterDevice->neighborZ,
-         parameterDevice->numberOfNodes,
-         parameterDevice->isEvenTimestep);
-   getLastCudaError("QSlipDevice27 execution failed");
+    QSlipDevice27<<< grid, threads >>> (
+        parameterDevice->distributions.f[0],
+        boundaryCondition->k,
+        boundaryCondition->q27[0],
+        boundaryCondition->numberOfBCnodes,
+        parameterDevice->omega,
+        parameterDevice->neighborX,
+        parameterDevice->neighborY,
+        parameterDevice->neighborZ,
+        parameterDevice->numberOfNodes,
+        parameterDevice->isEvenTimestep);
+    getLastCudaError("QSlipDevice27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void QSlipDevCompTurbulentViscosity27(LBMSimulationParameter* parameterDevice, QforBoundaryConditions* boundaryCondition)
 {
-   dim3 grid = vf::cuda::getCudaGrid( parameterDevice->numberofthreads, boundaryCondition->numberOfBCnodes);
-   dim3 threads(parameterDevice->numberofthreads, 1, 1 );
+    dim3 grid = vf::cuda::getCudaGrid( parameterDevice->numberofthreads, boundaryCondition->numberOfBCnodes);
+    dim3 threads(parameterDevice->numberofthreads, 1, 1 );
 
-   QSlipDeviceComp27TurbViscosity<<< grid, threads >>> (
-         parameterDevice->distributions.f[0],
-         boundaryCondition->k,
-         boundaryCondition->q27[0],
-         boundaryCondition->numberOfBCnodes,
-         parameterDevice->omega,
-         parameterDevice->neighborX,
-         parameterDevice->neighborY,
-         parameterDevice->neighborZ,
-         parameterDevice->turbViscosity,
-         parameterDevice->numberOfNodes,
-         parameterDevice->isEvenTimestep);
-   getLastCudaError("QSlipDeviceComp27TurbViscosity execution failed");
+    QSlipDeviceComp27TurbViscosity<<< grid, threads >>> (
+        parameterDevice->distributions.f[0],
+        boundaryCondition->k,
+        boundaryCondition->q27[0],
+        boundaryCondition->numberOfBCnodes,
+        parameterDevice->omega,
+        parameterDevice->neighborX,
+        parameterDevice->neighborY,
+        parameterDevice->neighborZ,
+        parameterDevice->turbViscosity,
+        parameterDevice->numberOfNodes,
+        parameterDevice->isEvenTimestep);
+    getLastCudaError("QSlipDeviceComp27TurbViscosity execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void QSlipPressureDevCompTurbulentViscosity27(LBMSimulationParameter* parameterDevice, QforBoundaryConditions* boundaryCondition)
 {
-   dim3 grid = vf::cuda::getCudaGrid( parameterDevice->numberofthreads, boundaryCondition->numberOfBCnodes);
-   dim3 threads(parameterDevice->numberofthreads, 1, 1 );
+    dim3 grid = vf::cuda::getCudaGrid( parameterDevice->numberofthreads, boundaryCondition->numberOfBCnodes); // launch grid derived from thread count and number of BC nodes
+    dim3 threads(parameterDevice->numberofthreads, 1, 1 );
 
-   QSlipPressureDeviceComp27TurbViscosity<<< grid, threads >>> (
-         parameterDevice->distributions.f[0],
-         boundaryCondition->k,
-         boundaryCondition->q27[0],
-         boundaryCondition->numberOfBCnodes,
-         parameterDevice->omega,
-         parameterDevice->neighborX,
-         parameterDevice->neighborY,
-         parameterDevice->neighborZ,
-         parameterDevice->turbViscosity,
-         parameterDevice->numberOfNodes,
-         parameterDevice->isEvenTimestep);
-   getLastCudaError("QSlipDeviceComp27TurbViscosity execution failed");
+    QSlipPressureDeviceComp27TurbViscosity<<< grid, threads >>> (
+        parameterDevice->distributions.f[0],
+        boundaryCondition->k,
+        boundaryCondition->q27[0],
+        boundaryCondition->numberOfBCnodes,
+        parameterDevice->omega,
+        parameterDevice->neighborX,
+        parameterDevice->neighborY,
+        parameterDevice->neighborZ,
+        parameterDevice->turbViscosity,
+        parameterDevice->numberOfNodes,
+        parameterDevice->isEvenTimestep);
+    getLastCudaError("QSlipPressureDeviceComp27TurbViscosity execution failed"); // message names the kernel actually launched above
 }
 //////////////////////////////////////////////////////////////////////////
 void QSlipDevComp27(LBMSimulationParameter* parameterDevice, QforBoundaryConditions* boundaryCondition)
 {
-   dim3 grid = vf::cuda::getCudaGrid( parameterDevice->numberofthreads, boundaryCondition->numberOfBCnodes);
-   dim3 threads(parameterDevice->numberofthreads, 1, 1 );
+    dim3 grid = vf::cuda::getCudaGrid( parameterDevice->numberofthreads, boundaryCondition->numberOfBCnodes);
+    dim3 threads(parameterDevice->numberofthreads, 1, 1 );
 
-   QSlipDeviceComp27<<< grid, threads >>> (
-         parameterDevice->distributions.f[0],
-         boundaryCondition->k,
-         boundaryCondition->q27[0],
-         boundaryCondition->numberOfBCnodes,
-         parameterDevice->omega,
-         parameterDevice->neighborX,
-         parameterDevice->neighborY,
-         parameterDevice->neighborZ,
-         parameterDevice->numberOfNodes,
-         parameterDevice->isEvenTimestep);
-   getLastCudaError("QSlipDeviceComp27 execution failed");
+    QSlipDeviceComp27<<< grid, threads >>> (
+        parameterDevice->distributions.f[0],
+        boundaryCondition->k,
+        boundaryCondition->q27[0],
+        boundaryCondition->numberOfBCnodes,
+        parameterDevice->omega,
+        parameterDevice->neighborX,
+        parameterDevice->neighborY,
+        parameterDevice->neighborZ,
+        parameterDevice->numberOfNodes,
+        parameterDevice->isEvenTimestep);
+    getLastCudaError("QSlipDeviceComp27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void BBSlipDevComp27(LBMSimulationParameter* parameterDevice, QforBoundaryConditions* boundaryCondition)
 {
-   dim3 grid = vf::cuda::getCudaGrid( parameterDevice->numberofthreads, boundaryCondition->numberOfBCnodes);
-   dim3 threads(parameterDevice->numberofthreads, 1, 1 );
+    dim3 grid = vf::cuda::getCudaGrid( parameterDevice->numberofthreads, boundaryCondition->numberOfBCnodes);
+    dim3 threads(parameterDevice->numberofthreads, 1, 1 );
 
-   QSlipDeviceComp27<<< grid, threads >>> (
-         parameterDevice->distributions.f[0],
-         boundaryCondition->k,
-         boundaryCondition->q27[0],
-         boundaryCondition->numberOfBCnodes,
-         parameterDevice->omega,
-         parameterDevice->neighborX,
-         parameterDevice->neighborY,
-         parameterDevice->neighborZ,
-         parameterDevice->numberOfNodes,
-         parameterDevice->isEvenTimestep);
-   getLastCudaError("BBSlipDeviceComp27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void QSlipGeomDevComp27(unsigned int numberOfThreads,
-								   real* DD,
-								   int* k_Q,
-								   real* QQ,
-								   unsigned int numberOfBCnodes,
-								   real om1,
-								   real* NormalX,
-								   real* NormalY,
-								   real* NormalZ,
-								   unsigned int* neighborX,
-								   unsigned int* neighborY,
-								   unsigned int* neighborZ,
-								   unsigned int size_Mat,
-								   bool isEvenTimestep)
-{
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QSlipGeomDeviceComp27<<< gridQ, threads >>> (DD,
-												   k_Q,
-												   QQ,
-												   numberOfBCnodes,
-												   om1,
-												   NormalX,
-												   NormalY,
-												   NormalZ,
-												   neighborX,
-												   neighborY,
-												   neighborZ,
-												   size_Mat,
-												   isEvenTimestep);
-      getLastCudaError("QSlipGeomDeviceComp27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void QSlipNormDevComp27(unsigned int numberOfThreads,
-								   real* DD,
-								   int* k_Q,
-								   real* QQ,
-								   unsigned int numberOfBCnodes,
-								   real om1,
-								   real* NormalX,
-								   real* NormalY,
-								   real* NormalZ,
-								   unsigned int* neighborX,
-								   unsigned int* neighborY,
-								   unsigned int* neighborZ,
-								   unsigned int size_Mat,
-								   bool isEvenTimestep)
-{
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QSlipNormDeviceComp27<<< gridQ, threads >>> (DD,
-												   k_Q,
-												   QQ,
-												   numberOfBCnodes,
-												   om1,
-												   NormalX,
-												   NormalY,
-												   NormalZ,
-												   neighborX,
-												   neighborY,
-												   neighborZ,
-												   size_Mat,
-												   isEvenTimestep);
-      getLastCudaError("QSlipGeomDeviceComp27 execution failed");
+    BBSlipDeviceComp27<<< grid, threads >>> (
+        parameterDevice->distributions.f[0],
+        boundaryCondition->k,
+        boundaryCondition->q27[0],
+        boundaryCondition->numberOfBCnodes,
+        parameterDevice->neighborX,
+        parameterDevice->neighborY,
+        parameterDevice->neighborZ,
+        parameterDevice->numberOfNodes,
+        parameterDevice->isEvenTimestep);
+    getLastCudaError("BBSlipDeviceComp27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+void QSlipGeomDevComp27(
+    unsigned int numberOfThreads,
+    real* DD,
+    int* k_Q,
+    real* QQ,
+    unsigned int numberOfBCnodes,
+    real om1,
+    real* NormalX,
+    real* NormalY,
+    real* NormalZ,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid(numberOfThreads, numberOfBCnodes);
+
+    QSlipGeomDeviceComp27<<< grid.grid, grid.threads >>> (
+        DD,
+        k_Q,
+        QQ,
+        numberOfBCnodes,
+        om1,
+        NormalX,
+        NormalY,
+        NormalZ,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("QSlipGeomDeviceComp27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+void QSlipNormDevComp27(
+    unsigned int numberOfThreads,
+    real* DD,
+    int* k_Q,
+    real* QQ,
+    unsigned int numberOfBCnodes,
+    real om1,
+    real* NormalX,
+    real* NormalY,
+    real* NormalZ,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes);
+
+    QSlipNormDeviceComp27<<< grid.grid, grid.threads >>> (
+        DD,
+        k_Q,
+        QQ,
+        numberOfBCnodes,
+        om1,
+        NormalX,
+        NormalY,
+        NormalZ,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("QSlipNormDeviceComp27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void QStressDevComp27(Parameter *para,  QforBoundaryConditions* boundaryCondition, const int level)
 {
-   dim3 grid = vf::cuda::getCudaGrid(  para->getParD(level)->numberofthreads, boundaryCondition->numberOfBCnodes);
-   dim3 threads(para->getParD(level)->numberofthreads, 1, 1 );
-
-      QStressDeviceComp27<<< grid, threads >>> (
-         para->getParD(level)->distributions.f[0],
-         boundaryCondition->k,
-         boundaryCondition->kN,
-         boundaryCondition->q27[0],
-         boundaryCondition->numberOfBCnodes,
-         para->getParD(level)->omega,
-         para->getParD(level)->turbViscosity,
-         para->getParD(level)->velocityX,
-         para->getParD(level)->velocityY,
-         para->getParD(level)->velocityY,
-         boundaryCondition->normalX,
-         boundaryCondition->normalY,
-         boundaryCondition->normalZ,
-         boundaryCondition->Vx,
-         boundaryCondition->Vy,
-         boundaryCondition->Vz,
-         boundaryCondition->Vx1,
-         boundaryCondition->Vy1,
-         boundaryCondition->Vz1,
-         para->getParD(level)->wallModel.samplingOffset,
-         para->getParD(level)->wallModel.z0,
-         para->getHasWallModelMonitor(),
-         para->getParD(level)->wallModel.u_star,
-         para->getParD(level)->wallModel.Fx,
-         para->getParD(level)->wallModel.Fy,
-         para->getParD(level)->wallModel.Fz,
-         para->getParD(level)->neighborX,
-         para->getParD(level)->neighborY,
-         para->getParD(level)->neighborZ,
-         para->getParD(level)->numberOfNodes,
-         para->getParD(level)->isEvenTimestep);
-      getLastCudaError("QSlipDeviceComp27 execution failed");
+    dim3 grid = vf::cuda::getCudaGrid(  para->getParD(level)->numberofthreads, boundaryCondition->numberOfBCnodes); // launch grid derived from thread count and number of BC nodes
+    dim3 threads(para->getParD(level)->numberofthreads, 1, 1 );
+
+    QStressDeviceComp27<<< grid, threads >>> (
+        para->getParD(level)->distributions.f[0],
+        boundaryCondition->k,
+        boundaryCondition->kN,
+        boundaryCondition->q27[0],
+        boundaryCondition->numberOfBCnodes,
+        para->getParD(level)->omega,
+        para->getParD(level)->turbViscosity,
+        para->getParD(level)->velocityX,
+        para->getParD(level)->velocityY,
+        para->getParD(level)->velocityZ, // third velocity slot; NOTE(review): confirm the kernel expects vx, vy, vz in this order
+        boundaryCondition->normalX,
+        boundaryCondition->normalY,
+        boundaryCondition->normalZ,
+        boundaryCondition->Vx,
+        boundaryCondition->Vy,
+        boundaryCondition->Vz,
+        boundaryCondition->Vx1,
+        boundaryCondition->Vy1,
+        boundaryCondition->Vz1,
+        para->getParD(level)->wallModel.samplingOffset,
+        para->getParD(level)->wallModel.z0,
+        para->getHasWallModelMonitor(),
+        para->getParD(level)->wallModel.u_star,
+        para->getParD(level)->wallModel.Fx,
+        para->getParD(level)->wallModel.Fy,
+        para->getParD(level)->wallModel.Fz,
+        para->getParD(level)->neighborX,
+        para->getParD(level)->neighborY,
+        para->getParD(level)->neighborZ,
+        para->getParD(level)->numberOfNodes,
+        para->getParD(level)->isEvenTimestep);
+    getLastCudaError("QStressDeviceComp27 execution failed");
 }
 
 //////////////////////////////////////////////////////////////////////////
 void BBStressDev27(Parameter *para,  QforBoundaryConditions* boundaryCondition, const int level)
 {
-   dim3 grid = vf::cuda::getCudaGrid( para->getParD(level)->numberofthreads, boundaryCondition->numberOfBCnodes);
-   dim3 threads(para->getParD(level)->numberofthreads, 1, 1 );
-
-   BBStressDevice27<<< grid, threads >>> (
-      para->getParD(level)->distributions.f[0],
-      boundaryCondition->k,
-      boundaryCondition->kN,
-      boundaryCondition->q27[0],
-      boundaryCondition->numberOfBCnodes,
-      para->getParD(level)->velocityX,
-      para->getParD(level)->velocityY,
-      para->getParD(level)->velocityY,
-      boundaryCondition->normalX,
-      boundaryCondition->normalY,
-      boundaryCondition->normalZ,
-      boundaryCondition->Vx,
-      boundaryCondition->Vy,
-      boundaryCondition->Vz,
-      boundaryCondition->Vx1,
-      boundaryCondition->Vy1,
-      boundaryCondition->Vz1,
-      para->getParD(level)->wallModel.samplingOffset,
-      para->getParD(level)->wallModel.z0,
-      para->getHasWallModelMonitor(),
-      para->getParD(level)->wallModel.u_star,
-      para->getParD(level)->wallModel.Fx,
-      para->getParD(level)->wallModel.Fy,
-      para->getParD(level)->wallModel.Fz,
-      para->getParD(level)->neighborX,
-      para->getParD(level)->neighborY,
-      para->getParD(level)->neighborZ,
-      para->getParD(level)->numberOfNodes,
-      para->getParD(level)->isEvenTimestep);
-      getLastCudaError("BBStressDevice27 execution failed");
+    dim3 grid = vf::cuda::getCudaGrid( para->getParD(level)->numberofthreads, boundaryCondition->numberOfBCnodes); // launch grid derived from thread count and number of BC nodes
+    dim3 threads(para->getParD(level)->numberofthreads, 1, 1 );
+
+    BBStressDevice27<<< grid, threads >>> (
+        para->getParD(level)->distributions.f[0],
+        boundaryCondition->k,
+        boundaryCondition->kN,
+        boundaryCondition->q27[0],
+        boundaryCondition->numberOfBCnodes,
+        para->getParD(level)->velocityX,
+        para->getParD(level)->velocityY,
+        para->getParD(level)->velocityZ, // third velocity slot; NOTE(review): confirm the kernel expects vx, vy, vz in this order
+        boundaryCondition->normalX,
+        boundaryCondition->normalY,
+        boundaryCondition->normalZ,
+        boundaryCondition->Vx,
+        boundaryCondition->Vy,
+        boundaryCondition->Vz,
+        boundaryCondition->Vx1,
+        boundaryCondition->Vy1,
+        boundaryCondition->Vz1,
+        para->getParD(level)->wallModel.samplingOffset,
+        para->getParD(level)->wallModel.z0,
+        para->getHasWallModelMonitor(),
+        para->getParD(level)->wallModel.u_star,
+        para->getParD(level)->wallModel.Fx,
+        para->getParD(level)->wallModel.Fy,
+        para->getParD(level)->wallModel.Fz,
+        para->getParD(level)->neighborX,
+        para->getParD(level)->neighborY,
+        para->getParD(level)->neighborZ,
+        para->getParD(level)->numberOfNodes,
+        para->getParD(level)->isEvenTimestep);
+    getLastCudaError("BBStressDevice27 execution failed");
 }
 
 //////////////////////////////////////////////////////////////////////////
 void BBStressPressureDev27(Parameter *para,  QforBoundaryConditions* boundaryCondition, const int level)
 {
-   dim3 grid = vf::cuda::getCudaGrid( para->getParD(level)->numberofthreads, boundaryCondition->numberOfBCnodes);
-   dim3 threads(para->getParD(level)->numberofthreads, 1, 1 );
-
-   BBStressPressureDevice27<<< grid, threads >>> (
-      para->getParD(level)->distributions.f[0],
-      boundaryCondition->k,
-      boundaryCondition->kN,
-      boundaryCondition->q27[0],
-      boundaryCondition->numberOfBCnodes,
-      para->getParD(level)->velocityX,
-      para->getParD(level)->velocityY,
-      para->getParD(level)->velocityY,
-      boundaryCondition->normalX,
-      boundaryCondition->normalY,
-      boundaryCondition->normalZ,
-      boundaryCondition->Vx,
-      boundaryCondition->Vy,
-      boundaryCondition->Vz,
-      boundaryCondition->Vx1,
-      boundaryCondition->Vy1,
-      boundaryCondition->Vz1,
-      para->getParD(level)->wallModel.samplingOffset,
-      para->getParD(level)->wallModel.z0,
-      para->getHasWallModelMonitor(),
-      para->getParD(level)->wallModel.u_star,
-      para->getParD(level)->wallModel.Fx,
-      para->getParD(level)->wallModel.Fy,
-      para->getParD(level)->wallModel.Fz,
-      para->getParD(level)->neighborX,
-      para->getParD(level)->neighborY,
-      para->getParD(level)->neighborZ,
-      para->getParD(level)->numberOfNodes,
-      para->getParD(level)->isEvenTimestep);
-      getLastCudaError("BBStressDevice27 execution failed");
+    dim3 grid = vf::cuda::getCudaGrid( para->getParD(level)->numberofthreads, boundaryCondition->numberOfBCnodes); // launch grid derived from thread count and number of BC nodes
+    dim3 threads(para->getParD(level)->numberofthreads, 1, 1 );
+
+    BBStressPressureDevice27<<< grid, threads >>> (
+        para->getParD(level)->distributions.f[0],
+        boundaryCondition->k,
+        boundaryCondition->kN,
+        boundaryCondition->q27[0],
+        boundaryCondition->numberOfBCnodes,
+        para->getParD(level)->velocityX,
+        para->getParD(level)->velocityY,
+        para->getParD(level)->velocityZ, // third velocity slot; NOTE(review): confirm the kernel expects vx, vy, vz in this order
+        boundaryCondition->normalX,
+        boundaryCondition->normalY,
+        boundaryCondition->normalZ,
+        boundaryCondition->Vx,
+        boundaryCondition->Vy,
+        boundaryCondition->Vz,
+        boundaryCondition->Vx1,
+        boundaryCondition->Vy1,
+        boundaryCondition->Vz1,
+        para->getParD(level)->wallModel.samplingOffset,
+        para->getParD(level)->wallModel.z0,
+        para->getHasWallModelMonitor(),
+        para->getParD(level)->wallModel.u_star,
+        para->getParD(level)->wallModel.Fx,
+        para->getParD(level)->wallModel.Fy,
+        para->getParD(level)->wallModel.Fz,
+        para->getParD(level)->neighborX,
+        para->getParD(level)->neighborY,
+        para->getParD(level)->neighborZ,
+        para->getParD(level)->numberOfNodes,
+        para->getParD(level)->isEvenTimestep);
+    getLastCudaError("BBStressPressureDevice27 execution failed");
 }
 
 //////////////////////////////////////////////////////////////////////////
 void QPressDev27(LBMSimulationParameter* parameterDevice, QforBoundaryConditions* boundaryCondition)
 {
-   dim3 grid = vf::cuda::getCudaGrid( parameterDevice->numberofthreads,  boundaryCondition->numberOfBCnodes);
-   dim3 threads(parameterDevice->numberofthreads, 1, 1 );
+    dim3 grid = vf::cuda::getCudaGrid( parameterDevice->numberofthreads,  boundaryCondition->numberOfBCnodes);
+    dim3 threads(parameterDevice->numberofthreads, 1, 1 );
 
-   QPressDevice27<<< grid, threads >>> (
-      boundaryCondition->RhoBC,
-      parameterDevice->distributions.f[0],
-      boundaryCondition->k,
-      boundaryCondition->q27[0],
-      boundaryCondition->numberOfBCnodes,
-      parameterDevice->omega,
-      parameterDevice->neighborX,
-      parameterDevice->neighborY,
-      parameterDevice->neighborZ,
-      parameterDevice->numberOfNodes,
-      parameterDevice->isEvenTimestep);
-   getLastCudaError("QPressDevice27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void QPressDevAntiBB27(  unsigned int numberOfThreads,
-                                    real* rhoBC,
-									real* vx,
-									real* vy,
-									real* vz,
-									real* DD,
-									int* k_Q,
-									real* QQ,
-									int numberOfBCnodes,
-									real om1,
-									unsigned int* neighborX,
-									unsigned int* neighborY,
-									unsigned int* neighborZ,
-									unsigned int size_Mat,
-									bool isEvenTimestep)
-{
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-    QPressDeviceAntiBB27<<< gridQ, threads >>>( rhoBC,
-												vx,
-												vy,
-												vz,
-												DD,
-												k_Q,
-												QQ,
-												numberOfBCnodes,
-												om1,
-												neighborX,
-												neighborY,
-												neighborZ,
-												size_Mat,
-												isEvenTimestep);
+    QPressDevice27<<< grid, threads >>> (
+        boundaryCondition->RhoBC,
+        parameterDevice->distributions.f[0],
+        boundaryCondition->k,
+        boundaryCondition->q27[0],
+        boundaryCondition->numberOfBCnodes,
+        parameterDevice->omega,
+        parameterDevice->neighborX,
+        parameterDevice->neighborY,
+        parameterDevice->neighborZ,
+        parameterDevice->numberOfNodes,
+        parameterDevice->isEvenTimestep);
+    getLastCudaError("QPressDevice27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+void QPressDevAntiBB27( // Host-side launcher: configures and starts the QPressDeviceAntiBB27 kernel.
+    unsigned int numberOfThreads, // used only to build the launch configuration; not passed to the kernel
+    real* rhoBC,
+    real* vx,
+    real* vy,
+    real* vz,
+    real* DD,
+    int* k_Q,
+    real* QQ,
+    int numberOfBCnodes, // sizes the launch and is forwarded to the kernel. NOTE(review): signed int; CudaGrid's parameter type is not visible here - confirm
+    real om1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes); // supplies grid.grid / grid.threads for the launch
+
+    QPressDeviceAntiBB27<<< grid.grid, grid.threads >>>( // every argument below is passed through unchanged
+        rhoBC,
+        vx,
+        vy,
+        vz,
+        DD,
+        k_Q,
+        QQ,
+        numberOfBCnodes,
+        om1,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        isEvenTimestep);
     getLastCudaError("QPressDeviceAntiBB27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
-void QPressDevFixBackflow27( unsigned int numberOfThreads,
-                                        real* rhoBC,
-                                        real* DD,
-                                        int* k_Q,
-                                        unsigned int numberOfBCnodes,
-                                        real om1,
-                                        unsigned int* neighborX,
-                                        unsigned int* neighborY,
-                                        unsigned int* neighborZ,
-                                        unsigned int size_Mat,
-                                        bool isEvenTimestep)
-{
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QPressDeviceFixBackflow27<<< gridQ, threads >>> (  rhoBC,
-                                                         DD,
-                                                         k_Q,
-                                                         numberOfBCnodes,
-                                                         om1,
-                                                         neighborX,
-                                                         neighborY,
-                                                         neighborZ,
-                                                         size_Mat,
-                                                         isEvenTimestep);
-      getLastCudaError("QPressDeviceFixBackflow27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void QPressDevDirDepBot27(  unsigned int numberOfThreads,
-                                       real* rhoBC,
-                                       real* DD,
-                                       int* k_Q,
-                                       unsigned int numberOfBCnodes,
-                                       real om1,
-                                       unsigned int* neighborX,
-                                       unsigned int* neighborY,
-                                       unsigned int* neighborZ,
-                                       unsigned int size_Mat,
-                                       bool isEvenTimestep)
-{
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QPressDeviceDirDepBot27<<< gridQ, threads >>> ( rhoBC,
-                                                      DD,
-                                                      k_Q,
-                                                      numberOfBCnodes,
-                                                      om1,
-                                                      neighborX,
-                                                      neighborY,
-                                                      neighborZ,
-                                                      size_Mat,
-                                                      isEvenTimestep);
-      getLastCudaError("QPressDeviceDirDepBot27 execution failed");
+void QPressDevFixBackflow27( // Host-side launcher: configures and starts the QPressDeviceFixBackflow27 kernel.
+    unsigned int numberOfThreads, // used only to build the launch configuration; not passed to the kernel
+    real* rhoBC,
+    real* DD,
+    int* k_Q,
+    unsigned int numberOfBCnodes, // sizes the launch and is also forwarded to the kernel
+    real om1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes); // supplies grid.grid / grid.threads for the launch
+
+    QPressDeviceFixBackflow27<<< grid.grid, grid.threads >>> ( // every argument below is passed through unchanged
+        rhoBC,
+        DD,
+        k_Q,
+        numberOfBCnodes,
+        om1,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("QPressDeviceFixBackflow27 execution failed"); // launch status check, tagged with the kernel name
+}
+//////////////////////////////////////////////////////////////////////////
+void QPressDevDirDepBot27( // Host-side launcher: configures and starts the QPressDeviceDirDepBot27 kernel.
+    unsigned int numberOfThreads, // used only to build the launch configuration; not passed to the kernel
+    real* rhoBC,
+    real* DD,
+    int* k_Q,
+    unsigned int numberOfBCnodes, // sizes the launch and is also forwarded to the kernel
+    real om1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes); // supplies grid.grid / grid.threads for the launch
+
+    QPressDeviceDirDepBot27<<< grid.grid, grid.threads >>> ( // every argument below is passed through unchanged
+        rhoBC,
+        DD,
+        k_Q,
+        numberOfBCnodes,
+        om1,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("QPressDeviceDirDepBot27 execution failed"); // launch status check, tagged with the kernel name
 }
 //////////////////////////////////////////////////////////////////////////
 void QPressNoRhoDev27(LBMSimulationParameter* parameterDevice, QforBoundaryConditions* boundaryCondition)
 {
-   dim3 grid = vf::cuda::getCudaGrid( parameterDevice->numberofthreads,  boundaryCondition->numberOfBCnodes);
-   dim3 threads(parameterDevice->numberofthreads, 1, 1 );
+    dim3 grid = vf::cuda::getCudaGrid( parameterDevice->numberofthreads,  boundaryCondition->numberOfBCnodes); // grid size derived from the thread count and the number of boundary nodes
+    dim3 threads(parameterDevice->numberofthreads, 1, 1 ); // block of numberofthreads x 1 x 1 threads
+
+    QPressNoRhoDevice27<<< grid, threads >>> ( // launches the kernel with fields read from the two parameter structs
+        boundaryCondition->RhoBC,
+        parameterDevice->distributions.f[0],
+        boundaryCondition->k,
+        boundaryCondition->kN,
+        boundaryCondition->numberOfBCnodes,
+        parameterDevice->omega,
+        parameterDevice->neighborX,
+        parameterDevice->neighborY,
+        parameterDevice->neighborZ,
+        parameterDevice->numberOfNodes,
+        parameterDevice->isEvenTimestep,
+        vf::lbm::dir::DIR_P00); // fixed direction constant. NOTE(review): presumably the boundary direction the kernel works along - confirm against the kernel
+    getLastCudaError("QPressNoRhoDevice27 execution failed"); // launch status check, tagged with the kernel name
+}
+//////////////////////////////////////////////////////////////////////////
+void QPressZeroRhoOutflowDev27(LBMSimulationParameter* parameterDevice, QforBoundaryConditions* boundaryCondition) // Host-side launcher for the QPressZeroRhoOutflowDevice27 kernel.
+{
+    dim3 grid = vf::cuda::getCudaGrid( parameterDevice->numberofthreads,  boundaryCondition->numberOfBCnodes); // grid size derived from the thread count and the number of boundary nodes
+    dim3 threads(parameterDevice->numberofthreads, 1, 1 ); // block of numberofthreads x 1 x 1 threads
 
-   QPressNoRhoDevice27<<< grid, threads >>> (
-         boundaryCondition->RhoBC,
-         parameterDevice->distributions.f[0],
-         boundaryCondition->k,
-         boundaryCondition->kN,
-         boundaryCondition->numberOfBCnodes,
-         parameterDevice->omega,
-         parameterDevice->neighborX,
-         parameterDevice->neighborY,
-         parameterDevice->neighborZ,
-         parameterDevice->numberOfNodes,
-         parameterDevice->isEvenTimestep);
-   getLastCudaError("QPressNoRhoDevice27 execution failed");
+    QPressZeroRhoOutflowDevice27<<< grid, threads >>> ( // launches the kernel with fields read from the two parameter structs
+        boundaryCondition->RhoBC,
+        parameterDevice->distributions.f[0],
+        boundaryCondition->k,
+        boundaryCondition->kN,
+        boundaryCondition->numberOfBCnodes,
+        parameterDevice->omega,
+        parameterDevice->neighborX,
+        parameterDevice->neighborY,
+        parameterDevice->neighborZ,
+        parameterDevice->numberOfNodes,
+        parameterDevice->isEvenTimestep,
+        vf::lbm::dir::DIR_P00, // fixed direction constant. NOTE(review): presumably the outflow direction - confirm against the kernel
+        parameterDevice->outflowPressureCorrectionFactor); // extra kernel argument taken from the device parameters
+    getLastCudaError("QPressZeroRhoOutflowDevice27 execution failed"); // launch status check, tagged with the kernel name
 }
 //////////////////////////////////////////////////////////////////////////
 void QInflowScaleByPressDev27(LBMSimulationParameter* parameterDevice, QforBoundaryConditions* boundaryCondition)
 {
-   dim3 grid = vf::cuda::getCudaGrid( parameterDevice->numberofthreads,  boundaryCondition->numberOfBCnodes);
-   dim3 threads(parameterDevice->numberofthreads, 1, 1 );
+    dim3 grid = vf::cuda::getCudaGrid( parameterDevice->numberofthreads,  boundaryCondition->numberOfBCnodes);
+    dim3 threads(parameterDevice->numberofthreads, 1, 1 );
 
-   QInflowScaleByPressDevice27<<< grid, threads >>> (
-           boundaryCondition->RhoBC,
-           parameterDevice->distributions.f[0],
-           boundaryCondition->k,
-           boundaryCondition->kN,
-           boundaryCondition->numberOfBCnodes,
-           parameterDevice->omega,
-           parameterDevice->neighborX,
-           parameterDevice->neighborY,
-           parameterDevice->neighborZ,
-           parameterDevice->numberOfNodes,
-           parameterDevice->isEvenTimestep);
-   getLastCudaError("QInflowScaleByPressDevice27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void QPressDevOld27(  unsigned int numberOfThreads,
-                                     real* rhoBC,
-                                     real* DD,
-                                     int* k_Q,
-                                     int* k_N,
-                                     unsigned int numberOfBCnodes,
-                                     real om1,
-                                     unsigned int* neighborX,
-                                     unsigned int* neighborY,
-                                     unsigned int* neighborZ,
-                                     unsigned int size_Mat,
-                                     bool isEvenTimestep)
-{
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QPressDeviceOld27<<< gridQ, threads >>> ( rhoBC,
-                                                DD,
-                                                k_Q,
-                                                k_N,
-                                                numberOfBCnodes,
-                                                om1,
-                                                neighborX,
-                                                neighborY,
-                                                neighborZ,
-                                                size_Mat,
-                                                isEvenTimestep);
-      getLastCudaError("QPressDeviceOld27 execution failed");
+    QInflowScaleByPressDevice27<<< grid, threads >>> (
+        boundaryCondition->RhoBC,
+        parameterDevice->distributions.f[0],
+        boundaryCondition->k,
+        boundaryCondition->kN,
+        boundaryCondition->numberOfBCnodes,
+        parameterDevice->omega,
+        parameterDevice->neighborX,
+        parameterDevice->neighborY,
+        parameterDevice->neighborZ,
+        parameterDevice->numberOfNodes,
+        parameterDevice->isEvenTimestep);
+    getLastCudaError("QInflowScaleByPressDevice27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+void QPressDevOld27( // Host-side launcher: configures and starts the QPressDeviceOld27 kernel.
+    unsigned int numberOfThreads, // used only to build the launch configuration; not passed to the kernel
+    real* rhoBC,
+    real* DD,
+    int* k_Q,
+    int* k_N,
+    unsigned int numberOfBCnodes, // sizes the launch and is also forwarded to the kernel
+    real om1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes); // supplies grid.grid / grid.threads for the launch
+
+    QPressDeviceOld27<<< grid.grid, grid.threads >>> ( // every argument below is passed through unchanged
+        rhoBC,
+        DD,
+        k_Q,
+        k_N,
+        numberOfBCnodes,
+        om1,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("QPressDeviceOld27 execution failed"); // launch status check, tagged with the kernel name
 }
 //////////////////////////////////////////////////////////////////////////
 void QPressDevIncompNEQ27(LBMSimulationParameter* parameterDevice, QforBoundaryConditions* boundaryCondition)
 {
-   dim3 grid = vf::cuda::getCudaGrid( parameterDevice->numberofthreads,  boundaryCondition->numberOfBCnodes);
-   dim3 threads(parameterDevice->numberofthreads, 1, 1 );
+    dim3 grid = vf::cuda::getCudaGrid( parameterDevice->numberofthreads,  boundaryCondition->numberOfBCnodes); // grid size derived from the thread count and the number of boundary nodes
+    dim3 threads(parameterDevice->numberofthreads, 1, 1 ); // block of numberofthreads x 1 x 1 threads
 
-   QPressDeviceIncompNEQ27<<< grid, threads >>> (
-         boundaryCondition->RhoBC,
-         parameterDevice->distributions.f[0],
-         boundaryCondition->k,
-         boundaryCondition->kN,
-         boundaryCondition->numberOfBCnodes,
-         parameterDevice->omega,
-         parameterDevice->neighborX,
-         parameterDevice->neighborY,
-         parameterDevice->neighborZ,
-         parameterDevice->numberOfNodes,
-         parameterDevice->isEvenTimestep);
-   getLastCudaError("QPressDeviceIncompNEQ27 execution failed");
+    QPressDeviceIncompNEQ27<<< grid, threads >>> ( // launches the kernel with fields read from the two parameter structs
+        boundaryCondition->RhoBC,
+        parameterDevice->distributions.f[0],
+        boundaryCondition->k,
+        boundaryCondition->kN,
+        boundaryCondition->numberOfBCnodes,
+        parameterDevice->omega,
+        parameterDevice->neighborX,
+        parameterDevice->neighborY,
+        parameterDevice->neighborZ,
+        parameterDevice->numberOfNodes,
+        parameterDevice->isEvenTimestep);
+    getLastCudaError("QPressDeviceIncompNEQ27 execution failed"); // launch status check, tagged with the kernel name
 }
 //////////////////////////////////////////////////////////////////////////
 void QPressDevNEQ27(LBMSimulationParameter* parameterDevice, QforBoundaryConditions* boundaryCondition)
 {
-   dim3 grid = vf::cuda::getCudaGrid( parameterDevice->numberofthreads,  boundaryCondition->numberOfBCnodes);
-   dim3 threads(parameterDevice->numberofthreads, 1, 1 );
+    dim3 grid = vf::cuda::getCudaGrid( parameterDevice->numberofthreads,  boundaryCondition->numberOfBCnodes);
+    dim3 threads(parameterDevice->numberofthreads, 1, 1 );
 
-   QPressDeviceNEQ27<<< grid, threads >>> (
+    QPressDeviceNEQ27<<< grid, threads >>> (
         boundaryCondition->RhoBC,
         parameterDevice->distributions.f[0],
         boundaryCondition->k,
@@ -3907,3241 +3105,2743 @@ void QPressDevNEQ27(LBMSimulationParameter* parameterDevice, QforBoundaryConditi
         parameterDevice->neighborZ,
         parameterDevice->numberOfNodes,
         parameterDevice->isEvenTimestep);
-   getLastCudaError("QPressDevNEQ27 execution failed");
+    getLastCudaError("QPressDevNEQ27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void QPressDevEQZ27(LBMSimulationParameter* parameterDevice, QforBoundaryConditions* boundaryCondition)
 {
-   dim3 grid = vf::cuda::getCudaGrid( parameterDevice->numberofthreads,  boundaryCondition->numberOfBCnodes);
-   dim3 threads(parameterDevice->numberofthreads, 1, 1 );
+    dim3 grid = vf::cuda::getCudaGrid( parameterDevice->numberofthreads,  boundaryCondition->numberOfBCnodes);
+    dim3 threads(parameterDevice->numberofthreads, 1, 1 );
 
-      QPressDeviceEQZ27<<< grid, threads >>> (
-            boundaryCondition->RhoBC,
-            parameterDevice->distributions.f[0],
-            boundaryCondition->k,
-            boundaryCondition->kN,
-            parameterDevice->kDistTestRE.f[0],
-            boundaryCondition->numberOfBCnodes,
-            parameterDevice->omega,
-            parameterDevice->neighborX,
-            parameterDevice->neighborY,
-            parameterDevice->neighborZ,
-            parameterDevice->numberOfNodes,
-            parameterDevice->isEvenTimestep);
-      getLastCudaError("QPressDeviceEQZ27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void QPressDevZero27(unsigned int numberOfThreads,
-                                real* DD,
-                                int* k_Q,
-                                unsigned int numberOfBCnodes,
-                                unsigned int* neighborX,
-                                unsigned int* neighborY,
-                                unsigned int* neighborZ,
-                                unsigned int size_Mat,
-                                bool isEvenTimestep)
-{
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QPressDeviceZero27<<< gridQ, threads >>> (DD,
-                                                k_Q,
-                                                numberOfBCnodes,
-                                                neighborX,
-                                                neighborY,
-                                                neighborZ,
-                                                size_Mat,
-                                                isEvenTimestep);
-      getLastCudaError("QPressDeviceOld27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void QPressDevFake27(     unsigned int numberOfThreads,
-                                     real* rhoBC,
-                                     real* DD,
-                                     int* k_Q,
-                                     int* k_N,
-                                     unsigned int numberOfBCnodes,
-                                     real om1,
-                                     unsigned int* neighborX,
-                                     unsigned int* neighborY,
-                                     unsigned int* neighborZ,
-                                     unsigned int size_Mat,
-                                     bool isEvenTimestep)
-{
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QPressDeviceFake27<<< gridQ, threads >>> (rhoBC,
-                                                DD,
-                                                k_Q,
-                                                k_N,
-                                                numberOfBCnodes,
-                                                om1,
-                                                neighborX,
-                                                neighborY,
-                                                neighborZ,
-                                                size_Mat,
-                                                isEvenTimestep);
-      getLastCudaError("QPressDeviceFake27 execution failed");
+    QPressDeviceEQZ27<<< grid, threads >>> (
+        boundaryCondition->RhoBC,
+        parameterDevice->distributions.f[0],
+        boundaryCondition->k,
+        boundaryCondition->kN,
+        parameterDevice->kDistTestRE.f[0],
+        boundaryCondition->numberOfBCnodes,
+        parameterDevice->omega,
+        parameterDevice->neighborX,
+        parameterDevice->neighborY,
+        parameterDevice->neighborZ,
+        parameterDevice->numberOfNodes,
+        parameterDevice->isEvenTimestep);
+    getLastCudaError("QPressDeviceEQZ27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+void QPressDevZero27( // Host-side launcher: configures and starts the QPressDeviceZero27 kernel.
+    unsigned int numberOfThreads, // used only to build the launch configuration; not passed to the kernel
+    real* DD,
+    int* k_Q,
+    unsigned int numberOfBCnodes, // sizes the launch and is also forwarded to the kernel
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes); // supplies grid.grid / grid.threads for the launch
+
+    QPressDeviceZero27<<< grid.grid, grid.threads >>> ( // every argument below is passed through unchanged
+        DD,
+        k_Q,
+        numberOfBCnodes,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("QPressDeviceZero27 execution failed"); // message must name the kernel launched above so failures are attributed correctly
+}
+//////////////////////////////////////////////////////////////////////////
+void QPressDevFake27( // Host-side launcher: configures and starts the QPressDeviceFake27 kernel.
+    unsigned int numberOfThreads, // used only to build the launch configuration; not passed to the kernel
+    real* rhoBC,
+    real* DD,
+    int* k_Q,
+    int* k_N,
+    unsigned int numberOfBCnodes, // sizes the launch and is also forwarded to the kernel
+    real om1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes); // supplies grid.grid / grid.threads for the launch
+
+
+    QPressDeviceFake27<<< grid.grid, grid.threads >>> ( // every argument below is passed through unchanged
+        rhoBC,
+        DD,
+        k_Q,
+        k_N,
+        numberOfBCnodes,
+        om1,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("QPressDeviceFake27 execution failed"); // launch status check, tagged with the kernel name
 }
 //////////////////////////////////////////////////////////////////////////
 void BBDev27(LBMSimulationParameter* parameterDevice, QforBoundaryConditions* boundaryCondition)
 {
-   dim3 grid = vf::cuda::getCudaGrid( parameterDevice->numberofthreads,  boundaryCondition->numberOfBCnodes);
-   dim3 threads(parameterDevice->numberofthreads, 1, 1 );
+    dim3 grid = vf::cuda::getCudaGrid( parameterDevice->numberofthreads,  boundaryCondition->numberOfBCnodes);
+    dim3 threads(parameterDevice->numberofthreads, 1, 1 );
 
-   BBDevice27<<< grid, threads >>> (
-         parameterDevice->distributions.f[0],
-         boundaryCondition->k,
-         boundaryCondition->q27[0],
-         boundaryCondition->numberOfBCnodes,
-         parameterDevice->neighborX,
-         parameterDevice->neighborY,
-         parameterDevice->neighborZ,
-         parameterDevice->numberOfNodes,
-         parameterDevice->isEvenTimestep);
-   getLastCudaError("BBDevice27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void QPressDev27_IntBB(  unsigned int numberOfThreads,
-									real* rho,
-									real* DD,
-									int* k_Q,
-									real* QQ,
-									unsigned int numberOfBCnodes,
-									real om1,
-									unsigned int* neighborX,
-									unsigned int* neighborY,
-									unsigned int* neighborZ,
-									unsigned int size_Mat,
-									bool isEvenTimestep)
-{
-	int Grid = (numberOfBCnodes / numberOfThreads)+1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid/Grid1)+1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 gridQ(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1 );
-
-		QPressDevice27_IntBB<<< gridQ, threads >>> (rho,
-													DD,
-													k_Q,
-													QQ,
-													numberOfBCnodes,
-													om1,
-													neighborX,
-													neighborY,
-													neighborZ,
-													size_Mat,
-													isEvenTimestep);
-		getLastCudaError("QPressDevice27_IntBB execution failed");
+    BBDevice27<<< grid, threads >>> (
+        parameterDevice->distributions.f[0],
+        boundaryCondition->k,
+        boundaryCondition->q27[0],
+        boundaryCondition->numberOfBCnodes,
+        parameterDevice->neighborX,
+        parameterDevice->neighborY,
+        parameterDevice->neighborZ,
+        parameterDevice->numberOfNodes,
+        parameterDevice->isEvenTimestep);
+    getLastCudaError("BBDevice27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+void QPressDev27_IntBB( // Host-side launcher: configures and starts the QPressDevice27_IntBB kernel.
+    unsigned int numberOfThreads, // used only to build the launch configuration; not passed to the kernel
+    real* rho,
+    real* DD,
+    int* k_Q,
+    real* QQ,
+    unsigned int numberOfBCnodes, // sizes the launch and is also forwarded to the kernel
+    real om1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes); // supplies grid.grid / grid.threads for the launch
+
+    QPressDevice27_IntBB<<< grid.grid, grid.threads >>> ( // every argument below is passed through unchanged
+        rho,
+        DD,
+        k_Q,
+        QQ,
+        numberOfBCnodes,
+        om1,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("QPressDevice27_IntBB execution failed"); // launch status check, tagged with the kernel name
 }
 // TODO: https://git.rz.tu-bs.de/irmb/VirtualFluids_dev/-/issues/29
 //////////////////////////////////////////////////////////////////////////
-void PressSchlaffer27(unsigned int numberOfThreads,
-                                 real* rhoBC,
-                                 real* DD,
-                                 real* vx0,
-                                 real* vy0,
-                                 real* vz0,
-                                 real* deltaVz0,
-                                 int* k_Q,
-                                 int* k_N,
-                                 int numberOfBCnodes,
-                                 real om1,
-                                 unsigned int* neighborX,
-                                 unsigned int* neighborY,
-                                 unsigned int* neighborZ,
-                                 unsigned int size_Mat,
-                                 bool isEvenTimestep)
-{
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      PressSchlaff27<<< gridQ, threads >>>(  rhoBC,
-                                             DD,
-                                             vx0,
-                                             vy0,
-                                             vz0,
-                                             deltaVz0,
-                                             k_Q,
-                                             k_N,
-                                             numberOfBCnodes,
-                                             om1,
-                                             neighborX,
-                                             neighborY,
-                                             neighborZ,
-                                             size_Mat,
-                                             isEvenTimestep);
-      getLastCudaError("PressSchlaff27 execution failed");
+void PressSchlaffer27( // Host-side launcher: configures and starts the PressSchlaff27 kernel (note the shorter kernel name).
+    unsigned int numberOfThreads, // used only to build the launch configuration; not passed to the kernel
+    real* rhoBC,
+    real* DD,
+    real* vx0,
+    real* vy0,
+    real* vz0,
+    real* deltaVz0,
+    int* k_Q,
+    int* k_N,
+    int numberOfBCnodes, // sizes the launch and is forwarded to the kernel. NOTE(review): signed int; CudaGrid's parameter type is not visible here - confirm
+    real om1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes); // supplies grid.grid / grid.threads for the launch
+
+    PressSchlaff27<<< grid.grid, grid.threads >>>( // every argument below is passed through unchanged
+        rhoBC,
+        DD,
+        vx0,
+        vy0,
+        vz0,
+        deltaVz0,
+        k_Q,
+        k_N,
+        numberOfBCnodes,
+        om1,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("PressSchlaff27 execution failed"); // launch status check, tagged with the kernel name
 }
 // TODO: https://git.rz.tu-bs.de/irmb/VirtualFluids_dev/-/issues/29
 //////////////////////////////////////////////////////////////////////////
-void VelSchlaffer27(  unsigned int numberOfThreads,
-                                 int t,
-                                 real* DD,
-                                 real* vz0,
-                                 real* deltaVz0,
-                                 int* k_Q,
-                                 int* k_N,
-                                 int numberOfBCnodes,
-                                 real om1,
-                                 unsigned int* neighborX,
-                                 unsigned int* neighborY,
-                                 unsigned int* neighborZ,
-                                 unsigned int size_Mat,
-                                 bool isEvenTimestep)
-{
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      VelSchlaff27<<< gridQ, threads >>>( t,
-                                          DD,
-                                          vz0,
-                                          deltaVz0,
-                                          k_Q,
-                                          k_N,
-                                          numberOfBCnodes,
-                                          om1,
-                                          neighborX,
-                                          neighborY,
-                                          neighborZ,
-                                          size_Mat,
-                                          isEvenTimestep);
-      getLastCudaError("VelSchlaff27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void PropVelo(   unsigned int numberOfThreads,
-                            unsigned int* neighborX,
-                            unsigned int* neighborY,
-                            unsigned int* neighborZ,
-                            real* rho,
-                            real* ux,
-                            real* uy,
-                            real* uz,
-                            int* k_Q,
-							unsigned int size_Prop,
-                            unsigned int size_Mat,
-                            unsigned int* bcMatD,
-                            real* DD,
-                            bool EvenOrOdd)
-{
-   int Grid = (size_Prop / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      PropellerBC<<< grid, threads >>>(neighborX,
-                                       neighborY,
-                                       neighborZ,
-                                       rho,
-                                       ux,
-                                       uy,
-                                       uz,
-									   k_Q,
-									   size_Prop,
-                                       size_Mat,
-									   bcMatD,
-                                       DD,
-                                       EvenOrOdd);
-      getLastCudaError("PropellerBC execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void ScaleCF27( real* DC,
-                        real* DF,
-                        unsigned int* neighborCX,
-                        unsigned int* neighborCY,
-                        unsigned int* neighborCZ,
-                        unsigned int* neighborFX,
-                        unsigned int* neighborFY,
-                        unsigned int* neighborFZ,
-                        unsigned int size_MatC,
-                        unsigned int size_MatF,
-                        bool isEvenTimestep,
-                        unsigned int* posCSWB,
-                        unsigned int* posFSWB,
-                        unsigned int kCF,
-                        real omCoarse,
-                        real omFine,
-                        real nu,
-                        unsigned int nxC,
-                        unsigned int nyC,
-                        unsigned int nxF,
-                        unsigned int nyF,
-                        unsigned int numberOfThreads)
-{
-   int Grid = (kCF / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_CF(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleCF27<<< gridINT_CF, threads >>> ( DC,
-                                             DF,
-                                             neighborCX,
-                                             neighborCY,
-                                             neighborCZ,
-                                             neighborFX,
-                                             neighborFY,
-                                             neighborFZ,
-                                             size_MatC,
-                                             size_MatF,
-                                             isEvenTimestep,
-                                             posCSWB,
-                                             posFSWB,
-                                             kCF,
-                                             omCoarse,
-                                             omFine,
-                                             nu,
-                                             nxC,
-                                             nyC,
-                                             nxF,
-                                             nyF);
-      getLastCudaError("scaleCF27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void ScaleCFEff27(real* DC,
-                             real* DF,
-                             unsigned int* neighborCX,
-                             unsigned int* neighborCY,
-                             unsigned int* neighborCZ,
-                             unsigned int* neighborFX,
-                             unsigned int* neighborFY,
-                             unsigned int* neighborFZ,
-                             unsigned int size_MatC,
-                             unsigned int size_MatF,
-                             bool isEvenTimestep,
-                             unsigned int* posCSWB,
-                             unsigned int* posFSWB,
-                             unsigned int kCF,
-                             real omCoarse,
-                             real omFine,
-                             real nu,
-                             unsigned int nxC,
-                             unsigned int nyC,
-                             unsigned int nxF,
-                             unsigned int nyF,
-                             unsigned int numberOfThreads,
-                             OffCF offCF)
-{
-   int Grid = (kCF / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_CF(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleCFEff27<<< gridINT_CF, threads >>> ( DC,
-                                                DF,
-                                                neighborCX,
-                                                neighborCY,
-                                                neighborCZ,
-                                                neighborFX,
-                                                neighborFY,
-                                                neighborFZ,
-                                                size_MatC,
-                                                size_MatF,
-                                                isEvenTimestep,
-                                                posCSWB,
-                                                posFSWB,
-                                                kCF,
-                                                omCoarse,
-                                                omFine,
-                                                nu,
-                                                nxC,
-                                                nyC,
-                                                nxF,
-                                                nyF,
-                                                offCF);
-      getLastCudaError("scaleCFEff27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void ScaleCFLast27(real* DC,
-                              real* DF,
-                              unsigned int* neighborCX,
-                              unsigned int* neighborCY,
-                              unsigned int* neighborCZ,
-                              unsigned int* neighborFX,
-                              unsigned int* neighborFY,
-                              unsigned int* neighborFZ,
-                              unsigned int size_MatC,
-                              unsigned int size_MatF,
-                              bool isEvenTimestep,
-                              unsigned int* posCSWB,
-                              unsigned int* posFSWB,
-                              unsigned int kCF,
-                              real omCoarse,
-                              real omFine,
-                              real nu,
-                              unsigned int nxC,
-                              unsigned int nyC,
-                              unsigned int nxF,
-                              unsigned int nyF,
-                              unsigned int numberOfThreads,
-                              OffCF offCF)
-{
-   int Grid = (kCF / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_CF(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleCFLast27<<< gridINT_CF, threads >>> (DC,
-                                                DF,
-                                                neighborCX,
-                                                neighborCY,
-                                                neighborCZ,
-                                                neighborFX,
-                                                neighborFY,
-                                                neighborFZ,
-                                                size_MatC,
-                                                size_MatF,
-                                                isEvenTimestep,
-                                                posCSWB,
-                                                posFSWB,
-                                                kCF,
-                                                omCoarse,
-                                                omFine,
-                                                nu,
-                                                nxC,
-                                                nyC,
-                                                nxF,
-                                                nyF,
-                                                offCF);
-      getLastCudaError("scaleCFLast27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void ScaleCFpress27(  real* DC,
-                                 real* DF,
-                                 unsigned int* neighborCX,
-                                 unsigned int* neighborCY,
-                                 unsigned int* neighborCZ,
-                                 unsigned int* neighborFX,
-                                 unsigned int* neighborFY,
-                                 unsigned int* neighborFZ,
-                                 unsigned int size_MatC,
-                                 unsigned int size_MatF,
-                                 bool isEvenTimestep,
-                                 unsigned int* posCSWB,
-                                 unsigned int* posFSWB,
-                                 unsigned int kCF,
-                                 real omCoarse,
-                                 real omFine,
-                                 real nu,
-                                 unsigned int nxC,
-                                 unsigned int nyC,
-                                 unsigned int nxF,
-                                 unsigned int nyF,
-                                 unsigned int numberOfThreads,
-                                 OffCF offCF)
-{
-   int Grid = (kCF / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_CF(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleCFpress27<<< gridINT_CF, threads >>>(DC,
-                                                DF,
-                                                neighborCX,
-                                                neighborCY,
-                                                neighborCZ,
-                                                neighborFX,
-                                                neighborFY,
-                                                neighborFZ,
-                                                size_MatC,
-                                                size_MatF,
-                                                isEvenTimestep,
-                                                posCSWB,
-                                                posFSWB,
-                                                kCF,
-                                                omCoarse,
-                                                omFine,
-                                                nu,
-                                                nxC,
-                                                nyC,
-                                                nxF,
-                                                nyF,
-                                                offCF);
-      getLastCudaError("scaleCFpress27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void ScaleCF_Fix_27(  real* DC,
-                                 real* DF,
-                                 unsigned int* neighborCX,
-                                 unsigned int* neighborCY,
-                                 unsigned int* neighborCZ,
-                                 unsigned int* neighborFX,
-                                 unsigned int* neighborFY,
-                                 unsigned int* neighborFZ,
-                                 unsigned int size_MatC,
-                                 unsigned int size_MatF,
-                                 bool isEvenTimestep,
-                                 unsigned int* posCSWB,
-                                 unsigned int* posFSWB,
-                                 unsigned int kCF,
-                                 real omCoarse,
-                                 real omFine,
-                                 real nu,
-                                 unsigned int nxC,
-                                 unsigned int nyC,
-                                 unsigned int nxF,
-                                 unsigned int nyF,
-                                 unsigned int numberOfThreads,
-                                 OffCF offCF)
-{
-   int Grid = (kCF / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_CF(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleCF_Fix_27<<< gridINT_CF, threads >>>(DC,
-                                                DF,
-                                                neighborCX,
-                                                neighborCY,
-                                                neighborCZ,
-                                                neighborFX,
-                                                neighborFY,
-                                                neighborFZ,
-                                                size_MatC,
-                                                size_MatF,
-                                                isEvenTimestep,
-                                                posCSWB,
-                                                posFSWB,
-                                                kCF,
-                                                omCoarse,
-                                                omFine,
-                                                nu,
-                                                nxC,
-                                                nyC,
-                                                nxF,
-                                                nyF,
-                                                offCF);
-      getLastCudaError("scaleCF_Fix_27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void ScaleCF_Fix_comp_27( real* DC,
-									 real* DF,
-									 unsigned int* neighborCX,
-									 unsigned int* neighborCY,
-									 unsigned int* neighborCZ,
-									 unsigned int* neighborFX,
-									 unsigned int* neighborFY,
-									 unsigned int* neighborFZ,
-									 unsigned int size_MatC,
-									 unsigned int size_MatF,
-									 bool isEvenTimestep,
-									 unsigned int* posCSWB,
-									 unsigned int* posFSWB,
-									 unsigned int kCF,
-									 real omCoarse,
-									 real omFine,
-									 real nu,
-									 unsigned int nxC,
-									 unsigned int nyC,
-									 unsigned int nxF,
-									 unsigned int nyF,
-									 unsigned int numberOfThreads,
-									 OffCF offCF)
-{
-   int Grid = (kCF / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_CF(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleCF_Fix_comp_27<<< gridINT_CF, threads >>>(   DC,
-														DF,
-														neighborCX,
-														neighborCY,
-														neighborCZ,
-														neighborFX,
-														neighborFY,
-														neighborFZ,
-														size_MatC,
-														size_MatF,
-														isEvenTimestep,
-														posCSWB,
-														posFSWB,
-														kCF,
-														omCoarse,
-														omFine,
-														nu,
-														nxC,
-														nyC,
-														nxF,
-														nyF,
-														offCF);
-      getLastCudaError("scaleCF_Fix_27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void ScaleCF_0817_comp_27(real* DC,
-									 real* DF,
-									 unsigned int* neighborCX,
-									 unsigned int* neighborCY,
-									 unsigned int* neighborCZ,
-									 unsigned int* neighborFX,
-									 unsigned int* neighborFY,
-									 unsigned int* neighborFZ,
-									 unsigned int size_MatC,
-									 unsigned int size_MatF,
-									 bool isEvenTimestep,
-									 unsigned int* posCSWB,
-									 unsigned int* posFSWB,
-									 unsigned int kCF,
-									 real omCoarse,
-									 real omFine,
-									 real nu,
-									 unsigned int nxC,
-									 unsigned int nyC,
-									 unsigned int nxF,
-									 unsigned int nyF,
-									 unsigned int numberOfThreads,
-									 OffCF offCF,
-                            CUstream_st *stream)
-{
-   int Grid = (kCF / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_CF(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleCF_0817_comp_27<<< gridINT_CF, threads, 0, stream >>>(  DC,
-														DF,
-														neighborCX,
-														neighborCY,
-														neighborCZ,
-														neighborFX,
-														neighborFY,
-														neighborFZ,
-														size_MatC,
-														size_MatF,
-														isEvenTimestep,
-														posCSWB,
-														posFSWB,
-														kCF,
-														omCoarse,
-														omFine,
-														nu,
-														nxC,
-														nyC,
-														nxF,
-														nyF,
-														offCF);
-      getLastCudaError("scaleCF_0817_27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void ScaleCF_comp_D3Q27F3_2018(real* DC,
-										  real* DF,
-										  real* G6,
-										  unsigned int* neighborCX,
-										  unsigned int* neighborCY,
-										  unsigned int* neighborCZ,
-										  unsigned int* neighborFX,
-										  unsigned int* neighborFY,
-										  unsigned int* neighborFZ,
-										  unsigned int size_MatC,
-										  unsigned int size_MatF,
-										  bool isEvenTimestep,
-										  unsigned int* posCSWB,
-										  unsigned int* posFSWB,
-										  unsigned int kCF,
-										  real omCoarse,
-										  real omFine,
-										  real nu,
-										  unsigned int nxC,
-										  unsigned int nyC,
-										  unsigned int nxF,
-										  unsigned int nyF,
-										  unsigned int numberOfThreads,
-										  OffCF offCF)
-{
-   int Grid = (kCF / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_CF(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleCF_comp_D3Q27F3_2018 <<< gridINT_CF, threads >>>(DC,
-															DF,
-															G6,
-															neighborCX,
-															neighborCY,
-															neighborCZ,
-															neighborFX,
-															neighborFY,
-															neighborFZ,
-															size_MatC,
-															size_MatF,
-															isEvenTimestep,
-															posCSWB,
-															posFSWB,
-															kCF,
-															omCoarse,
-															omFine,
-															nu,
-															nxC,
-															nyC,
-															nxF,
-															nyF,
-															offCF);
-      getLastCudaError("scaleCF_comp_D3Q27F3_2018 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void ScaleCF_comp_D3Q27F3(real* DC,
-									 real* DF,
-									 real* G6,
-									 unsigned int* neighborCX,
-									 unsigned int* neighborCY,
-									 unsigned int* neighborCZ,
-									 unsigned int* neighborFX,
-									 unsigned int* neighborFY,
-									 unsigned int* neighborFZ,
-									 unsigned int size_MatC,
-									 unsigned int size_MatF,
-									 bool isEvenTimestep,
-									 unsigned int* posCSWB,
-									 unsigned int* posFSWB,
-									 unsigned int kCF,
-									 real omCoarse,
-									 real omFine,
-									 real nu,
-									 unsigned int nxC,
-									 unsigned int nyC,
-									 unsigned int nxF,
-									 unsigned int nyF,
-									 unsigned int numberOfThreads,
-									 OffCF offCF,
-                            CUstream_st *stream)
-{
-   int Grid = (kCF / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_CF(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleCF_comp_D3Q27F3 <<< gridINT_CF, threads, 0, stream >>>( DC,
-														DF,
-														G6,
-														neighborCX,
-														neighborCY,
-														neighborCZ,
-														neighborFX,
-														neighborFY,
-														neighborFZ,
-														size_MatC,
-														size_MatF,
-														isEvenTimestep,
-														posCSWB,
-														posFSWB,
-														kCF,
-														omCoarse,
-														omFine,
-														nu,
-														nxC,
-														nyC,
-														nxF,
-														nyF,
-														offCF);
-      getLastCudaError("scaleCF_comp_D3Q27F3 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void ScaleCF_staggered_time_comp_27(  real* DC,
-												 real* DF,
-												 unsigned int* neighborCX,
-												 unsigned int* neighborCY,
-												 unsigned int* neighborCZ,
-												 unsigned int* neighborFX,
-												 unsigned int* neighborFY,
-												 unsigned int* neighborFZ,
-												 unsigned int size_MatC,
-												 unsigned int size_MatF,
-												 bool isEvenTimestep,
-												 unsigned int* posCSWB,
-												 unsigned int* posFSWB,
-												 unsigned int kCF,
-												 real omCoarse,
-												 real omFine,
-												 real nu,
-												 unsigned int nxC,
-												 unsigned int nyC,
-												 unsigned int nxF,
-												 unsigned int nyF,
-												 unsigned int numberOfThreads,
-												 OffCF offCF)
-{
-   int Grid = (kCF / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_CF(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleCF_staggered_time_comp_27<<< gridINT_CF, threads >>>(    DC,
-																	DF,
-																	neighborCX,
-																	neighborCY,
-																	neighborCZ,
-																	neighborFX,
-																	neighborFY,
-																	neighborFZ,
-																	size_MatC,
-																	size_MatF,
-																	isEvenTimestep,
-																	posCSWB,
-																	posFSWB,
-																	kCF,
-																	omCoarse,
-																	omFine,
-																	nu,
-																	nxC,
-																	nyC,
-																	nxF,
-																	nyF,
-																	offCF);
-      getLastCudaError("scaleCF_Fix_27 execution failed");
+void VelSchlaffer27( // host-side launcher for the VelSchlaff27 kernel
+    unsigned int numberOfThreads, // used only to build the launch configuration
+    int t,
+    real* DD,
+    real* vz0,
+    real* deltaVz0,
+    int* k_Q,
+    int* k_N,
+    int numberOfBCnodes, // used for the launch configuration and also forwarded to the kernel
+    real om1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes); // grid/block dimensions built from thread count and BC node count
+
+    VelSchlaff27<<< grid.grid, grid.threads >>>( // all remaining arguments are forwarded unchanged, in kernel parameter order
+        t,
+        DD,
+        vz0,
+        deltaVz0,
+        k_Q,
+        k_N,
+        numberOfBCnodes,
+        om1,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("VelSchlaff27 execution failed"); // error check tagged with the kernel name
+}
+//////////////////////////////////////////////////////////////////////////
+void QPrecursorDevCompZeroPress(LBMSimulationParameter* parameterDevice, // host-side launcher for the QPrecursorDeviceCompZeroPress kernel
+                                QforPrecursorBoundaryConditions* boundaryCondition, // supplies node indices, q27, plane neighbors, weights and velocity buffers
+                                real timeRatio, // forwarded to the kernel unchanged
+                                real velocityRatio) // forwarded to the kernel unchanged
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(parameterDevice->numberofthreads, boundaryCondition->numberOfBCnodes); // launch configuration built from thread count and BC node count
+
+    QPrecursorDeviceCompZeroPress<<< grid.grid, grid.threads >>>(
+        boundaryCondition->k,
+        boundaryCondition->numberOfBCnodes,
+        boundaryCondition->numberOfPrecursorNodes,
+        boundaryCondition->sizeQ,
+        parameterDevice->omega,
+        parameterDevice->distributions.f[0],
+        boundaryCondition->q27[0],
+        parameterDevice->neighborX, // three distinct neighbor arrays: X, Y, Z
+        parameterDevice->neighborY,
+        parameterDevice->neighborZ,
+        boundaryCondition->planeNeighbor0PP, // four plane-neighbor arrays, followed by the four matching weight arrays
+        boundaryCondition->planeNeighbor0PM,
+        boundaryCondition->planeNeighbor0MP,
+        boundaryCondition->planeNeighbor0MM,
+        boundaryCondition->weights0PP,
+        boundaryCondition->weights0PM,
+        boundaryCondition->weights0MP,
+        boundaryCondition->weights0MM,
+        boundaryCondition->last,
+        boundaryCondition->current,
+        boundaryCondition->velocityX,
+        boundaryCondition->velocityY,
+        boundaryCondition->velocityZ,
+        timeRatio,
+        velocityRatio,
+        parameterDevice->numberOfNodes,
+        parameterDevice->isEvenTimestep);
+    getLastCudaError("QPrecursorDeviceCompZeroPress execution failed"); // error check tagged with the kernel name
+}
+//////////////////////////////////////////////////////////////////////////
+void PrecursorDevEQ27( LBMSimulationParameter* parameterDevice, // host-side launcher for the PrecursorDeviceEQ27 kernel
+                        QforPrecursorBoundaryConditions* boundaryCondition, // supplies node indices, plane neighbors, weights and velocity buffers
+                        real timeRatio, // forwarded to the kernel unchanged
+                        real velocityRatio) // forwarded to the kernel unchanged
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(parameterDevice->numberofthreads, boundaryCondition->numberOfBCnodes); // launch configuration built from thread count and BC node count
+
+    PrecursorDeviceEQ27<<< grid.grid, grid.threads >>>(
+        boundaryCondition->k,
+        boundaryCondition->numberOfBCnodes,
+        boundaryCondition->numberOfPrecursorNodes,
+        parameterDevice->omega,
+        parameterDevice->distributions.f[0],
+        parameterDevice->neighborX, // one neighbor array per axis, matching the sibling precursor launchers
+        parameterDevice->neighborY, // fix: previously neighborX was passed here as well
+        parameterDevice->neighborZ, // fix: previously neighborX was passed here as well
+        boundaryCondition->planeNeighbor0PP, // four plane-neighbor arrays, followed by the four matching weight arrays
+        boundaryCondition->planeNeighbor0PM,
+        boundaryCondition->planeNeighbor0MP,
+        boundaryCondition->planeNeighbor0MM,
+        boundaryCondition->weights0PP,
+        boundaryCondition->weights0PM,
+        boundaryCondition->weights0MP,
+        boundaryCondition->weights0MM,
+        boundaryCondition->last,
+        boundaryCondition->current,
+        boundaryCondition->velocityX,
+        boundaryCondition->velocityY,
+        boundaryCondition->velocityZ,
+        timeRatio,
+        velocityRatio,
+        parameterDevice->numberOfNodes,
+        parameterDevice->isEvenTimestep);
+    getLastCudaError("PrecursorDeviceEQ27 execution failed"); // error check tagged with the kernel name
+
+}
+//////////////////////////////////////////////////////////////////////////
+void PrecursorDevDistributions( LBMSimulationParameter* parameterDevice, // host-side launcher for the PrecursorDeviceDistributions kernel
+                                QforPrecursorBoundaryConditions* boundaryCondition, // supplies node indices, plane neighbors, weights and last/current buffers
+                                real timeRatio, // forwarded to the kernel unchanged
+                                real velocityRatio) // NOTE: accepted but not forwarded to the kernel in this launcher
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(parameterDevice->numberofthreads, boundaryCondition->numberOfBCnodes); // launch configuration built from thread count and BC node count
+
+    PrecursorDeviceDistributions<<< grid.grid, grid.threads >>>(
+        boundaryCondition->k,
+        boundaryCondition->numberOfBCnodes,
+        boundaryCondition->numberOfPrecursorNodes,
+        parameterDevice->distributions.f[0],
+        parameterDevice->neighborX, // three distinct neighbor arrays: X, Y, Z
+        parameterDevice->neighborY,
+        parameterDevice->neighborZ,
+        boundaryCondition->planeNeighbor0PP, // four plane-neighbor arrays, followed by the four matching weight arrays
+        boundaryCondition->planeNeighbor0PM,
+        boundaryCondition->planeNeighbor0MP,
+        boundaryCondition->planeNeighbor0MM,
+        boundaryCondition->weights0PP,
+        boundaryCondition->weights0PM,
+        boundaryCondition->weights0MP,
+        boundaryCondition->weights0MM,
+        boundaryCondition->last,
+        boundaryCondition->current,
+        timeRatio,
+        parameterDevice->numberOfNodes,
+        parameterDevice->isEvenTimestep);
+    getLastCudaError("PrecursorDeviceDistributions execution failed"); // error check tagged with the kernel name
+
+}
+
+//////////////////////////////////////////////////////////////////////////
+void QPrecursorDevDistributions( LBMSimulationParameter* parameterDevice, // host-side launcher for the QPrecursorDeviceDistributions kernel
+                                QforPrecursorBoundaryConditions* boundaryCondition, // supplies node indices, q27, plane neighbors, weights and last/current buffers
+                                real timeRatio, // forwarded to the kernel unchanged
+                                real velocityRatio) // NOTE: accepted but not forwarded to the kernel in this launcher
+{
+
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(parameterDevice->numberofthreads, boundaryCondition->numberOfBCnodes); // launch configuration built from thread count and BC node count
+
+    QPrecursorDeviceDistributions<<< grid.grid, grid.threads >>>(
+        boundaryCondition->k,
+        boundaryCondition->q27[0],
+        boundaryCondition->sizeQ,
+        boundaryCondition->numberOfBCnodes,
+        boundaryCondition->numberOfPrecursorNodes,
+        parameterDevice->distributions.f[0],
+        parameterDevice->neighborX, // three distinct neighbor arrays: X, Y, Z
+        parameterDevice->neighborY,
+        parameterDevice->neighborZ,
+        boundaryCondition->planeNeighbor0PP, // four plane-neighbor arrays, followed by the four matching weight arrays
+        boundaryCondition->planeNeighbor0PM,
+        boundaryCondition->planeNeighbor0MP,
+        boundaryCondition->planeNeighbor0MM,
+        boundaryCondition->weights0PP,
+        boundaryCondition->weights0PM,
+        boundaryCondition->weights0MP,
+        boundaryCondition->weights0MM,
+        boundaryCondition->last,
+        boundaryCondition->current,
+        timeRatio,
+        parameterDevice->numberOfNodes,
+        parameterDevice->isEvenTimestep);
+    getLastCudaError("QPrecursorDeviceDistributions execution failed"); // fix: tag now names the kernel actually launched above
+
+}
+//////////////////////////////////////////////////////////////////////////
+extern "C" void PropVelo( // host-side launcher for the PropellerBC kernel; NOTE(review): only launcher in this section declared extern "C" - confirm it matches the header declaration
+    unsigned int numberOfThreads, // used only to build the launch configuration
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    real* rho,
+    real* ux,
+    real* uy,
+    real* uz,
+    int* k_Q,
+    unsigned int size_Prop, // used for the launch configuration and also forwarded to the kernel
+    unsigned long long numberOfLBnodes,
+    unsigned int* bcMatD,
+    real* DD,
+    bool EvenOrOdd)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Prop); // grid/block dimensions built from thread count and size_Prop
+
+    PropellerBC<<< grid.grid, grid.threads >>>( // all remaining arguments are forwarded unchanged, in kernel parameter order
+        neighborX,
+        neighborY,
+        neighborZ,
+        rho,
+        ux,
+        uy,
+        uz,
+        k_Q,
+        size_Prop,
+        numberOfLBnodes,
+        bcMatD,
+        DD,
+        EvenOrOdd);
+    getLastCudaError("PropellerBC execution failed"); // error check tagged with the kernel name
+}
+//////////////////////////////////////////////////////////////////////////
+void ScaleCF27( // host-side launcher for the scaleCF27 kernel
+    real* DC,
+    real* DF,
+    unsigned int* neighborCX,
+    unsigned int* neighborCY,
+    unsigned int* neighborCZ,
+    unsigned int* neighborFX,
+    unsigned int* neighborFY,
+    unsigned int* neighborFZ,
+    unsigned long long numberOfLBnodesC,
+    unsigned long long numberOfLBnodesF,
+    bool isEvenTimestep,
+    unsigned int* posCSWB,
+    unsigned int* posFSWB,
+    unsigned int kCF, // used for the launch configuration and also forwarded to the kernel
+    real omCoarse,
+    real omFine,
+    real nu,
+    unsigned int nxC,
+    unsigned int nyC,
+    unsigned int nxF,
+    unsigned int nyF,
+    unsigned int numberOfThreads) // used only to build the launch configuration; not passed to the kernel
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kCF); // grid/block dimensions built from thread count and kCF
+
+    scaleCF27<<< grid.grid, grid.threads >>> ( // arguments are forwarded unchanged, in the same order as the function parameters
+        DC,
+        DF,
+        neighborCX,
+        neighborCY,
+        neighborCZ,
+        neighborFX,
+        neighborFY,
+        neighborFZ,
+        numberOfLBnodesC,
+        numberOfLBnodesF,
+        isEvenTimestep,
+        posCSWB,
+        posFSWB,
+        kCF,
+        omCoarse,
+        omFine,
+        nu,
+        nxC,
+        nyC,
+        nxF,
+        nyF);
+    getLastCudaError("scaleCF27 execution failed"); // error check tagged with the kernel name
+}
+//////////////////////////////////////////////////////////////////////////
+void ScaleCFEff27(  // host wrapper that launches the scaleCFEff27 kernel
+    real* DC,
+    real* DF,
+    unsigned int* neighborCX,
+    unsigned int* neighborCY,
+    unsigned int* neighborCZ,
+    unsigned int* neighborFX,
+    unsigned int* neighborFY,
+    unsigned int* neighborFZ,
+    unsigned long long numberOfLBnodesC,
+    unsigned long long numberOfLBnodesF,
+    bool isEvenTimestep,
+    unsigned int* posCSWB,
+    unsigned int* posFSWB,
+    unsigned int kCF,  // forwarded to the kernel and also used to size the launch grid
+    real omCoarse,
+    real omFine,
+    real nu,
+    unsigned int nxC,
+    unsigned int nyC,
+    unsigned int nxF,
+    unsigned int nyF,
+    unsigned int numberOfThreads,  // launch configuration only; not forwarded to the kernel
+    OffCF offCF)  // taken by value and forwarded as the kernel's last argument
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kCF);  // supplies grid.grid / grid.threads for the launch
+
+    scaleCFEff27<<< grid.grid, grid.threads >>> (  // no stream argument: default stream, no dynamic shared memory
+        DC,
+        DF,
+        neighborCX,
+        neighborCY,
+        neighborCZ,
+        neighborFX,
+        neighborFY,
+        neighborFZ,
+        numberOfLBnodesC,
+        numberOfLBnodesF,
+        isEvenTimestep,
+        posCSWB,
+        posFSWB,
+        kCF,
+        omCoarse,
+        omFine,
+        nu,
+        nxC,
+        nyC,
+        nxF,
+        nyF,
+        offCF);
+    getLastCudaError("scaleCFEff27 execution failed");  // presumably reports any CUDA error pending after the launch (helper defined outside this view)
+}
+//////////////////////////////////////////////////////////////////////////
+void ScaleCFLast27(  // host wrapper that launches the scaleCFLast27 kernel
+    real* DC,
+    real* DF,
+    unsigned int* neighborCX,
+    unsigned int* neighborCY,
+    unsigned int* neighborCZ,
+    unsigned int* neighborFX,
+    unsigned int* neighborFY,
+    unsigned int* neighborFZ,
+    unsigned long long numberOfLBnodesC,
+    unsigned long long numberOfLBnodesF,
+    bool isEvenTimestep,
+    unsigned int* posCSWB,
+    unsigned int* posFSWB,
+    unsigned int kCF,  // forwarded to the kernel and also used to size the launch grid
+    real omCoarse,
+    real omFine,
+    real nu,
+    unsigned int nxC,
+    unsigned int nyC,
+    unsigned int nxF,
+    unsigned int nyF,
+    unsigned int numberOfThreads,  // launch configuration only; not forwarded to the kernel
+    OffCF offCF)  // taken by value and forwarded as the kernel's last argument
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kCF);  // supplies grid.grid / grid.threads for the launch
+
+    scaleCFLast27<<< grid.grid, grid.threads >>> (  // no stream argument: default stream, no dynamic shared memory
+        DC,
+        DF,
+        neighborCX,
+        neighborCY,
+        neighborCZ,
+        neighborFX,
+        neighborFY,
+        neighborFZ,
+        numberOfLBnodesC,
+        numberOfLBnodesF,
+        isEvenTimestep,
+        posCSWB,
+        posFSWB,
+        kCF,
+        omCoarse,
+        omFine,
+        nu,
+        nxC,
+        nyC,
+        nxF,
+        nyF,
+        offCF);
+    getLastCudaError("scaleCFLast27 execution failed");  // presumably reports any CUDA error pending after the launch (helper defined outside this view)
+}
+//////////////////////////////////////////////////////////////////////////
+void ScaleCFpress27(  // host wrapper that launches the scaleCFpress27 kernel
+    real* DC,
+    real* DF,
+    unsigned int* neighborCX,
+    unsigned int* neighborCY,
+    unsigned int* neighborCZ,
+    unsigned int* neighborFX,
+    unsigned int* neighborFY,
+    unsigned int* neighborFZ,
+    unsigned long long numberOfLBnodesC,
+    unsigned long long numberOfLBnodesF,
+    bool isEvenTimestep,
+    unsigned int* posCSWB,
+    unsigned int* posFSWB,
+    unsigned int kCF,  // forwarded to the kernel and also used to size the launch grid
+    real omCoarse,
+    real omFine,
+    real nu,
+    unsigned int nxC,
+    unsigned int nyC,
+    unsigned int nxF,
+    unsigned int nyF,
+    unsigned int numberOfThreads,  // launch configuration only; not forwarded to the kernel
+    OffCF offCF)  // taken by value and forwarded as the kernel's last argument
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kCF);  // supplies grid.grid / grid.threads for the launch
+
+    scaleCFpress27<<< grid.grid, grid.threads >>>(  // no stream argument: default stream, no dynamic shared memory
+        DC,
+        DF,
+        neighborCX,
+        neighborCY,
+        neighborCZ,
+        neighborFX,
+        neighborFY,
+        neighborFZ,
+        numberOfLBnodesC,
+        numberOfLBnodesF,
+        isEvenTimestep,
+        posCSWB,
+        posFSWB,
+        kCF,
+        omCoarse,
+        omFine,
+        nu,
+        nxC,
+        nyC,
+        nxF,
+        nyF,
+        offCF);
+    getLastCudaError("scaleCFpress27 execution failed");  // presumably reports any CUDA error pending after the launch (helper defined outside this view)
+}
+//////////////////////////////////////////////////////////////////////////
+void ScaleCF_Fix_27(  // host wrapper that launches the scaleCF_Fix_27 kernel
+    real* DC,
+    real* DF,
+    unsigned int* neighborCX,
+    unsigned int* neighborCY,
+    unsigned int* neighborCZ,
+    unsigned int* neighborFX,
+    unsigned int* neighborFY,
+    unsigned int* neighborFZ,
+    unsigned long long numberOfLBnodesC,
+    unsigned long long numberOfLBnodesF,
+    bool isEvenTimestep,
+    unsigned int* posCSWB,
+    unsigned int* posFSWB,
+    unsigned int kCF,  // forwarded to the kernel and also used to size the launch grid
+    real omCoarse,
+    real omFine,
+    real nu,
+    unsigned int nxC,
+    unsigned int nyC,
+    unsigned int nxF,
+    unsigned int nyF,
+    unsigned int numberOfThreads,  // launch configuration only; not forwarded to the kernel
+    OffCF offCF)  // taken by value and forwarded as the kernel's last argument
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kCF);  // supplies grid.grid / grid.threads for the launch
+
+    scaleCF_Fix_27<<< grid.grid, grid.threads >>>(  // no stream argument: default stream, no dynamic shared memory
+        DC,
+        DF,
+        neighborCX,
+        neighborCY,
+        neighborCZ,
+        neighborFX,
+        neighborFY,
+        neighborFZ,
+        numberOfLBnodesC,
+        numberOfLBnodesF,
+        isEvenTimestep,
+        posCSWB,
+        posFSWB,
+        kCF,
+        omCoarse,
+        omFine,
+        nu,
+        nxC,
+        nyC,
+        nxF,
+        nyF,
+        offCF);
+    getLastCudaError("scaleCF_Fix_27 execution failed");  // presumably reports any CUDA error pending after the launch (helper defined outside this view)
+}
+//////////////////////////////////////////////////////////////////////////
+void ScaleCF_Fix_comp_27(  // host wrapper that launches the scaleCF_Fix_comp_27 kernel
+    real* DC,
+    real* DF,
+    unsigned int* neighborCX,
+    unsigned int* neighborCY,
+    unsigned int* neighborCZ,
+    unsigned int* neighborFX,
+    unsigned int* neighborFY,
+    unsigned int* neighborFZ,
+    unsigned long long numberOfLBnodesC,
+    unsigned long long numberOfLBnodesF,
+    bool isEvenTimestep,
+    unsigned int* posCSWB,
+    unsigned int* posFSWB,
+    unsigned int kCF,  // forwarded to the kernel and also used to size the launch grid
+    real omCoarse,
+    real omFine,
+    real nu,
+    unsigned int nxC,
+    unsigned int nyC,
+    unsigned int nxF,
+    unsigned int nyF,
+    unsigned int numberOfThreads,  // launch configuration only; not forwarded to the kernel
+    OffCF offCF)  // taken by value and forwarded as the kernel's last argument
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kCF);  // supplies grid.grid / grid.threads for the launch
+
+    scaleCF_Fix_comp_27<<< grid.grid, grid.threads >>>(  // no stream argument: default stream, no dynamic shared memory
+        DC,
+        DF,
+        neighborCX,
+        neighborCY,
+        neighborCZ,
+        neighborFX,
+        neighborFY,
+        neighborFZ,
+        numberOfLBnodesC,
+        numberOfLBnodesF,
+        isEvenTimestep,
+        posCSWB,
+        posFSWB,
+        kCF,
+        omCoarse,
+        omFine,
+        nu,
+        nxC,
+        nyC,
+        nxF,
+        nyF,
+        offCF);
+    getLastCudaError("scaleCF_Fix_comp_27 execution failed");  // presumably reports any CUDA error pending after the launch (helper defined outside this view)
+}
+//////////////////////////////////////////////////////////////////////////
+void ScaleCF_0817_comp_27(  // host wrapper that launches the scaleCF_0817_comp_27 kernel on a caller-supplied stream
+    real* DC,
+    real* DF,
+    unsigned int* neighborCX,
+    unsigned int* neighborCY,
+    unsigned int* neighborCZ,
+    unsigned int* neighborFX,
+    unsigned int* neighborFY,
+    unsigned int* neighborFZ,
+    unsigned long long numberOfLBnodesC,
+    unsigned long long numberOfLBnodesF,
+    bool isEvenTimestep,
+    unsigned int* posCSWB,
+    unsigned int* posFSWB,
+    unsigned int kCF,  // forwarded to the kernel and also used to size the launch grid
+    real omCoarse,
+    real omFine,
+    real nu,
+    unsigned int nxC,
+    unsigned int nyC,
+    unsigned int nxF,
+    unsigned int nyF,
+    unsigned int numberOfThreads,  // launch configuration only; not forwarded to the kernel
+    OffCF offCF,  // taken by value and forwarded as the kernel's last argument
+    CUstream_st *stream)  // stream the kernel is enqueued on; not forwarded to the kernel
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kCF);  // supplies grid.grid / grid.threads for the launch
+
+    scaleCF_0817_comp_27<<< grid.grid, grid.threads, 0, stream >>>(  // 0 bytes of dynamic shared memory, enqueued on `stream`
+        DC,
+        DF,
+        neighborCX,
+        neighborCY,
+        neighborCZ,
+        neighborFX,
+        neighborFY,
+        neighborFZ,
+        numberOfLBnodesC,
+        numberOfLBnodesF,
+        isEvenTimestep,
+        posCSWB,
+        posFSWB,
+        kCF,
+        omCoarse,
+        omFine,
+        nu,
+        nxC,
+        nyC,
+        nxF,
+        nyF,
+        offCF);
+    getLastCudaError("scaleCF_0817_comp_27 execution failed");  // message now names the kernel actually launched, matching the sibling wrappers
+}
+//////////////////////////////////////////////////////////////////////////
+void ScaleCF_comp_D3Q27F3_2018(  // host wrapper that launches the scaleCF_comp_D3Q27F3_2018 kernel
+    real* DC,
+    real* DF,
+    real* G6,  // extra buffer forwarded as the kernel's third argument
+    unsigned int* neighborCX,
+    unsigned int* neighborCY,
+    unsigned int* neighborCZ,
+    unsigned int* neighborFX,
+    unsigned int* neighborFY,
+    unsigned int* neighborFZ,
+    unsigned long long numberOfLBnodesC,
+    unsigned long long numberOfLBnodesF,
+    bool isEvenTimestep,
+    unsigned int* posCSWB,
+    unsigned int* posFSWB,
+    unsigned int kCF,  // forwarded to the kernel and also used to size the launch grid
+    real omCoarse,
+    real omFine,
+    real nu,
+    unsigned int nxC,
+    unsigned int nyC,
+    unsigned int nxF,
+    unsigned int nyF,
+    unsigned int numberOfThreads,  // launch configuration only; not forwarded to the kernel
+    OffCF offCF)  // taken by value and forwarded as the kernel's last argument
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kCF);  // supplies grid.grid / grid.threads for the launch
+
+    scaleCF_comp_D3Q27F3_2018 <<< grid.grid, grid.threads >>>(  // no stream argument: default stream, no dynamic shared memory
+        DC,
+        DF,
+        G6,
+        neighborCX,
+        neighborCY,
+        neighborCZ,
+        neighborFX,
+        neighborFY,
+        neighborFZ,
+        numberOfLBnodesC,
+        numberOfLBnodesF,
+        isEvenTimestep,
+        posCSWB,
+        posFSWB,
+        kCF,
+        omCoarse,
+        omFine,
+        nu,
+        nxC,
+        nyC,
+        nxF,
+        nyF,
+        offCF);
+    getLastCudaError("scaleCF_comp_D3Q27F3_2018 execution failed");  // presumably reports any CUDA error pending after the launch (helper defined outside this view)
+}
+//////////////////////////////////////////////////////////////////////////
+void ScaleCF_comp_D3Q27F3(  // host wrapper that launches the scaleCF_comp_D3Q27F3 kernel on a caller-supplied stream
+    real* DC,
+    real* DF,
+    real* G6,  // extra buffer forwarded as the kernel's third argument
+    unsigned int* neighborCX,
+    unsigned int* neighborCY,
+    unsigned int* neighborCZ,
+    unsigned int* neighborFX,
+    unsigned int* neighborFY,
+    unsigned int* neighborFZ,
+    unsigned long long numberOfLBnodesC,
+    unsigned long long numberOfLBnodesF,
+    bool isEvenTimestep,
+    unsigned int* posCSWB,
+    unsigned int* posFSWB,
+    unsigned int kCF,  // forwarded to the kernel and also used to size the launch grid
+    real omCoarse,
+    real omFine,
+    real nu,
+    unsigned int nxC,
+    unsigned int nyC,
+    unsigned int nxF,
+    unsigned int nyF,
+    unsigned int numberOfThreads,  // launch configuration only; not forwarded to the kernel
+    OffCF offCF,  // taken by value and forwarded as the kernel's last argument
+    CUstream_st *stream)  // stream the kernel is enqueued on; not forwarded to the kernel
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kCF);  // supplies grid.grid / grid.threads for the launch
+
+    scaleCF_comp_D3Q27F3 <<< grid.grid, grid.threads, 0, stream >>>(  // 0 bytes of dynamic shared memory, enqueued on `stream`
+        DC,
+        DF,
+        G6,
+        neighborCX,
+        neighborCY,
+        neighborCZ,
+        neighborFX,
+        neighborFY,
+        neighborFZ,
+        numberOfLBnodesC,
+        numberOfLBnodesF,
+        isEvenTimestep,
+        posCSWB,
+        posFSWB,
+        kCF,
+        omCoarse,
+        omFine,
+        nu,
+        nxC,
+        nyC,
+        nxF,
+        nyF,
+        offCF);
+    getLastCudaError("scaleCF_comp_D3Q27F3 execution failed");  // presumably reports any CUDA error pending after the launch (helper defined outside this view)
+}
+//////////////////////////////////////////////////////////////////////////
+void ScaleCF_staggered_time_comp_27(  // host wrapper that launches the scaleCF_staggered_time_comp_27 kernel
+    real* DC,
+    real* DF,
+    unsigned int* neighborCX,
+    unsigned int* neighborCY,
+    unsigned int* neighborCZ,
+    unsigned int* neighborFX,
+    unsigned int* neighborFY,
+    unsigned int* neighborFZ,
+    unsigned long long numberOfLBnodesC,
+    unsigned long long numberOfLBnodesF,
+    bool isEvenTimestep,
+    unsigned int* posCSWB,
+    unsigned int* posFSWB,
+    unsigned int kCF,  // forwarded to the kernel and also used to size the launch grid
+    real omCoarse,
+    real omFine,
+    real nu,
+    unsigned int nxC,
+    unsigned int nyC,
+    unsigned int nxF,
+    unsigned int nyF,
+    unsigned int numberOfThreads,  // launch configuration only; not forwarded to the kernel
+    OffCF offCF)  // taken by value and forwarded as the kernel's last argument
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kCF);  // supplies grid.grid / grid.threads for the launch
+
+    scaleCF_staggered_time_comp_27<<< grid.grid, grid.threads >>>(  // no stream argument: default stream, no dynamic shared memory
+        DC,
+        DF,
+        neighborCX,
+        neighborCY,
+        neighborCZ,
+        neighborFX,
+        neighborFY,
+        neighborFZ,
+        numberOfLBnodesC,
+        numberOfLBnodesF,
+        isEvenTimestep,
+        posCSWB,
+        posFSWB,
+        kCF,
+        omCoarse,
+        omFine,
+        nu,
+        nxC,
+        nyC,
+        nxF,
+        nyF,
+        offCF);
+    getLastCudaError("scaleCF_staggered_time_comp_27 execution failed");  // presumably reports any CUDA error pending after the launch (helper defined outside this view)
 }
 //////////////////////////////////////////////////////////////////////////
 void ScaleCF_RhoSq_comp_27(LBMSimulationParameter * parameterDeviceC, LBMSimulationParameter* parameterDeviceF, ICellCF * icellCF, OffCF& offsetCF, CUstream_st *stream)
 {
-   dim3 grid = vf::cuda::getCudaGrid(parameterDeviceC->numberofthreads,  icellCF->kCF);
-   dim3 threads(parameterDeviceC->numberofthreads, 1, 1 );
-
-   scaleCF_RhoSq_comp_27<<<grid, threads, 0, stream>>>(
-      parameterDeviceC->distributions.f[0],
-      parameterDeviceF->distributions.f[0],
-      parameterDeviceC->neighborX,
-      parameterDeviceC->neighborY,
-      parameterDeviceC->neighborZ,
-      parameterDeviceF->neighborX,
-      parameterDeviceF->neighborY,
-      parameterDeviceF->neighborZ,
-      parameterDeviceC->numberOfNodes,
-      parameterDeviceF->numberOfNodes,
-      parameterDeviceC->isEvenTimestep,
-      icellCF->ICellCFC,
-      icellCF->ICellCFF,
-      icellCF->kCF,
-      parameterDeviceC->omega,
-      parameterDeviceF->omega,
-      parameterDeviceC->vis,
-      parameterDeviceC->nx,
-      parameterDeviceC->ny,
-      parameterDeviceF->nx,
-      parameterDeviceF->ny,
-      offsetCF);
-   getLastCudaError("scaleCF_RhoSq_27 execution failed");
+    dim3 grid = vf::cuda::getCudaGrid(parameterDeviceC->numberofthreads,  icellCF->kCF);  // launch grid derived from parameterDeviceC's thread count and icellCF->kCF
+    dim3 threads(parameterDeviceC->numberofthreads, 1, 1 );  // one-dimensional thread block
+
+    scaleCF_RhoSq_comp_27<<<grid, threads, 0, stream>>>(  // 0 bytes of dynamic shared memory, enqueued on `stream`
+        parameterDeviceC->distributions.f[0],
+        parameterDeviceF->distributions.f[0],
+        parameterDeviceC->neighborX,
+        parameterDeviceC->neighborY,
+        parameterDeviceC->neighborZ,
+        parameterDeviceF->neighborX,
+        parameterDeviceF->neighborY,
+        parameterDeviceF->neighborZ,
+        parameterDeviceC->numberOfNodes,
+        parameterDeviceF->numberOfNodes,
+        parameterDeviceC->isEvenTimestep,  // only parameterDeviceC's even/odd flag is passed to the kernel
+        icellCF->ICellCFC,
+        icellCF->ICellCFF,
+        icellCF->kCF,
+        parameterDeviceC->omega,
+        parameterDeviceF->omega,
+        parameterDeviceC->vis,
+        parameterDeviceC->nx,
+        parameterDeviceC->ny,
+        parameterDeviceF->nx,
+        parameterDeviceF->ny,
+        offsetCF);
+    getLastCudaError("scaleCF_RhoSq_comp_27 execution failed");  // message now names the kernel actually launched
 }
 
 void ScaleCF_compressible(LBMSimulationParameter * parameterDeviceC, LBMSimulationParameter* parameterDeviceF, ICellCF * icellCF, OffCF& offsetCF, CUstream_st *stream)
 {
-   dim3 grid = vf::cuda::getCudaGrid(parameterDeviceC->numberofthreads,  icellCF->kCF);
-   dim3 threads(parameterDeviceC->numberofthreads, 1, 1 );
-
-   scaleCF_compressible<<<grid, threads, 0, stream>>>(
-      parameterDeviceC->distributions.f[0],
-      parameterDeviceF->distributions.f[0],
-      parameterDeviceC->neighborX,
-      parameterDeviceC->neighborY,
-      parameterDeviceC->neighborZ,
-      parameterDeviceF->neighborX,
-      parameterDeviceF->neighborY,
-      parameterDeviceF->neighborZ,
-      parameterDeviceC->numberOfNodes,
-      parameterDeviceF->numberOfNodes,
-      parameterDeviceC->isEvenTimestep,
-      icellCF->ICellCFC,
-      icellCF->ICellCFF,
-      icellCF->kCF,
-      parameterDeviceC->omega,
-      parameterDeviceF->omega,
-      offsetCF);
-   getLastCudaError("scaleCF_compressible execution failed");
-}
-
-//////////////////////////////////////////////////////////////////////////
-void ScaleCF_RhoSq_3rdMom_comp_27(real* DC,
-											 real* DF,
-											 unsigned int* neighborCX,
-											 unsigned int* neighborCY,
-											 unsigned int* neighborCZ,
-											 unsigned int* neighborFX,
-											 unsigned int* neighborFY,
-											 unsigned int* neighborFZ,
-											 unsigned int size_MatC,
-											 unsigned int size_MatF,
-											 bool isEvenTimestep,
-											 unsigned int* posCSWB,
-											 unsigned int* posFSWB,
-											 unsigned int kCF,
-											 real omCoarse,
-											 real omFine,
-											 real nu,
-											 unsigned int nxC,
-											 unsigned int nyC,
-											 unsigned int nxF,
-											 unsigned int nyF,
-											 unsigned int numberOfThreads,
-											 OffCF offCF,
-                                  CUstream_st *stream)
-{
-   int Grid = (kCF / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_CF(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleCF_RhoSq_3rdMom_comp_27<<< gridINT_CF, threads, 0, stream >>>(  DC,
-																DF,
-																neighborCX,
-																neighborCY,
-																neighborCZ,
-																neighborFX,
-																neighborFY,
-																neighborFZ,
-																size_MatC,
-																size_MatF,
-																isEvenTimestep,
-																posCSWB,
-																posFSWB,
-																kCF,
-																omCoarse,
-																omFine,
-																nu,
-																nxC,
-																nyC,
-																nxF,
-																nyF,
-																offCF);
-      getLastCudaError("scaleCF_RhoSq_3rdMom_comp_27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void ScaleCF_AA2016_comp_27(real* DC,
-									   real* DF,
-									   unsigned int* neighborCX,
-									   unsigned int* neighborCY,
-									   unsigned int* neighborCZ,
-									   unsigned int* neighborFX,
-									   unsigned int* neighborFY,
-									   unsigned int* neighborFZ,
-									   unsigned int size_MatC,
-									   unsigned int size_MatF,
-									   bool isEvenTimestep,
-									   unsigned int* posCSWB,
-									   unsigned int* posFSWB,
-									   unsigned int kCF,
-									   real omCoarse,
-									   real omFine,
-									   real nu,
-									   unsigned int nxC,
-									   unsigned int nyC,
-									   unsigned int nxF,
-									   unsigned int nyF,
-									   unsigned int numberOfThreads,
-									   OffCF offCF,
-                              CUstream_st *stream)
-{
-   int Grid = (kCF / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_CF(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleCF_AA2016_comp_27<<< gridINT_CF, threads, 0, stream >>>(DC,
-														DF,
-														neighborCX,
-														neighborCY,
-														neighborCZ,
-														neighborFX,
-														neighborFY,
-														neighborFZ,
-														size_MatC,
-														size_MatF,
-														isEvenTimestep,
-														posCSWB,
-														posFSWB,
-														kCF,
-														omCoarse,
-														omFine,
-														nu,
-														nxC,
-														nyC,
-														nxF,
-														nyF,
-														offCF);
-      getLastCudaError("scaleCF_AA2016_comp_27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void ScaleCF_NSPress_27(  real* DC,
-									 real* DF,
-									 unsigned int* neighborCX,
-									 unsigned int* neighborCY,
-									 unsigned int* neighborCZ,
-									 unsigned int* neighborFX,
-									 unsigned int* neighborFY,
-									 unsigned int* neighborFZ,
-									 unsigned int size_MatC,
-									 unsigned int size_MatF,
-									 bool isEvenTimestep,
-									 unsigned int* posCSWB,
-									 unsigned int* posFSWB,
-									 unsigned int kCF,
-									 real omCoarse,
-									 real omFine,
-									 real nu,
-									 unsigned int nxC,
-									 unsigned int nyC,
-									 unsigned int nxF,
-									 unsigned int nyF,
-									 unsigned int numberOfThreads,
-									 OffCF offCF)
-{
-   int Grid = (kCF / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_CF(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleCF_NSPress_27<<< gridINT_CF, threads >>>(DC,
-													DF,
-													neighborCX,
-													neighborCY,
-													neighborCZ,
-													neighborFX,
-													neighborFY,
-													neighborFZ,
-													size_MatC,
-													size_MatF,
-													isEvenTimestep,
-													posCSWB,
-													posFSWB,
-													kCF,
-													omCoarse,
-													omFine,
-													nu,
-													nxC,
-													nyC,
-													nxF,
-													nyF,
-													offCF);
-      getLastCudaError("scaleCF_Fix_27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void ScaleCFThSMG7(   real* DC,
-                                 real* DF,
-                                 real* DD7C,
-                                 real* DD7F,
-                                 unsigned int* neighborCX,
-                                 unsigned int* neighborCY,
-                                 unsigned int* neighborCZ,
-                                 unsigned int* neighborFX,
-                                 unsigned int* neighborFY,
-                                 unsigned int* neighborFZ,
-                                 unsigned int size_MatC,
-                                 unsigned int size_MatF,
-                                 bool isEvenTimestep,
-                                 unsigned int* posCSWB,
-                                 unsigned int* posFSWB,
-                                 unsigned int kCF,
-                                 real nu,
-                                 real diffusivity_fine,
-                                 unsigned int numberOfThreads,
-                                 OffCF offCF)
-{
-   int Grid = (kCF / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_CF(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleCFThSMG7<<< gridINT_CF, threads >>> (DC,
-                                                DF,
-                                                DD7C,
-                                                DD7F,
-                                                neighborCX,
-                                                neighborCY,
-                                                neighborCZ,
-                                                neighborFX,
-                                                neighborFY,
-                                                neighborFZ,
-                                                size_MatC,
-                                                size_MatF,
-                                                isEvenTimestep,
-                                                posCSWB,
-                                                posFSWB,
-                                                kCF,
-                                                nu,
-                                                diffusivity_fine,
-                                                offCF);
-      getLastCudaError("scaleCFThSMG7 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void ScaleCFThS7(  real* DC,
-                              real* DF,
-                              real* DD7C,
-                              real* DD7F,
-                              unsigned int* neighborCX,
-                              unsigned int* neighborCY,
-                              unsigned int* neighborCZ,
-                              unsigned int* neighborFX,
-                              unsigned int* neighborFY,
-                              unsigned int* neighborFZ,
-                              unsigned int size_MatC,
-                              unsigned int size_MatF,
-                              bool isEvenTimestep,
-                              unsigned int* posCSWB,
-                              unsigned int* posFSWB,
-                              unsigned int kCF,
-                              real nu,
-                              real diffusivity_fine,
-                              unsigned int numberOfThreads)
-{
-   int Grid = (kCF / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_CF(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleCFThS7<<< gridINT_CF, threads >>> (  DC,
-                                                DF,
-                                                DD7C,
-                                                DD7F,
-                                                neighborCX,
-                                                neighborCY,
-                                                neighborCZ,
-                                                neighborFX,
-                                                neighborFY,
-                                                neighborFZ,
-                                                size_MatC,
-                                                size_MatF,
-                                                isEvenTimestep,
-                                                posCSWB,
-                                                posFSWB,
-                                                kCF,
-                                                nu,
-                                                diffusivity_fine);
-      getLastCudaError("scaleCFThS7 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void ScaleCFThS27( real* DC,
-                              real* DF,
-                              real* DD27C,
-                              real* DD27F,
-                              unsigned int* neighborCX,
-                              unsigned int* neighborCY,
-                              unsigned int* neighborCZ,
-                              unsigned int* neighborFX,
-                              unsigned int* neighborFY,
-                              unsigned int* neighborFZ,
-                              unsigned int size_MatC,
-                              unsigned int size_MatF,
-                              bool isEvenTimestep,
-                              unsigned int* posCSWB,
-                              unsigned int* posFSWB,
-                              unsigned int kCF,
-                              real nu,
-                              real diffusivity_fine,
-                              unsigned int numberOfThreads,
-							  OffCF offCF)
-{
-   int Grid = (kCF / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_CF(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleCFThS27<<< gridINT_CF, threads >>> ( DC,
-                                                DF,
-                                                DD27C,
-                                                DD27F,
-                                                neighborCX,
-                                                neighborCY,
-                                                neighborCZ,
-                                                neighborFX,
-                                                neighborFY,
-                                                neighborFZ,
-                                                size_MatC,
-                                                size_MatF,
-                                                isEvenTimestep,
-                                                posCSWB,
-                                                posFSWB,
-                                                kCF,
-                                                nu,
-                                                diffusivity_fine,
-										        offCF);
-      getLastCudaError("scaleCFThS27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void ScaleFC27( real* DC,
-                           real* DF,
-                           unsigned int* neighborCX,
-                           unsigned int* neighborCY,
-                           unsigned int* neighborCZ,
-                           unsigned int* neighborFX,
-                           unsigned int* neighborFY,
-                           unsigned int* neighborFZ,
-                           unsigned int size_MatC,
-                           unsigned int size_MatF,
-                           bool isEvenTimestep,
-                           unsigned int* posC,
-                           unsigned int* posFSWB,
-                           unsigned int kFC,
-                           real omCoarse,
-                           real omFine,
-                           real nu,
-                           unsigned int nxC,
-                           unsigned int nyC,
-                           unsigned int nxF,
-                           unsigned int nyF,
-                           unsigned int numberOfThreads)
-{
-   int Grid = (kFC / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_FC(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleFC27<<< gridINT_FC, threads >>> ( DC,
-                                             DF,
-                                             neighborCX,
-                                             neighborCY,
-                                             neighborCZ,
-                                             neighborFX,
-                                             neighborFY,
-                                             neighborFZ,
-                                             size_MatC,
-                                             size_MatF,
-                                             isEvenTimestep,
-                                             posC,
-                                             posFSWB,
-                                             kFC,
-                                             omCoarse,
-                                             omFine,
-                                             nu,
-                                             nxC,
-                                             nyC,
-                                             nxF,
-                                             nyF);
-      getLastCudaError("scaleFC27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void ScaleFCEff27(real* DC,
-                             real* DF,
-                             unsigned int* neighborCX,
-                             unsigned int* neighborCY,
-                             unsigned int* neighborCZ,
-                             unsigned int* neighborFX,
-                             unsigned int* neighborFY,
-                             unsigned int* neighborFZ,
-                             unsigned int size_MatC,
-                             unsigned int size_MatF,
-                             bool isEvenTimestep,
-                             unsigned int* posC,
-                             unsigned int* posFSWB,
-                             unsigned int kFC,
-                             real omCoarse,
-                             real omFine,
-                             real nu,
-                             unsigned int nxC,
-                             unsigned int nyC,
-                             unsigned int nxF,
-                             unsigned int nyF,
-                             unsigned int numberOfThreads,
-                             OffFC offFC)
-{
-   int Grid = (kFC / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_FC(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleFCEff27<<< gridINT_FC, threads >>> ( DC,
-                                                DF,
-                                                neighborCX,
-                                                neighborCY,
-                                                neighborCZ,
-                                                neighborFX,
-                                                neighborFY,
-                                                neighborFZ,
-                                                size_MatC,
-                                                size_MatF,
-                                                isEvenTimestep,
-                                                posC,
-                                                posFSWB,
-                                                kFC,
-                                                omCoarse,
-                                                omFine,
-                                                nu,
-                                                nxC,
-                                                nyC,
-                                                nxF,
-                                                nyF,
-                                                offFC);
-      getLastCudaError("scaleFCEff27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void ScaleFCLast27(real* DC,
-                              real* DF,
-                              unsigned int* neighborCX,
-                              unsigned int* neighborCY,
-                              unsigned int* neighborCZ,
-                              unsigned int* neighborFX,
-                              unsigned int* neighborFY,
-                              unsigned int* neighborFZ,
-                              unsigned int size_MatC,
-                              unsigned int size_MatF,
-                              bool isEvenTimestep,
-                              unsigned int* posC,
-                              unsigned int* posFSWB,
-                              unsigned int kFC,
-                              real omCoarse,
-                              real omFine,
-                              real nu,
-                              unsigned int nxC,
-                              unsigned int nyC,
-                              unsigned int nxF,
-                              unsigned int nyF,
-                              unsigned int numberOfThreads,
-                              OffFC offFC)
-{
-   int Grid = (kFC / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_FC(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleFCLast27<<< gridINT_FC, threads >>> (DC,
-                                                DF,
-                                                neighborCX,
-                                                neighborCY,
-                                                neighborCZ,
-                                                neighborFX,
-                                                neighborFY,
-                                                neighborFZ,
-                                                size_MatC,
-                                                size_MatF,
-                                                isEvenTimestep,
-                                                posC,
-                                                posFSWB,
-                                                kFC,
-                                                omCoarse,
-                                                omFine,
-                                                nu,
-                                                nxC,
-                                                nyC,
-                                                nxF,
-                                                nyF,
-                                                offFC);
-      getLastCudaError("Kernel execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void ScaleFCpress27(real* DC,
-                              real* DF,
-                              unsigned int* neighborCX,
-                              unsigned int* neighborCY,
-                              unsigned int* neighborCZ,
-                              unsigned int* neighborFX,
-                              unsigned int* neighborFY,
-                              unsigned int* neighborFZ,
-                              unsigned int size_MatC,
-                              unsigned int size_MatF,
-                              bool isEvenTimestep,
-                              unsigned int* posC,
-                              unsigned int* posFSWB,
-                              unsigned int kFC,
-                              real omCoarse,
-                              real omFine,
-                              real nu,
-                              unsigned int nxC,
-                              unsigned int nyC,
-                              unsigned int nxF,
-                              unsigned int nyF,
-                              unsigned int numberOfThreads,
-                              OffFC offFC)
-{
-   int Grid = (kFC / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_FC(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleFCpress27<<< gridINT_FC, threads >>> (  DC,
-                                                   DF,
-                                                   neighborCX,
-                                                   neighborCY,
-                                                   neighborCZ,
-                                                   neighborFX,
-                                                   neighborFY,
-                                                   neighborFZ,
-                                                   size_MatC,
-                                                   size_MatF,
-                                                   isEvenTimestep,
-                                                   posC,
-                                                   posFSWB,
-                                                   kFC,
-                                                   omCoarse,
-                                                   omFine,
-                                                   nu,
-                                                   nxC,
-                                                   nyC,
-                                                   nxF,
-                                                   nyF,
-                                                   offFC);
-      getLastCudaError("scaleFCpress27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void ScaleFC_Fix_27(real* DC,
-                              real* DF,
-                              unsigned int* neighborCX,
-                              unsigned int* neighborCY,
-                              unsigned int* neighborCZ,
-                              unsigned int* neighborFX,
-                              unsigned int* neighborFY,
-                              unsigned int* neighborFZ,
-                              unsigned int size_MatC,
-                              unsigned int size_MatF,
-                              bool isEvenTimestep,
-                              unsigned int* posC,
-                              unsigned int* posFSWB,
-                              unsigned int kFC,
-                              real omCoarse,
-                              real omFine,
-                              real nu,
-                              unsigned int nxC,
-                              unsigned int nyC,
-                              unsigned int nxF,
-                              unsigned int nyF,
-                              unsigned int numberOfThreads,
-                              OffFC offFC)
-{
-   int Grid = (kFC / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_FC(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleFC_Fix_27<<< gridINT_FC, threads >>> (  DC,
-                                                   DF,
-                                                   neighborCX,
-                                                   neighborCY,
-                                                   neighborCZ,
-                                                   neighborFX,
-                                                   neighborFY,
-                                                   neighborFZ,
-                                                   size_MatC,
-                                                   size_MatF,
-                                                   isEvenTimestep,
-                                                   posC,
-                                                   posFSWB,
-                                                   kFC,
-                                                   omCoarse,
-                                                   omFine,
-                                                   nu,
-                                                   nxC,
-                                                   nyC,
-                                                   nxF,
-                                                   nyF,
-                                                   offFC);
-      getLastCudaError("scaleFC_Fix_27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void ScaleFC_Fix_comp_27(  real* DC,
-									  real* DF,
-									  unsigned int* neighborCX,
-									  unsigned int* neighborCY,
-									  unsigned int* neighborCZ,
-									  unsigned int* neighborFX,
-									  unsigned int* neighborFY,
-									  unsigned int* neighborFZ,
-									  unsigned int size_MatC,
-									  unsigned int size_MatF,
-									  bool isEvenTimestep,
-									  unsigned int* posC,
-									  unsigned int* posFSWB,
-									  unsigned int kFC,
-									  real omCoarse,
-									  real omFine,
-									  real nu,
-									  unsigned int nxC,
-									  unsigned int nyC,
-									  unsigned int nxF,
-									  unsigned int nyF,
-									  unsigned int numberOfThreads,
-									  OffFC offFC)
-{
-   int Grid = (kFC / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_FC(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleFC_Fix_comp_27<<< gridINT_FC, threads >>> ( DC,
-													   DF,
-													   neighborCX,
-													   neighborCY,
-													   neighborCZ,
-													   neighborFX,
-													   neighborFY,
-													   neighborFZ,
-													   size_MatC,
-													   size_MatF,
-													   isEvenTimestep,
-													   posC,
-													   posFSWB,
-													   kFC,
-													   omCoarse,
-													   omFine,
-													   nu,
-													   nxC,
-													   nyC,
-													   nxF,
-													   nyF,
-													   offFC);
-      getLastCudaError("scaleFC_Fix_27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void ScaleFC_0817_comp_27( real* DC,
-									  real* DF,
-									  unsigned int* neighborCX,
-									  unsigned int* neighborCY,
-									  unsigned int* neighborCZ,
-									  unsigned int* neighborFX,
-									  unsigned int* neighborFY,
-									  unsigned int* neighborFZ,
-									  unsigned int size_MatC,
-									  unsigned int size_MatF,
-									  bool isEvenTimestep,
-									  unsigned int* posC,
-									  unsigned int* posFSWB,
-									  unsigned int kFC,
-									  real omCoarse,
-									  real omFine,
-									  real nu,
-									  unsigned int nxC,
-									  unsigned int nyC,
-									  unsigned int nxF,
-									  unsigned int nyF,
-									  unsigned int numberOfThreads,
-									  OffFC offFC,
-                             CUstream_st *stream)
-{
-   int Grid = (kFC / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_FC(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleFC_0817_comp_27<<< gridINT_FC, threads, 0, stream >>> (DC,
-													   DF,
-													   neighborCX,
-													   neighborCY,
-													   neighborCZ,
-													   neighborFX,
-													   neighborFY,
-													   neighborFZ,
-													   size_MatC,
-													   size_MatF,
-													   isEvenTimestep,
-													   posC,
-													   posFSWB,
-													   kFC,
-													   omCoarse,
-													   omFine,
-													   nu,
-													   nxC,
-													   nyC,
-													   nxF,
-													   nyF,
-													   offFC);
-      getLastCudaError("scaleFC_0817_27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void ScaleFC_comp_D3Q27F3_2018( real* DC,
-										   real* DF,
-										   real* G6,
-										   unsigned int* neighborCX,
-										   unsigned int* neighborCY,
-										   unsigned int* neighborCZ,
-										   unsigned int* neighborFX,
-										   unsigned int* neighborFY,
-										   unsigned int* neighborFZ,
-										   unsigned int size_MatC,
-										   unsigned int size_MatF,
-										   bool isEvenTimestep,
-										   unsigned int* posC,
-										   unsigned int* posFSWB,
-										   unsigned int kFC,
-										   real omCoarse,
-										   real omFine,
-										   real nu,
-										   unsigned int nxC,
-										   unsigned int nyC,
-										   unsigned int nxF,
-										   unsigned int nyF,
-										   unsigned int numberOfThreads,
-										   OffFC offFC)
-{
-   int Grid = (kFC / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_FC(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-     scaleFC_comp_D3Q27F3_2018 <<< gridINT_FC, threads >>> (DC,
-															DF,
-															G6,
-															neighborCX,
-															neighborCY,
-															neighborCZ,
-															neighborFX,
-															neighborFY,
-															neighborFZ,
-															size_MatC,
-															size_MatF,
-															isEvenTimestep,
-															posC,
-															posFSWB,
-															kFC,
-															omCoarse,
-															omFine,
-															nu,
-															nxC,
-															nyC,
-															nxF,
-															nyF,
-															offFC);
-      getLastCudaError("scaleFC_comp_D3Q27F3_2018 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void ScaleFC_comp_D3Q27F3( real* DC,
-									  real* DF,
-									  real* G6,
-									  unsigned int* neighborCX,
-									  unsigned int* neighborCY,
-									  unsigned int* neighborCZ,
-									  unsigned int* neighborFX,
-									  unsigned int* neighborFY,
-									  unsigned int* neighborFZ,
-									  unsigned int size_MatC,
-									  unsigned int size_MatF,
-									  bool isEvenTimestep,
-									  unsigned int* posC,
-									  unsigned int* posFSWB,
-									  unsigned int kFC,
-									  real omCoarse,
-									  real omFine,
-									  real nu,
-									  unsigned int nxC,
-									  unsigned int nyC,
-									  unsigned int nxF,
-									  unsigned int nyF,
-									  unsigned int numberOfThreads,
-									  OffFC offFC,
-                             CUstream_st *stream)
-{
-   int Grid = (kFC / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_FC(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-     scaleFC_comp_D3Q27F3 <<< gridINT_FC, threads, 0, stream >>> (DC,
-													   DF,
-													   G6,
-													   neighborCX,
-													   neighborCY,
-													   neighborCZ,
-													   neighborFX,
-													   neighborFY,
-													   neighborFZ,
-													   size_MatC,
-													   size_MatF,
-													   isEvenTimestep,
-													   posC,
-													   posFSWB,
-													   kFC,
-													   omCoarse,
-													   omFine,
-													   nu,
-													   nxC,
-													   nyC,
-													   nxF,
-													   nyF,
-													   offFC);
-      getLastCudaError("scaleFC_0817_27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void ScaleFC_staggered_time_comp_27(   real* DC,
-												  real* DF,
-												  unsigned int* neighborCX,
-												  unsigned int* neighborCY,
-												  unsigned int* neighborCZ,
-												  unsigned int* neighborFX,
-												  unsigned int* neighborFY,
-												  unsigned int* neighborFZ,
-												  unsigned int size_MatC,
-												  unsigned int size_MatF,
-												  bool isEvenTimestep,
-												  unsigned int* posC,
-												  unsigned int* posFSWB,
-												  unsigned int kFC,
-												  real omCoarse,
-												  real omFine,
-												  real nu,
-												  unsigned int nxC,
-												  unsigned int nyC,
-												  unsigned int nxF,
-												  unsigned int nyF,
-												  unsigned int numberOfThreads,
-												  OffFC offFC)
-{
-   int Grid = (kFC / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_FC(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleFC_staggered_time_comp_27<<< gridINT_FC, threads >>> (  DC,
-																   DF,
-																   neighborCX,
-																   neighborCY,
-																   neighborCZ,
-																   neighborFX,
-																   neighborFY,
-																   neighborFZ,
-																   size_MatC,
-																   size_MatF,
-																   isEvenTimestep,
-																   posC,
-																   posFSWB,
-																   kFC,
-																   omCoarse,
-																   omFine,
-																   nu,
-																   nxC,
-																   nyC,
-																   nxF,
-																   nyF,
-																   offFC);
-      getLastCudaError("scaleFC_Fix_27 execution failed");
+    dim3 grid = vf::cuda::getCudaGrid(parameterDeviceC->numberofthreads,  icellCF->kCF);
+    dim3 threads(parameterDeviceC->numberofthreads, 1, 1 );
+
+    scaleCF_compressible<<<grid, threads, 0, stream>>>(
+        parameterDeviceC->distributions.f[0],
+        parameterDeviceF->distributions.f[0],
+        parameterDeviceC->neighborX,
+        parameterDeviceC->neighborY,
+        parameterDeviceC->neighborZ,
+        parameterDeviceF->neighborX,
+        parameterDeviceF->neighborY,
+        parameterDeviceF->neighborZ,
+        parameterDeviceC->numberOfNodes,
+        parameterDeviceF->numberOfNodes,
+        parameterDeviceC->isEvenTimestep,
+        icellCF->ICellCFC,
+        icellCF->ICellCFF,
+        icellCF->kCF,
+        parameterDeviceC->omega,
+        parameterDeviceF->omega,
+        offsetCF);
+    getLastCudaError("scaleCF_compressible execution failed");
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Host-side launcher for the scaleCF_RhoSq_3rdMom_comp_27 kernel.
+//
+// The launch configuration is taken from vf::cuda::CudaGrid(numberOfThreads, kCF)
+// and the kernel is issued to `stream` with 0 bytes of dynamic shared memory.
+// numberOfThreads is consumed here only; stream is used only in the launch
+// configuration; every other parameter is forwarded to the kernel unchanged
+// and in declaration order. Afterwards getLastCudaError is called with a
+// kernel-specific message.
+//
+// NOTE(review): "CF" presumably stands for coarse-to-fine interpolation -
+// confirm against the kernel definition, which is not part of this function.
+void ScaleCF_RhoSq_3rdMom_comp_27(
+    real* DC,
+    real* DF,
+    unsigned int* neighborCX,
+    unsigned int* neighborCY,
+    unsigned int* neighborCZ,
+    unsigned int* neighborFX,
+    unsigned int* neighborFY,
+    unsigned int* neighborFZ,
+    unsigned long long numberOfLBnodesC,
+    unsigned long long numberOfLBnodesF,
+    bool isEvenTimestep,
+    unsigned int* posCSWB,
+    unsigned int* posFSWB,
+    unsigned int kCF,
+    real omCoarse,
+    real omFine,
+    real nu,
+    unsigned int nxC,
+    unsigned int nyC,
+    unsigned int nxF,
+    unsigned int nyF,
+    unsigned int numberOfThreads,
+    OffCF offCF,
+    CUstream_st *stream)
+{
+    // Grid and block dimensions derived from the thread count and kCF.
+    const vf::cuda::CudaGrid launchConfig(numberOfThreads, kCF);
+
+    // Launch arguments: grid, block, dynamic shared memory (none), stream.
+    scaleCF_RhoSq_3rdMom_comp_27<<< launchConfig.grid, launchConfig.threads, 0, stream >>>(
+        DC, DF,
+        neighborCX, neighborCY, neighborCZ,
+        neighborFX, neighborFY, neighborFZ,
+        numberOfLBnodesC, numberOfLBnodesF,
+        isEvenTimestep,
+        posCSWB, posFSWB, kCF,
+        omCoarse, omFine, nu,
+        nxC, nyC, nxF, nyF,
+        offCF);
+    getLastCudaError("scaleCF_RhoSq_3rdMom_comp_27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+// Host-side launcher for the scaleCF_AA2016_comp_27 kernel.
+//
+// The launch configuration is taken from vf::cuda::CudaGrid(numberOfThreads, kCF)
+// and the kernel is issued to `stream` with 0 bytes of dynamic shared memory.
+// numberOfThreads is consumed here only; stream is used only in the launch
+// configuration; every other parameter is forwarded to the kernel unchanged
+// and in declaration order. Afterwards getLastCudaError is called with a
+// kernel-specific message.
+//
+// NOTE(review): "CF" presumably stands for coarse-to-fine interpolation -
+// confirm against the kernel definition, which is not part of this function.
+void ScaleCF_AA2016_comp_27(
+    real* DC,
+    real* DF,
+    unsigned int* neighborCX,
+    unsigned int* neighborCY,
+    unsigned int* neighborCZ,
+    unsigned int* neighborFX,
+    unsigned int* neighborFY,
+    unsigned int* neighborFZ,
+    unsigned long long numberOfLBnodesC,
+    unsigned long long numberOfLBnodesF,
+    bool isEvenTimestep,
+    unsigned int* posCSWB,
+    unsigned int* posFSWB,
+    unsigned int kCF,
+    real omCoarse,
+    real omFine,
+    real nu,
+    unsigned int nxC,
+    unsigned int nyC,
+    unsigned int nxF,
+    unsigned int nyF,
+    unsigned int numberOfThreads,
+    OffCF offCF,
+    CUstream_st *stream)
+{
+    // Grid and block dimensions derived from the thread count and kCF.
+    const vf::cuda::CudaGrid launchConfig(numberOfThreads, kCF);
+
+    // Launch arguments: grid, block, dynamic shared memory (none), stream.
+    scaleCF_AA2016_comp_27<<< launchConfig.grid, launchConfig.threads, 0, stream >>>(
+        DC, DF,
+        neighborCX, neighborCY, neighborCZ,
+        neighborFX, neighborFY, neighborFZ,
+        numberOfLBnodesC, numberOfLBnodesF,
+        isEvenTimestep,
+        posCSWB, posFSWB, kCF,
+        omCoarse, omFine, nu,
+        nxC, nyC, nxF, nyF,
+        offCF);
+    getLastCudaError("scaleCF_AA2016_comp_27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+// Host-side launcher for the scaleCF_NSPress_27 kernel.
+//
+// The launch configuration is taken from vf::cuda::CudaGrid(numberOfThreads, kCF).
+// Neither a dynamic shared-memory size nor a stream appears in the launch, so
+// the CUDA defaults for both apply. numberOfThreads is consumed here only;
+// every other parameter is forwarded to the kernel unchanged and in
+// declaration order. Afterwards getLastCudaError is called with a
+// kernel-specific message.
+//
+// NOTE(review): "CF" presumably stands for coarse-to-fine interpolation -
+// confirm against the kernel definition, which is not part of this function.
+void ScaleCF_NSPress_27(
+    real* DC,
+    real* DF,
+    unsigned int* neighborCX,
+    unsigned int* neighborCY,
+    unsigned int* neighborCZ,
+    unsigned int* neighborFX,
+    unsigned int* neighborFY,
+    unsigned int* neighborFZ,
+    unsigned long long numberOfLBnodesC,
+    unsigned long long numberOfLBnodesF,
+    bool isEvenTimestep,
+    unsigned int* posCSWB,
+    unsigned int* posFSWB,
+    unsigned int kCF,
+    real omCoarse,
+    real omFine,
+    real nu,
+    unsigned int nxC,
+    unsigned int nyC,
+    unsigned int nxF,
+    unsigned int nyF,
+    unsigned int numberOfThreads,
+    OffCF offCF)
+{
+    // Grid and block dimensions derived from the thread count and kCF.
+    const vf::cuda::CudaGrid launchConfig(numberOfThreads, kCF);
+
+    // Launch with grid and block dimensions only (no shared-memory size, no stream).
+    scaleCF_NSPress_27<<< launchConfig.grid, launchConfig.threads >>>(
+        DC, DF,
+        neighborCX, neighborCY, neighborCZ,
+        neighborFX, neighborFY, neighborFZ,
+        numberOfLBnodesC, numberOfLBnodesF,
+        isEvenTimestep,
+        posCSWB, posFSWB, kCF,
+        omCoarse, omFine, nu,
+        nxC, nyC, nxF, nyF,
+        offCF);
+    getLastCudaError("scaleCF_NSPress_27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+// Host-side launcher for the scaleCFThSMG7 kernel.
+//
+// The launch configuration is taken from vf::cuda::CudaGrid(numberOfThreads, kCF);
+// no dynamic shared-memory size or stream is given, so the CUDA defaults apply.
+// numberOfThreads is consumed here only; every other parameter is forwarded to
+// the kernel unchanged and in declaration order. Afterwards getLastCudaError
+// is called with a kernel-specific message.
+//
+// NOTE(review): "CF" presumably means coarse-to-fine - confirm in the kernel.
+void ScaleCFThSMG7(
+    real* DC,
+    real* DF,
+    real* DD7C,
+    real* DD7F,
+    unsigned int* neighborCX,
+    unsigned int* neighborCY,
+    unsigned int* neighborCZ,
+    unsigned int* neighborFX,
+    unsigned int* neighborFY,
+    unsigned int* neighborFZ,
+    unsigned long long numberOfLBnodesC,
+    unsigned long long numberOfLBnodesF,
+    bool isEvenTimestep,
+    unsigned int* posCSWB,
+    unsigned int* posFSWB,
+    unsigned int kCF,
+    real nu,
+    real diffusivity_fine,
+    unsigned int numberOfThreads,
+    OffCF offCF)
+{
+    // Grid and block dimensions derived from the thread count and kCF.
+    const vf::cuda::CudaGrid launchConfig(numberOfThreads, kCF);
+
+    // Launch with grid and block dimensions only (no shared-memory size, no stream).
+    scaleCFThSMG7<<< launchConfig.grid, launchConfig.threads >>> (
+        DC, DF, DD7C, DD7F,
+        neighborCX, neighborCY, neighborCZ,
+        neighborFX, neighborFY, neighborFZ,
+        numberOfLBnodesC, numberOfLBnodesF,
+        isEvenTimestep,
+        posCSWB, posFSWB, kCF,
+        nu, diffusivity_fine,
+        offCF);
+    getLastCudaError("scaleCFThSMG7 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+// Host-side launcher for the scaleCFThS7 kernel.
+//
+// The launch configuration is taken from vf::cuda::CudaGrid(numberOfThreads, kCF);
+// no dynamic shared-memory size or stream is given, so the CUDA defaults apply.
+// numberOfThreads is consumed here only; every other parameter is forwarded to
+// the kernel unchanged and in declaration order. Afterwards getLastCudaError
+// is called with a kernel-specific message.
+//
+// NOTE(review): "CF" presumably means coarse-to-fine - confirm in the kernel.
+void ScaleCFThS7(
+    real* DC,
+    real* DF,
+    real* DD7C,
+    real* DD7F,
+    unsigned int* neighborCX,
+    unsigned int* neighborCY,
+    unsigned int* neighborCZ,
+    unsigned int* neighborFX,
+    unsigned int* neighborFY,
+    unsigned int* neighborFZ,
+    unsigned long long numberOfLBnodesC,
+    unsigned long long numberOfLBnodesF,
+    bool isEvenTimestep,
+    unsigned int* posCSWB,
+    unsigned int* posFSWB,
+    unsigned int kCF,
+    real nu,
+    real diffusivity_fine,
+    unsigned int numberOfThreads)
+{
+    // Grid and block dimensions derived from the thread count and kCF.
+    const vf::cuda::CudaGrid launchConfig(numberOfThreads, kCF);
+
+    // Launch with grid and block dimensions only (no shared-memory size, no stream).
+    scaleCFThS7<<< launchConfig.grid, launchConfig.threads >>> (
+        DC, DF, DD7C, DD7F,
+        neighborCX, neighborCY, neighborCZ,
+        neighborFX, neighborFY, neighborFZ,
+        numberOfLBnodesC, numberOfLBnodesF,
+        isEvenTimestep,
+        posCSWB, posFSWB, kCF,
+        nu, diffusivity_fine);
+    getLastCudaError("scaleCFThS7 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+// Host-side launcher for the scaleCFThS27 kernel.
+//
+// The launch configuration is taken from vf::cuda::CudaGrid(numberOfThreads, kCF);
+// no dynamic shared-memory size or stream is given, so the CUDA defaults apply.
+// numberOfThreads is consumed here only; every other parameter is forwarded to
+// the kernel unchanged and in declaration order. Afterwards getLastCudaError
+// is called with a kernel-specific message.
+//
+// NOTE(review): "CF" presumably means coarse-to-fine - confirm in the kernel.
+void ScaleCFThS27(
+    real* DC,
+    real* DF,
+    real* DD27C,
+    real* DD27F,
+    unsigned int* neighborCX,
+    unsigned int* neighborCY,
+    unsigned int* neighborCZ,
+    unsigned int* neighborFX,
+    unsigned int* neighborFY,
+    unsigned int* neighborFZ,
+    unsigned long long numberOfLBnodesC,
+    unsigned long long numberOfLBnodesF,
+    bool isEvenTimestep,
+    unsigned int* posCSWB,
+    unsigned int* posFSWB,
+    unsigned int kCF,
+    real nu,
+    real diffusivity_fine,
+    unsigned int numberOfThreads,
+    OffCF offCF)
+{
+    // Grid and block dimensions derived from the thread count and kCF.
+    const vf::cuda::CudaGrid launchConfig(numberOfThreads, kCF);
+
+    // Launch with grid and block dimensions only (no shared-memory size, no stream).
+    scaleCFThS27<<< launchConfig.grid, launchConfig.threads >>> (
+        DC, DF, DD27C, DD27F,
+        neighborCX, neighborCY, neighborCZ,
+        neighborFX, neighborFY, neighborFZ,
+        numberOfLBnodesC, numberOfLBnodesF,
+        isEvenTimestep,
+        posCSWB, posFSWB, kCF,
+        nu, diffusivity_fine,
+        offCF);
+    getLastCudaError("scaleCFThS27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+// Host-side launcher for the scaleFC27 kernel.
+//
+// The launch configuration is taken from vf::cuda::CudaGrid(numberOfThreads, kFC).
+// Neither a dynamic shared-memory size nor a stream appears in the launch, so
+// the CUDA defaults for both apply. numberOfThreads is consumed here only;
+// every other parameter is forwarded to the kernel unchanged and in
+// declaration order. Afterwards getLastCudaError is called with a
+// kernel-specific message.
+//
+// NOTE(review): "FC" presumably stands for fine-to-coarse interpolation -
+// confirm against the kernel definition, which is not part of this function.
+void ScaleFC27(
+    real* DC,
+    real* DF,
+    unsigned int* neighborCX,
+    unsigned int* neighborCY,
+    unsigned int* neighborCZ,
+    unsigned int* neighborFX,
+    unsigned int* neighborFY,
+    unsigned int* neighborFZ,
+    unsigned long long numberOfLBnodesC,
+    unsigned long long numberOfLBnodesF,
+    bool isEvenTimestep,
+    unsigned int* posC,
+    unsigned int* posFSWB,
+    unsigned int kFC,
+    real omCoarse,
+    real omFine,
+    real nu,
+    unsigned int nxC,
+    unsigned int nyC,
+    unsigned int nxF,
+    unsigned int nyF,
+    unsigned int numberOfThreads)
+{
+    // Grid and block dimensions derived from the thread count and kFC.
+    const vf::cuda::CudaGrid launchConfig(numberOfThreads, kFC);
+
+    // Launch with grid and block dimensions only (no shared-memory size, no stream).
+    scaleFC27<<< launchConfig.grid, launchConfig.threads >>> (
+        DC, DF,
+        neighborCX, neighborCY, neighborCZ,
+        neighborFX, neighborFY, neighborFZ,
+        numberOfLBnodesC, numberOfLBnodesF,
+        isEvenTimestep,
+        posC, posFSWB, kFC,
+        omCoarse, omFine, nu,
+        nxC, nyC, nxF, nyF);
+    getLastCudaError("scaleFC27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+// Host-side launcher for the scaleFCEff27 kernel.
+//
+// The launch configuration is taken from vf::cuda::CudaGrid(numberOfThreads, kFC).
+// Neither a dynamic shared-memory size nor a stream appears in the launch, so
+// the CUDA defaults for both apply. numberOfThreads is consumed here only;
+// every other parameter is forwarded to the kernel unchanged and in
+// declaration order. Afterwards getLastCudaError is called with a
+// kernel-specific message.
+//
+// NOTE(review): "FC" presumably stands for fine-to-coarse interpolation -
+// confirm against the kernel definition, which is not part of this function.
+void ScaleFCEff27(
+    real* DC,
+    real* DF,
+    unsigned int* neighborCX,
+    unsigned int* neighborCY,
+    unsigned int* neighborCZ,
+    unsigned int* neighborFX,
+    unsigned int* neighborFY,
+    unsigned int* neighborFZ,
+    unsigned long long numberOfLBnodesC,
+    unsigned long long numberOfLBnodesF,
+    bool isEvenTimestep,
+    unsigned int* posC,
+    unsigned int* posFSWB,
+    unsigned int kFC,
+    real omCoarse,
+    real omFine,
+    real nu,
+    unsigned int nxC,
+    unsigned int nyC,
+    unsigned int nxF,
+    unsigned int nyF,
+    unsigned int numberOfThreads,
+    OffFC offFC)
+{
+    // Grid and block dimensions derived from the thread count and kFC.
+    const vf::cuda::CudaGrid launchConfig(numberOfThreads, kFC);
+
+    // Launch with grid and block dimensions only (no shared-memory size, no stream).
+    scaleFCEff27<<< launchConfig.grid, launchConfig.threads >>> (
+        DC, DF,
+        neighborCX, neighborCY, neighborCZ,
+        neighborFX, neighborFY, neighborFZ,
+        numberOfLBnodesC, numberOfLBnodesF,
+        isEvenTimestep,
+        posC, posFSWB, kFC,
+        omCoarse, omFine, nu,
+        nxC, nyC, nxF, nyF,
+        offFC);
+    getLastCudaError("scaleFCEff27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+// Host-side launcher for the scaleFCLast27 kernel.
+//
+// The launch configuration is taken from vf::cuda::CudaGrid(numberOfThreads, kFC).
+// Neither a dynamic shared-memory size nor a stream appears in the launch, so
+// the CUDA defaults for both apply. numberOfThreads is consumed here only;
+// every other parameter is forwarded to the kernel unchanged and in
+// declaration order. Afterwards getLastCudaError is called with a message
+// that names this kernel, so a failure can be traced to this launch.
+//
+// NOTE(review): "FC" presumably stands for fine-to-coarse interpolation -
+// confirm against the kernel definition, which is not part of this function.
+void ScaleFCLast27(
+    real* DC,
+    real* DF,
+    unsigned int* neighborCX,
+    unsigned int* neighborCY,
+    unsigned int* neighborCZ,
+    unsigned int* neighborFX,
+    unsigned int* neighborFY,
+    unsigned int* neighborFZ,
+    unsigned long long numberOfLBnodesC,
+    unsigned long long numberOfLBnodesF,
+    bool isEvenTimestep,
+    unsigned int* posC,
+    unsigned int* posFSWB,
+    unsigned int kFC,
+    real omCoarse,
+    real omFine,
+    real nu,
+    unsigned int nxC,
+    unsigned int nyC,
+    unsigned int nxF,
+    unsigned int nyF,
+    unsigned int numberOfThreads,
+    OffFC offFC)
+{
+    // Grid and block dimensions derived from the thread count and kFC.
+    const vf::cuda::CudaGrid launchConfig(numberOfThreads, kFC);
+
+    // Launch with grid and block dimensions only (no shared-memory size, no stream).
+    scaleFCLast27<<< launchConfig.grid, launchConfig.threads >>> (
+        DC, DF,
+        neighborCX, neighborCY, neighborCZ,
+        neighborFX, neighborFY, neighborFZ,
+        numberOfLBnodesC, numberOfLBnodesF,
+        isEvenTimestep,
+        posC, posFSWB, kFC,
+        omCoarse, omFine, nu,
+        nxC, nyC, nxF, nyF,
+        offFC);
+    getLastCudaError("scaleFCLast27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+void ScaleFCpress27( // host-side launcher for the scaleFCpress27 kernel
+    real* DC,
+    real* DF,
+    unsigned int* neighborCX,
+    unsigned int* neighborCY,
+    unsigned int* neighborCZ,
+    unsigned int* neighborFX,
+    unsigned int* neighborFY,
+    unsigned int* neighborFZ,
+    unsigned long long numberOfLBnodesC,
+    unsigned long long numberOfLBnodesF,
+    bool isEvenTimestep,
+    unsigned int* posC,
+    unsigned int* posFSWB,
+    unsigned int kFC,
+    real omCoarse,
+    real omFine,
+    real nu,
+    unsigned int nxC,
+    unsigned int nyC,
+    unsigned int nxF,
+    unsigned int nyF,
+    unsigned int numberOfThreads, // only used for the launch dimensions, not forwarded to the kernel
+    OffFC offFC)
+{
+    vf::cuda::CudaGrid launchDims(numberOfThreads, kFC); // launch dimensions built from thread count and kFC
+    // No stream appears in the launch configuration; all remaining arguments are forwarded unchanged.
+    scaleFCpress27<<<launchDims.grid, launchDims.threads>>>(
+        DC,
+        DF,
+        neighborCX,
+        neighborCY,
+        neighborCZ,
+        neighborFX,
+        neighborFY,
+        neighborFZ,
+        numberOfLBnodesC,
+        numberOfLBnodesF,
+        isEvenTimestep,
+        posC,
+        posFSWB,
+        kFC,
+        omCoarse,
+        omFine,
+        nu,
+        nxC,
+        nyC,
+        nxF,
+        nyF,
+        offFC);
+    getLastCudaError("scaleFCpress27 execution failed"); // message names the kernel launched above
+}
+//////////////////////////////////////////////////////////////////////////
+void ScaleFC_Fix_27( // host-side launcher for the scaleFC_Fix_27 kernel
+    real* DC,
+    real* DF,
+    unsigned int* neighborCX,
+    unsigned int* neighborCY,
+    unsigned int* neighborCZ,
+    unsigned int* neighborFX,
+    unsigned int* neighborFY,
+    unsigned int* neighborFZ,
+    unsigned long long numberOfLBnodesC,
+    unsigned long long numberOfLBnodesF,
+    bool isEvenTimestep,
+    unsigned int* posC,
+    unsigned int* posFSWB,
+    unsigned int kFC,
+    real omCoarse,
+    real omFine,
+    real nu,
+    unsigned int nxC,
+    unsigned int nyC,
+    unsigned int nxF,
+    unsigned int nyF,
+    unsigned int numberOfThreads, // only used for the launch dimensions, not forwarded to the kernel
+    OffFC offFC)
+{
+    vf::cuda::CudaGrid launchDims(numberOfThreads, kFC); // launch dimensions built from thread count and kFC
+    // No stream appears in the launch configuration; all remaining arguments are forwarded unchanged.
+    scaleFC_Fix_27<<<launchDims.grid, launchDims.threads>>>(
+        DC,
+        DF,
+        neighborCX,
+        neighborCY,
+        neighborCZ,
+        neighborFX,
+        neighborFY,
+        neighborFZ,
+        numberOfLBnodesC,
+        numberOfLBnodesF,
+        isEvenTimestep,
+        posC,
+        posFSWB,
+        kFC,
+        omCoarse,
+        omFine,
+        nu,
+        nxC,
+        nyC,
+        nxF,
+        nyF,
+        offFC);
+    getLastCudaError("scaleFC_Fix_27 execution failed"); // message names the kernel launched above
+}
+//////////////////////////////////////////////////////////////////////////
+void ScaleFC_Fix_comp_27( // host-side launcher for the scaleFC_Fix_comp_27 kernel
+    real* DC,
+    real* DF,
+    unsigned int* neighborCX,
+    unsigned int* neighborCY,
+    unsigned int* neighborCZ,
+    unsigned int* neighborFX,
+    unsigned int* neighborFY,
+    unsigned int* neighborFZ,
+    unsigned long long numberOfLBnodesC,
+    unsigned long long numberOfLBnodesF,
+    bool isEvenTimestep,
+    unsigned int* posC,
+    unsigned int* posFSWB,
+    unsigned int kFC,
+    real omCoarse,
+    real omFine,
+    real nu,
+    unsigned int nxC,
+    unsigned int nyC,
+    unsigned int nxF,
+    unsigned int nyF,
+    unsigned int numberOfThreads, // only used for the launch dimensions, not forwarded to the kernel
+    OffFC offFC)
+{
+    vf::cuda::CudaGrid launchDims(numberOfThreads, kFC); // launch dimensions built from thread count and kFC
+    // No stream appears in the launch configuration; all remaining arguments are forwarded unchanged.
+    scaleFC_Fix_comp_27<<<launchDims.grid, launchDims.threads>>>(
+        DC,
+        DF,
+        neighborCX,
+        neighborCY,
+        neighborCZ,
+        neighborFX,
+        neighborFY,
+        neighborFZ,
+        numberOfLBnodesC,
+        numberOfLBnodesF,
+        isEvenTimestep,
+        posC,
+        posFSWB,
+        kFC,
+        omCoarse,
+        omFine,
+        nu,
+        nxC,
+        nyC,
+        nxF,
+        nyF,
+        offFC);
+    getLastCudaError("scaleFC_Fix_comp_27 execution failed"); // message names the kernel launched above
+}
+//////////////////////////////////////////////////////////////////////////
+void ScaleFC_0817_comp_27( // host-side launcher for the scaleFC_0817_comp_27 kernel
+    real* DC,
+    real* DF,
+    unsigned int* neighborCX,
+    unsigned int* neighborCY,
+    unsigned int* neighborCZ,
+    unsigned int* neighborFX,
+    unsigned int* neighborFY,
+    unsigned int* neighborFZ,
+    unsigned long long numberOfLBnodesC,
+    unsigned long long numberOfLBnodesF,
+    bool isEvenTimestep,
+    unsigned int* posC,
+    unsigned int* posFSWB,
+    unsigned int kFC,
+    real omCoarse,
+    real omFine,
+    real nu,
+    unsigned int nxC,
+    unsigned int nyC,
+    unsigned int nxF,
+    unsigned int nyF,
+    unsigned int numberOfThreads, // only used for the launch dimensions, not forwarded to the kernel
+    OffFC offFC,
+    CUstream_st *stream) // stream the kernel is enqueued on
+{
+    vf::cuda::CudaGrid launchDims(numberOfThreads, kFC); // launch dimensions built from thread count and kFC
+    // Third launch parameter 0: no dynamic shared memory requested; fourth: the caller's stream.
+    scaleFC_0817_comp_27<<<launchDims.grid, launchDims.threads, 0, stream>>>(
+        DC,
+        DF,
+        neighborCX,
+        neighborCY,
+        neighborCZ,
+        neighborFX,
+        neighborFY,
+        neighborFZ,
+        numberOfLBnodesC,
+        numberOfLBnodesF,
+        isEvenTimestep,
+        posC,
+        posFSWB,
+        kFC,
+        omCoarse,
+        omFine,
+        nu,
+        nxC,
+        nyC,
+        nxF,
+        nyF,
+        offFC);
+    getLastCudaError("scaleFC_0817_comp_27 execution failed"); // message names the kernel launched above
+}
+//////////////////////////////////////////////////////////////////////////
+void ScaleFC_comp_D3Q27F3_2018( // host-side launcher for the scaleFC_comp_D3Q27F3_2018 kernel
+    real* DC,
+    real* DF,
+    real* G6,
+    unsigned int* neighborCX,
+    unsigned int* neighborCY,
+    unsigned int* neighborCZ,
+    unsigned int* neighborFX,
+    unsigned int* neighborFY,
+    unsigned int* neighborFZ,
+    unsigned long long numberOfLBnodesC,
+    unsigned long long numberOfLBnodesF,
+    bool isEvenTimestep,
+    unsigned int* posC,
+    unsigned int* posFSWB,
+    unsigned int kFC,
+    real omCoarse,
+    real omFine,
+    real nu,
+    unsigned int nxC,
+    unsigned int nyC,
+    unsigned int nxF,
+    unsigned int nyF,
+    unsigned int numberOfThreads, // only used for the launch dimensions, not forwarded to the kernel
+    OffFC offFC)
+{
+    vf::cuda::CudaGrid launchDims(numberOfThreads, kFC); // launch dimensions built from thread count and kFC
+    // No stream appears in the launch configuration; all remaining arguments are forwarded unchanged.
+    scaleFC_comp_D3Q27F3_2018<<<launchDims.grid, launchDims.threads>>>(
+        DC,
+        DF,
+        G6,
+        neighborCX,
+        neighborCY,
+        neighborCZ,
+        neighborFX,
+        neighborFY,
+        neighborFZ,
+        numberOfLBnodesC,
+        numberOfLBnodesF,
+        isEvenTimestep,
+        posC,
+        posFSWB,
+        kFC,
+        omCoarse,
+        omFine,
+        nu,
+        nxC,
+        nyC,
+        nxF,
+        nyF,
+        offFC);
+    getLastCudaError("scaleFC_comp_D3Q27F3_2018 execution failed"); // message names the kernel launched above
+}
+//////////////////////////////////////////////////////////////////////////
+void ScaleFC_comp_D3Q27F3( // host-side launcher for the scaleFC_comp_D3Q27F3 kernel
+    real* DC,
+    real* DF,
+    real* G6,
+    unsigned int* neighborCX,
+    unsigned int* neighborCY,
+    unsigned int* neighborCZ,
+    unsigned int* neighborFX,
+    unsigned int* neighborFY,
+    unsigned int* neighborFZ,
+    unsigned long long numberOfLBnodesC,
+    unsigned long long numberOfLBnodesF,
+    bool isEvenTimestep,
+    unsigned int* posC,
+    unsigned int* posFSWB,
+    unsigned int kFC,
+    real omCoarse,
+    real omFine,
+    real nu,
+    unsigned int nxC,
+    unsigned int nyC,
+    unsigned int nxF,
+    unsigned int nyF,
+    unsigned int numberOfThreads, // only used for the launch dimensions, not forwarded to the kernel
+    OffFC offFC,
+    CUstream_st *stream) // stream the kernel is enqueued on
+{
+    vf::cuda::CudaGrid launchDims(numberOfThreads, kFC); // launch dimensions built from thread count and kFC
+    // Third launch parameter 0: no dynamic shared memory requested; fourth: the caller's stream.
+    scaleFC_comp_D3Q27F3<<<launchDims.grid, launchDims.threads, 0, stream>>>(
+        DC,
+        DF,
+        G6,
+        neighborCX,
+        neighborCY,
+        neighborCZ,
+        neighborFX,
+        neighborFY,
+        neighborFZ,
+        numberOfLBnodesC,
+        numberOfLBnodesF,
+        isEvenTimestep,
+        posC,
+        posFSWB,
+        kFC,
+        omCoarse,
+        omFine,
+        nu,
+        nxC,
+        nyC,
+        nxF,
+        nyF,
+        offFC);
+    getLastCudaError("scaleFC_comp_D3Q27F3 execution failed"); // message names the kernel launched above
+}
+//////////////////////////////////////////////////////////////////////////
+void ScaleFC_staggered_time_comp_27( // host-side launcher for the scaleFC_staggered_time_comp_27 kernel
+    real* DC,
+    real* DF,
+    unsigned int* neighborCX,
+    unsigned int* neighborCY,
+    unsigned int* neighborCZ,
+    unsigned int* neighborFX,
+    unsigned int* neighborFY,
+    unsigned int* neighborFZ,
+    unsigned long long numberOfLBnodesC,
+    unsigned long long numberOfLBnodesF,
+    bool isEvenTimestep,
+    unsigned int* posC,
+    unsigned int* posFSWB,
+    unsigned int kFC,
+    real omCoarse,
+    real omFine,
+    real nu,
+    unsigned int nxC,
+    unsigned int nyC,
+    unsigned int nxF,
+    unsigned int nyF,
+    unsigned int numberOfThreads, // only used for the launch dimensions, not forwarded to the kernel
+    OffFC offFC)
+{
+    vf::cuda::CudaGrid launchDims(numberOfThreads, kFC); // launch dimensions built from thread count and kFC
+    // No stream appears in the launch configuration; all remaining arguments are forwarded unchanged.
+    scaleFC_staggered_time_comp_27<<<launchDims.grid, launchDims.threads>>>(
+        DC,
+        DF,
+        neighborCX,
+        neighborCY,
+        neighborCZ,
+        neighborFX,
+        neighborFY,
+        neighborFZ,
+        numberOfLBnodesC,
+        numberOfLBnodesF,
+        isEvenTimestep,
+        posC,
+        posFSWB,
+        kFC,
+        omCoarse,
+        omFine,
+        nu,
+        nxC,
+        nyC,
+        nxF,
+        nyF,
+        offFC);
+    getLastCudaError("scaleFC_staggered_time_comp_27 execution failed"); // message names the kernel launched above
 }
 //////////////////////////////////////////////////////////////////////////
 void ScaleFC_RhoSq_comp_27(LBMSimulationParameter * parameterDeviceC, LBMSimulationParameter* parameterDeviceF, ICellFC * icellFC, OffFC &offsetFC, CUstream_st *stream)
 {
-   dim3 grid = vf::cuda::getCudaGrid(parameterDeviceC->numberofthreads,  icellFC->kFC);
-   dim3 threads(parameterDeviceC->numberofthreads, 1, 1 );
-
-   scaleFC_RhoSq_comp_27<<<grid, threads, 0, stream>>>(
-      parameterDeviceC->distributions.f[0],
-      parameterDeviceF->distributions.f[0],
-      parameterDeviceC->neighborX,
-      parameterDeviceC->neighborY,
-      parameterDeviceC->neighborZ,
-      parameterDeviceF->neighborX,
-      parameterDeviceF->neighborY,
-      parameterDeviceF->neighborZ,
-      parameterDeviceC->numberOfNodes,
-      parameterDeviceF->numberOfNodes,
-      parameterDeviceC->isEvenTimestep,
-      icellFC->ICellFCC,
-      icellFC->ICellFCF,
-      icellFC->kFC,
-      parameterDeviceC->omega,
-      parameterDeviceF->omega,
-      parameterDeviceC->vis,
-      parameterDeviceC->nx,
-      parameterDeviceC->ny,
-      parameterDeviceF->nx,
-      parameterDeviceF->ny,
-      offsetFC);
-   getLastCudaError("scaleFC_RhoSq_27 execution failed");
+    dim3 grid = vf::cuda::getCudaGrid(parameterDeviceC->numberofthreads,  icellFC->kFC); // grid from C-level thread count and kFC
+    dim3 threads(parameterDeviceC->numberofthreads, 1, 1 ); // one-dimensional thread block
+    // Launch on `stream`; third launch parameter 0 means no dynamic shared memory is requested.
+    scaleFC_RhoSq_comp_27<<<grid, threads, 0, stream>>>(
+        parameterDeviceC->distributions.f[0], // distributions: parameterDeviceC first, then parameterDeviceF
+        parameterDeviceF->distributions.f[0],
+        parameterDeviceC->neighborX, // neighbor arrays of parameterDeviceC
+        parameterDeviceC->neighborY,
+        parameterDeviceC->neighborZ,
+        parameterDeviceF->neighborX, // neighbor arrays of parameterDeviceF
+        parameterDeviceF->neighborY,
+        parameterDeviceF->neighborZ,
+        parameterDeviceC->numberOfNodes,
+        parameterDeviceF->numberOfNodes,
+        parameterDeviceC->isEvenTimestep, // timestep parity is read from parameterDeviceC only
+        icellFC->ICellFCC,
+        icellFC->ICellFCF,
+        icellFC->kFC, // same count that sized the grid above
+        parameterDeviceC->omega,
+        parameterDeviceF->omega,
+        parameterDeviceC->vis, // viscosity is read from parameterDeviceC only
+        parameterDeviceC->nx,
+        parameterDeviceC->ny,
+        parameterDeviceF->nx,
+        parameterDeviceF->ny,
+        offsetFC);
+    getLastCudaError("scaleFC_RhoSq_comp_27 execution failed"); // message names the kernel launched above
 }
 //////////////////////////////////////////////////////////////////////////
 void ScaleFC_compressible(LBMSimulationParameter * parameterDeviceC, LBMSimulationParameter* parameterDeviceF, ICellFC * icellFC, OffFC &offsetFC, CUstream_st *stream)
 {
-   dim3 grid = vf::cuda::getCudaGrid(parameterDeviceC->numberofthreads,  icellFC->kFC);
-   dim3 threads(parameterDeviceC->numberofthreads, 1, 1 );
-
-   scaleFC_compressible<<<grid, threads, 0, stream>>>(
-      parameterDeviceC->distributions.f[0],
-      parameterDeviceF->distributions.f[0],
-      parameterDeviceC->neighborX,
-      parameterDeviceC->neighborY,
-      parameterDeviceC->neighborZ,
-      parameterDeviceF->neighborX,
-      parameterDeviceF->neighborY,
-      parameterDeviceF->neighborZ,
-      parameterDeviceC->numberOfNodes,
-      parameterDeviceF->numberOfNodes,
-      parameterDeviceC->isEvenTimestep,
-      icellFC->ICellFCC,
-      icellFC->ICellFCF,
-      icellFC->kFC,
-      parameterDeviceC->omega,
-      parameterDeviceF->omega,
-      offsetFC);
-   getLastCudaError("scaleFC_compressible execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void ScaleFC_RhoSq_3rdMom_comp_27( real* DC,
-											  real* DF,
-											  unsigned int* neighborCX,
-											  unsigned int* neighborCY,
-											  unsigned int* neighborCZ,
-											  unsigned int* neighborFX,
-											  unsigned int* neighborFY,
-											  unsigned int* neighborFZ,
-											  unsigned int size_MatC,
-											  unsigned int size_MatF,
-											  bool isEvenTimestep,
-											  unsigned int* posC,
-											  unsigned int* posFSWB,
-											  unsigned int kFC,
-											  real omCoarse,
-											  real omFine,
-											  real nu,
-											  unsigned int nxC,
-											  unsigned int nyC,
-											  unsigned int nxF,
-											  unsigned int nyF,
-											  unsigned int numberOfThreads,
-											  OffFC offFC,
-                                   CUstream_st *stream)
-{
-   int Grid = (kFC / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_FC(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleFC_RhoSq_3rdMom_comp_27<<< gridINT_FC, threads, 0, stream >>>(DC,
-															  DF,
-															  neighborCX,
-															  neighborCY,
-															  neighborCZ,
-															  neighborFX,
-															  neighborFY,
-															  neighborFZ,
-															  size_MatC,
-															  size_MatF,
-															  isEvenTimestep,
-															  posC,
-															  posFSWB,
-															  kFC,
-															  omCoarse,
-															  omFine,
-															  nu,
-															  nxC,
-															  nyC,
-															  nxF,
-															  nyF,
-															  offFC);
-      getLastCudaError("scaleFC_RhoSq_3rdMom_comp_27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void ScaleFC_AA2016_comp_27( real* DC,
-										real* DF,
-										unsigned int* neighborCX,
-										unsigned int* neighborCY,
-										unsigned int* neighborCZ,
-										unsigned int* neighborFX,
-										unsigned int* neighborFY,
-										unsigned int* neighborFZ,
-										unsigned int size_MatC,
-										unsigned int size_MatF,
-										bool isEvenTimestep,
-										unsigned int* posC,
-										unsigned int* posFSWB,
-										unsigned int kFC,
-										real omCoarse,
-										real omFine,
-										real nu,
-										unsigned int nxC,
-										unsigned int nyC,
-										unsigned int nxF,
-										unsigned int nyF,
-										unsigned int numberOfThreads,
-										OffFC offFC,
-                              CUstream_st *stream)
-{
-   int Grid = (kFC / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_FC(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleFC_AA2016_comp_27<<< gridINT_FC, threads, 0, stream >>>(DC,
-														DF,
-														neighborCX,
-														neighborCY,
-														neighborCZ,
-														neighborFX,
-														neighborFY,
-														neighborFZ,
-														size_MatC,
-														size_MatF,
-														isEvenTimestep,
-														posC,
-														posFSWB,
-														kFC,
-														omCoarse,
-														omFine,
-														nu,
-														nxC,
-														nyC,
-														nxF,
-														nyF,
-														offFC);
-      getLastCudaError("scaleFC_AA2016_comp_27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void ScaleFC_NSPress_27(real* DC,
-								  real* DF,
-								  unsigned int* neighborCX,
-								  unsigned int* neighborCY,
-								  unsigned int* neighborCZ,
-								  unsigned int* neighborFX,
-								  unsigned int* neighborFY,
-								  unsigned int* neighborFZ,
-								  unsigned int size_MatC,
-								  unsigned int size_MatF,
-								  bool isEvenTimestep,
-								  unsigned int* posC,
-								  unsigned int* posFSWB,
-								  unsigned int kFC,
-								  real omCoarse,
-								  real omFine,
-								  real nu,
-								  unsigned int nxC,
-								  unsigned int nyC,
-								  unsigned int nxF,
-								  unsigned int nyF,
-								  unsigned int numberOfThreads,
-								  OffFC offFC)
-{
-   int Grid = (kFC / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_FC(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleFC_NSPress_27<<< gridINT_FC, threads >>> (  DC,
-													   DF,
-													   neighborCX,
-													   neighborCY,
-													   neighborCZ,
-													   neighborFX,
-													   neighborFY,
-													   neighborFZ,
-													   size_MatC,
-													   size_MatF,
-													   isEvenTimestep,
-													   posC,
-													   posFSWB,
-													   kFC,
-													   omCoarse,
-													   omFine,
-													   nu,
-													   nxC,
-													   nyC,
-													   nxF,
-													   nyF,
-													   offFC);
-      getLastCudaError("scaleFC_Fix_27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void ScaleFCThSMG7(real* DC,
-                              real* DF,
-                              real* DD7C,
-                              real* DD7F,
-                              unsigned int* neighborCX,
-                              unsigned int* neighborCY,
-                              unsigned int* neighborCZ,
-                              unsigned int* neighborFX,
-                              unsigned int* neighborFY,
-                              unsigned int* neighborFZ,
-                              unsigned int size_MatC,
-                              unsigned int size_MatF,
-                              bool isEvenTimestep,
-                              unsigned int* posC,
-                              unsigned int* posFSWB,
-                              unsigned int kFC,
-                              real nu,
-                              real diffusivity_coarse,
-                              unsigned int numberOfThreads,
-                              OffFC offFC)
-{
-   int Grid = (kFC / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_FC(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleFCThSMG7<<< gridINT_FC, threads >>>( DC,
-                                                DF,
-                                                DD7C,
-                                                DD7F,
-                                                neighborCX,
-                                                neighborCY,
-                                                neighborCZ,
-                                                neighborFX,
-                                                neighborFY,
-                                                neighborFZ,
-                                                size_MatC,
-                                                size_MatF,
-                                                isEvenTimestep,
-                                                posC,
-                                                posFSWB,
-                                                kFC,
-                                                nu,
-                                                diffusivity_coarse,
-                                                offFC);
-      getLastCudaError("scaleFCThSMG7 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void ScaleFCThS7(  real* DC,
-                              real* DF,
-                              real* DD7C,
-                              real* DD7F,
-                              unsigned int* neighborCX,
-                              unsigned int* neighborCY,
-                              unsigned int* neighborCZ,
-                              unsigned int* neighborFX,
-                              unsigned int* neighborFY,
-                              unsigned int* neighborFZ,
-                              unsigned int size_MatC,
-                              unsigned int size_MatF,
-                              bool isEvenTimestep,
-                              unsigned int* posC,
-                              unsigned int* posFSWB,
-                              unsigned int kFC,
-                              real nu,
-                              real diffusivity_coarse,
-                              unsigned int numberOfThreads)
-{
-   int Grid = (kFC / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_FC(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleFCThS7<<< gridINT_FC, threads >>>(DC,
-                                             DF,
-                                             DD7C,
-                                             DD7F,
-                                             neighborCX,
-                                             neighborCY,
-                                             neighborCZ,
-                                             neighborFX,
-                                             neighborFY,
-                                             neighborFZ,
-                                             size_MatC,
-                                             size_MatF,
-                                             isEvenTimestep,
-                                             posC,
-                                             posFSWB,
-                                             kFC,
-                                             nu,
-                                             diffusivity_coarse);
-      getLastCudaError("scaleFCThS7 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void ScaleFCThS27( real* DC,
-                              real* DF,
-                              real* DD27C,
-                              real* DD27F,
-                              unsigned int* neighborCX,
-                              unsigned int* neighborCY,
-                              unsigned int* neighborCZ,
-                              unsigned int* neighborFX,
-                              unsigned int* neighborFY,
-                              unsigned int* neighborFZ,
-                              unsigned int size_MatC,
-                              unsigned int size_MatF,
-                              bool isEvenTimestep,
-                              unsigned int* posC,
-                              unsigned int* posFSWB,
-                              unsigned int kFC,
-                              real nu,
-                              real diffusivity_coarse,
-                              unsigned int numberOfThreads,
-							  OffFC offFC)
-{
-   int Grid = (kFC / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_FC(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleFCThS27<<< gridINT_FC, threads >>>(  DC,
-                                                DF,
-                                                DD27C,
-                                                DD27F,
-                                                neighborCX,
-                                                neighborCY,
-                                                neighborCZ,
-                                                neighborFX,
-                                                neighborFY,
-                                                neighborFZ,
-                                                size_MatC,
-                                                size_MatF,
-                                                isEvenTimestep,
-                                                posC,
-                                                posFSWB,
-                                                kFC,
-                                                nu,
-                                                diffusivity_coarse,
-												offFC);
-      getLastCudaError("scaleFCThS27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void DragLiftPostD27(real* DD,
-								int* k_Q,
-								real* QQ,
-								int numberOfBCnodes,
-								double *DragX,
-								double *DragY,
-								double *DragZ,
-								unsigned int* neighborX,
-								unsigned int* neighborY,
-								unsigned int* neighborZ,
-								unsigned int size_Mat,
-								bool isEvenTimestep,
-								unsigned int numberOfThreads)
-{
-	int Grid = (numberOfBCnodes / numberOfThreads)+1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid/Grid1)+1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1 );
-
-	DragLiftPost27<<< grid, threads >>>(DD,
-										k_Q,
-										QQ,
-										numberOfBCnodes,
-										DragX,
-										DragY,
-										DragZ,
-										neighborX,
-										neighborY,
-										neighborZ,
-										size_Mat,
-										isEvenTimestep);
-	getLastCudaError("DragLift27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void DragLiftPreD27( real* DD,
-								int* k_Q,
-								real* QQ,
-								int numberOfBCnodes,
-								double *DragX,
-								double *DragY,
-								double *DragZ,
-								unsigned int* neighborX,
-								unsigned int* neighborY,
-								unsigned int* neighborZ,
-								unsigned int size_Mat,
-								bool isEvenTimestep,
-								unsigned int numberOfThreads)
-{
-	int Grid = (numberOfBCnodes / numberOfThreads)+1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid/Grid1)+1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1 );
-
-	DragLiftPre27<<< grid, threads >>>( DD,
-										k_Q,
-										QQ,
-										numberOfBCnodes,
-										DragX,
-										DragY,
-										DragZ,
-										neighborX,
-										neighborY,
-										neighborZ,
-										size_Mat,
-										isEvenTimestep);
-	getLastCudaError("DragLift27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void CalcCPtop27(real* DD,
-							int* cpIndex,
-							int nonCp,
-							double *cpPress,
-							unsigned int* neighborX,
-							unsigned int* neighborY,
-							unsigned int* neighborZ,
-							unsigned int size_Mat,
-							bool isEvenTimestep,
-							unsigned int numberOfThreads)
-{
-	int Grid = (nonCp / numberOfThreads)+1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid/Grid1)+1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1 );
-
-	CalcCP27<<< grid, threads >>>(DD,
-								  cpIndex,
-								  nonCp,
-								  cpPress,
-								  neighborX,
-								  neighborY,
-								  neighborZ,
-								  size_Mat,
-								  isEvenTimestep);
-	getLastCudaError("CalcCP27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void CalcCPbottom27( real* DD,
-								int* cpIndex,
-								int nonCp,
-								double *cpPress,
-								unsigned int* neighborX,
-								unsigned int* neighborY,
-								unsigned int* neighborZ,
-								unsigned int size_Mat,
-								bool isEvenTimestep,
-								unsigned int numberOfThreads)
-{
-	int Grid = (nonCp / numberOfThreads)+1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid/Grid1)+1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1 );
-
-	CalcCP27<<< grid, threads >>>(DD,
-								  cpIndex,
-								  nonCp,
-								  cpPress,
-								  neighborX,
-								  neighborY,
-								  neighborZ,
-								  size_Mat,
-								  isEvenTimestep);
-	getLastCudaError("CalcCP27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void GetSendFsPreDev27(real* DD,
-								  real* bufferFs,
-								  int* sendIndex,
-								  int buffmax,
-								  unsigned int* neighborX,
-								  unsigned int* neighborY,
-								  unsigned int* neighborZ,
-								  unsigned int size_Mat,
-								  bool isEvenTimestep,
-								  unsigned int numberOfThreads,
-								  cudaStream_t stream)
-{
-	int Grid = (buffmax / numberOfThreads)+1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid/Grid1)+1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1 );
-
-	getSendFsPre27<<< grid, threads, 0, stream >>>(DD,
-										bufferFs,
-										sendIndex,
-										buffmax,
-										neighborX,
-										neighborY,
-										neighborZ,
-										size_Mat,
-										isEvenTimestep);
-	getLastCudaError("getSendFsPre27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void GetSendFsPostDev27(real* DD,
-								   real* bufferFs,
-								   int* sendIndex,
-								   int buffmax,
-								   unsigned int* neighborX,
-								   unsigned int* neighborY,
-								   unsigned int* neighborZ,
-								   unsigned int size_Mat,
-								   bool isEvenTimestep,
-								   unsigned int numberOfThreads,
-								   cudaStream_t stream)
-{
-	int Grid = (buffmax / numberOfThreads)+1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid/Grid1)+1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1 );
-
-	getSendFsPost27<<< grid, threads, 0, stream >>>(DD,
-										 bufferFs,
-										 sendIndex,
-										 buffmax,
-										 neighborX,
-										 neighborY,
-										 neighborZ,
-										 size_Mat,
-										 isEvenTimestep);
-	getLastCudaError("getSendFsPost27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void SetRecvFsPreDev27(real* DD,
-								  real* bufferFs,
-								  int* recvIndex,
-								  int buffmax,
-								  unsigned int* neighborX,
-								  unsigned int* neighborY,
-								  unsigned int* neighborZ,
-								  unsigned int size_Mat,
-								  bool isEvenTimestep,
-								  unsigned int numberOfThreads,
-	                              cudaStream_t stream)
-{
-	int Grid = (buffmax / numberOfThreads)+1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid/Grid1)+1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1 );
-
-	setRecvFsPre27<<< grid, threads, 0, stream >>>(DD,
-										bufferFs,
-										recvIndex,
-										buffmax,
-										neighborX,
-										neighborY,
-										neighborZ,
-										size_Mat,
-										isEvenTimestep);
-	getLastCudaError("setRecvFsPre27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void SetRecvFsPostDev27(real* DD,
-								   real* bufferFs,
-								   int* recvIndex,
-								   int buffmax,
-								   unsigned int* neighborX,
-								   unsigned int* neighborY,
-								   unsigned int* neighborZ,
-								   unsigned int size_Mat,
-								   bool isEvenTimestep,
-	                               unsigned int numberOfThreads,
-	                               cudaStream_t stream)
-{
-	int Grid = (buffmax / numberOfThreads)+1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid/Grid1)+1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1 );
-
-	setRecvFsPost27<<< grid, threads, 0, stream >>>(DD,
-										 bufferFs,
-										 recvIndex,
-										 buffmax,
-										 neighborX,
-										 neighborY,
-										 neighborZ,
-										 size_Mat,
-										 isEvenTimestep);
-	getLastCudaError("setRecvFsPost27 execution failed");
+    dim3 grid = vf::cuda::getCudaGrid(parameterDeviceC->numberofthreads,  icellFC->kFC);
+    dim3 threads(parameterDeviceC->numberofthreads, 1, 1 );
+
+    scaleFC_compressible<<<grid, threads, 0, stream>>>(
+        parameterDeviceC->distributions.f[0],
+        parameterDeviceF->distributions.f[0],
+        parameterDeviceC->neighborX,
+        parameterDeviceC->neighborY,
+        parameterDeviceC->neighborZ,
+        parameterDeviceF->neighborX,
+        parameterDeviceF->neighborY,
+        parameterDeviceF->neighborZ,
+        parameterDeviceC->numberOfNodes,
+        parameterDeviceF->numberOfNodes,
+        parameterDeviceC->isEvenTimestep,
+        icellFC->ICellFCC,
+        icellFC->ICellFCF,
+        icellFC->kFC,
+        parameterDeviceC->omega,
+        parameterDeviceF->omega,
+        offsetFC);
+    getLastCudaError("scaleFC_compressible execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+void ScaleFC_RhoSq_3rdMom_comp_27( // Host-side launcher for the scaleFC_RhoSq_3rdMom_comp_27 kernel.
+    real* DC, // every parameter except numberOfThreads and stream is forwarded to the kernel unchanged
+    real* DF,
+    unsigned int* neighborCX,
+    unsigned int* neighborCY,
+    unsigned int* neighborCZ,
+    unsigned int* neighborFX,
+    unsigned int* neighborFY,
+    unsigned int* neighborFZ,
+    unsigned long long numberOfLBnodesC,
+    unsigned long long numberOfLBnodesF,
+    bool isEvenTimestep,
+    unsigned int* posC,
+    unsigned int* posFSWB,
+    unsigned int kFC, // forwarded to the kernel and also used to build the launch configuration
+    real omCoarse,
+    real omFine,
+    real nu,
+    unsigned int nxC,
+    unsigned int nyC,
+    unsigned int nxF,
+    unsigned int nyF,
+    unsigned int numberOfThreads, // launch configuration only (presumably threads per block -- confirm in vf::cuda::CudaGrid)
+    OffFC offFC,
+    CUstream_st *stream) // CUDA stream the kernel is launched on
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kFC); // launch configuration built from numberOfThreads and kFC
+
+    scaleFC_RhoSq_3rdMom_comp_27<<< grid.grid, grid.threads, 0, stream >>>( // 0 bytes of dynamic shared memory, launched on 'stream'
+        DC,
+        DF,
+        neighborCX,
+        neighborCY,
+        neighborCZ,
+        neighborFX,
+        neighborFY,
+        neighborFZ,
+        numberOfLBnodesC,
+        numberOfLBnodesF,
+        isEvenTimestep,
+        posC,
+        posFSWB,
+        kFC,
+        omCoarse,
+        omFine,
+        nu,
+        nxC,
+        nyC,
+        nxF,
+        nyF,
+        offFC);
+    getLastCudaError("scaleFC_RhoSq_3rdMom_comp_27 execution failed"); // error check right after the launch (helper defined elsewhere)
+}
+//////////////////////////////////////////////////////////////////////////
+void ScaleFC_AA2016_comp_27( // Host-side launcher for the scaleFC_AA2016_comp_27 kernel.
+    real* DC, // every parameter except numberOfThreads and stream is forwarded to the kernel unchanged
+    real* DF,
+    unsigned int* neighborCX,
+    unsigned int* neighborCY,
+    unsigned int* neighborCZ,
+    unsigned int* neighborFX,
+    unsigned int* neighborFY,
+    unsigned int* neighborFZ,
+    unsigned long long numberOfLBnodesC,
+    unsigned long long numberOfLBnodesF,
+    bool isEvenTimestep,
+    unsigned int* posC,
+    unsigned int* posFSWB,
+    unsigned int kFC, // forwarded to the kernel and also used to build the launch configuration
+    real omCoarse,
+    real omFine,
+    real nu,
+    unsigned int nxC,
+    unsigned int nyC,
+    unsigned int nxF,
+    unsigned int nyF,
+    unsigned int numberOfThreads, // launch configuration only (presumably threads per block -- confirm in vf::cuda::CudaGrid)
+    OffFC offFC,
+    CUstream_st *stream) // CUDA stream the kernel is launched on
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kFC); // launch configuration built from numberOfThreads and kFC
+
+    scaleFC_AA2016_comp_27<<< grid.grid, grid.threads, 0, stream >>>( // 0 bytes of dynamic shared memory, launched on 'stream'
+        DC,
+        DF,
+        neighborCX,
+        neighborCY,
+        neighborCZ,
+        neighborFX,
+        neighborFY,
+        neighborFZ,
+        numberOfLBnodesC,
+        numberOfLBnodesF,
+        isEvenTimestep,
+        posC,
+        posFSWB,
+        kFC,
+        omCoarse,
+        omFine,
+        nu,
+        nxC,
+        nyC,
+        nxF,
+        nyF,
+        offFC);
+    getLastCudaError("scaleFC_AA2016_comp_27 execution failed"); // error check right after the launch (helper defined elsewhere)
+}
+//////////////////////////////////////////////////////////////////////////
+void ScaleFC_NSPress_27( // Host-side launcher for the scaleFC_NSPress_27 kernel.
+    real* DC, // every parameter except numberOfThreads is forwarded to the kernel unchanged
+    real* DF,
+    unsigned int* neighborCX,
+    unsigned int* neighborCY,
+    unsigned int* neighborCZ,
+    unsigned int* neighborFX,
+    unsigned int* neighborFY,
+    unsigned int* neighborFZ,
+    unsigned long long numberOfLBnodesC,
+    unsigned long long numberOfLBnodesF,
+    bool isEvenTimestep,
+    unsigned int* posC,
+    unsigned int* posFSWB,
+    unsigned int kFC, // forwarded to the kernel and also used to build the launch configuration
+    real omCoarse,
+    real omFine,
+    real nu,
+    unsigned int nxC,
+    unsigned int nyC,
+    unsigned int nxF,
+    unsigned int nyF,
+    unsigned int numberOfThreads, // launch configuration only (presumably threads per block -- confirm in vf::cuda::CudaGrid)
+    OffFC offFC)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kFC); // launch configuration built from numberOfThreads and kFC
+
+    scaleFC_NSPress_27<<< grid.grid, grid.threads >>> ( // no stream argument: launched on the default stream
+        DC,
+        DF,
+        neighborCX,
+        neighborCY,
+        neighborCZ,
+        neighborFX,
+        neighborFY,
+        neighborFZ,
+        numberOfLBnodesC,
+        numberOfLBnodesF,
+        isEvenTimestep,
+        posC,
+        posFSWB,
+        kFC,
+        omCoarse,
+        omFine,
+        nu,
+        nxC,
+        nyC,
+        nxF,
+        nyF,
+        offFC);
+    getLastCudaError("scaleFC_NSPress_27 execution failed"); // error check right after the launch (helper defined elsewhere)
+}
+//////////////////////////////////////////////////////////////////////////
+void ScaleFCThSMG7( // Host-side launcher for the scaleFCThSMG7 kernel.
+    real* DC, // every parameter except numberOfThreads is forwarded to the kernel unchanged
+    real* DF,
+    real* DD7C,
+    real* DD7F,
+    unsigned int* neighborCX,
+    unsigned int* neighborCY,
+    unsigned int* neighborCZ,
+    unsigned int* neighborFX,
+    unsigned int* neighborFY,
+    unsigned int* neighborFZ,
+    unsigned long long numberOfLBnodesC,
+    unsigned long long numberOfLBnodesF,
+    bool isEvenTimestep,
+    unsigned int* posC,
+    unsigned int* posFSWB,
+    unsigned int kFC, // forwarded to the kernel and also used to build the launch configuration
+    real nu,
+    real diffusivity_coarse,
+    unsigned int numberOfThreads, // launch configuration only (presumably threads per block -- confirm in vf::cuda::CudaGrid)
+    OffFC offFC)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kFC); // launch configuration built from numberOfThreads and kFC
+
+    scaleFCThSMG7<<< grid.grid, grid.threads >>>( // no stream argument: launched on the default stream
+        DC,
+        DF,
+        DD7C,
+        DD7F,
+        neighborCX,
+        neighborCY,
+        neighborCZ,
+        neighborFX,
+        neighborFY,
+        neighborFZ,
+        numberOfLBnodesC,
+        numberOfLBnodesF,
+        isEvenTimestep,
+        posC,
+        posFSWB,
+        kFC,
+        nu,
+        diffusivity_coarse,
+        offFC);
+    getLastCudaError("scaleFCThSMG7 execution failed"); // error check right after the launch (helper defined elsewhere)
+}
+//////////////////////////////////////////////////////////////////////////
+void ScaleFCThS7( // Host-side launcher for the scaleFCThS7 kernel.
+    real* DC, // every parameter except numberOfThreads is forwarded to the kernel unchanged
+    real* DF,
+    real* DD7C,
+    real* DD7F,
+    unsigned int* neighborCX,
+    unsigned int* neighborCY,
+    unsigned int* neighborCZ,
+    unsigned int* neighborFX,
+    unsigned int* neighborFY,
+    unsigned int* neighborFZ,
+    unsigned long long numberOfLBnodesC,
+    unsigned long long numberOfLBnodesF,
+    bool isEvenTimestep,
+    unsigned int* posC,
+    unsigned int* posFSWB,
+    unsigned int kFC, // forwarded to the kernel and also used to build the launch configuration
+    real nu,
+    real diffusivity_coarse,
+    unsigned int numberOfThreads) // launch configuration only (presumably threads per block -- confirm in vf::cuda::CudaGrid)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kFC); // launch configuration built from numberOfThreads and kFC
+
+    scaleFCThS7<<< grid.grid, grid.threads >>>( // no stream argument: launched on the default stream
+        DC,
+        DF,
+        DD7C,
+        DD7F,
+        neighborCX,
+        neighborCY,
+        neighborCZ,
+        neighborFX,
+        neighborFY,
+        neighborFZ,
+        numberOfLBnodesC,
+        numberOfLBnodesF,
+        isEvenTimestep,
+        posC,
+        posFSWB,
+        kFC,
+        nu,
+        diffusivity_coarse);
+    getLastCudaError("scaleFCThS7 execution failed"); // error check right after the launch (helper defined elsewhere)
+}
+//////////////////////////////////////////////////////////////////////////
+void ScaleFCThS27( // Host-side launcher for the scaleFCThS27 kernel.
+    real* DC, // every parameter except numberOfThreads is forwarded to the kernel unchanged
+    real* DF,
+    real* DD27C,
+    real* DD27F,
+    unsigned int* neighborCX,
+    unsigned int* neighborCY,
+    unsigned int* neighborCZ,
+    unsigned int* neighborFX,
+    unsigned int* neighborFY,
+    unsigned int* neighborFZ,
+    unsigned long long numberOfLBnodesC,
+    unsigned long long numberOfLBnodesF,
+    bool isEvenTimestep,
+    unsigned int* posC,
+    unsigned int* posFSWB,
+    unsigned int kFC, // forwarded to the kernel and also used to build the launch configuration
+    real nu,
+    real diffusivity_coarse,
+    unsigned int numberOfThreads, // launch configuration only (presumably threads per block -- confirm in vf::cuda::CudaGrid)
+    OffFC offFC)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kFC); // launch configuration built from numberOfThreads and kFC
+
+    scaleFCThS27<<< grid.grid, grid.threads >>>( // no stream argument: launched on the default stream
+        DC,
+        DF,
+        DD27C,
+        DD27F,
+        neighborCX,
+        neighborCY,
+        neighborCZ,
+        neighborFX,
+        neighborFY,
+        neighborFZ,
+        numberOfLBnodesC,
+        numberOfLBnodesF,
+        isEvenTimestep,
+        posC,
+        posFSWB,
+        kFC,
+        nu,
+        diffusivity_coarse,
+        offFC);
+    getLastCudaError("scaleFCThS27 execution failed"); // error check right after the launch (helper defined elsewhere)
+}
+//////////////////////////////////////////////////////////////////////////
+void DragLiftPostD27( // Host-side launcher for the DragLiftPost27 kernel.
+    real* DD, // every parameter except numberOfThreads is forwarded to the kernel unchanged
+    int* k_Q,
+    real* QQ,
+    int numberOfBCnodes, // forwarded to the kernel and also used to build the launch configuration
+    double *DragX,
+    double *DragY,
+    double *DragZ,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep,
+    unsigned int numberOfThreads) // launch configuration only (presumably threads per block -- confirm in vf::cuda::CudaGrid)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes); // launch configuration built from numberOfThreads and numberOfBCnodes
+
+    DragLiftPost27<<< grid.grid, grid.threads >>>( // no stream argument: launched on the default stream
+        DD,
+        k_Q,
+        QQ,
+        numberOfBCnodes,
+        DragX,
+        DragY,
+        DragZ,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("DragLiftPost27 execution failed"); // error check right after the launch (helper defined elsewhere)
+}
+//////////////////////////////////////////////////////////////////////////
+void DragLiftPreD27( // Host-side launcher for the DragLiftPre27 kernel.
+    real* DD, // every parameter except numberOfThreads is forwarded to the kernel unchanged
+    int* k_Q,
+    real* QQ,
+    int numberOfBCnodes, // forwarded to the kernel and also used to build the launch configuration
+    double *DragX,
+    double *DragY,
+    double *DragZ,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep,
+    unsigned int numberOfThreads) // launch configuration only (presumably threads per block -- confirm in vf::cuda::CudaGrid)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes); // launch configuration built from numberOfThreads and numberOfBCnodes
+
+    DragLiftPre27<<< grid.grid, grid.threads >>>( // no stream argument: launched on the default stream
+        DD,
+        k_Q,
+        QQ,
+        numberOfBCnodes,
+        DragX,
+        DragY,
+        DragZ,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("DragLiftPre27 execution failed"); // error check right after the launch (helper defined elsewhere)
+}
+//////////////////////////////////////////////////////////////////////////
+void CalcCPtop27( // Host-side launcher for the CalcCP27 kernel (the same kernel CalcCPbottom27 launches).
+    real* DD, // every parameter except numberOfThreads is forwarded to the kernel unchanged
+    int* cpIndex,
+    int nonCp, // forwarded to the kernel and also used to build the launch configuration
+    double *cpPress,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep,
+    unsigned int numberOfThreads) // launch configuration only (presumably threads per block -- confirm in vf::cuda::CudaGrid)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, nonCp); // launch configuration built from numberOfThreads and nonCp
+
+    CalcCP27<<< grid.grid, grid.threads >>>( // no stream argument: launched on the default stream
+        DD,
+        cpIndex,
+        nonCp,
+        cpPress,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("CalcCP27 execution failed"); // error check right after the launch (helper defined elsewhere)
+}
+//////////////////////////////////////////////////////////////////////////
+void CalcCPbottom27( // Host-side launcher for the CalcCP27 kernel (the same kernel CalcCPtop27 launches).
+    real* DD, // every parameter except numberOfThreads is forwarded to the kernel unchanged
+    int* cpIndex,
+    int nonCp, // forwarded to the kernel and also used to build the launch configuration
+    double *cpPress,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep,
+    unsigned int numberOfThreads) // launch configuration only (presumably threads per block -- confirm in vf::cuda::CudaGrid)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, nonCp); // launch configuration built from numberOfThreads and nonCp
+
+    CalcCP27<<< grid.grid, grid.threads >>>( // no stream argument: launched on the default stream
+        DD,
+        cpIndex,
+        nonCp,
+        cpPress,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("CalcCP27 execution failed"); // error check right after the launch (helper defined elsewhere)
+}
+//////////////////////////////////////////////////////////////////////////
+void GetSendFsPreDev27( // Host-side launcher for the getSendFsPre27 kernel.
+    real* DD, // every parameter except numberOfThreads and stream is forwarded to the kernel unchanged
+    real* bufferFs,
+    int* sendIndex,
+    int buffmax, // forwarded to the kernel and also used to build the launch configuration
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep,
+    unsigned int numberOfThreads, // launch configuration only (presumably threads per block -- confirm in vf::cuda::CudaGrid)
+    cudaStream_t stream) // CUDA stream the kernel is launched on
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, buffmax); // launch configuration built from numberOfThreads and buffmax
+
+    getSendFsPre27<<< grid.grid, grid.threads, 0, stream >>>( // 0 bytes of dynamic shared memory, launched on 'stream'
+        DD,
+        bufferFs,
+        sendIndex,
+        buffmax,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("getSendFsPre27 execution failed"); // error check right after the launch (helper defined elsewhere)
+}
+//////////////////////////////////////////////////////////////////////////
+void GetSendFsPostDev27( // Host-side launcher for the getSendFsPost27 kernel.
+    real* DD, // every parameter except numberOfThreads and stream is forwarded to the kernel unchanged
+    real* bufferFs,
+    int* sendIndex,
+    int buffmax, // forwarded to the kernel and also used to build the launch configuration
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep,
+    unsigned int numberOfThreads, // launch configuration only (presumably threads per block -- confirm in vf::cuda::CudaGrid)
+    cudaStream_t stream) // CUDA stream the kernel is launched on
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, buffmax); // launch configuration built from numberOfThreads and buffmax
+
+    getSendFsPost27<<< grid.grid, grid.threads, 0, stream >>>( // 0 bytes of dynamic shared memory, launched on 'stream'
+        DD,
+        bufferFs,
+        sendIndex,
+        buffmax,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("getSendFsPost27 execution failed"); // error check right after the launch (helper defined elsewhere)
+}
+//////////////////////////////////////////////////////////////////////////
+void SetRecvFsPreDev27( // Host-side launcher for the setRecvFsPre27 kernel.
+    real* DD, // every parameter except numberOfThreads and stream is forwarded to the kernel unchanged
+    real* bufferFs,
+    int* recvIndex,
+    int buffmax, // forwarded to the kernel and also used to build the launch configuration
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep,
+    unsigned int numberOfThreads, // launch configuration only (presumably threads per block -- confirm in vf::cuda::CudaGrid)
+    cudaStream_t stream) // CUDA stream the kernel is launched on
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, buffmax); // launch configuration built from numberOfThreads and buffmax
+
+    setRecvFsPre27<<< grid.grid, grid.threads, 0, stream >>>( // 0 bytes of dynamic shared memory, launched on 'stream'
+        DD,
+        bufferFs,
+        recvIndex,
+        buffmax,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("setRecvFsPre27 execution failed"); // error check right after the launch (helper defined elsewhere)
+}
+//////////////////////////////////////////////////////////////////////////
+void SetRecvFsPostDev27( // Host-side launcher for the setRecvFsPost27 kernel.
+    real* DD, // every parameter except numberOfThreads and stream is forwarded to the kernel unchanged
+    real* bufferFs,
+    int* recvIndex,
+    int buffmax, // forwarded to the kernel and also used to build the launch configuration
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep,
+    unsigned int numberOfThreads, // launch configuration only (presumably threads per block -- confirm in vf::cuda::CudaGrid)
+    cudaStream_t stream) // CUDA stream the kernel is launched on
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, buffmax); // launch configuration built from numberOfThreads and buffmax
+
+    setRecvFsPost27<<< grid.grid, grid.threads, 0, stream >>>( // 0 bytes of dynamic shared memory, launched on 'stream'
+        DD,
+        bufferFs,
+        recvIndex,
+        buffmax,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("setRecvFsPost27 execution failed"); // error check right after the launch (helper defined elsewhere)
 }
 //////////////////////////////////////////////////////////////////////////
 void getSendGsDevF3(
-	real* G6,
-	real* bufferGs,
-	int* sendIndex,
-	int buffmax,
-	unsigned int* neighborX,
-	unsigned int* neighborY,
-	unsigned int* neighborZ,
-	unsigned int size_Mat,
-	bool isEvenTimestep,
-	unsigned int numberOfThreads)
-{
-	int Grid = (buffmax / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid > 512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	getSendGsF3 <<< grid, threads >>> (
-		G6,
-		bufferGs,
-		sendIndex,
-		buffmax,
-		neighborX,
-		neighborY,
-		neighborZ,
-		size_Mat,
-		isEvenTimestep);
-	getLastCudaError("getSendGsF3 execution failed");
+    real* G6, // every parameter except numberOfThreads is forwarded to the getSendGsF3 kernel unchanged
+    real* bufferGs,
+    int* sendIndex,
+    int buffmax, // forwarded to the kernel and also used to build the launch configuration
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep,
+    unsigned int numberOfThreads) // launch configuration only (presumably threads per block -- confirm in vf::cuda::CudaGrid)
+{ // Host-side launcher body: build the launch configuration, launch getSendGsF3, check for errors.
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, buffmax); // launch configuration built from numberOfThreads and buffmax
+
+    getSendGsF3 <<< grid.grid, grid.threads >>> ( // no stream argument: launched on the default stream
+        G6,
+        bufferGs,
+        sendIndex,
+        buffmax,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("getSendGsF3 execution failed"); // error check right after the launch (helper defined elsewhere)
 }
 //////////////////////////////////////////////////////////////////////////
 void setRecvGsDevF3(
-	real* G6,
-	real* bufferGs,
-	int* recvIndex,
-	int buffmax,
-	unsigned int* neighborX,
-	unsigned int* neighborY,
-	unsigned int* neighborZ,
-	unsigned int size_Mat,
-	bool isEvenTimestep,
-	unsigned int numberOfThreads)
-{
-	int Grid = (buffmax / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid > 512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	setRecvGsF3 <<< grid, threads >>> (
-		G6,
-		bufferGs,
-		recvIndex,
-		buffmax,
-		neighborX,
-		neighborY,
-		neighborZ,
-		size_Mat,
-		isEvenTimestep);
-	getLastCudaError("setRecvGsF3 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void WallFuncDev27(unsigned int numberOfThreads,
-							  real* vx,
-							  real* vy,
-							  real* vz,
-							  real* DD,
-							  int* k_Q,
-							  real* QQ,
-							  unsigned int numberOfBCnodes,
-							  real om1,
-							  unsigned int* neighborX,
-							  unsigned int* neighborY,
-							  unsigned int* neighborZ,
-							  unsigned int size_Mat,
-							  bool isEvenTimestep)
-{
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      WallFunction27<<< gridQ, threads >>> (
-											  vx,
-											  vy,
-											  vz,
-											  DD,
-											  k_Q,
-											  QQ,
-											  numberOfBCnodes,
-											  om1,
-											  neighborX,
-											  neighborY,
-											  neighborZ,
-											  size_Mat,
-											  isEvenTimestep);
-      getLastCudaError("WallFunction27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void SetOutputWallVelocitySP27(unsigned int numberOfThreads,
-										  real* vxD,
-										  real* vyD,
-										  real* vzD,
-										  real* vxWall,
-										  real* vyWall,
-										  real* vzWall,
-										  int numberOfWallNodes,
-										  int* kWallNodes,
-										  real* rhoD,
-										  real* pressD,
-										  unsigned int* geoD,
-										  unsigned int* neighborX,
-										  unsigned int* neighborY,
-										  unsigned int* neighborZ,
-										  unsigned int size_Mat,
-										  real* DD,
-										  bool isEvenTimestep)
-{
-   int Grid = (numberOfWallNodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      LBSetOutputWallVelocitySP27<<< gridQ, threads >>> (	vxD,
-															vyD,
-															vzD,
-															vxWall,
-															vyWall,
-															vzWall,
-															numberOfWallNodes,
-															kWallNodes,
-															rhoD,
-															pressD,
-															geoD,
-															neighborX,
-															neighborY,
-															neighborZ,
-															size_Mat,
-															DD,
-															isEvenTimestep);
-      getLastCudaError("LBSetOutputWallVelocitySP27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void GetVelotoForce27(unsigned int numberOfThreads,
-								 real* DD,
-								 int* bcIndex,
-								 int nonAtBC,
-								 real* Vx,
-								 real* Vy,
-								 real* Vz,
-								 unsigned int* neighborX,
-								 unsigned int* neighborY,
-								 unsigned int* neighborZ,
-								 unsigned int size_Mat,
-								 bool isEvenTimestep)
-{
-   int Grid = (nonAtBC / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      GetVeloforForcing27<<< gridQ, threads >>> (DD,
-												bcIndex,
-												nonAtBC,
-												Vx,
-												Vy,
-												Vz,
-												neighborX,
-												neighborY,
-												neighborZ,
-												size_Mat,
-												isEvenTimestep);
-      getLastCudaError("GetVeloforForcing27 execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void InitParticlesDevice(real* coordX,
-									real* coordY,
-									real* coordZ,
-									real* coordParticleXlocal,
-									real* coordParticleYlocal,
-									real* coordParticleZlocal,
-									real* coordParticleXglobal,
-									real* coordParticleYglobal,
-									real* coordParticleZglobal,
-									real* veloParticleX,
-									real* veloParticleY,
-									real* veloParticleZ,
-									real* randArray,
-									unsigned int* particleID,
-									unsigned int* cellBaseID,
-									unsigned int* bcMatD,
-									unsigned int* neighborX,
-									unsigned int* neighborY,
-									unsigned int* neighborZ,
-									unsigned int* neighborWSB,
-							        int level,
-									unsigned int numberOfParticles,
-									unsigned int size_Mat,
-									unsigned int numberOfThreads)
-{
-   int Grid = (numberOfParticles / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-   InitParticles<<< gridQ, threads >>> (coordX,
-										coordY,
-										coordZ,
-										coordParticleXlocal,
-										coordParticleYlocal,
-										coordParticleZlocal,
-										coordParticleXglobal,
-										coordParticleYglobal,
-										coordParticleZglobal,
-										veloParticleX,
-										veloParticleY,
-										veloParticleZ,
-										randArray,
-										particleID,
-										cellBaseID,
-										bcMatD,
-										neighborX,
-										neighborY,
-										neighborZ,
-										neighborWSB,
-										level,
-										numberOfParticles,
-										size_Mat);
-      getLastCudaError("InitParticles execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void MoveParticlesDevice(real* coordX,
-									real* coordY,
-									real* coordZ,
-									real* coordParticleXlocal,
-									real* coordParticleYlocal,
-									real* coordParticleZlocal,
-									real* coordParticleXglobal,
-									real* coordParticleYglobal,
-									real* coordParticleZglobal,
-									real* veloParticleX,
-									real* veloParticleY,
-									real* veloParticleZ,
-									real* DD,
-									real  omega,
-									unsigned int* particleID,
-									unsigned int* cellBaseID,
-									unsigned int* bcMatD,
-									unsigned int* neighborX,
-									unsigned int* neighborY,
-									unsigned int* neighborZ,
-									unsigned int* neighborWSB,
-							        int level,
-									unsigned int timestep,
-									unsigned int numberOfTimesteps,
-									unsigned int numberOfParticles,
-									unsigned int size_Mat,
-									unsigned int numberOfThreads,
-									bool isEvenTimestep)
-{
-   int Grid = (numberOfParticles / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-   MoveParticles<<< gridQ, threads >>> (coordX,
-										coordY,
-										coordZ,
-										coordParticleXlocal,
-										coordParticleYlocal,
-										coordParticleZlocal,
-										coordParticleXglobal,
-										coordParticleYglobal,
-										coordParticleZglobal,
-										veloParticleX,
-										veloParticleY,
-										veloParticleZ,
-										DD,
-										omega,
-										particleID,
-										cellBaseID,
-										bcMatD,
-										neighborX,
-										neighborY,
-										neighborZ,
-										neighborWSB,
-										level,
-										timestep,
-										numberOfTimesteps,
-										numberOfParticles,
-										size_Mat,
-										isEvenTimestep);
-      getLastCudaError("MoveParticles execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void initRandomDevice(curandState* state,
-								 unsigned int size_Mat,
-								 unsigned int numberOfThreads)
-{
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-   initRandom<<< gridQ, threads >>> (state);
-   getLastCudaError("initRandom execution failed");
-}
-//////////////////////////////////////////////////////////////////////////
-void generateRandomValuesDevice( curandState* state,
-											unsigned int size_Mat,
-											real* randArray,
-											unsigned int numberOfThreads)
-{
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-   generateRandomValues<<< gridQ, threads >>> (state,randArray);
-   getLastCudaError("generateRandomValues execution failed");
+    real* G6, // every parameter except numberOfThreads is forwarded to the setRecvGsF3 kernel unchanged
+    real* bufferGs,
+    int* recvIndex,
+    int buffmax, // forwarded to the kernel and also used to build the launch configuration
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep,
+    unsigned int numberOfThreads) // launch configuration only (presumably threads per block -- confirm in vf::cuda::CudaGrid)
+{ // Host-side launcher body: build the launch configuration, launch setRecvGsF3, check for errors.
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, buffmax); // launch configuration built from numberOfThreads and buffmax
+
+    setRecvGsF3 <<< grid.grid, grid.threads >>> ( // no stream argument: launched on the default stream
+        G6,
+        bufferGs,
+        recvIndex,
+        buffmax,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("setRecvGsF3 execution failed"); // error check right after the launch (helper defined elsewhere)
+}
+//////////////////////////////////////////////////////////////////////////
+void WallFuncDev27( // Host-side launcher for the WallFunction27 kernel.
+    unsigned int numberOfThreads, // launch configuration only (presumably threads per block -- confirm in vf::cuda::CudaGrid)
+    real* vx, // this and all following parameters are forwarded to the kernel unchanged
+    real* vy,
+    real* vz,
+    real* DD,
+    int* k_Q,
+    real* QQ,
+    unsigned int numberOfBCnodes, // forwarded to the kernel and also used to build the launch configuration
+    real om1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes); // launch configuration built from numberOfThreads and numberOfBCnodes
+
+    WallFunction27<<< grid.grid, grid.threads >>> ( // no stream argument: launched on the default stream
+        vx,
+        vy,
+        vz,
+        DD,
+        k_Q,
+        QQ,
+        numberOfBCnodes,
+        om1,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("WallFunction27 execution failed"); // error check right after the launch (helper defined elsewhere)
+}
+//////////////////////////////////////////////////////////////////////////
+void SetOutputWallVelocitySP27( // Host-side launcher for the LBSetOutputWallVelocitySP27 kernel.
+    unsigned int numberOfThreads, // launch configuration only (presumably threads per block -- confirm in vf::cuda::CudaGrid)
+    real* vxD, // this and all following parameters are forwarded to the kernel unchanged
+    real* vyD,
+    real* vzD,
+    real* vxWall,
+    real* vyWall,
+    real* vzWall,
+    int numberOfWallNodes, // forwarded to the kernel and also used to build the launch configuration
+    int* kWallNodes,
+    real* rhoD,
+    real* pressD,
+    unsigned int* geoD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    real* DD,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfWallNodes); // launch configuration built from numberOfThreads and numberOfWallNodes
+
+    LBSetOutputWallVelocitySP27<<< grid.grid, grid.threads >>> ( // no stream argument: launched on the default stream
+        vxD,
+        vyD,
+        vzD,
+        vxWall,
+        vyWall,
+        vzWall,
+        numberOfWallNodes,
+        kWallNodes,
+        rhoD,
+        pressD,
+        geoD,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        DD,
+        isEvenTimestep);
+    getLastCudaError("LBSetOutputWallVelocitySP27 execution failed"); // error check right after the launch (helper defined elsewhere)
+}
+//////////////////////////////////////////////////////////////////////////
+void GetVelotoForce27( // Host-side launcher for the GetVeloforForcing27 kernel.
+    unsigned int numberOfThreads, // launch configuration only (presumably threads per block -- confirm in vf::cuda::CudaGrid)
+    real* DD, // this and all following parameters are forwarded to the kernel unchanged
+    int* bcIndex,
+    int nonAtBC, // forwarded to the kernel and also used to build the launch configuration
+    real* Vx,
+    real* Vy,
+    real* Vz,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, nonAtBC); // launch configuration built from numberOfThreads and nonAtBC
+
+    GetVeloforForcing27<<< grid.grid, grid.threads >>> ( // no stream argument: launched on the default stream
+        DD,
+        bcIndex,
+        nonAtBC,
+        Vx,
+        Vy,
+        Vz,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("GetVeloforForcing27 execution failed"); // error check right after the launch (helper defined elsewhere)
+}
+//////////////////////////////////////////////////////////////////////////
+void InitParticlesDevice( // Host-side launcher for the InitParticles kernel.
+    real* coordX, // every parameter except numberOfThreads is forwarded to the kernel unchanged
+    real* coordY,
+    real* coordZ,
+    real* coordParticleXlocal,
+    real* coordParticleYlocal,
+    real* coordParticleZlocal,
+    real* coordParticleXglobal,
+    real* coordParticleYglobal,
+    real* coordParticleZglobal,
+    real* veloParticleX,
+    real* veloParticleY,
+    real* veloParticleZ,
+    real* randArray,
+    unsigned int* particleID,
+    unsigned int* cellBaseID,
+    unsigned int* bcMatD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned int* neighborWSB,
+    int level,
+    unsigned int numberOfParticles, // forwarded to the kernel and also used to build the launch configuration
+    unsigned long long numberOfLBnodes,
+    unsigned int numberOfThreads) // launch configuration only (presumably threads per block -- confirm in vf::cuda::CudaGrid)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfParticles); // launch configuration built from numberOfThreads and numberOfParticles
+
+    InitParticles<<< grid.grid, grid.threads >>> ( // no stream argument: launched on the default stream
+        coordX,
+        coordY,
+        coordZ,
+        coordParticleXlocal,
+        coordParticleYlocal,
+        coordParticleZlocal,
+        coordParticleXglobal,
+        coordParticleYglobal,
+        coordParticleZglobal,
+        veloParticleX,
+        veloParticleY,
+        veloParticleZ,
+        randArray,
+        particleID,
+        cellBaseID,
+        bcMatD,
+        neighborX,
+        neighborY,
+        neighborZ,
+        neighborWSB,
+        level,
+        numberOfParticles,
+        numberOfLBnodes);
+    getLastCudaError("InitParticles execution failed"); // error check right after the launch (helper defined elsewhere)
+}
+//////////////////////////////////////////////////////////////////////////
+void MoveParticlesDevice( // host-side launcher for the MoveParticles kernel
+    real* coordX,
+    real* coordY,
+    real* coordZ,
+    real* coordParticleXlocal,
+    real* coordParticleYlocal,
+    real* coordParticleZlocal,
+    real* coordParticleXglobal,
+    real* coordParticleYglobal,
+    real* coordParticleZglobal,
+    real* veloParticleX,
+    real* veloParticleY,
+    real* veloParticleZ,
+    real* DD,
+    real  omega,
+    unsigned int* particleID,
+    unsigned int* cellBaseID,
+    unsigned int* bcMatD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned int* neighborWSB,
+    int level,
+    unsigned int timestep,
+    unsigned int numberOfTimesteps,
+    unsigned int numberOfParticles, // element count handed to CudaGrid and also forwarded to the kernel
+    unsigned long long numberOfLBnodes,
+    unsigned int numberOfThreads, // used only to build the launch grid; not passed to the kernel
+    bool isEvenTimestep)
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfParticles); // launch dimensions derived from numberOfThreads and numberOfParticles (CudaGrid is defined elsewhere)
+
+    MoveParticles<<< grid.grid, grid.threads >>> ( // arguments are forwarded in declaration order, minus numberOfThreads
+        coordX,
+        coordY,
+        coordZ,
+        coordParticleXlocal,
+        coordParticleYlocal,
+        coordParticleZlocal,
+        coordParticleXglobal,
+        coordParticleYglobal,
+        coordParticleZglobal,
+        veloParticleX,
+        veloParticleY,
+        veloParticleZ,
+        DD,
+        omega,
+        particleID,
+        cellBaseID,
+        bcMatD,
+        neighborX,
+        neighborY,
+        neighborZ,
+        neighborWSB,
+        level,
+        timestep,
+        numberOfTimesteps,
+        numberOfParticles,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("MoveParticles execution failed"); // presumably reports a CUDA error from the launch above; helper is defined elsewhere - TODO confirm
+}
+//////////////////////////////////////////////////////////////////////////
+void initRandomDevice( // host-side launcher for the initRandom kernel
+    curandState* state, // the only argument forwarded to the kernel
+    unsigned long long numberOfLBnodes, // used only to size the launch grid
+    unsigned int numberOfThreads) // used only to size the launch grid
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfLBnodes); // launch dimensions derived from numberOfThreads and numberOfLBnodes (CudaGrid is defined elsewhere)
+    initRandom<<< grid.grid, grid.threads >>> (state);
+    getLastCudaError("initRandom execution failed"); // presumably reports a CUDA error from the launch above; helper is defined elsewhere - TODO confirm
+}
+//////////////////////////////////////////////////////////////////////////
+void generateRandomValuesDevice( // host-side launcher for the generateRandomValues kernel
+    curandState* state,
+    unsigned long long numberOfLBnodes, // used only to size the launch grid; not passed to the kernel
+    real* randArray,
+    unsigned int numberOfThreads) // used only to size the launch grid; not passed to the kernel
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfLBnodes); // launch dimensions derived from numberOfThreads and numberOfLBnodes (CudaGrid is defined elsewhere)
+    generateRandomValues<<< grid.grid, grid.threads >>> (state,randArray); // only state and randArray reach the kernel
+    getLastCudaError("generateRandomValues execution failed"); // presumably reports a CUDA error from the launch above; helper is defined elsewhere - TODO confirm
 }
 //////////////////////////////////////////////////////////////////////////
 void CalcTurbulenceIntensityDevice(
-   real* vxx,
-   real* vyy,
-   real* vzz,
-   real* vxy,
-   real* vxz,
-   real* vyz,
-   real* vx_mean,
-   real* vy_mean,
-   real* vz_mean,
-   real* DD,
-   uint* typeOfGridNode,
-   unsigned int* neighborX,
-   unsigned int* neighborY,
-   unsigned int* neighborZ,
-   unsigned int size_Mat,
-   bool isEvenTimestep,
-   uint numberOfThreads)
-{
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-   CalcTurbulenceIntensity<<<gridQ, threads>>>(
-     vxx,
-     vyy,
-     vzz,
-	 vxy,
-     vxz,
-     vyz,
-     vx_mean,
-     vy_mean,
-     vz_mean,
-     DD,
-     typeOfGridNode,
-     neighborX,
-     neighborY,
-     neighborZ,
-     size_Mat,
-     isEvenTimestep);
-
-   getLastCudaError("CalcTurbulenceIntensity execution failed");
+    real* vxx,
+    real* vyy,
+    real* vzz,
+    real* vxy,
+    real* vxz,
+    real* vyz,
+    real* vx_mean,
+    real* vy_mean,
+    real* vz_mean,
+    real* DD,
+    uint* typeOfGridNode,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes, // handed to CudaGrid and also forwarded to the kernel
+    bool isEvenTimestep,
+    uint numberOfThreads) // used only to build the launch grid; not passed to the kernel
+{
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfLBnodes); // launch dimensions derived from numberOfThreads and numberOfLBnodes (CudaGrid is defined elsewhere)
+    CalcTurbulenceIntensity<<<grid.grid, grid.threads>>>( // host-side launch of the CalcTurbulenceIntensity kernel; arguments forwarded in declaration order, minus numberOfThreads
+        vxx,
+        vyy,
+        vzz,
+        vxy,
+        vxz,
+        vyz,
+        vx_mean,
+        vy_mean,
+        vz_mean,
+        DD,
+        typeOfGridNode,
+        neighborX,
+        neighborY,
+        neighborZ,
+        numberOfLBnodes,
+        isEvenTimestep);
+    getLastCudaError("CalcTurbulenceIntensity execution failed"); // presumably reports a CUDA error from the launch above; helper is defined elsewhere - TODO confirm
 }
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/src/gpu/VirtualFluids_GPU/GPU/NoSlipBCs27.cu b/src/gpu/VirtualFluids_GPU/GPU/NoSlipBCs27.cu
index 314687c4b29a32962b386d7c083f72b754388e5b..79dedee58afb7b11c4c3ede9911f54df65cf859f 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/NoSlipBCs27.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/NoSlipBCs27.cu
@@ -1,92 +1,117 @@
-//  _    ___      __              __________      _     __        ______________   __
-// | |  / (_)____/ /___  ______ _/ / ____/ /_  __(_)___/ /____   /  ___/ __  / /  / /
-// | | / / / ___/ __/ / / / __ `/ / /_  / / / / / / __  / ___/  / /___/ /_/ / /  / /
-// | |/ / / /  / /_/ /_/ / /_/ / / __/ / / /_/ / / /_/ (__  )  / /_) / ____/ /__/ / 
-// |___/_/_/   \__/\__,_/\__,_/_/_/   /_/\__,_/_/\__,_/____/   \____/_/    \_____/
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
 //
-//////////////////////////////////////////////////////////////////////////
-/* Device code */
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file NoSlipBCs27.cu
+//! \ingroup GPU
+//! \author Martin Schoenherr, Anna Wellmann
+//======================================================================================
 #include "LBM/LB.h" 
 #include "lbm/constants/D3Q27.h"
 #include <lbm/constants/NumericConstants.h>
-#include "KernelUtilities.h"
+#include "LBM/GPUHelperFunctions/KernelUtilities.h"
 
 using namespace vf::lbm::constant;
 using namespace vf::lbm::dir;
+using namespace vf::gpu;
 
 //////////////////////////////////////////////////////////////////////////////
 __global__ void QDevice3rdMomentsComp27(
-													 real* distributions, 
-													 int* subgridDistanceIndices, 
-													 real* subgridDistances,
-													 unsigned int numberOfBCnodes, 
-													 real omega, 
-													 unsigned int* neighborX,
-													 unsigned int* neighborY,
-													 unsigned int* neighborZ,
-													 unsigned int numberOfLBnodes, 
-													 bool isEvenTimestep)
+    real* distributions, 
+    int* subgridDistanceIndices, 
+    real* subgridDistances,
+    unsigned int numberOfBCnodes, 
+    real omega, 
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes, 
+    bool isEvenTimestep)
 {
    Distributions27 D;
    if (isEvenTimestep==true)
    {
-      D.f[DIR_P00   ] = &distributions[DIR_P00   *numberOfLBnodes];
-      D.f[DIR_M00   ] = &distributions[DIR_M00   *numberOfLBnodes];
-      D.f[DIR_0P0   ] = &distributions[DIR_0P0   *numberOfLBnodes];
-      D.f[DIR_0M0   ] = &distributions[DIR_0M0   *numberOfLBnodes];
-      D.f[DIR_00P   ] = &distributions[DIR_00P   *numberOfLBnodes];
-      D.f[DIR_00M   ] = &distributions[DIR_00M   *numberOfLBnodes];
-      D.f[DIR_PP0  ] = &distributions[DIR_PP0  *numberOfLBnodes];
-      D.f[DIR_MM0  ] = &distributions[DIR_MM0  *numberOfLBnodes];
-      D.f[DIR_PM0  ] = &distributions[DIR_PM0  *numberOfLBnodes];
-      D.f[DIR_MP0  ] = &distributions[DIR_MP0  *numberOfLBnodes];
-      D.f[DIR_P0P  ] = &distributions[DIR_P0P  *numberOfLBnodes];
-      D.f[DIR_M0M  ] = &distributions[DIR_M0M  *numberOfLBnodes];
-      D.f[DIR_P0M  ] = &distributions[DIR_P0M  *numberOfLBnodes];
-      D.f[DIR_M0P  ] = &distributions[DIR_M0P  *numberOfLBnodes];
-      D.f[DIR_0PP  ] = &distributions[DIR_0PP  *numberOfLBnodes];
-      D.f[DIR_0MM  ] = &distributions[DIR_0MM  *numberOfLBnodes];
-      D.f[DIR_0PM  ] = &distributions[DIR_0PM  *numberOfLBnodes];
-      D.f[DIR_0MP  ] = &distributions[DIR_0MP  *numberOfLBnodes];
-      D.f[DIR_000] = &distributions[DIR_000*numberOfLBnodes];
-      D.f[DIR_PPP ] = &distributions[DIR_PPP *numberOfLBnodes];
-      D.f[DIR_MMP ] = &distributions[DIR_MMP *numberOfLBnodes];
-      D.f[DIR_PMP ] = &distributions[DIR_PMP *numberOfLBnodes];
-      D.f[DIR_MPP ] = &distributions[DIR_MPP *numberOfLBnodes];
-      D.f[DIR_PPM ] = &distributions[DIR_PPM *numberOfLBnodes];
-      D.f[DIR_MMM ] = &distributions[DIR_MMM *numberOfLBnodes];
-      D.f[DIR_PMM ] = &distributions[DIR_PMM *numberOfLBnodes];
-      D.f[DIR_MPM ] = &distributions[DIR_MPM *numberOfLBnodes];
+      D.f[DIR_P00] = &distributions[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_M00] = &distributions[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0P0] = &distributions[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0M0] = &distributions[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00P] = &distributions[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00M] = &distributions[DIR_00M * numberOfLBnodes];
+      D.f[DIR_PP0] = &distributions[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_MM0] = &distributions[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &distributions[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &distributions[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_P0P] = &distributions[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_M0M] = &distributions[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &distributions[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &distributions[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0PP] = &distributions[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0MM] = &distributions[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0PM] = &distributions[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0MP] = &distributions[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &distributions[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &distributions[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_MMP] = &distributions[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_PMP] = &distributions[DIR_PMP * numberOfLBnodes];
+      D.f[DIR_MPP] = &distributions[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_PPM] = &distributions[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_MMM] = &distributions[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_PMM] = &distributions[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_MPM] = &distributions[DIR_MPM * numberOfLBnodes];
    } 
    else
    {
-      D.f[DIR_M00   ] = &distributions[DIR_P00   *numberOfLBnodes];
-      D.f[DIR_P00   ] = &distributions[DIR_M00   *numberOfLBnodes];
-      D.f[DIR_0M0   ] = &distributions[DIR_0P0   *numberOfLBnodes];
-      D.f[DIR_0P0   ] = &distributions[DIR_0M0   *numberOfLBnodes];
-      D.f[DIR_00M   ] = &distributions[DIR_00P   *numberOfLBnodes];
-      D.f[DIR_00P   ] = &distributions[DIR_00M   *numberOfLBnodes];
-      D.f[DIR_MM0  ] = &distributions[DIR_PP0  *numberOfLBnodes];
-      D.f[DIR_PP0  ] = &distributions[DIR_MM0  *numberOfLBnodes];
-      D.f[DIR_MP0  ] = &distributions[DIR_PM0  *numberOfLBnodes];
-      D.f[DIR_PM0  ] = &distributions[DIR_MP0  *numberOfLBnodes];
-      D.f[DIR_M0M  ] = &distributions[DIR_P0P  *numberOfLBnodes];
-      D.f[DIR_P0P  ] = &distributions[DIR_M0M  *numberOfLBnodes];
-      D.f[DIR_M0P  ] = &distributions[DIR_P0M  *numberOfLBnodes];
-      D.f[DIR_P0M  ] = &distributions[DIR_M0P  *numberOfLBnodes];
-      D.f[DIR_0MM  ] = &distributions[DIR_0PP  *numberOfLBnodes];
-      D.f[DIR_0PP  ] = &distributions[DIR_0MM  *numberOfLBnodes];
-      D.f[DIR_0MP  ] = &distributions[DIR_0PM  *numberOfLBnodes];
-      D.f[DIR_0PM  ] = &distributions[DIR_0MP  *numberOfLBnodes];
-      D.f[DIR_000] = &distributions[DIR_000*numberOfLBnodes];
-      D.f[DIR_PPP ] = &distributions[DIR_MMM *numberOfLBnodes];
-      D.f[DIR_MMP ] = &distributions[DIR_PPM *numberOfLBnodes];
-      D.f[DIR_PMP ] = &distributions[DIR_MPM *numberOfLBnodes];
-      D.f[DIR_MPP ] = &distributions[DIR_PMM *numberOfLBnodes];
-      D.f[DIR_PPM ] = &distributions[DIR_MMP *numberOfLBnodes];
-      D.f[DIR_MMM ] = &distributions[DIR_PPP *numberOfLBnodes];
-      D.f[DIR_PMM ] = &distributions[DIR_MPP *numberOfLBnodes];
-      D.f[DIR_MPM ] = &distributions[DIR_PMP *numberOfLBnodes];
+      D.f[DIR_M00] = &distributions[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_P00] = &distributions[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0M0] = &distributions[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0P0] = &distributions[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00M] = &distributions[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00P] = &distributions[DIR_00M * numberOfLBnodes];
+      D.f[DIR_MM0] = &distributions[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_PP0] = &distributions[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &distributions[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &distributions[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_M0M] = &distributions[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_P0P] = &distributions[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &distributions[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &distributions[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0MM] = &distributions[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0PP] = &distributions[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0MP] = &distributions[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0PM] = &distributions[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &distributions[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &distributions[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_MMP] = &distributions[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_PMP] = &distributions[DIR_MPM * numberOfLBnodes];
+      D.f[DIR_MPP] = &distributions[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_PPM] = &distributions[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_MMM] = &distributions[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_PMM] = &distributions[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_MPM] = &distributions[DIR_PMP * numberOfLBnodes];
    }
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  x = threadIdx.x;  // Globaler x-Index 
@@ -107,24 +132,24 @@ __global__ void QDevice3rdMomentsComp27(
             *q_dirBE,  *q_dirTW,  *q_dirTN,  *q_dirBS,  *q_dirBN,  *q_dirTS,
             *q_dirTNE, *q_dirTSW, *q_dirTSE, *q_dirTNW, *q_dirBNE, *q_dirBSW,
             *q_dirBSE, *q_dirBNW; 
-      q_dirE   = &subgridDistances[DIR_P00   * numberOfBCnodes];
-      q_dirW   = &subgridDistances[DIR_M00   * numberOfBCnodes];
-      q_dirN   = &subgridDistances[DIR_0P0   * numberOfBCnodes];
-      q_dirS   = &subgridDistances[DIR_0M0   * numberOfBCnodes];
-      q_dirT   = &subgridDistances[DIR_00P   * numberOfBCnodes];
-      q_dirB   = &subgridDistances[DIR_00M   * numberOfBCnodes];
-      q_dirNE  = &subgridDistances[DIR_PP0  * numberOfBCnodes];
-      q_dirSW  = &subgridDistances[DIR_MM0  * numberOfBCnodes];
-      q_dirSE  = &subgridDistances[DIR_PM0  * numberOfBCnodes];
-      q_dirNW  = &subgridDistances[DIR_MP0  * numberOfBCnodes];
-      q_dirTE  = &subgridDistances[DIR_P0P  * numberOfBCnodes];
-      q_dirBW  = &subgridDistances[DIR_M0M  * numberOfBCnodes];
-      q_dirBE  = &subgridDistances[DIR_P0M  * numberOfBCnodes];
-      q_dirTW  = &subgridDistances[DIR_M0P  * numberOfBCnodes];
-      q_dirTN  = &subgridDistances[DIR_0PP  * numberOfBCnodes];
-      q_dirBS  = &subgridDistances[DIR_0MM  * numberOfBCnodes];
-      q_dirBN  = &subgridDistances[DIR_0PM  * numberOfBCnodes];
-      q_dirTS  = &subgridDistances[DIR_0MP  * numberOfBCnodes];
+      q_dirE   = &subgridDistances[DIR_P00 * numberOfBCnodes];
+      q_dirW   = &subgridDistances[DIR_M00 * numberOfBCnodes];
+      q_dirN   = &subgridDistances[DIR_0P0 * numberOfBCnodes];
+      q_dirS   = &subgridDistances[DIR_0M0 * numberOfBCnodes];
+      q_dirT   = &subgridDistances[DIR_00P * numberOfBCnodes];
+      q_dirB   = &subgridDistances[DIR_00M * numberOfBCnodes];
+      q_dirNE  = &subgridDistances[DIR_PP0 * numberOfBCnodes];
+      q_dirSW  = &subgridDistances[DIR_MM0 * numberOfBCnodes];
+      q_dirSE  = &subgridDistances[DIR_PM0 * numberOfBCnodes];
+      q_dirNW  = &subgridDistances[DIR_MP0 * numberOfBCnodes];
+      q_dirTE  = &subgridDistances[DIR_P0P * numberOfBCnodes];
+      q_dirBW  = &subgridDistances[DIR_M0M * numberOfBCnodes];
+      q_dirBE  = &subgridDistances[DIR_P0M * numberOfBCnodes];
+      q_dirTW  = &subgridDistances[DIR_M0P * numberOfBCnodes];
+      q_dirTN  = &subgridDistances[DIR_0PP * numberOfBCnodes];
+      q_dirBS  = &subgridDistances[DIR_0MM * numberOfBCnodes];
+      q_dirBN  = &subgridDistances[DIR_0PM * numberOfBCnodes];
+      q_dirTS  = &subgridDistances[DIR_0MP * numberOfBCnodes];
       q_dirTNE = &subgridDistances[DIR_PPP * numberOfBCnodes];
       q_dirTSW = &subgridDistances[DIR_MMP * numberOfBCnodes];
       q_dirTSE = &subgridDistances[DIR_PMP * numberOfBCnodes];
@@ -167,32 +192,32 @@ __global__ void QDevice3rdMomentsComp27(
       real f_E,  f_W,  f_N,  f_S,  f_T,  f_B,   f_NE,  f_SW,  f_SE,  f_NW,  f_TE,  f_BW,  f_BE,
             f_TW, f_TN, f_BS, f_BN, f_TS, f_TNE, f_TSW, f_TSE, f_TNW, f_BNE, f_BSW, f_BSE, f_BNW;
 
-      f_W    = (D.f[DIR_P00   ])[ke   ];
-      f_E    = (D.f[DIR_M00   ])[kw   ];
-      f_S    = (D.f[DIR_0P0   ])[kn   ];
-      f_N    = (D.f[DIR_0M0   ])[ks   ];
-      f_B    = (D.f[DIR_00P   ])[kt   ];
-      f_T    = (D.f[DIR_00M   ])[kb   ];
-      f_SW   = (D.f[DIR_PP0  ])[kne  ];
-      f_NE   = (D.f[DIR_MM0  ])[ksw  ];
-      f_NW   = (D.f[DIR_PM0  ])[kse  ];
-      f_SE   = (D.f[DIR_MP0  ])[knw  ];
-      f_BW   = (D.f[DIR_P0P  ])[kte  ];
-      f_TE   = (D.f[DIR_M0M  ])[kbw  ];
-      f_TW   = (D.f[DIR_P0M  ])[kbe  ];
-      f_BE   = (D.f[DIR_M0P  ])[ktw  ];
-      f_BS   = (D.f[DIR_0PP  ])[ktn  ];
-      f_TN   = (D.f[DIR_0MM  ])[kbs  ];
-      f_TS   = (D.f[DIR_0PM  ])[kbn  ];
-      f_BN   = (D.f[DIR_0MP  ])[kts  ];
-      f_BSW  = (D.f[DIR_PPP ])[ktne ];
-      f_BNE  = (D.f[DIR_MMP ])[ktsw ];
-      f_BNW  = (D.f[DIR_PMP ])[ktse ];
-      f_BSE  = (D.f[DIR_MPP ])[ktnw ];
-      f_TSW  = (D.f[DIR_PPM ])[kbne ];
-      f_TNE  = (D.f[DIR_MMM ])[kbsw ];
-      f_TNW  = (D.f[DIR_PMM ])[kbse ];
-      f_TSE  = (D.f[DIR_MPM ])[kbnw ];
+      f_W    = (D.f[DIR_P00])[ke   ];
+      f_E    = (D.f[DIR_M00])[kw   ];
+      f_S    = (D.f[DIR_0P0])[kn   ];
+      f_N    = (D.f[DIR_0M0])[ks   ];
+      f_B    = (D.f[DIR_00P])[kt   ];
+      f_T    = (D.f[DIR_00M])[kb   ];
+      f_SW   = (D.f[DIR_PP0])[kne  ];
+      f_NE   = (D.f[DIR_MM0])[ksw  ];
+      f_NW   = (D.f[DIR_PM0])[kse  ];
+      f_SE   = (D.f[DIR_MP0])[knw  ];
+      f_BW   = (D.f[DIR_P0P])[kte  ];
+      f_TE   = (D.f[DIR_M0M])[kbw  ];
+      f_TW   = (D.f[DIR_P0M])[kbe  ];
+      f_BE   = (D.f[DIR_M0P])[ktw  ];
+      f_BS   = (D.f[DIR_0PP])[ktn  ];
+      f_TN   = (D.f[DIR_0MM])[kbs  ];
+      f_TS   = (D.f[DIR_0PM])[kbn  ];
+      f_BN   = (D.f[DIR_0MP])[kts  ];
+      f_BSW  = (D.f[DIR_PPP])[ktne ];
+      f_BNE  = (D.f[DIR_MMP])[ktsw ];
+      f_BNW  = (D.f[DIR_PMP])[ktse ];
+      f_BSE  = (D.f[DIR_MPP])[ktnw ];
+      f_TSW  = (D.f[DIR_PPM])[kbne ];
+      f_TNE  = (D.f[DIR_MMM])[kbsw ];
+      f_TNW  = (D.f[DIR_PMM])[kbse ];
+      f_TSE  = (D.f[DIR_MPM])[kbnw ];
       ////////////////////////////////////////////////////////////////////////////////
       real vx1, vx2, vx3, drho, feq, q, m3;
       drho   =  f_TSE + f_TNW + f_TNE + f_TSW + f_BSE + f_BNW + f_BNE + f_BSW +
@@ -217,63 +242,63 @@ __global__ void QDevice3rdMomentsComp27(
       //////////////////////////////////////////////////////////////////////////
       if (isEvenTimestep==false)
       {
-         D.f[DIR_P00   ] = &distributions[DIR_P00   *numberOfLBnodes];
-         D.f[DIR_M00   ] = &distributions[DIR_M00   *numberOfLBnodes];
-         D.f[DIR_0P0   ] = &distributions[DIR_0P0   *numberOfLBnodes];
-         D.f[DIR_0M0   ] = &distributions[DIR_0M0   *numberOfLBnodes];
-         D.f[DIR_00P   ] = &distributions[DIR_00P   *numberOfLBnodes];
-         D.f[DIR_00M   ] = &distributions[DIR_00M   *numberOfLBnodes];
-         D.f[DIR_PP0  ] = &distributions[DIR_PP0  *numberOfLBnodes];
-         D.f[DIR_MM0  ] = &distributions[DIR_MM0  *numberOfLBnodes];
-         D.f[DIR_PM0  ] = &distributions[DIR_PM0  *numberOfLBnodes];
-         D.f[DIR_MP0  ] = &distributions[DIR_MP0  *numberOfLBnodes];
-         D.f[DIR_P0P  ] = &distributions[DIR_P0P  *numberOfLBnodes];
-         D.f[DIR_M0M  ] = &distributions[DIR_M0M  *numberOfLBnodes];
-         D.f[DIR_P0M  ] = &distributions[DIR_P0M  *numberOfLBnodes];
-         D.f[DIR_M0P  ] = &distributions[DIR_M0P  *numberOfLBnodes];
-         D.f[DIR_0PP  ] = &distributions[DIR_0PP  *numberOfLBnodes];
-         D.f[DIR_0MM  ] = &distributions[DIR_0MM  *numberOfLBnodes];
-         D.f[DIR_0PM  ] = &distributions[DIR_0PM  *numberOfLBnodes];
-         D.f[DIR_0MP  ] = &distributions[DIR_0MP  *numberOfLBnodes];
-         D.f[DIR_000] = &distributions[DIR_000*numberOfLBnodes];
-         D.f[DIR_PPP ] = &distributions[DIR_PPP *numberOfLBnodes];
-         D.f[DIR_MMP ] = &distributions[DIR_MMP *numberOfLBnodes];
-         D.f[DIR_PMP ] = &distributions[DIR_PMP *numberOfLBnodes];
-         D.f[DIR_MPP ] = &distributions[DIR_MPP *numberOfLBnodes];
-         D.f[DIR_PPM ] = &distributions[DIR_PPM *numberOfLBnodes];
-         D.f[DIR_MMM ] = &distributions[DIR_MMM *numberOfLBnodes];
-         D.f[DIR_PMM ] = &distributions[DIR_PMM *numberOfLBnodes];
-         D.f[DIR_MPM ] = &distributions[DIR_MPM *numberOfLBnodes];
+         D.f[DIR_P00] = &distributions[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_M00] = &distributions[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0P0] = &distributions[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0M0] = &distributions[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00P] = &distributions[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00M] = &distributions[DIR_00M * numberOfLBnodes];
+         D.f[DIR_PP0] = &distributions[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_MM0] = &distributions[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &distributions[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &distributions[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_P0P] = &distributions[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_M0M] = &distributions[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &distributions[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &distributions[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0PP] = &distributions[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0MM] = &distributions[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0PM] = &distributions[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0MP] = &distributions[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &distributions[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &distributions[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_MMP] = &distributions[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_PMP] = &distributions[DIR_PMP * numberOfLBnodes];
+         D.f[DIR_MPP] = &distributions[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_PPM] = &distributions[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_MMM] = &distributions[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_PMM] = &distributions[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_MPM] = &distributions[DIR_MPM * numberOfLBnodes];
       } 
       else
       {
-         D.f[DIR_M00   ] = &distributions[DIR_P00   *numberOfLBnodes];
-         D.f[DIR_P00   ] = &distributions[DIR_M00   *numberOfLBnodes];
-         D.f[DIR_0M0   ] = &distributions[DIR_0P0   *numberOfLBnodes];
-         D.f[DIR_0P0   ] = &distributions[DIR_0M0   *numberOfLBnodes];
-         D.f[DIR_00M   ] = &distributions[DIR_00P   *numberOfLBnodes];
-         D.f[DIR_00P   ] = &distributions[DIR_00M   *numberOfLBnodes];
-         D.f[DIR_MM0  ] = &distributions[DIR_PP0  *numberOfLBnodes];
-         D.f[DIR_PP0  ] = &distributions[DIR_MM0  *numberOfLBnodes];
-         D.f[DIR_MP0  ] = &distributions[DIR_PM0  *numberOfLBnodes];
-         D.f[DIR_PM0  ] = &distributions[DIR_MP0  *numberOfLBnodes];
-         D.f[DIR_M0M  ] = &distributions[DIR_P0P  *numberOfLBnodes];
-         D.f[DIR_P0P  ] = &distributions[DIR_M0M  *numberOfLBnodes];
-         D.f[DIR_M0P  ] = &distributions[DIR_P0M  *numberOfLBnodes];
-         D.f[DIR_P0M  ] = &distributions[DIR_M0P  *numberOfLBnodes];
-         D.f[DIR_0MM  ] = &distributions[DIR_0PP  *numberOfLBnodes];
-         D.f[DIR_0PP  ] = &distributions[DIR_0MM  *numberOfLBnodes];
-         D.f[DIR_0MP  ] = &distributions[DIR_0PM  *numberOfLBnodes];
-         D.f[DIR_0PM  ] = &distributions[DIR_0MP  *numberOfLBnodes];
-         D.f[DIR_000] = &distributions[DIR_000*numberOfLBnodes];
-         D.f[DIR_PPP ] = &distributions[DIR_MMM *numberOfLBnodes];
-         D.f[DIR_MMP ] = &distributions[DIR_PPM *numberOfLBnodes];
-         D.f[DIR_PMP ] = &distributions[DIR_MPM *numberOfLBnodes];
-         D.f[DIR_MPP ] = &distributions[DIR_PMM *numberOfLBnodes];
-         D.f[DIR_PPM ] = &distributions[DIR_MMP *numberOfLBnodes];
-         D.f[DIR_MMM ] = &distributions[DIR_PPP *numberOfLBnodes];
-         D.f[DIR_PMM ] = &distributions[DIR_MPP *numberOfLBnodes];
-         D.f[DIR_MPM ] = &distributions[DIR_PMP *numberOfLBnodes];
+         D.f[DIR_M00] = &distributions[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_P00] = &distributions[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0M0] = &distributions[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0P0] = &distributions[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00M] = &distributions[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00P] = &distributions[DIR_00M * numberOfLBnodes];
+         D.f[DIR_MM0] = &distributions[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_PP0] = &distributions[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &distributions[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &distributions[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_M0M] = &distributions[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_P0P] = &distributions[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &distributions[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &distributions[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0MM] = &distributions[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0PP] = &distributions[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0MP] = &distributions[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0PM] = &distributions[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &distributions[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &distributions[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_MMP] = &distributions[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_PMP] = &distributions[DIR_MPM * numberOfLBnodes];
+         D.f[DIR_MPP] = &distributions[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_PPM] = &distributions[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_MMM] = &distributions[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_PMM] = &distributions[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_MPM] = &distributions[DIR_PMP * numberOfLBnodes];
       }
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       //Test
@@ -559,77 +584,78 @@ __global__ void QDevice3rdMomentsComp27(
 
 
 //////////////////////////////////////////////////////////////////////////////
-__global__ void QDeviceIncompHighNu27(real* DD, 
-												 int* k_Q, 
-												 real* QQ,
-												 unsigned int numberOfBCnodes,
-												 real om1, 
-												 unsigned int* neighborX,
-												 unsigned int* neighborY,
-												 unsigned int* neighborZ,
-												 unsigned int numberOfLBnodes, 
-												 bool isEvenTimestep)
+__global__ void QDeviceIncompHighNu27(
+    real* DD, 
+    int* k_Q, 
+    real* QQ,
+    unsigned int numberOfBCnodes,
+    real om1, 
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes, 
+    bool isEvenTimestep)
 {
    Distributions27 D;
    if (isEvenTimestep==true)
    {
-      D.f[DIR_P00   ] = &DD[DIR_P00   *numberOfLBnodes];
-      D.f[DIR_M00   ] = &DD[DIR_M00   *numberOfLBnodes];
-      D.f[DIR_0P0   ] = &DD[DIR_0P0   *numberOfLBnodes];
-      D.f[DIR_0M0   ] = &DD[DIR_0M0   *numberOfLBnodes];
-      D.f[DIR_00P   ] = &DD[DIR_00P   *numberOfLBnodes];
-      D.f[DIR_00M   ] = &DD[DIR_00M   *numberOfLBnodes];
-      D.f[DIR_PP0  ] = &DD[DIR_PP0  *numberOfLBnodes];
-      D.f[DIR_MM0  ] = &DD[DIR_MM0  *numberOfLBnodes];
-      D.f[DIR_PM0  ] = &DD[DIR_PM0  *numberOfLBnodes];
-      D.f[DIR_MP0  ] = &DD[DIR_MP0  *numberOfLBnodes];
-      D.f[DIR_P0P  ] = &DD[DIR_P0P  *numberOfLBnodes];
-      D.f[DIR_M0M  ] = &DD[DIR_M0M  *numberOfLBnodes];
-      D.f[DIR_P0M  ] = &DD[DIR_P0M  *numberOfLBnodes];
-      D.f[DIR_M0P  ] = &DD[DIR_M0P  *numberOfLBnodes];
-      D.f[DIR_0PP  ] = &DD[DIR_0PP  *numberOfLBnodes];
-      D.f[DIR_0MM  ] = &DD[DIR_0MM  *numberOfLBnodes];
-      D.f[DIR_0PM  ] = &DD[DIR_0PM  *numberOfLBnodes];
-      D.f[DIR_0MP  ] = &DD[DIR_0MP  *numberOfLBnodes];
-      D.f[DIR_000] = &DD[DIR_000*numberOfLBnodes];
-      D.f[DIR_PPP ] = &DD[DIR_PPP *numberOfLBnodes];
-      D.f[DIR_MMP ] = &DD[DIR_MMP *numberOfLBnodes];
-      D.f[DIR_PMP ] = &DD[DIR_PMP *numberOfLBnodes];
-      D.f[DIR_MPP ] = &DD[DIR_MPP *numberOfLBnodes];
-      D.f[DIR_PPM ] = &DD[DIR_PPM *numberOfLBnodes];
-      D.f[DIR_MMM ] = &DD[DIR_MMM *numberOfLBnodes];
-      D.f[DIR_PMM ] = &DD[DIR_PMM *numberOfLBnodes];
-      D.f[DIR_MPM ] = &DD[DIR_MPM *numberOfLBnodes];
+      D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
    } 
    else
    {
-      D.f[DIR_M00   ] = &DD[DIR_P00   *numberOfLBnodes];
-      D.f[DIR_P00   ] = &DD[DIR_M00   *numberOfLBnodes];
-      D.f[DIR_0M0   ] = &DD[DIR_0P0   *numberOfLBnodes];
-      D.f[DIR_0P0   ] = &DD[DIR_0M0   *numberOfLBnodes];
-      D.f[DIR_00M   ] = &DD[DIR_00P   *numberOfLBnodes];
-      D.f[DIR_00P   ] = &DD[DIR_00M   *numberOfLBnodes];
-      D.f[DIR_MM0  ] = &DD[DIR_PP0  *numberOfLBnodes];
-      D.f[DIR_PP0  ] = &DD[DIR_MM0  *numberOfLBnodes];
-      D.f[DIR_MP0  ] = &DD[DIR_PM0  *numberOfLBnodes];
-      D.f[DIR_PM0  ] = &DD[DIR_MP0  *numberOfLBnodes];
-      D.f[DIR_M0M  ] = &DD[DIR_P0P  *numberOfLBnodes];
-      D.f[DIR_P0P  ] = &DD[DIR_M0M  *numberOfLBnodes];
-      D.f[DIR_M0P  ] = &DD[DIR_P0M  *numberOfLBnodes];
-      D.f[DIR_P0M  ] = &DD[DIR_M0P  *numberOfLBnodes];
-      D.f[DIR_0MM  ] = &DD[DIR_0PP  *numberOfLBnodes];
-      D.f[DIR_0PP  ] = &DD[DIR_0MM  *numberOfLBnodes];
-      D.f[DIR_0MP  ] = &DD[DIR_0PM  *numberOfLBnodes];
-      D.f[DIR_0PM  ] = &DD[DIR_0MP  *numberOfLBnodes];
-      D.f[DIR_000] = &DD[DIR_000*numberOfLBnodes];
-      D.f[DIR_PPP ] = &DD[DIR_MMM *numberOfLBnodes];
-      D.f[DIR_MMP ] = &DD[DIR_PPM *numberOfLBnodes];
-      D.f[DIR_PMP ] = &DD[DIR_MPM *numberOfLBnodes];
-      D.f[DIR_MPP ] = &DD[DIR_PMM *numberOfLBnodes];
-      D.f[DIR_PPM ] = &DD[DIR_MMP *numberOfLBnodes];
-      D.f[DIR_MMM ] = &DD[DIR_PPP *numberOfLBnodes];
-      D.f[DIR_PMM ] = &DD[DIR_MPP *numberOfLBnodes];
-      D.f[DIR_MPM ] = &DD[DIR_PMP *numberOfLBnodes];
+      D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
    }
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  x = threadIdx.x;  // Globaler x-Index 
@@ -650,24 +676,24 @@ __global__ void QDeviceIncompHighNu27(real* DD,
             *q_dirBE,  *q_dirTW,  *q_dirTN,  *q_dirBS,  *q_dirBN,  *q_dirTS,
             *q_dirTNE, *q_dirTSW, *q_dirTSE, *q_dirTNW, *q_dirBNE, *q_dirBSW,
             *q_dirBSE, *q_dirBNW; 
-      q_dirE   = &QQ[DIR_P00   * numberOfBCnodes];
-      q_dirW   = &QQ[DIR_M00   * numberOfBCnodes];
-      q_dirN   = &QQ[DIR_0P0   * numberOfBCnodes];
-      q_dirS   = &QQ[DIR_0M0   * numberOfBCnodes];
-      q_dirT   = &QQ[DIR_00P   * numberOfBCnodes];
-      q_dirB   = &QQ[DIR_00M   * numberOfBCnodes];
-      q_dirNE  = &QQ[DIR_PP0  * numberOfBCnodes];
-      q_dirSW  = &QQ[DIR_MM0  * numberOfBCnodes];
-      q_dirSE  = &QQ[DIR_PM0  * numberOfBCnodes];
-      q_dirNW  = &QQ[DIR_MP0  * numberOfBCnodes];
-      q_dirTE  = &QQ[DIR_P0P  * numberOfBCnodes];
-      q_dirBW  = &QQ[DIR_M0M  * numberOfBCnodes];
-      q_dirBE  = &QQ[DIR_P0M  * numberOfBCnodes];
-      q_dirTW  = &QQ[DIR_M0P  * numberOfBCnodes];
-      q_dirTN  = &QQ[DIR_0PP  * numberOfBCnodes];
-      q_dirBS  = &QQ[DIR_0MM  * numberOfBCnodes];
-      q_dirBN  = &QQ[DIR_0PM  * numberOfBCnodes];
-      q_dirTS  = &QQ[DIR_0MP  * numberOfBCnodes];
+      q_dirE   = &QQ[DIR_P00 * numberOfBCnodes];
+      q_dirW   = &QQ[DIR_M00 * numberOfBCnodes];
+      q_dirN   = &QQ[DIR_0P0 * numberOfBCnodes];
+      q_dirS   = &QQ[DIR_0M0 * numberOfBCnodes];
+      q_dirT   = &QQ[DIR_00P * numberOfBCnodes];
+      q_dirB   = &QQ[DIR_00M * numberOfBCnodes];
+      q_dirNE  = &QQ[DIR_PP0 * numberOfBCnodes];
+      q_dirSW  = &QQ[DIR_MM0 * numberOfBCnodes];
+      q_dirSE  = &QQ[DIR_PM0 * numberOfBCnodes];
+      q_dirNW  = &QQ[DIR_MP0 * numberOfBCnodes];
+      q_dirTE  = &QQ[DIR_P0P * numberOfBCnodes];
+      q_dirBW  = &QQ[DIR_M0M * numberOfBCnodes];
+      q_dirBE  = &QQ[DIR_P0M * numberOfBCnodes];
+      q_dirTW  = &QQ[DIR_M0P * numberOfBCnodes];
+      q_dirTN  = &QQ[DIR_0PP * numberOfBCnodes];
+      q_dirBS  = &QQ[DIR_0MM * numberOfBCnodes];
+      q_dirBN  = &QQ[DIR_0PM * numberOfBCnodes];
+      q_dirTS  = &QQ[DIR_0MP * numberOfBCnodes];
       q_dirTNE = &QQ[DIR_PPP * numberOfBCnodes];
       q_dirTSW = &QQ[DIR_MMP * numberOfBCnodes];
       q_dirTSE = &QQ[DIR_PMP * numberOfBCnodes];
@@ -710,32 +736,32 @@ __global__ void QDeviceIncompHighNu27(real* DD,
       real f_E,  f_W,  f_N,  f_S,  f_T,  f_B,   f_NE,  f_SW,  f_SE,  f_NW,  f_TE,  f_BW,  f_BE,
             f_TW, f_TN, f_BS, f_BN, f_TS, f_TNE, f_TSW, f_TSE, f_TNW, f_BNE, f_BSW, f_BSE, f_BNW;
 
-      f_E   = (D.f[DIR_P00   ])[ke   ];
-      f_W   = (D.f[DIR_M00   ])[kw   ];
-      f_N   = (D.f[DIR_0P0   ])[kn   ];
-      f_S   = (D.f[DIR_0M0   ])[ks   ];
-      f_T   = (D.f[DIR_00P   ])[kt   ];
-      f_B   = (D.f[DIR_00M   ])[kb   ];
-      f_NE  = (D.f[DIR_PP0  ])[kne  ];
-      f_SW  = (D.f[DIR_MM0  ])[ksw  ];
-      f_SE  = (D.f[DIR_PM0  ])[kse  ];
-      f_NW  = (D.f[DIR_MP0  ])[knw  ];
-      f_TE  = (D.f[DIR_P0P  ])[kte  ];
-      f_BW  = (D.f[DIR_M0M  ])[kbw  ];
-      f_BE  = (D.f[DIR_P0M  ])[kbe  ];
-      f_TW  = (D.f[DIR_M0P  ])[ktw  ];
-      f_TN  = (D.f[DIR_0PP  ])[ktn  ];
-      f_BS  = (D.f[DIR_0MM  ])[kbs  ];
-      f_BN  = (D.f[DIR_0PM  ])[kbn  ];
-      f_TS  = (D.f[DIR_0MP  ])[kts  ];
-      f_TNE = (D.f[DIR_PPP ])[ktne ];
-      f_TSW = (D.f[DIR_MMP ])[ktsw ];
-      f_TSE = (D.f[DIR_PMP ])[ktse ];
-      f_TNW = (D.f[DIR_MPP ])[ktnw ];
-      f_BNE = (D.f[DIR_PPM ])[kbne ];
-      f_BSW = (D.f[DIR_MMM ])[kbsw ];
-      f_BSE = (D.f[DIR_PMM ])[kbse ];
-      f_BNW = (D.f[DIR_MPM ])[kbnw ];
+      f_E   = (D.f[DIR_P00])[ke   ];
+      f_W   = (D.f[DIR_M00])[kw   ];
+      f_N   = (D.f[DIR_0P0])[kn   ];
+      f_S   = (D.f[DIR_0M0])[ks   ];
+      f_T   = (D.f[DIR_00P])[kt   ];
+      f_B   = (D.f[DIR_00M])[kb   ];
+      f_NE  = (D.f[DIR_PP0])[kne  ];
+      f_SW  = (D.f[DIR_MM0])[ksw  ];
+      f_SE  = (D.f[DIR_PM0])[kse  ];
+      f_NW  = (D.f[DIR_MP0])[knw  ];
+      f_TE  = (D.f[DIR_P0P])[kte  ];
+      f_BW  = (D.f[DIR_M0M])[kbw  ];
+      f_BE  = (D.f[DIR_P0M])[kbe  ];
+      f_TW  = (D.f[DIR_M0P])[ktw  ];
+      f_TN  = (D.f[DIR_0PP])[ktn  ];
+      f_BS  = (D.f[DIR_0MM])[kbs  ];
+      f_BN  = (D.f[DIR_0PM])[kbn  ];
+      f_TS  = (D.f[DIR_0MP])[kts  ];
+      f_TNE = (D.f[DIR_PPP])[ktne ];
+      f_TSW = (D.f[DIR_MMP])[ktsw ];
+      f_TSE = (D.f[DIR_PMP])[ktse ];
+      f_TNW = (D.f[DIR_MPP])[ktnw ];
+      f_BNE = (D.f[DIR_PPM])[kbne ];
+      f_BSW = (D.f[DIR_MMM])[kbsw ];
+      f_BSE = (D.f[DIR_PMM])[kbse ];
+      f_BNW = (D.f[DIR_MPM])[kbnw ];
       ////////////////////////////////////////////////////////////////////////////////
       real vx1, vx2, vx3, drho, feq, q;
       drho   =  f_TSE + f_TNW + f_TNE + f_TSW + f_BSE + f_BNW + f_BNE + f_BSW +
@@ -760,63 +786,63 @@ __global__ void QDeviceIncompHighNu27(real* DD,
       //////////////////////////////////////////////////////////////////////////
       if (isEvenTimestep==false)
       {
-         D.f[DIR_P00   ] = &DD[DIR_P00   *numberOfLBnodes];
-         D.f[DIR_M00   ] = &DD[DIR_M00   *numberOfLBnodes];
-         D.f[DIR_0P0   ] = &DD[DIR_0P0   *numberOfLBnodes];
-         D.f[DIR_0M0   ] = &DD[DIR_0M0   *numberOfLBnodes];
-         D.f[DIR_00P   ] = &DD[DIR_00P   *numberOfLBnodes];
-         D.f[DIR_00M   ] = &DD[DIR_00M   *numberOfLBnodes];
-         D.f[DIR_PP0  ] = &DD[DIR_PP0  *numberOfLBnodes];
-         D.f[DIR_MM0  ] = &DD[DIR_MM0  *numberOfLBnodes];
-         D.f[DIR_PM0  ] = &DD[DIR_PM0  *numberOfLBnodes];
-         D.f[DIR_MP0  ] = &DD[DIR_MP0  *numberOfLBnodes];
-         D.f[DIR_P0P  ] = &DD[DIR_P0P  *numberOfLBnodes];
-         D.f[DIR_M0M  ] = &DD[DIR_M0M  *numberOfLBnodes];
-         D.f[DIR_P0M  ] = &DD[DIR_P0M  *numberOfLBnodes];
-         D.f[DIR_M0P  ] = &DD[DIR_M0P  *numberOfLBnodes];
-         D.f[DIR_0PP  ] = &DD[DIR_0PP  *numberOfLBnodes];
-         D.f[DIR_0MM  ] = &DD[DIR_0MM  *numberOfLBnodes];
-         D.f[DIR_0PM  ] = &DD[DIR_0PM  *numberOfLBnodes];
-         D.f[DIR_0MP  ] = &DD[DIR_0MP  *numberOfLBnodes];
-         D.f[DIR_000] = &DD[DIR_000*numberOfLBnodes];
-         D.f[DIR_PPP ] = &DD[DIR_PPP *numberOfLBnodes];
-         D.f[DIR_MMP ] = &DD[DIR_MMP *numberOfLBnodes];
-         D.f[DIR_PMP ] = &DD[DIR_PMP *numberOfLBnodes];
-         D.f[DIR_MPP ] = &DD[DIR_MPP *numberOfLBnodes];
-         D.f[DIR_PPM ] = &DD[DIR_PPM *numberOfLBnodes];
-         D.f[DIR_MMM ] = &DD[DIR_MMM *numberOfLBnodes];
-         D.f[DIR_PMM ] = &DD[DIR_PMM *numberOfLBnodes];
-         D.f[DIR_MPM ] = &DD[DIR_MPM *numberOfLBnodes];
+         D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
       } 
       else
       {
-         D.f[DIR_M00   ] = &DD[DIR_P00   *numberOfLBnodes];
-         D.f[DIR_P00   ] = &DD[DIR_M00   *numberOfLBnodes];
-         D.f[DIR_0M0   ] = &DD[DIR_0P0   *numberOfLBnodes];
-         D.f[DIR_0P0   ] = &DD[DIR_0M0   *numberOfLBnodes];
-         D.f[DIR_00M   ] = &DD[DIR_00P   *numberOfLBnodes];
-         D.f[DIR_00P   ] = &DD[DIR_00M   *numberOfLBnodes];
-         D.f[DIR_MM0  ] = &DD[DIR_PP0  *numberOfLBnodes];
-         D.f[DIR_PP0  ] = &DD[DIR_MM0  *numberOfLBnodes];
-         D.f[DIR_MP0  ] = &DD[DIR_PM0  *numberOfLBnodes];
-         D.f[DIR_PM0  ] = &DD[DIR_MP0  *numberOfLBnodes];
-         D.f[DIR_M0M  ] = &DD[DIR_P0P  *numberOfLBnodes];
-         D.f[DIR_P0P  ] = &DD[DIR_M0M  *numberOfLBnodes];
-         D.f[DIR_M0P  ] = &DD[DIR_P0M  *numberOfLBnodes];
-         D.f[DIR_P0M  ] = &DD[DIR_M0P  *numberOfLBnodes];
-         D.f[DIR_0MM  ] = &DD[DIR_0PP  *numberOfLBnodes];
-         D.f[DIR_0PP  ] = &DD[DIR_0MM  *numberOfLBnodes];
-         D.f[DIR_0MP  ] = &DD[DIR_0PM  *numberOfLBnodes];
-         D.f[DIR_0PM  ] = &DD[DIR_0MP  *numberOfLBnodes];
-         D.f[DIR_000] = &DD[DIR_000*numberOfLBnodes];
-         D.f[DIR_PPP ] = &DD[DIR_MMM *numberOfLBnodes];
-         D.f[DIR_MMP ] = &DD[DIR_PPM *numberOfLBnodes];
-         D.f[DIR_PMP ] = &DD[DIR_MPM *numberOfLBnodes];
-         D.f[DIR_MPP ] = &DD[DIR_PMM *numberOfLBnodes];
-         D.f[DIR_PPM ] = &DD[DIR_MMP *numberOfLBnodes];
-         D.f[DIR_MMM ] = &DD[DIR_PPP *numberOfLBnodes];
-         D.f[DIR_PMM ] = &DD[DIR_MPP *numberOfLBnodes];
-         D.f[DIR_MPM ] = &DD[DIR_PMP *numberOfLBnodes];
+         D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
       }
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       //Test
@@ -1055,77 +1081,77 @@ __global__ void QDeviceIncompHighNu27(real* DD,
 
 //////////////////////////////////////////////////////////////////////////////
 __global__ void QDeviceCompHighNu27(
-												 real* DD, 
-												 int* k_Q, 
-												 real* QQ,
-												 unsigned int numberOfBCnodes, 
-												 real om1, 
-												 unsigned int* neighborX,
-												 unsigned int* neighborY,
-												 unsigned int* neighborZ,
-												 unsigned int size_Mat, 
-												 bool isEvenTimestep)
+    real* DD, 
+    int* k_Q, 
+    real* QQ,
+    unsigned int numberOfBCnodes, 
+    real om1, 
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes, 
+    bool isEvenTimestep)
 {
    Distributions27 D;
    if (isEvenTimestep==true)
    {
-      D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+      D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
    } 
    else
    {
-      D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+      D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
    }
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  x = threadIdx.x;  // Globaler x-Index 
@@ -1146,24 +1172,24 @@ __global__ void QDeviceCompHighNu27(
             *q_dirBE,  *q_dirTW,  *q_dirTN,  *q_dirBS,  *q_dirBN,  *q_dirTS,
             *q_dirTNE, *q_dirTSW, *q_dirTSE, *q_dirTNW, *q_dirBNE, *q_dirBSW,
             *q_dirBSE, *q_dirBNW; 
-      q_dirE   = &QQ[DIR_P00   * numberOfBCnodes];
-      q_dirW   = &QQ[DIR_M00   * numberOfBCnodes];
-      q_dirN   = &QQ[DIR_0P0   * numberOfBCnodes];
-      q_dirS   = &QQ[DIR_0M0   * numberOfBCnodes];
-      q_dirT   = &QQ[DIR_00P   * numberOfBCnodes];
-      q_dirB   = &QQ[DIR_00M   * numberOfBCnodes];
-      q_dirNE  = &QQ[DIR_PP0  * numberOfBCnodes];
-      q_dirSW  = &QQ[DIR_MM0  * numberOfBCnodes];
-      q_dirSE  = &QQ[DIR_PM0  * numberOfBCnodes];
-      q_dirNW  = &QQ[DIR_MP0  * numberOfBCnodes];
-      q_dirTE  = &QQ[DIR_P0P  * numberOfBCnodes];
-      q_dirBW  = &QQ[DIR_M0M  * numberOfBCnodes];
-      q_dirBE  = &QQ[DIR_P0M  * numberOfBCnodes];
-      q_dirTW  = &QQ[DIR_M0P  * numberOfBCnodes];
-      q_dirTN  = &QQ[DIR_0PP  * numberOfBCnodes];
-      q_dirBS  = &QQ[DIR_0MM  * numberOfBCnodes];
-      q_dirBN  = &QQ[DIR_0PM  * numberOfBCnodes];
-      q_dirTS  = &QQ[DIR_0MP  * numberOfBCnodes];
+      q_dirE   = &QQ[DIR_P00 * numberOfBCnodes];
+      q_dirW   = &QQ[DIR_M00 * numberOfBCnodes];
+      q_dirN   = &QQ[DIR_0P0 * numberOfBCnodes];
+      q_dirS   = &QQ[DIR_0M0 * numberOfBCnodes];
+      q_dirT   = &QQ[DIR_00P * numberOfBCnodes];
+      q_dirB   = &QQ[DIR_00M * numberOfBCnodes];
+      q_dirNE  = &QQ[DIR_PP0 * numberOfBCnodes];
+      q_dirSW  = &QQ[DIR_MM0 * numberOfBCnodes];
+      q_dirSE  = &QQ[DIR_PM0 * numberOfBCnodes];
+      q_dirNW  = &QQ[DIR_MP0 * numberOfBCnodes];
+      q_dirTE  = &QQ[DIR_P0P * numberOfBCnodes];
+      q_dirBW  = &QQ[DIR_M0M * numberOfBCnodes];
+      q_dirBE  = &QQ[DIR_P0M * numberOfBCnodes];
+      q_dirTW  = &QQ[DIR_M0P * numberOfBCnodes];
+      q_dirTN  = &QQ[DIR_0PP * numberOfBCnodes];
+      q_dirBS  = &QQ[DIR_0MM * numberOfBCnodes];
+      q_dirBN  = &QQ[DIR_0PM * numberOfBCnodes];
+      q_dirTS  = &QQ[DIR_0MP * numberOfBCnodes];
       q_dirTNE = &QQ[DIR_PPP * numberOfBCnodes];
       q_dirTSW = &QQ[DIR_MMP * numberOfBCnodes];
       q_dirTSE = &QQ[DIR_PMP * numberOfBCnodes];
@@ -1206,58 +1232,58 @@ __global__ void QDeviceCompHighNu27(
       real f_E,  f_W,  f_N,  f_S,  f_T,  f_B,   f_NE,  f_SW,  f_SE,  f_NW,  f_TE,  f_BW,  f_BE,
             f_TW, f_TN, f_BS, f_BN, f_TS, f_TNE, f_TSW, f_TSE, f_TNW, f_BNE, f_BSW, f_BSE, f_BNW;
 
-      f_E   = (D.f[DIR_P00   ])[ke   ];
-      f_W   = (D.f[DIR_M00   ])[kw   ];
-      f_N   = (D.f[DIR_0P0   ])[kn   ];
-      f_S   = (D.f[DIR_0M0   ])[ks   ];
-      f_T   = (D.f[DIR_00P   ])[kt   ];
-      f_B   = (D.f[DIR_00M   ])[kb   ];
-      f_NE  = (D.f[DIR_PP0  ])[kne  ];
-      f_SW  = (D.f[DIR_MM0  ])[ksw  ];
-      f_SE  = (D.f[DIR_PM0  ])[kse  ];
-      f_NW  = (D.f[DIR_MP0  ])[knw  ];
-      f_TE  = (D.f[DIR_P0P  ])[kte  ];
-      f_BW  = (D.f[DIR_M0M  ])[kbw  ];
-      f_BE  = (D.f[DIR_P0M  ])[kbe  ];
-      f_TW  = (D.f[DIR_M0P  ])[ktw  ];
-      f_TN  = (D.f[DIR_0PP  ])[ktn  ];
-      f_BS  = (D.f[DIR_0MM  ])[kbs  ];
-      f_BN  = (D.f[DIR_0PM  ])[kbn  ];
-      f_TS  = (D.f[DIR_0MP  ])[kts  ];
-      f_TNE = (D.f[DIR_PPP ])[ktne ];
-      f_TSW = (D.f[DIR_MMP ])[ktsw ];
-      f_TSE = (D.f[DIR_PMP ])[ktse ];
-      f_TNW = (D.f[DIR_MPP ])[ktnw ];
-      f_BNE = (D.f[DIR_PPM ])[kbne ];
-      f_BSW = (D.f[DIR_MMM ])[kbsw ];
-      f_BSE = (D.f[DIR_PMM ])[kbse ];
-      f_BNW = (D.f[DIR_MPM ])[kbnw ];
-      //f_W    = (D.f[DIR_P00   ])[ke   ];
-      //f_E    = (D.f[DIR_M00   ])[kw   ];
-      //f_S    = (D.f[DIR_0P0   ])[kn   ];
-      //f_N    = (D.f[DIR_0M0   ])[ks   ];
-      //f_B    = (D.f[DIR_00P   ])[kt   ];
-      //f_T    = (D.f[DIR_00M   ])[kb   ];
-      //f_SW   = (D.f[DIR_PP0  ])[kne  ];
-      //f_NE   = (D.f[DIR_MM0  ])[ksw  ];
-      //f_NW   = (D.f[DIR_PM0  ])[kse  ];
-      //f_SE   = (D.f[DIR_MP0  ])[knw  ];
-      //f_BW   = (D.f[DIR_P0P  ])[kte  ];
-      //f_TE   = (D.f[DIR_M0M  ])[kbw  ];
-      //f_TW   = (D.f[DIR_P0M  ])[kbe  ];
-      //f_BE   = (D.f[DIR_M0P  ])[ktw  ];
-      //f_BS   = (D.f[DIR_0PP  ])[ktn  ];
-      //f_TN   = (D.f[DIR_0MM  ])[kbs  ];
-      //f_TS   = (D.f[DIR_0PM  ])[kbn  ];
-      //f_BN   = (D.f[DIR_0MP  ])[kts  ];
-      //f_BSW  = (D.f[DIR_PPP ])[ktne ];
-      //f_BNE  = (D.f[DIR_MMP ])[ktsw ];
-      //f_BNW  = (D.f[DIR_PMP ])[ktse ];
-      //f_BSE  = (D.f[DIR_MPP ])[ktnw ];
-      //f_TSW  = (D.f[DIR_PPM ])[kbne ];
-      //f_TNE  = (D.f[DIR_MMM ])[kbsw ];
-      //f_TNW  = (D.f[DIR_PMM ])[kbse ];
-      //f_TSE  = (D.f[DIR_MPM ])[kbnw ];
+      f_E   = (D.f[DIR_P00])[ke   ];
+      f_W   = (D.f[DIR_M00])[kw   ];
+      f_N   = (D.f[DIR_0P0])[kn   ];
+      f_S   = (D.f[DIR_0M0])[ks   ];
+      f_T   = (D.f[DIR_00P])[kt   ];
+      f_B   = (D.f[DIR_00M])[kb   ];
+      f_NE  = (D.f[DIR_PP0])[kne  ];
+      f_SW  = (D.f[DIR_MM0])[ksw  ];
+      f_SE  = (D.f[DIR_PM0])[kse  ];
+      f_NW  = (D.f[DIR_MP0])[knw  ];
+      f_TE  = (D.f[DIR_P0P])[kte  ];
+      f_BW  = (D.f[DIR_M0M])[kbw  ];
+      f_BE  = (D.f[DIR_P0M])[kbe  ];
+      f_TW  = (D.f[DIR_M0P])[ktw  ];
+      f_TN  = (D.f[DIR_0PP])[ktn  ];
+      f_BS  = (D.f[DIR_0MM])[kbs  ];
+      f_BN  = (D.f[DIR_0PM])[kbn  ];
+      f_TS  = (D.f[DIR_0MP])[kts  ];
+      f_TNE = (D.f[DIR_PPP])[ktne ];
+      f_TSW = (D.f[DIR_MMP])[ktsw ];
+      f_TSE = (D.f[DIR_PMP])[ktse ];
+      f_TNW = (D.f[DIR_MPP])[ktnw ];
+      f_BNE = (D.f[DIR_PPM])[kbne ];
+      f_BSW = (D.f[DIR_MMM])[kbsw ];
+      f_BSE = (D.f[DIR_PMM])[kbse ];
+      f_BNW = (D.f[DIR_MPM])[kbnw ];
+      //f_W    = (D.f[DIR_P00])[ke   ];
+      //f_E    = (D.f[DIR_M00])[kw   ];
+      //f_S    = (D.f[DIR_0P0])[kn   ];
+      //f_N    = (D.f[DIR_0M0])[ks   ];
+      //f_B    = (D.f[DIR_00P])[kt   ];
+      //f_T    = (D.f[DIR_00M])[kb   ];
+      //f_SW   = (D.f[DIR_PP0])[kne  ];
+      //f_NE   = (D.f[DIR_MM0])[ksw  ];
+      //f_NW   = (D.f[DIR_PM0])[kse  ];
+      //f_SE   = (D.f[DIR_MP0])[knw  ];
+      //f_BW   = (D.f[DIR_P0P])[kte  ];
+      //f_TE   = (D.f[DIR_M0M])[kbw  ];
+      //f_TW   = (D.f[DIR_P0M])[kbe  ];
+      //f_BE   = (D.f[DIR_M0P])[ktw  ];
+      //f_BS   = (D.f[DIR_0PP])[ktn  ];
+      //f_TN   = (D.f[DIR_0MM])[kbs  ];
+      //f_TS   = (D.f[DIR_0PM])[kbn  ];
+      //f_BN   = (D.f[DIR_0MP])[kts  ];
+      //f_BSW  = (D.f[DIR_PPP])[ktne ];
+      //f_BNE  = (D.f[DIR_MMP])[ktsw ];
+      //f_BNW  = (D.f[DIR_PMP])[ktse ];
+      //f_BSE  = (D.f[DIR_MPP])[ktnw ];
+      //f_TSW  = (D.f[DIR_PPM])[kbne ];
+      //f_TNE  = (D.f[DIR_MMM])[kbsw ];
+      //f_TNW  = (D.f[DIR_PMM])[kbse ];
+      //f_TSE  = (D.f[DIR_MPM])[kbnw ];
       ////////////////////////////////////////////////////////////////////////////////
       real vx1, vx2, vx3, drho, feq, q;
       drho   =  f_TSE + f_TNW + f_TNE + f_TSW + f_BSE + f_BNW + f_BNE + f_BSW +
@@ -1282,63 +1308,63 @@ __global__ void QDeviceCompHighNu27(
       //////////////////////////////////////////////////////////////////////////
       if (isEvenTimestep==false)
       {
-         D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+         D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
       } 
       else
       {
-         D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+         D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
       }
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       //Test
@@ -1629,16 +1655,16 @@ __global__ void QDeviceCompHighNu27(
 
 //////////////////////////////////////////////////////////////////////////////
 __global__ void QDeviceComp27(
-										 real* distributions, 
-										 int* subgridDistanceIndices, 
-										 real* subgridDistances,
-										 unsigned int numberOfBCnodes, 
-										 real omega, 
-										 unsigned int* neighborX,
-										 unsigned int* neighborY,
-										 unsigned int* neighborZ,
-										 unsigned int numberOfLBnodes, 
-										 bool isEvenTimestep)
+    real* distributions, 
+    int* subgridDistanceIndices, 
+    real* subgridDistances,
+    unsigned int numberOfBCnodes, 
+    real omega, 
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes, 
+    bool isEvenTimestep)
 {
    //////////////////////////////////////////////////////////////////////////
    //! The no-slip boundary condition is executed in the following steps
@@ -1646,16 +1672,9 @@ __global__ void QDeviceComp27(
    ////////////////////////////////////////////////////////////////////////////////
    //! - Get node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
    //!
-   const unsigned  x = threadIdx.x;  // global x-index 
-   const unsigned  y = blockIdx.x;   // global y-index 
-   const unsigned  z = blockIdx.y;   // global z-index 
-
-   const unsigned nx = blockDim.x;
-   const unsigned ny = gridDim.x;
+   const unsigned nodeIndex = getNodeIndex();
 
-   const unsigned k = nx*(ny*z + y) + x;
-
-   if(k < numberOfBCnodes)
+   if(nodeIndex < numberOfBCnodes)
    {
       //////////////////////////////////////////////////////////////////////////
       //! - Read distributions: style of reading and writing the distributions from/to stored arrays dependent on timestep is based on the esoteric twist algorithm \ref
@@ -1673,7 +1692,7 @@ __global__ void QDeviceComp27(
       ////////////////////////////////////////////////////////////////////////////////
       //! - Set neighbor indices (necessary for indirect addressing)
       //!
-      unsigned int indexOfBCnode  = subgridDistanceIndices[k];
+      unsigned int indexOfBCnode  = subgridDistanceIndices[nodeIndex];
       unsigned int kzero= indexOfBCnode;
       unsigned int ke   = indexOfBCnode;
       unsigned int kw   = neighborX[indexOfBCnode];
@@ -1705,32 +1724,32 @@ __global__ void QDeviceComp27(
       ////////////////////////////////////////////////////////////////////////////////
       //! - Set local distributions
       //!
-      real f_W    = (dist.f[DIR_P00   ])[ke   ];
-      real f_E    = (dist.f[DIR_M00   ])[kw   ];
-      real f_S    = (dist.f[DIR_0P0   ])[kn   ];
-      real f_N    = (dist.f[DIR_0M0   ])[ks   ];
-      real f_B    = (dist.f[DIR_00P   ])[kt   ];
-      real f_T    = (dist.f[DIR_00M   ])[kb   ];
-      real f_SW   = (dist.f[DIR_PP0  ])[kne  ];
-      real f_NE   = (dist.f[DIR_MM0  ])[ksw  ];
-      real f_NW   = (dist.f[DIR_PM0  ])[kse  ];
-      real f_SE   = (dist.f[DIR_MP0  ])[knw  ];
-      real f_BW   = (dist.f[DIR_P0P  ])[kte  ];
-      real f_TE   = (dist.f[DIR_M0M  ])[kbw  ];
-      real f_TW   = (dist.f[DIR_P0M  ])[kbe  ];
-      real f_BE   = (dist.f[DIR_M0P  ])[ktw  ];
-      real f_BS   = (dist.f[DIR_0PP  ])[ktn  ];
-      real f_TN   = (dist.f[DIR_0MM  ])[kbs  ];
-      real f_TS   = (dist.f[DIR_0PM  ])[kbn  ];
-      real f_BN   = (dist.f[DIR_0MP  ])[kts  ];
-      real f_BSW  = (dist.f[DIR_PPP ])[ktne ];
-      real f_BNE  = (dist.f[DIR_MMP ])[ktsw ];
-      real f_BNW  = (dist.f[DIR_PMP ])[ktse ];
-      real f_BSE  = (dist.f[DIR_MPP ])[ktnw ];
-      real f_TSW  = (dist.f[DIR_PPM ])[kbne ];
-      real f_TNE  = (dist.f[DIR_MMM ])[kbsw ];
-      real f_TNW  = (dist.f[DIR_PMM ])[kbse ];
-      real f_TSE  = (dist.f[DIR_MPM ])[kbnw ];
+      real f_W    = (dist.f[DIR_P00])[ke   ];
+      real f_E    = (dist.f[DIR_M00])[kw   ];
+      real f_S    = (dist.f[DIR_0P0])[kn   ];
+      real f_N    = (dist.f[DIR_0M0])[ks   ];
+      real f_B    = (dist.f[DIR_00P])[kt   ];
+      real f_T    = (dist.f[DIR_00M])[kb   ];
+      real f_SW   = (dist.f[DIR_PP0])[kne  ];
+      real f_NE   = (dist.f[DIR_MM0])[ksw  ];
+      real f_NW   = (dist.f[DIR_PM0])[kse  ];
+      real f_SE   = (dist.f[DIR_MP0])[knw  ];
+      real f_BW   = (dist.f[DIR_P0P])[kte  ];
+      real f_TE   = (dist.f[DIR_M0M])[kbw  ];
+      real f_TW   = (dist.f[DIR_P0M])[kbe  ];
+      real f_BE   = (dist.f[DIR_M0P])[ktw  ];
+      real f_BS   = (dist.f[DIR_0PP])[ktn  ];
+      real f_TN   = (dist.f[DIR_0MM])[kbs  ];
+      real f_TS   = (dist.f[DIR_0PM])[kbn  ];
+      real f_BN   = (dist.f[DIR_0MP])[kts  ];
+      real f_BSW  = (dist.f[DIR_PPP])[ktne ];
+      real f_BNE  = (dist.f[DIR_MMP])[ktsw ];
+      real f_BNW  = (dist.f[DIR_PMP])[ktse ];
+      real f_BSE  = (dist.f[DIR_MPP])[ktnw ];
+      real f_TSW  = (dist.f[DIR_PPM])[kbne ];
+      real f_TNE  = (dist.f[DIR_MMM])[kbsw ];
+      real f_TNW  = (dist.f[DIR_PMM])[kbse ];
+      real f_TSE  = (dist.f[DIR_MPM])[kbnw ];
 
       ////////////////////////////////////////////////////////////////////////////////
       //! - Calculate macroscopic quantities
@@ -1761,7 +1780,7 @@ __global__ void QDeviceComp27(
        ////////////////////////////////////////////////////////////////////////////////
       //! - Update distributions with subgrid distance (q) between zero and one
       real feq, q, velocityLB;
-      q = (subgridD.q[DIR_P00])[k];
+      q = (subgridD.q[DIR_P00])[nodeIndex];
       if (q>=c0o1 && q<=c1o1) // only update distribution for q between zero and one
       {
          velocityLB = vx1;
@@ -1769,7 +1788,7 @@ __global__ void QDeviceComp27(
          (dist.f[DIR_M00])[kw] = getInterpolatedDistributionForNoSlipBC(q, f_E, f_W, feq, omega);
       }
 
-      q = (subgridD.q[DIR_M00])[k];
+      q = (subgridD.q[DIR_M00])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx1;
@@ -1777,7 +1796,7 @@ __global__ void QDeviceComp27(
          (dist.f[DIR_P00])[ke] = getInterpolatedDistributionForNoSlipBC(q, f_W, f_E, feq, omega);
       }
 
-      q = (subgridD.q[DIR_0P0])[k];
+      q = (subgridD.q[DIR_0P0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx2;
@@ -1785,7 +1804,7 @@ __global__ void QDeviceComp27(
          (dist.f[DIR_0M0])[ks] = getInterpolatedDistributionForNoSlipBC(q, f_N, f_S, feq, omega);
       }
 
-      q = (subgridD.q[DIR_0M0])[k];
+      q = (subgridD.q[DIR_0M0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx2;
@@ -1793,7 +1812,7 @@ __global__ void QDeviceComp27(
          (dist.f[DIR_0P0])[kn] = getInterpolatedDistributionForNoSlipBC(q, f_S, f_N, feq, omega);
       }
 
-      q = (subgridD.q[DIR_00P])[k];
+      q = (subgridD.q[DIR_00P])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx3;
@@ -1801,7 +1820,7 @@ __global__ void QDeviceComp27(
          (dist.f[DIR_00M])[kb] = getInterpolatedDistributionForNoSlipBC(q, f_T, f_B, feq, omega);
       }
 
-      q = (subgridD.q[DIR_00M])[k];
+      q = (subgridD.q[DIR_00M])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx3;
@@ -1809,7 +1828,7 @@ __global__ void QDeviceComp27(
          (dist.f[DIR_00P])[kt] = getInterpolatedDistributionForNoSlipBC(q, f_B, f_T, feq, omega);
       }
 
-      q = (subgridD.q[DIR_PP0])[k];
+      q = (subgridD.q[DIR_PP0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx1 + vx2;
@@ -1817,7 +1836,7 @@ __global__ void QDeviceComp27(
          (dist.f[DIR_MM0])[ksw] = getInterpolatedDistributionForNoSlipBC(q, f_NE, f_SW, feq, omega);
       }
 
-      q = (subgridD.q[DIR_MM0])[k];
+      q = (subgridD.q[DIR_MM0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx1 - vx2;
@@ -1825,7 +1844,7 @@ __global__ void QDeviceComp27(
          (dist.f[DIR_PP0])[kne] = getInterpolatedDistributionForNoSlipBC(q, f_SW, f_NE, feq, omega);
       }
 
-      q = (subgridD.q[DIR_PM0])[k];
+      q = (subgridD.q[DIR_PM0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx1 - vx2;
@@ -1833,7 +1852,7 @@ __global__ void QDeviceComp27(
          (dist.f[DIR_MP0])[knw] = getInterpolatedDistributionForNoSlipBC(q, f_SE, f_NW, feq, omega);
       }
 
-      q = (subgridD.q[DIR_MP0])[k];
+      q = (subgridD.q[DIR_MP0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx1 + vx2;
@@ -1841,7 +1860,7 @@ __global__ void QDeviceComp27(
          (dist.f[DIR_PM0])[kse] = getInterpolatedDistributionForNoSlipBC(q, f_NW, f_SE, feq, omega);
       }
 
-      q = (subgridD.q[DIR_P0P])[k];
+      q = (subgridD.q[DIR_P0P])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx1 + vx3;
@@ -1849,7 +1868,7 @@ __global__ void QDeviceComp27(
          (dist.f[DIR_M0M])[kbw] = getInterpolatedDistributionForNoSlipBC(q, f_TE, f_BW, feq, omega);
       }
 
-      q = (subgridD.q[DIR_M0M])[k];
+      q = (subgridD.q[DIR_M0M])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx1 - vx3;
@@ -1857,7 +1876,7 @@ __global__ void QDeviceComp27(
          (dist.f[DIR_P0P])[kte] = getInterpolatedDistributionForNoSlipBC(q, f_BW, f_TE, feq, omega);
       }
 
-      q = (subgridD.q[DIR_P0M])[k];
+      q = (subgridD.q[DIR_P0M])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx1 - vx3;
@@ -1865,7 +1884,7 @@ __global__ void QDeviceComp27(
          (dist.f[DIR_M0P])[ktw] = getInterpolatedDistributionForNoSlipBC(q, f_BE, f_TW, feq, omega);
       }
 
-      q = (subgridD.q[DIR_M0P])[k];
+      q = (subgridD.q[DIR_M0P])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx1 + vx3;
@@ -1873,7 +1892,7 @@ __global__ void QDeviceComp27(
          (dist.f[DIR_P0M])[kbe] = getInterpolatedDistributionForNoSlipBC(q, f_TW, f_BE, feq, omega);
       }
 
-      q = (subgridD.q[DIR_0PP])[k];
+      q = (subgridD.q[DIR_0PP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx2 + vx3;
@@ -1881,7 +1900,7 @@ __global__ void QDeviceComp27(
          (dist.f[DIR_0MM])[kbs] = getInterpolatedDistributionForNoSlipBC(q, f_TN, f_BS, feq, omega);
       }
 
-      q = (subgridD.q[DIR_0MM])[k];
+      q = (subgridD.q[DIR_0MM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx2 - vx3;
@@ -1889,7 +1908,7 @@ __global__ void QDeviceComp27(
          (dist.f[DIR_0PP])[ktn] = getInterpolatedDistributionForNoSlipBC(q, f_BS, f_TN, feq, omega);
       }
 
-      q = (subgridD.q[DIR_0PM])[k];
+      q = (subgridD.q[DIR_0PM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx2 - vx3;
@@ -1897,7 +1916,7 @@ __global__ void QDeviceComp27(
          (dist.f[DIR_0MP])[kts] = getInterpolatedDistributionForNoSlipBC(q, f_BN, f_TS, feq, omega);
       }
 
-      q = (subgridD.q[DIR_0MP])[k];
+      q = (subgridD.q[DIR_0MP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx2 + vx3;
@@ -1905,7 +1924,7 @@ __global__ void QDeviceComp27(
          (dist.f[DIR_0PM])[kbn] = getInterpolatedDistributionForNoSlipBC(q, f_TS, f_BN, feq, omega);
       }
 
-      q = (subgridD.q[DIR_PPP])[k];
+      q = (subgridD.q[DIR_PPP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx1 + vx2 + vx3;
@@ -1913,7 +1932,7 @@ __global__ void QDeviceComp27(
          (dist.f[DIR_MMM])[kbsw] = getInterpolatedDistributionForNoSlipBC(q, f_TNE, f_BSW, feq, omega);
       }
 
-      q = (subgridD.q[DIR_MMM])[k];
+      q = (subgridD.q[DIR_MMM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx1 - vx2 - vx3;
@@ -1921,7 +1940,7 @@ __global__ void QDeviceComp27(
          (dist.f[DIR_PPP])[ktne] = getInterpolatedDistributionForNoSlipBC(q, f_BSW, f_TNE, feq, omega);
       }
 
-      q = (subgridD.q[DIR_PPM])[k];
+      q = (subgridD.q[DIR_PPM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx1 + vx2 - vx3;
@@ -1929,7 +1948,7 @@ __global__ void QDeviceComp27(
          (dist.f[DIR_MMP])[ktsw] = getInterpolatedDistributionForNoSlipBC(q, f_BNE, f_TSW, feq, omega);
       }
 
-      q = (subgridD.q[DIR_MMP])[k];
+      q = (subgridD.q[DIR_MMP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx1 - vx2 + vx3;
@@ -1937,7 +1956,7 @@ __global__ void QDeviceComp27(
          (dist.f[DIR_PPM])[kbne] = getInterpolatedDistributionForNoSlipBC(q, f_TSW, f_BNE, feq, omega);
       }
 
-      q = (subgridD.q[DIR_PMP])[k];
+      q = (subgridD.q[DIR_PMP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx1 - vx2 + vx3;
@@ -1945,7 +1964,7 @@ __global__ void QDeviceComp27(
          (dist.f[DIR_MPM])[kbnw] = getInterpolatedDistributionForNoSlipBC(q, f_TSE, f_BNW, feq, omega);
       }
 
-      q = (subgridD.q[DIR_MPM])[k];
+      q = (subgridD.q[DIR_MPM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx1 + vx2 - vx3;
@@ -1953,7 +1972,7 @@ __global__ void QDeviceComp27(
          (dist.f[DIR_PMP])[ktse] = getInterpolatedDistributionForNoSlipBC(q, f_BNW, f_TSE, feq, omega);
       }
 
-      q = (subgridD.q[DIR_PMM])[k];
+      q = (subgridD.q[DIR_PMM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx1 - vx2 - vx3;
@@ -1961,7 +1980,7 @@ __global__ void QDeviceComp27(
          (dist.f[DIR_MPP])[ktnw] = getInterpolatedDistributionForNoSlipBC(q, f_BSE, f_TNW, feq, omega);
       }
 
-      q = (subgridD.q[DIR_MPP])[k];
+      q = (subgridD.q[DIR_MPP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx1 + vx2 + vx3;
@@ -2011,16 +2030,17 @@ __global__ void QDeviceComp27(
 
 
 //////////////////////////////////////////////////////////////////////////////
-__global__ void QDevice27(real* distributions, 
-                                     int* subgridDistanceIndices, 
-                                     real* subgridDistances,
-                                     unsigned int numberOfBCnodes, 
-                                     real omega, 
-                                     unsigned int* neighborX,
-                                     unsigned int* neighborY,
-                                     unsigned int* neighborZ,
-                                     unsigned int numberOfLBnodes, 
-                                     bool isEvenTimestep)
+__global__ void QDevice27(
+    real* distributions, 
+    int* subgridDistanceIndices, 
+    real* subgridDistances,
+    unsigned int numberOfBCnodes, 
+    real omega, 
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes, 
+    bool isEvenTimestep)
 {
    //////////////////////////////////////////////////////////////////////////
    //! The no-slip boundary condition is executed in the following steps
@@ -2028,19 +2048,12 @@ __global__ void QDevice27(real* distributions,
    ////////////////////////////////////////////////////////////////////////////////
    //! - Get node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
    //!
-   const unsigned  x = threadIdx.x;  // global x-index 
-   const unsigned  y = blockIdx.x;   // global y-index 
-   const unsigned  z = blockIdx.y;   // global z-index 
-
-   const unsigned nx = blockDim.x;
-   const unsigned ny = gridDim.x;
-
-   const unsigned k = nx*(ny*z + y) + x;
+   const unsigned nodeIndex = getNodeIndex();
 
    //////////////////////////////////////////////////////////////////////////
    //! - Run for all indices in size of boundary condition (numberOfBCnodes)
    //!
-   if(k < numberOfBCnodes)
+   if(nodeIndex < numberOfBCnodes)
    {
 
       //////////////////////////////////////////////////////////////////////////
@@ -2059,7 +2072,7 @@ __global__ void QDevice27(real* distributions,
       ////////////////////////////////////////////////////////////////////////////////
       //! - Set neighbor indices (necessary for indirect addressing)
       //!
-      unsigned int indexOfBCnode  = subgridDistanceIndices[k];
+      unsigned int indexOfBCnode  = subgridDistanceIndices[nodeIndex];
       unsigned int kzero= indexOfBCnode;
       unsigned int ke   = indexOfBCnode;
       unsigned int kw   = neighborX[indexOfBCnode];
@@ -2091,32 +2104,32 @@ __global__ void QDevice27(real* distributions,
       ////////////////////////////////////////////////////////////////////////////////
       //! - Set local distributions
       //!
-      real f_W    = (dist.f[DIR_P00   ])[ke   ];
-      real f_E    = (dist.f[DIR_M00   ])[kw   ];
-      real f_S    = (dist.f[DIR_0P0   ])[kn   ];
-      real f_N    = (dist.f[DIR_0M0   ])[ks   ];
-      real f_B    = (dist.f[DIR_00P   ])[kt   ];
-      real f_T    = (dist.f[DIR_00M   ])[kb   ];
-      real f_SW   = (dist.f[DIR_PP0  ])[kne  ];
-      real f_NE   = (dist.f[DIR_MM0  ])[ksw  ];
-      real f_NW   = (dist.f[DIR_PM0  ])[kse  ];
-      real f_SE   = (dist.f[DIR_MP0  ])[knw  ];
-      real f_BW   = (dist.f[DIR_P0P  ])[kte  ];
-      real f_TE   = (dist.f[DIR_M0M  ])[kbw  ];
-      real f_TW   = (dist.f[DIR_P0M  ])[kbe  ];
-      real f_BE   = (dist.f[DIR_M0P  ])[ktw  ];
-      real f_BS   = (dist.f[DIR_0PP  ])[ktn  ];
-      real f_TN   = (dist.f[DIR_0MM  ])[kbs  ];
-      real f_TS   = (dist.f[DIR_0PM  ])[kbn  ];
-      real f_BN   = (dist.f[DIR_0MP  ])[kts  ];
-      real f_BSW  = (dist.f[DIR_PPP ])[ktne ];
-      real f_BNE  = (dist.f[DIR_MMP ])[ktsw ];
-      real f_BNW  = (dist.f[DIR_PMP ])[ktse ];
-      real f_BSE  = (dist.f[DIR_MPP ])[ktnw ];
-      real f_TSW  = (dist.f[DIR_PPM ])[kbne ];
-      real f_TNE  = (dist.f[DIR_MMM ])[kbsw ];
-      real f_TNW  = (dist.f[DIR_PMM ])[kbse ];
-      real f_TSE  = (dist.f[DIR_MPM ])[kbnw ];
+      real f_W    = (dist.f[DIR_P00])[ke   ];
+      real f_E    = (dist.f[DIR_M00])[kw   ];
+      real f_S    = (dist.f[DIR_0P0])[kn   ];
+      real f_N    = (dist.f[DIR_0M0])[ks   ];
+      real f_B    = (dist.f[DIR_00P])[kt   ];
+      real f_T    = (dist.f[DIR_00M])[kb   ];
+      real f_SW   = (dist.f[DIR_PP0])[kne  ];
+      real f_NE   = (dist.f[DIR_MM0])[ksw  ];
+      real f_NW   = (dist.f[DIR_PM0])[kse  ];
+      real f_SE   = (dist.f[DIR_MP0])[knw  ];
+      real f_BW   = (dist.f[DIR_P0P])[kte  ];
+      real f_TE   = (dist.f[DIR_M0M])[kbw  ];
+      real f_TW   = (dist.f[DIR_P0M])[kbe  ];
+      real f_BE   = (dist.f[DIR_M0P])[ktw  ];
+      real f_BS   = (dist.f[DIR_0PP])[ktn  ];
+      real f_TN   = (dist.f[DIR_0MM])[kbs  ];
+      real f_TS   = (dist.f[DIR_0PM])[kbn  ];
+      real f_BN   = (dist.f[DIR_0MP])[kts  ];
+      real f_BSW  = (dist.f[DIR_PPP])[ktne ];
+      real f_BNE  = (dist.f[DIR_MMP])[ktsw ];
+      real f_BNW  = (dist.f[DIR_PMP])[ktse ];
+      real f_BSE  = (dist.f[DIR_MPP])[ktnw ];
+      real f_TSW  = (dist.f[DIR_PPM])[kbne ];
+      real f_TNE  = (dist.f[DIR_MMM])[kbsw ];
+      real f_TNW  = (dist.f[DIR_PMM])[kbse ];
+      real f_TSE  = (dist.f[DIR_MPM])[kbnw ];
 
       ////////////////////////////////////////////////////////////////////////////////
       //! - Calculate macroscopic quantities
@@ -2148,7 +2161,7 @@ __global__ void QDevice27(real* distributions,
       //! - Update distributions with subgrid distance (q) between zero and one
       //!
       real feq, q, velocityLB;
-      q = (subgridD.q[DIR_P00])[k];
+      q = (subgridD.q[DIR_P00])[nodeIndex];
       if (q>=c0o1 && q<=c1o1) // only update distribution for q between zero and one
       {
          velocityLB = vx1;
@@ -2156,7 +2169,7 @@ __global__ void QDevice27(real* distributions,
          (dist.f[DIR_M00])[kw] = getInterpolatedDistributionForNoSlipBC(q, f_E, f_W, feq, omega);
       }
 
-      q = (subgridD.q[DIR_M00])[k];
+      q = (subgridD.q[DIR_M00])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx1;
@@ -2164,7 +2177,7 @@ __global__ void QDevice27(real* distributions,
          (dist.f[DIR_P00])[ke] = getInterpolatedDistributionForNoSlipBC(q, f_W, f_E, feq, omega);
       }
 
-      q = (subgridD.q[DIR_0P0])[k];
+      q = (subgridD.q[DIR_0P0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx2;
@@ -2172,7 +2185,7 @@ __global__ void QDevice27(real* distributions,
          (dist.f[DIR_0M0])[ks] = getInterpolatedDistributionForNoSlipBC(q, f_N, f_S, feq, omega);
       }
 
-      q = (subgridD.q[DIR_0M0])[k];
+      q = (subgridD.q[DIR_0M0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx2;
@@ -2180,7 +2193,7 @@ __global__ void QDevice27(real* distributions,
          (dist.f[DIR_0P0])[kn] = getInterpolatedDistributionForNoSlipBC(q, f_S, f_N, feq, omega);
       }
 
-      q = (subgridD.q[DIR_00P])[k];
+      q = (subgridD.q[DIR_00P])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx3;
@@ -2188,7 +2201,7 @@ __global__ void QDevice27(real* distributions,
          (dist.f[DIR_00M])[kb] = getInterpolatedDistributionForNoSlipBC(q, f_T, f_B, feq, omega);
       }
 
-      q = (subgridD.q[DIR_00M])[k];
+      q = (subgridD.q[DIR_00M])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx3;
@@ -2196,7 +2209,7 @@ __global__ void QDevice27(real* distributions,
          (dist.f[DIR_00P])[kt] = getInterpolatedDistributionForNoSlipBC(q, f_B, f_T, feq, omega);
       }
 
-      q = (subgridD.q[DIR_PP0])[k];
+      q = (subgridD.q[DIR_PP0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx1 + vx2;
@@ -2204,7 +2217,7 @@ __global__ void QDevice27(real* distributions,
          (dist.f[DIR_MM0])[ksw] = getInterpolatedDistributionForNoSlipBC(q, f_NE, f_SW, feq, omega);
       }
 
-      q = (subgridD.q[DIR_MM0])[k];
+      q = (subgridD.q[DIR_MM0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx1 - vx2;
@@ -2212,7 +2225,7 @@ __global__ void QDevice27(real* distributions,
          (dist.f[DIR_PP0])[kne] = getInterpolatedDistributionForNoSlipBC(q, f_SW, f_NE, feq, omega);
       }
 
-      q = (subgridD.q[DIR_PM0])[k];
+      q = (subgridD.q[DIR_PM0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx1 - vx2;
@@ -2220,7 +2233,7 @@ __global__ void QDevice27(real* distributions,
          (dist.f[DIR_MP0])[knw] = getInterpolatedDistributionForNoSlipBC(q, f_SE, f_NW, feq, omega);
       }
 
-      q = (subgridD.q[DIR_MP0])[k];
+      q = (subgridD.q[DIR_MP0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx1 + vx2;
@@ -2228,7 +2241,7 @@ __global__ void QDevice27(real* distributions,
          (dist.f[DIR_PM0])[kse] = getInterpolatedDistributionForNoSlipBC(q, f_NW, f_SE, feq, omega);
       }
 
-      q = (subgridD.q[DIR_P0P])[k];
+      q = (subgridD.q[DIR_P0P])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx1 + vx3;
@@ -2236,7 +2249,7 @@ __global__ void QDevice27(real* distributions,
          (dist.f[DIR_M0M])[kbw] = getInterpolatedDistributionForNoSlipBC(q, f_TE, f_BW, feq, omega);
       }
 
-      q = (subgridD.q[DIR_M0M])[k];
+      q = (subgridD.q[DIR_M0M])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx1 - vx3;
@@ -2244,7 +2257,7 @@ __global__ void QDevice27(real* distributions,
          (dist.f[DIR_P0P])[kte] = getInterpolatedDistributionForNoSlipBC(q, f_BW, f_TE, feq, omega);
       }
 
-      q = (subgridD.q[DIR_P0M])[k];
+      q = (subgridD.q[DIR_P0M])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx1 - vx3;
@@ -2252,7 +2265,7 @@ __global__ void QDevice27(real* distributions,
          (dist.f[DIR_M0P])[ktw] = getInterpolatedDistributionForNoSlipBC(q, f_BE, f_TW, feq, omega);
       }
 
-      q = (subgridD.q[DIR_M0P])[k];
+      q = (subgridD.q[DIR_M0P])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx1 + vx3;
@@ -2260,7 +2273,7 @@ __global__ void QDevice27(real* distributions,
          (dist.f[DIR_P0M])[kbe] = getInterpolatedDistributionForNoSlipBC(q, f_TW, f_BE, feq, omega);
       }
 
-      q = (subgridD.q[DIR_0PP])[k];
+      q = (subgridD.q[DIR_0PP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx2 + vx3;
@@ -2268,7 +2281,7 @@ __global__ void QDevice27(real* distributions,
          (dist.f[DIR_0MM])[kbs] = getInterpolatedDistributionForNoSlipBC(q, f_TN, f_BS, feq, omega);
       }
 
-      q = (subgridD.q[DIR_0MM])[k];
+      q = (subgridD.q[DIR_0MM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx2 - vx3;
@@ -2276,7 +2289,7 @@ __global__ void QDevice27(real* distributions,
          (dist.f[DIR_0PP])[ktn] = getInterpolatedDistributionForNoSlipBC(q, f_BS, f_TN, feq, omega);
       }
 
-      q = (subgridD.q[DIR_0PM])[k];
+      q = (subgridD.q[DIR_0PM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx2 - vx3;
@@ -2284,7 +2297,7 @@ __global__ void QDevice27(real* distributions,
          (dist.f[DIR_0MP])[kts] = getInterpolatedDistributionForNoSlipBC(q, f_BN, f_TS, feq, omega);
       }
 
-      q = (subgridD.q[DIR_0MP])[k];
+      q = (subgridD.q[DIR_0MP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx2 + vx3;
@@ -2292,7 +2305,7 @@ __global__ void QDevice27(real* distributions,
          (dist.f[DIR_0PM])[kbn] = getInterpolatedDistributionForNoSlipBC(q, f_TS, f_BN, feq, omega);
       }
 
-      q = (subgridD.q[DIR_PPP])[k];
+      q = (subgridD.q[DIR_PPP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx1 + vx2 + vx3;
@@ -2300,7 +2313,7 @@ __global__ void QDevice27(real* distributions,
          (dist.f[DIR_MMM])[kbsw] = getInterpolatedDistributionForNoSlipBC(q, f_TNE, f_BSW, feq, omega);
       }
 
-      q = (subgridD.q[DIR_MMM])[k];
+      q = (subgridD.q[DIR_MMM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx1 - vx2 - vx3;
@@ -2308,7 +2321,7 @@ __global__ void QDevice27(real* distributions,
          (dist.f[DIR_PPP])[ktne] = getInterpolatedDistributionForNoSlipBC(q, f_BSW, f_TNE, feq, omega);
       }
 
-      q = (subgridD.q[DIR_PPM])[k];
+      q = (subgridD.q[DIR_PPM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx1 + vx2 - vx3;
@@ -2316,7 +2329,7 @@ __global__ void QDevice27(real* distributions,
          (dist.f[DIR_MMP])[ktsw] = getInterpolatedDistributionForNoSlipBC(q, f_BNE, f_TSW, feq, omega);
       }
 
-      q = (subgridD.q[DIR_MMP])[k];
+      q = (subgridD.q[DIR_MMP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx1 - vx2 + vx3;
@@ -2324,7 +2337,7 @@ __global__ void QDevice27(real* distributions,
          (dist.f[DIR_PPM])[kbne] = getInterpolatedDistributionForNoSlipBC(q, f_TSW, f_BNE, feq, omega);
       }
 
-      q = (subgridD.q[DIR_PMP])[k];
+      q = (subgridD.q[DIR_PMP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx1 - vx2 + vx3;
@@ -2332,7 +2345,7 @@ __global__ void QDevice27(real* distributions,
          (dist.f[DIR_MPM])[kbnw] = getInterpolatedDistributionForNoSlipBC(q, f_TSE, f_BNW, feq, omega);
       }
 
-      q = (subgridD.q[DIR_MPM])[k];
+      q = (subgridD.q[DIR_MPM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx1 + vx2 - vx3;
@@ -2340,7 +2353,7 @@ __global__ void QDevice27(real* distributions,
          (dist.f[DIR_PMP])[ktse] = getInterpolatedDistributionForNoSlipBC(q, f_BNW, f_TSE, feq, omega);
       }
 
-      q = (subgridD.q[DIR_PMM])[k];
+      q = (subgridD.q[DIR_PMM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx1 - vx2 - vx3;
@@ -2348,7 +2361,7 @@ __global__ void QDevice27(real* distributions,
          (dist.f[DIR_MPP])[ktnw] = getInterpolatedDistributionForNoSlipBC(q, f_BSE, f_TNW, feq, omega);
       }
 
-      q = (subgridD.q[DIR_MPP])[k];
+      q = (subgridD.q[DIR_MPP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx1 + vx2 + vx3;
@@ -2398,15 +2411,16 @@ __global__ void QDevice27(real* distributions,
 
 
 //////////////////////////////////////////////////////////////////////////////
-__global__ void BBDevice27(real* distributions, 
-                                     int* subgridDistanceIndices, 
-                                     real* subgridDistances,
-                                     unsigned int numberOfBCnodes, 
-                                     unsigned int* neighborX,
-                                     unsigned int* neighborY,
-                                     unsigned int* neighborZ,
-                                     unsigned int numberOfLBnodes, 
-                                     bool isEvenTimestep)
+__global__ void BBDevice27(
+    real* distributions, 
+    int* subgridDistanceIndices, 
+    real* subgridDistances,
+    unsigned int numberOfBCnodes, 
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes, 
+    bool isEvenTimestep)
 {
    //////////////////////////////////////////////////////////////////////////
    //! The no-slip boundary condition is executed in the following steps
@@ -2414,18 +2428,11 @@ __global__ void BBDevice27(real* distributions,
    ////////////////////////////////////////////////////////////////////////////////
    //! - Get node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
    //!
-   const unsigned  x = threadIdx.x;   // global x-index
-   const unsigned  y = blockIdx.x;    // global y-index
-   const unsigned  z = blockIdx.y;    // global z-index
-
-   const unsigned nx = blockDim.x;
-   const unsigned ny = gridDim.x;
-
-   const unsigned k = nx*(ny*z + y) + x;
+   const unsigned nodeIndex = getNodeIndex();
 
    //////////////////////////////////////////////////////////////////////////
    // run for all indices in size of boundary condition (numberOfBCnodes)
-   if(k < numberOfBCnodes)
+   if(nodeIndex < numberOfBCnodes)
    {
       //////////////////////////////////////////////////////////////////////////
       //! - Read distributions: style of reading and writing the distributions from/to stored arrays dependent on timestep is based on the esoteric twist algorithm \ref
@@ -2443,7 +2450,7 @@ __global__ void BBDevice27(real* distributions,
       ////////////////////////////////////////////////////////////////////////////////
       //! - Set neighbor indices (necessary for indirect addressing)
       //!
-      unsigned int indexOfBCnode  = subgridDistanceIndices[k];
+      unsigned int indexOfBCnode  = subgridDistanceIndices[nodeIndex];
       unsigned int ke   = indexOfBCnode;
       unsigned int kw   = neighborX[indexOfBCnode];
       unsigned int kn   = indexOfBCnode;
@@ -2474,32 +2481,32 @@ __global__ void BBDevice27(real* distributions,
       ////////////////////////////////////////////////////////////////////////////////
       //! - Set local distributions
       //!
-      real f_W    = (dist.f[DIR_P00   ])[ke   ];
-      real f_E    = (dist.f[DIR_M00   ])[kw   ];
-      real f_S    = (dist.f[DIR_0P0   ])[kn   ];
-      real f_N    = (dist.f[DIR_0M0   ])[ks   ];
-      real f_B    = (dist.f[DIR_00P   ])[kt   ];
-      real f_T    = (dist.f[DIR_00M   ])[kb   ];
-      real f_SW   = (dist.f[DIR_PP0  ])[kne  ];
-      real f_NE   = (dist.f[DIR_MM0  ])[ksw  ];
-      real f_NW   = (dist.f[DIR_PM0  ])[kse  ];
-      real f_SE   = (dist.f[DIR_MP0  ])[knw  ];
-      real f_BW   = (dist.f[DIR_P0P  ])[kte  ];
-      real f_TE   = (dist.f[DIR_M0M  ])[kbw  ];
-      real f_TW   = (dist.f[DIR_P0M  ])[kbe  ];
-      real f_BE   = (dist.f[DIR_M0P  ])[ktw  ];
-      real f_BS   = (dist.f[DIR_0PP  ])[ktn  ];
-      real f_TN   = (dist.f[DIR_0MM  ])[kbs  ];
-      real f_TS   = (dist.f[DIR_0PM  ])[kbn  ];
-      real f_BN   = (dist.f[DIR_0MP  ])[kts  ];
-      real f_BSW  = (dist.f[DIR_PPP ])[ktne ];
-      real f_BNE  = (dist.f[DIR_MMP ])[ktsw ];
-      real f_BNW  = (dist.f[DIR_PMP ])[ktse ];
-      real f_BSE  = (dist.f[DIR_MPP ])[ktnw ];
-      real f_TSW  = (dist.f[DIR_PPM ])[kbne ];
-      real f_TNE  = (dist.f[DIR_MMM ])[kbsw ];
-      real f_TNW  = (dist.f[DIR_PMM ])[kbse ];
-      real f_TSE  = (dist.f[DIR_MPM ])[kbnw ];
+      real f_W    = (dist.f[DIR_P00])[ke   ];
+      real f_E    = (dist.f[DIR_M00])[kw   ];
+      real f_S    = (dist.f[DIR_0P0])[kn   ];
+      real f_N    = (dist.f[DIR_0M0])[ks   ];
+      real f_B    = (dist.f[DIR_00P])[kt   ];
+      real f_T    = (dist.f[DIR_00M])[kb   ];
+      real f_SW   = (dist.f[DIR_PP0])[kne  ];
+      real f_NE   = (dist.f[DIR_MM0])[ksw  ];
+      real f_NW   = (dist.f[DIR_PM0])[kse  ];
+      real f_SE   = (dist.f[DIR_MP0])[knw  ];
+      real f_BW   = (dist.f[DIR_P0P])[kte  ];
+      real f_TE   = (dist.f[DIR_M0M])[kbw  ];
+      real f_TW   = (dist.f[DIR_P0M])[kbe  ];
+      real f_BE   = (dist.f[DIR_M0P])[ktw  ];
+      real f_BS   = (dist.f[DIR_0PP])[ktn  ];
+      real f_TN   = (dist.f[DIR_0MM])[kbs  ];
+      real f_TS   = (dist.f[DIR_0PM])[kbn  ];
+      real f_BN   = (dist.f[DIR_0MP])[kts  ];
+      real f_BSW  = (dist.f[DIR_PPP])[ktne ];
+      real f_BNE  = (dist.f[DIR_MMP])[ktsw ];
+      real f_BNW  = (dist.f[DIR_PMP])[ktse ];
+      real f_BSE  = (dist.f[DIR_MPP])[ktnw ];
+      real f_TSW  = (dist.f[DIR_PPM])[kbne ];
+      real f_TNE  = (dist.f[DIR_MMM])[kbsw ];
+      real f_TNW  = (dist.f[DIR_PMM])[kbse ];
+      real f_TSE  = (dist.f[DIR_MPM])[kbnw ];
 
       ////////////////////////////////////////////////////////////////////////////////
       //! - change the pointer to write the results in the correct array
@@ -2509,32 +2516,32 @@ __global__ void BBDevice27(real* distributions,
       ////////////////////////////////////////////////////////////////////////////////
       //! - rewrite distributions if there is a sub-grid distance (q) in same direction
       real q;
-      q = (subgridD.q[DIR_P00  ])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_M00  ])[kw  ]=f_E  ;
-      q = (subgridD.q[DIR_M00  ])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_P00  ])[ke  ]=f_W  ;
-      q = (subgridD.q[DIR_0P0  ])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_0M0  ])[ks  ]=f_N  ;
-      q = (subgridD.q[DIR_0M0  ])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_0P0  ])[kn  ]=f_S  ;
-      q = (subgridD.q[DIR_00P  ])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_00M  ])[kb  ]=f_T  ;
-      q = (subgridD.q[DIR_00M  ])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_00P  ])[kt  ]=f_B  ;
-      q = (subgridD.q[DIR_PP0 ])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_MM0 ])[ksw ]=f_NE ;
-      q = (subgridD.q[DIR_MM0 ])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_PP0 ])[kne ]=f_SW ;
-      q = (subgridD.q[DIR_PM0 ])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_MP0 ])[knw ]=f_SE ;
-      q = (subgridD.q[DIR_MP0 ])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_PM0 ])[kse ]=f_NW ;
-      q = (subgridD.q[DIR_P0P ])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_M0M ])[kbw ]=f_TE ;
-      q = (subgridD.q[DIR_M0M ])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_P0P ])[kte ]=f_BW ;
-      q = (subgridD.q[DIR_P0M ])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_M0P ])[ktw ]=f_BE ;
-      q = (subgridD.q[DIR_M0P ])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_P0M ])[kbe ]=f_TW ;
-      q = (subgridD.q[DIR_0PP ])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_0MM ])[kbs ]=f_TN ;
-      q = (subgridD.q[DIR_0MM ])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_0PP ])[ktn ]=f_BS ;
-      q = (subgridD.q[DIR_0PM ])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_0MP ])[kts ]=f_BN ;
-      q = (subgridD.q[DIR_0MP ])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_0PM ])[kbn ]=f_TS ;
-      q = (subgridD.q[DIR_PPP])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_MMM])[kbsw]=f_TNE;
-      q = (subgridD.q[DIR_MMM])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_PPP])[ktne]=f_BSW;
-      q = (subgridD.q[DIR_PPM])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_MMP])[ktsw]=f_BNE;
-      q = (subgridD.q[DIR_MMP])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_PPM])[kbne]=f_TSW;
-      q = (subgridD.q[DIR_PMP])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_MPM])[kbnw]=f_TSE;
-      q = (subgridD.q[DIR_MPM])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_PMP])[ktse]=f_BNW;
-      q = (subgridD.q[DIR_PMM])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_MPP])[ktnw]=f_BSE;
-      q = (subgridD.q[DIR_MPP])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_PMM])[kbse]=f_TNW;
+      q = (subgridD.q[DIR_P00])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_M00])[kw  ]=f_E  ;
+      q = (subgridD.q[DIR_M00])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_P00])[ke  ]=f_W  ;
+      q = (subgridD.q[DIR_0P0])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_0M0])[ks  ]=f_N  ;
+      q = (subgridD.q[DIR_0M0])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_0P0])[kn  ]=f_S  ;
+      q = (subgridD.q[DIR_00P])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_00M])[kb  ]=f_T  ;
+      q = (subgridD.q[DIR_00M])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_00P])[kt  ]=f_B  ;
+      q = (subgridD.q[DIR_PP0])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_MM0])[ksw ]=f_NE ;
+      q = (subgridD.q[DIR_MM0])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_PP0])[kne ]=f_SW ;
+      q = (subgridD.q[DIR_PM0])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_MP0])[knw ]=f_SE ;
+      q = (subgridD.q[DIR_MP0])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_PM0])[kse ]=f_NW ;
+      q = (subgridD.q[DIR_P0P])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_M0M])[kbw ]=f_TE ;
+      q = (subgridD.q[DIR_M0M])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_P0P])[kte ]=f_BW ;
+      q = (subgridD.q[DIR_P0M])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_M0P])[ktw ]=f_BE ;
+      q = (subgridD.q[DIR_M0P])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_P0M])[kbe ]=f_TW ;
+      q = (subgridD.q[DIR_0PP])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_0MM])[kbs ]=f_TN ;
+      q = (subgridD.q[DIR_0MM])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_0PP])[ktn ]=f_BS ;
+      q = (subgridD.q[DIR_0PM])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_0MP])[kts ]=f_BN ;
+      q = (subgridD.q[DIR_0MP])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_0PM])[kbn ]=f_TS ;
+      q = (subgridD.q[DIR_PPP])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_MMM])[kbsw]=f_TNE;
+      q = (subgridD.q[DIR_MMM])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_PPP])[ktne]=f_BSW;
+      q = (subgridD.q[DIR_PPM])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_MMP])[ktsw]=f_BNE;
+      q = (subgridD.q[DIR_MMP])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_PPM])[kbne]=f_TSW;
+      q = (subgridD.q[DIR_PMP])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_MPM])[kbnw]=f_TSE;
+      q = (subgridD.q[DIR_MPM])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_PMP])[ktse]=f_BNW;
+      q = (subgridD.q[DIR_PMM])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_MPP])[ktnw]=f_BSE;
+      q = (subgridD.q[DIR_MPP])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_PMM])[kbse]=f_TNW;
    }
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/src/gpu/VirtualFluids_GPU/GPU/Particles.cu b/src/gpu/VirtualFluids_GPU/GPU/Particles.cu
index 3a3ab784e6a7901c41d402629172c3c6154ffde9..22d9df4a3b4ae706dcf9b76d93940122015248f1 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/Particles.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/Particles.cu
@@ -29,7 +29,7 @@ __global__ void InitParticles( real* coordX,
 										  unsigned int* neighborWSB,
 										  int level,
 									      unsigned int numberOfParticles, 
-										  unsigned int size_Mat)
+										  unsigned long long numberOfLBnodes)
 {
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  x = threadIdx.x;  // Globaler x-Index 
@@ -72,12 +72,12 @@ __global__ void InitParticles( real* coordX,
 
 		////////////////////////////////////////////////////////////////////////////////
 		//find random node of the fluid domain
-		unsigned int cbID = (unsigned int)(randArray[k]*size_Mat);
-		for(int i = 0; i < size_Mat;i++)
+		unsigned int cbID = (unsigned int)(randArray[k]*numberOfLBnodes);
+		for(unsigned long long i = 0; i < numberOfLBnodes;i++) // counter matches the 64-bit node count: an int would overflow (UB) past INT_MAX
 		{
 			//if (coordX[cbID] < 15 && coordX[cbID] > 5 && coordY[cbID] < 15 && coordY[cbID] > 5 && coordZ[cbID] < 15 && coordZ[cbID] > 5)	break;
 			if (coordX[cbID] < 5 && coordX[cbID] > 2)	break;
-			cbID = (unsigned int)(randArray[k]*(size_Mat - i)); 
+			cbID = (unsigned int)(randArray[k]*(numberOfLBnodes - i)); 
 		}
 	   
 		real coordinateX;
@@ -183,7 +183,7 @@ __global__ void MoveParticles( real* coordX,
 										  unsigned int timestep, 
 										  unsigned int numberOfTimesteps, 
 									      unsigned int numberOfParticles, 
-										  unsigned int size_Mat,
+										  unsigned long long numberOfLBnodes,
 										  bool isEvenTimestep)
 {
    ////////////////////////////////////////////////////////////////////////////////
@@ -248,63 +248,63 @@ __global__ void MoveParticles( real* coordX,
 		{
 		   if (isEvenTimestep==true)
 		   {
-			  feC    = &DD[DIR_P00   *size_Mat];
-			  fwC    = &DD[DIR_M00   *size_Mat];
-			  fnC    = &DD[DIR_0P0   *size_Mat];
-			  fsC    = &DD[DIR_0M0   *size_Mat];
-			  ftC    = &DD[DIR_00P   *size_Mat];
-			  fbC    = &DD[DIR_00M   *size_Mat];
-			  fneC   = &DD[DIR_PP0  *size_Mat];
-			  fswC   = &DD[DIR_MM0  *size_Mat];
-			  fseC   = &DD[DIR_PM0  *size_Mat];
-			  fnwC   = &DD[DIR_MP0  *size_Mat];
-			  fteC   = &DD[DIR_P0P  *size_Mat];
-			  fbwC   = &DD[DIR_M0M  *size_Mat];
-			  fbeC   = &DD[DIR_P0M  *size_Mat];
-			  ftwC   = &DD[DIR_M0P  *size_Mat];
-			  ftnC   = &DD[DIR_0PP  *size_Mat];
-			  fbsC   = &DD[DIR_0MM  *size_Mat];
-			  fbnC   = &DD[DIR_0PM  *size_Mat];
-			  ftsC   = &DD[DIR_0MP  *size_Mat];
-			  fzeroC = &DD[DIR_000*size_Mat];
-			  ftneC  = &DD[DIR_PPP *size_Mat];
-			  ftswC  = &DD[DIR_MMP *size_Mat];
-			  ftseC  = &DD[DIR_PMP *size_Mat];
-			  ftnwC  = &DD[DIR_MPP *size_Mat];
-			  fbneC  = &DD[DIR_PPM *size_Mat];
-			  fbswC  = &DD[DIR_MMM *size_Mat];
-			  fbseC  = &DD[DIR_PMM *size_Mat];
-			  fbnwC  = &DD[DIR_MPM *size_Mat];
+			  feC    = &DD[DIR_P00 * numberOfLBnodes];
+			  fwC    = &DD[DIR_M00 * numberOfLBnodes];
+			  fnC    = &DD[DIR_0P0 * numberOfLBnodes];
+			  fsC    = &DD[DIR_0M0 * numberOfLBnodes];
+			  ftC    = &DD[DIR_00P * numberOfLBnodes];
+			  fbC    = &DD[DIR_00M * numberOfLBnodes];
+			  fneC   = &DD[DIR_PP0 * numberOfLBnodes];
+			  fswC   = &DD[DIR_MM0 * numberOfLBnodes];
+			  fseC   = &DD[DIR_PM0 * numberOfLBnodes];
+			  fnwC   = &DD[DIR_MP0 * numberOfLBnodes];
+			  fteC   = &DD[DIR_P0P * numberOfLBnodes];
+			  fbwC   = &DD[DIR_M0M * numberOfLBnodes];
+			  fbeC   = &DD[DIR_P0M * numberOfLBnodes];
+			  ftwC   = &DD[DIR_M0P * numberOfLBnodes];
+			  ftnC   = &DD[DIR_0PP * numberOfLBnodes];
+			  fbsC   = &DD[DIR_0MM * numberOfLBnodes];
+			  fbnC   = &DD[DIR_0PM * numberOfLBnodes];
+			  ftsC   = &DD[DIR_0MP * numberOfLBnodes];
+			  fzeroC = &DD[DIR_000 * numberOfLBnodes];
+			  ftneC  = &DD[DIR_PPP * numberOfLBnodes];
+			  ftswC  = &DD[DIR_MMP * numberOfLBnodes];
+			  ftseC  = &DD[DIR_PMP * numberOfLBnodes];
+			  ftnwC  = &DD[DIR_MPP * numberOfLBnodes];
+			  fbneC  = &DD[DIR_PPM * numberOfLBnodes];
+			  fbswC  = &DD[DIR_MMM * numberOfLBnodes];
+			  fbseC  = &DD[DIR_PMM * numberOfLBnodes];
+			  fbnwC  = &DD[DIR_MPM * numberOfLBnodes];
 		   } 			 
 		   else			 
 		   {			 
-			  fwC    = &DD[DIR_P00   *size_Mat];
-			  feC    = &DD[DIR_M00   *size_Mat];
-			  fsC    = &DD[DIR_0P0   *size_Mat];
-			  fnC    = &DD[DIR_0M0   *size_Mat];
-			  fbC    = &DD[DIR_00P   *size_Mat];
-			  ftC    = &DD[DIR_00M   *size_Mat];
-			  fswC   = &DD[DIR_PP0  *size_Mat];
-			  fneC   = &DD[DIR_MM0  *size_Mat];
-			  fnwC   = &DD[DIR_PM0  *size_Mat];
-			  fseC   = &DD[DIR_MP0  *size_Mat];
-			  fbwC   = &DD[DIR_P0P  *size_Mat];
-			  fteC   = &DD[DIR_M0M  *size_Mat];
-			  ftwC   = &DD[DIR_P0M  *size_Mat];
-			  fbeC   = &DD[DIR_M0P  *size_Mat];
-			  fbsC   = &DD[DIR_0PP  *size_Mat];
-			  ftnC   = &DD[DIR_0MM  *size_Mat];
-			  ftsC   = &DD[DIR_0PM  *size_Mat];
-			  fbnC   = &DD[DIR_0MP  *size_Mat];
-			  fzeroC = &DD[DIR_000*size_Mat];
-			  fbswC  = &DD[DIR_PPP *size_Mat];
-			  fbneC  = &DD[DIR_MMP *size_Mat];
-			  fbnwC  = &DD[DIR_PMP *size_Mat];
-			  fbseC  = &DD[DIR_MPP *size_Mat];
-			  ftswC  = &DD[DIR_PPM *size_Mat];
-			  ftneC  = &DD[DIR_MMM *size_Mat];
-			  ftnwC  = &DD[DIR_PMM *size_Mat];
-			  ftseC  = &DD[DIR_MPM *size_Mat];
+			  fwC    = &DD[DIR_P00 * numberOfLBnodes];
+			  feC    = &DD[DIR_M00 * numberOfLBnodes];
+			  fsC    = &DD[DIR_0P0 * numberOfLBnodes];
+			  fnC    = &DD[DIR_0M0 * numberOfLBnodes];
+			  fbC    = &DD[DIR_00P * numberOfLBnodes];
+			  ftC    = &DD[DIR_00M * numberOfLBnodes];
+			  fswC   = &DD[DIR_PP0 * numberOfLBnodes];
+			  fneC   = &DD[DIR_MM0 * numberOfLBnodes];
+			  fnwC   = &DD[DIR_PM0 * numberOfLBnodes];
+			  fseC   = &DD[DIR_MP0 * numberOfLBnodes];
+			  fbwC   = &DD[DIR_P0P * numberOfLBnodes];
+			  fteC   = &DD[DIR_M0M * numberOfLBnodes];
+			  ftwC   = &DD[DIR_P0M * numberOfLBnodes];
+			  fbeC   = &DD[DIR_M0P * numberOfLBnodes];
+			  fbsC   = &DD[DIR_0PP * numberOfLBnodes];
+			  ftnC   = &DD[DIR_0MM * numberOfLBnodes];
+			  ftsC   = &DD[DIR_0PM * numberOfLBnodes];
+			  fbnC   = &DD[DIR_0MP * numberOfLBnodes];
+			  fzeroC = &DD[DIR_000 * numberOfLBnodes];
+			  fbswC  = &DD[DIR_PPP * numberOfLBnodes];
+			  fbneC  = &DD[DIR_MMP * numberOfLBnodes];
+			  fbnwC  = &DD[DIR_PMP * numberOfLBnodes];
+			  fbseC  = &DD[DIR_MPP * numberOfLBnodes];
+			  ftswC  = &DD[DIR_PPM * numberOfLBnodes];
+			  ftneC  = &DD[DIR_MMM * numberOfLBnodes];
+			  ftnwC  = &DD[DIR_PMM * numberOfLBnodes];
+			  ftseC  = &DD[DIR_MPM * numberOfLBnodes];
 		   }
 
 			  //////////////////////////////////////////////////////////////////////////
@@ -1055,7 +1055,7 @@ __global__ void MoveParticlesWithoutBCs(   real* coordX,
 													  unsigned int timestep, 
 													  unsigned int numberOfTimesteps, 
 													  unsigned int numberOfParticles, 
-													  unsigned int size_Mat,
+													  unsigned long long numberOfLBnodes,
 													  bool isEvenTimestep)
 {
    ////////////////////////////////////////////////////////////////////////////////
@@ -1114,63 +1114,63 @@ __global__ void MoveParticlesWithoutBCs(   real* coordX,
 		{
 		   if (isEvenTimestep==true)
 		   {
-			  feC    = &DD[DIR_P00   *size_Mat];
-			  fwC    = &DD[DIR_M00   *size_Mat];
-			  fnC    = &DD[DIR_0P0   *size_Mat];
-			  fsC    = &DD[DIR_0M0   *size_Mat];
-			  ftC    = &DD[DIR_00P   *size_Mat];
-			  fbC    = &DD[DIR_00M   *size_Mat];
-			  fneC   = &DD[DIR_PP0  *size_Mat];
-			  fswC   = &DD[DIR_MM0  *size_Mat];
-			  fseC   = &DD[DIR_PM0  *size_Mat];
-			  fnwC   = &DD[DIR_MP0  *size_Mat];
-			  fteC   = &DD[DIR_P0P  *size_Mat];
-			  fbwC   = &DD[DIR_M0M  *size_Mat];
-			  fbeC   = &DD[DIR_P0M  *size_Mat];
-			  ftwC   = &DD[DIR_M0P  *size_Mat];
-			  ftnC   = &DD[DIR_0PP  *size_Mat];
-			  fbsC   = &DD[DIR_0MM  *size_Mat];
-			  fbnC   = &DD[DIR_0PM  *size_Mat];
-			  ftsC   = &DD[DIR_0MP  *size_Mat];
-			  fzeroC = &DD[DIR_000*size_Mat];
-			  ftneC  = &DD[DIR_PPP *size_Mat];
-			  ftswC  = &DD[DIR_MMP *size_Mat];
-			  ftseC  = &DD[DIR_PMP *size_Mat];
-			  ftnwC  = &DD[DIR_MPP *size_Mat];
-			  fbneC  = &DD[DIR_PPM *size_Mat];
-			  fbswC  = &DD[DIR_MMM *size_Mat];
-			  fbseC  = &DD[DIR_PMM *size_Mat];
-			  fbnwC  = &DD[DIR_MPM *size_Mat];
+			  feC    = &DD[DIR_P00 * numberOfLBnodes];
+			  fwC    = &DD[DIR_M00 * numberOfLBnodes];
+			  fnC    = &DD[DIR_0P0 * numberOfLBnodes];
+			  fsC    = &DD[DIR_0M0 * numberOfLBnodes];
+			  ftC    = &DD[DIR_00P * numberOfLBnodes];
+			  fbC    = &DD[DIR_00M * numberOfLBnodes];
+			  fneC   = &DD[DIR_PP0 * numberOfLBnodes];
+			  fswC   = &DD[DIR_MM0 * numberOfLBnodes];
+			  fseC   = &DD[DIR_PM0 * numberOfLBnodes];
+			  fnwC   = &DD[DIR_MP0 * numberOfLBnodes];
+			  fteC   = &DD[DIR_P0P * numberOfLBnodes];
+			  fbwC   = &DD[DIR_M0M * numberOfLBnodes];
+			  fbeC   = &DD[DIR_P0M * numberOfLBnodes];
+			  ftwC   = &DD[DIR_M0P * numberOfLBnodes];
+			  ftnC   = &DD[DIR_0PP * numberOfLBnodes];
+			  fbsC   = &DD[DIR_0MM * numberOfLBnodes];
+			  fbnC   = &DD[DIR_0PM * numberOfLBnodes];
+			  ftsC   = &DD[DIR_0MP * numberOfLBnodes];
+			  fzeroC = &DD[DIR_000 * numberOfLBnodes];
+			  ftneC  = &DD[DIR_PPP * numberOfLBnodes];
+			  ftswC  = &DD[DIR_MMP * numberOfLBnodes];
+			  ftseC  = &DD[DIR_PMP * numberOfLBnodes];
+			  ftnwC  = &DD[DIR_MPP * numberOfLBnodes];
+			  fbneC  = &DD[DIR_PPM * numberOfLBnodes];
+			  fbswC  = &DD[DIR_MMM * numberOfLBnodes];
+			  fbseC  = &DD[DIR_PMM * numberOfLBnodes];
+			  fbnwC  = &DD[DIR_MPM * numberOfLBnodes];
 		   } 			 
 		   else			 
 		   {			 
-			  fwC    = &DD[DIR_P00   *size_Mat];
-			  feC    = &DD[DIR_M00   *size_Mat];
-			  fsC    = &DD[DIR_0P0   *size_Mat];
-			  fnC    = &DD[DIR_0M0   *size_Mat];
-			  fbC    = &DD[DIR_00P   *size_Mat];
-			  ftC    = &DD[DIR_00M   *size_Mat];
-			  fswC   = &DD[DIR_PP0  *size_Mat];
-			  fneC   = &DD[DIR_MM0  *size_Mat];
-			  fnwC   = &DD[DIR_PM0  *size_Mat];
-			  fseC   = &DD[DIR_MP0  *size_Mat];
-			  fbwC   = &DD[DIR_P0P  *size_Mat];
-			  fteC   = &DD[DIR_M0M  *size_Mat];
-			  ftwC   = &DD[DIR_P0M  *size_Mat];
-			  fbeC   = &DD[DIR_M0P  *size_Mat];
-			  fbsC   = &DD[DIR_0PP  *size_Mat];
-			  ftnC   = &DD[DIR_0MM  *size_Mat];
-			  ftsC   = &DD[DIR_0PM  *size_Mat];
-			  fbnC   = &DD[DIR_0MP  *size_Mat];
-			  fzeroC = &DD[DIR_000*size_Mat];
-			  fbswC  = &DD[DIR_PPP *size_Mat];
-			  fbneC  = &DD[DIR_MMP *size_Mat];
-			  fbnwC  = &DD[DIR_PMP *size_Mat];
-			  fbseC  = &DD[DIR_MPP *size_Mat];
-			  ftswC  = &DD[DIR_PPM *size_Mat];
-			  ftneC  = &DD[DIR_MMM *size_Mat];
-			  ftnwC  = &DD[DIR_PMM *size_Mat];
-			  ftseC  = &DD[DIR_MPM *size_Mat];
+			  fwC    = &DD[DIR_P00 * numberOfLBnodes];
+			  feC    = &DD[DIR_M00 * numberOfLBnodes];
+			  fsC    = &DD[DIR_0P0 * numberOfLBnodes];
+			  fnC    = &DD[DIR_0M0 * numberOfLBnodes];
+			  fbC    = &DD[DIR_00P * numberOfLBnodes];
+			  ftC    = &DD[DIR_00M * numberOfLBnodes];
+			  fswC   = &DD[DIR_PP0 * numberOfLBnodes];
+			  fneC   = &DD[DIR_MM0 * numberOfLBnodes];
+			  fnwC   = &DD[DIR_PM0 * numberOfLBnodes];
+			  fseC   = &DD[DIR_MP0 * numberOfLBnodes];
+			  fbwC   = &DD[DIR_P0P * numberOfLBnodes];
+			  fteC   = &DD[DIR_M0M * numberOfLBnodes];
+			  ftwC   = &DD[DIR_P0M * numberOfLBnodes];
+			  fbeC   = &DD[DIR_M0P * numberOfLBnodes];
+			  fbsC   = &DD[DIR_0PP * numberOfLBnodes];
+			  ftnC   = &DD[DIR_0MM * numberOfLBnodes];
+			  ftsC   = &DD[DIR_0PM * numberOfLBnodes];
+			  fbnC   = &DD[DIR_0MP * numberOfLBnodes];
+			  fzeroC = &DD[DIR_000 * numberOfLBnodes];
+			  fbswC  = &DD[DIR_PPP * numberOfLBnodes];
+			  fbneC  = &DD[DIR_MMP * numberOfLBnodes];
+			  fbnwC  = &DD[DIR_PMP * numberOfLBnodes];
+			  fbseC  = &DD[DIR_MPP * numberOfLBnodes];
+			  ftswC  = &DD[DIR_PPM * numberOfLBnodes];
+			  ftneC  = &DD[DIR_MMM * numberOfLBnodes];
+			  ftnwC  = &DD[DIR_PMM * numberOfLBnodes];
+			  ftseC  = &DD[DIR_MPM * numberOfLBnodes];
 		   }
 
 			  //////////////////////////////////////////////////////////////////////////
@@ -1928,7 +1928,7 @@ __global__ void ParticleNoSlipDeviceComp27(real* coordX,
 													  real* NormalX,
 													  real* NormalY,
 													  real* NormalZ,
-													  unsigned int size_Mat, 
+													  unsigned long long numberOfLBnodes, 
 													  bool isEvenTimestep)
 {
 
@@ -1937,63 +1937,63 @@ __global__ void ParticleNoSlipDeviceComp27(real* coordX,
    //Distributions27 D;
    //if (isEvenTimestep==true)
    //{
-   //   D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-   //   D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-   //   D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-   //   D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-   //   D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-   //   D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-   //   D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-   //   D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-   //   D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-   //   D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-   //   D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-   //   D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-   //   D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-   //   D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-   //   D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-   //   D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-   //   D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-   //   D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-   //   D.f[DIR_000] = &DD[DIR_000*size_Mat];
-   //   D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-   //   D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-   //   D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-   //   D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-   //   D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-   //   D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-   //   D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-   //   D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+   //   D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+   //   D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+   //   D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+   //   D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+   //   D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+   //   D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+   //   D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+   //   D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+   //   D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+   //   D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+   //   D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+   //   D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+   //   D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+   //   D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+   //   D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+   //   D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+   //   D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+   //   D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+   //   D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+   //   D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+   //   D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+   //   D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+   //   D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+   //   D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+   //   D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+   //   D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+   //   D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
    //} 
    //else
    //{
-   //   D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-   //   D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-   //   D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-   //   D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-   //   D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-   //   D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-   //   D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-   //   D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-   //   D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-   //   D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-   //   D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-   //   D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-   //   D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-   //   D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-   //   D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-   //   D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-   //   D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-   //   D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-   //   D.f[DIR_000] = &DD[DIR_000*size_Mat];
-   //   D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-   //   D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-   //   D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-   //   D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-   //   D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-   //   D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-   //   D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-   //   D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+   //   D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+   //   D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+   //   D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+   //   D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+   //   D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+   //   D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+   //   D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+   //   D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+   //   D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+   //   D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+   //   D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+   //   D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+   //   D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+   //   D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+   //   D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+   //   D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+   //   D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+   //   D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+   //   D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+   //   D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+   //   D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+   //   D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+   //   D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+   //   D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+   //   D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+   //   D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+   //   D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
    //}
    //////////////////////////////////////////////////////////////////////////////////
    //const unsigned  x = threadIdx.x;  // Globaler x-Index 
@@ -2015,24 +2015,24 @@ __global__ void ParticleNoSlipDeviceComp27(real* coordX,
    // //         *q_dirBE,  *q_dirTW,  *q_dirTN,  *q_dirBS,  *q_dirBN,  *q_dirTS,
    // //         *q_dirTNE, *q_dirTSW, *q_dirTSE, *q_dirTNW, *q_dirBNE, *q_dirBSW,
    // //         *q_dirBSE, *q_dirBNW; 
-   // //   q_dirE   = &QQ[DIR_P00   * numberOfBCnodes];
-   //    q_dirW   = &QQ[DIR_M00   * numberOfBCnodes];
-   // //   q_dirN   = &QQ[DIR_0P0   * numberOfBCnodes];
-   //    q_dirS   = &QQ[DIR_0M0   * numberOfBCnodes];
-   // //   q_dirT   = &QQ[DIR_00P   * numberOfBCnodes];
-   //    q_dirB   = &QQ[DIR_00M   * numberOfBCnodes];
-   // //   q_dirNE  = &QQ[DIR_PP0  * numberOfBCnodes];
-   // //   q_dirSW  = &QQ[DIR_MM0  * numberOfBCnodes];
-   // //   q_dirSE  = &QQ[DIR_PM0  * numberOfBCnodes];
-   // //   q_dirNW  = &QQ[DIR_MP0  * numberOfBCnodes];
-   // //   q_dirTE  = &QQ[DIR_P0P  * numberOfBCnodes];
-   // //   q_dirBW  = &QQ[DIR_M0M  * numberOfBCnodes];
-   // //   q_dirBE  = &QQ[DIR_P0M  * numberOfBCnodes];
-   // //   q_dirTW  = &QQ[DIR_M0P  * numberOfBCnodes];
-   // //   q_dirTN  = &QQ[DIR_0PP  * numberOfBCnodes];
-   // //   q_dirBS  = &QQ[DIR_0MM  * numberOfBCnodes];
-   // //   q_dirBN  = &QQ[DIR_0PM  * numberOfBCnodes];
-   // //   q_dirTS  = &QQ[DIR_0MP  * numberOfBCnodes];
+   // //   q_dirE   = &QQ[DIR_P00 * numberOfBCnodes];
+   //    q_dirW   = &QQ[DIR_M00 * numberOfBCnodes];
+   // //   q_dirN   = &QQ[DIR_0P0 * numberOfBCnodes];
+   //    q_dirS   = &QQ[DIR_0M0 * numberOfBCnodes];
+   // //   q_dirT   = &QQ[DIR_00P * numberOfBCnodes];
+   //    q_dirB   = &QQ[DIR_00M * numberOfBCnodes];
+   // //   q_dirNE  = &QQ[DIR_PP0 * numberOfBCnodes];
+   // //   q_dirSW  = &QQ[DIR_MM0 * numberOfBCnodes];
+   // //   q_dirSE  = &QQ[DIR_PM0 * numberOfBCnodes];
+   // //   q_dirNW  = &QQ[DIR_MP0 * numberOfBCnodes];
+   // //   q_dirTE  = &QQ[DIR_P0P * numberOfBCnodes];
+   // //   q_dirBW  = &QQ[DIR_M0M * numberOfBCnodes];
+   // //   q_dirBE  = &QQ[DIR_P0M * numberOfBCnodes];
+   // //   q_dirTW  = &QQ[DIR_M0P * numberOfBCnodes];
+   // //   q_dirTN  = &QQ[DIR_0PP * numberOfBCnodes];
+   // //   q_dirBS  = &QQ[DIR_0MM * numberOfBCnodes];
+   // //   q_dirBN  = &QQ[DIR_0PM * numberOfBCnodes];
+   // //   q_dirTS  = &QQ[DIR_0MP * numberOfBCnodes];
    // //   q_dirTNE = &QQ[DIR_PPP * numberOfBCnodes];
    // //   q_dirTSW = &QQ[DIR_MMP * numberOfBCnodes];
    // //   q_dirTSE = &QQ[DIR_PMP * numberOfBCnodes];
@@ -2047,24 +2047,24 @@ __global__ void ParticleNoSlipDeviceComp27(real* coordX,
    // //           *nx_dirBE,  *nx_dirTW,  *nx_dirTN,  *nx_dirBS,  *nx_dirBN,  *nx_dirTS,
    // //           *nx_dirTNE, *nx_dirTSW, *nx_dirTSE, *nx_dirTNW, *nx_dirBNE, *nx_dirBSW,
    // //           *nx_dirBSE, *nx_dirBNW; 
-   // //   nx_dirE   = &NormalX[DIR_P00   * numberOfBCnodes];
-   // //   nx_dirW   = &NormalX[DIR_M00   * numberOfBCnodes];
-   // //   nx_dirN   = &NormalX[DIR_0P0   * numberOfBCnodes];
-   // //   nx_dirS   = &NormalX[DIR_0M0   * numberOfBCnodes];
-   // //   nx_dirT   = &NormalX[DIR_00P   * numberOfBCnodes];
-   // //   nx_dirB   = &NormalX[DIR_00M   * numberOfBCnodes];
-   // //   nx_dirNE  = &NormalX[DIR_PP0  * numberOfBCnodes];
-   // //   nx_dirSW  = &NormalX[DIR_MM0  * numberOfBCnodes];
-   // //   nx_dirSE  = &NormalX[DIR_PM0  * numberOfBCnodes];
-   // //   nx_dirNW  = &NormalX[DIR_MP0  * numberOfBCnodes];
-   // //   nx_dirTE  = &NormalX[DIR_P0P  * numberOfBCnodes];
-   // //   nx_dirBW  = &NormalX[DIR_M0M  * numberOfBCnodes];
-   // //   nx_dirBE  = &NormalX[DIR_P0M  * numberOfBCnodes];
-   // //   nx_dirTW  = &NormalX[DIR_M0P  * numberOfBCnodes];
-   // //   nx_dirTN  = &NormalX[DIR_0PP  * numberOfBCnodes];
-   // //   nx_dirBS  = &NormalX[DIR_0MM  * numberOfBCnodes];
-   // //   nx_dirBN  = &NormalX[DIR_0PM  * numberOfBCnodes];
-   // //   nx_dirTS  = &NormalX[DIR_0MP  * numberOfBCnodes];
+   // //   nx_dirE   = &NormalX[DIR_P00 * numberOfBCnodes];
+   // //   nx_dirW   = &NormalX[DIR_M00 * numberOfBCnodes];
+   // //   nx_dirN   = &NormalX[DIR_0P0 * numberOfBCnodes];
+   // //   nx_dirS   = &NormalX[DIR_0M0 * numberOfBCnodes];
+   // //   nx_dirT   = &NormalX[DIR_00P * numberOfBCnodes];
+   // //   nx_dirB   = &NormalX[DIR_00M * numberOfBCnodes];
+   // //   nx_dirNE  = &NormalX[DIR_PP0 * numberOfBCnodes];
+   // //   nx_dirSW  = &NormalX[DIR_MM0 * numberOfBCnodes];
+   // //   nx_dirSE  = &NormalX[DIR_PM0 * numberOfBCnodes];
+   // //   nx_dirNW  = &NormalX[DIR_MP0 * numberOfBCnodes];
+   // //   nx_dirTE  = &NormalX[DIR_P0P * numberOfBCnodes];
+   // //   nx_dirBW  = &NormalX[DIR_M0M * numberOfBCnodes];
+   // //   nx_dirBE  = &NormalX[DIR_P0M * numberOfBCnodes];
+   // //   nx_dirTW  = &NormalX[DIR_M0P * numberOfBCnodes];
+   // //   nx_dirTN  = &NormalX[DIR_0PP * numberOfBCnodes];
+   // //   nx_dirBS  = &NormalX[DIR_0MM * numberOfBCnodes];
+   // //   nx_dirBN  = &NormalX[DIR_0PM * numberOfBCnodes];
+   // //   nx_dirTS  = &NormalX[DIR_0MP * numberOfBCnodes];
    // //   nx_dirTNE = &NormalX[DIR_PPP * numberOfBCnodes];
    // //   nx_dirTSW = &NormalX[DIR_MMP * numberOfBCnodes];
    // //   nx_dirTSE = &NormalX[DIR_PMP * numberOfBCnodes];
@@ -2079,24 +2079,24 @@ __global__ void ParticleNoSlipDeviceComp27(real* coordX,
    // //           *ny_dirBE,  *ny_dirTW,  *ny_dirTN,  *ny_dirBS,  *ny_dirBN,  *ny_dirTS,
    // //           *ny_dirTNE, *ny_dirTSW, *ny_dirTSE, *ny_dirTNW, *ny_dirBNE, *ny_dirBSW,
    // //           *ny_dirBSE, *ny_dirBNW; 
-   // //   ny_dirE   = &NormalY[DIR_P00   * numberOfBCnodes];
-   // //   ny_dirW   = &NormalY[DIR_M00   * numberOfBCnodes];
-   // //   ny_dirN   = &NormalY[DIR_0P0   * numberOfBCnodes];
-   // //   ny_dirS   = &NormalY[DIR_0M0   * numberOfBCnodes];
-   // //   ny_dirT   = &NormalY[DIR_00P   * numberOfBCnodes];
-   // //   ny_dirB   = &NormalY[DIR_00M   * numberOfBCnodes];
-   // //   ny_dirNE  = &NormalY[DIR_PP0  * numberOfBCnodes];
-   // //   ny_dirSW  = &NormalY[DIR_MM0  * numberOfBCnodes];
-   // //   ny_dirSE  = &NormalY[DIR_PM0  * numberOfBCnodes];
-   // //   ny_dirNW  = &NormalY[DIR_MP0  * numberOfBCnodes];
-   // //   ny_dirTE  = &NormalY[DIR_P0P  * numberOfBCnodes];
-   // //   ny_dirBW  = &NormalY[DIR_M0M  * numberOfBCnodes];
-   // //   ny_dirBE  = &NormalY[DIR_P0M  * numberOfBCnodes];
-   // //   ny_dirTW  = &NormalY[DIR_M0P  * numberOfBCnodes];
-   // //   ny_dirTN  = &NormalY[DIR_0PP  * numberOfBCnodes];
-   // //   ny_dirBS  = &NormalY[DIR_0MM  * numberOfBCnodes];
-   // //   ny_dirBN  = &NormalY[DIR_0PM  * numberOfBCnodes];
-   // //   ny_dirTS  = &NormalY[DIR_0MP  * numberOfBCnodes];
+   // //   ny_dirE   = &NormalY[DIR_P00 * numberOfBCnodes];
+   // //   ny_dirW   = &NormalY[DIR_M00 * numberOfBCnodes];
+   // //   ny_dirN   = &NormalY[DIR_0P0 * numberOfBCnodes];
+   // //   ny_dirS   = &NormalY[DIR_0M0 * numberOfBCnodes];
+   // //   ny_dirT   = &NormalY[DIR_00P * numberOfBCnodes];
+   // //   ny_dirB   = &NormalY[DIR_00M * numberOfBCnodes];
+   // //   ny_dirNE  = &NormalY[DIR_PP0 * numberOfBCnodes];
+   // //   ny_dirSW  = &NormalY[DIR_MM0 * numberOfBCnodes];
+   // //   ny_dirSE  = &NormalY[DIR_PM0 * numberOfBCnodes];
+   // //   ny_dirNW  = &NormalY[DIR_MP0 * numberOfBCnodes];
+   // //   ny_dirTE  = &NormalY[DIR_P0P * numberOfBCnodes];
+   // //   ny_dirBW  = &NormalY[DIR_M0M * numberOfBCnodes];
+   // //   ny_dirBE  = &NormalY[DIR_P0M * numberOfBCnodes];
+   // //   ny_dirTW  = &NormalY[DIR_M0P * numberOfBCnodes];
+   // //   ny_dirTN  = &NormalY[DIR_0PP * numberOfBCnodes];
+   // //   ny_dirBS  = &NormalY[DIR_0MM * numberOfBCnodes];
+   // //   ny_dirBN  = &NormalY[DIR_0PM * numberOfBCnodes];
+   // //   ny_dirTS  = &NormalY[DIR_0MP * numberOfBCnodes];
    // //   ny_dirTNE = &NormalY[DIR_PPP * numberOfBCnodes];
    // //   ny_dirTSW = &NormalY[DIR_MMP * numberOfBCnodes];
    // //   ny_dirTSE = &NormalY[DIR_PMP * numberOfBCnodes];
@@ -2111,24 +2111,24 @@ __global__ void ParticleNoSlipDeviceComp27(real* coordX,
    // //           *nz_dirBE,  *nz_dirTW,  *nz_dirTN,  *nz_dirBS,  *nz_dirBN,  *nz_dirTS,
    // //           *nz_dirTNE, *nz_dirTSW, *nz_dirTSE, *nz_dirTNW, *nz_dirBNE, *nz_dirBSW,
    // //           *nz_dirBSE, *nz_dirBNW; 
-   // //   nz_dirE   = &NormalZ[DIR_P00   * numberOfBCnodes];
-   // //   nz_dirW   = &NormalZ[DIR_M00   * numberOfBCnodes];
-   // //   nz_dirN   = &NormalZ[DIR_0P0   * numberOfBCnodes];
-   // //   nz_dirS   = &NormalZ[DIR_0M0   * numberOfBCnodes];
-   // //   nz_dirT   = &NormalZ[DIR_00P   * numberOfBCnodes];
-   // //   nz_dirB   = &NormalZ[DIR_00M   * numberOfBCnodes];
-   // //   nz_dirNE  = &NormalZ[DIR_PP0  * numberOfBCnodes];
-   // //   nz_dirSW  = &NormalZ[DIR_MM0  * numberOfBCnodes];
-   // //   nz_dirSE  = &NormalZ[DIR_PM0  * numberOfBCnodes];
-   // //   nz_dirNW  = &NormalZ[DIR_MP0  * numberOfBCnodes];
-   // //   nz_dirTE  = &NormalZ[DIR_P0P  * numberOfBCnodes];
-   // //   nz_dirBW  = &NormalZ[DIR_M0M  * numberOfBCnodes];
-   // //   nz_dirBE  = &NormalZ[DIR_P0M  * numberOfBCnodes];
-   // //   nz_dirTW  = &NormalZ[DIR_M0P  * numberOfBCnodes];
-   // //   nz_dirTN  = &NormalZ[DIR_0PP  * numberOfBCnodes];
-   // //   nz_dirBS  = &NormalZ[DIR_0MM  * numberOfBCnodes];
-   // //   nz_dirBN  = &NormalZ[DIR_0PM  * numberOfBCnodes];
-   // //   nz_dirTS  = &NormalZ[DIR_0MP  * numberOfBCnodes];
+   // //   nz_dirE   = &NormalZ[DIR_P00 * numberOfBCnodes];
+   // //   nz_dirW   = &NormalZ[DIR_M00 * numberOfBCnodes];
+   // //   nz_dirN   = &NormalZ[DIR_0P0 * numberOfBCnodes];
+   // //   nz_dirS   = &NormalZ[DIR_0M0 * numberOfBCnodes];
+   // //   nz_dirT   = &NormalZ[DIR_00P * numberOfBCnodes];
+   // //   nz_dirB   = &NormalZ[DIR_00M * numberOfBCnodes];
+   // //   nz_dirNE  = &NormalZ[DIR_PP0 * numberOfBCnodes];
+   // //   nz_dirSW  = &NormalZ[DIR_MM0 * numberOfBCnodes];
+   // //   nz_dirSE  = &NormalZ[DIR_PM0 * numberOfBCnodes];
+   // //   nz_dirNW  = &NormalZ[DIR_MP0 * numberOfBCnodes];
+   // //   nz_dirTE  = &NormalZ[DIR_P0P * numberOfBCnodes];
+   // //   nz_dirBW  = &NormalZ[DIR_M0M * numberOfBCnodes];
+   // //   nz_dirBE  = &NormalZ[DIR_P0M * numberOfBCnodes];
+   // //   nz_dirTW  = &NormalZ[DIR_M0P * numberOfBCnodes];
+   // //   nz_dirTN  = &NormalZ[DIR_0PP * numberOfBCnodes];
+   // //   nz_dirBS  = &NormalZ[DIR_0MM * numberOfBCnodes];
+   // //   nz_dirBN  = &NormalZ[DIR_0PM * numberOfBCnodes];
+   // //   nz_dirTS  = &NormalZ[DIR_0MP * numberOfBCnodes];
    // //   nz_dirTNE = &NormalZ[DIR_PPP * numberOfBCnodes];
    // //   nz_dirTSW = &NormalZ[DIR_MMP * numberOfBCnodes];
    // //   nz_dirTSE = &NormalZ[DIR_PMP * numberOfBCnodes];
@@ -2190,32 +2190,32 @@ __global__ void ParticleNoSlipDeviceComp27(real* coordX,
    //   unsigned int ktne = KQK;
    //   unsigned int kbsw = neighborZ[ksw];
    //   ////////////////////////////////////////////////////////////////////////////////
-   //   real f_W    = (D.f[DIR_P00   ])[ke   ];
-   //   real f_E    = (D.f[DIR_M00   ])[kw   ];
-   //   real f_S    = (D.f[DIR_0P0   ])[kn   ];
-   //   real f_N    = (D.f[DIR_0M0   ])[ks   ];
-   //   real f_B    = (D.f[DIR_00P   ])[kt   ];
-   //   real f_T    = (D.f[DIR_00M   ])[kb   ];
-   //   real f_SW   = (D.f[DIR_PP0  ])[kne  ];
-   //   real f_NE   = (D.f[DIR_MM0  ])[ksw  ];
-   //   real f_NW   = (D.f[DIR_PM0  ])[kse  ];
-   //   real f_SE   = (D.f[DIR_MP0  ])[knw  ];
-   //   real f_BW   = (D.f[DIR_P0P  ])[kte  ];
-   //   real f_TE   = (D.f[DIR_M0M  ])[kbw  ];
-   //   real f_TW   = (D.f[DIR_P0M  ])[kbe  ];
-   //   real f_BE   = (D.f[DIR_M0P  ])[ktw  ];
-   //   real f_BS   = (D.f[DIR_0PP  ])[ktn  ];
-   //   real f_TN   = (D.f[DIR_0MM  ])[kbs  ];
-   //   real f_TS   = (D.f[DIR_0PM  ])[kbn  ];
-   //   real f_BN   = (D.f[DIR_0MP  ])[kts  ];
-   //   real f_BSW  = (D.f[DIR_PPP ])[ktne ];
-   //   real f_BNE  = (D.f[DIR_MMP ])[ktsw ];
-   //   real f_BNW  = (D.f[DIR_PMP ])[ktse ];
-   //   real f_BSE  = (D.f[DIR_MPP ])[ktnw ];
-   //   real f_TSW  = (D.f[DIR_PPM ])[kbne ];
-   //   real f_TNE  = (D.f[DIR_MMM ])[kbsw ];
-   //   real f_TNW  = (D.f[DIR_PMM ])[kbse ];
-   //   real f_TSE  = (D.f[DIR_MPM ])[kbnw ];
+   //   real f_W    = (D.f[DIR_P00])[ke   ];
+   //   real f_E    = (D.f[DIR_M00])[kw   ];
+   //   real f_S    = (D.f[DIR_0P0])[kn   ];
+   //   real f_N    = (D.f[DIR_0M0])[ks   ];
+   //   real f_B    = (D.f[DIR_00P])[kt   ];
+   //   real f_T    = (D.f[DIR_00M])[kb   ];
+   //   real f_SW   = (D.f[DIR_PP0])[kne  ];
+   //   real f_NE   = (D.f[DIR_MM0])[ksw  ];
+   //   real f_NW   = (D.f[DIR_PM0])[kse  ];
+   //   real f_SE   = (D.f[DIR_MP0])[knw  ];
+   //   real f_BW   = (D.f[DIR_P0P])[kte  ];
+   //   real f_TE   = (D.f[DIR_M0M])[kbw  ];
+   //   real f_TW   = (D.f[DIR_P0M])[kbe  ];
+   //   real f_BE   = (D.f[DIR_M0P])[ktw  ];
+   //   real f_BS   = (D.f[DIR_0PP])[ktn  ];
+   //   real f_TN   = (D.f[DIR_0MM])[kbs  ];
+   //   real f_TS   = (D.f[DIR_0PM])[kbn  ];
+   //   real f_BN   = (D.f[DIR_0MP])[kts  ];
+   //   real f_BSW  = (D.f[DIR_PPP])[ktne ];
+   //   real f_BNE  = (D.f[DIR_MMP])[ktsw ];
+   //   real f_BNW  = (D.f[DIR_PMP])[ktse ];
+   //   real f_BSE  = (D.f[DIR_MPP])[ktnw ];
+   //   real f_TSW  = (D.f[DIR_PPM])[kbne ];
+   //   real f_TNE  = (D.f[DIR_MMM])[kbsw ];
+   //   real f_TNW  = (D.f[DIR_PMM])[kbse ];
+   //   real f_TSE  = (D.f[DIR_MPM])[kbnw ];
    //   ////////////////////////////////////////////////////////////////////////////////
    //   // real feq, q;
    //   real vx1, vx2, vx3, drho;
@@ -2241,63 +2241,63 @@ __global__ void ParticleNoSlipDeviceComp27(real* coordX,
    //   //////////////////////////////////////////////////////////////////////////
    //   if (isEvenTimestep==false)
    //   {
-   //      D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-   //      D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-   //      D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-   //      D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-   //      D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-   //      D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-   //      D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-   //      D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-   //      D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-   //      D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-   //      D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-   //      D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-   //      D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-   //      D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-   //      D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-   //      D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-   //      D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-   //      D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-   //      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-   //      D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-   //      D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-   //      D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-   //      D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-   //      D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-   //      D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-   //      D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-   //      D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+   //      D.f[DIR_P00] = &DD[DIR_P00 * size_Mat];
+   //      D.f[DIR_M00] = &DD[DIR_M00 * size_Mat];
+   //      D.f[DIR_0P0] = &DD[DIR_0P0 * size_Mat];
+   //      D.f[DIR_0M0] = &DD[DIR_0M0 * size_Mat];
+   //      D.f[DIR_00P] = &DD[DIR_00P * size_Mat];
+   //      D.f[DIR_00M] = &DD[DIR_00M * size_Mat];
+   //      D.f[DIR_PP0] = &DD[DIR_PP0 * size_Mat];
+   //      D.f[DIR_MM0] = &DD[DIR_MM0 * size_Mat];
+   //      D.f[DIR_PM0] = &DD[DIR_PM0 * size_Mat];
+   //      D.f[DIR_MP0] = &DD[DIR_MP0 * size_Mat];
+   //      D.f[DIR_P0P] = &DD[DIR_P0P * size_Mat];
+   //      D.f[DIR_M0M] = &DD[DIR_M0M * size_Mat];
+   //      D.f[DIR_P0M] = &DD[DIR_P0M * size_Mat];
+   //      D.f[DIR_M0P] = &DD[DIR_M0P * size_Mat];
+   //      D.f[DIR_0PP] = &DD[DIR_0PP * size_Mat];
+   //      D.f[DIR_0MM] = &DD[DIR_0MM * size_Mat];
+   //      D.f[DIR_0PM] = &DD[DIR_0PM * size_Mat];
+   //      D.f[DIR_0MP] = &DD[DIR_0MP * size_Mat];
+   //      D.f[DIR_000] = &DD[DIR_000 * size_Mat];
+   //      D.f[DIR_PPP] = &DD[DIR_PPP * size_Mat];
+   //      D.f[DIR_MMP] = &DD[DIR_MMP * size_Mat];
+   //      D.f[DIR_PMP] = &DD[DIR_PMP * size_Mat];
+   //      D.f[DIR_MPP] = &DD[DIR_MPP * size_Mat];
+   //      D.f[DIR_PPM] = &DD[DIR_PPM * size_Mat];
+   //      D.f[DIR_MMM] = &DD[DIR_MMM * size_Mat];
+   //      D.f[DIR_PMM] = &DD[DIR_PMM * size_Mat];
+   //      D.f[DIR_MPM] = &DD[DIR_MPM * size_Mat];
    //   } 
    //   else
    //   {
-   //      D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-   //      D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-   //      D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-   //      D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-   //      D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-   //      D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-   //      D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-   //      D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-   //      D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-   //      D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-   //      D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-   //      D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-   //      D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-   //      D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-   //      D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-   //      D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-   //      D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-   //      D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-   //      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-   //      D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-   //      D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-   //      D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-   //      D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-   //      D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-   //      D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-   //      D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-   //      D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+   //      D.f[DIR_M00] = &DD[DIR_P00 * size_Mat];
+   //      D.f[DIR_P00] = &DD[DIR_M00 * size_Mat];
+   //      D.f[DIR_0M0] = &DD[DIR_0P0 * size_Mat];
+   //      D.f[DIR_0P0] = &DD[DIR_0M0 * size_Mat];
+   //      D.f[DIR_00M] = &DD[DIR_00P * size_Mat];
+   //      D.f[DIR_00P] = &DD[DIR_00M * size_Mat];
+   //      D.f[DIR_MM0] = &DD[DIR_PP0 * size_Mat];
+   //      D.f[DIR_PP0] = &DD[DIR_MM0 * size_Mat];
+   //      D.f[DIR_MP0] = &DD[DIR_PM0 * size_Mat];
+   //      D.f[DIR_PM0] = &DD[DIR_MP0 * size_Mat];
+   //      D.f[DIR_M0M] = &DD[DIR_P0P * size_Mat];
+   //      D.f[DIR_P0P] = &DD[DIR_M0M * size_Mat];
+   //      D.f[DIR_M0P] = &DD[DIR_P0M * size_Mat];
+   //      D.f[DIR_P0M] = &DD[DIR_M0P * size_Mat];
+   //      D.f[DIR_0MM] = &DD[DIR_0PP * size_Mat];
+   //      D.f[DIR_0PP] = &DD[DIR_0MM * size_Mat];
+   //      D.f[DIR_0MP] = &DD[DIR_0PM * size_Mat];
+   //      D.f[DIR_0PM] = &DD[DIR_0MP * size_Mat];
+   //      D.f[DIR_000] = &DD[DIR_000 * size_Mat];
+   //      D.f[DIR_PPP] = &DD[DIR_MMM * size_Mat];
+   //      D.f[DIR_MMP] = &DD[DIR_PPM * size_Mat];
+   //      D.f[DIR_PMP] = &DD[DIR_MPM * size_Mat];
+   //      D.f[DIR_MPP] = &DD[DIR_PMM * size_Mat];
+   //      D.f[DIR_PPM] = &DD[DIR_MMP * size_Mat];
+   //      D.f[DIR_MMM] = &DD[DIR_PPP * size_Mat];
+   //      D.f[DIR_PMM] = &DD[DIR_MPP * size_Mat];
+   //      D.f[DIR_MPM] = &DD[DIR_PMP * size_Mat];
    //   }
    //}
 }
diff --git a/src/gpu/VirtualFluids_GPU/GPU/PrecursorBCs27.cu b/src/gpu/VirtualFluids_GPU/GPU/PrecursorBCs27.cu
new file mode 100644
index 0000000000000000000000000000000000000000..177eb41587896dd7993b06f98a1506abfc4f3f5f
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/GPU/PrecursorBCs27.cu
@@ -0,0 +1,1157 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file PrecursorBCs27.cu
+//! \ingroup GPU
+//! \author Henry Korb, Henrik Asmuth
+//======================================================================================
+#include "LBM/LB.h"
+#include <lbm/constants/NumericConstants.h>
+#include <lbm/constants/D3Q27.h>
+#include <lbm/MacroscopicQuantities.h>
+
+#include "LBM/GPUHelperFunctions/KernelUtilities.h"
+
+using namespace vf::lbm::constant;
+using namespace vf::lbm::dir;
+using namespace vf::gpu;
+
+__global__ void QPrecursorDeviceCompZeroPress(
+    int* subgridDistanceIndices,
+    int numberOfBCnodes,
+    int numberOfPrecursorNodes,
+    int sizeQ,
+    real omega,
+    real* distributions,
+    real* subgridDistances,
+    uint* neighborX,
+    uint* neighborY,
+    uint* neighborZ,
+    uint* neighbors0PP,
+    uint* neighbors0PM,
+    uint* neighbors0MP,
+    uint* neighbors0MM,
+    real* weights0PP,
+    real* weights0PM,
+    real* weights0MP,
+    real* weights0MM,
+    real* vLast,
+    real* vCurrent,
+    real velocityX,
+    real velocityY,
+    real velocityZ,
+    real timeRatio,
+    real velocityRatio,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    ////////////////////////////////////////////////////////////////////////////////
+    //! - Get node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
+    //!
+    const unsigned nodeIndex = getNodeIndex();
+
+    if(nodeIndex>=numberOfBCnodes) return;
+
+    ////////////////////////////////////////////////////////////////////////////////
+    // interpolation of velocity
+    real vxLastInterpd, vyLastInterpd, vzLastInterpd;
+    real vxNextInterpd, vyNextInterpd, vzNextInterpd;
+
+    uint kNeighbor0PP = neighbors0PP[nodeIndex];
+    real d0PP = weights0PP[nodeIndex];
+
+    real* vxLast = vLast;
+    real* vyLast = &vLast[numberOfPrecursorNodes];
+    real* vzLast = &vLast[2*numberOfPrecursorNodes];
+
+    real* vxCurrent = vCurrent;
+    real* vyCurrent = &vCurrent[numberOfPrecursorNodes];
+    real* vzCurrent = &vCurrent[2*numberOfPrecursorNodes];
+
+    if(d0PP < 1e6)
+    {
+        uint kNeighbor0PM = neighbors0PM[nodeIndex];
+        uint kNeighbor0MP = neighbors0MP[nodeIndex];
+        uint kNeighbor0MM = neighbors0MM[nodeIndex];
+
+        real d0PM = weights0PM[nodeIndex];
+        real d0MP = weights0MP[nodeIndex];
+        real d0MM = weights0MM[nodeIndex];
+
+        real invWeightSum = 1.f/(d0PP+d0PM+d0MP+d0MM);
+
+        vxLastInterpd = (vxLast[kNeighbor0PP]*d0PP + vxLast[kNeighbor0PM]*d0PM + vxLast[kNeighbor0MP]*d0MP + vxLast[kNeighbor0MM]*d0MM)*invWeightSum;
+        vyLastInterpd = (vyLast[kNeighbor0PP]*d0PP + vyLast[kNeighbor0PM]*d0PM + vyLast[kNeighbor0MP]*d0MP + vyLast[kNeighbor0MM]*d0MM)*invWeightSum;
+        vzLastInterpd = (vzLast[kNeighbor0PP]*d0PP + vzLast[kNeighbor0PM]*d0PM + vzLast[kNeighbor0MP]*d0MP + vzLast[kNeighbor0MM]*d0MM)*invWeightSum;
+
+        vxNextInterpd = (vxCurrent[kNeighbor0PP]*d0PP + vxCurrent[kNeighbor0PM]*d0PM + vxCurrent[kNeighbor0MP]*d0MP + vxCurrent[kNeighbor0MM]*d0MM)*invWeightSum;
+        vyNextInterpd = (vyCurrent[kNeighbor0PP]*d0PP + vyCurrent[kNeighbor0PM]*d0PM + vyCurrent[kNeighbor0MP]*d0MP + vyCurrent[kNeighbor0MM]*d0MM)*invWeightSum;
+        vzNextInterpd = (vzCurrent[kNeighbor0PP]*d0PP + vzCurrent[kNeighbor0PM]*d0PM + vzCurrent[kNeighbor0MP]*d0MP + vzCurrent[kNeighbor0MM]*d0MM)*invWeightSum;
+    }
+    else
+    {
+        vxLastInterpd = vxLast[kNeighbor0PP];
+        vyLastInterpd = vyLast[kNeighbor0PP];
+        vzLastInterpd = vzLast[kNeighbor0PP];
+
+        vxNextInterpd = vxCurrent[kNeighbor0PP];
+        vyNextInterpd = vyCurrent[kNeighbor0PP];
+        vzNextInterpd = vzCurrent[kNeighbor0PP];
+    }
+
+    // if(nodeIndex==16300) printf("%f %f %f\n", vxLastInterpd, vyLastInterpd, vzLastInterpd);
+    real VeloX = (velocityX + (1.f-timeRatio)*vxLastInterpd + timeRatio*vxNextInterpd)/velocityRatio;
+    real VeloY = (velocityY + (1.f-timeRatio)*vyLastInterpd + timeRatio*vyNextInterpd)/velocityRatio;
+    real VeloZ = (velocityZ + (1.f-timeRatio)*vzLastInterpd + timeRatio*vzNextInterpd)/velocityRatio;
+    // From here on just a copy of QVelDeviceCompZeroPress
+    ////////////////////////////////////////////////////////////////////////////////
+
+    //////////////////////////////////////////////////////////////////////////
+    //! - Read distributions: style of reading and writing the distributions from/to stored arrays dependent on timestep
+    //! is based on the esoteric twist algorithm \ref <a href="https://doi.org/10.3390/computation5020019"><b>[ M. Geier
+    //! et al. (2017), DOI:10.3390/computation5020019 ]</b></a>
+    //!
+    Distributions27 dist;
+    getPointersToDistributions(dist, distributions, numberOfLBnodes, isEvenTimestep);
+
+    unsigned int KQK  = subgridDistanceIndices[nodeIndex];
+    unsigned int k000= KQK;
+    unsigned int kP00   = KQK;
+    unsigned int kM00   = neighborX[KQK];
+    unsigned int k0P0   = KQK;
+    unsigned int k0M0   = neighborY[KQK];
+    unsigned int k00P   = KQK;
+    unsigned int k00M   = neighborZ[KQK];
+    unsigned int kMM0  = neighborY[kM00];
+    unsigned int kPP0  = KQK;
+    unsigned int kPM0  = k0M0;
+    unsigned int kMP0  = kM00;
+    unsigned int kM0M  = neighborZ[kM00];
+    unsigned int kP0P  = KQK;
+    unsigned int kP0M  = k00M;
+    unsigned int kM0P  = kM00;
+    unsigned int k0PP  = KQK;
+    unsigned int k0MM  = neighborZ[k0M0];
+    unsigned int k0PM  = k00M;
+    unsigned int k0MP  = k0M0;
+    unsigned int kPMP = k0M0;
+    unsigned int kMPM = kM0M;
+    unsigned int kMPP = kM00;
+    unsigned int kPMM = k0MM;
+    unsigned int kMMP = kMM0;
+    unsigned int kPPM = k00M;
+    unsigned int kPPP = KQK;
+    unsigned int kMMM = neighborZ[kMM0];
+
+    ////////////////////////////////////////////////////////////////////////////////
+    //! - Set local distributions
+    //!
+    real f_M00 = (dist.f[DIR_P00])[kP00];
+    real f_P00 = (dist.f[DIR_M00])[kM00];
+    real f_0M0 = (dist.f[DIR_0P0])[k0P0];
+    real f_0P0 = (dist.f[DIR_0M0])[k0M0];
+    real f_00M = (dist.f[DIR_00P])[k00P];
+    real f_00P = (dist.f[DIR_00M])[k00M];
+    real f_MM0 = (dist.f[DIR_PP0])[kPP0];
+    real f_PP0 = (dist.f[DIR_MM0])[kMM0];
+    real f_MP0 = (dist.f[DIR_PM0])[kPM0];
+    real f_PM0 = (dist.f[DIR_MP0])[kMP0];
+    real f_M0M = (dist.f[DIR_P0P])[kP0P];
+    real f_P0P = (dist.f[DIR_M0M])[kM0M];
+    real f_M0P = (dist.f[DIR_P0M])[kP0M];
+    real f_P0M = (dist.f[DIR_M0P])[kM0P];
+    real f_0MM = (dist.f[DIR_0PP])[k0PP];
+    real f_0PP = (dist.f[DIR_0MM])[k0MM];
+    real f_0MP = (dist.f[DIR_0PM])[k0PM];
+    real f_0PM = (dist.f[DIR_0MP])[k0MP];
+    real f_MMM = (dist.f[DIR_PPP])[kPPP];
+    real f_PPM = (dist.f[DIR_MMP])[kMMP];
+    real f_MPM = (dist.f[DIR_PMP])[kPMP];
+    real f_PMM = (dist.f[DIR_MPP])[kMPP];
+    real f_MMP = (dist.f[DIR_PPM])[kPPM];
+    real f_PPP = (dist.f[DIR_MMM])[kMMM];
+    real f_MPP = (dist.f[DIR_PMM])[kPMM];
+    real f_PMP = (dist.f[DIR_MPM])[kMPM];
+
+    SubgridDistances27 subgridD;
+    getPointersToSubgridDistances(subgridD, subgridDistances, numberOfBCnodes);
+
+    ////////////////////////////////////////////////////////////////////////////////
+      real drho   =  f_PMP + f_MPP + f_PPP + f_MMP + f_PMM + f_MPM + f_PPM + f_MMM +
+                     f_0PM + f_0PP + f_0MP + f_0MM + f_P0M + f_M0P + f_P0P + f_M0M + f_PM0 + f_MP0 + f_PP0 + f_MM0 +
+                     f_00P + f_00M + f_0P0 + f_0M0 + f_P00 + f_M00 + ((dist.f[DIR_000])[k000]);
+
+      real vx1 =  (((f_PMP - f_MPM) - (f_MPP - f_PMM)) + ((f_PPP - f_MMM) - (f_MMP - f_PPM)) +
+                      ((f_P0M - f_M0P)   + (f_P0P - f_M0M))   + ((f_PM0 - f_MP0)   + (f_PP0 - f_MM0)) +
+                      (f_P00 - f_M00)) / (c1o1 + drho);
+
+
+      real vx2 =   ((-(f_PMP - f_MPM) + (f_MPP - f_PMM)) + ((f_PPP - f_MMM) - (f_MMP - f_PPM)) +
+                       ((f_0PM - f_0MP)   + (f_0PP - f_0MM))    + (-(f_PM0 - f_MP0)  + (f_PP0 - f_MM0)) +
+                       (f_0P0 - f_0M0)) / (c1o1 + drho);
+
+      real vx3 =   (((f_PMP - f_MPM) + (f_MPP - f_PMM)) + ((f_PPP - f_MMM) + (f_MMP - f_PPM)) +
+                       (-(f_0PM - f_0MP)  + (f_0PP - f_0MM))   + ((f_P0P - f_M0M)   - (f_P0M - f_M0P)) +
+                       (f_00P - f_00M)) / (c1o1 + drho);
+
+
+    // if(nodeIndex==16383 || nodeIndex==0) printf("k %u kQ %u drho = %f u %f v %f w %f\n", nodeIndex, KQK, drho, vx1, vx2, vx3);
+      real cu_sq=c3o2*(vx1*vx1+vx2*vx2+vx3*vx3) * (c1o1 + drho);
+    //////////////////////////////////////////////////////////////////////////
+
+
+    ////////////////////////////////////////////////////////////////////////////////
+    //! - Update distributions with subgrid distance (q) between zero and one
+    real feq, q, velocityLB, velocityBC;
+    q = (subgridD.q[DIR_P00])[nodeIndex];
+    if (q>=c0o1 && q<=c1o1) // only update distribution for q between zero and one
+    {
+        velocityLB = vx1;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c2o27);
+        velocityBC = VeloX;
+        (dist.f[DIR_M00])[kM00] = getInterpolatedDistributionForVeloWithPressureBC(q, f_P00, f_M00, feq, omega, drho, velocityBC, c2o27);
+    }
+
+    q = (subgridD.q[DIR_M00])[nodeIndex];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = -vx1;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c2o27);
+        velocityBC = -VeloX;
+        (dist.f[DIR_P00])[kP00] = getInterpolatedDistributionForVeloWithPressureBC(q, f_M00, f_P00, feq, omega, drho, velocityBC, c2o27);
+    }
+
+    q = (subgridD.q[DIR_0P0])[nodeIndex];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = vx2;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c2o27);
+        velocityBC = VeloY;
+        (dist.f[DIR_0M0])[DIR_0M0] = getInterpolatedDistributionForVeloWithPressureBC(q, f_0P0, f_0M0, feq, omega, drho, velocityBC, c2o27);
+    }
+
+    q = (subgridD.q[DIR_0M0])[nodeIndex];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = -vx2;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c2o27);
+        velocityBC = -VeloY;
+        (dist.f[DIR_0P0])[k0P0] = getInterpolatedDistributionForVeloWithPressureBC(q, f_0M0, f_0P0, feq, omega, drho, velocityBC, c2o27);
+    }
+
+    q = (subgridD.q[DIR_00P])[nodeIndex];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = vx3;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c2o27);
+        velocityBC = VeloZ;
+        (dist.f[DIR_00M])[k00M] = getInterpolatedDistributionForVeloWithPressureBC(q, f_00P, f_00M, feq, omega, drho, velocityBC, c2o27);
+    }
+
+    q = (subgridD.q[DIR_00M])[nodeIndex];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = -vx3;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c2o27);
+        velocityBC = -VeloZ;
+        (dist.f[DIR_00P])[k00P] = getInterpolatedDistributionForVeloWithPressureBC(q, f_00M, f_00P, feq, omega, drho, velocityBC, c2o27);
+    }
+
+    q = (subgridD.q[DIR_PP0])[nodeIndex];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = vx1 + vx2;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
+        velocityBC = VeloX + VeloY;
+        (dist.f[DIR_MM0])[kMM0] = getInterpolatedDistributionForVeloWithPressureBC(q, f_PP0, f_MM0, feq, omega, drho, velocityBC, c1o54);
+    }
+
+    q = (subgridD.q[DIR_MM0])[nodeIndex];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = -vx1 - vx2;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
+        velocityBC = -VeloX - VeloY;
+        (dist.f[DIR_PP0])[kPP0] = getInterpolatedDistributionForVeloWithPressureBC(q, f_MM0, f_PP0, feq, omega, drho, velocityBC, c1o54);
+    }
+
+    q = (subgridD.q[DIR_PM0])[nodeIndex];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = vx1 - vx2;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
+        velocityBC = VeloX - VeloY;
+        (dist.f[DIR_MP0])[kMP0] = getInterpolatedDistributionForVeloWithPressureBC(q, f_PM0, f_MP0, feq, omega, drho, velocityBC, c1o54);
+    }
+
+    q = (subgridD.q[DIR_MP0])[nodeIndex];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = -vx1 + vx2;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
+        velocityBC = -VeloX + VeloY;
+        (dist.f[DIR_PM0])[kPM0] = getInterpolatedDistributionForVeloWithPressureBC(q, f_MP0, f_PM0, feq, omega, drho, velocityBC, c1o54);
+    }
+
+    q = (subgridD.q[DIR_P0P])[nodeIndex];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = vx1 + vx3;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
+        velocityBC = VeloX + VeloZ;
+        (dist.f[DIR_M0M])[kM0M] = getInterpolatedDistributionForVeloWithPressureBC(q, f_P0P, f_M0M, feq, omega, drho, velocityBC, c1o54);
+    }
+
+    q = (subgridD.q[DIR_M0M])[nodeIndex];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = -vx1 - vx3;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
+        velocityBC = -VeloX - VeloZ;
+        (dist.f[DIR_P0P])[kP0P] = getInterpolatedDistributionForVeloWithPressureBC(q, f_M0M, f_P0P, feq, omega, drho, velocityBC, c1o54);
+    }
+
+    q = (subgridD.q[DIR_P0M])[nodeIndex];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = vx1 - vx3;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
+        velocityBC = VeloX - VeloZ;
+        (dist.f[DIR_M0P])[kM0P] = getInterpolatedDistributionForVeloWithPressureBC(q, f_P0M, f_M0P, feq, omega, drho, velocityBC, c1o54);
+    }
+
+    q = (subgridD.q[DIR_M0P])[nodeIndex];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = -vx1 + vx3;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
+        velocityBC = -VeloX + VeloZ;
+        (dist.f[DIR_P0M])[kP0M] = getInterpolatedDistributionForVeloWithPressureBC(q, f_M0P, f_P0M, feq, omega, drho, velocityBC, c1o54);
+    }
+
+    q = (subgridD.q[DIR_0PP])[nodeIndex];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = vx2 + vx3;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
+        velocityBC = VeloY + VeloZ;
+        (dist.f[DIR_0MM])[k0MM] = getInterpolatedDistributionForVeloWithPressureBC(q, f_0PP, f_0MM, feq, omega, drho, velocityBC, c1o54);
+    }
+
+    q = (subgridD.q[DIR_0MM])[nodeIndex];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = -vx2 - vx3;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
+        velocityBC = -VeloY - VeloZ;
+        (dist.f[DIR_0PP])[k0PP] = getInterpolatedDistributionForVeloWithPressureBC(q, f_0MM, f_0PP, feq, omega, drho, velocityBC, c1o54);
+    }
+
+    q = (subgridD.q[DIR_0PM])[nodeIndex];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = vx2 - vx3;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
+        velocityBC = VeloY - VeloZ;
+        (dist.f[DIR_0MP])[k0MP] = getInterpolatedDistributionForVeloWithPressureBC(q, f_0PM, f_0PP, feq, omega, drho, velocityBC, c1o54);
+    }
+
+    q = (subgridD.q[DIR_0MP])[nodeIndex];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = -vx2 + vx3;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
+        velocityBC = -VeloY + VeloZ;
+        (dist.f[DIR_0PM])[k0PM] = getInterpolatedDistributionForVeloWithPressureBC(q, f_0PP, f_0PM, feq, omega, drho, velocityBC, c1o54);
+    }
+
+    q = (subgridD.q[DIR_PPP])[nodeIndex];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = vx1 + vx2 + vx3;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o216);
+        velocityBC = VeloX + VeloY + VeloZ;
+        (dist.f[DIR_MMM])[kMMM] = getInterpolatedDistributionForVeloWithPressureBC(q, f_PPP, f_MMM, feq, omega, drho, velocityBC, c1o216);
+    }
+
+    q = (subgridD.q[DIR_MMM])[nodeIndex];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = -vx1 - vx2 - vx3;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o216);
+        velocityBC = -VeloX - VeloY - VeloZ;
+        (dist.f[DIR_PPP])[kPPP] = getInterpolatedDistributionForVeloWithPressureBC(q, f_MMM, f_PPP, feq, omega, drho, velocityBC, c1o216);
+    }
+
+    q = (subgridD.q[DIR_PPM])[nodeIndex];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = vx1 + vx2 - vx3;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o216);
+        velocityBC = VeloX + VeloY - VeloZ;
+        (dist.f[DIR_MMP])[kMMP] = getInterpolatedDistributionForVeloWithPressureBC(q, f_PPM, f_MMP, feq, omega, drho, velocityBC, c1o216);
+    }
+
+    q = (subgridD.q[DIR_MMP])[nodeIndex];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = -vx1 - vx2 + vx3;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o216);
+        velocityBC = -VeloX - VeloY + VeloZ;
+        (dist.f[DIR_PPM])[kPPM] = getInterpolatedDistributionForVeloWithPressureBC(q, f_MMP, f_PPM, feq, omega, drho, velocityBC, c1o216);
+    }
+
+    q = (subgridD.q[DIR_PMP])[nodeIndex];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = vx1 - vx2 + vx3;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o216);
+        velocityBC = VeloX - VeloY + VeloZ;
+        (dist.f[DIR_MPM])[kMPM] = getInterpolatedDistributionForVeloWithPressureBC(q, f_PMP, f_MPM, feq, omega, drho, velocityBC, c1o216);
+    }
+
+    q = (subgridD.q[DIR_MPM])[nodeIndex];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = -vx1 + vx2 - vx3;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o216);
+        velocityBC = -VeloX + VeloY - VeloZ;
+        (dist.f[DIR_PMP])[kPMP] = getInterpolatedDistributionForVeloWithPressureBC(q, f_MPM, f_PMP, feq, omega, drho, velocityBC, c1o216);
+    }
+
+    q = (subgridD.q[DIR_PMM])[nodeIndex];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = vx1 - vx2 - vx3;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o216);
+        velocityBC = VeloX - VeloY - VeloZ;
+        (dist.f[DIR_MPP])[kMPP] = getInterpolatedDistributionForVeloWithPressureBC(q, f_PMM, f_MPP, feq, omega, drho, velocityBC, c1o216);
+    }
+
+    q = (subgridD.q[DIR_MPP])[nodeIndex];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = -vx1 + vx2 + vx3;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o216);
+        velocityBC = -VeloX + VeloY + VeloZ;
+        (dist.f[DIR_PMM])[kPMM] = getInterpolatedDistributionForVeloWithPressureBC(q, f_MPP, f_PMM, feq, omega, drho, velocityBC, c1o216);
+    }
+}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//! \brief Precursor boundary condition (equilibrium variant): sets all 27 distributions addressed from a boundary
+//! node to equilibrium values for zero density fluctuation and a velocity derived from precursor data.
+//!
+//! The velocity is interpolated in space from up to four precursor nodes (0PP, 0PM, 0MP, 0MM) with the supplied
+//! weights, blended linearly in time between vLast and vCurrent, offset by (velocityX, velocityY, velocityZ) and
+//! divided by velocityRatio.
+//! The values previously stored in the distributions do not enter the result, so they are not read.
+//!
+//! \param subgridDistanceIndices lattice node index for each boundary node
+//! \param numberOfBCnodes number of boundary nodes; threads with nodeIndex >= numberOfBCnodes return immediately
+//! \param numberOfPrecursorNodes offset between the x, y and z component blocks inside vLast and vCurrent
+//! \param omega not used by this kernel
+//! \param distributions distribution storage handed to getPointersToDistributions
+//! \param neighborX,neighborY,neighborZ neighbor index arrays used to derive the 27 write positions
+//! \param neighbors0PP,neighbors0PM,neighbors0MP,neighbors0MM indices of the precursor nodes used for interpolation
+//! \param weights0PP,weights0PM,weights0MP,weights0MM interpolation weights; the weighted average is only formed if
+//! weights0PP < 1e6, otherwise the value at the 0PP precursor node is used as is
+//! \param vLast,vCurrent precursor velocities at the two time levels, each stored as x block, y block, z block
+//! \param velocityX,velocityY,velocityZ constant velocity added to the interpolated precursor velocity
+//! \param timeRatio temporal blending factor: 0 selects vLast, 1 selects vCurrent
+//! \param velocityRatio divisor applied to the resulting velocity (presumably a unit conversion - confirm with
+//! the caller)
+//! \param numberOfLBnodes handed to getPointersToDistributions
+//! \param isEvenTimestep its negation is handed to getPointersToDistributions
+__global__ void PrecursorDeviceEQ27(
+    int *subgridDistanceIndices,
+    int numberOfBCnodes,
+    int numberOfPrecursorNodes,
+    real omega,
+    real* distributions,
+    uint* neighborX,
+    uint* neighborY,
+    uint* neighborZ,
+    uint* neighbors0PP,
+    uint* neighbors0PM,
+    uint* neighbors0MP,
+    uint* neighbors0MM,
+    real* weights0PP,
+    real* weights0PM,
+    real* weights0MP,
+    real* weights0MM,
+    real* vLast,
+    real* vCurrent,
+    real velocityX,
+    real velocityY,
+    real velocityZ,
+    real timeRatio,
+    real velocityRatio,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    ////////////////////////////////////////////////////////////////////////////////
+    //! - Get node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
+    //!
+    const unsigned nodeIndex = getNodeIndex();
+
+    if(nodeIndex>=numberOfBCnodes) return;
+
+    ////////////////////////////////////////////////////////////////////////////////
+    //! - Split the precursor buffers into their x, y and z velocity component blocks
+    //!
+    const real* vxLast = vLast;
+    const real* vyLast = &vLast[numberOfPrecursorNodes];
+    const real* vzLast = &vLast[2*numberOfPrecursorNodes];
+
+    const real* vxCurrent = vCurrent;
+    const real* vyCurrent = &vCurrent[numberOfPrecursorNodes];
+    const real* vzCurrent = &vCurrent[2*numberOfPrecursorNodes];
+
+    ////////////////////////////////////////////////////////////////////////////////
+    //! - Interpolate the precursor velocity in space for both time levels
+    //!
+    real vxLastInterpd, vyLastInterpd, vzLastInterpd;
+    real vxNextInterpd, vyNextInterpd, vzNextInterpd;
+
+    const uint kNeighbor0PP = neighbors0PP[nodeIndex];
+    const real d0PP = weights0PP[nodeIndex];
+
+    if(d0PP < 1e6)
+    {
+        const uint kNeighbor0PM = neighbors0PM[nodeIndex];
+        const uint kNeighbor0MP = neighbors0MP[nodeIndex];
+        const uint kNeighbor0MM = neighbors0MM[nodeIndex];
+
+        const real d0PM = weights0PM[nodeIndex];
+        const real d0MP = weights0MP[nodeIndex];
+        const real d0MM = weights0MM[nodeIndex];
+
+        const real invWeightSum = c1o1/(d0PP+d0PM+d0MP+d0MM);
+
+        vxLastInterpd = (vxLast[kNeighbor0PP]*d0PP + vxLast[kNeighbor0PM]*d0PM + vxLast[kNeighbor0MP]*d0MP + vxLast[kNeighbor0MM]*d0MM)*invWeightSum;
+        vyLastInterpd = (vyLast[kNeighbor0PP]*d0PP + vyLast[kNeighbor0PM]*d0PM + vyLast[kNeighbor0MP]*d0MP + vyLast[kNeighbor0MM]*d0MM)*invWeightSum;
+        vzLastInterpd = (vzLast[kNeighbor0PP]*d0PP + vzLast[kNeighbor0PM]*d0PM + vzLast[kNeighbor0MP]*d0MP + vzLast[kNeighbor0MM]*d0MM)*invWeightSum;
+
+        vxNextInterpd = (vxCurrent[kNeighbor0PP]*d0PP + vxCurrent[kNeighbor0PM]*d0PM + vxCurrent[kNeighbor0MP]*d0MP + vxCurrent[kNeighbor0MM]*d0MM)*invWeightSum;
+        vyNextInterpd = (vyCurrent[kNeighbor0PP]*d0PP + vyCurrent[kNeighbor0PM]*d0PM + vyCurrent[kNeighbor0MP]*d0MP + vyCurrent[kNeighbor0MM]*d0MM)*invWeightSum;
+        vzNextInterpd = (vzCurrent[kNeighbor0PP]*d0PP + vzCurrent[kNeighbor0PM]*d0PM + vzCurrent[kNeighbor0MP]*d0MP + vzCurrent[kNeighbor0MM]*d0MM)*invWeightSum;
+    }
+    else
+    {
+        // weights0PP is not below 1e6: use the values of the 0PP precursor node without any weighting
+        vxLastInterpd = vxLast[kNeighbor0PP];
+        vyLastInterpd = vyLast[kNeighbor0PP];
+        vzLastInterpd = vzLast[kNeighbor0PP];
+
+        vxNextInterpd = vxCurrent[kNeighbor0PP];
+        vyNextInterpd = vyCurrent[kNeighbor0PP];
+        vzNextInterpd = vzCurrent[kNeighbor0PP];
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    //! - Blend both time levels linearly, add the constant velocity and divide by velocityRatio
+    //!
+    const real weightLast = c1o1 - timeRatio;
+    const real VeloX = (velocityX + weightLast*vxLastInterpd + timeRatio*vxNextInterpd)/velocityRatio;
+    const real VeloY = (velocityY + weightLast*vyLastInterpd + timeRatio*vyNextInterpd)/velocityRatio;
+    const real VeloZ = (velocityZ + weightLast*vzLastInterpd + timeRatio*vzNextInterpd)/velocityRatio;
+
+    //////////////////////////////////////////////////////////////////////////
+    //! - Get pointers to the distributions: style of reading and writing the distributions from/to stored arrays
+    //! dependent on timestep is based on the esoteric twist algorithm \ref
+    //! <a href="https://doi.org/10.3390/computation5020019"><b>[ M. Geier et al. (2017), DOI:10.3390/computation5020019 ]</b></a>
+    //!
+    Distributions27 dist;
+    getPointersToDistributions(dist, distributions, numberOfLBnodes, !isEvenTimestep);
+
+    ////////////////////////////////////////////////////////////////////////////////
+    //! - Set the node indices used for writing (node itself and neighbors looked up via neighborX/Y/Z)
+    //!
+    const unsigned int KQK  = subgridDistanceIndices[nodeIndex];
+    const unsigned int k000 = KQK;
+    const unsigned int kP00 = KQK;
+    const unsigned int kM00 = neighborX[KQK];
+    const unsigned int k0P0 = KQK;
+    const unsigned int k0M0 = neighborY[KQK];
+    const unsigned int k00P = KQK;
+    const unsigned int k00M = neighborZ[KQK];
+    const unsigned int kMM0 = neighborY[kM00];
+    const unsigned int kPP0 = KQK;
+    const unsigned int kPM0 = k0M0;
+    const unsigned int kMP0 = kM00;
+    const unsigned int kM0M = neighborZ[kM00];
+    const unsigned int kP0P = KQK;
+    const unsigned int kP0M = k00M;
+    const unsigned int k0PP = KQK;
+    const unsigned int k0MM = neighborZ[k0M0];
+    const unsigned int kM0P = kM00;
+    const unsigned int k0PM = k00M;
+    const unsigned int k0MP = k0M0;
+    const unsigned int kPMP = k0M0;
+    const unsigned int kMPM = kM0M;
+    const unsigned int kMPP = kM00;
+    const unsigned int kPMM = k0MM;
+    const unsigned int kMMP = kMM0;
+    const unsigned int kPPM = k00M;
+    const unsigned int kPPP = KQK;
+    const unsigned int kMMM = neighborZ[kMM0];
+
+    ////////////////////////////////////////////////////////////////////////////////
+    //! - Set macroscopic quantities: zero density fluctuation and the prescribed precursor velocity
+    //!
+    const real drho = c0o1;
+    const real vx1  = VeloX;
+    const real vx2  = VeloY;
+    const real vx3  = VeloZ;
+    const real cusq = c3o2 * (vx1 * vx1 + vx2 * vx2 + vx3 * vx3);
+
+    ////////////////////////////////////////////////////////////////////////////////
+    //! - Compute the equilibrium value for every direction from drho and the velocity only
+    //!
+    const real f_000 = c8o27* (drho-(drho+c1o1)*cusq);
+    const real f_P00 = c2o27* (drho+(drho+c1o1)*(c3o1*( vx1        )+c9o2*( vx1        )*( vx1        )-cusq));
+    const real f_M00 = c2o27* (drho+(drho+c1o1)*(c3o1*(-vx1        )+c9o2*(-vx1        )*(-vx1        )-cusq));
+    const real f_0P0 = c2o27* (drho+(drho+c1o1)*(c3o1*(    vx2     )+c9o2*(     vx2    )*(     vx2    )-cusq));
+    const real f_0M0 = c2o27* (drho+(drho+c1o1)*(c3o1*(   -vx2     )+c9o2*(    -vx2    )*(    -vx2    )-cusq));
+    const real f_00P = c2o27* (drho+(drho+c1o1)*(c3o1*(         vx3)+c9o2*(         vx3)*(         vx3)-cusq));
+    const real f_00M = c2o27* (drho+(drho+c1o1)*(c3o1*(        -vx3)+c9o2*(        -vx3)*(        -vx3)-cusq));
+    const real f_PP0 = c1o54* (drho+(drho+c1o1)*(c3o1*( vx1+vx2    )+c9o2*( vx1+vx2    )*( vx1+vx2    )-cusq));
+    const real f_MM0 = c1o54* (drho+(drho+c1o1)*(c3o1*(-vx1-vx2    )+c9o2*(-vx1-vx2    )*(-vx1-vx2    )-cusq));
+    const real f_PM0 = c1o54* (drho+(drho+c1o1)*(c3o1*( vx1-vx2    )+c9o2*( vx1-vx2    )*( vx1-vx2    )-cusq));
+    const real f_MP0 = c1o54* (drho+(drho+c1o1)*(c3o1*(-vx1+vx2    )+c9o2*(-vx1+vx2    )*(-vx1+vx2    )-cusq));
+    const real f_P0P = c1o54* (drho+(drho+c1o1)*(c3o1*( vx1    +vx3)+c9o2*( vx1    +vx3)*( vx1    +vx3)-cusq));
+    const real f_M0M = c1o54* (drho+(drho+c1o1)*(c3o1*(-vx1    -vx3)+c9o2*(-vx1    -vx3)*(-vx1    -vx3)-cusq));
+    const real f_P0M = c1o54* (drho+(drho+c1o1)*(c3o1*( vx1    -vx3)+c9o2*( vx1    -vx3)*( vx1    -vx3)-cusq));
+    const real f_M0P = c1o54* (drho+(drho+c1o1)*(c3o1*(-vx1    +vx3)+c9o2*(-vx1    +vx3)*(-vx1    +vx3)-cusq));
+    const real f_0PP = c1o54* (drho+(drho+c1o1)*(c3o1*(     vx2+vx3)+c9o2*(     vx2+vx3)*(     vx2+vx3)-cusq));
+    const real f_0MM = c1o54* (drho+(drho+c1o1)*(c3o1*(    -vx2-vx3)+c9o2*(    -vx2-vx3)*(    -vx2-vx3)-cusq));
+    const real f_0PM = c1o54* (drho+(drho+c1o1)*(c3o1*(     vx2-vx3)+c9o2*(     vx2-vx3)*(     vx2-vx3)-cusq));
+    const real f_0MP = c1o54* (drho+(drho+c1o1)*(c3o1*(    -vx2+vx3)+c9o2*(    -vx2+vx3)*(    -vx2+vx3)-cusq));
+    const real f_PPP = c1o216*(drho+(drho+c1o1)*(c3o1*( vx1+vx2+vx3)+c9o2*( vx1+vx2+vx3)*( vx1+vx2+vx3)-cusq));
+    const real f_MMM = c1o216*(drho+(drho+c1o1)*(c3o1*(-vx1-vx2-vx3)+c9o2*(-vx1-vx2-vx3)*(-vx1-vx2-vx3)-cusq));
+    const real f_PPM = c1o216*(drho+(drho+c1o1)*(c3o1*( vx1+vx2-vx3)+c9o2*( vx1+vx2-vx3)*( vx1+vx2-vx3)-cusq));
+    const real f_MMP = c1o216*(drho+(drho+c1o1)*(c3o1*(-vx1-vx2+vx3)+c9o2*(-vx1-vx2+vx3)*(-vx1-vx2+vx3)-cusq));
+    const real f_PMP = c1o216*(drho+(drho+c1o1)*(c3o1*( vx1-vx2+vx3)+c9o2*( vx1-vx2+vx3)*( vx1-vx2+vx3)-cusq));
+    const real f_MPM = c1o216*(drho+(drho+c1o1)*(c3o1*(-vx1+vx2-vx3)+c9o2*(-vx1+vx2-vx3)*(-vx1+vx2-vx3)-cusq));
+    const real f_PMM = c1o216*(drho+(drho+c1o1)*(c3o1*( vx1-vx2-vx3)+c9o2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cusq));
+    const real f_MPP = c1o216*(drho+(drho+c1o1)*(c3o1*(-vx1+vx2+vx3)+c9o2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cusq));
+
+    ////////////////////////////////////////////////////////////////////////////////
+    //! - Write the new distributions to the bc nodes: slot DIR_X receives the value of the opposite direction
+    //!
+    (dist.f[DIR_P00])[kP00] = f_M00;
+    (dist.f[DIR_PP0])[kPP0] = f_MM0;
+    (dist.f[DIR_P0M])[kP0M] = f_M0P;
+    (dist.f[DIR_PM0])[kPM0] = f_MP0;
+    (dist.f[DIR_PMP])[kPMP] = f_MPM;
+    (dist.f[DIR_P0P])[kP0P] = f_M0M;
+    (dist.f[DIR_PPM])[kPPM] = f_MMP;
+    (dist.f[DIR_PPP])[kPPP] = f_MMM;
+    (dist.f[DIR_PMM])[kPMM] = f_MPP;
+
+    (dist.f[DIR_M00])[kM00] = f_P00;
+    (dist.f[DIR_MM0])[kMM0] = f_PP0;
+    (dist.f[DIR_M0M])[kM0M] = f_P0P;
+    (dist.f[DIR_MP0])[kMP0] = f_PM0;
+    (dist.f[DIR_M0P])[kM0P] = f_P0M;
+    (dist.f[DIR_MMM])[kMMM] = f_PPP;
+    (dist.f[DIR_MMP])[kMMP] = f_PPM;
+    (dist.f[DIR_MPP])[kMPP] = f_PMM;
+    (dist.f[DIR_MPM])[kMPM] = f_PMP;
+
+    (dist.f[DIR_0P0])[k0P0] = f_0M0;
+    (dist.f[DIR_0M0])[k0M0] = f_0P0;
+    (dist.f[DIR_00P])[k00P] = f_00M;
+    (dist.f[DIR_00M])[k00M] = f_00P;
+    (dist.f[DIR_0PP])[k0PP] = f_0MM;
+    (dist.f[DIR_0MM])[k0MM] = f_0PP;
+    (dist.f[DIR_0PM])[k0PM] = f_0MP;
+    (dist.f[DIR_0MP])[k0MP] = f_0PM;
+    (dist.f[DIR_000])[k000] = f_000;
+}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//! \brief Precursor boundary condition that prescribes nine distributions per boundary node from precursor data,
+//! interpolated in space between up to four precursor nodes and linearly in time between fsLast and fsNext.
+//!
+//! fsLast and fsNext each consist of nine consecutive blocks of numberOfPrecursorNodes values. Block i is written
+//! to the following slot of the pointers obtained from getPointersToDistributions:
+//!   0: DIR_P00, 1: DIR_PP0, 2: DIR_PM0, 3: DIR_P0P, 4: DIR_P0M, 5: DIR_PPP, 6: DIR_PMP, 7: DIR_PPM, 8: DIR_PMM
+//!
+//! The weighted average over the 0PP, 0PM, 0MP and 0MM precursor nodes is only formed if weights0PP < 1e6;
+//! otherwise the value at the 0PP precursor node is used as is. timeRatio = 0 selects fsLast, timeRatio = 1
+//! selects fsNext. neighborX is accepted but not used by this kernel.
+__global__ void PrecursorDeviceDistributions(
+    int *subgridDistanceIndices,
+    int numberOfBCnodes,
+    int numberOfPrecursorNodes,
+    real* distributions,
+    uint* neighborX,
+    uint* neighborY,
+    uint* neighborZ,
+    uint* neighbors0PP,
+    uint* neighbors0PM,
+    uint* neighbors0MP,
+    uint* neighbors0MM,
+    real* weights0PP,
+    real* weights0PM,
+    real* weights0MP,
+    real* weights0MM,
+    real* fsLast,
+    real* fsNext,
+    real timeRatio,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    ////////////////////////////////////////////////////////////////////////////////
+    //! - Get node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
+    //!
+    const unsigned nodeIndex = getNodeIndex();
+
+    if(nodeIndex>=numberOfBCnodes) return;
+
+    ////////////////////////////////////////////////////////////////////////////////
+    //! - Interpolate each of the nine stored blocks in space and blend the two time levels
+    //!
+    const int numberOfBlocks = 9;
+    real fPrescribed[numberOfBlocks];
+
+    const real shareOfLast = 1.f-timeRatio;
+
+    const uint kNeighbor0PP = neighbors0PP[nodeIndex];
+    const real d0PP = weights0PP[nodeIndex];
+
+    if(d0PP<1e6)
+    {
+        const uint kNeighbor0PM = neighbors0PM[nodeIndex];
+        const uint kNeighbor0MP = neighbors0MP[nodeIndex];
+        const uint kNeighbor0MM = neighbors0MM[nodeIndex];
+
+        const real d0PM = weights0PM[nodeIndex];
+        const real d0MP = weights0MP[nodeIndex];
+        const real d0MM = weights0MM[nodeIndex];
+
+        const real invWeightSum = 1.f/(d0PP+d0PM+d0MP+d0MM);
+
+        #pragma unroll
+        for(int block = 0; block < numberOfBlocks; block++)
+        {
+            const real* fLast = &fsLast[block*numberOfPrecursorNodes];
+            const real* fNext = &fsNext[block*numberOfPrecursorNodes];
+
+            const real fLastInterp = (fLast[kNeighbor0PP]*d0PP + fLast[kNeighbor0PM]*d0PM + fLast[kNeighbor0MP]*d0MP + fLast[kNeighbor0MM]*d0MM)*invWeightSum;
+            const real fNextInterp = (fNext[kNeighbor0PP]*d0PP + fNext[kNeighbor0PM]*d0PM + fNext[kNeighbor0MP]*d0MP + fNext[kNeighbor0MM]*d0MM)*invWeightSum;
+
+            fPrescribed[block] = fLastInterp*shareOfLast + fNextInterp*timeRatio;
+        }
+    }
+    else
+    {
+        // weights0PP is not below 1e6: take the 0PP precursor node's values without any weighting
+        #pragma unroll
+        for(int block = 0; block < numberOfBlocks; block++)
+        {
+            const real* fLast = &fsLast[block*numberOfPrecursorNodes];
+            const real* fNext = &fsNext[block*numberOfPrecursorNodes];
+
+            fPrescribed[block] = fLast[kNeighbor0PP]*shareOfLast + fNext[kNeighbor0PP]*timeRatio;
+        }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    //! - Get pointers to the distributions: style of reading and writing the distributions from/to stored arrays
+    //! dependent on timestep is based on the esoteric twist algorithm \ref
+    //! <a href="https://doi.org/10.3390/computation5020019"><b>[ M. Geier et al. (2017), DOI:10.3390/computation5020019 ]</b></a>
+    //!
+    Distributions27 dist;
+    getPointersToDistributions(dist, distributions, numberOfLBnodes, !isEvenTimestep);
+
+    ////////////////////////////////////////////////////////////////////////////////
+    //! - Set the four node indices the nine slots are written at
+    //!
+    const unsigned int kNode   = subgridDistanceIndices[nodeIndex]; // used for P00, PP0, P0P, PPP
+    const unsigned int kNodeY  = neighborY[kNode];                  // used for PM0, PMP
+    const unsigned int kNodeZ  = neighborZ[kNode];                  // used for P0M, PPM
+    const unsigned int kNodeYZ = neighborZ[kNodeY];                 // used for PMM
+
+    ////////////////////////////////////////////////////////////////////////////////
+    //! - Write the prescribed distributions
+    //!
+    dist.f[DIR_P00][kNode]   = fPrescribed[0];
+    dist.f[DIR_PP0][kNode]   = fPrescribed[1];
+    dist.f[DIR_PM0][kNodeY]  = fPrescribed[2];
+    dist.f[DIR_P0P][kNode]   = fPrescribed[3];
+    dist.f[DIR_P0M][kNodeZ]  = fPrescribed[4];
+    dist.f[DIR_PPP][kNode]   = fPrescribed[5];
+    dist.f[DIR_PMP][kNodeY]  = fPrescribed[6];
+    dist.f[DIR_PPM][kNodeZ]  = fPrescribed[7];
+    dist.f[DIR_PMM][kNodeYZ] = fPrescribed[8];
+}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// NOTE: Has not been tested after bug fix!
+__global__ void QPrecursorDeviceDistributions(
+    int* subgridDistanceIndices,
+    real* subgridDistances,
+    int sizeQ,
+    int numberOfBCnodes,
+    int numberOfPrecursorNodes,
+    real* distributions,
+    uint* neighborX,
+    uint* neighborY,
+    uint* neighborZ,
+    uint* neighbors0PP,
+    uint* neighbors0PM,
+    uint* neighbors0MP,
+    uint* neighbors0MM,
+    real* weights0PP,
+    real* weights0PM,
+    real* weights0MP,
+    real* weights0MM,
+    real* fsLast,
+    real* fsNext,
+    real timeRatio,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
+{
+    ////////////////////////////////////////////////////////////////////////////////
+    //! - Get node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
+    //!
+    const unsigned nodeIndex = getNodeIndex();
+
+    if(nodeIndex>=numberOfBCnodes) return;
+
+    uint kNeighbor0PP = neighbors0PP[nodeIndex];
+    real d0PP = weights0PP[nodeIndex];
+
+    real f0LastInterp, f1LastInterp, f2LastInterp, f3LastInterp, f4LastInterp, f5LastInterp, f6LastInterp, f7LastInterp, f8LastInterp;
+    real f0NextInterp, f1NextInterp, f2NextInterp, f3NextInterp, f4NextInterp, f5NextInterp, f6NextInterp, f7NextInterp, f8NextInterp;
+
+    real* f0Last = fsLast;
+    real* f1Last = &fsLast[  numberOfPrecursorNodes];
+    real* f2Last = &fsLast[2*numberOfPrecursorNodes];
+    real* f3Last = &fsLast[3*numberOfPrecursorNodes];
+    real* f4Last = &fsLast[4*numberOfPrecursorNodes];
+    real* f5Last = &fsLast[5*numberOfPrecursorNodes];
+    real* f6Last = &fsLast[6*numberOfPrecursorNodes];
+    real* f7Last = &fsLast[7*numberOfPrecursorNodes];
+    real* f8Last = &fsLast[8*numberOfPrecursorNodes];
+
+    real* f0Next = fsNext;
+    real* f1Next = &fsNext[  numberOfPrecursorNodes];
+    real* f2Next = &fsNext[2*numberOfPrecursorNodes];
+    real* f3Next = &fsNext[3*numberOfPrecursorNodes];
+    real* f4Next = &fsNext[4*numberOfPrecursorNodes];
+    real* f5Next = &fsNext[5*numberOfPrecursorNodes];
+    real* f6Next = &fsNext[6*numberOfPrecursorNodes];
+    real* f7Next = &fsNext[7*numberOfPrecursorNodes];
+    real* f8Next = &fsNext[8*numberOfPrecursorNodes];
+
+
+    if(d0PP<1e6)
+    {
+        uint kNeighbor0PM = neighbors0PM[nodeIndex];
+        uint kNeighbor0MP = neighbors0MP[nodeIndex];
+        uint kNeighbor0MM = neighbors0MM[nodeIndex];
+
+        real d0PM = weights0PM[nodeIndex];
+        real d0MP = weights0MP[nodeIndex];
+        real d0MM = weights0MM[nodeIndex];
+
+        real invWeightSum = 1.f/(d0PP+d0PM+d0MP+d0MM);
+
+        f0LastInterp = (f0Last[kNeighbor0PP]*d0PP + f0Last[kNeighbor0PM]*d0PM + f0Last[kNeighbor0MP]*d0MP + f0Last[kNeighbor0MM]*d0MM)*invWeightSum;
+        f0NextInterp = (f0Next[kNeighbor0PP]*d0PP + f0Next[kNeighbor0PM]*d0PM + f0Next[kNeighbor0MP]*d0MP + f0Next[kNeighbor0MM]*d0MM)*invWeightSum;
+
+        f1LastInterp = (f1Last[kNeighbor0PP]*d0PP + f1Last[kNeighbor0PM]*d0PM + f1Last[kNeighbor0MP]*d0MP + f1Last[kNeighbor0MM]*d0MM)*invWeightSum;
+        f1NextInterp = (f1Next[kNeighbor0PP]*d0PP + f1Next[kNeighbor0PM]*d0PM + f1Next[kNeighbor0MP]*d0MP + f1Next[kNeighbor0MM]*d0MM)*invWeightSum;
+
+        f2LastInterp = (f2Last[kNeighbor0PP]*d0PP + f2Last[kNeighbor0PM]*d0PM + f2Last[kNeighbor0MP]*d0MP + f2Last[kNeighbor0MM]*d0MM)*invWeightSum;
+        f2NextInterp = (f2Next[kNeighbor0PP]*d0PP + f2Next[kNeighbor0PM]*d0PM + f2Next[kNeighbor0MP]*d0MP + f2Next[kNeighbor0MM]*d0MM)*invWeightSum;
+
+        f3LastInterp = (f3Last[kNeighbor0PP]*d0PP + f3Last[kNeighbor0PM]*d0PM + f3Last[kNeighbor0MP]*d0MP + f3Last[kNeighbor0MM]*d0MM)*invWeightSum;
+        f3NextInterp = (f3Next[kNeighbor0PP]*d0PP + f3Next[kNeighbor0PM]*d0PM + f3Next[kNeighbor0MP]*d0MP + f3Next[kNeighbor0MM]*d0MM)*invWeightSum;
+
+        f4LastInterp = (f4Last[kNeighbor0PP]*d0PP + f4Last[kNeighbor0PM]*d0PM + f4Last[kNeighbor0MP]*d0MP + f4Last[kNeighbor0MM]*d0MM)*invWeightSum;
+        f4NextInterp = (f4Next[kNeighbor0PP]*d0PP + f4Next[kNeighbor0PM]*d0PM + f4Next[kNeighbor0MP]*d0MP + f4Next[kNeighbor0MM]*d0MM)*invWeightSum;
+
+        f5LastInterp = (f5Last[kNeighbor0PP]*d0PP + f5Last[kNeighbor0PM]*d0PM + f5Last[kNeighbor0MP]*d0MP + f5Last[kNeighbor0MM]*d0MM)*invWeightSum;
+        f5NextInterp = (f5Next[kNeighbor0PP]*d0PP + f5Next[kNeighbor0PM]*d0PM + f5Next[kNeighbor0MP]*d0MP + f5Next[kNeighbor0MM]*d0MM)*invWeightSum;
+
+        f6LastInterp = (f6Last[kNeighbor0PP]*d0PP + f6Last[kNeighbor0PM]*d0PM + f6Last[kNeighbor0MP]*d0MP + f6Last[kNeighbor0MM]*d0MM)*invWeightSum;
+        f6NextInterp = (f6Next[kNeighbor0PP]*d0PP + f6Next[kNeighbor0PM]*d0PM + f6Next[kNeighbor0MP]*d0MP + f6Next[kNeighbor0MM]*d0MM)*invWeightSum;
+
+        f7LastInterp = (f7Last[kNeighbor0PP]*d0PP + f7Last[kNeighbor0PM]*d0PM + f7Last[kNeighbor0MP]*d0MP + f7Last[kNeighbor0MM]*d0MM)*invWeightSum;
+        f7NextInterp = (f7Next[kNeighbor0PP]*d0PP + f7Next[kNeighbor0PM]*d0PM + f7Next[kNeighbor0MP]*d0MP + f7Next[kNeighbor0MM]*d0MM)*invWeightSum;
+
+        f8LastInterp = (f8Last[kNeighbor0PP]*d0PP + f8Last[kNeighbor0PM]*d0PM + f8Last[kNeighbor0MP]*d0MP + f8Last[kNeighbor0MM]*d0MM)*invWeightSum;
+        f8NextInterp = (f8Next[kNeighbor0PP]*d0PP + f8Next[kNeighbor0PM]*d0PM + f8Next[kNeighbor0MP]*d0MP + f8Next[kNeighbor0MM]*d0MM)*invWeightSum;
+
+    } else {
+        f0LastInterp = f0Last[kNeighbor0PP];
+        f1LastInterp = f1Last[kNeighbor0PP];
+        f2LastInterp = f2Last[kNeighbor0PP];
+        f3LastInterp = f3Last[kNeighbor0PP];
+        f4LastInterp = f4Last[kNeighbor0PP];
+        f5LastInterp = f5Last[kNeighbor0PP];
+        f6LastInterp = f6Last[kNeighbor0PP];
+        f7LastInterp = f7Last[kNeighbor0PP];
+        f8LastInterp = f8Last[kNeighbor0PP];
+
+        f0NextInterp = f0Next[kNeighbor0PP];
+        f1NextInterp = f1Next[kNeighbor0PP];
+        f2NextInterp = f2Next[kNeighbor0PP];
+        f3NextInterp = f3Next[kNeighbor0PP];
+        f4NextInterp = f4Next[kNeighbor0PP];
+        f5NextInterp = f5Next[kNeighbor0PP];
+        f6NextInterp = f6Next[kNeighbor0PP];
+        f7NextInterp = f7Next[kNeighbor0PP];
+        f8NextInterp = f8Next[kNeighbor0PP];
+    }
+    //////////////////////////////////////////////////////////////////////////
+    //! - Read distributions: style of reading and writing the distributions from/to stored arrays dependent on timestep
+    //! is based on the esoteric twist algorithm \ref <a href="https://doi.org/10.3390/computation5020019"><b>[ M. Geier
+    //! et al. (2017), DOI:10.3390/computation5020019 ]</b></a>
+    //!
+    Distributions27 dist;
+    getPointersToDistributions(dist, distributions, numberOfLBnodes, !isEvenTimestep);
+
+    unsigned int KQK  = subgridDistanceIndices[nodeIndex];
+    // unsigned int k000= KQK;
+    unsigned int kP00   = KQK;
+    // unsigned int kM00   = neighborX[KQK];
+    // unsigned int k0P0   = KQK;
+    unsigned int k0M0   = neighborY[KQK];
+    // unsigned int k00P   = KQK;
+    unsigned int k00M   = neighborZ[KQK];
+    // unsigned int kMM0  = neighborY[kM00];
+    unsigned int kPP0  = KQK;
+    unsigned int kPM0  = k0M0;
+    // unsigned int kMP0  = kM00;
+    // unsigned int kM0M  = neighborZ[kM00];
+    unsigned int kP0P  = KQK;
+    unsigned int kP0M  = k00M;
+    // unsigned int kM0P  = kM00;
+    unsigned int k0MM  = neighborZ[k0M0];
+    // unsigned int k0PM  = k00M;
+    // unsigned int k0MP  = k0M0;
+    unsigned int kPMP = k0M0;
+    // unsigned int kMPM = kM0M;
+    // unsigned int kMPP = kM00;
+    unsigned int kPMM = k0MM;
+    // unsigned int kMMP = kMM0;
+    unsigned int kPPM = k00M;
+    unsigned int kPPP = KQK;
+    // unsigned int kMMM = neighborZ[kMM0];
+    SubgridDistances27 qs;
+    getPointersToSubgridDistances(qs, subgridDistances, sizeQ);
+
+    real q;
+    q = qs.q[DIR_P00][nodeIndex]; if(q>= c0o1 && q <= c1o1) dist.f[DIR_P00][kP00] = f0LastInterp*(1.f-timeRatio) + f0NextInterp*timeRatio;
+    q = qs.q[DIR_PP0][nodeIndex]; if(q>= c0o1 && q <= c1o1) dist.f[DIR_PP0][kPP0] = f1LastInterp*(1.f-timeRatio) + f1NextInterp*timeRatio;
+    q = qs.q[DIR_PM0][nodeIndex]; if(q>= c0o1 && q <= c1o1) dist.f[DIR_PM0][kPM0] = f2LastInterp*(1.f-timeRatio) + f2NextInterp*timeRatio;
+    q = qs.q[DIR_P0P][nodeIndex]; if(q>= c0o1 && q <= c1o1) dist.f[DIR_P0P][kP0P] = f3LastInterp*(1.f-timeRatio) + f3NextInterp*timeRatio;
+    q = qs.q[DIR_P0M][nodeIndex]; if(q>= c0o1 && q <= c1o1) dist.f[DIR_P0M][kP0M] = f4LastInterp*(1.f-timeRatio) + f4NextInterp*timeRatio;
+    q = qs.q[DIR_PPP][nodeIndex]; if(q>= c0o1 && q <= c1o1) dist.f[DIR_PPP][kPPP] = f5LastInterp*(1.f-timeRatio) + f5NextInterp*timeRatio;
+    q = qs.q[DIR_PMP][nodeIndex]; if(q>= c0o1 && q <= c1o1) dist.f[DIR_PMP][kPMP] = f6LastInterp*(1.f-timeRatio) + f6NextInterp*timeRatio;
+    q = qs.q[DIR_PPM][nodeIndex]; if(q>= c0o1 && q <= c1o1) dist.f[DIR_PPM][kPPM] = f7LastInterp*(1.f-timeRatio) + f7NextInterp*timeRatio;
+    q = qs.q[DIR_PMM][nodeIndex]; if(q>= c0o1 && q <= c1o1) dist.f[DIR_PMM][kPMM] = f8LastInterp*(1.f-timeRatio) + f8NextInterp*timeRatio;
+
+}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/src/gpu/VirtualFluids_GPU/GPU/PressBCs27.cu b/src/gpu/VirtualFluids_GPU/GPU/PressBCs27.cu
index ccb2ce79c63515e59e4f9ae75016f44ced71a170..02cfd2bce3723162b645cef568c87ca3b1dd2720 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/PressBCs27.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/PressBCs27.cu
@@ -1,29 +1,63 @@
-/* Device code */
-#include "LBM/LB.h" 
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file PressBCs27.cu
+//! \ingroup GPU
+//! \author Martin Schoenherr, Anna Wellmann
+//======================================================================================
+#include "LBM/LB.h"
 #include "lbm/constants/D3Q27.h"
 #include "lbm/constants/NumericConstants.h"
-#include "KernelUtilities.h"
+#include "lbm/MacroscopicQuantities.h"
+#include "LBM/GPUHelperFunctions/KernelUtilities.h"
 
 using namespace vf::lbm::constant;
 using namespace vf::lbm::dir;
+using namespace vf::gpu;
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-__global__ void QInflowScaleByPressDevice27(  real* rhoBC,
-														 real* DD, 
-														 int* k_Q, 
-														 int* k_N, 
-														 int numberOfBCnodes, 
-														 real om1, 
-														 unsigned int* neighborX,
-														 unsigned int* neighborY,
-														 unsigned int* neighborZ,
-														 unsigned int size_Mat, 
-														 bool isEvenTimestep)
+__global__ void QInflowScaleByPressDevice27(
+    real* rhoBC,
+    real* DD,
+    int* k_Q,
+    int* k_N,
+    int numberOfBCnodes,
+    real om1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
 {
    ////////////////////////////////////////////////////////////////////////////////
-   const unsigned  x = threadIdx.x;  // Globaler x-Index 
-   const unsigned  y = blockIdx.x;   // Globaler y-Index 
-   const unsigned  z = blockIdx.y;   // Globaler z-Index 
+   const unsigned  x = threadIdx.x;  // Globaler x-Index
+   const unsigned  y = blockIdx.x;   // Globaler y-Index
+   const unsigned  z = blockIdx.y;   // Globaler z-Index
 
    const unsigned nx = blockDim.x;
    const unsigned ny = gridDim.x;
@@ -97,141 +131,141 @@ __global__ void QInflowScaleByPressDevice27(  real* rhoBC,
       Distributions27 D;
       if (isEvenTimestep==true)
       {
-         D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
-      } 
+         D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
+      }
       else
       {
-         D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+         D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
       }
       //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-      real f1_E    = (D.f[DIR_P00   ])[k1e   ];
-      real f1_W    = (D.f[DIR_M00   ])[k1w   ];
-      real f1_N    = (D.f[DIR_0P0   ])[k1n   ];
-      real f1_S    = (D.f[DIR_0M0   ])[k1s   ];
-      real f1_T    = (D.f[DIR_00P   ])[k1t   ];
-      real f1_B    = (D.f[DIR_00M   ])[k1b   ];
-      real f1_NE   = (D.f[DIR_PP0  ])[k1ne  ];
-      real f1_SW   = (D.f[DIR_MM0  ])[k1sw  ];
-      real f1_SE   = (D.f[DIR_PM0  ])[k1se  ];
-      real f1_NW   = (D.f[DIR_MP0  ])[k1nw  ];
-      real f1_TE   = (D.f[DIR_P0P  ])[k1te  ];
-      real f1_BW   = (D.f[DIR_M0M  ])[k1bw  ];
-      real f1_BE   = (D.f[DIR_P0M  ])[k1be  ];
-      real f1_TW   = (D.f[DIR_M0P  ])[k1tw  ];
-      real f1_TN   = (D.f[DIR_0PP  ])[k1tn  ];
-      real f1_BS   = (D.f[DIR_0MM  ])[k1bs  ];
-      real f1_BN   = (D.f[DIR_0PM  ])[k1bn  ];
-      real f1_TS   = (D.f[DIR_0MP  ])[k1ts  ];
+      real f1_E    = (D.f[DIR_P00])[k1e   ];
+      real f1_W    = (D.f[DIR_M00])[k1w   ];
+      real f1_N    = (D.f[DIR_0P0])[k1n   ];
+      real f1_S    = (D.f[DIR_0M0])[k1s   ];
+      real f1_T    = (D.f[DIR_00P])[k1t   ];
+      real f1_B    = (D.f[DIR_00M])[k1b   ];
+      real f1_NE   = (D.f[DIR_PP0])[k1ne  ];
+      real f1_SW   = (D.f[DIR_MM0])[k1sw  ];
+      real f1_SE   = (D.f[DIR_PM0])[k1se  ];
+      real f1_NW   = (D.f[DIR_MP0])[k1nw  ];
+      real f1_TE   = (D.f[DIR_P0P])[k1te  ];
+      real f1_BW   = (D.f[DIR_M0M])[k1bw  ];
+      real f1_BE   = (D.f[DIR_P0M])[k1be  ];
+      real f1_TW   = (D.f[DIR_M0P])[k1tw  ];
+      real f1_TN   = (D.f[DIR_0PP])[k1tn  ];
+      real f1_BS   = (D.f[DIR_0MM])[k1bs  ];
+      real f1_BN   = (D.f[DIR_0PM])[k1bn  ];
+      real f1_TS   = (D.f[DIR_0MP])[k1ts  ];
       //real f1_ZERO = (D.f[DIR_000])[k1zero];
-      real f1_TNE  = (D.f[DIR_PPP ])[k1tne ];
-      real f1_TSW  = (D.f[DIR_MMP ])[k1tsw ];
-      real f1_TSE  = (D.f[DIR_PMP ])[k1tse ];
-      real f1_TNW  = (D.f[DIR_MPP ])[k1tnw ];
-      real f1_BNE  = (D.f[DIR_PPM ])[k1bne ];
-      real f1_BSW  = (D.f[DIR_MMM ])[k1bsw ];
-      real f1_BSE  = (D.f[DIR_PMM ])[k1bse ];
-      real f1_BNW  = (D.f[DIR_MPM ])[k1bnw ];
+      real f1_TNE  = (D.f[DIR_PPP])[k1tne ];
+      real f1_TSW  = (D.f[DIR_MMP])[k1tsw ];
+      real f1_TSE  = (D.f[DIR_PMP])[k1tse ];
+      real f1_TNW  = (D.f[DIR_MPP])[k1tnw ];
+      real f1_BNE  = (D.f[DIR_PPM])[k1bne ];
+      real f1_BSW  = (D.f[DIR_MMM])[k1bsw ];
+      real f1_BSE  = (D.f[DIR_PMM])[k1bse ];
+      real f1_BNW  = (D.f[DIR_MPM])[k1bnw ];
       //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-      real f_E    = (D.f[DIR_P00   ])[ke   ];
-      real f_W    = (D.f[DIR_M00   ])[kw   ];
-      real f_N    = (D.f[DIR_0P0   ])[kn   ];
-      real f_S    = (D.f[DIR_0M0   ])[ks   ];
-      real f_T    = (D.f[DIR_00P   ])[kt   ];
-      real f_B    = (D.f[DIR_00M   ])[kb   ];
-      real f_NE   = (D.f[DIR_PP0  ])[kne  ];
-      real f_SW   = (D.f[DIR_MM0  ])[ksw  ];
-      real f_SE   = (D.f[DIR_PM0  ])[kse  ];
-      real f_NW   = (D.f[DIR_MP0  ])[knw  ];
-      real f_TE   = (D.f[DIR_P0P  ])[kte  ];
-      real f_BW   = (D.f[DIR_M0M  ])[kbw  ];
-      real f_BE   = (D.f[DIR_P0M  ])[kbe  ];
-      real f_TW   = (D.f[DIR_M0P  ])[ktw  ];
-      real f_TN   = (D.f[DIR_0PP  ])[ktn  ];
-      real f_BS   = (D.f[DIR_0MM  ])[kbs  ];
-      real f_BN   = (D.f[DIR_0PM  ])[kbn  ];
-      real f_TS   = (D.f[DIR_0MP  ])[kts  ];
+      real f_E    = (D.f[DIR_P00])[ke   ];
+      real f_W    = (D.f[DIR_M00])[kw   ];
+      real f_N    = (D.f[DIR_0P0])[kn   ];
+      real f_S    = (D.f[DIR_0M0])[ks   ];
+      real f_T    = (D.f[DIR_00P])[kt   ];
+      real f_B    = (D.f[DIR_00M])[kb   ];
+      real f_NE   = (D.f[DIR_PP0])[kne  ];
+      real f_SW   = (D.f[DIR_MM0])[ksw  ];
+      real f_SE   = (D.f[DIR_PM0])[kse  ];
+      real f_NW   = (D.f[DIR_MP0])[knw  ];
+      real f_TE   = (D.f[DIR_P0P])[kte  ];
+      real f_BW   = (D.f[DIR_M0M])[kbw  ];
+      real f_BE   = (D.f[DIR_P0M])[kbe  ];
+      real f_TW   = (D.f[DIR_M0P])[ktw  ];
+      real f_TN   = (D.f[DIR_0PP])[ktn  ];
+      real f_BS   = (D.f[DIR_0MM])[kbs  ];
+      real f_BN   = (D.f[DIR_0PM])[kbn  ];
+      real f_TS   = (D.f[DIR_0MP])[kts  ];
       //real f_ZERO = (D.f[DIR_000])[kzero];
-      real f_TNE  = (D.f[DIR_PPP ])[ktne ];
-      real f_TSW  = (D.f[DIR_MMP ])[ktsw ];
-      real f_TSE  = (D.f[DIR_PMP ])[ktse ];
-      real f_TNW  = (D.f[DIR_MPP ])[ktnw ];
-      real f_BNE  = (D.f[DIR_PPM ])[kbne ];
-      real f_BSW  = (D.f[DIR_MMM ])[kbsw ];
-      real f_BSE  = (D.f[DIR_PMM ])[kbse ];
-      real f_BNW  = (D.f[DIR_MPM ])[kbnw ];
+      real f_TNE  = (D.f[DIR_PPP])[ktne ];
+      real f_TSW  = (D.f[DIR_MMP])[ktsw ];
+      real f_TSE  = (D.f[DIR_PMP])[ktse ];
+      real f_TNW  = (D.f[DIR_MPP])[ktnw ];
+      real f_BNE  = (D.f[DIR_PPM])[kbne ];
+      real f_BSW  = (D.f[DIR_MMM])[kbsw ];
+      real f_BSE  = (D.f[DIR_PMM])[kbse ];
+      real f_BNW  = (D.f[DIR_MPM])[kbnw ];
       //////////////////////////////////////////////////////////////////////////
       // real vx1, vx2, vx3;
       real drho, drho1;
       //////////////////////////////////////////////////////////////////////////
-	  //Dichte
+     // density
       drho1  =  f1_TSE + f1_TNW + f1_TNE + f1_TSW + f1_BSE + f1_BNW + f1_BNE + f1_BSW +
-                f1_BN + f1_TS + f1_TN + f1_BS + f1_BE + f1_TW + f1_TE + f1_BW + f1_SE + f1_NW + f1_NE + f1_SW + 
-                f1_T + f1_B + f1_N + f1_S + f1_E + f1_W + ((D.f[DIR_000])[k1zero]); 
+                f1_BN + f1_TS + f1_TN + f1_BS + f1_BE + f1_TW + f1_TE + f1_BW + f1_SE + f1_NW + f1_NE + f1_SW +
+                f1_T + f1_B + f1_N + f1_S + f1_E + f1_W + ((D.f[DIR_000])[k1zero]);
       drho   =  f_TSE + f_TNW + f_TNE + f_TSW + f_BSE + f_BNW + f_BNE + f_BSW +
-                f_BN + f_TS + f_TN + f_BS + f_BE + f_TW + f_TE + f_BW + f_SE + f_NW + f_NE + f_SW + 
-                f_T + f_B + f_N + f_S + f_E + f_W + ((D.f[DIR_000])[kzero]); 
+                f_BN + f_TS + f_TN + f_BS + f_BE + f_TW + f_TE + f_BW + f_SE + f_NW + f_NE + f_SW +
+                f_T + f_B + f_N + f_S + f_E + f_W + ((D.f[DIR_000])[kzero]);
       //////////////////////////////////////////////////////////////////////////
-	  //Schallgeschwindigkeit
-	  real cs = c1o1 / sqrtf(c3o1);
+     // speed of sound
+     real cs = c1o1 / sqrtf(c3o1);
       //////////////////////////////////////////////////////////////////////////
-	  real rhoInterpol = drho1 * cs + (c1o1 - cs) * drho; 
-	  //real diffRho = (rhoBC[k] + one) / (rhoInterpol + one);
-	  real diffRhoToAdd = rhoBC[k] - rhoInterpol;
-	  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	  //no velocity
-	  //////////////////////////////////////////
+     real rhoInterpol = drho1 * cs + (c1o1 - cs) * drho;
+     //real diffRho = (rhoBC[k] + one) / (rhoInterpol + one);
+     real diffRhoToAdd = rhoBC[k] - rhoInterpol;
+     /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+     //no velocity
+     //////////////////////////////////////////
       f_E    = f1_E   * cs + (c1o1 - cs) * f_E   ;
       f_W    = f1_W   * cs + (c1o1 - cs) * f_W   ;
       f_N    = f1_N   * cs + (c1o1 - cs) * f_N   ;
@@ -258,16 +292,16 @@ __global__ void QInflowScaleByPressDevice27(  real* rhoBC,
       f_BSW  = f1_BSW * cs + (c1o1 - cs) * f_BSW ;
       f_BSE  = f1_BSE * cs + (c1o1 - cs) * f_BSE ;
       f_BNW  = f1_BNW * cs + (c1o1 - cs) * f_BNW ;
-	  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	  //scale by press
-	  //////////////////////////////////////////
-	  //f_E    = (f_E   + c2over27 ) * diffRho - c2over27 ;
+     /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+     //scale by press
+     //////////////////////////////////////////
+     //f_E    = (f_E   + c2over27 ) * diffRho - c2over27 ;
    //   f_W    = (f_W   + c2over27 ) * diffRho - c2over27 ;
    //   f_N    = (f_N   + c2over27 ) * diffRho - c2over27 ;
    //   f_S    = (f_S   + c2over27 ) * diffRho - c2over27 ;
    //   f_T    = (f_T   + c2over27 ) * diffRho - c2over27 ;
    //   f_B    = (f_B   + c2over27 ) * diffRho - c2over27 ;
-	  //f_NE   = (f_NE  + c1over54 ) * diffRho - c1over54 ;
+     //f_NE   = (f_NE  + c1over54 ) * diffRho - c1over54 ;
    //   f_SW   = (f_SW  + c1over54 ) * diffRho - c1over54 ;
    //   f_SE   = (f_SE  + c1over54 ) * diffRho - c1over54 ;
    //   f_NW   = (f_NW  + c1over54 ) * diffRho - c1over54 ;
@@ -287,16 +321,16 @@ __global__ void QInflowScaleByPressDevice27(  real* rhoBC,
    //   f_BSW  = (f_BSW + c1over216) * diffRho - c1over216;
    //   f_BSE  = (f_BSE + c1over216) * diffRho - c1over216;
    //   f_BNW  = (f_BNW + c1over216) * diffRho - c1over216;
-	  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	  // add press
-	  //////////////////////////////////////////
-	  f_E    = (f_E   + c2o27  * diffRhoToAdd);
+     /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+     // add press
+     //////////////////////////////////////////
+     f_E    = (f_E   + c2o27  * diffRhoToAdd);
       f_W    = (f_W   + c2o27  * diffRhoToAdd);
       f_N    = (f_N   + c2o27  * diffRhoToAdd);
       f_S    = (f_S   + c2o27  * diffRhoToAdd);
       f_T    = (f_T   + c2o27  * diffRhoToAdd);
       f_B    = (f_B   + c2o27  * diffRhoToAdd);
-	  f_NE   = (f_NE  + c1o54  * diffRhoToAdd);
+     f_NE   = (f_NE  + c1o54  * diffRhoToAdd);
       f_SW   = (f_SW  + c1o54  * diffRhoToAdd);
       f_SE   = (f_SE  + c1o54  * diffRhoToAdd);
       f_NW   = (f_NW  + c1o54  * diffRhoToAdd);
@@ -316,111 +350,111 @@ __global__ void QInflowScaleByPressDevice27(  real* rhoBC,
       f_BSW  = (f_BSW + c1o216 * diffRhoToAdd);
       f_BSE  = (f_BSE + c1o216 * diffRhoToAdd);
       f_BNW  = (f_BNW + c1o216 * diffRhoToAdd);
-	  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+     /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-	  //////////////////////////////////////////////////////////////////////////
+     //////////////////////////////////////////////////////////////////////////
       if (isEvenTimestep==false)
       {
-         D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
-      } 
+         D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
+      }
       else
       {
-         D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+         D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
       }
       //////////////////////////////////////////////////////////////////////////
       //__syncthreads();
-	  // -X
-	  //(D.f[DIR_P00   ])[ke   ] = f_E   ;
-	  //(D.f[DIR_PM0  ])[kse  ] = f_SE  ;
-	  //(D.f[DIR_PP0  ])[kne  ] = f_NE  ;
-	  //(D.f[DIR_P0M  ])[kbe  ] = f_BE  ;
-	  //(D.f[DIR_P0P  ])[kte  ] = f_TE  ;
-	  //(D.f[DIR_PMP ])[ktse ] = f_TSE ;
-	  //(D.f[DIR_PPP ])[ktne ] = f_TNE ;
-	  //(D.f[DIR_PMM ])[kbse ] = f_BSE ;
-	  //(D.f[DIR_PPM ])[kbne ] = f_BNE ;     
-	  // X
-	  (D.f[DIR_M00   ])[kw   ] = f_W   ;
-	  (D.f[DIR_MM0  ])[ksw  ] = f_SW  ;
-	  (D.f[DIR_MP0  ])[knw  ] = f_NW  ;
-	  (D.f[DIR_M0M  ])[kbw  ] = f_BW  ;
-	  (D.f[DIR_M0P  ])[ktw  ] = f_TW  ;
-	  (D.f[DIR_MMP ])[ktsw ] = f_TSW ;
-	  (D.f[DIR_MPP ])[ktnw ] = f_TNW ;
-	  (D.f[DIR_MMM ])[kbsw ] = f_BSW ;
-	  (D.f[DIR_MPM ])[kbnw ] = f_BNW ;     
-	  // Y
-	  //(D.f[DIR_0M0   ])[ks   ] = f_S   ;
-	  //(D.f[DIR_PM0  ])[kse  ] = f_SE  ;
-	  //(D.f[DIR_MM0  ])[ksw  ] = f_SW  ;
-	  //(D.f[DIR_0MP  ])[kts  ] = f_TS  ;
-	  //(D.f[DIR_0MM  ])[kbs  ] = f_BS  ;
-	  //(D.f[DIR_PMP ])[ktse ] = f_TSE ;
-	  //(D.f[DIR_MMP ])[ktsw ] = f_TSW ;
-	  //(D.f[DIR_PMM ])[kbse ] = f_BSE ;
-	  //(D.f[DIR_MMM ])[kbsw ] = f_BSW ;     
-	  // Z
-	  //(D.f[DIR_00M   ])[kb   ] = f_B   ;
-	  //(D.f[DIR_P0M  ])[kbe  ] = f_BE  ;
-	  //(D.f[DIR_M0M  ])[kbw  ] = f_BW  ;
-	  //(D.f[DIR_0PM  ])[kbn  ] = f_BN  ;
-	  //(D.f[DIR_0MM  ])[kbs  ] = f_BS  ;
-	  //(D.f[DIR_PPM ])[kbne ] = f_BNE ;
-	  //(D.f[DIR_MPM ])[kbnw ] = f_BNW ;
-	  //(D.f[DIR_PMM ])[kbse ] = f_BSE ;
-	  //(D.f[DIR_MMM ])[kbsw ] = f_BSW ;     
+     // -X
+     //(D.f[DIR_P00])[ke   ] = f_E   ;
+     //(D.f[DIR_PM0])[kse  ] = f_SE  ;
+     //(D.f[DIR_PP0])[kne  ] = f_NE  ;
+     //(D.f[DIR_P0M])[kbe  ] = f_BE  ;
+     //(D.f[DIR_P0P])[kte  ] = f_TE  ;
+     //(D.f[DIR_PMP])[ktse ] = f_TSE ;
+     //(D.f[DIR_PPP])[ktne ] = f_TNE ;
+     //(D.f[DIR_PMM])[kbse ] = f_BSE ;
+     //(D.f[DIR_PPM])[kbne ] = f_BNE ;
+     // X
+     (D.f[DIR_M00])[kw   ] = f_W   ;
+     (D.f[DIR_MM0])[ksw  ] = f_SW  ;
+     (D.f[DIR_MP0])[knw  ] = f_NW  ;
+     (D.f[DIR_M0M])[kbw  ] = f_BW  ;
+     (D.f[DIR_M0P])[ktw  ] = f_TW  ;
+     (D.f[DIR_MMP])[ktsw ] = f_TSW ;
+     (D.f[DIR_MPP])[ktnw ] = f_TNW ;
+     (D.f[DIR_MMM])[kbsw ] = f_BSW ;
+     (D.f[DIR_MPM])[kbnw ] = f_BNW ;
+     // Y
+     //(D.f[DIR_0M0])[ks   ] = f_S   ;
+     //(D.f[DIR_PM0])[kse  ] = f_SE  ;
+     //(D.f[DIR_MM0])[ksw  ] = f_SW  ;
+     //(D.f[DIR_0MP])[kts  ] = f_TS  ;
+     //(D.f[DIR_0MM])[kbs  ] = f_BS  ;
+     //(D.f[DIR_PMP])[ktse ] = f_TSE ;
+     //(D.f[DIR_MMP])[ktsw ] = f_TSW ;
+     //(D.f[DIR_PMM])[kbse ] = f_BSE ;
+     //(D.f[DIR_MMM])[kbsw ] = f_BSW ;
+     // Z
+     //(D.f[DIR_00M])[kb   ] = f_B   ;
+     //(D.f[DIR_P0M])[kbe  ] = f_BE  ;
+     //(D.f[DIR_M0M])[kbw  ] = f_BW  ;
+     //(D.f[DIR_0PM])[kbn  ] = f_BN  ;
+     //(D.f[DIR_0MM])[kbs  ] = f_BS  ;
+     //(D.f[DIR_PPM])[kbne ] = f_BNE ;
+     //(D.f[DIR_MPM])[kbnw ] = f_BNW ;
+     //(D.f[DIR_PMM])[kbse ] = f_BSE ;
+     //(D.f[DIR_MMM])[kbsw ] = f_BSW ;
       //////////////////////////////////////////////////////////////////////////
    }
 }
@@ -465,22 +499,23 @@ __global__ void QInflowScaleByPressDevice27(  real* rhoBC,
 
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-__global__ void QPressDeviceIncompNEQ27( real* rhoBC,
-													real* DD, 
-													int* k_Q, 
-													int* k_N, 
-													int numberOfBCnodes, 
-													real om1, 
-													unsigned int* neighborX,
-													unsigned int* neighborY,
-													unsigned int* neighborZ,
-													unsigned int size_Mat, 
-													bool isEvenTimestep)
+__global__ void QPressDeviceIncompNEQ27(
+    real* rhoBC,
+    real* DD,
+    int* k_Q,
+    int* k_N,
+    int numberOfBCnodes,
+    real om1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
 {
    ////////////////////////////////////////////////////////////////////////////////
-   const unsigned  x = threadIdx.x;  // Globaler x-Index 
-   const unsigned  y = blockIdx.x;   // Globaler y-Index 
-   const unsigned  z = blockIdx.y;   // Globaler z-Index 
+   const unsigned  x = threadIdx.x;  // global x-index
+   const unsigned  y = blockIdx.x;   // global y-index
+   const unsigned  z = blockIdx.y;   // global z-index
 
    const unsigned nx = blockDim.x;
    const unsigned ny = gridDim.x;
@@ -554,112 +589,112 @@ __global__ void QPressDeviceIncompNEQ27( real* rhoBC,
       Distributions27 D;
       if (isEvenTimestep==true) //// ACHTUNG PREColl !!!!!!!!!!!!!!
       {
-         D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
-      } 
+         D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
+      }
       else
       {
-         D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+         D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
       }
       //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       real        f1_E,f1_W,f1_N,f1_S,f1_T,f1_B,f1_NE,f1_SW,f1_SE,f1_NW,f1_TE,f1_BW,f1_BE,f1_TW,f1_TN,f1_BS,f1_BN,f1_TS,f1_ZERO,
                      f1_TNE,f1_TSW,f1_TSE,f1_TNW,f1_BNE,f1_BSW,f1_BSE,f1_BNW;
 
-      f1_W    = (D.f[DIR_P00   ])[k1e   ];
-      f1_E    = (D.f[DIR_M00   ])[k1w   ];
-      f1_S    = (D.f[DIR_0P0   ])[k1n   ];
-      f1_N    = (D.f[DIR_0M0   ])[k1s   ];
-      f1_B    = (D.f[DIR_00P   ])[k1t   ];
-      f1_T    = (D.f[DIR_00M   ])[k1b   ];
-      f1_SW   = (D.f[DIR_PP0  ])[k1ne  ];
-      f1_NE   = (D.f[DIR_MM0  ])[k1sw  ];
-      f1_NW   = (D.f[DIR_PM0  ])[k1se  ];
-      f1_SE   = (D.f[DIR_MP0  ])[k1nw  ];
-      f1_BW   = (D.f[DIR_P0P  ])[k1te  ];
-      f1_TE   = (D.f[DIR_M0M  ])[k1bw  ];
-      f1_TW   = (D.f[DIR_P0M  ])[k1be  ];
-      f1_BE   = (D.f[DIR_M0P  ])[k1tw  ];
-      f1_BS   = (D.f[DIR_0PP  ])[k1tn  ];
-      f1_TN   = (D.f[DIR_0MM  ])[k1bs  ];
-      f1_TS   = (D.f[DIR_0PM  ])[k1bn  ];
-      f1_BN   = (D.f[DIR_0MP  ])[k1ts  ];
+      f1_W    = (D.f[DIR_P00])[k1e   ];
+      f1_E    = (D.f[DIR_M00])[k1w   ];
+      f1_S    = (D.f[DIR_0P0])[k1n   ];
+      f1_N    = (D.f[DIR_0M0])[k1s   ];
+      f1_B    = (D.f[DIR_00P])[k1t   ];
+      f1_T    = (D.f[DIR_00M])[k1b   ];
+      f1_SW   = (D.f[DIR_PP0])[k1ne  ];
+      f1_NE   = (D.f[DIR_MM0])[k1sw  ];
+      f1_NW   = (D.f[DIR_PM0])[k1se  ];
+      f1_SE   = (D.f[DIR_MP0])[k1nw  ];
+      f1_BW   = (D.f[DIR_P0P])[k1te  ];
+      f1_TE   = (D.f[DIR_M0M])[k1bw  ];
+      f1_TW   = (D.f[DIR_P0M])[k1be  ];
+      f1_BE   = (D.f[DIR_M0P])[k1tw  ];
+      f1_BS   = (D.f[DIR_0PP])[k1tn  ];
+      f1_TN   = (D.f[DIR_0MM])[k1bs  ];
+      f1_TS   = (D.f[DIR_0PM])[k1bn  ];
+      f1_BN   = (D.f[DIR_0MP])[k1ts  ];
       f1_ZERO = (D.f[DIR_000])[k1zero];
-      f1_BSW  = (D.f[DIR_PPP ])[k1tne ];
-      f1_BNE  = (D.f[DIR_MMP ])[k1tsw ];
-      f1_BNW  = (D.f[DIR_PMP ])[k1tse ];
-      f1_BSE  = (D.f[DIR_MPP ])[k1tnw ];
-      f1_TSW  = (D.f[DIR_PPM ])[k1bne ];
-      f1_TNE  = (D.f[DIR_MMM ])[k1bsw ];
-      f1_TNW  = (D.f[DIR_PMM ])[k1bse ];
-      f1_TSE  = (D.f[DIR_MPM ])[k1bnw ];
+      f1_BSW  = (D.f[DIR_PPP])[k1tne ];
+      f1_BNE  = (D.f[DIR_MMP])[k1tsw ];
+      f1_BNW  = (D.f[DIR_PMP])[k1tse ];
+      f1_BSE  = (D.f[DIR_MPP])[k1tnw ];
+      f1_TSW  = (D.f[DIR_PPM])[k1bne ];
+      f1_TNE  = (D.f[DIR_MMM])[k1bsw ];
+      f1_TNW  = (D.f[DIR_PMM])[k1bse ];
+      f1_TSE  = (D.f[DIR_MPM])[k1bnw ];
 
       //////////////////////////////////////////////////////////////////////////
       real drho1    =  f1_ZERO+f1_E+f1_W+f1_N+f1_S+f1_T+f1_B+f1_NE+f1_SW+f1_SE+f1_NW+f1_TE+f1_BW+f1_BE+f1_TW+f1_TN+f1_BS+f1_BN+f1_TS+
                           f1_TNE+f1_TSW+f1_TSE+f1_TNW+f1_BNE+f1_BSW+f1_BSE+f1_BNW;
 
       real vx1      =  ((f1_TSE - f1_BNW) - (f1_TNW - f1_BSE)) + ((f1_TNE - f1_BSW) - (f1_TSW - f1_BNE)) +
-						  ((f1_BE - f1_TW)   + (f1_TE - f1_BW))   + ((f1_SE - f1_NW)   + (f1_NE - f1_SW)) +
-						  (f1_E - f1_W); 
+                    ((f1_BE - f1_TW)   + (f1_TE - f1_BW))   + ((f1_SE - f1_NW)   + (f1_NE - f1_SW)) +
+                    (f1_E - f1_W);
 
 
       real vx2    =   (-(f1_TSE - f1_BNW) + (f1_TNW - f1_BSE)) + ((f1_TNE - f1_BSW) - (f1_TSW - f1_BNE)) +
-						 ((f1_BN - f1_TS)   + (f1_TN - f1_BS))    + (-(f1_SE - f1_NW)  + (f1_NE - f1_SW)) +
-						 (f1_N - f1_S); 
+                   ((f1_BN - f1_TS)   + (f1_TN - f1_BS))    + (-(f1_SE - f1_NW)  + (f1_NE - f1_SW)) +
+                   (f1_N - f1_S);
 
       real vx3    =   ((f1_TSE - f1_BNW) + (f1_TNW - f1_BSE)) + ((f1_TNE - f1_BSW) + (f1_TSW - f1_BNE)) +
-						 (-(f1_BN - f1_TS)  + (f1_TN - f1_BS))   + ((f1_TE - f1_BW)   - (f1_BE - f1_TW)) +
-						 (f1_T - f1_B); 
+                   (-(f1_BN - f1_TS)  + (f1_TN - f1_BS))   + ((f1_TE - f1_BW)   - (f1_BE - f1_TW)) +
+                   (f1_T - f1_B);
 
       real cusq=c3o2*(vx1*vx1+vx2*vx2+vx3*vx3);
 
@@ -690,15 +725,15 @@ __global__ void QPressDeviceIncompNEQ27( real* rhoBC,
       f1_BNW   -=  c1o216*(drho1+(drho1+c1o1)*(c3o1*(-vx1+vx2-vx3)+c9o2*(-vx1+vx2-vx3)*(-vx1+vx2-vx3)-cusq));
       f1_BSE   -=  c1o216*(drho1+(drho1+c1o1)*(c3o1*( vx1-vx2-vx3)+c9o2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cusq));
       f1_TNW   -=  c1o216*(drho1+(drho1+c1o1)*(c3o1*(-vx1+vx2+vx3)+c9o2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cusq));
-	   
-	  drho1 = rhoBC[k];
 
-	  //if(vx1 < zero){
-		 // vx1 *= 0.9;
-	  //}
-	  //if(vx2 < zero){
-		 // vx2 *= c1o10;//0.9;
-	  //}
+     drho1 = rhoBC[k];
+
+     //if(vx1 < zero){
+       // vx1 *= 0.9;
+     //}
+     //if(vx2 < zero){
+       // vx2 *= c1o10;//0.9;
+     //}
 
       f1_ZERO  += c8o27*  (drho1-(drho1+c1o1)*cusq);
       f1_E     += c2o27*  (drho1+(drho1+c1o1)*(c3o1*( vx1        )+c9o2*( vx1        )*( vx1        )-cusq));
@@ -728,39 +763,39 @@ __global__ void QPressDeviceIncompNEQ27( real* rhoBC,
       f1_BSE   +=  c1o216*(drho1+(drho1+c1o1)*(c3o1*( vx1-vx2-vx3)+c9o2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cusq));
       f1_TNW   +=  c1o216*(drho1+(drho1+c1o1)*(c3o1*(-vx1+vx2+vx3)+c9o2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cusq));
 
-	  //drho1 = (drho1 + rhoBC[k])/2.f;
-	  //drho1 = drho1 - rhoBC[k];
+     //drho1 = (drho1 + rhoBC[k])/2.f;
+     //drho1 = drho1 - rhoBC[k];
       //////////////////////////////////////////////////////////////////////////
 
       __syncthreads();
 
-      (D.f[DIR_P00   ])[ke   ] = f1_W   ;  
-      (D.f[DIR_M00   ])[kw   ] = f1_E   ;	
-      (D.f[DIR_0P0   ])[kn   ] = f1_S   ;	
-      (D.f[DIR_0M0   ])[ks   ] = f1_N   ;	
-      (D.f[DIR_00P   ])[kt   ] = f1_B   ;	
-      (D.f[DIR_00M   ])[kb   ] = f1_T   ;	
-      (D.f[DIR_PP0  ])[kne  ] = f1_SW  ;	
-      (D.f[DIR_MM0  ])[ksw  ] = f1_NE  ;	
-      (D.f[DIR_PM0  ])[kse  ] = f1_NW  ;	
-      (D.f[DIR_MP0  ])[knw  ] = f1_SE  ;	
-      (D.f[DIR_P0P  ])[kte  ] = f1_BW  ;	
-      (D.f[DIR_M0M  ])[kbw  ] = f1_TE  ;	
-      (D.f[DIR_P0M  ])[kbe  ] = f1_TW  ;	
-      (D.f[DIR_M0P  ])[ktw  ] = f1_BE  ;	
-      (D.f[DIR_0PP  ])[ktn  ] = f1_BS  ;	
-      (D.f[DIR_0MM  ])[kbs  ] = f1_TN  ;	
-      (D.f[DIR_0PM  ])[kbn  ] = f1_TS  ;	
-      (D.f[DIR_0MP  ])[kts  ] = f1_BN  ;	
-      (D.f[DIR_000])[kzero] = f1_ZERO;	
-      (D.f[DIR_PPP ])[ktne ] = f1_BSW ;	
-      (D.f[DIR_MMP ])[ktsw ] = f1_BNE ;	
-      (D.f[DIR_PMP ])[ktse ] = f1_BNW ;	
-      (D.f[DIR_MPP ])[ktnw ] = f1_BSE ;	
-      (D.f[DIR_PPM ])[kbne ] = f1_TSW ;	
-      (D.f[DIR_MMM ])[kbsw ] = f1_TNE ;	
-      (D.f[DIR_PMM ])[kbse ] = f1_TNW ;	
-      (D.f[DIR_MPM ])[kbnw ] = f1_TSE ;       
+      (D.f[DIR_P00])[ke   ] = f1_W   ;
+      (D.f[DIR_M00])[kw   ] = f1_E   ;
+      (D.f[DIR_0P0])[kn   ] = f1_S   ;
+      (D.f[DIR_0M0])[ks   ] = f1_N   ;
+      (D.f[DIR_00P])[kt   ] = f1_B   ;
+      (D.f[DIR_00M])[kb   ] = f1_T   ;
+      (D.f[DIR_PP0])[kne  ] = f1_SW  ;
+      (D.f[DIR_MM0])[ksw  ] = f1_NE  ;
+      (D.f[DIR_PM0])[kse  ] = f1_NW  ;
+      (D.f[DIR_MP0])[knw  ] = f1_SE  ;
+      (D.f[DIR_P0P])[kte  ] = f1_BW  ;
+      (D.f[DIR_M0M])[kbw  ] = f1_TE  ;
+      (D.f[DIR_P0M])[kbe  ] = f1_TW  ;
+      (D.f[DIR_M0P])[ktw  ] = f1_BE  ;
+      (D.f[DIR_0PP])[ktn  ] = f1_BS  ;
+      (D.f[DIR_0MM])[kbs  ] = f1_TN  ;
+      (D.f[DIR_0PM])[kbn  ] = f1_TS  ;
+      (D.f[DIR_0MP])[kts  ] = f1_BN  ;
+      (D.f[DIR_000])[kzero] = f1_ZERO;
+      (D.f[DIR_PPP])[ktne ] = f1_BSW ;
+      (D.f[DIR_MMP])[ktsw ] = f1_BNE ;
+      (D.f[DIR_PMP])[ktse ] = f1_BNW ;
+      (D.f[DIR_MPP])[ktnw ] = f1_BSE ;
+      (D.f[DIR_PPM])[kbne ] = f1_TSW ;
+      (D.f[DIR_MMM])[kbsw ] = f1_TNE ;
+      (D.f[DIR_PMM])[kbse ] = f1_TNW ;
+      (D.f[DIR_MPM])[kbnw ] = f1_TSE ;
    }
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -804,54 +839,49 @@ __global__ void QPressDeviceIncompNEQ27( real* rhoBC,
 
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-__global__ void QPressDeviceNEQ27(real* rhoBC,
-                                             real* distribution, 
-                                             int* bcNodeIndices,
-                                             int* bcNeighborIndices,
-                                             int numberOfBCnodes,
-                                             real omega1, 
-                                             unsigned int* neighborX,
-                                             unsigned int* neighborY,
-                                             unsigned int* neighborZ,
-                                             unsigned int numberOfLBnodes, 
-                                             bool isEvenTimestep)
+__global__ void QPressDeviceNEQ27(
+    real* rhoBC,
+    real* distributions,
+    int* bcNodeIndices,
+    int* bcNeighborIndices,
+    int numberOfBCnodes,
+    real omega1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
 {
-   //////////////////////////////////////////////////////////////////////////
-	//! The pressure boundary condition is executed in the following steps
-	//!
-	////////////////////////////////////////////////////////////////////////////////
-	//! - Get node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
-	//!
-   const unsigned x = threadIdx.x;    // global x-index 
-   const unsigned y = blockIdx.x;     // global y-index 
-   const unsigned z = blockIdx.y;     // global z-index 
-
-   const unsigned nx = blockDim.x;
-   const unsigned ny = gridDim.x;
+   ////////////////////////////////////////////////////////////////////////////////
+   //! The pressure boundary condition is executed in the following steps
+   //!
 
-   const unsigned k = nx*(ny*z + y) + x;
+   ////////////////////////////////////////////////////////////////////////////////
+   //! - Get node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
+   //!
+   const unsigned nodeIndex = getNodeIndex();
 
-   //////////////////////////////////////////////////////////////////////////
+   ////////////////////////////////////////////////////////////////////////////////
    //! - Run for all indices in size of boundary condition (numberOfBCnodes)
    //!
-   if(k < numberOfBCnodes)
+   if(nodeIndex < numberOfBCnodes)
    {
       //////////////////////////////////////////////////////////////////////////
       //! - Read distributions: style of reading and writing the distributions from/to stored arrays dependent on timestep is based on the esoteric twist algorithm \ref
       //! <a href="https://doi.org/10.3390/computation5020019"><b>[ M. Geier et al. (2017), DOI:10.3390/computation5020019 ]</b></a>
       //!
       Distributions27 dist;
-      getPointersToDistributions(dist, distribution, numberOfLBnodes, isEvenTimestep);
+      getPointersToDistributions(dist, distributions, numberOfLBnodes, isEvenTimestep);
 
       ////////////////////////////////////////////////////////////////////////////////
       //! - Set local pressure
       //!
-      real rhoBClocal = rhoBC[k];
+      real rhoBClocal = rhoBC[nodeIndex];
 
       ////////////////////////////////////////////////////////////////////////////////
       //! - Set neighbor indices (necessary for indirect addressing)
       //!
-      unsigned int KQK  = bcNodeIndices[k];
+      unsigned int KQK  = bcNodeIndices[nodeIndex];
       unsigned int kzero= KQK;
       unsigned int ke   = KQK;
       unsigned int kw   = neighborX[KQK];
@@ -882,7 +912,7 @@ __global__ void QPressDeviceNEQ27(real* rhoBC,
       ////////////////////////////////////////////////////////////////////////////////
       //! - Set neighbor indices (necessary for indirect addressing) for neighboring node
       //!
-      unsigned int K1QK  = bcNeighborIndices[k];
+      unsigned int K1QK  = bcNeighborIndices[nodeIndex];
       unsigned int k1zero= K1QK;
       unsigned int k1e   = K1QK;
       unsigned int k1w   = neighborX[K1QK];
@@ -914,52 +944,52 @@ __global__ void QPressDeviceNEQ27(real* rhoBC,
       ////////////////////////////////////////////////////////////////////////////////
       //! - Set local distributions for neighboring node
       //!
-      real f1_W    = (dist.f[DIR_P00   ])[k1e   ];
-      real f1_E    = (dist.f[DIR_M00   ])[k1w   ];
-      real f1_S    = (dist.f[DIR_0P0   ])[k1n   ];
-      real f1_N    = (dist.f[DIR_0M0   ])[k1s   ];
-      real f1_B    = (dist.f[DIR_00P   ])[k1t   ];
-      real f1_T    = (dist.f[DIR_00M   ])[k1b   ];
-      real f1_SW   = (dist.f[DIR_PP0  ])[k1ne  ];
-      real f1_NE   = (dist.f[DIR_MM0  ])[k1sw  ];
-      real f1_NW   = (dist.f[DIR_PM0  ])[k1se  ];
-      real f1_SE   = (dist.f[DIR_MP0  ])[k1nw  ];
-      real f1_BW   = (dist.f[DIR_P0P  ])[k1te  ];
-      real f1_TE   = (dist.f[DIR_M0M  ])[k1bw  ];
-      real f1_TW   = (dist.f[DIR_P0M  ])[k1be  ];
-      real f1_BE   = (dist.f[DIR_M0P  ])[k1tw  ];
-      real f1_BS   = (dist.f[DIR_0PP  ])[k1tn  ];
-      real f1_TN   = (dist.f[DIR_0MM  ])[k1bs  ];
-      real f1_TS   = (dist.f[DIR_0PM  ])[k1bn  ];
-      real f1_BN   = (dist.f[DIR_0MP  ])[k1ts  ];
+      real f1_W    = (dist.f[DIR_P00])[k1e   ];
+      real f1_E    = (dist.f[DIR_M00])[k1w   ];
+      real f1_S    = (dist.f[DIR_0P0])[k1n   ];
+      real f1_N    = (dist.f[DIR_0M0])[k1s   ];
+      real f1_B    = (dist.f[DIR_00P])[k1t   ];
+      real f1_T    = (dist.f[DIR_00M])[k1b   ];
+      real f1_SW   = (dist.f[DIR_PP0])[k1ne  ];
+      real f1_NE   = (dist.f[DIR_MM0])[k1sw  ];
+      real f1_NW   = (dist.f[DIR_PM0])[k1se  ];
+      real f1_SE   = (dist.f[DIR_MP0])[k1nw  ];
+      real f1_BW   = (dist.f[DIR_P0P])[k1te  ];
+      real f1_TE   = (dist.f[DIR_M0M])[k1bw  ];
+      real f1_TW   = (dist.f[DIR_P0M])[k1be  ];
+      real f1_BE   = (dist.f[DIR_M0P])[k1tw  ];
+      real f1_BS   = (dist.f[DIR_0PP])[k1tn  ];
+      real f1_TN   = (dist.f[DIR_0MM])[k1bs  ];
+      real f1_TS   = (dist.f[DIR_0PM])[k1bn  ];
+      real f1_BN   = (dist.f[DIR_0MP])[k1ts  ];
       real f1_ZERO = (dist.f[DIR_000])[k1zero];
-      real f1_BSW  = (dist.f[DIR_PPP ])[k1tne ];
-      real f1_BNE  = (dist.f[DIR_MMP ])[k1tsw ];
-      real f1_BNW  = (dist.f[DIR_PMP ])[k1tse ];
-      real f1_BSE  = (dist.f[DIR_MPP ])[k1tnw ];
-      real f1_TSW  = (dist.f[DIR_PPM ])[k1bne ];
-      real f1_TNE  = (dist.f[DIR_MMM ])[k1bsw ];
-      real f1_TNW  = (dist.f[DIR_PMM ])[k1bse ];
-      real f1_TSE  = (dist.f[DIR_MPM ])[k1bnw ];
+      real f1_BSW  = (dist.f[DIR_PPP])[k1tne ];
+      real f1_BNE  = (dist.f[DIR_MMP])[k1tsw ];
+      real f1_BNW  = (dist.f[DIR_PMP])[k1tse ];
+      real f1_BSE  = (dist.f[DIR_MPP])[k1tnw ];
+      real f1_TSW  = (dist.f[DIR_PPM])[k1bne ];
+      real f1_TNE  = (dist.f[DIR_MMM])[k1bsw ];
+      real f1_TNW  = (dist.f[DIR_PMM])[k1bse ];
+      real f1_TSE  = (dist.f[DIR_MPM])[k1bnw ];
 
       ////////////////////////////////////////////////////////////////////////////////
       //! - Calculate macroscopic quantities (for neighboring node)
       //!
       real drho1 = f1_TSE + f1_TNW + f1_TNE + f1_TSW + f1_BSE + f1_BNW + f1_BNE + f1_BSW +
-                   f1_BN + f1_TS + f1_TN + f1_BS + f1_BE + f1_TW + f1_TE + f1_BW + f1_SE + f1_NW + f1_NE + f1_SW + 
-                   f1_T + f1_B + f1_N + f1_S + f1_E + f1_W + ((dist.f[DIR_000])[kzero]); 
+                   f1_BN + f1_TS + f1_TN + f1_BS + f1_BE + f1_TW + f1_TE + f1_BW + f1_SE + f1_NW + f1_NE + f1_SW +
+                   f1_T + f1_B + f1_N + f1_S + f1_E + f1_W + ((dist.f[DIR_000])[kzero]);
 
       real vx1  = (((f1_TSE - f1_BNW) - (f1_TNW - f1_BSE)) + ((f1_TNE - f1_BSW) - (f1_TSW - f1_BNE)) +
                    ((f1_BE - f1_TW)   + (f1_TE - f1_BW))   + ((f1_SE - f1_NW)   + (f1_NE - f1_SW)) +
-                   (f1_E - f1_W)) / (c1o1 + drho1);          
+                   (f1_E - f1_W)) / (c1o1 + drho1);
 
       real vx2  = ((-(f1_TSE - f1_BNW) + (f1_TNW - f1_BSE)) + ((f1_TNE - f1_BSW) - (f1_TSW - f1_BNE)) +
                    ((f1_BN - f1_TS)   + (f1_TN - f1_BS))    + (-(f1_SE - f1_NW)  + (f1_NE - f1_SW)) +
-                   (f1_N - f1_S)) / (c1o1 + drho1); 
+                   (f1_N - f1_S)) / (c1o1 + drho1);
 
       real vx3  = (((f1_TSE - f1_BNW) + (f1_TNW - f1_BSE)) + ((f1_TNE - f1_BSW) + (f1_TSW - f1_BNE)) +
                    (-(f1_BN - f1_TS)  + (f1_TN - f1_BS))   + ((f1_TE - f1_BW)   - (f1_BE - f1_TW)) +
-                   (f1_T - f1_B)) / (c1o1 + drho1); 
+                   (f1_T - f1_B)) / (c1o1 + drho1);
 
       real cusq = c3o2 * (vx1 * vx1 + vx2 * vx2 + vx3 * vx3);
 
@@ -1037,33 +1067,33 @@ __global__ void QPressDeviceNEQ27(real* rhoBC,
       ////////////////////////////////////////////////////////////////////////////////
       //! write the new distributions to the bc nodes
       //!
-      (dist.f[DIR_P00   ])[ke   ] = f1_W   ;
-      (dist.f[DIR_M00   ])[kw   ] = f1_E   ;
-      (dist.f[DIR_0P0   ])[kn   ] = f1_S   ;
-      (dist.f[DIR_0M0   ])[ks   ] = f1_N   ;
-      (dist.f[DIR_00P   ])[kt   ] = f1_B   ;
-      (dist.f[DIR_00M   ])[kb   ] = f1_T   ;
-      (dist.f[DIR_PP0  ])[kne  ] = f1_SW  ;
-      (dist.f[DIR_MM0  ])[ksw  ] = f1_NE  ;
-      (dist.f[DIR_PM0  ])[kse  ] = f1_NW  ;
-      (dist.f[DIR_MP0  ])[knw  ] = f1_SE  ;
-      (dist.f[DIR_P0P  ])[kte  ] = f1_BW  ;
-      (dist.f[DIR_M0M  ])[kbw  ] = f1_TE  ;
-      (dist.f[DIR_P0M  ])[kbe  ] = f1_TW  ;
-      (dist.f[DIR_M0P  ])[ktw  ] = f1_BE  ;
-      (dist.f[DIR_0PP  ])[ktn  ] = f1_BS  ;
-      (dist.f[DIR_0MM  ])[kbs  ] = f1_TN  ;
-      (dist.f[DIR_0PM  ])[kbn  ] = f1_TS  ;
-      (dist.f[DIR_0MP  ])[kts  ] = f1_BN  ;
+      (dist.f[DIR_P00])[ke   ] = f1_W   ;
+      (dist.f[DIR_M00])[kw   ] = f1_E   ;
+      (dist.f[DIR_0P0])[kn   ] = f1_S   ;
+      (dist.f[DIR_0M0])[ks   ] = f1_N   ;
+      (dist.f[DIR_00P])[kt   ] = f1_B   ;
+      (dist.f[DIR_00M])[kb   ] = f1_T   ;
+      (dist.f[DIR_PP0])[kne  ] = f1_SW  ;
+      (dist.f[DIR_MM0])[ksw  ] = f1_NE  ;
+      (dist.f[DIR_PM0])[kse  ] = f1_NW  ;
+      (dist.f[DIR_MP0])[knw  ] = f1_SE  ;
+      (dist.f[DIR_P0P])[kte  ] = f1_BW  ;
+      (dist.f[DIR_M0M])[kbw  ] = f1_TE  ;
+      (dist.f[DIR_P0M])[kbe  ] = f1_TW  ;
+      (dist.f[DIR_M0P])[ktw  ] = f1_BE  ;
+      (dist.f[DIR_0PP])[ktn  ] = f1_BS  ;
+      (dist.f[DIR_0MM])[kbs  ] = f1_TN  ;
+      (dist.f[DIR_0PM])[kbn  ] = f1_TS  ;
+      (dist.f[DIR_0MP])[kts  ] = f1_BN  ;
       (dist.f[DIR_000])[kzero] = f1_ZERO;
-      (dist.f[DIR_PPP ])[ktne ] = f1_BSW ;
-      (dist.f[DIR_MMP ])[ktsw ] = f1_BNE ;
-      (dist.f[DIR_PMP ])[ktse ] = f1_BNW ;
-      (dist.f[DIR_MPP ])[ktnw ] = f1_BSE ;
-      (dist.f[DIR_PPM ])[kbne ] = f1_TSW ;
-      (dist.f[DIR_MMM ])[kbsw ] = f1_TNE ;
-      (dist.f[DIR_PMM ])[kbse ] = f1_TNW ;
-      (dist.f[DIR_MPM ])[kbnw ] = f1_TSE ;
+      (dist.f[DIR_PPP])[ktne ] = f1_BSW ;
+      (dist.f[DIR_MMP])[ktsw ] = f1_BNE ;
+      (dist.f[DIR_PMP])[ktse ] = f1_BNW ;
+      (dist.f[DIR_MPP])[ktnw ] = f1_BSE ;
+      (dist.f[DIR_PPM])[kbne ] = f1_TSW ;
+      (dist.f[DIR_MMM])[kbsw ] = f1_TNE ;
+      (dist.f[DIR_PMM])[kbse ] = f1_TNW ;
+      (dist.f[DIR_MPM])[kbnw ] = f1_TSE ;
    }
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -1107,16 +1137,17 @@ __global__ void QPressDeviceNEQ27(real* rhoBC,
 
 
 ////////////////////////////////////////////////////////////////////////////////
-__global__ void LB_BC_Press_East27( int nx, 
-                                               int ny, 
-                                               int tz, 
-                                               unsigned int* bcMatD, 
-                                               unsigned int* neighborX,
-                                               unsigned int* neighborY,
-                                               unsigned int* neighborZ,
-                                               real* DD, 
-                                               unsigned int size_Mat, 
-                                               bool isEvenTimestep) 
+__global__ void LB_BC_Press_East27(
+    int nx,
+    int ny,
+    int tz,
+    unsigned int* bcMatD,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    real* DD,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
 {
    //thread-index
    int ty = blockIdx.x;
@@ -1124,9 +1155,9 @@ __global__ void LB_BC_Press_East27( int nx,
 
    int  k, k1, nxny;                   // Zugriff auf arrays im device
 
-   int  x = tx + STARTOFFX;  // Globaler x-Index 
-   int  y = ty + STARTOFFY;  // Globaler y-Index 
-   int  z = tz + STARTOFFZ;  // Globaler z-Index 
+   int  x = tx + STARTOFFX;  // Globaler x-Index
+   int  y = ty + STARTOFFY;  // Globaler y-Index
+   int  z = tz + STARTOFFZ;  // Globaler z-Index
 
    k = nx*(ny*z + y) + x;
    nxny = nx*ny;
@@ -1137,63 +1168,63 @@ __global__ void LB_BC_Press_East27( int nx,
       Distributions27 D;
       if (isEvenTimestep==true)
       {
-         D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
-      } 
+         D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
+      }
       else
       {
-         D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+         D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
       }
       ////////////////////////////////////////////////////////////////////////////////
       ////////////////////////////////////////////////////////////////////////////////
@@ -1312,69 +1343,69 @@ __global__ void LB_BC_Press_East27( int nx,
       real        f1_E,f1_W,f1_N,f1_S,f1_T,f1_B,f1_NE,f1_SW,f1_SE,f1_NW,f1_TE,f1_BW,f1_BE,f1_TW,f1_TN,f1_BS,f1_BN,f1_TS,f1_ZERO,
                    f1_TNE,f1_TSW,f1_TSE,f1_TNW,f1_BNE,f1_BSW,f1_BSE,f1_BNW;
 
-      f1_W    = (D.f[DIR_P00   ])[k1e   ];
-      f1_E    = (D.f[DIR_M00   ])[k1w   ];
-      f1_S    = (D.f[DIR_0P0   ])[k1n   ];
-      f1_N    = (D.f[DIR_0M0   ])[k1s   ];
-      f1_B    = (D.f[DIR_00P   ])[k1t   ];
-      f1_T    = (D.f[DIR_00M   ])[k1b   ];
-      f1_SW   = (D.f[DIR_PP0  ])[k1ne  ];
-      f1_NE   = (D.f[DIR_MM0  ])[k1sw  ];
-      f1_NW   = (D.f[DIR_PM0  ])[k1se  ];
-      f1_SE   = (D.f[DIR_MP0  ])[k1nw  ];
-      f1_BW   = (D.f[DIR_P0P  ])[k1te  ];
-      f1_TE   = (D.f[DIR_M0M  ])[k1bw  ];
-      f1_TW   = (D.f[DIR_P0M  ])[k1be  ];
-      f1_BE   = (D.f[DIR_M0P  ])[k1tw  ];
-      f1_BS   = (D.f[DIR_0PP  ])[k1tn  ];
-      f1_TN   = (D.f[DIR_0MM  ])[k1bs  ];
-      f1_TS   = (D.f[DIR_0PM  ])[k1bn  ];
-      f1_BN   = (D.f[DIR_0MP  ])[k1ts  ];
+      f1_W    = (D.f[DIR_P00])[k1e   ];
+      f1_E    = (D.f[DIR_M00])[k1w   ];
+      f1_S    = (D.f[DIR_0P0])[k1n   ];
+      f1_N    = (D.f[DIR_0M0])[k1s   ];
+      f1_B    = (D.f[DIR_00P])[k1t   ];
+      f1_T    = (D.f[DIR_00M])[k1b   ];
+      f1_SW   = (D.f[DIR_PP0])[k1ne  ];
+      f1_NE   = (D.f[DIR_MM0])[k1sw  ];
+      f1_NW   = (D.f[DIR_PM0])[k1se  ];
+      f1_SE   = (D.f[DIR_MP0])[k1nw  ];
+      f1_BW   = (D.f[DIR_P0P])[k1te  ];
+      f1_TE   = (D.f[DIR_M0M])[k1bw  ];
+      f1_TW   = (D.f[DIR_P0M])[k1be  ];
+      f1_BE   = (D.f[DIR_M0P])[k1tw  ];
+      f1_BS   = (D.f[DIR_0PP])[k1tn  ];
+      f1_TN   = (D.f[DIR_0MM])[k1bs  ];
+      f1_TS   = (D.f[DIR_0PM])[k1bn  ];
+      f1_BN   = (D.f[DIR_0MP])[k1ts  ];
       f1_ZERO = (D.f[DIR_000])[k1zero];
-      f1_BSW  = (D.f[DIR_PPP ])[k1tne ];
-      f1_BNE  = (D.f[DIR_MMP ])[k1tsw ];
-      f1_BNW  = (D.f[DIR_PMP ])[k1tse ];
-      f1_BSE  = (D.f[DIR_MPP ])[k1tnw ];
-      f1_TSW  = (D.f[DIR_PPM ])[k1bne ];
-      f1_TNE  = (D.f[DIR_MMM ])[k1bsw ];
-      f1_TNW  = (D.f[DIR_PMM ])[k1bse ];
-      f1_TSE  = (D.f[DIR_MPM ])[k1bnw ];
+      f1_BSW  = (D.f[DIR_PPP])[k1tne ];
+      f1_BNE  = (D.f[DIR_MMP])[k1tsw ];
+      f1_BNW  = (D.f[DIR_PMP])[k1tse ];
+      f1_BSE  = (D.f[DIR_MPP])[k1tnw ];
+      f1_TSW  = (D.f[DIR_PPM])[k1bne ];
+      f1_TNE  = (D.f[DIR_MMM])[k1bsw ];
+      f1_TNW  = (D.f[DIR_PMM])[k1bse ];
+      f1_TSE  = (D.f[DIR_MPM])[k1bnw ];
 
       real drho1    =  f1_ZERO+f1_E+f1_W+f1_N+f1_S+f1_T+f1_B+f1_NE+f1_SW+f1_SE+f1_NW+f1_TE+f1_BW+f1_BE+f1_TW+f1_TN+f1_BS+f1_BN+f1_TS+
                         f1_TNE+f1_TSW+f1_TSE+f1_TNW+f1_BNE+f1_BSW+f1_BSE+f1_BNW;
 
       __syncthreads();
 
-      (D.f[DIR_P00   ])[ke   ] = f1_W   -c2o27*drho1;
-      (D.f[DIR_M00   ])[kw   ] = f1_E   -c2o27*drho1;
-      (D.f[DIR_0P0   ])[kn   ] = f1_S   -c2o27*drho1;
-      (D.f[DIR_0M0   ])[ks   ] = f1_N   -c2o27*drho1;
-      (D.f[DIR_00P   ])[kt   ] = f1_B   -c2o27*drho1;
-      (D.f[DIR_00M   ])[kb   ] = f1_T   -c2o27*drho1;
-      (D.f[DIR_PP0  ])[kne  ] = f1_SW  -c1o54*drho1;
-      (D.f[DIR_MM0  ])[ksw  ] = f1_NE  -c1o54*drho1;
-      (D.f[DIR_PM0  ])[kse  ] = f1_NW  -c1o54*drho1;
-      (D.f[DIR_MP0  ])[knw  ] = f1_SE  -c1o54*drho1;
-      (D.f[DIR_P0P  ])[kte  ] = f1_BW  -c1o54*drho1;
-      (D.f[DIR_M0M  ])[kbw  ] = f1_TE  -c1o54*drho1;
-      (D.f[DIR_P0M  ])[kbe  ] = f1_TW  -c1o54*drho1;
-      (D.f[DIR_M0P  ])[ktw  ] = f1_BE  -c1o54*drho1;
-      (D.f[DIR_0PP  ])[ktn  ] = f1_BS  -c1o54*drho1;
-      (D.f[DIR_0MM  ])[kbs  ] = f1_TN  -c1o54*drho1;
-      (D.f[DIR_0PM  ])[kbn  ] = f1_TS  -c1o54*drho1;
-      (D.f[DIR_0MP  ])[kts  ] = f1_BN  -c1o54*drho1;
+      (D.f[DIR_P00])[ke   ] = f1_W   -c2o27*drho1;
+      (D.f[DIR_M00])[kw   ] = f1_E   -c2o27*drho1;
+      (D.f[DIR_0P0])[kn   ] = f1_S   -c2o27*drho1;
+      (D.f[DIR_0M0])[ks   ] = f1_N   -c2o27*drho1;
+      (D.f[DIR_00P])[kt   ] = f1_B   -c2o27*drho1;
+      (D.f[DIR_00M])[kb   ] = f1_T   -c2o27*drho1;
+      (D.f[DIR_PP0])[kne  ] = f1_SW  -c1o54*drho1;
+      (D.f[DIR_MM0])[ksw  ] = f1_NE  -c1o54*drho1;
+      (D.f[DIR_PM0])[kse  ] = f1_NW  -c1o54*drho1;
+      (D.f[DIR_MP0])[knw  ] = f1_SE  -c1o54*drho1;
+      (D.f[DIR_P0P])[kte  ] = f1_BW  -c1o54*drho1;
+      (D.f[DIR_M0M])[kbw  ] = f1_TE  -c1o54*drho1;
+      (D.f[DIR_P0M])[kbe  ] = f1_TW  -c1o54*drho1;
+      (D.f[DIR_M0P])[ktw  ] = f1_BE  -c1o54*drho1;
+      (D.f[DIR_0PP])[ktn  ] = f1_BS  -c1o54*drho1;
+      (D.f[DIR_0MM])[kbs  ] = f1_TN  -c1o54*drho1;
+      (D.f[DIR_0PM])[kbn  ] = f1_TS  -c1o54*drho1;
+      (D.f[DIR_0MP])[kts  ] = f1_BN  -c1o54*drho1;
       (D.f[DIR_000])[kzero] = f1_ZERO-c8o27*drho1;
-      (D.f[DIR_PPP ])[ktne ] = f1_BSW -c1o216*drho1;
-      (D.f[DIR_MMP ])[ktsw ] = f1_BNE -c1o216*drho1;
-      (D.f[DIR_PMP ])[ktse ] = f1_BNW -c1o216*drho1;
-      (D.f[DIR_MPP ])[ktnw ] = f1_BSE -c1o216*drho1;
-      (D.f[DIR_PPM ])[kbne ] = f1_TSW -c1o216*drho1;
-      (D.f[DIR_MMM ])[kbsw ] = f1_TNE -c1o216*drho1;
-      (D.f[DIR_PMM ])[kbse ] = f1_TNW -c1o216*drho1;
-      (D.f[DIR_MPM ])[kbnw ] = f1_TSE -c1o216*drho1;       
+      (D.f[DIR_PPP])[ktne ] = f1_BSW -c1o216*drho1;
+      (D.f[DIR_MMP])[ktsw ] = f1_BNE -c1o216*drho1;
+      (D.f[DIR_PMP])[ktse ] = f1_BNW -c1o216*drho1;
+      (D.f[DIR_MPP])[ktnw ] = f1_BSE -c1o216*drho1;
+      (D.f[DIR_PPM])[kbne ] = f1_TSW -c1o216*drho1;
+      (D.f[DIR_MMM])[kbsw ] = f1_TNE -c1o216*drho1;
+      (D.f[DIR_PMM])[kbse ] = f1_TNW -c1o216*drho1;
+      (D.f[DIR_MPM])[kbnw ] = f1_TSE -c1o216*drho1;
    }
    __syncthreads();
-}          
+}
 //////////////////////////////////////////////////////////////////////////////
 
 
@@ -1416,83 +1447,84 @@ __global__ void LB_BC_Press_East27( int nx,
 
 
 //////////////////////////////////////////////////////////////////////////////
-__global__ void QPressDevice27(real* rhoBC,
-                                           real* DD, 
-                                           int* k_Q, 
-                                           real* QQ,
-                                           unsigned int numberOfBCnodes, 
-                                           real om1, 
-                                           unsigned int* neighborX,
-                                           unsigned int* neighborY,
-                                           unsigned int* neighborZ,
-                                           unsigned int size_Mat, 
-                                           bool isEvenTimestep)
+__global__ void QPressDevice27(
+    real* rhoBC,
+    real* DD,
+    int* k_Q,
+    real* QQ,
+    unsigned int numberOfBCnodes,
+    real om1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
 {
    Distributions27 D;
    if (isEvenTimestep==true)
    {
-      D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
-   } 
+      D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
+   }
    else
    {
-      D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+      D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
    }
    ////////////////////////////////////////////////////////////////////////////////
-   const unsigned  x = threadIdx.x;  // Globaler x-Index 
-   const unsigned  y = blockIdx.x;   // Globaler y-Index 
-   const unsigned  z = blockIdx.y;   // Globaler z-Index 
+   const unsigned  x = threadIdx.x;  // Globaler x-Index
+   const unsigned  y = blockIdx.x;   // Globaler y-Index
+   const unsigned  z = blockIdx.y;   // Globaler z-Index
 
    const unsigned nx = blockDim.x;
    const unsigned ny = gridDim.x;
@@ -1502,29 +1534,29 @@ __global__ void QPressDevice27(real* rhoBC,
 
    if(k<numberOfBCnodes)
    {
-      real *q_dirE,   *q_dirW,   *q_dirN,   *q_dirS,   *q_dirT,   *q_dirB, 
+      real *q_dirE,   *q_dirW,   *q_dirN,   *q_dirS,   *q_dirT,   *q_dirB,
          *q_dirNE,  *q_dirSW,  *q_dirSE,  *q_dirNW,  *q_dirTE,  *q_dirBW,
          *q_dirBE,  *q_dirTW,  *q_dirTN,  *q_dirBS,  *q_dirBN,  *q_dirTS,
          *q_dirTNE, *q_dirTSW, *q_dirTSE, *q_dirTNW, *q_dirBNE, *q_dirBSW,
-         *q_dirBSE, *q_dirBNW; 
-      q_dirE   = &QQ[DIR_P00   * numberOfBCnodes];
-      q_dirW   = &QQ[DIR_M00   * numberOfBCnodes];
-      q_dirN   = &QQ[DIR_0P0   * numberOfBCnodes];
-      q_dirS   = &QQ[DIR_0M0   * numberOfBCnodes];
-      q_dirT   = &QQ[DIR_00P   * numberOfBCnodes];
-      q_dirB   = &QQ[DIR_00M   * numberOfBCnodes];
-      q_dirNE  = &QQ[DIR_PP0  * numberOfBCnodes];
-      q_dirSW  = &QQ[DIR_MM0  * numberOfBCnodes];
-      q_dirSE  = &QQ[DIR_PM0  * numberOfBCnodes];
-      q_dirNW  = &QQ[DIR_MP0  * numberOfBCnodes];
-      q_dirTE  = &QQ[DIR_P0P  * numberOfBCnodes];
-      q_dirBW  = &QQ[DIR_M0M  * numberOfBCnodes];
-      q_dirBE  = &QQ[DIR_P0M  * numberOfBCnodes];
-      q_dirTW  = &QQ[DIR_M0P  * numberOfBCnodes];
-      q_dirTN  = &QQ[DIR_0PP  * numberOfBCnodes];
-      q_dirBS  = &QQ[DIR_0MM  * numberOfBCnodes];
-      q_dirBN  = &QQ[DIR_0PM  * numberOfBCnodes];
-      q_dirTS  = &QQ[DIR_0MP  * numberOfBCnodes];
+         *q_dirBSE, *q_dirBNW;
+      q_dirE   = &QQ[DIR_P00 * numberOfBCnodes];
+      q_dirW   = &QQ[DIR_M00 * numberOfBCnodes];
+      q_dirN   = &QQ[DIR_0P0 * numberOfBCnodes];
+      q_dirS   = &QQ[DIR_0M0 * numberOfBCnodes];
+      q_dirT   = &QQ[DIR_00P * numberOfBCnodes];
+      q_dirB   = &QQ[DIR_00M * numberOfBCnodes];
+      q_dirNE  = &QQ[DIR_PP0 * numberOfBCnodes];
+      q_dirSW  = &QQ[DIR_MM0 * numberOfBCnodes];
+      q_dirSE  = &QQ[DIR_PM0 * numberOfBCnodes];
+      q_dirNW  = &QQ[DIR_MP0 * numberOfBCnodes];
+      q_dirTE  = &QQ[DIR_P0P * numberOfBCnodes];
+      q_dirBW  = &QQ[DIR_M0M * numberOfBCnodes];
+      q_dirBE  = &QQ[DIR_P0M * numberOfBCnodes];
+      q_dirTW  = &QQ[DIR_M0P * numberOfBCnodes];
+      q_dirTN  = &QQ[DIR_0PP * numberOfBCnodes];
+      q_dirBS  = &QQ[DIR_0MM * numberOfBCnodes];
+      q_dirBN  = &QQ[DIR_0PM * numberOfBCnodes];
+      q_dirTS  = &QQ[DIR_0MP * numberOfBCnodes];
       q_dirTNE = &QQ[DIR_PPP * numberOfBCnodes];
       q_dirTSW = &QQ[DIR_MMP * numberOfBCnodes];
       q_dirTSE = &QQ[DIR_PMP * numberOfBCnodes];
@@ -1567,46 +1599,46 @@ __global__ void QPressDevice27(real* rhoBC,
       real f_E,  f_W,  f_N,  f_S,  f_T,  f_B,   f_NE,  f_SW,  f_SE,  f_NW,  f_TE,  f_BW,  f_BE,
          f_TW, f_TN, f_BS, f_BN, f_TS, f_TNE, f_TSW, f_TSE, f_TNW, f_BNE, f_BSW, f_BSE, f_BNW;
 
-      f_W    = (D.f[DIR_P00   ])[ke   ];
-      f_E    = (D.f[DIR_M00   ])[kw   ];
-      f_S    = (D.f[DIR_0P0   ])[kn   ];
-      f_N    = (D.f[DIR_0M0   ])[ks   ];
-      f_B    = (D.f[DIR_00P   ])[kt   ];
-      f_T    = (D.f[DIR_00M   ])[kb   ];
-      f_SW   = (D.f[DIR_PP0  ])[kne  ];
-      f_NE   = (D.f[DIR_MM0  ])[ksw  ];
-      f_NW   = (D.f[DIR_PM0  ])[kse  ];
-      f_SE   = (D.f[DIR_MP0  ])[knw  ];
-      f_BW   = (D.f[DIR_P0P  ])[kte  ];
-      f_TE   = (D.f[DIR_M0M  ])[kbw  ];
-      f_TW   = (D.f[DIR_P0M  ])[kbe  ];
-      f_BE   = (D.f[DIR_M0P  ])[ktw  ];
-      f_BS   = (D.f[DIR_0PP  ])[ktn  ];
-      f_TN   = (D.f[DIR_0MM  ])[kbs  ];
-      f_TS   = (D.f[DIR_0PM  ])[kbn  ];
-      f_BN   = (D.f[DIR_0MP  ])[kts  ];
-      f_BSW  = (D.f[DIR_PPP ])[ktne ];
-      f_BNE  = (D.f[DIR_MMP ])[ktsw ];
-      f_BNW  = (D.f[DIR_PMP ])[ktse ];
-      f_BSE  = (D.f[DIR_MPP ])[ktnw ];
-      f_TSW  = (D.f[DIR_PPM ])[kbne ];
-      f_TNE  = (D.f[DIR_MMM ])[kbsw ];
-      f_TNW  = (D.f[DIR_PMM ])[kbse ];
-      f_TSE  = (D.f[DIR_MPM ])[kbnw ];
+      f_W    = (D.f[DIR_P00])[ke   ];
+      f_E    = (D.f[DIR_M00])[kw   ];
+      f_S    = (D.f[DIR_0P0])[kn   ];
+      f_N    = (D.f[DIR_0M0])[ks   ];
+      f_B    = (D.f[DIR_00P])[kt   ];
+      f_T    = (D.f[DIR_00M])[kb   ];
+      f_SW   = (D.f[DIR_PP0])[kne  ];
+      f_NE   = (D.f[DIR_MM0])[ksw  ];
+      f_NW   = (D.f[DIR_PM0])[kse  ];
+      f_SE   = (D.f[DIR_MP0])[knw  ];
+      f_BW   = (D.f[DIR_P0P])[kte  ];
+      f_TE   = (D.f[DIR_M0M])[kbw  ];
+      f_TW   = (D.f[DIR_P0M])[kbe  ];
+      f_BE   = (D.f[DIR_M0P])[ktw  ];
+      f_BS   = (D.f[DIR_0PP])[ktn  ];
+      f_TN   = (D.f[DIR_0MM])[kbs  ];
+      f_TS   = (D.f[DIR_0PM])[kbn  ];
+      f_BN   = (D.f[DIR_0MP])[kts  ];
+      f_BSW  = (D.f[DIR_PPP])[ktne ];
+      f_BNE  = (D.f[DIR_MMP])[ktsw ];
+      f_BNW  = (D.f[DIR_PMP])[ktse ];
+      f_BSE  = (D.f[DIR_MPP])[ktnw ];
+      f_TSW  = (D.f[DIR_PPM])[kbne ];
+      f_TNE  = (D.f[DIR_MMM])[kbsw ];
+      f_TNW  = (D.f[DIR_PMM])[kbse ];
+      f_TSE  = (D.f[DIR_MPM])[kbnw ];
       ////////////////////////////////////////////////////////////////////////////////
       real q, vx1, vx2, vx3, drho;
       vx1    =  ((f_TSE - f_BNW) - (f_TNW - f_BSE)) + ((f_TNE - f_BSW) - (f_TSW - f_BNE)) +
                   ((f_BE - f_TW)   + (f_TE - f_BW))   + ((f_SE - f_NW)   + (f_NE - f_SW)) +
-                  (f_E - f_W); 
+                  (f_E - f_W);
 
 
       vx2    =   (-(f_TSE - f_BNW) + (f_TNW - f_BSE)) + ((f_TNE - f_BSW) - (f_TSW - f_BNE)) +
                   ((f_BN - f_TS)   + (f_TN - f_BS))    + (-(f_SE - f_NW)  + (f_NE - f_SW)) +
-                  (f_N - f_S); 
+                  (f_N - f_S);
 
       vx3    =   ((f_TSE - f_BNW) + (f_TNW - f_BSE)) + ((f_TNE - f_BSW) + (f_TSW - f_BNE)) +
                   (-(f_BN - f_TS)  + (f_TN - f_BS))   + ((f_TE - f_BW)   - (f_BE - f_TW)) +
-                  (f_T - f_B); 
+                  (f_T - f_B);
 
       real cu_sq=c3o2*(vx1*vx1+vx2*vx2+vx3*vx3);
       //////////////////////////////////////////////////////////////////////////
@@ -1616,245 +1648,245 @@ __global__ void QPressDevice27(real* rhoBC,
       ////////////////////////////////////////////////////////////////////////////////
       if (isEvenTimestep==false)
       {
-         D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
-      } 
+         D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
+      }
       else
       {
-         D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+         D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
       }
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       q = q_dirE[k];
       if (q>=c0o1 && q<=c1o1)
       {
-         (D.f[DIR_M00])[kw]=c2o27* (drho+c3o1*(-vx1        )+c9o2*(-vx1        )*(-vx1        )-cu_sq); 
-         //(D.f[DIR_P00])[ke]=c2over27* (drho+three*( vx1        )+c9over2*( vx1        )*( vx1        )-cu_sq); 
+         (D.f[DIR_M00])[kw]=c2o27* (drho+c3o1*(-vx1        )+c9o2*(-vx1        )*(-vx1        )-cu_sq);
+         //(D.f[DIR_P00])[ke]=c2over27* (drho+three*( vx1        )+c9over2*( vx1        )*( vx1        )-cu_sq);
       }
 
       q = q_dirW[k];
       if (q>=c0o1 && q<=c1o1)
       {
-         (D.f[DIR_P00])[ke]=c2o27* (drho+c3o1*( vx1        )+c9o2*( vx1        )*( vx1        )-cu_sq); 
-         //(D.f[DIR_M00])[kw]=c2over27* (drho+three*(-vx1        )+c9over2*(-vx1        )*(-vx1        )-cu_sq); 
+         (D.f[DIR_P00])[ke]=c2o27* (drho+c3o1*( vx1        )+c9o2*( vx1        )*( vx1        )-cu_sq);
+         //(D.f[DIR_M00])[kw]=c2over27* (drho+three*(-vx1        )+c9over2*(-vx1        )*(-vx1        )-cu_sq);
       }
 
       q = q_dirN[k];
       if (q>=c0o1 && q<=c1o1)
       {
-         (D.f[DIR_0M0])[ks]=c2o27* (drho+c3o1*(   -vx2     )+c9o2*(    -vx2    )*(    -vx2    )-cu_sq); 
-         //(D.f[DIR_0P0])[kn]=c2over27* (drho+three*(    vx2     )+c9over2*(     vx2    )*(     vx2    )-cu_sq); 
+         (D.f[DIR_0M0])[ks]=c2o27* (drho+c3o1*(   -vx2     )+c9o2*(    -vx2    )*(    -vx2    )-cu_sq);
+         //(D.f[DIR_0P0])[kn]=c2over27* (drho+three*(    vx2     )+c9over2*(     vx2    )*(     vx2    )-cu_sq);
       }
 
       q = q_dirS[k];
       if (q>=c0o1 && q<=c1o1)
       {
-         (D.f[DIR_0P0])[kn]=c2o27* (drho+c3o1*(    vx2     )+c9o2*(     vx2    )*(     vx2    )-cu_sq); 
-         //(D.f[DIR_0M0])[ks]=c2over27* (drho+three*(   -vx2     )+c9over2*(    -vx2    )*(    -vx2    )-cu_sq); 
+         (D.f[DIR_0P0])[kn]=c2o27* (drho+c3o1*(    vx2     )+c9o2*(     vx2    )*(     vx2    )-cu_sq);
+         //(D.f[DIR_0M0])[ks]=c2over27* (drho+three*(   -vx2     )+c9over2*(    -vx2    )*(    -vx2    )-cu_sq);
       }
 
       q = q_dirT[k];
       if (q>=c0o1 && q<=c1o1)
       {
-         (D.f[DIR_00M])[kb]=c2o27* (drho+c3o1*(        -vx3)+c9o2*(        -vx3)*(        -vx3)-cu_sq); 
-         //(D.f[DIR_00P])[kt]=c2over27* (drho+three*(         vx3)+c9over2*(         vx3)*(         vx3)-cu_sq); 
+         (D.f[DIR_00M])[kb]=c2o27* (drho+c3o1*(        -vx3)+c9o2*(        -vx3)*(        -vx3)-cu_sq);
+         //(D.f[DIR_00P])[kt]=c2over27* (drho+three*(         vx3)+c9over2*(         vx3)*(         vx3)-cu_sq);
       }
 
       q = q_dirB[k];
       if (q>=c0o1 && q<=c1o1)
       {
-         (D.f[DIR_00P])[kt]=c2o27* (drho+c3o1*(         vx3)+c9o2*(         vx3)*(         vx3)-cu_sq); 
-         //(D.f[DIR_00M])[kb]=c2over27* (drho+three*(        -vx3)+c9over2*(        -vx3)*(        -vx3)-cu_sq); 
+         (D.f[DIR_00P])[kt]=c2o27* (drho+c3o1*(         vx3)+c9o2*(         vx3)*(         vx3)-cu_sq);
+         //(D.f[DIR_00M])[kb]=c2over27* (drho+three*(        -vx3)+c9over2*(        -vx3)*(        -vx3)-cu_sq);
       }
 
       q = q_dirNE[k];
       if (q>=c0o1 && q<=c1o1)
       {
-         (D.f[DIR_MM0])[ksw]=c1o54* (drho+c3o1*(-vx1-vx2    )+c9o2*(-vx1-vx2    )*(-vx1-vx2    )-cu_sq); 
-         //(D.f[DIR_PP0])[kne]=c1over54* (drho+three*( vx1+vx2    )+c9over2*( vx1+vx2    )*( vx1+vx2    )-cu_sq); 
+         (D.f[DIR_MM0])[ksw]=c1o54* (drho+c3o1*(-vx1-vx2    )+c9o2*(-vx1-vx2    )*(-vx1-vx2    )-cu_sq);
+         //(D.f[DIR_PP0])[kne]=c1over54* (drho+three*( vx1+vx2    )+c9over2*( vx1+vx2    )*( vx1+vx2    )-cu_sq);
       }
 
       q = q_dirSW[k];
       if (q>=c0o1 && q<=c1o1)
       {
-         (D.f[DIR_PP0])[kne]=c1o54* (drho+c3o1*( vx1+vx2    )+c9o2*( vx1+vx2    )*( vx1+vx2    )-cu_sq); 
-         //(D.f[DIR_MM0])[ksw]=c1over54* (drho+three*(-vx1-vx2    )+c9over2*(-vx1-vx2    )*(-vx1-vx2    )-cu_sq); 
+         (D.f[DIR_PP0])[kne]=c1o54* (drho+c3o1*( vx1+vx2    )+c9o2*( vx1+vx2    )*( vx1+vx2    )-cu_sq);
+         //(D.f[DIR_MM0])[ksw]=c1over54* (drho+three*(-vx1-vx2    )+c9over2*(-vx1-vx2    )*(-vx1-vx2    )-cu_sq);
       }
 
       q = q_dirSE[k];
       if (q>=c0o1 && q<=c1o1)
       {
-         (D.f[DIR_MP0])[knw]=c1o54* (drho+c3o1*(-vx1+vx2    )+c9o2*(-vx1+vx2    )*(-vx1+vx2    )-cu_sq); 
-         //(D.f[DIR_PM0])[kse]=c1over54* (drho+three*( vx1-vx2    )+c9over2*( vx1-vx2    )*( vx1-vx2    )-cu_sq); 
+         (D.f[DIR_MP0])[knw]=c1o54* (drho+c3o1*(-vx1+vx2    )+c9o2*(-vx1+vx2    )*(-vx1+vx2    )-cu_sq);
+         //(D.f[DIR_PM0])[kse]=c1over54* (drho+three*( vx1-vx2    )+c9over2*( vx1-vx2    )*( vx1-vx2    )-cu_sq);
       }
 
       q = q_dirNW[k];
       if (q>=c0o1 && q<=c1o1)
       {
-         (D.f[DIR_PM0])[kse]=c1o54* (drho+c3o1*( vx1-vx2    )+c9o2*( vx1-vx2    )*( vx1-vx2    )-cu_sq); 
-         //(D.f[DIR_MP0])[knw]=c1over54* (drho+three*(-vx1+vx2    )+c9over2*(-vx1+vx2    )*(-vx1+vx2    )-cu_sq); 
+         (D.f[DIR_PM0])[kse]=c1o54* (drho+c3o1*( vx1-vx2    )+c9o2*( vx1-vx2    )*( vx1-vx2    )-cu_sq);
+         //(D.f[DIR_MP0])[knw]=c1over54* (drho+three*(-vx1+vx2    )+c9over2*(-vx1+vx2    )*(-vx1+vx2    )-cu_sq);
       }
 
       q = q_dirTE[k];
       if (q>=c0o1 && q<=c1o1)
       {
-         (D.f[DIR_M0M])[kbw]=c1o54* (drho+c3o1*(-vx1    -vx3)+c9o2*(-vx1    -vx3)*(-vx1    -vx3)-cu_sq); 
-         //(D.f[DIR_P0P])[kte]=c1over54* (drho+three*( vx1    +vx3)+c9over2*( vx1    +vx3)*( vx1    +vx3)-cu_sq); 
+         (D.f[DIR_M0M])[kbw]=c1o54* (drho+c3o1*(-vx1    -vx3)+c9o2*(-vx1    -vx3)*(-vx1    -vx3)-cu_sq);
+         //(D.f[DIR_P0P])[kte]=c1over54* (drho+three*( vx1    +vx3)+c9over2*( vx1    +vx3)*( vx1    +vx3)-cu_sq);
       }
 
       q = q_dirBW[k];
       if (q>=c0o1 && q<=c1o1)
       {
-         (D.f[DIR_P0P])[kte]=c1o54* (drho+c3o1*( vx1    +vx3)+c9o2*( vx1    +vx3)*( vx1    +vx3)-cu_sq); 
-         //(D.f[DIR_M0M])[kbw]=c1over54* (drho+three*(-vx1    -vx3)+c9over2*(-vx1    -vx3)*(-vx1    -vx3)-cu_sq); 
+         (D.f[DIR_P0P])[kte]=c1o54* (drho+c3o1*( vx1    +vx3)+c9o2*( vx1    +vx3)*( vx1    +vx3)-cu_sq);
+         //(D.f[DIR_M0M])[kbw]=c1over54* (drho+three*(-vx1    -vx3)+c9over2*(-vx1    -vx3)*(-vx1    -vx3)-cu_sq);
       }
 
       q = q_dirBE[k];
       if (q>=c0o1 && q<=c1o1)
       {
-         (D.f[DIR_M0P])[ktw]=c1o54* (drho+c3o1*(-vx1    +vx3)+c9o2*(-vx1    +vx3)*(-vx1    +vx3)-cu_sq); 
-         //(D.f[DIR_P0M])[kbe]=c1over54* (drho+three*( vx1    -vx3)+c9over2*( vx1    -vx3)*( vx1    -vx3)-cu_sq); 
+         (D.f[DIR_M0P])[ktw]=c1o54* (drho+c3o1*(-vx1    +vx3)+c9o2*(-vx1    +vx3)*(-vx1    +vx3)-cu_sq);
+         //(D.f[DIR_P0M])[kbe]=c1over54* (drho+three*( vx1    -vx3)+c9over2*( vx1    -vx3)*( vx1    -vx3)-cu_sq);
       }
 
       q = q_dirTW[k];
       if (q>=c0o1 && q<=c1o1)
       {
-         (D.f[DIR_P0M])[kbe]=c1o54* (drho+c3o1*( vx1    -vx3)+c9o2*( vx1    -vx3)*( vx1    -vx3)-cu_sq); 
-         //(D.f[DIR_M0P])[ktw]=c1over54* (drho+three*(-vx1    +vx3)+c9over2*(-vx1    +vx3)*(-vx1    +vx3)-cu_sq); 
+         (D.f[DIR_P0M])[kbe]=c1o54* (drho+c3o1*( vx1    -vx3)+c9o2*( vx1    -vx3)*( vx1    -vx3)-cu_sq);
+         //(D.f[DIR_M0P])[ktw]=c1over54* (drho+three*(-vx1    +vx3)+c9over2*(-vx1    +vx3)*(-vx1    +vx3)-cu_sq);
       }
 
       q = q_dirTN[k];
       if (q>=c0o1 && q<=c1o1)
       {
-         (D.f[DIR_0MM])[kbs]=c1o54* (drho+c3o1*(    -vx2-vx3)+c9o2*(    -vx2-vx3)*(    -vx2-vx3)-cu_sq); 
-         //(D.f[DIR_0PP])[ktn]=c1over54* (drho+three*(     vx2+vx3)+c9over2*(     vx2+vx3)*(     vx2+vx3)-cu_sq); 
+         (D.f[DIR_0MM])[kbs]=c1o54* (drho+c3o1*(    -vx2-vx3)+c9o2*(    -vx2-vx3)*(    -vx2-vx3)-cu_sq);
+         //(D.f[DIR_0PP])[ktn]=c1over54* (drho+three*(     vx2+vx3)+c9over2*(     vx2+vx3)*(     vx2+vx3)-cu_sq);
       }
 
       q = q_dirBS[k];
       if (q>=c0o1 && q<=c1o1)
       {
-         (D.f[DIR_0PP])[ktn]=c1o54* (drho+c3o1*(     vx2+vx3)+c9o2*(     vx2+vx3)*(     vx2+vx3)-cu_sq); 
-         //(D.f[DIR_0MM])[kbs]=c1over54* (drho+three*(    -vx2-vx3)+c9over2*(    -vx2-vx3)*(    -vx2-vx3)-cu_sq); 
+         (D.f[DIR_0PP])[ktn]=c1o54* (drho+c3o1*(     vx2+vx3)+c9o2*(     vx2+vx3)*(     vx2+vx3)-cu_sq);
+         //(D.f[DIR_0MM])[kbs]=c1over54* (drho+three*(    -vx2-vx3)+c9over2*(    -vx2-vx3)*(    -vx2-vx3)-cu_sq);
       }
 
       q = q_dirBN[k];
       if (q>=c0o1 && q<=c1o1)
       {
-         (D.f[DIR_0MP])[kts]=c1o54* (drho+c3o1*(    -vx2+vx3)+c9o2*(    -vx2+vx3)*(    -vx2+vx3)-cu_sq); 
-         //(D.f[DIR_0PM])[kbn]=c1over54* (drho+three*(     vx2-vx3)+c9over2*(     vx2-vx3)*(     vx2-vx3)-cu_sq); 
+         (D.f[DIR_0MP])[kts]=c1o54* (drho+c3o1*(    -vx2+vx3)+c9o2*(    -vx2+vx3)*(    -vx2+vx3)-cu_sq);
+         //(D.f[DIR_0PM])[kbn]=c1over54* (drho+three*(     vx2-vx3)+c9over2*(     vx2-vx3)*(     vx2-vx3)-cu_sq);
       }
 
       q = q_dirTS[k];
       if (q>=c0o1 && q<=c1o1)
       {
-         (D.f[DIR_0PM])[kbn]=c1o54* (drho+c3o1*(     vx2-vx3)+c9o2*(     vx2-vx3)*(     vx2-vx3)-cu_sq); 
-         //(D.f[DIR_0MP])[kts]=c1over54* (drho+three*(    -vx2+vx3)+c9over2*(    -vx2+vx3)*(    -vx2+vx3)-cu_sq); 
+         (D.f[DIR_0PM])[kbn]=c1o54* (drho+c3o1*(     vx2-vx3)+c9o2*(     vx2-vx3)*(     vx2-vx3)-cu_sq);
+         //(D.f[DIR_0MP])[kts]=c1over54* (drho+three*(    -vx2+vx3)+c9over2*(    -vx2+vx3)*(    -vx2+vx3)-cu_sq);
       }
 
       q = q_dirTNE[k];
       if (q>=c0o1 && q<=c1o1)
       {
-         (D.f[DIR_MMM])[kbsw]=c1o216*(drho+c3o1*(-vx1-vx2-vx3)+c9o2*(-vx1-vx2-vx3)*(-vx1-vx2-vx3)-cu_sq); 
-         //(D.f[DIR_PPP])[ktne]=c1over216*(drho+three*( vx1+vx2+vx3)+c9over2*( vx1+vx2+vx3)*( vx1+vx2+vx3)-cu_sq); 
+         (D.f[DIR_MMM])[kbsw]=c1o216*(drho+c3o1*(-vx1-vx2-vx3)+c9o2*(-vx1-vx2-vx3)*(-vx1-vx2-vx3)-cu_sq);
+         //(D.f[DIR_PPP])[ktne]=c1over216*(drho+three*( vx1+vx2+vx3)+c9over2*( vx1+vx2+vx3)*( vx1+vx2+vx3)-cu_sq);
       }
 
       q = q_dirBSW[k];
       if (q>=c0o1 && q<=c1o1)
       {
-         (D.f[DIR_PPP])[ktne]=c1o216*(drho+c3o1*( vx1+vx2+vx3)+c9o2*( vx1+vx2+vx3)*( vx1+vx2+vx3)-cu_sq); 
-         //(D.f[DIR_MMM])[kbsw]=c1over216*(drho+three*(-vx1-vx2-vx3)+c9over2*(-vx1-vx2-vx3)*(-vx1-vx2-vx3)-cu_sq); 
+         (D.f[DIR_PPP])[ktne]=c1o216*(drho+c3o1*( vx1+vx2+vx3)+c9o2*( vx1+vx2+vx3)*( vx1+vx2+vx3)-cu_sq);
+         //(D.f[DIR_MMM])[kbsw]=c1over216*(drho+three*(-vx1-vx2-vx3)+c9over2*(-vx1-vx2-vx3)*(-vx1-vx2-vx3)-cu_sq);
       }
 
       q = q_dirBNE[k];
       if (q>=c0o1 && q<=c1o1)
       {
-         (D.f[DIR_MMP])[ktsw]=c1o216*(drho+c3o1*(-vx1-vx2+vx3)+c9o2*(-vx1-vx2+vx3)*(-vx1-vx2+vx3)-cu_sq); 
-         //(D.f[DIR_PPM])[kbne]=c1over216*(drho+three*( vx1+vx2-vx3)+c9over2*( vx1+vx2-vx3)*( vx1+vx2-vx3)-cu_sq); 
+         (D.f[DIR_MMP])[ktsw]=c1o216*(drho+c3o1*(-vx1-vx2+vx3)+c9o2*(-vx1-vx2+vx3)*(-vx1-vx2+vx3)-cu_sq);
+         //(D.f[DIR_PPM])[kbne]=c1over216*(drho+three*( vx1+vx2-vx3)+c9over2*( vx1+vx2-vx3)*( vx1+vx2-vx3)-cu_sq);
       }
 
       q = q_dirTSW[k];
       if (q>=c0o1 && q<=c1o1)
       {
-         (D.f[DIR_PPM])[kbne]=c1o216*(drho+c3o1*( vx1+vx2-vx3)+c9o2*( vx1+vx2-vx3)*( vx1+vx2-vx3)-cu_sq); 
-         //(D.f[DIR_MMP])[ktsw]=c1over216*(drho+three*(-vx1-vx2+vx3)+c9over2*(-vx1-vx2+vx3)*(-vx1-vx2+vx3)-cu_sq); 
+         (D.f[DIR_PPM])[kbne]=c1o216*(drho+c3o1*( vx1+vx2-vx3)+c9o2*( vx1+vx2-vx3)*( vx1+vx2-vx3)-cu_sq);
+         //(D.f[DIR_MMP])[ktsw]=c1over216*(drho+three*(-vx1-vx2+vx3)+c9over2*(-vx1-vx2+vx3)*(-vx1-vx2+vx3)-cu_sq);
       }
 
       q = q_dirTSE[k];
       if (q>=c0o1 && q<=c1o1)
       {
-         (D.f[DIR_MPM])[kbnw]=c1o216*(drho+c3o1*(-vx1+vx2-vx3)+c9o2*(-vx1+vx2-vx3)*(-vx1+vx2-vx3)-cu_sq); 
-         //(D.f[DIR_PMP])[ktse]=c1over216*(drho+three*( vx1-vx2+vx3)+c9over2*( vx1-vx2+vx3)*( vx1-vx2+vx3)-cu_sq); 
+         (D.f[DIR_MPM])[kbnw]=c1o216*(drho+c3o1*(-vx1+vx2-vx3)+c9o2*(-vx1+vx2-vx3)*(-vx1+vx2-vx3)-cu_sq);
+         //(D.f[DIR_PMP])[ktse]=c1over216*(drho+three*( vx1-vx2+vx3)+c9over2*( vx1-vx2+vx3)*( vx1-vx2+vx3)-cu_sq);
       }
 
       q = q_dirBNW[k];
       if (q>=c0o1 && q<=c1o1)
       {
-         (D.f[DIR_PMP])[ktse]=c1o216*(drho+c3o1*( vx1-vx2+vx3)+c9o2*( vx1-vx2+vx3)*( vx1-vx2+vx3)-cu_sq); 
-         //(D.f[DIR_MPM])[kbnw]=c1over216*(drho+three*(-vx1+vx2-vx3)+c9over2*(-vx1+vx2-vx3)*(-vx1+vx2-vx3)-cu_sq); 
+         (D.f[DIR_PMP])[ktse]=c1o216*(drho+c3o1*( vx1-vx2+vx3)+c9o2*( vx1-vx2+vx3)*( vx1-vx2+vx3)-cu_sq);
+         //(D.f[DIR_MPM])[kbnw]=c1over216*(drho+three*(-vx1+vx2-vx3)+c9over2*(-vx1+vx2-vx3)*(-vx1+vx2-vx3)-cu_sq);
       }
 
       q = q_dirBSE[k];
       if (q>=c0o1 && q<=c1o1)
       {
-         (D.f[DIR_MPP])[ktnw]=c1o216*(drho+c3o1*(-vx1+vx2+vx3)+c9o2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cu_sq); 
-         //(D.f[DIR_PMM])[kbse]=c1over216*(drho+three*( vx1-vx2-vx3)+c9over2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cu_sq); 
+         (D.f[DIR_MPP])[ktnw]=c1o216*(drho+c3o1*(-vx1+vx2+vx3)+c9o2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cu_sq);
+         //(D.f[DIR_PMM])[kbse]=c1over216*(drho+three*( vx1-vx2-vx3)+c9over2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cu_sq);
       }
 
       q = q_dirTNW[k];
       if (q>=c0o1 && q<=c1o1)
       {
-         (D.f[DIR_PMM])[kbse]=c1o216*(drho+c3o1*( vx1-vx2-vx3)+c9o2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cu_sq); 
-         //(D.f[DIR_MPP])[ktnw]=c1over216*(drho+three*(-vx1+vx2+vx3)+c9over2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cu_sq); 
+         (D.f[DIR_PMM])[kbse]=c1o216*(drho+c3o1*( vx1-vx2-vx3)+c9o2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cu_sq);
+         //(D.f[DIR_MPP])[ktnw]=c1over216*(drho+three*(-vx1+vx2+vx3)+c9over2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cu_sq);
       }
    }
 }
@@ -1899,86 +1931,87 @@ __global__ void QPressDevice27(real* rhoBC,
 
 
 //////////////////////////////////////////////////////////////////////////////
-__global__ void QPressDeviceAntiBB27(   real* rhoBC,
-												   real* vx,
-												   real* vy,
-												   real* vz,
-												   real* DD, 
-												   int* k_Q, 
-												   real* QQ,
-												   int numberOfBCnodes, 
-												   real om1, 
-												   unsigned int* neighborX,
-												   unsigned int* neighborY,
-												   unsigned int* neighborZ,
-												   unsigned int size_Mat, 
-												   bool isEvenTimestep)
+__global__ void QPressDeviceAntiBB27(
+    real* rhoBC,
+    real* vx,
+    real* vy,
+    real* vz,
+    real* DD,
+    int* k_Q,
+    real* QQ,
+    int numberOfBCnodes,
+    real om1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
 {
    Distributions27 D;
    if (isEvenTimestep==true)
    {
-      D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
-   } 
+      D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
+   }
    else
    {
-      D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+      D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
    }
    ////////////////////////////////////////////////////////////////////////////////
-   const unsigned  x = threadIdx.x;  // Globaler x-Index 
-   const unsigned  y = blockIdx.x;   // Globaler y-Index 
-   const unsigned  z = blockIdx.y;   // Globaler z-Index 
+   const unsigned  x = threadIdx.x;  // Globaler x-Index
+   const unsigned  y = blockIdx.x;   // Globaler y-Index
+   const unsigned  z = blockIdx.y;   // Globaler z-Index
 
    const unsigned nx = blockDim.x;
    const unsigned ny = gridDim.x;
@@ -1988,37 +2021,37 @@ __global__ void QPressDeviceAntiBB27(   real* rhoBC,
 
    if(k<numberOfBCnodes)
    {
-      real *q_dirE,   *q_dirW,   *q_dirN,   *q_dirS,   *q_dirT,   *q_dirB, 
+      real *q_dirE,   *q_dirW,   *q_dirN,   *q_dirS,   *q_dirT,   *q_dirB,
          *q_dirNE,  *q_dirSW,  *q_dirSE,  *q_dirNW,  *q_dirTE,  *q_dirBW,
          *q_dirBE,  *q_dirTW,  *q_dirTN,  *q_dirBS,  *q_dirBN,  *q_dirTS,
          *q_dirTNE, *q_dirTSW, *q_dirTSE, *q_dirTNW, *q_dirBNE, *q_dirBSW,
-         *q_dirBSE, *q_dirBNW; 
-      q_dirE   = &QQ[DIR_P00   *numberOfBCnodes];
-      q_dirW   = &QQ[DIR_M00   *numberOfBCnodes];
-      q_dirN   = &QQ[DIR_0P0   *numberOfBCnodes];
-      q_dirS   = &QQ[DIR_0M0   *numberOfBCnodes];
-      q_dirT   = &QQ[DIR_00P   *numberOfBCnodes];
-      q_dirB   = &QQ[DIR_00M   *numberOfBCnodes];
-      q_dirNE  = &QQ[DIR_PP0  *numberOfBCnodes];
-      q_dirSW  = &QQ[DIR_MM0  *numberOfBCnodes];
-      q_dirSE  = &QQ[DIR_PM0  *numberOfBCnodes];
-      q_dirNW  = &QQ[DIR_MP0  *numberOfBCnodes];
-      q_dirTE  = &QQ[DIR_P0P  *numberOfBCnodes];
-      q_dirBW  = &QQ[DIR_M0M  *numberOfBCnodes];
-      q_dirBE  = &QQ[DIR_P0M  *numberOfBCnodes];
-      q_dirTW  = &QQ[DIR_M0P  *numberOfBCnodes];
-      q_dirTN  = &QQ[DIR_0PP  *numberOfBCnodes];
-      q_dirBS  = &QQ[DIR_0MM  *numberOfBCnodes];
-      q_dirBN  = &QQ[DIR_0PM  *numberOfBCnodes];
-      q_dirTS  = &QQ[DIR_0MP  *numberOfBCnodes];
-      q_dirTNE = &QQ[DIR_PPP *numberOfBCnodes];
-      q_dirTSW = &QQ[DIR_MMP *numberOfBCnodes];
-      q_dirTSE = &QQ[DIR_PMP *numberOfBCnodes];
-      q_dirTNW = &QQ[DIR_MPP *numberOfBCnodes];
-      q_dirBNE = &QQ[DIR_PPM *numberOfBCnodes];
-      q_dirBSW = &QQ[DIR_MMM *numberOfBCnodes];
-      q_dirBSE = &QQ[DIR_PMM *numberOfBCnodes];
-      q_dirBNW = &QQ[DIR_MPM *numberOfBCnodes];
+         *q_dirBSE, *q_dirBNW;
+      q_dirE   = &QQ[DIR_P00 * numberOfBCnodes];
+      q_dirW   = &QQ[DIR_M00 * numberOfBCnodes];
+      q_dirN   = &QQ[DIR_0P0 * numberOfBCnodes];
+      q_dirS   = &QQ[DIR_0M0 * numberOfBCnodes];
+      q_dirT   = &QQ[DIR_00P * numberOfBCnodes];
+      q_dirB   = &QQ[DIR_00M * numberOfBCnodes];
+      q_dirNE  = &QQ[DIR_PP0 * numberOfBCnodes];
+      q_dirSW  = &QQ[DIR_MM0 * numberOfBCnodes];
+      q_dirSE  = &QQ[DIR_PM0 * numberOfBCnodes];
+      q_dirNW  = &QQ[DIR_MP0 * numberOfBCnodes];
+      q_dirTE  = &QQ[DIR_P0P * numberOfBCnodes];
+      q_dirBW  = &QQ[DIR_M0M * numberOfBCnodes];
+      q_dirBE  = &QQ[DIR_P0M * numberOfBCnodes];
+      q_dirTW  = &QQ[DIR_M0P * numberOfBCnodes];
+      q_dirTN  = &QQ[DIR_0PP * numberOfBCnodes];
+      q_dirBS  = &QQ[DIR_0MM * numberOfBCnodes];
+      q_dirBN  = &QQ[DIR_0PM * numberOfBCnodes];
+      q_dirTS  = &QQ[DIR_0MP * numberOfBCnodes];
+      q_dirTNE = &QQ[DIR_PPP * numberOfBCnodes];
+      q_dirTSW = &QQ[DIR_MMP * numberOfBCnodes];
+      q_dirTSE = &QQ[DIR_PMP * numberOfBCnodes];
+      q_dirTNW = &QQ[DIR_MPP * numberOfBCnodes];
+      q_dirBNE = &QQ[DIR_PPM * numberOfBCnodes];
+      q_dirBSW = &QQ[DIR_MMM * numberOfBCnodes];
+      q_dirBSE = &QQ[DIR_PMM * numberOfBCnodes];
+      q_dirBNW = &QQ[DIR_MPM * numberOfBCnodes];
       ////////////////////////////////////////////////////////////////////////////////
       //index
       unsigned int KQK  = k_Q[k];
@@ -2053,123 +2086,123 @@ __global__ void QPressDeviceAntiBB27(   real* rhoBC,
       real f_E,  f_W,  f_N,  f_S,  f_T,  f_B,   f_NE,  f_SW,  f_SE,  f_NW,  f_TE,  f_BW,  f_BE,
          f_TW, f_TN, f_BS, f_BN, f_TS, f_TNE, f_TSW, f_TSE, f_TNW, f_BNE, f_BSW, f_BSE, f_BNW, f_ZERO;
 
-      f_W    = (D.f[DIR_P00   ])[ke   ];
-      f_E    = (D.f[DIR_M00   ])[kw   ];
-      f_S    = (D.f[DIR_0P0   ])[kn   ];
-      f_N    = (D.f[DIR_0M0   ])[ks   ];
-      f_B    = (D.f[DIR_00P   ])[kt   ];
-      f_T    = (D.f[DIR_00M   ])[kb   ];
-      f_SW   = (D.f[DIR_PP0  ])[kne  ];
-      f_NE   = (D.f[DIR_MM0  ])[ksw  ];
-      f_NW   = (D.f[DIR_PM0  ])[kse  ];
-      f_SE   = (D.f[DIR_MP0  ])[knw  ];
-      f_BW   = (D.f[DIR_P0P  ])[kte  ];
-      f_TE   = (D.f[DIR_M0M  ])[kbw  ];
-      f_TW   = (D.f[DIR_P0M  ])[kbe  ];
-      f_BE   = (D.f[DIR_M0P  ])[ktw  ];
-      f_BS   = (D.f[DIR_0PP  ])[ktn  ];
-      f_TN   = (D.f[DIR_0MM  ])[kbs  ];
-      f_TS   = (D.f[DIR_0PM  ])[kbn  ];
-      f_BN   = (D.f[DIR_0MP  ])[kts  ];
-      f_BSW  = (D.f[DIR_PPP ])[ktne ];
-      f_BNE  = (D.f[DIR_MMP ])[ktsw ];
-      f_BNW  = (D.f[DIR_PMP ])[ktse ];
-      f_BSE  = (D.f[DIR_MPP ])[ktnw ];
-      f_TSW  = (D.f[DIR_PPM ])[kbne ];
-      f_TNE  = (D.f[DIR_MMM ])[kbsw ];
-      f_TNW  = (D.f[DIR_PMM ])[kbse ];
-      f_TSE  = (D.f[DIR_MPM ])[kbnw ];
+      f_W    = (D.f[DIR_P00])[ke   ];
+      f_E    = (D.f[DIR_M00])[kw   ];
+      f_S    = (D.f[DIR_0P0])[kn   ];
+      f_N    = (D.f[DIR_0M0])[ks   ];
+      f_B    = (D.f[DIR_00P])[kt   ];
+      f_T    = (D.f[DIR_00M])[kb   ];
+      f_SW   = (D.f[DIR_PP0])[kne  ];
+      f_NE   = (D.f[DIR_MM0])[ksw  ];
+      f_NW   = (D.f[DIR_PM0])[kse  ];
+      f_SE   = (D.f[DIR_MP0])[knw  ];
+      f_BW   = (D.f[DIR_P0P])[kte  ];
+      f_TE   = (D.f[DIR_M0M])[kbw  ];
+      f_TW   = (D.f[DIR_P0M])[kbe  ];
+      f_BE   = (D.f[DIR_M0P])[ktw  ];
+      f_BS   = (D.f[DIR_0PP])[ktn  ];
+      f_TN   = (D.f[DIR_0MM])[kbs  ];
+      f_TS   = (D.f[DIR_0PM])[kbn  ];
+      f_BN   = (D.f[DIR_0MP])[kts  ];
+      f_BSW  = (D.f[DIR_PPP])[ktne ];
+      f_BNE  = (D.f[DIR_MMP])[ktsw ];
+      f_BNW  = (D.f[DIR_PMP])[ktse ];
+      f_BSE  = (D.f[DIR_MPP])[ktnw ];
+      f_TSW  = (D.f[DIR_PPM])[kbne ];
+      f_TNE  = (D.f[DIR_MMM])[kbsw ];
+      f_TNW  = (D.f[DIR_PMM])[kbse ];
+      f_TSE  = (D.f[DIR_MPM])[kbnw ];
       f_ZERO = (D.f[DIR_000])[kzero];
       ////////////////////////////////////////////////////////////////////////////////
       //real vx1, vx2, vx3, drho;
       //vx1    =  ((f_TSE - f_BNW) - (f_TNW - f_BSE)) + ((f_TNE - f_BSW) - (f_TSW - f_BNE)) +
       //            ((f_BE - f_TW)   + (f_TE - f_BW))   + ((f_SE - f_NW)   + (f_NE - f_SW)) +
-      //            (f_E - f_W); 
+      //            (f_E - f_W);
 
 
       //vx2    =   (-(f_TSE - f_BNW) + (f_TNW - f_BSE)) + ((f_TNE - f_BSW) - (f_TSW - f_BNE)) +
       //            ((f_BN - f_TS)   + (f_TN - f_BS))    + (-(f_SE - f_NW)  + (f_NE - f_SW)) +
-      //            (f_N - f_S); 
+      //            (f_N - f_S);
 
       //vx3    =   ((f_TSE - f_BNW) + (f_TNW - f_BSE)) + ((f_TNE - f_BSW) + (f_TSW - f_BNE)) +
       //            (-(f_BN - f_TS)  + (f_TN - f_BS))   + ((f_TE - f_BW)   - (f_BE - f_TW)) +
-      //            (f_T - f_B); 
+      //            (f_T - f_B);
 
       //real cu_sq=c3o2*(vx1*vx1+vx2*vx2+vx3*vx3);
       //////////////////////////////////////////////////////////////////////////
       real drho    = f_ZERO+f_E+f_W+f_N+f_S+f_T+f_B+f_NE+f_SW+f_SE+f_NW+f_TE+f_BW+f_BE+f_TW+f_TN+f_BS+f_BN+f_TS+
-						f_TNE+f_TSW+f_TSE+f_TNW+f_BNE+f_BSW+f_BSE+f_BNW;
+                  f_TNE+f_TSW+f_TSE+f_TNW+f_BNE+f_BSW+f_BSE+f_BNW;
       drho = drho - rhoBC[k];
-	  drho *= 0.01f;
+     drho *= 0.01f;
       ////////////////////////////////////////////////////////////////////////////////
-	  real q;
+     real q;
       //deltaRho = (rhoBC[k] + one) / (deltaRho + one);
       ////////////////////////////////////////////////////////////////////////////////
       if (isEvenTimestep==false)
       {
-         D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
-      } 
+         D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
+      }
       else
       {
-         D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+         D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
       }
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       q = q_dirE[k];
       if (q>=c0o1 && q<=c1o1)
       {
-         (D.f[DIR_M00])[kw]=f_W-c2o27*drho; 
+         (D.f[DIR_M00])[kw]=f_W-c2o27*drho;
       }
 
       q = q_dirW[k];
@@ -2181,19 +2214,19 @@ __global__ void QPressDeviceAntiBB27(   real* rhoBC,
       q = q_dirN[k];
       if (q>=c0o1 && q<=c1o1)
       {
-         (D.f[DIR_0M0])[ks]=f_S-c2o27*drho; 
+         (D.f[DIR_0M0])[ks]=f_S-c2o27*drho;
       }
 
       q = q_dirS[k];
       if (q>=c0o1 && q<=c1o1)
       {
-         (D.f[DIR_0P0])[kn]=f_N-c2o27*drho; 
+         (D.f[DIR_0P0])[kn]=f_N-c2o27*drho;
       }
 
       q = q_dirT[k];
       if (q>=c0o1 && q<=c1o1)
       {
-         (D.f[DIR_00M])[kb]=f_B-c2o27*drho; 
+         (D.f[DIR_00M])[kb]=f_B-c2o27*drho;
       }
 
       q = q_dirB[k];
@@ -2229,13 +2262,13 @@ __global__ void QPressDeviceAntiBB27(   real* rhoBC,
       q = q_dirTE[k];
       if (q>=c0o1 && q<=c1o1)
       {
-         (D.f[DIR_M0M])[kbw]=f_BW-c1o54*drho; 
+         (D.f[DIR_M0M])[kbw]=f_BW-c1o54*drho;
       }
 
       q = q_dirBW[k];
       if (q>=c0o1 && q<=c1o1)
       {
-         (D.f[DIR_P0P])[kte]=f_TE-c1o54*drho; 
+         (D.f[DIR_P0P])[kte]=f_TE-c1o54*drho;
       }
 
       q = q_dirBE[k];
@@ -2364,21 +2397,22 @@ __global__ void QPressDeviceAntiBB27(   real* rhoBC,
 
 
 //////////////////////////////////////////////////////////////////////////////
-__global__ void QPressDeviceFixBackflow27( real* rhoBC,
-                                                      real* DD, 
-                                                      int* k_Q, 
-                                                      int numberOfBCnodes, 
-                                                      real om1, 
-                                                      unsigned int* neighborX,
-                                                      unsigned int* neighborY,
-                                                      unsigned int* neighborZ,
-                                                      unsigned int size_Mat, 
-                                                      bool isEvenTimestep)
+__global__ void QPressDeviceFixBackflow27(
+    real* rhoBC,
+    real* DD,
+    int* k_Q,
+    int numberOfBCnodes,
+    real om1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
 {
    ////////////////////////////////////////////////////////////////////////////////
-   const unsigned  x = threadIdx.x;  // Globaler x-Index 
-   const unsigned  y = blockIdx.x;   // Globaler y-Index 
-   const unsigned  z = blockIdx.y;   // Globaler z-Index 
+   const unsigned  x = threadIdx.x;  // Globaler x-Index
+   const unsigned  y = blockIdx.x;   // Globaler y-Index
+   const unsigned  z = blockIdx.y;   // Globaler z-Index
 
    const unsigned nx = blockDim.x;
    const unsigned ny = gridDim.x;
@@ -2426,63 +2460,63 @@ __global__ void QPressDeviceFixBackflow27( real* rhoBC,
       Distributions27 D;
       if (isEvenTimestep==false)
       {
-         D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
-      } 
+         D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
+      }
       else
       {
-         D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+         D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
       }
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
          (D.f[DIR_M00])[kw]       = c2o27  * deltaRho;
@@ -2555,21 +2589,22 @@ __global__ void QPressDeviceFixBackflow27( real* rhoBC,
 
 
 //////////////////////////////////////////////////////////////////////////////
-__global__ void QPressDeviceDirDepBot27(  real* rhoBC,
-                                                     real* DD, 
-                                                     int* k_Q, 
-                                                     int numberOfBCnodes, 
-                                                     real om1, 
-                                                     unsigned int* neighborX,
-                                                     unsigned int* neighborY,
-                                                     unsigned int* neighborZ,
-                                                     unsigned int size_Mat, 
-                                                     bool isEvenTimestep)
+__global__ void QPressDeviceDirDepBot27(
+    real* rhoBC,
+    real* DD,
+    int* k_Q,
+    int numberOfBCnodes,
+    real om1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
 {
    ////////////////////////////////////////////////////////////////////////////////
-   const unsigned  x = threadIdx.x;  // Globaler x-Index 
-   const unsigned  y = blockIdx.x;   // Globaler y-Index 
-   const unsigned  z = blockIdx.y;   // Globaler z-Index 
+   const unsigned  x = threadIdx.x;  // Globaler x-Index
+   const unsigned  y = blockIdx.x;   // Globaler y-Index
+   const unsigned  z = blockIdx.y;   // Globaler z-Index
 
    const unsigned nx = blockDim.x;
    const unsigned ny = gridDim.x;
@@ -2617,86 +2652,86 @@ __global__ void QPressDeviceDirDepBot27(  real* rhoBC,
       Distributions27 D;
       if (isEvenTimestep==false)
       {
-         D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
-      } 
+         D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
+      }
       else
       {
-         D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+         D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
       }
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       real f_E,f_W,f_N,f_S,f_T,f_NE,f_SW,f_SE,f_NW,f_TE,f_TW,f_TN,f_TS,f_ZERO,f_TNE,f_TSW,f_TSE,f_TNW;//,
             //f_B,f_BW,f_BE,f_BS,f_BN,f_BSW,f_BNE,f_BNW,f_BSE;
 
-      f_E    = (D.f[DIR_P00   ])[ke   ];
-      f_W    = (D.f[DIR_M00   ])[kw   ];
-      f_N    = (D.f[DIR_0P0   ])[kn   ];
-      f_S    = (D.f[DIR_0M0   ])[ks   ];
-      f_T    = (D.f[DIR_00P   ])[kt   ];
-      f_NE   = (D.f[DIR_PP0  ])[kne  ];
-      f_SW   = (D.f[DIR_MM0  ])[ksw  ];
-      f_SE   = (D.f[DIR_PM0  ])[kse  ];
-      f_NW   = (D.f[DIR_MP0  ])[knw  ];
-      f_TE   = (D.f[DIR_P0P  ])[kte  ];
-      f_TW   = (D.f[DIR_M0P  ])[ktw  ];
-      f_TN   = (D.f[DIR_0PP  ])[ktn  ];
-      f_TS   = (D.f[DIR_0MP  ])[kts  ];
+      f_E    = (D.f[DIR_P00])[ke   ];
+      f_W    = (D.f[DIR_M00])[kw   ];
+      f_N    = (D.f[DIR_0P0])[kn   ];
+      f_S    = (D.f[DIR_0M0])[ks   ];
+      f_T    = (D.f[DIR_00P])[kt   ];
+      f_NE   = (D.f[DIR_PP0])[kne  ];
+      f_SW   = (D.f[DIR_MM0])[ksw  ];
+      f_SE   = (D.f[DIR_PM0])[kse  ];
+      f_NW   = (D.f[DIR_MP0])[knw  ];
+      f_TE   = (D.f[DIR_P0P])[kte  ];
+      f_TW   = (D.f[DIR_M0P])[ktw  ];
+      f_TN   = (D.f[DIR_0PP])[ktn  ];
+      f_TS   = (D.f[DIR_0MP])[kts  ];
       f_ZERO = (D.f[DIR_000])[kzero];
-      f_TNE  = (D.f[DIR_PPP ])[ktne ];
-      f_TSW  = (D.f[DIR_MMP ])[ktsw ];
-      f_TSE  = (D.f[DIR_PMP ])[ktse ];
-      f_TNW  = (D.f[DIR_MPP ])[ktnw ];
+      f_TNE  = (D.f[DIR_PPP])[ktne ];
+      f_TSW  = (D.f[DIR_MMP])[ktsw ];
+      f_TSE  = (D.f[DIR_PMP])[ktse ];
+      f_TNW  = (D.f[DIR_MPP])[ktnw ];
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
       //f_B   = (four*rho- four*f_SW-     eight*f_TSW-four*f_W-   eight*f_TW- four*f_NW-     eight*f_TNW-four*f_S-   eight*f_TS-four*f_ZERO+     f_T-four*f_N-   eight*f_TN- four*f_SE-     eight*f_TSE-four*f_E-   eight*f_TE- four*f_NE-     eight*f_TNE)/nine;
@@ -2793,496 +2828,474 @@ __global__ void QPressDeviceDirDepBot27(  real* rhoBC,
 
 
 
-
-
+__host__ __device__ real computeOutflowDistribution(const real* const &f, const real* const &f1, const int dir, const real cs) // returns a cs-weighted blend of f1[dir] and f[dir]; callable from host and device code
+{
+   return f1[dir] * cs + (c1o1 - cs) * f[dir]; // weight cs on f1, weight (c1o1 - cs) on f; c1o1 is presumably the constant 1 -- confirm
+}
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-__global__ void QPressNoRhoDevice27(  real* rhoBC,
-												 real* DD, 
-												 int* k_Q, 
-												 int* k_N, 
-												 int numberOfBCnodes, 
-												 real om1, 
-												 unsigned int* neighborX,
-												 unsigned int* neighborY,
-												 unsigned int* neighborZ,
-												 unsigned int size_Mat, 
-												 bool isEvenTimestep)
+__global__ void QPressNoRhoDevice27( // one thread per boundary node: overwrites direction-selected distributions with a cs-weighted blend of two nodes
+    real* rhoBC, // not referenced in the new kernel body
+    real* distributions, // base pointer handed to getPointersToDistributions
+    int* k_Q, // per-boundary-node index of the node whose distributions are read into f and rewritten
+    int* k_N, // per-boundary-node index of the second node whose distributions are read into f1
+    int numberOfBCnodes, // threads with nodeIndex >= this value return immediately
+    real om1, // not referenced in the new kernel body
+    unsigned int* neighborX, // lookup table applied to node indices to derive kw / k1w
+    unsigned int* neighborY, // lookup table applied to node indices to derive ks / k1s and ksw / k1sw
+    unsigned int* neighborZ, // lookup table applied to node indices to derive kb / k1b and the other b-indices
+    unsigned long long numberOfLBnodes, // forwarded to getPointersToDistributions
+    bool isEvenTimestep, // forwarded to getPointersToDistributions: as-is before the reads, negated before the writes
+    int direction) // selects the switch case below; unmatched values write nothing
 {
    ////////////////////////////////////////////////////////////////////////////////
-   const unsigned  x = threadIdx.x;  // Globaler x-Index 
-   const unsigned  y = blockIdx.x;   // Globaler y-Index 
-   const unsigned  z = blockIdx.y;   // Globaler z-Index 
+   //! - Get the node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
+   //!
+   const unsigned nodeIndex = getNodeIndex();
 
-   const unsigned nx = blockDim.x;
-   const unsigned ny = gridDim.x;
+   //////////////////////////////////////////////////////////////////////////
 
-   const unsigned k = nx*(ny*z + y) + x;
+   if(nodeIndex >= numberOfBCnodes) return; // surplus threads have no boundary node to process
+
+   ////////////////////////////////////////////////////////////////////////////////
+   // indices derived from the boundary node k_Q[nodeIndex] (used for f and for the write-back)
+   unsigned int KQK  = k_Q[nodeIndex];
+   // unsigned int kzero= KQK;
+   unsigned int ke   = KQK;
+   unsigned int kw   = neighborX[KQK];
+   unsigned int kn   = KQK;
+   unsigned int ks   = neighborY[KQK];
+   unsigned int kt   = KQK;
+   unsigned int kb   = neighborZ[KQK];
+   unsigned int ksw  = neighborY[kw];
+   unsigned int kne  = KQK;
+   unsigned int kse  = ks;
+   unsigned int knw  = kw;
+   unsigned int kbw  = neighborZ[kw];
+   unsigned int kte  = KQK;
+   unsigned int kbe  = kb;
+   unsigned int ktw  = kw;
+   unsigned int kbs  = neighborZ[ks];
+   unsigned int ktn  = KQK;
+   unsigned int kbn  = kb;
+   unsigned int kts  = ks;
+   unsigned int ktse = ks;
+   unsigned int kbnw = kbw;
+   unsigned int ktnw = kw;
+   unsigned int kbse = kbs;
+   unsigned int ktsw = ksw;
+   unsigned int kbne = kb;
+   unsigned int ktne = KQK;
+   unsigned int kbsw = neighborZ[ksw];
+   ////////////////////////////////////////////////////////////////////////////////
+   // indices derived from the second node k_N[nodeIndex] (used for f1)
+   unsigned int K1QK  = k_N[nodeIndex];
+   //unsigned int k1zero= K1QK;
+   unsigned int k1e   = K1QK;
+   unsigned int k1w   = neighborX[K1QK];
+   unsigned int k1n   = K1QK;
+   unsigned int k1s   = neighborY[K1QK];
+   unsigned int k1t   = K1QK;
+   unsigned int k1b   = neighborZ[K1QK];
+   unsigned int k1sw  = neighborY[k1w];
+   unsigned int k1ne  = K1QK;
+   unsigned int k1se  = k1s;
+   unsigned int k1nw  = k1w;
+   unsigned int k1bw  = neighborZ[k1w];
+   unsigned int k1te  = K1QK;
+   unsigned int k1be  = k1b;
+   unsigned int k1tw  = k1w;
+   unsigned int k1bs  = neighborZ[k1s];
+   unsigned int k1tn  = K1QK;
+   unsigned int k1bn  = k1b;
+   unsigned int k1ts  = k1s;
+   unsigned int k1tse = k1s;
+   unsigned int k1bnw = k1bw;
+   unsigned int k1tnw = k1w;
+   unsigned int k1bse = k1bs;
+   unsigned int k1tsw = k1sw;
+   unsigned int k1bne = k1b;
+   unsigned int k1tne = K1QK;
+   unsigned int k1bsw = neighborZ[k1sw];
+   ////////////////////////////////////////////////////////////////////////////////
+   Distributions27 dist;
+   getPointersToDistributions(dist, distributions, numberOfLBnodes, isEvenTimestep); // presumably fills dist.f[...] with per-direction pointers into distributions -- confirm in helper
+   real f[27], f1[27]; // local copies; the DIR_000 entry of both arrays is never loaded (see commented-out lines below)
+   //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+   f1[DIR_P00] = (dist.f[DIR_P00])[k1e   ];
+   f1[DIR_M00] = (dist.f[DIR_M00])[k1w   ];
+   f1[DIR_0P0] = (dist.f[DIR_0P0])[k1n   ];
+   f1[DIR_0M0] = (dist.f[DIR_0M0])[k1s   ];
+   f1[DIR_00P] = (dist.f[DIR_00P])[k1t   ];
+   f1[DIR_00M] = (dist.f[DIR_00M])[k1b   ];
+   f1[DIR_PP0] = (dist.f[DIR_PP0])[k1ne  ];
+   f1[DIR_MM0] = (dist.f[DIR_MM0])[k1sw  ];
+   f1[DIR_PM0] = (dist.f[DIR_PM0])[k1se  ];
+   f1[DIR_MP0] = (dist.f[DIR_MP0])[k1nw  ];
+   f1[DIR_P0P] = (dist.f[DIR_P0P])[k1te  ];
+   f1[DIR_M0M] = (dist.f[DIR_M0M])[k1bw  ];
+   f1[DIR_P0M] = (dist.f[DIR_P0M])[k1be  ];
+   f1[DIR_M0P] = (dist.f[DIR_M0P])[k1tw  ];
+   f1[DIR_0PP] = (dist.f[DIR_0PP])[k1tn  ];
+   f1[DIR_0MM] = (dist.f[DIR_0MM])[k1bs  ];
+   f1[DIR_0PM] = (dist.f[DIR_0PM])[k1bn  ];
+   f1[DIR_0MP] = (dist.f[DIR_0MP])[k1ts  ];
+   // f1[DIR_000] = (dist.f[DIR_000])[k1zero];
+   f1[DIR_PPP] = (dist.f[DIR_PPP])[k1tne ];
+   f1[DIR_MMP] = (dist.f[DIR_MMP])[k1tsw ];
+   f1[DIR_PMP] = (dist.f[DIR_PMP])[k1tse ];
+   f1[DIR_MPP] = (dist.f[DIR_MPP])[k1tnw ];
+   f1[DIR_PPM] = (dist.f[DIR_PPM])[k1bne ];
+   f1[DIR_MMM] = (dist.f[DIR_MMM])[k1bsw ];
+   f1[DIR_PMM] = (dist.f[DIR_PMM])[k1bse ];
+   f1[DIR_MPM] = (dist.f[DIR_MPM])[k1bnw ];
+   //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+   f[DIR_P00] = (dist.f[DIR_P00])[ke   ];
+   f[DIR_M00] = (dist.f[DIR_M00])[kw   ];
+   f[DIR_0P0] = (dist.f[DIR_0P0])[kn   ];
+   f[DIR_0M0] = (dist.f[DIR_0M0])[ks   ];
+   f[DIR_00P] = (dist.f[DIR_00P])[kt   ];
+   f[DIR_00M] = (dist.f[DIR_00M])[kb   ];
+   f[DIR_PP0] = (dist.f[DIR_PP0])[kne  ];
+   f[DIR_MM0] = (dist.f[DIR_MM0])[ksw  ];
+   f[DIR_PM0] = (dist.f[DIR_PM0])[kse  ];
+   f[DIR_MP0] = (dist.f[DIR_MP0])[knw  ];
+   f[DIR_P0P] = (dist.f[DIR_P0P])[kte  ];
+   f[DIR_M0M] = (dist.f[DIR_M0M])[kbw  ];
+   f[DIR_P0M] = (dist.f[DIR_P0M])[kbe  ];
+   f[DIR_M0P] = (dist.f[DIR_M0P])[ktw  ];
+   f[DIR_0PP] = (dist.f[DIR_0PP])[ktn  ];
+   f[DIR_0MM] = (dist.f[DIR_0MM])[kbs  ];
+   f[DIR_0PM] = (dist.f[DIR_0PM])[kbn  ];
+   f[DIR_0MP] = (dist.f[DIR_0MP])[kts  ];
+   // f[DIR_000] = (dist.f[DIR_000])[kzero];
+   f[DIR_PPP] = (dist.f[DIR_PPP])[ktne ];
+   f[DIR_MMP] = (dist.f[DIR_MMP])[ktsw ];
+   f[DIR_PMP] = (dist.f[DIR_PMP])[ktse ];
+   f[DIR_MPP] = (dist.f[DIR_MPP])[ktnw ];
+   f[DIR_PPM] = (dist.f[DIR_PPM])[kbne ];
+   f[DIR_MMM] = (dist.f[DIR_MMM])[kbsw ];
+   f[DIR_PMM] = (dist.f[DIR_PMM])[kbse ];
+   f[DIR_MPM] = (dist.f[DIR_MPM])[kbnw ];
    //////////////////////////////////////////////////////////////////////////
 
-   if(k<numberOfBCnodes)
+
+   real cs = c1o1 / sqrtf(c3o1); // blend weight for computeOutflowDistribution; equals 1/sqrt(3) if c1o1 == 1 and c3o1 == 3 (presumably -- confirm)
+
+   //////////////////////////////////////////////////////////////////////////
+   getPointersToDistributions(dist, distributions, numberOfLBnodes, !isEvenTimestep); // re-resolve dist with the timestep flag negated before the writes below
+   switch(direction) // each matching case writes nine distributions at the k_Q-derived indices
    {
-      ////////////////////////////////////////////////////////////////////////////////
-      //index
-      unsigned int KQK  = k_Q[k];
-      //unsigned int kzero= KQK;
-      unsigned int ke   = KQK;
-      unsigned int kw   = neighborX[KQK];
-      unsigned int kn   = KQK;
-      unsigned int ks   = neighborY[KQK];
-      unsigned int kt   = KQK;
-      unsigned int kb   = neighborZ[KQK];
-      unsigned int ksw  = neighborY[kw];
-      unsigned int kne  = KQK;
-      unsigned int kse  = ks;
-      unsigned int knw  = kw;
-      unsigned int kbw  = neighborZ[kw];
-      unsigned int kte  = KQK;
-      unsigned int kbe  = kb;
-      unsigned int ktw  = kw;
-      unsigned int kbs  = neighborZ[ks];
-      unsigned int ktn  = KQK;
-      unsigned int kbn  = kb;
-      unsigned int kts  = ks;
-      unsigned int ktse = ks;
-      unsigned int kbnw = kbw;
-      unsigned int ktnw = kw;
-      unsigned int kbse = kbs;
-      unsigned int ktsw = ksw;
-      unsigned int kbne = kb;
-      unsigned int ktne = KQK;
-      unsigned int kbsw = neighborZ[ksw];
-      ////////////////////////////////////////////////////////////////////////////////
-      //index1
-      unsigned int K1QK  = k_N[k];
-      //unsigned int k1zero= K1QK;
-      unsigned int k1e   = K1QK;
-      unsigned int k1w   = neighborX[K1QK];
-      unsigned int k1n   = K1QK;
-      unsigned int k1s   = neighborY[K1QK];
-      unsigned int k1t   = K1QK;
-      unsigned int k1b   = neighborZ[K1QK];
-      unsigned int k1sw  = neighborY[k1w];
-      unsigned int k1ne  = K1QK;
-      unsigned int k1se  = k1s;
-      unsigned int k1nw  = k1w;
-      unsigned int k1bw  = neighborZ[k1w];
-      unsigned int k1te  = K1QK;
-      unsigned int k1be  = k1b;
-      unsigned int k1tw  = k1w;
-      unsigned int k1bs  = neighborZ[k1s];
-      unsigned int k1tn  = K1QK;
-      unsigned int k1bn  = k1b;
-      unsigned int k1ts  = k1s;
-      unsigned int k1tse = k1s;
-      unsigned int k1bnw = k1bw;
-      unsigned int k1tnw = k1w;
-      unsigned int k1bse = k1bs;
-      unsigned int k1tsw = k1sw;
-      unsigned int k1bne = k1b;
-      unsigned int k1tne = K1QK;
-      unsigned int k1bsw = neighborZ[k1sw];
-      ////////////////////////////////////////////////////////////////////////////////
-      Distributions27 D;
-      if (isEvenTimestep==true)
-      {
-         D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
-      } 
-      else
-      {
-         D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
-      }
-      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-      real f1_E    = (D.f[DIR_P00   ])[k1e   ];
-      real f1_W    = (D.f[DIR_M00   ])[k1w   ];
-      real f1_N    = (D.f[DIR_0P0   ])[k1n   ];
-      real f1_S    = (D.f[DIR_0M0   ])[k1s   ];
-      real f1_T    = (D.f[DIR_00P   ])[k1t   ];
-      real f1_B    = (D.f[DIR_00M   ])[k1b   ];
-      real f1_NE   = (D.f[DIR_PP0  ])[k1ne  ];
-      real f1_SW   = (D.f[DIR_MM0  ])[k1sw  ];
-      real f1_SE   = (D.f[DIR_PM0  ])[k1se  ];
-      real f1_NW   = (D.f[DIR_MP0  ])[k1nw  ];
-      real f1_TE   = (D.f[DIR_P0P  ])[k1te  ];
-      real f1_BW   = (D.f[DIR_M0M  ])[k1bw  ];
-      real f1_BE   = (D.f[DIR_P0M  ])[k1be  ];
-      real f1_TW   = (D.f[DIR_M0P  ])[k1tw  ];
-      real f1_TN   = (D.f[DIR_0PP  ])[k1tn  ];
-      real f1_BS   = (D.f[DIR_0MM  ])[k1bs  ];
-      real f1_BN   = (D.f[DIR_0PM  ])[k1bn  ];
-      real f1_TS   = (D.f[DIR_0MP  ])[k1ts  ];
-      //real f1_ZERO = (D.f[DIR_000])[k1zero];
-      real f1_TNE  = (D.f[DIR_PPP ])[k1tne ];
-      real f1_TSW  = (D.f[DIR_MMP ])[k1tsw ];
-      real f1_TSE  = (D.f[DIR_PMP ])[k1tse ];
-      real f1_TNW  = (D.f[DIR_MPP ])[k1tnw ];
-      real f1_BNE  = (D.f[DIR_PPM ])[k1bne ];
-      real f1_BSW  = (D.f[DIR_MMM ])[k1bsw ];
-      real f1_BSE  = (D.f[DIR_PMM ])[k1bse ];
-      real f1_BNW  = (D.f[DIR_MPM ])[k1bnw ];
-      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-      real f_E    = (D.f[DIR_P00   ])[ke   ];
-      real f_W    = (D.f[DIR_M00   ])[kw   ];
-      real f_N    = (D.f[DIR_0P0   ])[kn   ];
-      real f_S    = (D.f[DIR_0M0   ])[ks   ];
-      real f_T    = (D.f[DIR_00P   ])[kt   ];
-      real f_B    = (D.f[DIR_00M   ])[kb   ];
-      real f_NE   = (D.f[DIR_PP0  ])[kne  ];
-      real f_SW   = (D.f[DIR_MM0  ])[ksw  ];
-      real f_SE   = (D.f[DIR_PM0  ])[kse  ];
-      real f_NW   = (D.f[DIR_MP0  ])[knw  ];
-      real f_TE   = (D.f[DIR_P0P  ])[kte  ];
-      real f_BW   = (D.f[DIR_M0M  ])[kbw  ];
-      real f_BE   = (D.f[DIR_P0M  ])[kbe  ];
-      real f_TW   = (D.f[DIR_M0P  ])[ktw  ];
-      real f_TN   = (D.f[DIR_0PP  ])[ktn  ];
-      real f_BS   = (D.f[DIR_0MM  ])[kbs  ];
-      real f_BN   = (D.f[DIR_0PM  ])[kbn  ];
-      real f_TS   = (D.f[DIR_0MP  ])[kts  ];
-      //real f_ZERO = (D.f[DIR_000])[kzero];
-      real f_TNE  = (D.f[DIR_PPP ])[ktne ];
-      real f_TSW  = (D.f[DIR_MMP ])[ktsw ];
-      real f_TSE  = (D.f[DIR_PMP ])[ktse ];
-      real f_TNW  = (D.f[DIR_MPP ])[ktnw ];
-      real f_BNE  = (D.f[DIR_PPM ])[kbne ];
-      real f_BSW  = (D.f[DIR_MMM ])[kbsw ];
-      real f_BSE  = (D.f[DIR_PMM ])[kbse ];
-      real f_BNW  = (D.f[DIR_MPM ])[kbnw ];
-      //////////////////////////////////////////////////////////////////////////
+      case MZZ: // writes the nine DIR_P** entries
+         (dist.f[DIR_P00])[ke   ] = computeOutflowDistribution(f, f1, DIR_P00, cs);
+         (dist.f[DIR_PM0])[kse  ] = computeOutflowDistribution(f, f1, DIR_PM0, cs);
+         (dist.f[DIR_PP0])[kne  ] = computeOutflowDistribution(f, f1, DIR_PP0, cs);
+         (dist.f[DIR_P0M])[kbe  ] = computeOutflowDistribution(f, f1, DIR_P0M, cs);
+         (dist.f[DIR_P0P])[kte  ] = computeOutflowDistribution(f, f1, DIR_P0P, cs);
+         (dist.f[DIR_PMP])[ktse ] = computeOutflowDistribution(f, f1, DIR_PMP, cs);
+         (dist.f[DIR_PPP])[ktne ] = computeOutflowDistribution(f, f1, DIR_PPP, cs);
+         (dist.f[DIR_PMM])[kbse ] = computeOutflowDistribution(f, f1, DIR_PMM, cs);
+         (dist.f[DIR_PPM])[kbne ] = computeOutflowDistribution(f, f1, DIR_PPM, cs);
+         break;
+
+      case PZZ: // writes the nine DIR_M** entries
+         (dist.f[DIR_M00])[kw   ] = computeOutflowDistribution(f, f1, DIR_M00, cs);
+         (dist.f[DIR_MM0])[ksw  ] = computeOutflowDistribution(f, f1, DIR_MM0, cs);
+         (dist.f[DIR_MP0])[knw  ] = computeOutflowDistribution(f, f1, DIR_MP0, cs);
+         (dist.f[DIR_M0M])[kbw  ] = computeOutflowDistribution(f, f1, DIR_M0M, cs);
+         (dist.f[DIR_M0P])[ktw  ] = computeOutflowDistribution(f, f1, DIR_M0P, cs);
+         (dist.f[DIR_MMP])[ktsw ] = computeOutflowDistribution(f, f1, DIR_MMP, cs);
+         (dist.f[DIR_MPP])[ktnw ] = computeOutflowDistribution(f, f1, DIR_MPP, cs);
+         (dist.f[DIR_MMM])[kbsw ] = computeOutflowDistribution(f, f1, DIR_MMM, cs);
+         (dist.f[DIR_MPM])[kbnw ] = computeOutflowDistribution(f, f1, DIR_MPM, cs);
+         break;
+
+      case ZMZ: // writes the nine DIR_*P* entries
+         (dist.f[DIR_0P0])[kn   ] = computeOutflowDistribution(f, f1, DIR_0P0, cs);
+         (dist.f[DIR_PP0])[kne  ] = computeOutflowDistribution(f, f1, DIR_PP0, cs);
+         (dist.f[DIR_MP0])[knw  ] = computeOutflowDistribution(f, f1, DIR_MP0, cs);
+         (dist.f[DIR_0PP])[ktn  ] = computeOutflowDistribution(f, f1, DIR_0PP, cs);
+         (dist.f[DIR_0PM])[kbn  ] = computeOutflowDistribution(f, f1, DIR_0PM, cs);
+         (dist.f[DIR_PPP])[ktne ] = computeOutflowDistribution(f, f1, DIR_PPP, cs);
+         (dist.f[DIR_MPP])[ktnw ] = computeOutflowDistribution(f, f1, DIR_MPP, cs);
+         (dist.f[DIR_PPM])[kbne ] = computeOutflowDistribution(f, f1, DIR_PPM, cs);
+         (dist.f[DIR_MPM])[kbnw ] = computeOutflowDistribution(f, f1, DIR_MPM, cs);
+         break;
+
+      case ZPZ: // writes the nine DIR_*M* entries
+         (dist.f[DIR_0M0])[ks   ] = computeOutflowDistribution(f, f1, DIR_0M0, cs);
+         (dist.f[DIR_PM0])[kse  ] = computeOutflowDistribution(f, f1, DIR_PM0, cs);
+         (dist.f[DIR_MM0])[ksw  ] = computeOutflowDistribution(f, f1, DIR_MM0, cs);
+         (dist.f[DIR_0MP])[kts  ] = computeOutflowDistribution(f, f1, DIR_0MP, cs);
+         (dist.f[DIR_0MM])[kbs  ] = computeOutflowDistribution(f, f1, DIR_0MM, cs);
+         (dist.f[DIR_PMP])[ktse ] = computeOutflowDistribution(f, f1, DIR_PMP, cs);
+         (dist.f[DIR_MMP])[ktsw ] = computeOutflowDistribution(f, f1, DIR_MMP, cs);
+         (dist.f[DIR_PMM])[kbse ] = computeOutflowDistribution(f, f1, DIR_PMM, cs);
+         (dist.f[DIR_MMM])[kbsw ] = computeOutflowDistribution(f, f1, DIR_MMM, cs);
+         break;
+
+      case ZZM: // writes the nine DIR_**P entries
+         (dist.f[DIR_00P])[kt   ] = computeOutflowDistribution(f, f1, DIR_00P, cs);
+         (dist.f[DIR_P0P])[kte  ] = computeOutflowDistribution(f, f1, DIR_P0P, cs);
+         (dist.f[DIR_M0P])[ktw  ] = computeOutflowDistribution(f, f1, DIR_M0P, cs);
+         (dist.f[DIR_0PP])[ktn  ] = computeOutflowDistribution(f, f1, DIR_0PP, cs);
+         (dist.f[DIR_0MP])[kts  ] = computeOutflowDistribution(f, f1, DIR_0MP, cs);
+         (dist.f[DIR_PPP])[ktne ] = computeOutflowDistribution(f, f1, DIR_PPP, cs);
+         (dist.f[DIR_MPP])[ktnw ] = computeOutflowDistribution(f, f1, DIR_MPP, cs);
+         (dist.f[DIR_PMP])[ktse ] = computeOutflowDistribution(f, f1, DIR_PMP, cs);
+         (dist.f[DIR_MMP])[ktsw ] = computeOutflowDistribution(f, f1, DIR_MMP, cs);
+         break;
+
+      case ZZP: // writes the nine DIR_**M entries
+         (dist.f[DIR_00M])[kb   ] = computeOutflowDistribution(f, f1, DIR_00M, cs);
+         (dist.f[DIR_P0M])[kbe  ] = computeOutflowDistribution(f, f1, DIR_P0M, cs);
+         (dist.f[DIR_M0M])[kbw  ] = computeOutflowDistribution(f, f1, DIR_M0M, cs);
+         (dist.f[DIR_0PM])[kbn  ] = computeOutflowDistribution(f, f1, DIR_0PM, cs);
+         (dist.f[DIR_0MM])[kbs  ] = computeOutflowDistribution(f, f1, DIR_0MM, cs);
+         (dist.f[DIR_PPM])[kbne ] = computeOutflowDistribution(f, f1, DIR_PPM, cs);
+         (dist.f[DIR_MPM])[kbnw ] = computeOutflowDistribution(f, f1, DIR_MPM, cs);
+         (dist.f[DIR_PMM])[kbse ] = computeOutflowDistribution(f, f1, DIR_PMM, cs);
+         (dist.f[DIR_MMM])[kbsw ] = computeOutflowDistribution(f, f1, DIR_MMM, cs);
+         break;
+      default: // any other direction value: nothing is written
+         break;
+   }
+}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 
-      //real vx1, vx2, vx3, drho;
-      //real vx1, vx2, vx3, drho, drho1;
-      //////////////////////////////////////////////////////////////////////////
-	  //Dichte
-    //   drho1  =  f1_TSE + f1_TNW + f1_TNE + f1_TSW + f1_BSE + f1_BNW + f1_BNE + f1_BSW +
-    //             f1_BN + f1_TS + f1_TN + f1_BS + f1_BE + f1_TW + f1_TE + f1_BW + f1_SE + f1_NW + f1_NE + f1_SW + 
-    //             f1_T + f1_B + f1_N + f1_S + f1_E + f1_W + ((D.f[DIR_000])[k1zero]); 
-    //   drho   =  f_TSE + f_TNW + f_TNE + f_TSW + f_BSE + f_BNW + f_BNE + f_BSW +
-    //             f_BN + f_TS + f_TN + f_BS + f_BE + f_TW + f_TE + f_BW + f_SE + f_NW + f_NE + f_SW + 
-    //             f_T + f_B + f_N + f_S + f_E + f_W + ((D.f[DIR_000])[kzero]); 
-      
-      //////////////////////////////////////////////////////////////////////////
-	  //Ux
 
-	  //vx1    =  (((f_TSE - f_BNW) - (f_TNW - f_BSE)) + ((f_TNE - f_BSW) - (f_TSW - f_BNE)) +
-   //               ((f_BE - f_TW)   + (f_TE - f_BW))   + ((f_SE - f_NW)   + (f_NE - f_SW)) +
-   //               (f_E - f_W)) /(one + drho); 
 
 
-   //   vx2    =   ((-(f_TSE - f_BNW) + (f_TNW - f_BSE)) + ((f_TNE - f_BSW) - (f_TSW - f_BNE)) +
-   //               ((f_BN - f_TS)   + (f_TN - f_BS))    + (-(f_SE - f_NW)  + (f_NE - f_SW)) +
-   //               (f_N - f_S)) /(one + drho); 
 
-   //   vx3    =   (((f_TSE - f_BNW) + (f_TNW - f_BSE)) + ((f_TNE - f_BSW) + (f_TSW - f_BNE)) +
-   //               (-(f_BN - f_TS)  + (f_TN - f_BS))   + ((f_TE - f_BW)   - (f_BE - f_TW)) +
-   //               (f_T - f_B)) /(one + drho); 
 
 
-      //real cu_sq=c3o2*(vx1*vx1+vx2*vx2+vx3*vx3);
 
-   //   //////////////////////////////////////////////////////////////////////////
-	  ////real omega = om1;
-   //   real cusq  = c3o2*(vx1*vx1+vx2*vx2+vx3*vx3);
-   //   //////////////////////////////////////////////////////////////////////////
-	  ////T�st MK
-	  ////if(vx1 < zero) vx1 = zero;
-   //   //////////////////////////////////////////////////////////////////////////
-   //   real fZERO = c8over27*  (drho1-(one + drho1)*(cusq))                                                           ;
-   //   real fE    = c2over27*  (drho1+(one + drho1)*(three*( vx1        )+c9over2*( vx1        )*( vx1        )-cusq));
-   //   real fW    = c2over27*  (drho1+(one + drho1)*(three*(-vx1        )+c9over2*(-vx1        )*(-vx1        )-cusq));
-   //   real fN    = c2over27*  (drho1+(one + drho1)*(three*(    vx2     )+c9over2*(     vx2    )*(     vx2    )-cusq));
-   //   real fS    = c2over27*  (drho1+(one + drho1)*(three*(   -vx2     )+c9over2*(    -vx2    )*(    -vx2    )-cusq));
-   //   real fT    = c2over27*  (drho1+(one + drho1)*(three*(         vx3)+c9over2*(         vx3)*(         vx3)-cusq));
-   //   real fB    = c2over27*  (drho1+(one + drho1)*(three*(        -vx3)+c9over2*(        -vx3)*(        -vx3)-cusq));
-   //   real fNE   = c1over54*  (drho1+(one + drho1)*(three*( vx1+vx2    )+c9over2*( vx1+vx2    )*( vx1+vx2    )-cusq));
-   //   real fSW   = c1over54*  (drho1+(one + drho1)*(three*(-vx1-vx2    )+c9over2*(-vx1-vx2    )*(-vx1-vx2    )-cusq));
-   //   real fSE   = c1over54*  (drho1+(one + drho1)*(three*( vx1-vx2    )+c9over2*( vx1-vx2    )*( vx1-vx2    )-cusq));
-   //   real fNW   = c1over54*  (drho1+(one + drho1)*(three*(-vx1+vx2    )+c9over2*(-vx1+vx2    )*(-vx1+vx2    )-cusq));
-   //   real fTE   = c1over54*  (drho1+(one + drho1)*(three*( vx1    +vx3)+c9over2*( vx1    +vx3)*( vx1    +vx3)-cusq));
-   //   real fBW   = c1over54*  (drho1+(one + drho1)*(three*(-vx1    -vx3)+c9over2*(-vx1    -vx3)*(-vx1    -vx3)-cusq));
-   //   real fBE   = c1over54*  (drho1+(one + drho1)*(three*( vx1    -vx3)+c9over2*( vx1    -vx3)*( vx1    -vx3)-cusq));
-   //   real fTW   = c1over54*  (drho1+(one + drho1)*(three*(-vx1    +vx3)+c9over2*(-vx1    +vx3)*(-vx1    +vx3)-cusq));
-   //   real fTN   = c1over54*  (drho1+(one + drho1)*(three*(     vx2+vx3)+c9over2*(     vx2+vx3)*(     vx2+vx3)-cusq));
-   //   real fBS   = c1over54*  (drho1+(one + drho1)*(three*(    -vx2-vx3)+c9over2*(    -vx2-vx3)*(    -vx2-vx3)-cusq));
-   //   real fBN   = c1over54*  (drho1+(one + drho1)*(three*(     vx2-vx3)+c9over2*(     vx2-vx3)*(     vx2-vx3)-cusq));
-   //   real fTS   = c1over54*  (drho1+(one + drho1)*(three*(    -vx2+vx3)+c9over2*(    -vx2+vx3)*(    -vx2+vx3)-cusq));
-   //   real fTNE  = c1over216* (drho1+(one + drho1)*(three*( vx1+vx2+vx3)+c9over2*( vx1+vx2+vx3)*( vx1+vx2+vx3)-cusq));
-   //   real fBSW  = c1over216* (drho1+(one + drho1)*(three*(-vx1-vx2-vx3)+c9over2*(-vx1-vx2-vx3)*(-vx1-vx2-vx3)-cusq));
-   //   real fBNE  = c1over216* (drho1+(one + drho1)*(three*( vx1+vx2-vx3)+c9over2*( vx1+vx2-vx3)*( vx1+vx2-vx3)-cusq));
-   //   real fTSW  = c1over216* (drho1+(one + drho1)*(three*(-vx1-vx2+vx3)+c9over2*(-vx1-vx2+vx3)*(-vx1-vx2+vx3)-cusq));
-   //   real fTSE  = c1over216* (drho1+(one + drho1)*(three*( vx1-vx2+vx3)+c9over2*( vx1-vx2+vx3)*( vx1-vx2+vx3)-cusq));
-   //   real fBNW  = c1over216* (drho1+(one + drho1)*(three*(-vx1+vx2-vx3)+c9over2*(-vx1+vx2-vx3)*(-vx1+vx2-vx3)-cusq));
-   //   real fBSE  = c1over216* (drho1+(one + drho1)*(three*( vx1-vx2-vx3)+c9over2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cusq));
-   //   real fTNW  = c1over216* (drho1+(one + drho1)*(three*(-vx1+vx2+vx3)+c9over2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cusq));
-
-	  real cs = c1o1 / sqrtf(c3o1);
-	  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	  //no velocity
-	  //////////////////////////////////////////
-      f_E    = f1_E   * cs + (c1o1 - cs) * f_E   ;
-      f_W    = f1_W   * cs + (c1o1 - cs) * f_W   ;
-      f_N    = f1_N   * cs + (c1o1 - cs) * f_N   ;
-      f_S    = f1_S   * cs + (c1o1 - cs) * f_S   ;
-      f_T    = f1_T   * cs + (c1o1 - cs) * f_T   ;
-      f_B    = f1_B   * cs + (c1o1 - cs) * f_B   ;
-      f_NE   = f1_NE  * cs + (c1o1 - cs) * f_NE  ;
-      f_SW   = f1_SW  * cs + (c1o1 - cs) * f_SW  ;
-      f_SE   = f1_SE  * cs + (c1o1 - cs) * f_SE  ;
-      f_NW   = f1_NW  * cs + (c1o1 - cs) * f_NW  ;
-      f_TE   = f1_TE  * cs + (c1o1 - cs) * f_TE  ;
-      f_BW   = f1_BW  * cs + (c1o1 - cs) * f_BW  ;
-      f_BE   = f1_BE  * cs + (c1o1 - cs) * f_BE  ;
-      f_TW   = f1_TW  * cs + (c1o1 - cs) * f_TW  ;
-      f_TN   = f1_TN  * cs + (c1o1 - cs) * f_TN  ;
-      f_BS   = f1_BS  * cs + (c1o1 - cs) * f_BS  ;
-      f_BN   = f1_BN  * cs + (c1o1 - cs) * f_BN  ;
-      f_TS   = f1_TS  * cs + (c1o1 - cs) * f_TS  ;
-      f_TNE  = f1_TNE * cs + (c1o1 - cs) * f_TNE ;
-      f_TSW  = f1_TSW * cs + (c1o1 - cs) * f_TSW ;
-      f_TSE  = f1_TSE * cs + (c1o1 - cs) * f_TSE ;
-      f_TNW  = f1_TNW * cs + (c1o1 - cs) * f_TNW ;
-      f_BNE  = f1_BNE * cs + (c1o1 - cs) * f_BNE ;
-      f_BSW  = f1_BSW * cs + (c1o1 - cs) * f_BSW ;
-      f_BSE  = f1_BSE * cs + (c1o1 - cs) * f_BSE ;
-      f_BNW  = f1_BNW * cs + (c1o1 - cs) * f_BNW ;
-	  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-	  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	  //with velocity
-	  //if(true){//vx1 >= zero){
-		 // real csMvx = one / sqrtf(three) - vx1;
-		 // //real csMvy = one / sqrtf(three) - vx2;
-		 // ///////////////////////////////////////////
-		 // // X
-		 // f_W   = f1_W   * csMvx + (one - csMvx) * f_W   ;//- c2over27  * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx1);
-		 // f_NW  = f1_NW  * csMvx + (one - csMvx) * f_NW  ;//- c1over54  * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx1);
-		 // f_SW  = f1_SW  * csMvx + (one - csMvx) * f_SW  ;//- c1over54  * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx1);
-		 // f_TW  = f1_TW  * csMvx + (one - csMvx) * f_TW  ;//- c1over54  * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx1);
-		 // f_BW  = f1_BW  * csMvx + (one - csMvx) * f_BW  ;//- c1over54  * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx1);
-		 // f_TNW = f1_TNW * csMvx + (one - csMvx) * f_TNW ;//- c1over216 * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx1);
-		 // f_TSW = f1_TSW * csMvx + (one - csMvx) * f_TSW ;//- c1over216 * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx1);
-		 // f_BNW = f1_BNW * csMvx + (one - csMvx) * f_BNW ;//- c1over216 * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx1);
-		 // f_BSW = f1_BSW * csMvx + (one - csMvx) * f_BSW ;//- c1over216 * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx1);
-		 // ///////////////////////////////////////////
-		 // // Y
-		 // //f_S   = f1_S   * csMvy + (one - csMvy) * f_S   ;//- c2over27  * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx2);
-		 // //f_SE  = f1_SE  * csMvy + (one - csMvy) * f_SE  ;//- c1over54  * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx2);
-		 // //f_SW  = f1_SW  * csMvy + (one - csMvy) * f_SW  ;//- c1over54  * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx2);
-		 // //f_TS  = f1_TS  * csMvy + (one - csMvy) * f_TS  ;//- c1over54  * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx2);
-		 // //f_BS  = f1_BS  * csMvy + (one - csMvy) * f_BS  ;//- c1over54  * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx2);
-		 // //f_TSE = f1_TSE * csMvy + (one - csMvy) * f_TSE ;//- c1over216 * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx2);
-		 // //f_TSW = f1_TSW * csMvy + (one - csMvy) * f_TSW ;//- c1over216 * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx2);
-		 // //f_BSE = f1_BSE * csMvy + (one - csMvy) * f_BSE ;//- c1over216 * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx2);
-		 // //f_BSW = f1_BSW * csMvy + (one - csMvy) * f_BSW ;//- c1over216 * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx2);
-		 // //f_S   = f1_S   * csMvy + (one - csMvy) * f_S;
-		 // //f_SE  = f1_SE  * csMvy + (one - csMvy) * f_SE;
-		 // //f_SW  = f1_SW  * csMvy + (one - csMvy) * f_SW;
-		 // //f_TS  = f1_TS  * csMvy + (one - csMvy) * f_TS;
-		 // //f_BS  = f1_BS  * csMvy + (one - csMvy) * f_BS;
-		 // //f_TSE = f1_TSE * csMvy + (one - csMvy) * f_TSE;
-		 // //f_TSW = f1_TSW * csMvy + (one - csMvy) * f_TSW;
-		 // //f_BSE = f1_BSE * csMvy + (one - csMvy) * f_BSE;
-		 // //f_BSW = f1_BSW * csMvy + (one - csMvy) * f_BSW;
-		 // //////////////////////////////////////////////////////////////////////////
-	  //}
-	  //else
-	  //{
-		 // ///////////////////////////////////////////
-		 // // X
-		 // vx1   = vx1 * 0.9;
-		 // f_W   = f_E   - six * c2over27  * ( vx1        );
-		 // f_NW  = f_SE  - six * c1over54  * ( vx1-vx2    );
-		 // f_SW  = f_NE  - six * c1over54  * ( vx1+vx2    );
-		 // f_TW  = f_BE  - six * c1over54  * ( vx1    -vx3);
-		 // f_BW  = f_TE  - six * c1over54  * ( vx1    +vx3);
-		 // f_TNW = f_BSE - six * c1over216 * ( vx1-vx2-vx3);
-		 // f_TSW = f_BNE - six * c1over216 * ( vx1+vx2-vx3);
-		 // f_BNW = f_TSE - six * c1over216 * ( vx1-vx2+vx3);
-		 // f_BSW = f_TNE - six * c1over216 * ( vx1+vx2+vx3);
-		 // ///////////////////////////////////////////
-		 // // Y
-		 // //vx2   = vx2 * 0.9;
-		 // //f_S   = f_N   - six * c2over27  * (     vx2    );
-		 // //f_SE  = f_NW  - six * c1over54  * (-vx1+vx2    );
-		 // //f_SW  = f_NE  - six * c1over54  * ( vx1+vx2    );
-		 // //f_TS  = f_BN  - six * c1over54  * (     vx2-vx3);
-		 // //f_BS  = f_TN  - six * c1over54  * (     vx2+vx3);
-		 // //f_TSE = f_BNW - six * c1over216 * (-vx1+vx2-vx3);
-		 // //f_TSW = f_BNE - six * c1over216 * ( vx1+vx2-vx3);
-		 // //f_BSE = f_TNW - six * c1over216 * (-vx1+vx2+vx3);
-		 // //f_BSW = f_TNE - six * c1over216 * ( vx1+vx2+vx3);
-		 // ///////////////////////////////////////////
-	  //}
-	  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-	  //////////////////////////////////////////////////////////////////////////
-      if (isEvenTimestep==false)
-      {
-         D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
-      } 
-      else
-      {
-         D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
-      }
-      //////////////////////////////////////////////////////////////////////////
-      //__syncthreads();
-	  // -X
-	  //(D.f[DIR_P00   ])[ke   ] = f_E   ;
-	  //(D.f[DIR_PM0  ])[kse  ] = f_SE  ;
-	  //(D.f[DIR_PP0  ])[kne  ] = f_NE  ;
-	  //(D.f[DIR_P0M  ])[kbe  ] = f_BE  ;
-	  //(D.f[DIR_P0P  ])[kte  ] = f_TE  ;
-	  //(D.f[DIR_PMP ])[ktse ] = f_TSE ;
-	  //(D.f[DIR_PPP ])[ktne ] = f_TNE ;
-	  //(D.f[DIR_PMM ])[kbse ] = f_BSE ;
-	  //(D.f[DIR_PPM ])[kbne ] = f_BNE ;     
-	  // X
-	  (D.f[DIR_M00   ])[kw   ] = f_W   ;
-	  (D.f[DIR_MM0  ])[ksw  ] = f_SW  ;
-	  (D.f[DIR_MP0  ])[knw  ] = f_NW  ;
-	  (D.f[DIR_M0M  ])[kbw  ] = f_BW  ;
-	  (D.f[DIR_M0P  ])[ktw  ] = f_TW  ;
-	  (D.f[DIR_MMP ])[ktsw ] = f_TSW ;
-	  (D.f[DIR_MPP ])[ktnw ] = f_TNW ;
-	  (D.f[DIR_MMM ])[kbsw ] = f_BSW ;
-	  (D.f[DIR_MPM ])[kbnw ] = f_BNW ;     
-	  // Y
-	  //(D.f[DIR_0M0   ])[ks   ] = f_S   ;
-	  //(D.f[DIR_PM0  ])[kse  ] = f_SE  ;
-	  //(D.f[DIR_MM0  ])[ksw  ] = f_SW  ;
-	  //(D.f[DIR_0MP  ])[kts  ] = f_TS  ;
-	  //(D.f[DIR_0MM  ])[kbs  ] = f_BS  ;
-	  //(D.f[DIR_PMP ])[ktse ] = f_TSE ;
-	  //(D.f[DIR_MMP ])[ktsw ] = f_TSW ;
-	  //(D.f[DIR_PMM ])[kbse ] = f_BSE ;
-	  //(D.f[DIR_MMM ])[kbsw ] = f_BSW ;     
-	  // Z
-	  //(D.f[DIR_00M   ])[kb   ] = f_B   ;
-	  //(D.f[DIR_P0M  ])[kbe  ] = f_BE  ;
-	  //(D.f[DIR_M0M  ])[kbw  ] = f_BW  ;
-	  //(D.f[DIR_0PM  ])[kbn  ] = f_BN  ;
-	  //(D.f[DIR_0MM  ])[kbs  ] = f_BS  ;
-	  //(D.f[DIR_PPM ])[kbne ] = f_BNE ;
-	  //(D.f[DIR_MPM ])[kbnw ] = f_BNW ;
-	  //(D.f[DIR_PMM ])[kbse ] = f_BSE ;
-	  //(D.f[DIR_MMM ])[kbsw ] = f_BSW ;     
-      //////////////////////////////////////////////////////////////////////////
-   }
-}
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+__host__ __device__ real computeOutflowDistribution(const real* const &f, const real* const &f1, const int dir, const real rhoCorrection, const real cs, const real weight)
+{  // Returns the blend of f1[dir] (factor cs) and f[dir] (factor c1o1 - cs), reduced by weight * rhoCorrection.
+   return f1[dir  ] * cs + (c1o1 - cs) * f[dir  ] - weight *rhoCorrection;
+}
+
+__global__ void QPressZeroRhoOutflowDevice27(
+    real* rhoBC,
+    real* distributions,
+    int* k_Q,
+    int* k_N,
+    int numberOfBCnodes,
+    real om1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep,
+    int direction,
+    real densityCorrectionFactor)
+{
+   //! Outflow pressure BC: for the boundary side selected by direction, nine populations of node k_Q[nodeIndex] are
+   //! rewritten as a blend of that node's value and the value at node k_N[nodeIndex]. rhoBC and om1 are not used here.
+   //! - Get the node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
+   const unsigned nodeIndex = getNodeIndex();
 
+   //! - Threads whose index lies beyond the number of boundary nodes have nothing to do.
 
+   if( nodeIndex >= numberOfBCnodes ) return;
 
+   //! - Indices of the boundary node and of the nodes reached from it through the
+   //!   neighborX / neighborY / neighborZ lists.
 
+   uint k_000 = k_Q[nodeIndex];
+   uint k_M00 = neighborX[k_000];
+   uint k_0M0 = neighborY[k_000];
+   uint k_00M = neighborZ[k_000];
+   uint k_MM0 = neighborY[k_M00];
+   uint k_M0M = neighborZ[k_M00];
+   uint k_0MM = neighborZ[k_0M0];
+   uint k_MMM = neighborZ[k_MM0];
 
+   //! - Same index set for the neighbor node k_N[nodeIndex]. Every entry has to be derived from kN_000
+   //!   (not from k_000); otherwise fN would partly hold the boundary node's own populations.
+   uint kN_000 = k_N[nodeIndex];
+   uint kN_M00 = neighborX[kN_000];
+   uint kN_0M0 = neighborY[kN_000];
+   uint kN_00M = neighborZ[kN_000];
+   uint kN_MM0 = neighborY[kN_M00];
+   uint kN_M0M = neighborZ[kN_M00];
+   uint kN_0MM = neighborZ[kN_0M0];
+   uint kN_MMM = neighborZ[kN_MM0];
+   //! - Read all 27 populations of the boundary node (f) and of the neighbor node (fN).
+   Distributions27 dist;
+   getPointersToDistributions(dist, distributions, numberOfLBnodes, isEvenTimestep);
+   real f[27], fN[27];
+   //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+   f[DIR_000] = (dist.f[DIR_000])[k_000];
+   f[DIR_P00] = (dist.f[DIR_P00])[k_000];
+   f[DIR_M00] = (dist.f[DIR_M00])[k_M00];
+   f[DIR_0P0] = (dist.f[DIR_0P0])[k_000];
+   f[DIR_0M0] = (dist.f[DIR_0M0])[k_0M0];
+   f[DIR_00P] = (dist.f[DIR_00P])[k_000];
+   f[DIR_00M] = (dist.f[DIR_00M])[k_00M];
+   f[DIR_PP0] = (dist.f[DIR_PP0])[k_000];
+   f[DIR_MM0] = (dist.f[DIR_MM0])[k_MM0];
+   f[DIR_PM0] = (dist.f[DIR_PM0])[k_0M0];
+   f[DIR_MP0] = (dist.f[DIR_MP0])[k_M00];
+   f[DIR_P0P] = (dist.f[DIR_P0P])[k_000];
+   f[DIR_M0M] = (dist.f[DIR_M0M])[k_M0M];
+   f[DIR_P0M] = (dist.f[DIR_P0M])[k_00M];
+   f[DIR_M0P] = (dist.f[DIR_M0P])[k_M00];
+   f[DIR_0PP] = (dist.f[DIR_0PP])[k_000];
+   f[DIR_0MM] = (dist.f[DIR_0MM])[k_0MM];
+   f[DIR_0PM] = (dist.f[DIR_0PM])[k_00M];
+   f[DIR_0MP] = (dist.f[DIR_0MP])[k_0M0];
+   f[DIR_PPP] = (dist.f[DIR_PPP])[k_000];
+   f[DIR_MPP] = (dist.f[DIR_MPP])[k_M00];
+   f[DIR_PMP] = (dist.f[DIR_PMP])[k_0M0];
+   f[DIR_MMP] = (dist.f[DIR_MMP])[k_MM0];
+   f[DIR_PPM] = (dist.f[DIR_PPM])[k_00M];
+   f[DIR_MPM] = (dist.f[DIR_MPM])[k_M0M];
+   f[DIR_PMM] = (dist.f[DIR_PMM])[k_0MM];
+   f[DIR_MMM] = (dist.f[DIR_MMM])[k_MMM];
+   //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+   fN[DIR_000] = (dist.f[DIR_000])[kN_000];
+   fN[DIR_P00] = (dist.f[DIR_P00])[kN_000];
+   fN[DIR_M00] = (dist.f[DIR_M00])[kN_M00];
+   fN[DIR_0P0] = (dist.f[DIR_0P0])[kN_000];
+   fN[DIR_0M0] = (dist.f[DIR_0M0])[kN_0M0];
+   fN[DIR_00P] = (dist.f[DIR_00P])[kN_000];
+   fN[DIR_00M] = (dist.f[DIR_00M])[kN_00M];
+   fN[DIR_PP0] = (dist.f[DIR_PP0])[kN_000];
+   fN[DIR_MM0] = (dist.f[DIR_MM0])[kN_MM0];
+   fN[DIR_PM0] = (dist.f[DIR_PM0])[kN_0M0];
+   fN[DIR_MP0] = (dist.f[DIR_MP0])[kN_M00];
+   fN[DIR_P0P] = (dist.f[DIR_P0P])[kN_000];
+   fN[DIR_M0M] = (dist.f[DIR_M0M])[kN_M0M];
+   fN[DIR_P0M] = (dist.f[DIR_P0M])[kN_00M];
+   fN[DIR_M0P] = (dist.f[DIR_M0P])[kN_M00];
+   fN[DIR_0PP] = (dist.f[DIR_0PP])[kN_000];
+   fN[DIR_0MM] = (dist.f[DIR_0MM])[kN_0MM];
+   fN[DIR_0PM] = (dist.f[DIR_0PM])[kN_00M];
+   fN[DIR_0MP] = (dist.f[DIR_0MP])[kN_0M0];
+   fN[DIR_PPP] = (dist.f[DIR_PPP])[kN_000];
+   fN[DIR_MPP] = (dist.f[DIR_MPP])[kN_M00];
+   fN[DIR_PMP] = (dist.f[DIR_PMP])[kN_0M0];
+   fN[DIR_MMP] = (dist.f[DIR_MMP])[kN_MM0];
+   fN[DIR_PPM] = (dist.f[DIR_PPM])[kN_00M];
+   fN[DIR_MPM] = (dist.f[DIR_MPM])[kN_M0M];
+   fN[DIR_PMM] = (dist.f[DIR_PMM])[kN_0MM];
+   fN[DIR_MMM] = (dist.f[DIR_MMM])[kN_MMM];
+   //! - The value vf::lbm::getDensity returns for f is scaled by densityCorrectionFactor; cs = 1/sqrt(3) is the blend factor.
+   real drho = vf::lbm::getDensity(f);
 
+   real rhoCorrection = densityCorrectionFactor*drho;
 
+   real cs = c1o1 / sqrtf(c3o1);
 
+   getPointersToDistributions(dist, distributions, numberOfLBnodes, !isEvenTimestep); // writes use the pointer set of the opposite timestep parity
 
+   switch(direction) // each case writes only the nine populations listed for that side
+   {
+      case MZZ:
+         (dist.f[DIR_P00])[k_000] = computeOutflowDistribution(f, fN, DIR_P00, rhoCorrection, cs, c2o27);
+         (dist.f[DIR_PM0])[k_0M0] = computeOutflowDistribution(f, fN, DIR_PM0, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_PP0])[k_000] = computeOutflowDistribution(f, fN, DIR_PP0, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_P0M])[k_00M] = computeOutflowDistribution(f, fN, DIR_P0M, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_P0P])[k_000] = computeOutflowDistribution(f, fN, DIR_P0P, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_PMP])[k_0M0] = computeOutflowDistribution(f, fN, DIR_PMP, rhoCorrection, cs, c1o216);
+         (dist.f[DIR_PPP])[k_000] = computeOutflowDistribution(f, fN, DIR_PPP, rhoCorrection, cs, c1o216);
+         (dist.f[DIR_PMM])[k_0MM] = computeOutflowDistribution(f, fN, DIR_PMM, rhoCorrection, cs, c1o216);
+         (dist.f[DIR_PPM])[k_00M] = computeOutflowDistribution(f, fN, DIR_PPM, rhoCorrection, cs, c1o216);
+         break;
+
+      case PZZ:
+         (dist.f[DIR_M00])[k_M00] = computeOutflowDistribution(f, fN, DIR_M00, rhoCorrection, cs, c2o27);
+         (dist.f[DIR_MM0])[k_MM0] = computeOutflowDistribution(f, fN, DIR_MM0, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_MP0])[k_M00] = computeOutflowDistribution(f, fN, DIR_MP0, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_M0M])[k_M0M] = computeOutflowDistribution(f, fN, DIR_M0M, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_M0P])[k_M00] = computeOutflowDistribution(f, fN, DIR_M0P, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_MMP])[k_MM0] = computeOutflowDistribution(f, fN, DIR_MMP, rhoCorrection, cs, c1o216);
+         (dist.f[DIR_MPP])[k_M00] = computeOutflowDistribution(f, fN, DIR_MPP, rhoCorrection, cs, c1o216);
+         (dist.f[DIR_MMM])[k_MMM] = computeOutflowDistribution(f, fN, DIR_MMM, rhoCorrection, cs, c1o216);
+         (dist.f[DIR_MPM])[k_M0M] = computeOutflowDistribution(f, fN, DIR_MPM, rhoCorrection, cs, c1o216);
+         break;
+
+      case ZMZ:
+         (dist.f[DIR_0P0])[k_000] = computeOutflowDistribution(f, fN, DIR_0P0, rhoCorrection, cs, c2o27);
+         (dist.f[DIR_PP0])[k_000] = computeOutflowDistribution(f, fN, DIR_PP0, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_MP0])[k_M00] = computeOutflowDistribution(f, fN, DIR_MP0, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_0PP])[k_000] = computeOutflowDistribution(f, fN, DIR_0PP, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_0PM])[k_00M] = computeOutflowDistribution(f, fN, DIR_0PM, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_PPP])[k_000] = computeOutflowDistribution(f, fN, DIR_PPP, rhoCorrection, cs, c1o216);
+         (dist.f[DIR_MPP])[k_M00] = computeOutflowDistribution(f, fN, DIR_MPP, rhoCorrection, cs, c1o216);
+         (dist.f[DIR_PPM])[k_00M] = computeOutflowDistribution(f, fN, DIR_PPM, rhoCorrection, cs, c1o216);
+         (dist.f[DIR_MPM])[k_M0M] = computeOutflowDistribution(f, fN, DIR_MPM, rhoCorrection, cs, c1o216);
+         break;
+
+      case ZPZ:
+         (dist.f[DIR_0M0])[k_0M0] = computeOutflowDistribution(f, fN, DIR_0M0, rhoCorrection, cs, c2o27);
+         (dist.f[DIR_PM0])[k_0M0] = computeOutflowDistribution(f, fN, DIR_PM0, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_MM0])[k_MM0] = computeOutflowDistribution(f, fN, DIR_MM0, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_0MP])[k_0M0] = computeOutflowDistribution(f, fN, DIR_0MP, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_0MM])[k_0MM] = computeOutflowDistribution(f, fN, DIR_0MM, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_PMP])[k_0M0] = computeOutflowDistribution(f, fN, DIR_PMP, rhoCorrection, cs, c1o216);
+         (dist.f[DIR_MMP])[k_MM0] = computeOutflowDistribution(f, fN, DIR_MMP, rhoCorrection, cs, c1o216);
+         (dist.f[DIR_PMM])[k_0MM] = computeOutflowDistribution(f, fN, DIR_PMM, rhoCorrection, cs, c1o216);
+         (dist.f[DIR_MMM])[k_MMM] = computeOutflowDistribution(f, fN, DIR_MMM, rhoCorrection, cs, c1o216);
+         break;
+
+      case ZZM:
+         (dist.f[DIR_00P])[k_000] = computeOutflowDistribution(f, fN, DIR_00P, rhoCorrection, cs, c2o27);
+         (dist.f[DIR_P0P])[k_000] = computeOutflowDistribution(f, fN, DIR_P0P, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_M0P])[k_M00] = computeOutflowDistribution(f, fN, DIR_M0P, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_0PP])[k_000] = computeOutflowDistribution(f, fN, DIR_0PP, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_0MP])[k_0M0] = computeOutflowDistribution(f, fN, DIR_0MP, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_PPP])[k_000] = computeOutflowDistribution(f, fN, DIR_PPP, rhoCorrection, cs, c1o216);
+         (dist.f[DIR_MPP])[k_M00] = computeOutflowDistribution(f, fN, DIR_MPP, rhoCorrection, cs, c1o216);
+         (dist.f[DIR_PMP])[k_0M0] = computeOutflowDistribution(f, fN, DIR_PMP, rhoCorrection, cs, c1o216);
+         (dist.f[DIR_MMP])[k_MM0] = computeOutflowDistribution(f, fN, DIR_MMP, rhoCorrection, cs, c1o216);
+         break;
+
+      case ZZP:
+         (dist.f[DIR_00M])[k_00M] = computeOutflowDistribution(f, fN, DIR_00M, rhoCorrection, cs, c2o27);
+         (dist.f[DIR_P0M])[k_00M] = computeOutflowDistribution(f, fN, DIR_P0M, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_M0M])[k_M0M] = computeOutflowDistribution(f, fN, DIR_M0M, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_0PM])[k_00M] = computeOutflowDistribution(f, fN, DIR_0PM, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_0MM])[k_0MM] = computeOutflowDistribution(f, fN, DIR_0MM, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_PPM])[k_00M] = computeOutflowDistribution(f, fN, DIR_PPM, rhoCorrection, cs, c1o216);
+         (dist.f[DIR_MPM])[k_M0M] = computeOutflowDistribution(f, fN, DIR_MPM, rhoCorrection, cs, c1o216);
+         (dist.f[DIR_PMM])[k_0MM] = computeOutflowDistribution(f, fN, DIR_PMM, rhoCorrection, cs, c1o216);
+         (dist.f[DIR_MMM])[k_MMM] = computeOutflowDistribution(f, fN, DIR_MMM, rhoCorrection, cs, c1o216);
+         break;
+      default:
+         break;
+   }
+}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
 
 
@@ -3314,22 +3327,23 @@ __global__ void QPressNoRhoDevice27(  real* rhoBC,
 
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-__global__ void QPressDeviceOld27(real* rhoBC,
-                                             real* DD, 
-                                             int* k_Q, 
-                                             int* k_N, 
-                                             int numberOfBCnodes, 
-                                             real om1, 
-                                             unsigned int* neighborX,
-                                             unsigned int* neighborY,
-                                             unsigned int* neighborZ,
-                                             unsigned int size_Mat, 
-                                             bool isEvenTimestep)
+__global__ void QPressDeviceOld27(
+    real* rhoBC,
+    real* DD,
+    int* k_Q,
+    int* k_N,
+    int numberOfBCnodes,
+    real om1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
 {
    ////////////////////////////////////////////////////////////////////////////////
-   const unsigned  x = threadIdx.x;  // Globaler x-Index 
-   const unsigned  y = blockIdx.x;   // Globaler y-Index 
-   const unsigned  z = blockIdx.y;   // Globaler z-Index 
+   const unsigned  x = threadIdx.x;  // Globaler x-Index
+   const unsigned  y = blockIdx.x;   // Globaler y-Index
+   const unsigned  z = blockIdx.y;   // Globaler z-Index
 
    const unsigned nx = blockDim.x;
    const unsigned ny = gridDim.x;
@@ -3403,133 +3417,133 @@ __global__ void QPressDeviceOld27(real* rhoBC,
       Distributions27 D;
       if (isEvenTimestep==false)
       {
-         D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
-      } 
+         D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
+      }
       else
       {
-         D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+         D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
       }
       //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       real        f1_E,f1_W,f1_N,f1_S,f1_T,f1_B,f1_NE,f1_SW,f1_SE,f1_NW,f1_TE,f1_BW,f1_BE,f1_TW,f1_TN,f1_BS,f1_BN,f1_TS,f1_ZERO,
                      f1_TNE,f1_TSW,f1_TSE,f1_TNW,f1_BNE,f1_BSW,f1_BSE,f1_BNW;
 
-      f1_W    = (D.f[DIR_P00   ])[k1e   ];
-      f1_E    = (D.f[DIR_M00   ])[k1w   ];
-      f1_S    = (D.f[DIR_0P0   ])[k1n   ];
-      f1_N    = (D.f[DIR_0M0   ])[k1s   ];
-      f1_B    = (D.f[DIR_00P   ])[k1t   ];
-      f1_T    = (D.f[DIR_00M   ])[k1b   ];
-      f1_SW   = (D.f[DIR_PP0  ])[k1ne  ];
-      f1_NE   = (D.f[DIR_MM0  ])[k1sw  ];
-      f1_NW   = (D.f[DIR_PM0  ])[k1se  ];
-      f1_SE   = (D.f[DIR_MP0  ])[k1nw  ];
-      f1_BW   = (D.f[DIR_P0P  ])[k1te  ];
-      f1_TE   = (D.f[DIR_M0M  ])[k1bw  ];
-      f1_TW   = (D.f[DIR_P0M  ])[k1be  ];
-      f1_BE   = (D.f[DIR_M0P  ])[k1tw  ];
-      f1_BS   = (D.f[DIR_0PP  ])[k1tn  ];
-      f1_TN   = (D.f[DIR_0MM  ])[k1bs  ];
-      f1_TS   = (D.f[DIR_0PM  ])[k1bn  ];
-      f1_BN   = (D.f[DIR_0MP  ])[k1ts  ];
+      f1_W    = (D.f[DIR_P00])[k1e   ];
+      f1_E    = (D.f[DIR_M00])[k1w   ];
+      f1_S    = (D.f[DIR_0P0])[k1n   ];
+      f1_N    = (D.f[DIR_0M0])[k1s   ];
+      f1_B    = (D.f[DIR_00P])[k1t   ];
+      f1_T    = (D.f[DIR_00M])[k1b   ];
+      f1_SW   = (D.f[DIR_PP0])[k1ne  ];
+      f1_NE   = (D.f[DIR_MM0])[k1sw  ];
+      f1_NW   = (D.f[DIR_PM0])[k1se  ];
+      f1_SE   = (D.f[DIR_MP0])[k1nw  ];
+      f1_BW   = (D.f[DIR_P0P])[k1te  ];
+      f1_TE   = (D.f[DIR_M0M])[k1bw  ];
+      f1_TW   = (D.f[DIR_P0M])[k1be  ];
+      f1_BE   = (D.f[DIR_M0P])[k1tw  ];
+      f1_BS   = (D.f[DIR_0PP])[k1tn  ];
+      f1_TN   = (D.f[DIR_0MM])[k1bs  ];
+      f1_TS   = (D.f[DIR_0PM])[k1bn  ];
+      f1_BN   = (D.f[DIR_0MP])[k1ts  ];
       f1_ZERO = (D.f[DIR_000])[k1zero];
-      f1_BSW  = (D.f[DIR_PPP ])[k1tne ];
-      f1_BNE  = (D.f[DIR_MMP ])[k1tsw ];
-      f1_BNW  = (D.f[DIR_PMP ])[k1tse ];
-      f1_BSE  = (D.f[DIR_MPP ])[k1tnw ];
-      f1_TSW  = (D.f[DIR_PPM ])[k1bne ];
-      f1_TNE  = (D.f[DIR_MMM ])[k1bsw ];
-      f1_TNW  = (D.f[DIR_PMM ])[k1bse ];
-      f1_TSE  = (D.f[DIR_MPM ])[k1bnw ];
+      f1_BSW  = (D.f[DIR_PPP])[k1tne ];
+      f1_BNE  = (D.f[DIR_MMP])[k1tsw ];
+      f1_BNW  = (D.f[DIR_PMP])[k1tse ];
+      f1_BSE  = (D.f[DIR_MPP])[k1tnw ];
+      f1_TSW  = (D.f[DIR_PPM])[k1bne ];
+      f1_TNE  = (D.f[DIR_MMM])[k1bsw ];
+      f1_TNW  = (D.f[DIR_PMM])[k1bse ];
+      f1_TSE  = (D.f[DIR_MPM])[k1bnw ];
 
       //////////////////////////////////////////////////////////////////////////
       real drho1    =  f1_ZERO+f1_E+f1_W+f1_N+f1_S+f1_T+f1_B+f1_NE+f1_SW+f1_SE+f1_NW+f1_TE+f1_BW+f1_BE+f1_TW+f1_TN+f1_BS+f1_BN+f1_TS+
                           f1_TNE+f1_TSW+f1_TSE+f1_TNW+f1_BNE+f1_BSW+f1_BSE+f1_BNW;
 
-	  //drho1 = (drho1 + rhoBC[k])/2.f;
-	  drho1 = drho1 - rhoBC[k];
+     //drho1 = (drho1 + rhoBC[k])/2.f;
+     drho1 = drho1 - rhoBC[k];
       //////////////////////////////////////////////////////////////////////////
 
       __syncthreads();
 
-      (D.f[DIR_P00   ])[ke   ] = f1_W   -c2o27*drho1;   //  c1o100;  // zero;  //
-      (D.f[DIR_M00   ])[kw   ] = f1_E   -c2o27*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_0P0   ])[kn   ] = f1_S   -c2o27*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_0M0   ])[ks   ] = f1_N   -c2o27*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_00P   ])[kt   ] = f1_B   -c2o27*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_00M   ])[kb   ] = f1_T   -c2o27*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_PP0  ])[kne  ] = f1_SW  -c1o54*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_MM0  ])[ksw  ] = f1_NE  -c1o54*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_PM0  ])[kse  ] = f1_NW  -c1o54*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_MP0  ])[knw  ] = f1_SE  -c1o54*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_P0P  ])[kte  ] = f1_BW  -c1o54*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_M0M  ])[kbw  ] = f1_TE  -c1o54*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_P0M  ])[kbe  ] = f1_TW  -c1o54*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_M0P  ])[ktw  ] = f1_BE  -c1o54*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_0PP  ])[ktn  ] = f1_BS  -c1o54*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_0MM  ])[kbs  ] = f1_TN  -c1o54*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_0PM  ])[kbn  ] = f1_TS  -c1o54*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_0MP  ])[kts  ] = f1_BN  -c1o54*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_P00])[ke   ] = f1_W   -c2o27*drho1;   //  c1o100;  // zero;  //
+      (D.f[DIR_M00])[kw   ] = f1_E   -c2o27*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_0P0])[kn   ] = f1_S   -c2o27*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_0M0])[ks   ] = f1_N   -c2o27*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_00P])[kt   ] = f1_B   -c2o27*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_00M])[kb   ] = f1_T   -c2o27*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_PP0])[kne  ] = f1_SW  -c1o54*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_MM0])[ksw  ] = f1_NE  -c1o54*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_PM0])[kse  ] = f1_NW  -c1o54*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_MP0])[knw  ] = f1_SE  -c1o54*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_P0P])[kte  ] = f1_BW  -c1o54*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_M0M])[kbw  ] = f1_TE  -c1o54*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_P0M])[kbe  ] = f1_TW  -c1o54*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_M0P])[ktw  ] = f1_BE  -c1o54*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_0PP])[ktn  ] = f1_BS  -c1o54*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_0MM])[kbs  ] = f1_TN  -c1o54*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_0PM])[kbn  ] = f1_TS  -c1o54*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_0MP])[kts  ] = f1_BN  -c1o54*drho1;	//  c1o100;  // zero;  //
       (D.f[DIR_000])[kzero] = f1_ZERO-c8o27*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_PPP ])[ktne ] = f1_BSW -c1o216*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_MMP ])[ktsw ] = f1_BNE -c1o216*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_PMP ])[ktse ] = f1_BNW -c1o216*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_MPP ])[ktnw ] = f1_BSE -c1o216*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_PPM ])[kbne ] = f1_TSW -c1o216*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_MMM ])[kbsw ] = f1_TNE -c1o216*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_PMM ])[kbse ] = f1_TNW -c1o216*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_MPM ])[kbnw ] = f1_TSE -c1o216*drho1;  //  c1o100;  // zero;  //      
+      (D.f[DIR_PPP])[ktne ] = f1_BSW -c1o216*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_MMP])[ktsw ] = f1_BNE -c1o216*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_PMP])[ktse ] = f1_BNW -c1o216*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_MPP])[ktnw ] = f1_BSE -c1o216*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_PPM])[kbne ] = f1_TSW -c1o216*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_MMM])[kbsw ] = f1_TNE -c1o216*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_PMM])[kbse ] = f1_TNW -c1o216*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_MPM])[kbnw ] = f1_TSE -c1o216*drho1;  //  c1o100;  // zero;  //
    }
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -3573,23 +3587,24 @@ __global__ void QPressDeviceOld27(real* rhoBC,
 
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-__global__ void QPressDeviceEQZ27(real* rhoBC,
-                                             real* DD, 
-                                             int* k_Q, 
-                                             int* k_N,
-											 real* kTestRE,
-                                             int numberOfBCnodes, 
-                                             real om1, 
-                                             unsigned int* neighborX,
-                                             unsigned int* neighborY,
-                                             unsigned int* neighborZ,
-                                             unsigned int size_Mat, 
-                                             bool isEvenTimestep)
+__global__ void QPressDeviceEQZ27(
+    real* rhoBC,
+    real* DD,
+    int* k_Q,
+    int* k_N,
+    real* kTestRE,
+    int numberOfBCnodes,
+    real om1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
 {
    ////////////////////////////////////////////////////////////////////////////////
-   const unsigned  x = threadIdx.x;  // Globaler x-Index 
-   const unsigned  y = blockIdx.x;   // Globaler y-Index 
-   const unsigned  z = blockIdx.y;   // Globaler z-Index 
+   const unsigned  x = threadIdx.x;  // Globaler x-Index
+   const unsigned  y = blockIdx.x;   // Globaler y-Index
+   const unsigned  z = blockIdx.y;   // Globaler z-Index
 
    const unsigned nx = blockDim.x;
    const unsigned ny = gridDim.x;
@@ -3663,168 +3678,168 @@ __global__ void QPressDeviceEQZ27(real* rhoBC,
       Distributions27 D;
       if (isEvenTimestep==true)
       {
-         D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
-      } 
+         D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
+      }
       else
       {
-         D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+         D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
       }
       ////////////////////////////////////////////////////////////////////////////////
     //   Distributions27 kDistTest;
-    //      kDistTest.f[DIR_P00   ] = &kTestRE[DIR_P00   *numberOfBCnodes];
-    //      kDistTest.f[DIR_M00   ] = &kTestRE[DIR_M00   *numberOfBCnodes];
-    //      kDistTest.f[DIR_0P0   ] = &kTestRE[DIR_0P0   *numberOfBCnodes];
-    //      kDistTest.f[DIR_0M0   ] = &kTestRE[DIR_0M0   *numberOfBCnodes];
-    //      kDistTest.f[DIR_00P   ] = &kTestRE[DIR_00P   *numberOfBCnodes];
-    //      kDistTest.f[DIR_00M   ] = &kTestRE[DIR_00M   *numberOfBCnodes];
-    //      kDistTest.f[DIR_PP0  ] = &kTestRE[DIR_PP0  *numberOfBCnodes];
-    //      kDistTest.f[DIR_MM0  ] = &kTestRE[DIR_MM0  *numberOfBCnodes];
-    //      kDistTest.f[DIR_PM0  ] = &kTestRE[DIR_PM0  *numberOfBCnodes];
-    //      kDistTest.f[DIR_MP0  ] = &kTestRE[DIR_MP0  *numberOfBCnodes];
-    //      kDistTest.f[DIR_P0P  ] = &kTestRE[DIR_P0P  *numberOfBCnodes];
-    //      kDistTest.f[DIR_M0M  ] = &kTestRE[DIR_M0M  *numberOfBCnodes];
-    //      kDistTest.f[DIR_P0M  ] = &kTestRE[DIR_P0M  *numberOfBCnodes];
-    //      kDistTest.f[DIR_M0P  ] = &kTestRE[DIR_M0P  *numberOfBCnodes];
-    //      kDistTest.f[DIR_0PP  ] = &kTestRE[DIR_0PP  *numberOfBCnodes];
-    //      kDistTest.f[DIR_0MM  ] = &kTestRE[DIR_0MM  *numberOfBCnodes];
-    //      kDistTest.f[DIR_0PM  ] = &kTestRE[DIR_0PM  *numberOfBCnodes];
-    //      kDistTest.f[DIR_0MP  ] = &kTestRE[DIR_0MP  *numberOfBCnodes];
-    //      kDistTest.f[DIR_000] = &kTestRE[DIR_000*numberOfBCnodes];
-    //      kDistTest.f[DIR_PPP ] = &kTestRE[DIR_PPP *numberOfBCnodes];
-    //      kDistTest.f[DIR_MMP ] = &kTestRE[DIR_MMP *numberOfBCnodes];
-    //      kDistTest.f[DIR_PMP ] = &kTestRE[DIR_PMP *numberOfBCnodes];
-    //      kDistTest.f[DIR_MPP ] = &kTestRE[DIR_MPP *numberOfBCnodes];
-    //      kDistTest.f[DIR_PPM ] = &kTestRE[DIR_PPM *numberOfBCnodes];
-    //      kDistTest.f[DIR_MMM ] = &kTestRE[DIR_MMM *numberOfBCnodes];
-    //      kDistTest.f[DIR_PMM ] = &kTestRE[DIR_PMM *numberOfBCnodes];
-    //      kDistTest.f[DIR_MPM ] = &kTestRE[DIR_MPM *numberOfBCnodes];
+    //      kDistTest.f[DIR_P00] = &kTestRE[DIR_P00 * numberOfBCnodes];
+    //      kDistTest.f[DIR_M00] = &kTestRE[DIR_M00 * numberOfBCnodes];
+    //      kDistTest.f[DIR_0P0] = &kTestRE[DIR_0P0 * numberOfBCnodes];
+    //      kDistTest.f[DIR_0M0] = &kTestRE[DIR_0M0 * numberOfBCnodes];
+    //      kDistTest.f[DIR_00P] = &kTestRE[DIR_00P * numberOfBCnodes];
+    //      kDistTest.f[DIR_00M] = &kTestRE[DIR_00M * numberOfBCnodes];
+    //      kDistTest.f[DIR_PP0] = &kTestRE[DIR_PP0 * numberOfBCnodes];
+    //      kDistTest.f[DIR_MM0] = &kTestRE[DIR_MM0 * numberOfBCnodes];
+    //      kDistTest.f[DIR_PM0] = &kTestRE[DIR_PM0 * numberOfBCnodes];
+    //      kDistTest.f[DIR_MP0] = &kTestRE[DIR_MP0 * numberOfBCnodes];
+    //      kDistTest.f[DIR_P0P] = &kTestRE[DIR_P0P * numberOfBCnodes];
+    //      kDistTest.f[DIR_M0M] = &kTestRE[DIR_M0M * numberOfBCnodes];
+    //      kDistTest.f[DIR_P0M] = &kTestRE[DIR_P0M * numberOfBCnodes];
+    //      kDistTest.f[DIR_M0P] = &kTestRE[DIR_M0P * numberOfBCnodes];
+    //      kDistTest.f[DIR_0PP] = &kTestRE[DIR_0PP * numberOfBCnodes];
+    //      kDistTest.f[DIR_0MM] = &kTestRE[DIR_0MM * numberOfBCnodes];
+    //      kDistTest.f[DIR_0PM] = &kTestRE[DIR_0PM * numberOfBCnodes];
+    //      kDistTest.f[DIR_0MP] = &kTestRE[DIR_0MP * numberOfBCnodes];
+    //      kDistTest.f[DIR_000] = &kTestRE[DIR_000 * numberOfBCnodes];
+    //      kDistTest.f[DIR_PPP] = &kTestRE[DIR_PPP * numberOfBCnodes];
+    //      kDistTest.f[DIR_MMP] = &kTestRE[DIR_MMP * numberOfBCnodes];
+    //      kDistTest.f[DIR_PMP] = &kTestRE[DIR_PMP * numberOfBCnodes];
+    //      kDistTest.f[DIR_MPP] = &kTestRE[DIR_MPP * numberOfBCnodes];
+    //      kDistTest.f[DIR_PPM] = &kTestRE[DIR_PPM * numberOfBCnodes];
+    //      kDistTest.f[DIR_MMM] = &kTestRE[DIR_MMM * numberOfBCnodes];
+    //      kDistTest.f[DIR_PMM] = &kTestRE[DIR_PMM * numberOfBCnodes];
+    //      kDistTest.f[DIR_MPM] = &kTestRE[DIR_MPM * numberOfBCnodes];
    //   ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    //   //real f1_E,f1_W,f1_N,f1_S,f1_T,f1_B,f1_NE,f1_SW,f1_SE,f1_NW,f1_TE,f1_BW,f1_BE,f1_TW,f1_TN,f1_BS,f1_BN,f1_TS,f1_ZERO,f1_TNE,f1_TSW,f1_TSE,f1_TNW,f1_BNE,f1_BSW,f1_BSE,f1_BNW;
-   //   //f1_W    = (D.f[DIR_P00   ])[k1e   ];
-   //   //f1_E    = (D.f[DIR_M00   ])[k1w   ];
-   //   //f1_S    = (D.f[DIR_0P0   ])[k1n   ];
-   //   //f1_N    = (D.f[DIR_0M0   ])[k1s   ];
-   //   //f1_B    = (D.f[DIR_00P   ])[k1t   ];
-   //   //f1_T    = (D.f[DIR_00M   ])[k1b   ];
-   //   //f1_SW   = (D.f[DIR_PP0  ])[k1ne  ];
-   //   //f1_NE   = (D.f[DIR_MM0  ])[k1sw  ];
-   //   //f1_NW   = (D.f[DIR_PM0  ])[k1se  ];
-   //   //f1_SE   = (D.f[DIR_MP0  ])[k1nw  ];
-   //   //f1_BW   = (D.f[DIR_P0P  ])[k1te  ];
-   //   //f1_TE   = (D.f[DIR_M0M  ])[k1bw  ];
-   //   //f1_TW   = (D.f[DIR_P0M  ])[k1be  ];
-   //   //f1_BE   = (D.f[DIR_M0P  ])[k1tw  ];
-   //   //f1_BS   = (D.f[DIR_0PP  ])[k1tn  ];
-   //   //f1_TN   = (D.f[DIR_0MM  ])[k1bs  ];
-   //   //f1_TS   = (D.f[DIR_0PM  ])[k1bn  ];
-   //   //f1_BN   = (D.f[DIR_0MP  ])[k1ts  ];
+   //   //f1_W    = (D.f[DIR_P00])[k1e   ];
+   //   //f1_E    = (D.f[DIR_M00])[k1w   ];
+   //   //f1_S    = (D.f[DIR_0P0])[k1n   ];
+   //   //f1_N    = (D.f[DIR_0M0])[k1s   ];
+   //   //f1_B    = (D.f[DIR_00P])[k1t   ];
+   //   //f1_T    = (D.f[DIR_00M])[k1b   ];
+   //   //f1_SW   = (D.f[DIR_PP0])[k1ne  ];
+   //   //f1_NE   = (D.f[DIR_MM0])[k1sw  ];
+   //   //f1_NW   = (D.f[DIR_PM0])[k1se  ];
+   //   //f1_SE   = (D.f[DIR_MP0])[k1nw  ];
+   //   //f1_BW   = (D.f[DIR_P0P])[k1te  ];
+   //   //f1_TE   = (D.f[DIR_M0M])[k1bw  ];
+   //   //f1_TW   = (D.f[DIR_P0M])[k1be  ];
+   //   //f1_BE   = (D.f[DIR_M0P])[k1tw  ];
+   //   //f1_BS   = (D.f[DIR_0PP])[k1tn  ];
+   //   //f1_TN   = (D.f[DIR_0MM])[k1bs  ];
+   //   //f1_TS   = (D.f[DIR_0PM])[k1bn  ];
+   //   //f1_BN   = (D.f[DIR_0MP])[k1ts  ];
    //   //f1_ZERO = (D.f[DIR_000])[k1zero];
-   //   //f1_BSW  = (D.f[DIR_PPP ])[k1tne ];
-   //   //f1_BNE  = (D.f[DIR_MMP ])[k1tsw ];
-   //   //f1_BNW  = (D.f[DIR_PMP ])[k1tse ];
-   //   //f1_BSE  = (D.f[DIR_MPP ])[k1tnw ];
-   //   //f1_TSW  = (D.f[DIR_PPM ])[k1bne ];
-   //   //f1_TNE  = (D.f[DIR_MMM ])[k1bsw ];
-   //   //f1_TNW  = (D.f[DIR_PMM ])[k1bse ];
-   //   //f1_TSE  = (D.f[DIR_MPM ])[k1bnw ];
+   //   //f1_BSW  = (D.f[DIR_PPP])[k1tne ];
+   //   //f1_BNE  = (D.f[DIR_MMP])[k1tsw ];
+   //   //f1_BNW  = (D.f[DIR_PMP])[k1tse ];
+   //   //f1_BSE  = (D.f[DIR_MPP])[k1tnw ];
+   //   //f1_TSW  = (D.f[DIR_PPM])[k1bne ];
+   //   //f1_TNE  = (D.f[DIR_MMM])[k1bsw ];
+   //   //f1_TNW  = (D.f[DIR_PMM])[k1bse ];
+   //   //f1_TSE  = (D.f[DIR_MPM])[k1bnw ];
    //   ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
    //   //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    //   real f1_E,f1_W,f1_N,f1_S,f1_T,f1_B,f1_NE,f1_SW,f1_SE,f1_NW,f1_TE,f1_BW,f1_BE,f1_TW,f1_TN,f1_BS,f1_BN,f1_TS,f1_ZERO,f1_TNE,f1_TSW,f1_TSE,f1_TNW,f1_BNE,f1_BSW,f1_BSE,f1_BNW;
-   //   f1_E    = (D.f[DIR_P00   ])[k1e   ];
-   //   f1_W    = (D.f[DIR_M00   ])[k1w   ];
-   //   f1_N    = (D.f[DIR_0P0   ])[k1n   ];
-   //   f1_S    = (D.f[DIR_0M0   ])[k1s   ];
-   //   f1_T    = (D.f[DIR_00P   ])[k1t   ];
-   //   f1_B    = (D.f[DIR_00M   ])[k1b   ];
-   //   f1_NE   = (D.f[DIR_PP0  ])[k1ne  ];
-   //   f1_SW   = (D.f[DIR_MM0  ])[k1sw  ];
-   //   f1_SE   = (D.f[DIR_PM0  ])[k1se  ];
-   //   f1_NW   = (D.f[DIR_MP0  ])[k1nw  ];
-   //   f1_TE   = (D.f[DIR_P0P  ])[k1te  ];
-   //   f1_BW   = (D.f[DIR_M0M  ])[k1bw  ];
-   //   f1_BE   = (D.f[DIR_P0M  ])[k1be  ];
-   //   f1_TW   = (D.f[DIR_M0P  ])[k1tw  ];
-   //   f1_TN   = (D.f[DIR_0PP  ])[k1tn  ];
-   //   f1_BS   = (D.f[DIR_0MM  ])[k1bs  ];
-   //   f1_BN   = (D.f[DIR_0PM  ])[k1bn  ];
-   //   f1_TS   = (D.f[DIR_0MP  ])[k1ts  ];
+   //   f1_E    = (D.f[DIR_P00])[k1e   ];
+   //   f1_W    = (D.f[DIR_M00])[k1w   ];
+   //   f1_N    = (D.f[DIR_0P0])[k1n   ];
+   //   f1_S    = (D.f[DIR_0M0])[k1s   ];
+   //   f1_T    = (D.f[DIR_00P])[k1t   ];
+   //   f1_B    = (D.f[DIR_00M])[k1b   ];
+   //   f1_NE   = (D.f[DIR_PP0])[k1ne  ];
+   //   f1_SW   = (D.f[DIR_MM0])[k1sw  ];
+   //   f1_SE   = (D.f[DIR_PM0])[k1se  ];
+   //   f1_NW   = (D.f[DIR_MP0])[k1nw  ];
+   //   f1_TE   = (D.f[DIR_P0P])[k1te  ];
+   //   f1_BW   = (D.f[DIR_M0M])[k1bw  ];
+   //   f1_BE   = (D.f[DIR_P0M])[k1be  ];
+   //   f1_TW   = (D.f[DIR_M0P])[k1tw  ];
+   //   f1_TN   = (D.f[DIR_0PP])[k1tn  ];
+   //   f1_BS   = (D.f[DIR_0MM])[k1bs  ];
+   //   f1_BN   = (D.f[DIR_0PM])[k1bn  ];
+   //   f1_TS   = (D.f[DIR_0MP])[k1ts  ];
    //   f1_ZERO = (D.f[DIR_000])[k1zero];
-   //   f1_TNE  = (D.f[DIR_PPP ])[k1tne ];
-   //   f1_TSW  = (D.f[DIR_MMP ])[k1tsw ];
-   //   f1_TSE  = (D.f[DIR_PMP ])[k1tse ];
-   //   f1_TNW  = (D.f[DIR_MPP ])[k1tnw ];
-   //   f1_BNE  = (D.f[DIR_PPM ])[k1bne ];
-   //   f1_BSW  = (D.f[DIR_MMM ])[k1bsw ];
-   //   f1_BSE  = (D.f[DIR_PMM ])[k1bse ];
-   //   f1_BNW  = (D.f[DIR_MPM ])[k1bnw ];
+   //   f1_TNE  = (D.f[DIR_PPP])[k1tne ];
+   //   f1_TSW  = (D.f[DIR_MMP])[k1tsw ];
+   //   f1_TSE  = (D.f[DIR_PMP])[k1tse ];
+   //   f1_TNW  = (D.f[DIR_MPP])[k1tnw ];
+   //   f1_BNE  = (D.f[DIR_PPM])[k1bne ];
+   //   f1_BSW  = (D.f[DIR_MMM])[k1bsw ];
+   //   f1_BSE  = (D.f[DIR_PMM])[k1bse ];
+   //   f1_BNW  = (D.f[DIR_MPM])[k1bnw ];
    //   //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
    //   //////////////////////////////////////////////////////////////////////////
    //   real drho1    =  f1_ZERO+f1_E+f1_W+f1_N+f1_S+f1_T+f1_B+f1_NE+f1_SW+f1_SE+f1_NW+f1_TE+f1_BW+f1_BE+f1_TW+f1_TN+f1_BS+f1_BN+f1_TS+ f1_TNE+f1_TSW+f1_TSE+f1_TNW+f1_BNE+f1_BSW+f1_BSE+f1_BNW;
-	  //real vx1      = (((f1_TNE-f1_BSW)+(f1_BSE-f1_TNW)+(f1_BNE-f1_TSW)+(f1_TSE-f1_BNW)) + (((f1_NE-f1_SW)+(f1_TE-f1_BW))+((f1_SE-f1_NW)+(f1_BE-f1_TW))) + (f1_E-f1_W)) / (one + drho1);
-	  //real vx2      = (((f1_TNE-f1_BSW)+(f1_TNW-f1_BSE)+(f1_BNE-f1_TSW)+(f1_BNW-f1_TSE)) + (((f1_NE-f1_SW)+(f1_TN-f1_BS))+((f1_BN-f1_TS)+(f1_NW-f1_SE))) + (f1_N-f1_S)) / (one + drho1);
-	  //real vx3      = (((f1_TNE-f1_BSW)+(f1_TNW-f1_BSE)+(f1_TSW-f1_BNE)+(f1_TSE-f1_BNW)) + (((f1_TE-f1_BW)+(f1_TN-f1_BS))+((f1_TW-f1_BE)+(f1_TS-f1_BN))) + (f1_T-f1_B)) / (one + drho1);
+     //real vx1      = (((f1_TNE-f1_BSW)+(f1_BSE-f1_TNW)+(f1_BNE-f1_TSW)+(f1_TSE-f1_BNW)) + (((f1_NE-f1_SW)+(f1_TE-f1_BW))+((f1_SE-f1_NW)+(f1_BE-f1_TW))) + (f1_E-f1_W)) / (one + drho1);
+     //real vx2      = (((f1_TNE-f1_BSW)+(f1_TNW-f1_BSE)+(f1_BNE-f1_TSW)+(f1_BNW-f1_TSE)) + (((f1_NE-f1_SW)+(f1_TN-f1_BS))+((f1_BN-f1_TS)+(f1_NW-f1_SE))) + (f1_N-f1_S)) / (one + drho1);
+     //real vx3      = (((f1_TNE-f1_BSW)+(f1_TNW-f1_BSE)+(f1_TSW-f1_BNE)+(f1_TSE-f1_BNW)) + (((f1_TE-f1_BW)+(f1_TN-f1_BS))+((f1_TW-f1_BE)+(f1_TS-f1_BN))) + (f1_T-f1_B)) / (one + drho1);
    //   //////////////////////////////////////////////////////////////////////////
-	  ////real omega = om1;
+     ////real omega = om1;
    //   real cusq  = c3o2*(vx1*vx1+vx2*vx2+vx3*vx3);
    //   //////////////////////////////////////////////////////////////////////////
-	  ////T�st MK
-	  ////if(vx1 < zero) vx1 = zero;
+     ////Test MK
+     ////if(vx1 < zero) vx1 = zero;
    //   //////////////////////////////////////////////////////////////////////////
-	  ////becomes higher with neighbor source and lower with local source
+     ////becomes higher with neighbor source and lower with local source
    //   //real fZERO = c8over27*  (rhoBC[k]-(one + rhoBC[k])*(cusq))                                                           ;
    //   //real fE    = c2over27*  (rhoBC[k]+(one + rhoBC[k])*(three*( vx1        )+c9over2*( vx1        )*( vx1        )-cusq));
    //   //real fW    = c2over27*  (rhoBC[k]+(one + rhoBC[k])*(three*(-vx1        )+c9over2*(-vx1        )*(-vx1        )-cusq));
@@ -3853,7 +3868,7 @@ __global__ void QPressDeviceEQZ27(real* rhoBC,
    //   //real fBSE  = c1over216* (rhoBC[k]+(one + rhoBC[k])*(three*( vx1-vx2-vx3)+c9over2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cusq));
    //   //real fTNW  = c1over216* (rhoBC[k]+(one + rhoBC[k])*(three*(-vx1+vx2+vx3)+c9over2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cusq));
    //   //////////////////////////////////////////////////////////////////////////
-	  //// based on VirtualFluids (kucher + fard)
+     //// based on VirtualFluids (kucher + fard)
    //   real fZERO = c8over27  * rhoBC[k] * (one                                                                      - cusq);
    //   real fE    = c2over27  * rhoBC[k] * (one + three * ( vx1        ) + c9over2 * ( vx1        ) * ( vx1        ) - cusq);
    //   real fW    = c2over27  * rhoBC[k] * (one + three * (-vx1        ) + c9over2 * (-vx1        ) * (-vx1        ) - cusq);
@@ -3882,7 +3897,7 @@ __global__ void QPressDeviceEQZ27(real* rhoBC,
    //   real fBSE  = c1over216 * rhoBC[k] * (one + three * ( vx1-vx2-vx3) + c9over2 * ( vx1-vx2-vx3) * ( vx1-vx2-vx3) - cusq);
    //   real fTNW  = c1over216 * rhoBC[k] * (one + three * (-vx1+vx2+vx3) + c9over2 * (-vx1+vx2+vx3) * (-vx1+vx2+vx3) - cusq);
    ////   ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	  //////test
+     //////test
    ////   real fZERO = c8over27  * ((drho1 + rhoBC[k]) / two) * (one                                                                      - cusq);
    ////   real fE    = c2over27  * ((drho1 + rhoBC[k]) / two) * (one + three * ( vx1        ) + c9over2 * ( vx1        ) * ( vx1        ) - cusq);
    ////   real fW    = c2over27  * ((drho1 + rhoBC[k]) / two) * (one + three * (-vx1        ) + c9over2 * (-vx1        ) * (-vx1        ) - cusq);
@@ -3911,190 +3926,190 @@ __global__ void QPressDeviceEQZ27(real* rhoBC,
    ////   real fBSE  = c1over216 * ((drho1 + rhoBC[k]) / two) * (one + three * ( vx1-vx2-vx3) + c9over2 * ( vx1-vx2-vx3) * ( vx1-vx2-vx3) - cusq);
    ////   real fTNW  = c1over216 * ((drho1 + rhoBC[k]) / two) * (one + three * (-vx1+vx2+vx3) + c9over2 * (-vx1+vx2+vx3) * (-vx1+vx2+vx3) - cusq);
 
-			//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+         //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
             // based on BGK Plus Comp
-			//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-			//double mfabb = (D.f[DIR_P00   ])[k1e   ];
-			//double mfcbb = (D.f[DIR_M00   ])[k1w   ];
-			//double mfbab = (D.f[DIR_0P0   ])[k1n   ];
-			//double mfbcb = (D.f[DIR_0M0   ])[k1s   ];
-			//double mfbba = (D.f[DIR_00P   ])[k1t   ];
-			//double mfbbc = (D.f[DIR_00M   ])[k1b   ];
-			//double mfaab = (D.f[DIR_PP0  ])[k1ne  ];
-			//double mfccb = (D.f[DIR_MM0  ])[k1sw  ];
-			//double mfacb = (D.f[DIR_PM0  ])[k1se  ];
-			//double mfcab = (D.f[DIR_MP0  ])[k1nw  ];
-			//double mfaba = (D.f[DIR_P0P  ])[k1te  ];
-			//double mfcbc = (D.f[DIR_M0M  ])[k1bw  ];
-			//double mfabc = (D.f[DIR_P0M  ])[k1be  ];
-			//double mfcba = (D.f[DIR_M0P  ])[k1tw  ];
-			//double mfbaa = (D.f[DIR_0PP  ])[k1tn  ];
-			//double mfbcc = (D.f[DIR_0MM  ])[k1bs  ];
-			//double mfbac = (D.f[DIR_0PM  ])[k1bn  ];
-			//double mfbca = (D.f[DIR_0MP  ])[k1ts  ];
-			//double mfbbb = (D.f[DIR_000])[k1zero];
-			//double mfaaa = (D.f[DIR_PPP ])[k1tne ];
-			//double mfcca = (D.f[DIR_MMP ])[k1tsw ];
-			//double mfaca = (D.f[DIR_PMP ])[k1tse ];
-			//double mfcaa = (D.f[DIR_MPP ])[k1tnw ];
-			//double mfaac = (D.f[DIR_PPM ])[k1bne ];
-			//double mfccc = (D.f[DIR_MMM ])[k1bsw ];
-			//double mfacc = (D.f[DIR_PMM ])[k1bse ];
-			//double mfcac = (D.f[DIR_MPM ])[k1bnw ];
-			real mfabb = (D.f[DIR_P00   ])[k1e   ];
-			real mfcbb = (D.f[DIR_M00   ])[k1w   ];
-			real mfbab = (D.f[DIR_0P0   ])[k1n   ];
-			real mfbcb = (D.f[DIR_0M0   ])[k1s   ];
-			real mfbba = (D.f[DIR_00P   ])[k1t   ];
-			real mfbbc = (D.f[DIR_00M   ])[k1b   ];
-			real mfaab = (D.f[DIR_PP0  ])[k1ne  ];
-			real mfccb = (D.f[DIR_MM0  ])[k1sw  ];
-			real mfacb = (D.f[DIR_PM0  ])[k1se  ];
-			real mfcab = (D.f[DIR_MP0  ])[k1nw  ];
-			real mfaba = (D.f[DIR_P0P  ])[k1te  ];
-			real mfcbc = (D.f[DIR_M0M  ])[k1bw  ];
-			real mfabc = (D.f[DIR_P0M  ])[k1be  ];
-			real mfcba = (D.f[DIR_M0P  ])[k1tw  ];
-			real mfbaa = (D.f[DIR_0PP  ])[k1tn  ];
-			real mfbcc = (D.f[DIR_0MM  ])[k1bs  ];
-			real mfbac = (D.f[DIR_0PM  ])[k1bn  ];
-			real mfbca = (D.f[DIR_0MP  ])[k1ts  ];
-			real mfbbb = (D.f[DIR_000])[k1zero];
-			real mfaaa = (D.f[DIR_PPP ])[k1tne ];
-			real mfcca = (D.f[DIR_MMP ])[k1tsw ];
-			real mfaca = (D.f[DIR_PMP ])[k1tse ];
-			real mfcaa = (D.f[DIR_MPP ])[k1tnw ];
-			real mfaac = (D.f[DIR_PPM ])[k1bne ];
-			real mfccc = (D.f[DIR_MMM ])[k1bsw ];
-			real mfacc = (D.f[DIR_PMM ])[k1bse ];
-			real mfcac = (D.f[DIR_MPM ])[k1bnw ];
-
-			//real mfcbb = (D.f[DIR_P00   ])[ke   ];
-			//real mfabb = (D.f[DIR_M00   ])[kw   ];
-			//real mfbcb = (D.f[DIR_0P0   ])[kn   ];
-			//real mfbab = (D.f[DIR_0M0   ])[ks   ];
-			//real mfbbc = (D.f[DIR_00P   ])[kt   ];
-			//real mfbba = (D.f[DIR_00M   ])[kb   ];
-			//real mfccb = (D.f[DIR_PP0  ])[kne  ];
-			//real mfaab = (D.f[DIR_MM0  ])[ksw  ];
-			//real mfcab = (D.f[DIR_PM0  ])[kse  ];
-			//real mfacb = (D.f[DIR_MP0  ])[knw  ];
-			//real mfcbc = (D.f[DIR_P0P  ])[kte  ];
-			//real mfaba = (D.f[DIR_M0M  ])[kbw  ];
-			//real mfcba = (D.f[DIR_P0M  ])[kbe  ];
-			//real mfabc = (D.f[DIR_M0P  ])[ktw  ];
-			//real mfbcc = (D.f[DIR_0PP  ])[ktn  ];
-			//real mfbaa = (D.f[DIR_0MM  ])[kbs  ];
-			//real mfbca = (D.f[DIR_0PM  ])[kbn  ];
-			//real mfbac = (D.f[DIR_0MP  ])[kts  ];
-			//real mfbbb = (D.f[DIR_000])[kzero];
-			//real mfccc = (D.f[DIR_PPP ])[ktne ];
-			//real mfaac = (D.f[DIR_MMP ])[ktsw ];
-			//real mfcac = (D.f[DIR_PMP ])[ktse ];
-			//real mfacc = (D.f[DIR_MPP ])[ktnw ];
-			//real mfcca = (D.f[DIR_PPM ])[kbne ];
-			//real mfaaa = (D.f[DIR_MMM ])[kbsw ];
-			//real mfcaa = (D.f[DIR_PMM ])[kbse ];
-			//real mfaca = (D.f[DIR_MPM ])[kbnw ];
-			////////////////////////////////////////////////////////////////////////////////////
-			//real rho   = (((((mfccc+mfaaa) + (mfaca+mfcac)) + ((mfacc+mfcaa) + (mfaac+mfcca))) + 
-			//				(((mfbac+mfbca) + (mfbaa+mfbcc)) + ((mfabc+mfcba) + (mfaba+mfcbc)) + ((mfacb+mfcab) + (mfaab+mfccb))) +
-			//				((mfabb+mfcbb) + (mfbab+mfbcb)) + (mfbba+mfbbc)) + mfbbb) + one;//!!!!Achtung + one
-			////////////////////////////////////////////////////////////////////////////////////
-			real rho = rhoBC[k];
-			////////////////////////////////////////////////////////////////////////////////////
-			real OoRho = c1o1 / (rho * 1.5f);
-			////////////////////////////////////////////////////////////////////////////////////
-			real vvx    = ((((mfccc-mfaaa) + (mfcac-mfaca)) + ((mfcaa-mfacc) + (mfcca-mfaac))) + 
-						     (((mfcba-mfabc) + (mfcbc-mfaba)) + ((mfcab-mfacb) + (mfccb-mfaab))) +
-						       (mfcbb-mfabb)) * OoRho;
-			real vvy    =((((mfccc-mfaaa) + (mfaca-mfcac)) + ((mfacc-mfcaa) + (mfcca-mfaac))) + 
-				             (((mfbca-mfbac) + (mfbcc-mfbaa)) + ((mfacb-mfcab) + (mfccb-mfaab))) +
-				               (mfbcb-mfbab)) * OoRho;
-			real vvz    =((((mfccc-mfaaa) + (mfcac-mfaca)) + ((mfacc-mfcaa) + (mfaac-mfcca))) + 
-				             (((mfbac-mfbca) + (mfbcc-mfbaa)) + ((mfabc-mfcba) + (mfcbc-mfaba))) +
-				               (mfbbc-mfbba)) * OoRho;
-			/////////////////////////
-			//Test Values
-			//double vvx    = 0.016;
-			//double vvy    = zero;
-			//double vvz    = zero;
-			////////////////////////////////////////////////////////////////////////////////////////
-			////round off error test
-			//if(vvx!=zero){
-			//	(kDistTest.f[DIR_P00   ])[k] = mfabb;
-			//	(kDistTest.f[DIR_M00   ])[k] = mfcbb;
-			//	(kDistTest.f[DIR_0P0   ])[k] = mfbab;
-			//	(kDistTest.f[DIR_0M0   ])[k] = mfbcb;
-			//	(kDistTest.f[DIR_00P   ])[k] = mfbba;
-			//	(kDistTest.f[DIR_00M   ])[k] = mfbbc;
-			//	(kDistTest.f[DIR_PP0  ])[k] = mfaab;
-			//	(kDistTest.f[DIR_MM0  ])[k] = mfccb;
-			//	(kDistTest.f[DIR_PM0  ])[k] = mfacb;
-			//	(kDistTest.f[DIR_MP0  ])[k] = mfcab;
-			//	(kDistTest.f[DIR_P0P  ])[k] = mfaba;
-			//	(kDistTest.f[DIR_M0M  ])[k] = mfcbc;
-			//	(kDistTest.f[DIR_P0M  ])[k] = mfabc;
-			//	(kDistTest.f[DIR_M0P  ])[k] = mfcba;
-			//	(kDistTest.f[DIR_0PP  ])[k] = mfbaa;
-			//	(kDistTest.f[DIR_0MM  ])[k] = mfbcc;
-			//	(kDistTest.f[DIR_0PM  ])[k] = mfbac;
-			//	(kDistTest.f[DIR_0MP  ])[k] = mfbca;
-			//	(kDistTest.f[DIR_000])[k] = KQK;
-			//	(kDistTest.f[DIR_PPP ])[k] = mfaaa;
-			//	(kDistTest.f[DIR_MMP ])[k] = mfcca;
-			//	(kDistTest.f[DIR_PMP ])[k] = mfaca;
-			//	(kDistTest.f[DIR_MPP ])[k] = mfcaa;
-			//	(kDistTest.f[DIR_PPM ])[k] = mfaac;
-			//	(kDistTest.f[DIR_MMM ])[k] = mfccc;
-			//	(kDistTest.f[DIR_PMM ])[k] = mfacc;
-			//	(kDistTest.f[DIR_MPM ])[k] = mfcac;
-			//}else{
-			//	(kDistTest.f[DIR_P00   ])[k] = zero;
-			//	(kDistTest.f[DIR_M00   ])[k] = zero;
-			//	(kDistTest.f[DIR_0P0   ])[k] = zero;
-			//	(kDistTest.f[DIR_0M0   ])[k] = zero;
-			//	(kDistTest.f[DIR_00P   ])[k] = zero;
-			//	(kDistTest.f[DIR_00M   ])[k] = zero;
-			//	(kDistTest.f[DIR_PP0  ])[k] = zero;
-			//	(kDistTest.f[DIR_MM0  ])[k] = zero;
-			//	(kDistTest.f[DIR_PM0  ])[k] = zero;
-			//	(kDistTest.f[DIR_MP0  ])[k] = zero;
-			//	(kDistTest.f[DIR_P0P  ])[k] = zero;
-			//	(kDistTest.f[DIR_M0M  ])[k] = zero;
-			//	(kDistTest.f[DIR_P0M  ])[k] = zero;
-			//	(kDistTest.f[DIR_M0P  ])[k] = zero;
-			//	(kDistTest.f[DIR_0PP  ])[k] = zero;
-			//	(kDistTest.f[DIR_0MM  ])[k] = zero;
-			//	(kDistTest.f[DIR_0PM  ])[k] = zero;
-			//	(kDistTest.f[DIR_0MP  ])[k] = zero;
-			//	(kDistTest.f[DIR_000])[k] = zero;
-			//	(kDistTest.f[DIR_PPP ])[k] = zero;
-			//	(kDistTest.f[DIR_MMP ])[k] = zero;
-			//	(kDistTest.f[DIR_PMP ])[k] = zero;
-			//	(kDistTest.f[DIR_MPP ])[k] = zero;
-			//	(kDistTest.f[DIR_PPM ])[k] = zero;
-			//	(kDistTest.f[DIR_MMM ])[k] = zero;
-			//	(kDistTest.f[DIR_PMM ])[k] = zero;
-			//	(kDistTest.f[DIR_MPM ])[k] = zero;
-			//}
-
-			//////////////////////////////////////////////////////////////////////////////////////
-			//// first bad fix for negative x velocity
-			////if(vvx > zero) vvx = zero;
-			//////////////////////////////////////////////////////////////////////////////////////
-			////// second bad fix for negative x velocity
-			////if(vvx > zero){
-			////	vvx = -vvx;
-			////	vvy = -vvy;
-			////	vvz = -vvz;
-			////}
-			////////////////////////////////////////////////////////////////////////////////////
-			double vx2    = vvx * vvx;
-			double vy2    = vvy * vvy;
-			double vz2    = vvz * vvz;
-			//////////////////////////////////////////////////////////////////////////////////
-			//original
+         //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+         //double mfabb = (D.f[DIR_P00])[k1e   ];
+         //double mfcbb = (D.f[DIR_M00])[k1w   ];
+         //double mfbab = (D.f[DIR_0P0])[k1n   ];
+         //double mfbcb = (D.f[DIR_0M0])[k1s   ];
+         //double mfbba = (D.f[DIR_00P])[k1t   ];
+         //double mfbbc = (D.f[DIR_00M])[k1b   ];
+         //double mfaab = (D.f[DIR_PP0])[k1ne  ];
+         //double mfccb = (D.f[DIR_MM0])[k1sw  ];
+         //double mfacb = (D.f[DIR_PM0])[k1se  ];
+         //double mfcab = (D.f[DIR_MP0])[k1nw  ];
+         //double mfaba = (D.f[DIR_P0P])[k1te  ];
+         //double mfcbc = (D.f[DIR_M0M])[k1bw  ];
+         //double mfabc = (D.f[DIR_P0M])[k1be  ];
+         //double mfcba = (D.f[DIR_M0P])[k1tw  ];
+         //double mfbaa = (D.f[DIR_0PP])[k1tn  ];
+         //double mfbcc = (D.f[DIR_0MM])[k1bs  ];
+         //double mfbac = (D.f[DIR_0PM])[k1bn  ];
+         //double mfbca = (D.f[DIR_0MP])[k1ts  ];
+         //double mfbbb = (D.f[DIR_000])[k1zero];
+         //double mfaaa = (D.f[DIR_PPP])[k1tne ];
+         //double mfcca = (D.f[DIR_MMP])[k1tsw ];
+         //double mfaca = (D.f[DIR_PMP])[k1tse ];
+         //double mfcaa = (D.f[DIR_MPP])[k1tnw ];
+         //double mfaac = (D.f[DIR_PPM])[k1bne ];
+         //double mfccc = (D.f[DIR_MMM])[k1bsw ];
+         //double mfacc = (D.f[DIR_PMM])[k1bse ];
+         //double mfcac = (D.f[DIR_MPM])[k1bnw ];
+         real mfabb = (D.f[DIR_P00])[k1e   ];
+         real mfcbb = (D.f[DIR_M00])[k1w   ];
+         real mfbab = (D.f[DIR_0P0])[k1n   ];
+         real mfbcb = (D.f[DIR_0M0])[k1s   ];
+         real mfbba = (D.f[DIR_00P])[k1t   ];
+         real mfbbc = (D.f[DIR_00M])[k1b   ];
+         real mfaab = (D.f[DIR_PP0])[k1ne  ];
+         real mfccb = (D.f[DIR_MM0])[k1sw  ];
+         real mfacb = (D.f[DIR_PM0])[k1se  ];
+         real mfcab = (D.f[DIR_MP0])[k1nw  ];
+         real mfaba = (D.f[DIR_P0P])[k1te  ];
+         real mfcbc = (D.f[DIR_M0M])[k1bw  ];
+         real mfabc = (D.f[DIR_P0M])[k1be  ];
+         real mfcba = (D.f[DIR_M0P])[k1tw  ];
+         real mfbaa = (D.f[DIR_0PP])[k1tn  ];
+         real mfbcc = (D.f[DIR_0MM])[k1bs  ];
+         real mfbac = (D.f[DIR_0PM])[k1bn  ];
+         real mfbca = (D.f[DIR_0MP])[k1ts  ];
+         real mfbbb = (D.f[DIR_000])[k1zero];
+         real mfaaa = (D.f[DIR_PPP])[k1tne ];
+         real mfcca = (D.f[DIR_MMP])[k1tsw ];
+         real mfaca = (D.f[DIR_PMP])[k1tse ];
+         real mfcaa = (D.f[DIR_MPP])[k1tnw ];
+         real mfaac = (D.f[DIR_PPM])[k1bne ];
+         real mfccc = (D.f[DIR_MMM])[k1bsw ];
+         real mfacc = (D.f[DIR_PMM])[k1bse ];
+         real mfcac = (D.f[DIR_MPM])[k1bnw ];
+
+         //real mfcbb = (D.f[DIR_P00])[ke   ];
+         //real mfabb = (D.f[DIR_M00])[kw   ];
+         //real mfbcb = (D.f[DIR_0P0])[kn   ];
+         //real mfbab = (D.f[DIR_0M0])[ks   ];
+         //real mfbbc = (D.f[DIR_00P])[kt   ];
+         //real mfbba = (D.f[DIR_00M])[kb   ];
+         //real mfccb = (D.f[DIR_PP0])[kne  ];
+         //real mfaab = (D.f[DIR_MM0])[ksw  ];
+         //real mfcab = (D.f[DIR_PM0])[kse  ];
+         //real mfacb = (D.f[DIR_MP0])[knw  ];
+         //real mfcbc = (D.f[DIR_P0P])[kte  ];
+         //real mfaba = (D.f[DIR_M0M])[kbw  ];
+         //real mfcba = (D.f[DIR_P0M])[kbe  ];
+         //real mfabc = (D.f[DIR_M0P])[ktw  ];
+         //real mfbcc = (D.f[DIR_0PP])[ktn  ];
+         //real mfbaa = (D.f[DIR_0MM])[kbs  ];
+         //real mfbca = (D.f[DIR_0PM])[kbn  ];
+         //real mfbac = (D.f[DIR_0MP])[kts  ];
+         //real mfbbb = (D.f[DIR_000])[kzero];
+         //real mfccc = (D.f[DIR_PPP])[ktne ];
+         //real mfaac = (D.f[DIR_MMP])[ktsw ];
+         //real mfcac = (D.f[DIR_PMP])[ktse ];
+         //real mfacc = (D.f[DIR_MPP])[ktnw ];
+         //real mfcca = (D.f[DIR_PPM])[kbne ];
+         //real mfaaa = (D.f[DIR_MMM])[kbsw ];
+         //real mfcaa = (D.f[DIR_PMM])[kbse ];
+         //real mfaca = (D.f[DIR_MPM])[kbnw ];
+         ////////////////////////////////////////////////////////////////////////////////////
+         //real rho   = (((((mfccc+mfaaa) + (mfaca+mfcac)) + ((mfacc+mfcaa) + (mfaac+mfcca))) +
+         //				(((mfbac+mfbca) + (mfbaa+mfbcc)) + ((mfabc+mfcba) + (mfaba+mfcbc)) + ((mfacb+mfcab) + (mfaab+mfccb))) +
+         //				((mfabb+mfcbb) + (mfbab+mfbcb)) + (mfbba+mfbbc)) + mfbbb) + one;//!!!!Achtung + one
+         ////////////////////////////////////////////////////////////////////////////////////
+         real rho = rhoBC[k];
+         ////////////////////////////////////////////////////////////////////////////////////
+         real OoRho = c1o1 / (rho * 1.5f);
+         ////////////////////////////////////////////////////////////////////////////////////
+         real vvx    = ((((mfccc-mfaaa) + (mfcac-mfaca)) + ((mfcaa-mfacc) + (mfcca-mfaac))) +
+                       (((mfcba-mfabc) + (mfcbc-mfaba)) + ((mfcab-mfacb) + (mfccb-mfaab))) +
+                         (mfcbb-mfabb)) * OoRho;
+         real vvy    =((((mfccc-mfaaa) + (mfaca-mfcac)) + ((mfacc-mfcaa) + (mfcca-mfaac))) +
+                         (((mfbca-mfbac) + (mfbcc-mfbaa)) + ((mfacb-mfcab) + (mfccb-mfaab))) +
+                           (mfbcb-mfbab)) * OoRho;
+         real vvz    =((((mfccc-mfaaa) + (mfcac-mfaca)) + ((mfacc-mfcaa) + (mfaac-mfcca))) +
+                         (((mfbac-mfbca) + (mfbcc-mfbaa)) + ((mfabc-mfcba) + (mfcbc-mfaba))) +
+                           (mfbbc-mfbba)) * OoRho;
+         /////////////////////////
+         //Test Values
+         //double vvx    = 0.016;
+         //double vvy    = zero;
+         //double vvz    = zero;
+         ////////////////////////////////////////////////////////////////////////////////////////
+         ////round off error test
+         //if(vvx!=zero){
+         //	(kDistTest.f[DIR_P00])[k] = mfabb;
+         //	(kDistTest.f[DIR_M00])[k] = mfcbb;
+         //	(kDistTest.f[DIR_0P0])[k] = mfbab;
+         //	(kDistTest.f[DIR_0M0])[k] = mfbcb;
+         //	(kDistTest.f[DIR_00P])[k] = mfbba;
+         //	(kDistTest.f[DIR_00M])[k] = mfbbc;
+         //	(kDistTest.f[DIR_PP0])[k] = mfaab;
+         //	(kDistTest.f[DIR_MM0])[k] = mfccb;
+         //	(kDistTest.f[DIR_PM0])[k] = mfacb;
+         //	(kDistTest.f[DIR_MP0])[k] = mfcab;
+         //	(kDistTest.f[DIR_P0P])[k] = mfaba;
+         //	(kDistTest.f[DIR_M0M])[k] = mfcbc;
+         //	(kDistTest.f[DIR_P0M])[k] = mfabc;
+         //	(kDistTest.f[DIR_M0P])[k] = mfcba;
+         //	(kDistTest.f[DIR_0PP])[k] = mfbaa;
+         //	(kDistTest.f[DIR_0MM])[k] = mfbcc;
+         //	(kDistTest.f[DIR_0PM])[k] = mfbac;
+         //	(kDistTest.f[DIR_0MP])[k] = mfbca;
+         //	(kDistTest.f[DIR_000])[k] = KQK;
+         //	(kDistTest.f[DIR_PPP])[k] = mfaaa;
+         //	(kDistTest.f[DIR_MMP])[k] = mfcca;
+         //	(kDistTest.f[DIR_PMP])[k] = mfaca;
+         //	(kDistTest.f[DIR_MPP])[k] = mfcaa;
+         //	(kDistTest.f[DIR_PPM])[k] = mfaac;
+         //	(kDistTest.f[DIR_MMM])[k] = mfccc;
+         //	(kDistTest.f[DIR_PMM])[k] = mfacc;
+         //	(kDistTest.f[DIR_MPM])[k] = mfcac;
+         //}else{
+         //	(kDistTest.f[DIR_P00])[k] = zero;
+         //	(kDistTest.f[DIR_M00])[k] = zero;
+         //	(kDistTest.f[DIR_0P0])[k] = zero;
+         //	(kDistTest.f[DIR_0M0])[k] = zero;
+         //	(kDistTest.f[DIR_00P])[k] = zero;
+         //	(kDistTest.f[DIR_00M])[k] = zero;
+         //	(kDistTest.f[DIR_PP0])[k] = zero;
+         //	(kDistTest.f[DIR_MM0])[k] = zero;
+         //	(kDistTest.f[DIR_PM0])[k] = zero;
+         //	(kDistTest.f[DIR_MP0])[k] = zero;
+         //	(kDistTest.f[DIR_P0P])[k] = zero;
+         //	(kDistTest.f[DIR_M0M])[k] = zero;
+         //	(kDistTest.f[DIR_P0M])[k] = zero;
+         //	(kDistTest.f[DIR_M0P])[k] = zero;
+         //	(kDistTest.f[DIR_0PP])[k] = zero;
+         //	(kDistTest.f[DIR_0MM])[k] = zero;
+         //	(kDistTest.f[DIR_0PM])[k] = zero;
+         //	(kDistTest.f[DIR_0MP])[k] = zero;
+         //	(kDistTest.f[DIR_000])[k] = zero;
+         //	(kDistTest.f[DIR_PPP])[k] = zero;
+         //	(kDistTest.f[DIR_MMP])[k] = zero;
+         //	(kDistTest.f[DIR_PMP])[k] = zero;
+         //	(kDistTest.f[DIR_MPP])[k] = zero;
+         //	(kDistTest.f[DIR_PPM])[k] = zero;
+         //	(kDistTest.f[DIR_MMM])[k] = zero;
+         //	(kDistTest.f[DIR_PMM])[k] = zero;
+         //	(kDistTest.f[DIR_MPM])[k] = zero;
+         //}
+
+         //////////////////////////////////////////////////////////////////////////////////////
+         //// first bad fix for negative x velocity
+         ////if(vvx > zero) vvx = zero;
+         //////////////////////////////////////////////////////////////////////////////////////
+         ////// second bad fix for negative x velocity
+         ////if(vvx > zero){
+         ////	vvx = -vvx;
+         ////	vvy = -vvy;
+         ////	vvz = -vvz;
+         ////}
+         ////////////////////////////////////////////////////////////////////////////////////
+         double vx2    = vvx * vvx;
+         double vy2    = vvy * vvy;
+         double vz2    = vvz * vvz;
+         //////////////////////////////////////////////////////////////////////////////////
+         //original
             real XXb    = -c2o3 + vx2;
             real XXc    = -c1o2 * (XXb + c1o1 + vvx);
             real XXa    = XXc + vvx;
@@ -4104,213 +4119,213 @@ __global__ void QPressDeviceEQZ27(real* rhoBC,
             real ZZb    = -c2o3 + vz2;
             real ZZc    = -c1o2 * (ZZb + c1o1 + vvz);
             real ZZa    = ZZc + vvz;
-			//////////////////////////////////////////////////////////////////////////////////
-			//unkonditioniert
-            mfcbb = -(rhoBC[k] + c1o1) * XXc * YYb * ZZb - c2o27; 
-			mfabb = -(rhoBC[k] + c1o1) * XXa * YYb * ZZb - c2o27;
-			mfbcb = -(rhoBC[k] + c1o1) * XXb * YYc * ZZb - c2o27;
-			mfbab = -(rhoBC[k] + c1o1) * XXb * YYa * ZZb - c2o27;
-			mfbbc = -(rhoBC[k] + c1o1) * XXb * YYb * ZZc - c2o27;
-			mfbba = -(rhoBC[k] + c1o1) * XXb * YYb * ZZa - c2o27;
-			mfccb = -(rhoBC[k] + c1o1) * XXc * YYc * ZZb - c1o54;
-			mfaab = -(rhoBC[k] + c1o1) * XXa * YYa * ZZb - c1o54;
-			mfcab = -(rhoBC[k] + c1o1) * XXc * YYa * ZZb - c1o54;
-			mfacb = -(rhoBC[k] + c1o1) * XXa * YYc * ZZb - c1o54;
-			mfcbc = -(rhoBC[k] + c1o1) * XXc * YYb * ZZc - c1o54;
-			mfaba = -(rhoBC[k] + c1o1) * XXa * YYb * ZZa - c1o54;
-			mfcba = -(rhoBC[k] + c1o1) * XXc * YYb * ZZa - c1o54;
-			mfabc = -(rhoBC[k] + c1o1) * XXa * YYb * ZZc - c1o54;
-			mfbcc = -(rhoBC[k] + c1o1) * XXb * YYc * ZZc - c1o54;
-			mfbaa = -(rhoBC[k] + c1o1) * XXb * YYa * ZZa - c1o54;
-			mfbca = -(rhoBC[k] + c1o1) * XXb * YYc * ZZa - c1o54;
-			mfbac = -(rhoBC[k] + c1o1) * XXb * YYa * ZZc - c1o54;
-			mfbbb = -(rhoBC[k] + c1o1) * XXb * YYb * ZZb - c8o27;
-			mfccc = -(rhoBC[k] + c1o1) * XXc * YYc * ZZc - c1o216;
-			mfaac = -(rhoBC[k] + c1o1) * XXa * YYa * ZZc - c1o216;
-			mfcac = -(rhoBC[k] + c1o1) * XXc * YYa * ZZc - c1o216;
-			mfacc = -(rhoBC[k] + c1o1) * XXa * YYc * ZZc - c1o216;
-			mfcca = -(rhoBC[k] + c1o1) * XXc * YYc * ZZa - c1o216;
-			mfaaa = -(rhoBC[k] + c1o1) * XXa * YYa * ZZa - c1o216;
-			mfcaa = -(rhoBC[k] + c1o1) * XXc * YYa * ZZa - c1o216;
-			mfaca = -(rhoBC[k] + c1o1) * XXa * YYc * ZZa - c1o216;
-			//////////////////////////////////////////////////////////
-			////konditioniert
-			//double OneOver216RhoPlusOne = c1over216*(rhoBC[k]+one);
-			//double OnoOver216Rho        = c1over216*rhoBC[k];
-			//mfcbb = OnoOver216Rho*sixteen + OneOver216RhoPlusOne*twelve*(-(two*vy2) - two*vz2 + three*vy2*vz2 + vvx*(-two + three*vy2)*(-two + three*vz2) + vx2*(-two + three*vy2)*(-two + three*vz2));
-			//mfabb = OnoOver216Rho*sixteen - OneOver216RhoPlusOne*twelve*(two*vy2 + two*vz2 - three*vy2*vz2 + vvx*(-two + three*vy2)*(-two + three*vz2) + vx2*(-four + six*vy2 + six*vz2 - nine*vy2*vz2));
-			//mfbcb = four*(-(four*OneOver216RhoPlusOne) + four*OnoOver216Rho + OneOver216RhoPlusOne*(-two + three*vx2)*(one + three*vvy + three*vy2)*(-two + three*vz2));
-			//mfbab = four*(four*OnoOver216Rho - OneOver216RhoPlusOne*three*(vvy*(-two + three*vx2)*(-two + three*vz2) - one*vx2*(one + three*vy2)*(-two + three*vz2) + two*(-(two*vy2) + vz2 + three*vy2*vz2)));
-			//mfbbc = four*(-(four*OneOver216RhoPlusOne) + four*OnoOver216Rho + OneOver216RhoPlusOne*(-two + three*vx2)*(-two + three*vy2)*(one + three*vvz + three*vz2));
-			//mfbba = four*(four*OnoOver216Rho - OneOver216RhoPlusOne*three*(vvz*(-two + three*vx2)*(-two + three*vy2) - one*vx2*(-two + three*vy2)*(one + three*vz2) + two*(vy2 - two*vz2 + three*vy2*vz2)));
-			//mfccb = -(two*(-(OnoOver216Rho*two) + OneOver216RhoPlusOne*three*(-(two*vx2) - two*vy2 - six*vx2*vy2 + vz2 + three*vx2*vz2 + three*vy2*vz2 + nine*vx2*vy2*vz2 + vvy*(one + three*vx2)*(-two + three*vz2) + vvx*(one + three*vvy + three*vy2)*(-two + three*vz2))));
-			//mfaab = -(two*(-(OnoOver216Rho*two) + OneOver216RhoPlusOne*three*(-(two*vx2) - two*vy2 - six*vx2*vy2 + vz2 + three*vx2*vz2 + three*vy2*vz2 + nine*vx2*vy2*vz2 - one*vvy*(one + three*vx2)*(-two + three*vz2) + vvx*(-one + three*vvy - three*vy2)*(-two + three*vz2))));
-			//mfcab = two*(OnoOver216Rho*two + OneOver216RhoPlusOne*three*(two*vx2 + two*vy2 + six*vx2*vy2 - one*vz2 - three*vx2*vz2 - three*vy2*vz2 - nine*vx2*vy2*vz2 + vvy*(one + three*vx2)*(-two + three*vz2) + vvx*(-one + three*vvy - three*vy2)*(-two + three*vz2)));
-			//mfacb = two*(OnoOver216Rho*two + OneOver216RhoPlusOne*three*(two*vx2 + two*vy2 + six*vx2*vy2 - one*vz2 - three*vx2*vz2 - three*vy2*vz2 - nine*vx2*vy2*vz2 - one*vvy*(one + three*vx2)*(-two + three*vz2) + vvx*(one + three*vvy + three*vy2)*(-two + three*vz2)));
-			//mfcbc = -(two*(-(OnoOver216Rho*two) + OneOver216RhoPlusOne*three*(-(two*vx2) + vy2 + three*vx2*vy2 + vvz*(one + three*vx2)*(-two + three*vy2) - two*vz2 - six*vx2*vz2 + three*vy2*vz2 + nine*vx2*vy2*vz2 + vvx*(-two + three*vy2)*(one + three*vvz + three*vz2))));
-			//mfaba = -(two*(-(OnoOver216Rho*two) + OneOver216RhoPlusOne*three*(-(two*vx2) + vy2 + three*vx2*vy2 - one*vvz*(one + three*vx2)*(-two + three*vy2) - two*vz2 - six*vx2*vz2 + three*vy2*vz2 + nine*vx2*vy2*vz2 + vvx*(-two + three*vy2)*(-one + three*vvz - three*vz2))));
-			//mfcba = two*(OnoOver216Rho*two + OneOver216RhoPlusOne*three*(two*vx2 - one*vy2 - three*vx2*vy2 + vvz*(one + three*vx2)*(-two + three*vy2) + two*vz2 + six*vx2*vz2 - three*vy2*vz2 - nine*vx2*vy2*vz2 + vvx*(-two + three*vy2)*(-one + three*vvz - three*vz2)));
-			//mfabc = two*(OnoOver216Rho*two + OneOver216RhoPlusOne*three*(two*vx2 - one*vy2 - three*vx2*vy2 - one*vvz*(one + three*vx2)*(-two + three*vy2) + two*vz2 + six*vx2*vz2 - three*vy2*vz2 - nine*vx2*vy2*vz2 + vvx*(-two + three*vy2)*(one + three*vvz + three*vz2)));
-			//mfbcc = -(two*(-(OnoOver216Rho*two) + OneOver216RhoPlusOne*three*(vx2 - two*vy2 + three*vx2*vy2 + vvz*(-two + three*vx2)*(one + three*vy2) - two*vz2 + three*vx2*vz2 - six*vy2*vz2 + nine*vx2*vy2*vz2 + vvy*(-two + three*vx2)*(one + three*vvz + three*vz2))));
-			//mfbaa = -(two*(-(OnoOver216Rho*two) + OneOver216RhoPlusOne*three*(vx2 - two*vy2 + three*vx2*vy2 - one*vvz*(-two + three*vx2)*(one + three*vy2) - two*vz2 + three*vx2*vz2 - six*vy2*vz2 + nine*vx2*vy2*vz2 + vvy*(-two + three*vx2)*(-one + three*vvz - three*vz2))));
-			//mfbca = two*(OnoOver216Rho*two + OneOver216RhoPlusOne*three*(-(one*vx2) + two*vy2 - three*vx2*vy2 + vvz*(-two + three*vx2)*(one + three*vy2) + two*vz2 - three*vx2*vz2 + six*vy2*vz2 - nine*vx2*vy2*vz2 + vvy*(-two + three*vx2)*(-one + three*vvz - three*vz2)));
-			//mfbac = two*(OnoOver216Rho*two + OneOver216RhoPlusOne*three*(-(one*vx2) + two*vy2 - three*vx2*vy2 - one*vvz*(-two + three*vx2)*(one + three*vy2) + two*vz2 - three*vx2*vz2 + six*vy2*vz2 - nine*vx2*vy2*vz2 + vvy*(-two + three*vx2)*(one + three*vvz + three*vz2)));
-			//mfbbb = eight*(eight*OnoOver216Rho + OneOver216RhoPlusOne*three*(four*vy2 + four*vz2 - six*vy2*vz2 + vx2*(-two + three*vy2)*(-two + three*vz2)));
-			//mfccc = OnoOver216Rho + OneOver216RhoPlusOne*three*(vvz + vx2 + three*vvz*vx2 + vy2 + three*vvz*vy2 + three*vx2*vy2 + nine*vvz*vx2*vy2 + vz2 + three*vx2*vz2 + three*vy2*vz2 + nine*vx2*vy2*vz2 + vvy*(one + three*vx2)*(one + three*vvz + three*vz2) + vvx*(one + three*vvy + three*vy2)*(one + three*vvz + three*vz2));
-			//mfaac = OnoOver216Rho + OneOver216RhoPlusOne*three*(vvz + vx2 + three*vvz*vx2 + vy2 + three*vvz*vy2 + three*vx2*vy2 + nine*vvz*vx2*vy2 + vz2 + three*vx2*vz2 + three*vy2*vz2 + nine*vx2*vy2*vz2 - one*vvy*(one + three*vx2)*(one + three*vvz + three*vz2) + vvx*(-one + three*vvy - three*vy2)*(one + three*vvz + three*vz2));
-			//mfcac = OnoOver216Rho + OneOver216RhoPlusOne*three*(vvz + vx2 + three*vvz*vx2 + vy2 + three*vvz*vy2 + three*vx2*vy2 + nine*vvz*vx2*vy2 + vz2 + three*vx2*vz2 + three*vy2*vz2 + nine*vx2*vy2*vz2 - one*vvy*(one + three*vx2)*(one + three*vvz + three*vz2) - one*vvx*(-one + three*vvy - three*vy2)*(one + three*vvz + three*vz2));
-			//mfacc = OnoOver216Rho + OneOver216RhoPlusOne*three*(vvz + vx2 + three*vvz*vx2 + vy2 + three*vvz*vy2 + three*vx2*vy2 + nine*vvz*vx2*vy2 + vz2 + three*vx2*vz2 + three*vy2*vz2 + nine*vx2*vy2*vz2 + vvy*(one + three*vx2)*(one + three*vvz + three*vz2) - one*vvx*(one + three*vvy + three*vy2)*(one + three*vvz + three*vz2));
-			//mfcca = OnoOver216Rho + OneOver216RhoPlusOne*three*(-(one*vvz) + vx2 - three*vvz*vx2 + vy2 - three*vvz*vy2 + three*vx2*vy2 - nine*vvz*vx2*vy2 + vz2 + three*vx2*vz2 + three*vy2*vz2 + nine*vx2*vy2*vz2 - one*vvy*(one + three*vx2)*(-one + three*vvz - three*vz2) - one*vvx*(one + three*vvy + three*vy2)*(-one + three*vvz - three*vz2));
-			//mfaaa = OnoOver216Rho - OneOver216RhoPlusOne*three*(vvz - one*vx2 + three*vvz*vx2 - one*vy2 + three*vvz*vy2 - three*vx2*vy2 + nine*vvz*vx2*vy2 - one*vz2 - three*vx2*vz2 - three*vy2*vz2 - nine*vx2*vy2*vz2 - one*vvy*(one + three*vx2)*(-one + three*vvz - three*vz2) + vvx*(-one + three*vvy - three*vy2)*(-one + three*vvz - three*vz2));
-			//mfcaa = OnoOver216Rho + OneOver216RhoPlusOne*three*(-(one*vvz) + vx2 - three*vvz*vx2 + vy2 - three*vvz*vy2 + three*vx2*vy2 - nine*vvz*vx2*vy2 + vz2 + three*vx2*vz2 + three*vy2*vz2 + nine*vx2*vy2*vz2 + vvy*(one + three*vx2)*(-one + three*vvz - three*vz2) + vvx*(-one + three*vvy - three*vy2)*(-one + three*vvz - three*vz2));
-			//mfaca = OnoOver216Rho + OneOver216RhoPlusOne*three*(-(one*vvz) + vx2 - three*vvz*vx2 + vy2 - three*vvz*vy2 + three*vx2*vy2 - nine*vvz*vx2*vy2 + vz2 + three*vx2*vz2 + three*vy2*vz2 + nine*vx2*vy2*vz2 - one*vvy*(one + three*vx2)*(-one + three*vvz - three*vz2) + vvx*(one + three*vvy + three*vy2)*(-one + three*vvz - three*vz2));
+         //////////////////////////////////////////////////////////////////////////////////
+         //unkonditioniert
+            mfcbb = -(rhoBC[k] + c1o1) * XXc * YYb * ZZb - c2o27;
+         mfabb = -(rhoBC[k] + c1o1) * XXa * YYb * ZZb - c2o27;
+         mfbcb = -(rhoBC[k] + c1o1) * XXb * YYc * ZZb - c2o27;
+         mfbab = -(rhoBC[k] + c1o1) * XXb * YYa * ZZb - c2o27;
+         mfbbc = -(rhoBC[k] + c1o1) * XXb * YYb * ZZc - c2o27;
+         mfbba = -(rhoBC[k] + c1o1) * XXb * YYb * ZZa - c2o27;
+         mfccb = -(rhoBC[k] + c1o1) * XXc * YYc * ZZb - c1o54;
+         mfaab = -(rhoBC[k] + c1o1) * XXa * YYa * ZZb - c1o54;
+         mfcab = -(rhoBC[k] + c1o1) * XXc * YYa * ZZb - c1o54;
+         mfacb = -(rhoBC[k] + c1o1) * XXa * YYc * ZZb - c1o54;
+         mfcbc = -(rhoBC[k] + c1o1) * XXc * YYb * ZZc - c1o54;
+         mfaba = -(rhoBC[k] + c1o1) * XXa * YYb * ZZa - c1o54;
+         mfcba = -(rhoBC[k] + c1o1) * XXc * YYb * ZZa - c1o54;
+         mfabc = -(rhoBC[k] + c1o1) * XXa * YYb * ZZc - c1o54;
+         mfbcc = -(rhoBC[k] + c1o1) * XXb * YYc * ZZc - c1o54;
+         mfbaa = -(rhoBC[k] + c1o1) * XXb * YYa * ZZa - c1o54;
+         mfbca = -(rhoBC[k] + c1o1) * XXb * YYc * ZZa - c1o54;
+         mfbac = -(rhoBC[k] + c1o1) * XXb * YYa * ZZc - c1o54;
+         mfbbb = -(rhoBC[k] + c1o1) * XXb * YYb * ZZb - c8o27;
+         mfccc = -(rhoBC[k] + c1o1) * XXc * YYc * ZZc - c1o216;
+         mfaac = -(rhoBC[k] + c1o1) * XXa * YYa * ZZc - c1o216;
+         mfcac = -(rhoBC[k] + c1o1) * XXc * YYa * ZZc - c1o216;
+         mfacc = -(rhoBC[k] + c1o1) * XXa * YYc * ZZc - c1o216;
+         mfcca = -(rhoBC[k] + c1o1) * XXc * YYc * ZZa - c1o216;
+         mfaaa = -(rhoBC[k] + c1o1) * XXa * YYa * ZZa - c1o216;
+         mfcaa = -(rhoBC[k] + c1o1) * XXc * YYa * ZZa - c1o216;
+         mfaca = -(rhoBC[k] + c1o1) * XXa * YYc * ZZa - c1o216;
+         //////////////////////////////////////////////////////////
+         ////konditioniert
+         //double OneOver216RhoPlusOne = c1over216*(rhoBC[k]+one);
+         //double OnoOver216Rho        = c1over216*rhoBC[k];
+         //mfcbb = OnoOver216Rho*sixteen + OneOver216RhoPlusOne*twelve*(-(two*vy2) - two*vz2 + three*vy2*vz2 + vvx*(-two + three*vy2)*(-two + three*vz2) + vx2*(-two + three*vy2)*(-two + three*vz2));
+         //mfabb = OnoOver216Rho*sixteen - OneOver216RhoPlusOne*twelve*(two*vy2 + two*vz2 - three*vy2*vz2 + vvx*(-two + three*vy2)*(-two + three*vz2) + vx2*(-four + six*vy2 + six*vz2 - nine*vy2*vz2));
+         //mfbcb = four*(-(four*OneOver216RhoPlusOne) + four*OnoOver216Rho + OneOver216RhoPlusOne*(-two + three*vx2)*(one + three*vvy + three*vy2)*(-two + three*vz2));
+         //mfbab = four*(four*OnoOver216Rho - OneOver216RhoPlusOne*three*(vvy*(-two + three*vx2)*(-two + three*vz2) - one*vx2*(one + three*vy2)*(-two + three*vz2) + two*(-(two*vy2) + vz2 + three*vy2*vz2)));
+         //mfbbc = four*(-(four*OneOver216RhoPlusOne) + four*OnoOver216Rho + OneOver216RhoPlusOne*(-two + three*vx2)*(-two + three*vy2)*(one + three*vvz + three*vz2));
+         //mfbba = four*(four*OnoOver216Rho - OneOver216RhoPlusOne*three*(vvz*(-two + three*vx2)*(-two + three*vy2) - one*vx2*(-two + three*vy2)*(one + three*vz2) + two*(vy2 - two*vz2 + three*vy2*vz2)));
+         //mfccb = -(two*(-(OnoOver216Rho*two) + OneOver216RhoPlusOne*three*(-(two*vx2) - two*vy2 - six*vx2*vy2 + vz2 + three*vx2*vz2 + three*vy2*vz2 + nine*vx2*vy2*vz2 + vvy*(one + three*vx2)*(-two + three*vz2) + vvx*(one + three*vvy + three*vy2)*(-two + three*vz2))));
+         //mfaab = -(two*(-(OnoOver216Rho*two) + OneOver216RhoPlusOne*three*(-(two*vx2) - two*vy2 - six*vx2*vy2 + vz2 + three*vx2*vz2 + three*vy2*vz2 + nine*vx2*vy2*vz2 - one*vvy*(one + three*vx2)*(-two + three*vz2) + vvx*(-one + three*vvy - three*vy2)*(-two + three*vz2))));
+         //mfcab = two*(OnoOver216Rho*two + OneOver216RhoPlusOne*three*(two*vx2 + two*vy2 + six*vx2*vy2 - one*vz2 - three*vx2*vz2 - three*vy2*vz2 - nine*vx2*vy2*vz2 + vvy*(one + three*vx2)*(-two + three*vz2) + vvx*(-one + three*vvy - three*vy2)*(-two + three*vz2)));
+         //mfacb = two*(OnoOver216Rho*two + OneOver216RhoPlusOne*three*(two*vx2 + two*vy2 + six*vx2*vy2 - one*vz2 - three*vx2*vz2 - three*vy2*vz2 - nine*vx2*vy2*vz2 - one*vvy*(one + three*vx2)*(-two + three*vz2) + vvx*(one + three*vvy + three*vy2)*(-two + three*vz2)));
+         //mfcbc = -(two*(-(OnoOver216Rho*two) + OneOver216RhoPlusOne*three*(-(two*vx2) + vy2 + three*vx2*vy2 + vvz*(one + three*vx2)*(-two + three*vy2) - two*vz2 - six*vx2*vz2 + three*vy2*vz2 + nine*vx2*vy2*vz2 + vvx*(-two + three*vy2)*(one + three*vvz + three*vz2))));
+         //mfaba = -(two*(-(OnoOver216Rho*two) + OneOver216RhoPlusOne*three*(-(two*vx2) + vy2 + three*vx2*vy2 - one*vvz*(one + three*vx2)*(-two + three*vy2) - two*vz2 - six*vx2*vz2 + three*vy2*vz2 + nine*vx2*vy2*vz2 + vvx*(-two + three*vy2)*(-one + three*vvz - three*vz2))));
+         //mfcba = two*(OnoOver216Rho*two + OneOver216RhoPlusOne*three*(two*vx2 - one*vy2 - three*vx2*vy2 + vvz*(one + three*vx2)*(-two + three*vy2) + two*vz2 + six*vx2*vz2 - three*vy2*vz2 - nine*vx2*vy2*vz2 + vvx*(-two + three*vy2)*(-one + three*vvz - three*vz2)));
+         //mfabc = two*(OnoOver216Rho*two + OneOver216RhoPlusOne*three*(two*vx2 - one*vy2 - three*vx2*vy2 - one*vvz*(one + three*vx2)*(-two + three*vy2) + two*vz2 + six*vx2*vz2 - three*vy2*vz2 - nine*vx2*vy2*vz2 + vvx*(-two + three*vy2)*(one + three*vvz + three*vz2)));
+         //mfbcc = -(two*(-(OnoOver216Rho*two) + OneOver216RhoPlusOne*three*(vx2 - two*vy2 + three*vx2*vy2 + vvz*(-two + three*vx2)*(one + three*vy2) - two*vz2 + three*vx2*vz2 - six*vy2*vz2 + nine*vx2*vy2*vz2 + vvy*(-two + three*vx2)*(one + three*vvz + three*vz2))));
+         //mfbaa = -(two*(-(OnoOver216Rho*two) + OneOver216RhoPlusOne*three*(vx2 - two*vy2 + three*vx2*vy2 - one*vvz*(-two + three*vx2)*(one + three*vy2) - two*vz2 + three*vx2*vz2 - six*vy2*vz2 + nine*vx2*vy2*vz2 + vvy*(-two + three*vx2)*(-one + three*vvz - three*vz2))));
+         //mfbca = two*(OnoOver216Rho*two + OneOver216RhoPlusOne*three*(-(one*vx2) + two*vy2 - three*vx2*vy2 + vvz*(-two + three*vx2)*(one + three*vy2) + two*vz2 - three*vx2*vz2 + six*vy2*vz2 - nine*vx2*vy2*vz2 + vvy*(-two + three*vx2)*(-one + three*vvz - three*vz2)));
+         //mfbac = two*(OnoOver216Rho*two + OneOver216RhoPlusOne*three*(-(one*vx2) + two*vy2 - three*vx2*vy2 - one*vvz*(-two + three*vx2)*(one + three*vy2) + two*vz2 - three*vx2*vz2 + six*vy2*vz2 - nine*vx2*vy2*vz2 + vvy*(-two + three*vx2)*(one + three*vvz + three*vz2)));
+         //mfbbb = eight*(eight*OnoOver216Rho + OneOver216RhoPlusOne*three*(four*vy2 + four*vz2 - six*vy2*vz2 + vx2*(-two + three*vy2)*(-two + three*vz2)));
+         //mfccc = OnoOver216Rho + OneOver216RhoPlusOne*three*(vvz + vx2 + three*vvz*vx2 + vy2 + three*vvz*vy2 + three*vx2*vy2 + nine*vvz*vx2*vy2 + vz2 + three*vx2*vz2 + three*vy2*vz2 + nine*vx2*vy2*vz2 + vvy*(one + three*vx2)*(one + three*vvz + three*vz2) + vvx*(one + three*vvy + three*vy2)*(one + three*vvz + three*vz2));
+         //mfaac = OnoOver216Rho + OneOver216RhoPlusOne*three*(vvz + vx2 + three*vvz*vx2 + vy2 + three*vvz*vy2 + three*vx2*vy2 + nine*vvz*vx2*vy2 + vz2 + three*vx2*vz2 + three*vy2*vz2 + nine*vx2*vy2*vz2 - one*vvy*(one + three*vx2)*(one + three*vvz + three*vz2) + vvx*(-one + three*vvy - three*vy2)*(one + three*vvz + three*vz2));
+         //mfcac = OnoOver216Rho + OneOver216RhoPlusOne*three*(vvz + vx2 + three*vvz*vx2 + vy2 + three*vvz*vy2 + three*vx2*vy2 + nine*vvz*vx2*vy2 + vz2 + three*vx2*vz2 + three*vy2*vz2 + nine*vx2*vy2*vz2 - one*vvy*(one + three*vx2)*(one + three*vvz + three*vz2) - one*vvx*(-one + three*vvy - three*vy2)*(one + three*vvz + three*vz2));
+         //mfacc = OnoOver216Rho + OneOver216RhoPlusOne*three*(vvz + vx2 + three*vvz*vx2 + vy2 + three*vvz*vy2 + three*vx2*vy2 + nine*vvz*vx2*vy2 + vz2 + three*vx2*vz2 + three*vy2*vz2 + nine*vx2*vy2*vz2 + vvy*(one + three*vx2)*(one + three*vvz + three*vz2) - one*vvx*(one + three*vvy + three*vy2)*(one + three*vvz + three*vz2));
+         //mfcca = OnoOver216Rho + OneOver216RhoPlusOne*three*(-(one*vvz) + vx2 - three*vvz*vx2 + vy2 - three*vvz*vy2 + three*vx2*vy2 - nine*vvz*vx2*vy2 + vz2 + three*vx2*vz2 + three*vy2*vz2 + nine*vx2*vy2*vz2 - one*vvy*(one + three*vx2)*(-one + three*vvz - three*vz2) - one*vvx*(one + three*vvy + three*vy2)*(-one + three*vvz - three*vz2));
+         //mfaaa = OnoOver216Rho - OneOver216RhoPlusOne*three*(vvz - one*vx2 + three*vvz*vx2 - one*vy2 + three*vvz*vy2 - three*vx2*vy2 + nine*vvz*vx2*vy2 - one*vz2 - three*vx2*vz2 - three*vy2*vz2 - nine*vx2*vy2*vz2 - one*vvy*(one + three*vx2)*(-one + three*vvz - three*vz2) + vvx*(-one + three*vvy - three*vy2)*(-one + three*vvz - three*vz2));
+         //mfcaa = OnoOver216Rho + OneOver216RhoPlusOne*three*(-(one*vvz) + vx2 - three*vvz*vx2 + vy2 - three*vvz*vy2 + three*vx2*vy2 - nine*vvz*vx2*vy2 + vz2 + three*vx2*vz2 + three*vy2*vz2 + nine*vx2*vy2*vz2 + vvy*(one + three*vx2)*(-one + three*vvz - three*vz2) + vvx*(-one + three*vvy - three*vy2)*(-one + three*vvz - three*vz2));
+         //mfaca = OnoOver216Rho + OneOver216RhoPlusOne*three*(-(one*vvz) + vx2 - three*vvz*vx2 + vy2 - three*vvz*vy2 + three*vx2*vy2 - nine*vvz*vx2*vy2 + vz2 + three*vx2*vz2 + three*vy2*vz2 + nine*vx2*vy2*vz2 - one*vvy*(one + three*vx2)*(-one + three*vvz - three*vz2) + vvx*(one + three*vvy + three*vy2)*(-one + three*vvz - three*vz2));
 
       //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       //if (isEvenTimestep==true)
       //{
-      //   D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-      //   D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-      //   D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-      //   D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-      //   D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-      //   D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-      //   D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-      //   D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-      //   D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-      //   D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-      //   D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-      //   D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-      //   D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-      //   D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-      //   D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-      //   D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-      //   D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-      //   D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-      //   D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      //   D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-      //   D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-      //   D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-      //   D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-      //   D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-      //   D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-      //   D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-      //   D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
-      //} 
+      //   D.f[DIR_P00] = &DD[DIR_P00 * size_Mat];
+      //   D.f[DIR_M00] = &DD[DIR_M00 * size_Mat];
+      //   D.f[DIR_0P0] = &DD[DIR_0P0 * size_Mat];
+      //   D.f[DIR_0M0] = &DD[DIR_0M0 * size_Mat];
+      //   D.f[DIR_00P] = &DD[DIR_00P * size_Mat];
+      //   D.f[DIR_00M] = &DD[DIR_00M * size_Mat];
+      //   D.f[DIR_PP0] = &DD[DIR_PP0 * size_Mat];
+      //   D.f[DIR_MM0] = &DD[DIR_MM0 * size_Mat];
+      //   D.f[DIR_PM0] = &DD[DIR_PM0 * size_Mat];
+      //   D.f[DIR_MP0] = &DD[DIR_MP0 * size_Mat];
+      //   D.f[DIR_P0P] = &DD[DIR_P0P * size_Mat];
+      //   D.f[DIR_M0M] = &DD[DIR_M0M * size_Mat];
+      //   D.f[DIR_P0M] = &DD[DIR_P0M * size_Mat];
+      //   D.f[DIR_M0P] = &DD[DIR_M0P * size_Mat];
+      //   D.f[DIR_0PP] = &DD[DIR_0PP * size_Mat];
+      //   D.f[DIR_0MM] = &DD[DIR_0MM * size_Mat];
+      //   D.f[DIR_0PM] = &DD[DIR_0PM * size_Mat];
+      //   D.f[DIR_0MP] = &DD[DIR_0MP * size_Mat];
+      //   D.f[DIR_000] = &DD[DIR_000 * size_Mat];
+      //   D.f[DIR_PPP] = &DD[DIR_PPP * size_Mat];
+      //   D.f[DIR_MMP] = &DD[DIR_MMP * size_Mat];
+      //   D.f[DIR_PMP] = &DD[DIR_PMP * size_Mat];
+      //   D.f[DIR_MPP] = &DD[DIR_MPP * size_Mat];
+      //   D.f[DIR_PPM] = &DD[DIR_PPM * size_Mat];
+      //   D.f[DIR_MMM] = &DD[DIR_MMM * size_Mat];
+      //   D.f[DIR_PMM] = &DD[DIR_PMM * size_Mat];
+      //   D.f[DIR_MPM] = &DD[DIR_MPM * size_Mat];
+      //}
       //else
       //{
-      //   D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-      //   D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-      //   D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-      //   D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-      //   D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-      //   D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-      //   D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-      //   D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-      //   D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-      //   D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-      //   D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-      //   D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-      //   D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-      //   D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-      //   D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-      //   D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-      //   D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-      //   D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-      //   D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      //   D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-      //   D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-      //   D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-      //   D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-      //   D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-      //   D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-      //   D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-      //   D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+      //   D.f[DIR_M00] = &DD[DIR_P00 * size_Mat];
+      //   D.f[DIR_P00] = &DD[DIR_M00 * size_Mat];
+      //   D.f[DIR_0M0] = &DD[DIR_0P0 * size_Mat];
+      //   D.f[DIR_0P0] = &DD[DIR_0M0 * size_Mat];
+      //   D.f[DIR_00M] = &DD[DIR_00P * size_Mat];
+      //   D.f[DIR_00P] = &DD[DIR_00M * size_Mat];
+      //   D.f[DIR_MM0] = &DD[DIR_PP0 * size_Mat];
+      //   D.f[DIR_PP0] = &DD[DIR_MM0 * size_Mat];
+      //   D.f[DIR_MP0] = &DD[DIR_PM0 * size_Mat];
+      //   D.f[DIR_PM0] = &DD[DIR_MP0 * size_Mat];
+      //   D.f[DIR_M0M] = &DD[DIR_P0P * size_Mat];
+      //   D.f[DIR_P0P] = &DD[DIR_M0M * size_Mat];
+      //   D.f[DIR_M0P] = &DD[DIR_P0M * size_Mat];
+      //   D.f[DIR_P0M] = &DD[DIR_M0P * size_Mat];
+      //   D.f[DIR_0MM] = &DD[DIR_0PP * size_Mat];
+      //   D.f[DIR_0PP] = &DD[DIR_0MM * size_Mat];
+      //   D.f[DIR_0MP] = &DD[DIR_0PM * size_Mat];
+      //   D.f[DIR_0PM] = &DD[DIR_0MP * size_Mat];
+      //   D.f[DIR_000] = &DD[DIR_000 * size_Mat];
+      //   D.f[DIR_PPP] = &DD[DIR_MMM * size_Mat];
+      //   D.f[DIR_MMP] = &DD[DIR_PPM * size_Mat];
+      //   D.f[DIR_PMP] = &DD[DIR_MPM * size_Mat];
+      //   D.f[DIR_MPP] = &DD[DIR_PMM * size_Mat];
+      //   D.f[DIR_PPM] = &DD[DIR_MMP * size_Mat];
+      //   D.f[DIR_MMM] = &DD[DIR_PPP * size_Mat];
+      //   D.f[DIR_PMM] = &DD[DIR_MPP * size_Mat];
+      //   D.f[DIR_MPM] = &DD[DIR_PMP * size_Mat];
       //}
       //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       //__syncthreads();
 
-			(D.f[DIR_P00   ])[ke   ] = mfabb;//mfcbb;
-			(D.f[DIR_M00   ])[kw   ] = mfcbb;//mfabb;
-			(D.f[DIR_0P0   ])[kn   ] = mfbab;//mfbcb;
-			(D.f[DIR_0M0   ])[ks   ] = mfbcb;//mfbab;
-			(D.f[DIR_00P   ])[kt   ] = mfbba;//mfbbc;
-			(D.f[DIR_00M   ])[kb   ] = mfbbc;//mfbba;
-			(D.f[DIR_PP0  ])[kne  ] = mfaab;//mfccb;
-			(D.f[DIR_MM0  ])[ksw  ] = mfccb;//mfaab;
-			(D.f[DIR_PM0  ])[kse  ] = mfacb;//mfcab;
-			(D.f[DIR_MP0  ])[knw  ] = mfcab;//mfacb;
-			(D.f[DIR_P0P  ])[kte  ] = mfaba;//mfcbc;
-			(D.f[DIR_M0M  ])[kbw  ] = mfcbc;//mfaba;
-			(D.f[DIR_P0M  ])[kbe  ] = mfabc;//mfcba;
-			(D.f[DIR_M0P  ])[ktw  ] = mfcba;//mfabc;
-			(D.f[DIR_0PP  ])[ktn  ] = mfbaa;//mfbcc;
-			(D.f[DIR_0MM  ])[kbs  ] = mfbcc;//mfbaa;
-			(D.f[DIR_0PM  ])[kbn  ] = mfbac;//mfbca;
-			(D.f[DIR_0MP  ])[kts  ] = mfbca;//mfbac;
-			(D.f[DIR_000])[kzero] = mfbbb;//mfbbb;
-			(D.f[DIR_PPP ])[ktne ] = mfaaa;//mfccc;
-			(D.f[DIR_MMP ])[ktsw ] = mfcca;//mfaac;
-			(D.f[DIR_PMP ])[ktse ] = mfaca;//mfcac;
-			(D.f[DIR_MPP ])[ktnw ] = mfcaa;//mfacc;
-			(D.f[DIR_PPM ])[kbne ] = mfaac;//mfcca;
-			(D.f[DIR_MMM ])[kbsw ] = mfccc;//mfaaa;
-			(D.f[DIR_PMM ])[kbse ] = mfacc;//mfcaa;
-			(D.f[DIR_MPM ])[kbnw ] = mfcac;//mfaca;
-			//(D.f[DIR_P00   ])[ke   ] = mfcbb;
-			//(D.f[DIR_M00   ])[kw   ] = mfabb;
-			//(D.f[DIR_0P0   ])[kn   ] = mfbcb;
-			//(D.f[DIR_0M0   ])[ks   ] = mfbab;
-			//(D.f[DIR_00P   ])[kt   ] = mfbbc;
-			//(D.f[DIR_00M   ])[kb   ] = mfbba;
-			//(D.f[DIR_PP0  ])[kne  ] = mfccb;
-			//(D.f[DIR_MM0  ])[ksw  ] = mfaab;
-			//(D.f[DIR_PM0  ])[kse  ] = mfcab;
-			//(D.f[DIR_MP0  ])[knw  ] = mfacb;
-			//(D.f[DIR_P0P  ])[kte  ] = mfcbc;
-			//(D.f[DIR_M0M  ])[kbw  ] = mfaba;
-			//(D.f[DIR_P0M  ])[kbe  ] = mfcba;
-			//(D.f[DIR_M0P  ])[ktw  ] = mfabc;
-			//(D.f[DIR_0PP  ])[ktn  ] = mfbcc;
-			//(D.f[DIR_0MM  ])[kbs  ] = mfbaa;
-			//(D.f[DIR_0PM  ])[kbn  ] = mfbca;
-			//(D.f[DIR_0MP  ])[kts  ] = mfbac;
-			//(D.f[DIR_000])[kzero] = mfbbb;
-			//(D.f[DIR_PPP ])[ktne ] = mfccc;
-			//(D.f[DIR_MMP ])[ktsw ] = mfaac;
-			//(D.f[DIR_PMP ])[ktse ] = mfcac;
-			//(D.f[DIR_MPP ])[ktnw ] = mfacc;
-			//(D.f[DIR_PPM ])[kbne ] = mfcca;
-			//(D.f[DIR_MMM ])[kbsw ] = mfaaa;
-			//(D.f[DIR_PMM ])[kbse ] = mfcaa;
-			//(D.f[DIR_MPM ])[kbnw ] = mfaca;
-
-      //(D.f[DIR_P00   ])[ke   ] = fE ;  //f1_E ;   //fW;    //fE ;  
-      //(D.f[DIR_M00   ])[kw   ] = fW ;  //f1_W ;   //fE;    //fW ;  
-      //(D.f[DIR_0P0   ])[kn   ] = fN ;  //f1_N ;   //fS;    //fN ;  
-      //(D.f[DIR_0M0   ])[ks   ] = fS ;  //f1_S ;   //fN;    //fS ;  
-      //(D.f[DIR_00P   ])[kt   ] = fT ;  //f1_T ;   //fB;    //fT ;  
-      //(D.f[DIR_00M   ])[kb   ] = fB ;  //f1_B ;   //fT;    //fB ;  
-      //(D.f[DIR_PP0  ])[kne  ] = fNE;  //f1_NE;   //fSW;   //fNE;  
-      //(D.f[DIR_MM0  ])[ksw  ] = fSW;  //f1_SW;   //fNE;   //fSW;  
-      //(D.f[DIR_PM0  ])[kse  ] = fSE;  //f1_SE;   //fNW;   //fSE;  
-      //(D.f[DIR_MP0  ])[knw  ] = fNW;  //f1_NW;   //fSE;   //fNW;  
-      //(D.f[DIR_P0P  ])[kte  ] = fTE;  //f1_TE;   //fBW;   //fTE;  
-      //(D.f[DIR_M0M  ])[kbw  ] = fBW;  //f1_BW;   //fTE;   //fBW;  
-      //(D.f[DIR_P0M  ])[kbe  ] = fBE;  //f1_BE;   //fTW;   //fBE;  
-      //(D.f[DIR_M0P  ])[ktw  ] = fTW;  //f1_TW;   //fBE;   //fTW;  
-      //(D.f[DIR_0PP  ])[ktn  ] = fTN;  //f1_TN;   //fBS;   //fTN;  
-      //(D.f[DIR_0MM  ])[kbs  ] = fBS;  //f1_BS;   //fTN;   //fBS;  
-      //(D.f[DIR_0PM  ])[kbn  ] = fBN;  //f1_BN;   //fTS;   //fBN;  
-      //(D.f[DIR_0MP  ])[kts  ] = fTS;  //f1_TS;   //fBN;   //fTS;  
+         (D.f[DIR_P00])[ke   ] = mfabb;//mfcbb;
+         (D.f[DIR_M00])[kw   ] = mfcbb;//mfabb;
+         (D.f[DIR_0P0])[kn   ] = mfbab;//mfbcb;
+         (D.f[DIR_0M0])[ks   ] = mfbcb;//mfbab;
+         (D.f[DIR_00P])[kt   ] = mfbba;//mfbbc;
+         (D.f[DIR_00M])[kb   ] = mfbbc;//mfbba;
+         (D.f[DIR_PP0])[kne  ] = mfaab;//mfccb;
+         (D.f[DIR_MM0])[ksw  ] = mfccb;//mfaab;
+         (D.f[DIR_PM0])[kse  ] = mfacb;//mfcab;
+         (D.f[DIR_MP0])[knw  ] = mfcab;//mfacb;
+         (D.f[DIR_P0P])[kte  ] = mfaba;//mfcbc;
+         (D.f[DIR_M0M])[kbw  ] = mfcbc;//mfaba;
+         (D.f[DIR_P0M])[kbe  ] = mfabc;//mfcba;
+         (D.f[DIR_M0P])[ktw  ] = mfcba;//mfabc;
+         (D.f[DIR_0PP])[ktn  ] = mfbaa;//mfbcc;
+         (D.f[DIR_0MM])[kbs  ] = mfbcc;//mfbaa;
+         (D.f[DIR_0PM])[kbn  ] = mfbac;//mfbca;
+         (D.f[DIR_0MP])[kts  ] = mfbca;//mfbac;
+         (D.f[DIR_000])[kzero] = mfbbb;//mfbbb;
+         (D.f[DIR_PPP])[ktne ] = mfaaa;//mfccc;
+         (D.f[DIR_MMP])[ktsw ] = mfcca;//mfaac;
+         (D.f[DIR_PMP])[ktse ] = mfaca;//mfcac;
+         (D.f[DIR_MPP])[ktnw ] = mfcaa;//mfacc;
+         (D.f[DIR_PPM])[kbne ] = mfaac;//mfcca;
+         (D.f[DIR_MMM])[kbsw ] = mfccc;//mfaaa;
+         (D.f[DIR_PMM])[kbse ] = mfacc;//mfcaa;
+         (D.f[DIR_MPM])[kbnw ] = mfcac;//mfaca;
+         //(D.f[DIR_P00])[ke   ] = mfcbb;
+         //(D.f[DIR_M00])[kw   ] = mfabb;
+         //(D.f[DIR_0P0])[kn   ] = mfbcb;
+         //(D.f[DIR_0M0])[ks   ] = mfbab;
+         //(D.f[DIR_00P])[kt   ] = mfbbc;
+         //(D.f[DIR_00M])[kb   ] = mfbba;
+         //(D.f[DIR_PP0])[kne  ] = mfccb;
+         //(D.f[DIR_MM0])[ksw  ] = mfaab;
+         //(D.f[DIR_PM0])[kse  ] = mfcab;
+         //(D.f[DIR_MP0])[knw  ] = mfacb;
+         //(D.f[DIR_P0P])[kte  ] = mfcbc;
+         //(D.f[DIR_M0M])[kbw  ] = mfaba;
+         //(D.f[DIR_P0M])[kbe  ] = mfcba;
+         //(D.f[DIR_M0P])[ktw  ] = mfabc;
+         //(D.f[DIR_0PP])[ktn  ] = mfbcc;
+         //(D.f[DIR_0MM])[kbs  ] = mfbaa;
+         //(D.f[DIR_0PM])[kbn  ] = mfbca;
+         //(D.f[DIR_0MP])[kts  ] = mfbac;
+         //(D.f[DIR_000])[kzero] = mfbbb;
+         //(D.f[DIR_PPP])[ktne ] = mfccc;
+         //(D.f[DIR_MMP])[ktsw ] = mfaac;
+         //(D.f[DIR_PMP])[ktse ] = mfcac;
+         //(D.f[DIR_MPP])[ktnw ] = mfacc;
+         //(D.f[DIR_PPM])[kbne ] = mfcca;
+         //(D.f[DIR_MMM])[kbsw ] = mfaaa;
+         //(D.f[DIR_PMM])[kbse ] = mfcaa;
+         //(D.f[DIR_MPM])[kbnw ] = mfaca;
+
+      //(D.f[DIR_P00])[ke   ] = fE ;  //f1_E ;   //fW;    //fE ;
+      //(D.f[DIR_M00])[kw   ] = fW ;  //f1_W ;   //fE;    //fW ;
+      //(D.f[DIR_0P0])[kn   ] = fN ;  //f1_N ;   //fS;    //fN ;
+      //(D.f[DIR_0M0])[ks   ] = fS ;  //f1_S ;   //fN;    //fS ;
+      //(D.f[DIR_00P])[kt   ] = fT ;  //f1_T ;   //fB;    //fT ;
+      //(D.f[DIR_00M])[kb   ] = fB ;  //f1_B ;   //fT;    //fB ;
+      //(D.f[DIR_PP0])[kne  ] = fNE;  //f1_NE;   //fSW;   //fNE;
+      //(D.f[DIR_MM0])[ksw  ] = fSW;  //f1_SW;   //fNE;   //fSW;
+      //(D.f[DIR_PM0])[kse  ] = fSE;  //f1_SE;   //fNW;   //fSE;
+      //(D.f[DIR_MP0])[knw  ] = fNW;  //f1_NW;   //fSE;   //fNW;
+      //(D.f[DIR_P0P])[kte  ] = fTE;  //f1_TE;   //fBW;   //fTE;
+      //(D.f[DIR_M0M])[kbw  ] = fBW;  //f1_BW;   //fTE;   //fBW;
+      //(D.f[DIR_P0M])[kbe  ] = fBE;  //f1_BE;   //fTW;   //fBE;
+      //(D.f[DIR_M0P])[ktw  ] = fTW;  //f1_TW;   //fBE;   //fTW;
+      //(D.f[DIR_0PP])[ktn  ] = fTN;  //f1_TN;   //fBS;   //fTN;
+      //(D.f[DIR_0MM])[kbs  ] = fBS;  //f1_BS;   //fTN;   //fBS;
+      //(D.f[DIR_0PM])[kbn  ] = fBN;  //f1_BN;   //fTS;   //fBN;
+      //(D.f[DIR_0MP])[kts  ] = fTS;  //f1_TS;   //fBN;   //fTS;
       //(D.f[DIR_000])[kzero] = fZERO;//f1_ZERO; //fZERO; //fZERO;
-      //(D.f[DIR_PPP ])[ktne ] = fTNE; //f1_TNE;  //fBSW;  //fTNE; 
-      //(D.f[DIR_MMM ])[kbsw ] = fBSW; //f1_BSW;  //fTNE;  //fBSW; 
-      //(D.f[DIR_PPM ])[kbne ] = fBNE; //f1_BNE;  //fTSW;  //fBNE; 
-      //(D.f[DIR_MMP ])[ktsw ] = fTSW; //f1_TSW;  //fBNE;  //fTSW; 
-      //(D.f[DIR_PMP ])[ktse ] = fTSE; //f1_TSE;  //fBNW;  //fTSE; 
-      //(D.f[DIR_MPM ])[kbnw ] = fBNW; //f1_BNW;  //fTSE;  //fBNW; 
-      //(D.f[DIR_PMM ])[kbse ] = fBSE; //f1_BSE;  //fTNW;  //fBSE; 
-      //(D.f[DIR_MPP ])[ktnw ] = fTNW; //f1_TNW;  //fBSE;  //fTNW; 
+      //(D.f[DIR_PPP])[ktne ] = fTNE; //f1_TNE;  //fBSW;  //fTNE;
+      //(D.f[DIR_MMM])[kbsw ] = fBSW; //f1_BSW;  //fTNE;  //fBSW;
+      //(D.f[DIR_PPM])[kbne ] = fBNE; //f1_BNE;  //fTSW;  //fBNE;
+      //(D.f[DIR_MMP])[ktsw ] = fTSW; //f1_TSW;  //fBNE;  //fTSW;
+      //(D.f[DIR_PMP])[ktse ] = fTSE; //f1_TSE;  //fBNW;  //fTSE;
+      //(D.f[DIR_MPM])[kbnw ] = fBNW; //f1_BNW;  //fTSE;  //fBNW;
+      //(D.f[DIR_PMM])[kbse ] = fBSE; //f1_BSE;  //fTNW;  //fBSE;
+      //(D.f[DIR_MPP])[ktnw ] = fTNW; //f1_TNW;  //fBSE;  //fTNW;
    }
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -4354,19 +4369,20 @@ __global__ void QPressDeviceEQZ27(real* rhoBC,
 
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-__global__ void QPressDeviceZero27(	 real* DD, 
-												 int* k_Q, 
-												 unsigned int numberOfBCnodes, 
-												 unsigned int* neighborX,
-												 unsigned int* neighborY,
-												 unsigned int* neighborZ,
-												 unsigned int size_Mat, 
-												 bool isEvenTimestep)
+__global__ void QPressDeviceZero27(
+    real* DD,
+    int* k_Q,
+    unsigned int numberOfBCnodes,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
 {
    ////////////////////////////////////////////////////////////////////////////////
-   const unsigned  x = threadIdx.x;  // Globaler x-Index 
-   const unsigned  y = blockIdx.x;   // Globaler y-Index 
-   const unsigned  z = blockIdx.y;   // Globaler z-Index 
+   const unsigned  x = threadIdx.x;  // Globaler x-Index
+   const unsigned  y = blockIdx.x;   // Globaler y-Index
+   const unsigned  z = blockIdx.y;   // Globaler z-Index
 
    const unsigned nx = blockDim.x;
    const unsigned ny = gridDim.x;
@@ -4410,94 +4426,94 @@ __global__ void QPressDeviceZero27(	 real* DD,
       Distributions27 D;
       if (isEvenTimestep==false)
       {
-         D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
-      } 
+         D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
+      }
       else
       {
-         D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+         D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
       }
       //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       //__syncthreads();
-	  //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-      (D.f[DIR_P00   ])[ke   ] =c0o1;
-      (D.f[DIR_M00   ])[kw   ] =c0o1;
-      (D.f[DIR_0P0   ])[kn   ] =c0o1;
-      (D.f[DIR_0M0   ])[ks   ] =c0o1;
-      (D.f[DIR_00P   ])[kt   ] =c0o1;
-      (D.f[DIR_00M   ])[kb   ] =c0o1;
-      (D.f[DIR_PP0  ])[kne  ] =c0o1;
-      (D.f[DIR_MM0  ])[ksw  ] =c0o1;
-      (D.f[DIR_PM0  ])[kse  ] =c0o1;
-      (D.f[DIR_MP0  ])[knw  ] =c0o1;
-      (D.f[DIR_P0P  ])[kte  ] =c0o1;
-      (D.f[DIR_M0M  ])[kbw  ] =c0o1;
-      (D.f[DIR_P0M  ])[kbe  ] =c0o1;
-      (D.f[DIR_M0P  ])[ktw  ] =c0o1;
-      (D.f[DIR_0PP  ])[ktn  ] =c0o1;
-      (D.f[DIR_0MM  ])[kbs  ] =c0o1;
-      (D.f[DIR_0PM  ])[kbn  ] =c0o1;
-      (D.f[DIR_0MP  ])[kts  ] =c0o1;
+     //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      (D.f[DIR_P00])[ke   ] =c0o1;
+      (D.f[DIR_M00])[kw   ] =c0o1;
+      (D.f[DIR_0P0])[kn   ] =c0o1;
+      (D.f[DIR_0M0])[ks   ] =c0o1;
+      (D.f[DIR_00P])[kt   ] =c0o1;
+      (D.f[DIR_00M])[kb   ] =c0o1;
+      (D.f[DIR_PP0])[kne  ] =c0o1;
+      (D.f[DIR_MM0])[ksw  ] =c0o1;
+      (D.f[DIR_PM0])[kse  ] =c0o1;
+      (D.f[DIR_MP0])[knw  ] =c0o1;
+      (D.f[DIR_P0P])[kte  ] =c0o1;
+      (D.f[DIR_M0M])[kbw  ] =c0o1;
+      (D.f[DIR_P0M])[kbe  ] =c0o1;
+      (D.f[DIR_M0P])[ktw  ] =c0o1;
+      (D.f[DIR_0PP])[ktn  ] =c0o1;
+      (D.f[DIR_0MM])[kbs  ] =c0o1;
+      (D.f[DIR_0PM])[kbn  ] =c0o1;
+      (D.f[DIR_0MP])[kts  ] =c0o1;
       (D.f[DIR_000])[kzero] =c0o1;
-      (D.f[DIR_PPP ])[ktne ] =c0o1;
-      (D.f[DIR_MMP ])[ktsw ] =c0o1;
-      (D.f[DIR_PMP ])[ktse ] =c0o1;
-      (D.f[DIR_MPP ])[ktnw ] =c0o1;
-      (D.f[DIR_PPM ])[kbne ] =c0o1;
-      (D.f[DIR_MMM ])[kbsw ] =c0o1;
-      (D.f[DIR_PMM ])[kbse ] =c0o1;
-      (D.f[DIR_MPM ])[kbnw ] =c0o1;
+      (D.f[DIR_PPP])[ktne ] =c0o1;
+      (D.f[DIR_MMP])[ktsw ] =c0o1;
+      (D.f[DIR_PMP])[ktse ] =c0o1;
+      (D.f[DIR_MPP])[ktnw ] =c0o1;
+      (D.f[DIR_PPM])[kbne ] =c0o1;
+      (D.f[DIR_MMM])[kbsw ] =c0o1;
+      (D.f[DIR_PMM])[kbse ] =c0o1;
+      (D.f[DIR_MPM])[kbnw ] =c0o1;
    }
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -4541,22 +4557,23 @@ __global__ void QPressDeviceZero27(	 real* DD,
 
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-__global__ void QPressDeviceFake27(	 real* rhoBC,
-												 real* DD, 
-												 int* k_Q, 
-												 int* k_N, 
-												 int numberOfBCnodes, 
-												 real om1, 
-												 unsigned int* neighborX,
-												 unsigned int* neighborY,
-												 unsigned int* neighborZ,
-												 unsigned int size_Mat, 
-												 bool isEvenTimestep)
+__global__ void QPressDeviceFake27(
+    real* rhoBC,
+    real* DD,
+    int* k_Q,
+    int* k_N,
+    int numberOfBCnodes,
+    real om1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
 {
    ////////////////////////////////////////////////////////////////////////////////
-   const unsigned  x = threadIdx.x;  // Globaler x-Index 
-   const unsigned  y = blockIdx.x;   // Globaler y-Index 
-   const unsigned  z = blockIdx.y;   // Globaler z-Index 
+   const unsigned  x = threadIdx.x;  // Globaler x-Index
+   const unsigned  y = blockIdx.x;   // Globaler y-Index
+   const unsigned  z = blockIdx.y;   // Globaler z-Index
 
    const unsigned nx = blockDim.x;
    const unsigned ny = gridDim.x;
@@ -4630,148 +4647,148 @@ __global__ void QPressDeviceFake27(	 real* rhoBC,
       Distributions27 D;
       if (isEvenTimestep==false)
       {
-         D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
-      } 
+         D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
+      }
       else
       {
-         D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+         D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
       }
       //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       real        f1_E,f1_W,f1_N,f1_S,f1_T,f1_B,f1_NE,f1_SW,f1_SE,f1_NW,f1_TE,f1_BW,f1_BE,f1_TW,f1_TN,f1_BS,f1_BN,f1_TS,f1_ZERO,
          f1_TNE,f1_TSW,f1_TSE,f1_TNW,f1_BNE,f1_BSW,f1_BSE,f1_BNW;
 
-      f1_W    = (D.f[DIR_P00   ])[k1e   ];
-      f1_E    = (D.f[DIR_M00   ])[k1w   ];
-      f1_S    = (D.f[DIR_0P0   ])[k1n   ];
-      f1_N    = (D.f[DIR_0M0   ])[k1s   ];
-      f1_B    = (D.f[DIR_00P   ])[k1t   ];
-      f1_T    = (D.f[DIR_00M   ])[k1b   ];
-      f1_SW   = (D.f[DIR_PP0  ])[k1ne  ];
-      f1_NE   = (D.f[DIR_MM0  ])[k1sw  ];
-      f1_NW   = (D.f[DIR_PM0  ])[k1se  ];
-      f1_SE   = (D.f[DIR_MP0  ])[k1nw  ];
-      f1_BW   = (D.f[DIR_P0P  ])[k1te  ];
-      f1_TE   = (D.f[DIR_M0M  ])[k1bw  ];
-      f1_TW   = (D.f[DIR_P0M  ])[k1be  ];
-      f1_BE   = (D.f[DIR_M0P  ])[k1tw  ];
-      f1_BS   = (D.f[DIR_0PP  ])[k1tn  ];
-      f1_TN   = (D.f[DIR_0MM  ])[k1bs  ];
-      f1_TS   = (D.f[DIR_0PM  ])[k1bn  ];
-      f1_BN   = (D.f[DIR_0MP  ])[k1ts  ];
+      f1_W    = (D.f[DIR_P00])[k1e   ];
+      f1_E    = (D.f[DIR_M00])[k1w   ];
+      f1_S    = (D.f[DIR_0P0])[k1n   ];
+      f1_N    = (D.f[DIR_0M0])[k1s   ];
+      f1_B    = (D.f[DIR_00P])[k1t   ];
+      f1_T    = (D.f[DIR_00M])[k1b   ];
+      f1_SW   = (D.f[DIR_PP0])[k1ne  ];
+      f1_NE   = (D.f[DIR_MM0])[k1sw  ];
+      f1_NW   = (D.f[DIR_PM0])[k1se  ];
+      f1_SE   = (D.f[DIR_MP0])[k1nw  ];
+      f1_BW   = (D.f[DIR_P0P])[k1te  ];
+      f1_TE   = (D.f[DIR_M0M])[k1bw  ];
+      f1_TW   = (D.f[DIR_P0M])[k1be  ];
+      f1_BE   = (D.f[DIR_M0P])[k1tw  ];
+      f1_BS   = (D.f[DIR_0PP])[k1tn  ];
+      f1_TN   = (D.f[DIR_0MM])[k1bs  ];
+      f1_TS   = (D.f[DIR_0PM])[k1bn  ];
+      f1_BN   = (D.f[DIR_0MP])[k1ts  ];
       f1_ZERO = (D.f[DIR_000])[k1zero];
-      f1_BSW  = (D.f[DIR_PPP ])[k1tne ];
-      f1_BNE  = (D.f[DIR_MMP ])[k1tsw ];
-      f1_BNW  = (D.f[DIR_PMP ])[k1tse ];
-      f1_BSE  = (D.f[DIR_MPP ])[k1tnw ];
-      f1_TSW  = (D.f[DIR_PPM ])[k1bne ];
-      f1_TNE  = (D.f[DIR_MMM ])[k1bsw ];
-      f1_TNW  = (D.f[DIR_PMM ])[k1bse ];
-      f1_TSE  = (D.f[DIR_MPM ])[k1bnw ];
+      f1_BSW  = (D.f[DIR_PPP])[k1tne ];
+      f1_BNE  = (D.f[DIR_MMP])[k1tsw ];
+      f1_BNW  = (D.f[DIR_PMP])[k1tse ];
+      f1_BSE  = (D.f[DIR_MPP])[k1tnw ];
+      f1_TSW  = (D.f[DIR_PPM])[k1bne ];
+      f1_TNE  = (D.f[DIR_MMM])[k1bsw ];
+      f1_TNW  = (D.f[DIR_PMM])[k1bse ];
+      f1_TSE  = (D.f[DIR_MPM])[k1bnw ];
 
       ////////////////////////////////////////////////////////////////////////////////
       real vx1, vx2, vx3;
       vx1    =  ((f1_TSE - f1_BNW) - (f1_TNW - f1_BSE)) + ((f1_TNE - f1_BSW) - (f1_TSW - f1_BNE)) +
                   ((f1_BE - f1_TW)   + (f1_TE - f1_BW))   + ((f1_SE - f1_NW)   + (f1_NE - f1_SW)) +
-                  (f1_E - f1_W); 
+                  (f1_E - f1_W);
 
 
       vx2    =   (-(f1_TSE - f1_BNW) + (f1_TNW - f1_BSE)) + ((f1_TNE - f1_BSW) - (f1_TSW - f1_BNE)) +
                   ((f1_BN - f1_TS)   + (f1_TN - f1_BS))    + (-(f1_SE - f1_NW)  + (f1_NE - f1_SW)) +
-                  (f1_N - f1_S); 
+                  (f1_N - f1_S);
 
       vx3    =   ((f1_TSE - f1_BNW) + (f1_TNW - f1_BSE)) + ((f1_TNE - f1_BSW) + (f1_TSW - f1_BNE)) +
                   (-(f1_BN - f1_TS)  + (f1_TN - f1_BS))   + ((f1_TE - f1_BW)   - (f1_BE - f1_TW)) +
-                  (f1_T - f1_B); 
+                  (f1_T - f1_B);
 
       real cu_sq=c3o2*(vx1*vx1+vx2*vx2+vx3*vx3);
       //////////////////////////////////////////////////////////////////////////
       real drho1    =  f1_ZERO+f1_E+f1_W+f1_N+f1_S+f1_T+f1_B+f1_NE+f1_SW+f1_SE+f1_NW+f1_TE+f1_BW+f1_BE+f1_TW+f1_TN+f1_BS+f1_BN+f1_TS+
          f1_TNE+f1_TSW+f1_TSE+f1_TNW+f1_BNE+f1_BSW+f1_BSE+f1_BNW;
 
-	  //drho1 = (drho1 + rhoBC[k])/2.f;
-	  drho1 = drho1 - rhoBC[k];
+     //drho1 = (drho1 + rhoBC[k])/2.f;
+     drho1 = drho1 - rhoBC[k];
 
       __syncthreads();
 
-      (D.f[DIR_P00   ])[ke   ] = c2o27* (rhoBC[k]+c3o1*(-vx1        )+c9o2*(-vx1        )*(-vx1        )-cu_sq);
-      (D.f[DIR_M00   ])[kw   ] = c2o27* (rhoBC[k]+c3o1*( vx1        )+c9o2*( vx1        )*( vx1        )-cu_sq);
-      (D.f[DIR_0P0   ])[kn   ] = c2o27* (rhoBC[k]+c3o1*(    -vx2    )+c9o2*(    -vx2    )*(    -vx2    )-cu_sq);
-      (D.f[DIR_0M0   ])[ks   ] = c2o27* (rhoBC[k]+c3o1*(     vx2    )+c9o2*(     vx2    )*(     vx2    )-cu_sq);
-      (D.f[DIR_00P   ])[kt   ] = c2o27* (rhoBC[k]+c3o1*(        -vx3)+c9o2*(        -vx3)*(        -vx3)-cu_sq);
-      (D.f[DIR_00M   ])[kb   ] = c2o27* (rhoBC[k]+c3o1*(         vx3)+c9o2*(         vx3)*(         vx3)-cu_sq);
-      (D.f[DIR_PP0  ])[kne  ] = f1_SW  -c1o54*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_MM0  ])[ksw  ] = f1_NE  -c1o54*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_PM0  ])[kse  ] = f1_NW  -c1o54*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_MP0  ])[knw  ] = f1_SE  -c1o54*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_P0P  ])[kte  ] = f1_BW  -c1o54*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_M0M  ])[kbw  ] = f1_TE  -c1o54*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_P0M  ])[kbe  ] = f1_TW  -c1o54*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_M0P  ])[ktw  ] = f1_BE  -c1o54*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_0PP  ])[ktn  ] = f1_BS  -c1o54*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_0MM  ])[kbs  ] = f1_TN  -c1o54*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_0PM  ])[kbn  ] = f1_TS  -c1o54*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_0MP  ])[kts  ] = f1_BN  -c1o54*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_P00])[ke   ] = c2o27* (rhoBC[k]+c3o1*(-vx1        )+c9o2*(-vx1        )*(-vx1        )-cu_sq);
+      (D.f[DIR_M00])[kw   ] = c2o27* (rhoBC[k]+c3o1*( vx1        )+c9o2*( vx1        )*( vx1        )-cu_sq);
+      (D.f[DIR_0P0])[kn   ] = c2o27* (rhoBC[k]+c3o1*(    -vx2    )+c9o2*(    -vx2    )*(    -vx2    )-cu_sq);
+      (D.f[DIR_0M0])[ks   ] = c2o27* (rhoBC[k]+c3o1*(     vx2    )+c9o2*(     vx2    )*(     vx2    )-cu_sq);
+      (D.f[DIR_00P])[kt   ] = c2o27* (rhoBC[k]+c3o1*(        -vx3)+c9o2*(        -vx3)*(        -vx3)-cu_sq);
+      (D.f[DIR_00M])[kb   ] = c2o27* (rhoBC[k]+c3o1*(         vx3)+c9o2*(         vx3)*(         vx3)-cu_sq);
+      (D.f[DIR_PP0])[kne  ] = f1_SW  -c1o54*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_MM0])[ksw  ] = f1_NE  -c1o54*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_PM0])[kse  ] = f1_NW  -c1o54*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_MP0])[knw  ] = f1_SE  -c1o54*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_P0P])[kte  ] = f1_BW  -c1o54*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_M0M])[kbw  ] = f1_TE  -c1o54*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_P0M])[kbe  ] = f1_TW  -c1o54*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_M0P])[ktw  ] = f1_BE  -c1o54*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_0PP])[ktn  ] = f1_BS  -c1o54*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_0MM])[kbs  ] = f1_TN  -c1o54*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_0PM])[kbn  ] = f1_TS  -c1o54*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_0MP])[kts  ] = f1_BN  -c1o54*drho1;	//  c1o100;  // zero;  //
       (D.f[DIR_000])[kzero] = f1_ZERO-c8o27*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_PPP ])[ktne ] = f1_BSW -c1o216*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_MMP ])[ktsw ] = f1_BNE -c1o216*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_PMP ])[ktse ] = f1_BNW -c1o216*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_MPP ])[ktnw ] = f1_BSE -c1o216*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_PPM ])[kbne ] = f1_TSW -c1o216*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_MMM ])[kbsw ] = f1_TNE -c1o216*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_PMM ])[kbse ] = f1_TNW -c1o216*drho1;	//  c1o100;  // zero;  //
-      (D.f[DIR_MPM ])[kbnw ] = f1_TSE -c1o216*drho1;  //  c1o100;  // zero;  //      
+      (D.f[DIR_PPP])[ktne ] = f1_BSW -c1o216*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_MMP])[ktsw ] = f1_BNE -c1o216*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_PMP])[ktse ] = f1_BNW -c1o216*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_MPP])[ktnw ] = f1_BSE -c1o216*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_PPM])[kbne ] = f1_TSW -c1o216*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_MMM])[kbsw ] = f1_TNE -c1o216*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_PMM])[kbse ] = f1_TNW -c1o216*drho1;	//  c1o100;  // zero;  //
+      (D.f[DIR_MPM])[kbnw ] = f1_TSE -c1o216*drho1;  //  c1o100;  // zero;  //
    }
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -4815,461 +4832,462 @@ __global__ void QPressDeviceFake27(	 real* rhoBC,
 
 
 //////////////////////////////////////////////////////////////////////////
-__global__ void QPressDevice27_IntBB(real* rho,
-												real* DD, 
-												int* k_Q, 
-												real* QQ,
-												unsigned int numberOfBCnodes, 
-												real om1, 
-												unsigned int* neighborX,
-												unsigned int* neighborY,
-												unsigned int* neighborZ,
-												unsigned int size_Mat, 
-												bool isEvenTimestep)
+__global__ void QPressDevice27_IntBB(
+    real* rho,
+    real* DD,
+    int* k_Q,
+    real* QQ,
+    unsigned int numberOfBCnodes,
+    real om1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
 {
-	Distributions27 D;
-	if (isEvenTimestep==true)
-	{
-		D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-		D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-		D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-		D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-		D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-		D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-		D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-		D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-		D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-		D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-		D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-		D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-		D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-		D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-		D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-		D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-		D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-		D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-		D.f[DIR_000] = &DD[DIR_000*size_Mat];
-		D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-		D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-		D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-		D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-		D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-		D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-		D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-		D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
-	} 
-	else
-	{
-		D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-		D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-		D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-		D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-		D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-		D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-		D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-		D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-		D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-		D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-		D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-		D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-		D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-		D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-		D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-		D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-		D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-		D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-		D.f[DIR_000] = &DD[DIR_000*size_Mat];
-		D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-		D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-		D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-		D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-		D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-		D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-		D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-		D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
-	}
-	////////////////////////////////////////////////////////////////////////////////
-	const unsigned  x = threadIdx.x;  // Globaler x-Index 
-	const unsigned  y = blockIdx.x;   // Globaler y-Index 
-	const unsigned  z = blockIdx.y;   // Globaler z-Index 
-
-	const unsigned nx = blockDim.x;
-	const unsigned ny = gridDim.x;
-
-	const unsigned k = nx*(ny*z + y) + x;
-	//////////////////////////////////////////////////////////////////////////
-
-	if(k < numberOfBCnodes)
-	{
-		////////////////////////////////////////////////////////////////////////////////
-		//real VeloX = vx[k];
-		//real VeloY = vy[k];
-		//real VeloZ = vz[k]; //(16.0*(u0*2.0)*bbx*bby*(grid_nx-bbx)*(grid_ny-bby))/(grid_nx*grid_nx*grid_ny*grid_ny)
-		////////////////////////////////////////////////////////////////////////////////
-		real *q_dirE,   *q_dirW,   *q_dirN,   *q_dirS,   *q_dirT,   *q_dirB, 
-			*q_dirNE,  *q_dirSW,  *q_dirSE,  *q_dirNW,  *q_dirTE,  *q_dirBW,
-			*q_dirBE,  *q_dirTW,  *q_dirTN,  *q_dirBS,  *q_dirBN,  *q_dirTS,
-			*q_dirTNE, *q_dirTSW, *q_dirTSE, *q_dirTNW, *q_dirBNE, *q_dirBSW,
-			*q_dirBSE, *q_dirBNW; 
-		q_dirE   = &QQ[DIR_P00   * numberOfBCnodes];
-		q_dirW   = &QQ[DIR_M00   * numberOfBCnodes];
-		q_dirN   = &QQ[DIR_0P0   * numberOfBCnodes];
-		q_dirS   = &QQ[DIR_0M0   * numberOfBCnodes];
-		q_dirT   = &QQ[DIR_00P   * numberOfBCnodes];
-		q_dirB   = &QQ[DIR_00M   * numberOfBCnodes];
-		q_dirNE  = &QQ[DIR_PP0  * numberOfBCnodes];
-		q_dirSW  = &QQ[DIR_MM0  * numberOfBCnodes];
-		q_dirSE  = &QQ[DIR_PM0  * numberOfBCnodes];
-		q_dirNW  = &QQ[DIR_MP0  * numberOfBCnodes];
-		q_dirTE  = &QQ[DIR_P0P  * numberOfBCnodes];
-		q_dirBW  = &QQ[DIR_M0M  * numberOfBCnodes];
-		q_dirBE  = &QQ[DIR_P0M  * numberOfBCnodes];
-		q_dirTW  = &QQ[DIR_M0P  * numberOfBCnodes];
-		q_dirTN  = &QQ[DIR_0PP  * numberOfBCnodes];
-		q_dirBS  = &QQ[DIR_0MM  * numberOfBCnodes];
-		q_dirBN  = &QQ[DIR_0PM  * numberOfBCnodes];
-		q_dirTS  = &QQ[DIR_0MP  * numberOfBCnodes];
-		q_dirTNE = &QQ[DIR_PPP * numberOfBCnodes];
-		q_dirTSW = &QQ[DIR_MMP * numberOfBCnodes];
-		q_dirTSE = &QQ[DIR_PMP * numberOfBCnodes];
-		q_dirTNW = &QQ[DIR_MPP * numberOfBCnodes];
-		q_dirBNE = &QQ[DIR_PPM * numberOfBCnodes];
-		q_dirBSW = &QQ[DIR_MMM * numberOfBCnodes];
-		q_dirBSE = &QQ[DIR_PMM * numberOfBCnodes];
-		q_dirBNW = &QQ[DIR_MPM * numberOfBCnodes];
-		////////////////////////////////////////////////////////////////////////////////
-		//index
-		unsigned int KQK  = k_Q[k];
-		unsigned int kzero= KQK;
-		unsigned int ke   = KQK;
-		unsigned int kw   = neighborX[KQK];
-		unsigned int kn   = KQK;
-		unsigned int ks   = neighborY[KQK];
-		unsigned int kt   = KQK;
-		unsigned int kb   = neighborZ[KQK];
-		unsigned int ksw  = neighborY[kw];
-		unsigned int kne  = KQK;
-		unsigned int kse  = ks;
-		unsigned int knw  = kw;
-		unsigned int kbw  = neighborZ[kw];
-		unsigned int kte  = KQK;
-		unsigned int kbe  = kb;
-		unsigned int ktw  = kw;
-		unsigned int kbs  = neighborZ[ks];
-		unsigned int ktn  = KQK;
-		unsigned int kbn  = kb;
-		unsigned int kts  = ks;
-		unsigned int ktse = ks;
-		unsigned int kbnw = kbw;
-		unsigned int ktnw = kw;
-		unsigned int kbse = kbs;
-		unsigned int ktsw = ksw;
-		unsigned int kbne = kb;
-		unsigned int ktne = KQK;
-		unsigned int kbsw = neighborZ[ksw];
-		////////////////////////////////////////////////////////////////////////////////
-		real f_E,  f_W,  f_N,  f_S,  f_T,  f_B,   f_NE,  f_SW,  f_SE,  f_NW,  f_TE,  f_BW,  f_BE,
-			f_TW, f_TN, f_BS, f_BN, f_TS, f_TNE, f_TSW, f_TSE, f_TNW, f_BNE, f_BSW, f_BSE, f_BNW;
-
-		f_W    = (D.f[DIR_P00   ])[ke   ];
-		f_E    = (D.f[DIR_M00   ])[kw   ];
-		f_S    = (D.f[DIR_0P0   ])[kn   ];
-		f_N    = (D.f[DIR_0M0   ])[ks   ];
-		f_B    = (D.f[DIR_00P   ])[kt   ];
-		f_T    = (D.f[DIR_00M   ])[kb   ];
-		f_SW   = (D.f[DIR_PP0  ])[kne  ];
-		f_NE   = (D.f[DIR_MM0  ])[ksw  ];
-		f_NW   = (D.f[DIR_PM0  ])[kse  ];
-		f_SE   = (D.f[DIR_MP0  ])[knw  ];
-		f_BW   = (D.f[DIR_P0P  ])[kte  ];
-		f_TE   = (D.f[DIR_M0M  ])[kbw  ];
-		f_TW   = (D.f[DIR_P0M  ])[kbe  ];
-		f_BE   = (D.f[DIR_M0P  ])[ktw  ];
-		f_BS   = (D.f[DIR_0PP  ])[ktn  ];
-		f_TN   = (D.f[DIR_0MM  ])[kbs  ];
-		f_TS   = (D.f[DIR_0PM  ])[kbn  ];
-		f_BN   = (D.f[DIR_0MP  ])[kts  ];
-		f_BSW  = (D.f[DIR_PPP ])[ktne ];
-		f_BNE  = (D.f[DIR_MMP ])[ktsw ];
-		f_BNW  = (D.f[DIR_PMP ])[ktse ];
-		f_BSE  = (D.f[DIR_MPP ])[ktnw ];
-		f_TSW  = (D.f[DIR_PPM ])[kbne ];
-		f_TNE  = (D.f[DIR_MMM ])[kbsw ];
-		f_TNW  = (D.f[DIR_PMM ])[kbse ];
-		f_TSE  = (D.f[DIR_MPM ])[kbnw ];
-		////////////////////////////////////////////////////////////////////////////////
-		real vx1, vx2, vx3, drho, feq, q;
-		drho   =  f_TSE + f_TNW + f_TNE + f_TSW + f_BSE + f_BNW + f_BNE + f_BSW +
-			f_BN + f_TS + f_TN + f_BS + f_BE + f_TW + f_TE + f_BW + f_SE + f_NW + f_NE + f_SW + 
-			f_T + f_B + f_N + f_S + f_E + f_W + ((D.f[DIR_000])[kzero]); 
-
-		vx1    = (((f_TSE - f_BNW) - (f_TNW - f_BSE)) + ((f_TNE - f_BSW) - (f_TSW - f_BNE)) +
-			((f_BE - f_TW)   + (f_TE - f_BW))   + ((f_SE - f_NW)   + (f_NE - f_SW)) +
-			(f_E - f_W))/(c1o1+drho); 
-
-
-		vx2    =  ((-(f_TSE - f_BNW) + (f_TNW - f_BSE)) + ((f_TNE - f_BSW) - (f_TSW - f_BNE)) +
-			((f_BN - f_TS)   + (f_TN - f_BS))    + (-(f_SE - f_NW)  + (f_NE - f_SW)) +
-			(f_N - f_S))/(c1o1+drho); 
-
-		vx3    =  (((f_TSE - f_BNW) + (f_TNW - f_BSE)) + ((f_TNE - f_BSW) + (f_TSW - f_BNE)) +
-			(-(f_BN - f_TS)  + (f_TN - f_BS))   + ((f_TE - f_BW)   - (f_BE - f_TW)) +
-			(f_T - f_B))/(c1o1+drho); 
-
-		real cu_sq=c3o2*(vx1*vx1+vx2*vx2+vx3*vx3);
-
-		//////////////////////////////////////////////////////////////////////////
-		if (isEvenTimestep==false)
-		{
-			D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-			D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-			D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-			D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-			D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-			D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-			D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-			D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-			D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-			D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-			D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-			D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-			D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-			D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-			D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-			D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-			D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-			D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-			D.f[DIR_000] = &DD[DIR_000*size_Mat];
-			D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-			D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-			D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-			D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-			D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-			D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-			D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-			D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
-		} 
-		else
-		{
-			D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-			D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-			D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-			D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-			D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-			D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-			D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-			D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-			D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-			D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-			D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-			D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-			D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-			D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-			D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-			D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-			D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-			D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-			D.f[DIR_000] = &DD[DIR_000*size_Mat];
-			D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-			D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-			D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-			D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-			D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-			D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-			D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-			D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
-		}
-		////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-		//Test
-		//(D.f[DIR_000])[k]=c1o10;
-		real rhoDiff = drho - rho[k];
-		real VeloX = vx1;
-		real VeloY = vx2;
-		real VeloZ = vx3;
-		////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-		q = q_dirE[k];
-		if (q>=c0o1 && q<=c1o1)
-		{
-			feq=c2o27* (drho+c9o2*( vx1        )*( vx1        )-cu_sq); 
-			(D.f[DIR_M00])[kw]=(c1o1-q)/(c1o1+q)*(f_E-f_W+(f_E+f_W-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_E+f_W)-c2o27*(rhoDiff + c6o1*( VeloX     )))/(c1o1+q);
-		}
-
-		q = q_dirW[k];
-		if (q>=c0o1 && q<=c1o1)
-		{
-			feq=c2o27* (drho+c9o2*(-vx1        )*(-vx1        )-cu_sq); 
-			(D.f[DIR_P00])[ke]=(c1o1-q)/(c1o1+q)*(f_W-f_E+(f_W+f_E-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_W+f_E)-c2o27*(rhoDiff + c6o1*(-VeloX     )))/(c1o1+q);
-		}
-
-		q = q_dirN[k];
-		if (q>=c0o1 && q<=c1o1)
-		{
-			feq=c2o27* (drho+c9o2*(     vx2    )*(     vx2    )-cu_sq); 
-			(D.f[DIR_0M0])[ks]=(c1o1-q)/(c1o1+q)*(f_N-f_S+(f_N+f_S-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_N+f_S)-c2o27*(rhoDiff + c6o1*( VeloY     )))/(c1o1+q);
-		}
-
-		q = q_dirS[k];
-		if (q>=c0o1 && q<=c1o1)
-		{
-			feq=c2o27* (drho+c9o2*(    -vx2    )*(    -vx2    )-cu_sq); 
-			(D.f[DIR_0P0])[kn]=(c1o1-q)/(c1o1+q)*(f_S-f_N+(f_S+f_N-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_S+f_N)-c2o27*(rhoDiff + c6o1*(-VeloY     )))/(c1o1+q);
-		}
-
-		q = q_dirT[k];
-		if (q>=c0o1 && q<=c1o1)
-		{
-			feq=c2o27* (drho+c9o2*(         vx3)*(         vx3)-cu_sq); 
-			(D.f[DIR_00M])[kb]=(c1o1-q)/(c1o1+q)*(f_T-f_B+(f_T+f_B-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_T+f_B)-c2o27*(rhoDiff + c6o1*( VeloZ     )))/(c1o1+q);
-		}
-
-		q = q_dirB[k];
-		if (q>=c0o1 && q<=c1o1)
-		{
-			feq=c2o27* (drho+c9o2*(        -vx3)*(        -vx3)-cu_sq); 
-			(D.f[DIR_00P])[kt]=(c1o1-q)/(c1o1+q)*(f_B-f_T+(f_B+f_T-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_B+f_T)-c2o27*(rhoDiff + c6o1*(-VeloZ     )))/(c1o1+q);
-		}
-
-		q = q_dirNE[k];
-		if (q>=c0o1 && q<=c1o1)
-		{
-			feq=c1o54* (drho+c9o2*( vx1+vx2    )*( vx1+vx2    )-cu_sq); 
-			(D.f[DIR_MM0])[ksw]=(c1o1-q)/(c1o1+q)*(f_NE-f_SW+(f_NE+f_SW-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_NE+f_SW)-c1o54*(rhoDiff + c6o1*(VeloX+VeloY)))/(c1o1+q);
-		}
-
-		q = q_dirSW[k];
-		if (q>=c0o1 && q<=c1o1)
-		{
-			feq=c1o54* (drho+c9o2*(-vx1-vx2    )*(-vx1-vx2    )-cu_sq); 
-			(D.f[DIR_PP0])[kne]=(c1o1-q)/(c1o1+q)*(f_SW-f_NE+(f_SW+f_NE-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_SW+f_NE)-c1o54*(rhoDiff + c6o1*(-VeloX-VeloY)))/(c1o1+q);
-		}
-
-		q = q_dirSE[k];
-		if (q>=c0o1 && q<=c1o1)
-		{
-			feq=c1o54* (drho+c9o2*( vx1-vx2    )*( vx1-vx2    )-cu_sq); 
-			(D.f[DIR_MP0])[knw]=(c1o1-q)/(c1o1+q)*(f_SE-f_NW+(f_SE+f_NW-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_SE+f_NW)-c1o54*(rhoDiff + c6o1*( VeloX-VeloY)))/(c1o1+q);
-		}
-
-		q = q_dirNW[k];
-		if (q>=c0o1 && q<=c1o1)
-		{
-			feq=c1o54* (drho+c9o2*(-vx1+vx2    )*(-vx1+vx2    )-cu_sq); 
-			(D.f[DIR_PM0])[kse]=(c1o1-q)/(c1o1+q)*(f_NW-f_SE+(f_NW+f_SE-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_NW+f_SE)-c1o54*(rhoDiff + c6o1*(-VeloX+VeloY)))/(c1o1+q);
-		}
-
-		q = q_dirTE[k];
-		if (q>=c0o1 && q<=c1o1)
-		{
-			feq=c1o54* (drho+c9o2*( vx1    +vx3)*( vx1    +vx3)-cu_sq); 
-			(D.f[DIR_M0M])[kbw]=(c1o1-q)/(c1o1+q)*(f_TE-f_BW+(f_TE+f_BW-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_TE+f_BW)-c1o54*(rhoDiff + c6o1*( VeloX+VeloZ)))/(c1o1+q);
-		}
-
-		q = q_dirBW[k];
-		if (q>=c0o1 && q<=c1o1)
-		{
-			feq=c1o54* (drho+c9o2*(-vx1    -vx3)*(-vx1    -vx3)-cu_sq); 
-			(D.f[DIR_P0P])[kte]=(c1o1-q)/(c1o1+q)*(f_BW-f_TE+(f_BW+f_TE-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_BW+f_TE)-c1o54*(rhoDiff + c6o1*(-VeloX-VeloZ)))/(c1o1+q);
-		}
-
-		q = q_dirBE[k];
-		if (q>=c0o1 && q<=c1o1)
-		{
-			feq=c1o54* (drho+c9o2*( vx1    -vx3)*( vx1    -vx3)-cu_sq); 
-			(D.f[DIR_M0P])[ktw]=(c1o1-q)/(c1o1+q)*(f_BE-f_TW+(f_BE+f_TW-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_BE+f_TW)-c1o54*(rhoDiff + c6o1*( VeloX-VeloZ)))/(c1o1+q);
-		}
-
-		q = q_dirTW[k];
-		if (q>=c0o1 && q<=c1o1)
-		{
-			feq=c1o54* (drho+c9o2*(-vx1    +vx3)*(-vx1    +vx3)-cu_sq); 
-			(D.f[DIR_P0M])[kbe]=(c1o1-q)/(c1o1+q)*(f_TW-f_BE+(f_TW+f_BE-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_TW+f_BE)-c1o54*(rhoDiff + c6o1*(-VeloX+VeloZ)))/(c1o1+q);
-		}
-
-		q = q_dirTN[k];
-		if (q>=c0o1 && q<=c1o1)
-		{
-			feq=c1o54* (drho+c9o2*(     vx2+vx3)*(     vx2+vx3)-cu_sq); 
-			(D.f[DIR_0MM])[kbs]=(c1o1-q)/(c1o1+q)*(f_TN-f_BS+(f_TN+f_BS-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_TN+f_BS)-c1o54*(rhoDiff + c6o1*( VeloY+VeloZ)))/(c1o1+q);
-		}
-
-		q = q_dirBS[k];
-		if (q>=c0o1 && q<=c1o1)
-		{
-			feq=c1o54* (drho+c9o2*(    -vx2-vx3)*(    -vx2-vx3)-cu_sq); 
-			(D.f[DIR_0PP])[ktn]=(c1o1-q)/(c1o1+q)*(f_BS-f_TN+(f_BS+f_TN-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_BS+f_TN)-c1o54*(rhoDiff + c6o1*( -VeloY-VeloZ)))/(c1o1+q);
-		}
-
-		q = q_dirBN[k];
-		if (q>=c0o1 && q<=c1o1)
-		{
-			feq=c1o54* (drho+c9o2*(     vx2-vx3)*(     vx2-vx3)-cu_sq); 
-			(D.f[DIR_0MP])[kts]=(c1o1-q)/(c1o1+q)*(f_BN-f_TS+(f_BN+f_TS-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_BN+f_TS)-c1o54*(rhoDiff + c6o1*( VeloY-VeloZ)))/(c1o1+q);
-		}
-
-		q = q_dirTS[k];
-		if (q>=c0o1 && q<=c1o1)
-		{
-			feq=c1o54* (drho+c9o2*(    -vx2+vx3)*(    -vx2+vx3)-cu_sq); 
-			(D.f[DIR_0PM])[kbn]=(c1o1-q)/(c1o1+q)*(f_TS-f_BN+(f_TS+f_BN-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_TS+f_BN)-c1o54*(rhoDiff + c6o1*( -VeloY+VeloZ)))/(c1o1+q);
-		}
-
-		q = q_dirTNE[k];
-		if (q>=c0o1 && q<=c1o1)
-		{
-			feq=c1o216*(drho+c9o2*( vx1+vx2+vx3)*( vx1+vx2+vx3)-cu_sq); 
-			(D.f[DIR_MMM])[kbsw]=(c1o1-q)/(c1o1+q)*(f_TNE-f_BSW+(f_TNE+f_BSW-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_TNE+f_BSW)-c1o216*(rhoDiff + c6o1*( VeloX+VeloY+VeloZ)))/(c1o1+q);
-		}
-
-		q = q_dirBSW[k];
-		if (q>=c0o1 && q<=c1o1)
-		{
-			feq=c1o216*(drho+c9o2*(-vx1-vx2-vx3)*(-vx1-vx2-vx3)-cu_sq); 
-			(D.f[DIR_PPP])[ktne]=(c1o1-q)/(c1o1+q)*(f_BSW-f_TNE+(f_BSW+f_TNE-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_BSW+f_TNE)-c1o216*(rhoDiff + c6o1*(-VeloX-VeloY-VeloZ)))/(c1o1+q);
-		}
-
-		q = q_dirBNE[k];
-		if (q>=c0o1 && q<=c1o1)
-		{
-			feq=c1o216*(drho+c9o2*( vx1+vx2-vx3)*( vx1+vx2-vx3)-cu_sq); 
-			(D.f[DIR_MMP])[ktsw]=(c1o1-q)/(c1o1+q)*(f_BNE-f_TSW+(f_BNE+f_TSW-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_BNE+f_TSW)-c1o216*(rhoDiff + c6o1*( VeloX+VeloY-VeloZ)))/(c1o1+q);
-		}
-
-		q = q_dirTSW[k];
-		if (q>=c0o1 && q<=c1o1)
-		{
-			feq=c1o216*(drho+c9o2*(-vx1-vx2+vx3)*(-vx1-vx2+vx3)-cu_sq); 
-			(D.f[DIR_PPM])[kbne]=(c1o1-q)/(c1o1+q)*(f_TSW-f_BNE+(f_TSW+f_BNE-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_TSW+f_BNE)-c1o216*(rhoDiff + c6o1*(-VeloX-VeloY+VeloZ)))/(c1o1+q);
-		}
-
-		q = q_dirTSE[k];
-		if (q>=c0o1 && q<=c1o1)
-		{
-			feq=c1o216*(drho+c9o2*( vx1-vx2+vx3)*( vx1-vx2+vx3)-cu_sq); 
-			(D.f[DIR_MPM])[kbnw]=(c1o1-q)/(c1o1+q)*(f_TSE-f_BNW+(f_TSE+f_BNW-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_TSE+f_BNW)-c1o216*(rhoDiff + c6o1*( VeloX-VeloY+VeloZ)))/(c1o1+q);
-		}
-
-		q = q_dirBNW[k];
-		if (q>=c0o1 && q<=c1o1)
-		{
-			feq=c1o216*(drho+c9o2*(-vx1+vx2-vx3)*(-vx1+vx2-vx3)-cu_sq); 
-			(D.f[DIR_PMP])[ktse]=(c1o1-q)/(c1o1+q)*(f_BNW-f_TSE+(f_BNW+f_TSE-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_BNW+f_TSE)-c1o216*(rhoDiff + c6o1*(-VeloX+VeloY-VeloZ)))/(c1o1+q);
-		}
-
-		q = q_dirBSE[k];
-		if (q>=c0o1 && q<=c1o1)
-		{
-			feq=c1o216*(drho+c9o2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cu_sq); 
-			(D.f[DIR_MPP])[ktnw]=(c1o1-q)/(c1o1+q)*(f_BSE-f_TNW+(f_BSE+f_TNW-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_BSE+f_TNW)-c1o216*(rhoDiff + c6o1*( VeloX-VeloY-VeloZ)))/(c1o1+q);
-		}
-
-		q = q_dirTNW[k];
-		if (q>=c0o1 && q<=c1o1)
-		{
-			feq=c1o216*(drho+c9o2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cu_sq); 
-			(D.f[DIR_PMM])[kbse]=(c1o1-q)/(c1o1+q)*(f_TNW-f_BSE+(f_TNW+f_BSE-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_TNW+f_BSE)-c1o216*(rhoDiff + c6o1*(-VeloX+VeloY+VeloZ)))/(c1o1+q);
-		}
-	}
+   Distributions27 D;
+   if (isEvenTimestep==true)
+   {
+      D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
+   }
+   else
+   {
+      D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
+   }
+   ////////////////////////////////////////////////////////////////////////////////
+   const unsigned  x = threadIdx.x;  // Globaler x-Index
+   const unsigned  y = blockIdx.x;   // Globaler y-Index
+   const unsigned  z = blockIdx.y;   // Globaler z-Index
+
+   const unsigned nx = blockDim.x;
+   const unsigned ny = gridDim.x;
+
+   const unsigned k = nx*(ny*z + y) + x;
+   //////////////////////////////////////////////////////////////////////////
+
+   if(k < numberOfBCnodes)
+   {
+      ////////////////////////////////////////////////////////////////////////////////
+      //real VeloX = vx[k];
+      //real VeloY = vy[k];
+      //real VeloZ = vz[k]; //(16.0*(u0*2.0)*bbx*bby*(grid_nx-bbx)*(grid_ny-bby))/(grid_nx*grid_nx*grid_ny*grid_ny)
+      ////////////////////////////////////////////////////////////////////////////////
+      real *q_dirE,   *q_dirW,   *q_dirN,   *q_dirS,   *q_dirT,   *q_dirB,
+         *q_dirNE,  *q_dirSW,  *q_dirSE,  *q_dirNW,  *q_dirTE,  *q_dirBW,
+         *q_dirBE,  *q_dirTW,  *q_dirTN,  *q_dirBS,  *q_dirBN,  *q_dirTS,
+         *q_dirTNE, *q_dirTSW, *q_dirTSE, *q_dirTNW, *q_dirBNE, *q_dirBSW,
+         *q_dirBSE, *q_dirBNW;
+      q_dirE   = &QQ[DIR_P00 * numberOfBCnodes];
+      q_dirW   = &QQ[DIR_M00 * numberOfBCnodes];
+      q_dirN   = &QQ[DIR_0P0 * numberOfBCnodes];
+      q_dirS   = &QQ[DIR_0M0 * numberOfBCnodes];
+      q_dirT   = &QQ[DIR_00P * numberOfBCnodes];
+      q_dirB   = &QQ[DIR_00M * numberOfBCnodes];
+      q_dirNE  = &QQ[DIR_PP0 * numberOfBCnodes];
+      q_dirSW  = &QQ[DIR_MM0 * numberOfBCnodes];
+      q_dirSE  = &QQ[DIR_PM0 * numberOfBCnodes];
+      q_dirNW  = &QQ[DIR_MP0 * numberOfBCnodes];
+      q_dirTE  = &QQ[DIR_P0P * numberOfBCnodes];
+      q_dirBW  = &QQ[DIR_M0M * numberOfBCnodes];
+      q_dirBE  = &QQ[DIR_P0M * numberOfBCnodes];
+      q_dirTW  = &QQ[DIR_M0P * numberOfBCnodes];
+      q_dirTN  = &QQ[DIR_0PP * numberOfBCnodes];
+      q_dirBS  = &QQ[DIR_0MM * numberOfBCnodes];
+      q_dirBN  = &QQ[DIR_0PM * numberOfBCnodes];
+      q_dirTS  = &QQ[DIR_0MP * numberOfBCnodes];
+      q_dirTNE = &QQ[DIR_PPP * numberOfBCnodes];
+      q_dirTSW = &QQ[DIR_MMP * numberOfBCnodes];
+      q_dirTSE = &QQ[DIR_PMP * numberOfBCnodes];
+      q_dirTNW = &QQ[DIR_MPP * numberOfBCnodes];
+      q_dirBNE = &QQ[DIR_PPM * numberOfBCnodes];
+      q_dirBSW = &QQ[DIR_MMM * numberOfBCnodes];
+      q_dirBSE = &QQ[DIR_PMM * numberOfBCnodes];
+      q_dirBNW = &QQ[DIR_MPM * numberOfBCnodes];
+      ////////////////////////////////////////////////////////////////////////////////
+      //index
+      unsigned int KQK  = k_Q[k];
+      unsigned int kzero= KQK;
+      unsigned int ke   = KQK;
+      unsigned int kw   = neighborX[KQK];
+      unsigned int kn   = KQK;
+      unsigned int ks   = neighborY[KQK];
+      unsigned int kt   = KQK;
+      unsigned int kb   = neighborZ[KQK];
+      unsigned int ksw  = neighborY[kw];
+      unsigned int kne  = KQK;
+      unsigned int kse  = ks;
+      unsigned int knw  = kw;
+      unsigned int kbw  = neighborZ[kw];
+      unsigned int kte  = KQK;
+      unsigned int kbe  = kb;
+      unsigned int ktw  = kw;
+      unsigned int kbs  = neighborZ[ks];
+      unsigned int ktn  = KQK;
+      unsigned int kbn  = kb;
+      unsigned int kts  = ks;
+      unsigned int ktse = ks;
+      unsigned int kbnw = kbw;
+      unsigned int ktnw = kw;
+      unsigned int kbse = kbs;
+      unsigned int ktsw = ksw;
+      unsigned int kbne = kb;
+      unsigned int ktne = KQK;
+      unsigned int kbsw = neighborZ[ksw];
+      ////////////////////////////////////////////////////////////////////////////////
+      real f_E,  f_W,  f_N,  f_S,  f_T,  f_B,   f_NE,  f_SW,  f_SE,  f_NW,  f_TE,  f_BW,  f_BE,
+         f_TW, f_TN, f_BS, f_BN, f_TS, f_TNE, f_TSW, f_TSE, f_TNW, f_BNE, f_BSW, f_BSE, f_BNW;
+
+      f_W    = (D.f[DIR_P00])[ke   ];
+      f_E    = (D.f[DIR_M00])[kw   ];
+      f_S    = (D.f[DIR_0P0])[kn   ];
+      f_N    = (D.f[DIR_0M0])[ks   ];
+      f_B    = (D.f[DIR_00P])[kt   ];
+      f_T    = (D.f[DIR_00M])[kb   ];
+      f_SW   = (D.f[DIR_PP0])[kne  ];
+      f_NE   = (D.f[DIR_MM0])[ksw  ];
+      f_NW   = (D.f[DIR_PM0])[kse  ];
+      f_SE   = (D.f[DIR_MP0])[knw  ];
+      f_BW   = (D.f[DIR_P0P])[kte  ];
+      f_TE   = (D.f[DIR_M0M])[kbw  ];
+      f_TW   = (D.f[DIR_P0M])[kbe  ];
+      f_BE   = (D.f[DIR_M0P])[ktw  ];
+      f_BS   = (D.f[DIR_0PP])[ktn  ];
+      f_TN   = (D.f[DIR_0MM])[kbs  ];
+      f_TS   = (D.f[DIR_0PM])[kbn  ];
+      f_BN   = (D.f[DIR_0MP])[kts  ];
+      f_BSW  = (D.f[DIR_PPP])[ktne ];
+      f_BNE  = (D.f[DIR_MMP])[ktsw ];
+      f_BNW  = (D.f[DIR_PMP])[ktse ];
+      f_BSE  = (D.f[DIR_MPP])[ktnw ];
+      f_TSW  = (D.f[DIR_PPM])[kbne ];
+      f_TNE  = (D.f[DIR_MMM])[kbsw ];
+      f_TNW  = (D.f[DIR_PMM])[kbse ];
+      f_TSE  = (D.f[DIR_MPM])[kbnw ];
+      ////////////////////////////////////////////////////////////////////////////////
+      real vx1, vx2, vx3, drho, feq, q;
+      drho   =  f_TSE + f_TNW + f_TNE + f_TSW + f_BSE + f_BNW + f_BNE + f_BSW +
+         f_BN + f_TS + f_TN + f_BS + f_BE + f_TW + f_TE + f_BW + f_SE + f_NW + f_NE + f_SW +
+         f_T + f_B + f_N + f_S + f_E + f_W + ((D.f[DIR_000])[kzero]);
+
+      vx1    = (((f_TSE - f_BNW) - (f_TNW - f_BSE)) + ((f_TNE - f_BSW) - (f_TSW - f_BNE)) +
+         ((f_BE - f_TW)   + (f_TE - f_BW))   + ((f_SE - f_NW)   + (f_NE - f_SW)) +
+         (f_E - f_W))/(c1o1+drho);
+
+
+      vx2    =  ((-(f_TSE - f_BNW) + (f_TNW - f_BSE)) + ((f_TNE - f_BSW) - (f_TSW - f_BNE)) +
+         ((f_BN - f_TS)   + (f_TN - f_BS))    + (-(f_SE - f_NW)  + (f_NE - f_SW)) +
+         (f_N - f_S))/(c1o1+drho);
+
+      vx3    =  (((f_TSE - f_BNW) + (f_TNW - f_BSE)) + ((f_TNE - f_BSW) + (f_TSW - f_BNE)) +
+         (-(f_BN - f_TS)  + (f_TN - f_BS))   + ((f_TE - f_BW)   - (f_BE - f_TW)) +
+         (f_T - f_B))/(c1o1+drho);
+
+      real cu_sq=c3o2*(vx1*vx1+vx2*vx2+vx3*vx3);
+
+      //////////////////////////////////////////////////////////////////////////
+      if (isEvenTimestep==false)
+      {
+         D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
+      }
+      else
+      {
+         D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
+      }
+      ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      //Test
+      //(D.f[DIR_000])[k]=c1o10;
+      real rhoDiff = drho - rho[k];
+      real VeloX = vx1;
+      real VeloY = vx2;
+      real VeloZ = vx3;
+      ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+      q = q_dirE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c2o27* (drho+c9o2*( vx1        )*( vx1        )-cu_sq);
+         (D.f[DIR_M00])[kw]=(c1o1-q)/(c1o1+q)*(f_E-f_W+(f_E+f_W-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_E+f_W)-c2o27*(rhoDiff + c6o1*( VeloX     )))/(c1o1+q);
+      }
+
+      q = q_dirW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c2o27* (drho+c9o2*(-vx1        )*(-vx1        )-cu_sq);
+         (D.f[DIR_P00])[ke]=(c1o1-q)/(c1o1+q)*(f_W-f_E+(f_W+f_E-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_W+f_E)-c2o27*(rhoDiff + c6o1*(-VeloX     )))/(c1o1+q);
+      }
+
+      q = q_dirN[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c2o27* (drho+c9o2*(     vx2    )*(     vx2    )-cu_sq);
+         (D.f[DIR_0M0])[ks]=(c1o1-q)/(c1o1+q)*(f_N-f_S+(f_N+f_S-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_N+f_S)-c2o27*(rhoDiff + c6o1*( VeloY     )))/(c1o1+q);
+      }
+
+      q = q_dirS[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c2o27* (drho+c9o2*(    -vx2    )*(    -vx2    )-cu_sq);
+         (D.f[DIR_0P0])[kn]=(c1o1-q)/(c1o1+q)*(f_S-f_N+(f_S+f_N-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_S+f_N)-c2o27*(rhoDiff + c6o1*(-VeloY     )))/(c1o1+q);
+      }
+
+      q = q_dirT[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c2o27* (drho+c9o2*(         vx3)*(         vx3)-cu_sq);
+         (D.f[DIR_00M])[kb]=(c1o1-q)/(c1o1+q)*(f_T-f_B+(f_T+f_B-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_T+f_B)-c2o27*(rhoDiff + c6o1*( VeloZ     )))/(c1o1+q);
+      }
+
+      q = q_dirB[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c2o27* (drho+c9o2*(        -vx3)*(        -vx3)-cu_sq);
+         (D.f[DIR_00P])[kt]=(c1o1-q)/(c1o1+q)*(f_B-f_T+(f_B+f_T-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_B+f_T)-c2o27*(rhoDiff + c6o1*(-VeloZ     )))/(c1o1+q);
+      }
+
+      q = q_dirNE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c1o54* (drho+c9o2*( vx1+vx2    )*( vx1+vx2    )-cu_sq);
+         (D.f[DIR_MM0])[ksw]=(c1o1-q)/(c1o1+q)*(f_NE-f_SW+(f_NE+f_SW-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_NE+f_SW)-c1o54*(rhoDiff + c6o1*(VeloX+VeloY)))/(c1o1+q);
+      }
+
+      q = q_dirSW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c1o54* (drho+c9o2*(-vx1-vx2    )*(-vx1-vx2    )-cu_sq);
+         (D.f[DIR_PP0])[kne]=(c1o1-q)/(c1o1+q)*(f_SW-f_NE+(f_SW+f_NE-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_SW+f_NE)-c1o54*(rhoDiff + c6o1*(-VeloX-VeloY)))/(c1o1+q);
+      }
+
+      q = q_dirSE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c1o54* (drho+c9o2*( vx1-vx2    )*( vx1-vx2    )-cu_sq);
+         (D.f[DIR_MP0])[knw]=(c1o1-q)/(c1o1+q)*(f_SE-f_NW+(f_SE+f_NW-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_SE+f_NW)-c1o54*(rhoDiff + c6o1*( VeloX-VeloY)))/(c1o1+q);
+      }
+
+      q = q_dirNW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c1o54* (drho+c9o2*(-vx1+vx2    )*(-vx1+vx2    )-cu_sq);
+         (D.f[DIR_PM0])[kse]=(c1o1-q)/(c1o1+q)*(f_NW-f_SE+(f_NW+f_SE-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_NW+f_SE)-c1o54*(rhoDiff + c6o1*(-VeloX+VeloY)))/(c1o1+q);
+      }
+
+      q = q_dirTE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c1o54* (drho+c9o2*( vx1    +vx3)*( vx1    +vx3)-cu_sq);
+         (D.f[DIR_M0M])[kbw]=(c1o1-q)/(c1o1+q)*(f_TE-f_BW+(f_TE+f_BW-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_TE+f_BW)-c1o54*(rhoDiff + c6o1*( VeloX+VeloZ)))/(c1o1+q);
+      }
+
+      q = q_dirBW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c1o54* (drho+c9o2*(-vx1    -vx3)*(-vx1    -vx3)-cu_sq);
+         (D.f[DIR_P0P])[kte]=(c1o1-q)/(c1o1+q)*(f_BW-f_TE+(f_BW+f_TE-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_BW+f_TE)-c1o54*(rhoDiff + c6o1*(-VeloX-VeloZ)))/(c1o1+q);
+      }
+
+      q = q_dirBE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c1o54* (drho+c9o2*( vx1    -vx3)*( vx1    -vx3)-cu_sq);
+         (D.f[DIR_M0P])[ktw]=(c1o1-q)/(c1o1+q)*(f_BE-f_TW+(f_BE+f_TW-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_BE+f_TW)-c1o54*(rhoDiff + c6o1*( VeloX-VeloZ)))/(c1o1+q);
+      }
+
+      q = q_dirTW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c1o54* (drho+c9o2*(-vx1    +vx3)*(-vx1    +vx3)-cu_sq);
+         (D.f[DIR_P0M])[kbe]=(c1o1-q)/(c1o1+q)*(f_TW-f_BE+(f_TW+f_BE-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_TW+f_BE)-c1o54*(rhoDiff + c6o1*(-VeloX+VeloZ)))/(c1o1+q);
+      }
+
+      q = q_dirTN[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c1o54* (drho+c9o2*(     vx2+vx3)*(     vx2+vx3)-cu_sq);
+         (D.f[DIR_0MM])[kbs]=(c1o1-q)/(c1o1+q)*(f_TN-f_BS+(f_TN+f_BS-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_TN+f_BS)-c1o54*(rhoDiff + c6o1*( VeloY+VeloZ)))/(c1o1+q);
+      }
+
+      q = q_dirBS[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c1o54* (drho+c9o2*(    -vx2-vx3)*(    -vx2-vx3)-cu_sq);
+         (D.f[DIR_0PP])[ktn]=(c1o1-q)/(c1o1+q)*(f_BS-f_TN+(f_BS+f_TN-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_BS+f_TN)-c1o54*(rhoDiff + c6o1*( -VeloY-VeloZ)))/(c1o1+q);
+      }
+
+      q = q_dirBN[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c1o54* (drho+c9o2*(     vx2-vx3)*(     vx2-vx3)-cu_sq);
+         (D.f[DIR_0MP])[kts]=(c1o1-q)/(c1o1+q)*(f_BN-f_TS+(f_BN+f_TS-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_BN+f_TS)-c1o54*(rhoDiff + c6o1*( VeloY-VeloZ)))/(c1o1+q);
+      }
+
+      q = q_dirTS[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c1o54* (drho+c9o2*(    -vx2+vx3)*(    -vx2+vx3)-cu_sq);
+         (D.f[DIR_0PM])[kbn]=(c1o1-q)/(c1o1+q)*(f_TS-f_BN+(f_TS+f_BN-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_TS+f_BN)-c1o54*(rhoDiff + c6o1*( -VeloY+VeloZ)))/(c1o1+q);
+      }
+
+      q = q_dirTNE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c1o216*(drho+c9o2*( vx1+vx2+vx3)*( vx1+vx2+vx3)-cu_sq);
+         (D.f[DIR_MMM])[kbsw]=(c1o1-q)/(c1o1+q)*(f_TNE-f_BSW+(f_TNE+f_BSW-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_TNE+f_BSW)-c1o216*(rhoDiff + c6o1*( VeloX+VeloY+VeloZ)))/(c1o1+q);
+      }
+
+      q = q_dirBSW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c1o216*(drho+c9o2*(-vx1-vx2-vx3)*(-vx1-vx2-vx3)-cu_sq);
+         (D.f[DIR_PPP])[ktne]=(c1o1-q)/(c1o1+q)*(f_BSW-f_TNE+(f_BSW+f_TNE-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_BSW+f_TNE)-c1o216*(rhoDiff + c6o1*(-VeloX-VeloY-VeloZ)))/(c1o1+q);
+      }
+
+      q = q_dirBNE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c1o216*(drho+c9o2*( vx1+vx2-vx3)*( vx1+vx2-vx3)-cu_sq);
+         (D.f[DIR_MMP])[ktsw]=(c1o1-q)/(c1o1+q)*(f_BNE-f_TSW+(f_BNE+f_TSW-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_BNE+f_TSW)-c1o216*(rhoDiff + c6o1*( VeloX+VeloY-VeloZ)))/(c1o1+q);
+      }
+
+      q = q_dirTSW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c1o216*(drho+c9o2*(-vx1-vx2+vx3)*(-vx1-vx2+vx3)-cu_sq);
+         (D.f[DIR_PPM])[kbne]=(c1o1-q)/(c1o1+q)*(f_TSW-f_BNE+(f_TSW+f_BNE-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_TSW+f_BNE)-c1o216*(rhoDiff + c6o1*(-VeloX-VeloY+VeloZ)))/(c1o1+q);
+      }
+
+      q = q_dirTSE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c1o216*(drho+c9o2*( vx1-vx2+vx3)*( vx1-vx2+vx3)-cu_sq);
+         (D.f[DIR_MPM])[kbnw]=(c1o1-q)/(c1o1+q)*(f_TSE-f_BNW+(f_TSE+f_BNW-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_TSE+f_BNW)-c1o216*(rhoDiff + c6o1*( VeloX-VeloY+VeloZ)))/(c1o1+q);
+      }
+
+      q = q_dirBNW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c1o216*(drho+c9o2*(-vx1+vx2-vx3)*(-vx1+vx2-vx3)-cu_sq);
+         (D.f[DIR_PMP])[ktse]=(c1o1-q)/(c1o1+q)*(f_BNW-f_TSE+(f_BNW+f_TSE-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_BNW+f_TSE)-c1o216*(rhoDiff + c6o1*(-VeloX+VeloY-VeloZ)))/(c1o1+q);
+      }
+
+      q = q_dirBSE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c1o216*(drho+c9o2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cu_sq);
+         (D.f[DIR_MPP])[ktnw]=(c1o1-q)/(c1o1+q)*(f_BSE-f_TNW+(f_BSE+f_TNW-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_BSE+f_TNW)-c1o216*(rhoDiff + c6o1*( VeloX-VeloY-VeloZ)))/(c1o1+q);
+      }
+
+      q = q_dirTNW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c1o216*(drho+c9o2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cu_sq);
+         (D.f[DIR_PMM])[kbse]=(c1o1-q)/(c1o1+q)*(f_TNW-f_BSE+(f_TNW+f_BSE-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_TNW+f_BSE)-c1o216*(rhoDiff + c6o1*(-VeloX+VeloY+VeloZ)))/(c1o1+q);
+      }
+   }
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
diff --git a/src/gpu/VirtualFluids_GPU/GPU/SchlafferBCs27.cu b/src/gpu/VirtualFluids_GPU/GPU/SchlafferBCs27.cu
index 8675780d26e63656b04fdfc1f9836b1eba8d1b87..5d4572e234fdcad072e9b666c911f3250c32346a 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/SchlafferBCs27.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/SchlafferBCs27.cu
@@ -21,7 +21,7 @@ __global__ void PressSchlaff27(real* rhoBC,
                                           unsigned int* neighborX,
                                           unsigned int* neighborY,
                                           unsigned int* neighborZ,
-                                          unsigned int size_Mat,
+                                          unsigned long long numberOfLBnodes,
                                           bool isEvenTimestep)
 {
    ////////////////////////////////////////////////////////////////////////////////
@@ -71,94 +71,94 @@ __global__ void PressSchlaff27(real* rhoBC,
       Distributions27 D;
       if (isEvenTimestep==true)
       {
-         D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+         D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
       }
       else
       {
-         D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+         D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
       }
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       real        f1_E,f1_W,f1_N,f1_S,f1_T,f1_B,f1_NE,f1_SW,f1_SE,f1_NW,f1_TE,f1_BW,f1_BE,f1_TW,f1_TN,f1_BS,f1_BN,f1_TS,f1_ZERO,f1_TNE,f1_TSW,f1_TSE,f1_TNW,f1_BNE,f1_BSW,f1_BSE,f1_BNW;
 
-      f1_E    = (D.f[DIR_P00   ])[ke   ];
-      f1_W    = (D.f[DIR_M00   ])[kw   ];
-      f1_N    = (D.f[DIR_0P0   ])[kn   ];
-      f1_S    = (D.f[DIR_0M0   ])[ks   ];
-      f1_T    = (D.f[DIR_00P   ])[kt   ];
-      f1_B    = (D.f[DIR_00M   ])[kb   ];
-      f1_NE   = (D.f[DIR_PP0  ])[kne  ];
-      f1_SW   = (D.f[DIR_MM0  ])[ksw  ];
-      f1_SE   = (D.f[DIR_PM0  ])[kse  ];
-      f1_NW   = (D.f[DIR_MP0  ])[knw  ];
-      f1_TE   = (D.f[DIR_P0P  ])[kte  ];
-      f1_BW   = (D.f[DIR_M0M  ])[kbw  ];
-      f1_BE   = (D.f[DIR_P0M  ])[kbe  ];
-      f1_TW   = (D.f[DIR_M0P  ])[ktw  ];
-      f1_TN   = (D.f[DIR_0PP  ])[ktn  ];
-      f1_BS   = (D.f[DIR_0MM  ])[kbs  ];
-      f1_BN   = (D.f[DIR_0PM  ])[kbn  ];
-      f1_TS   = (D.f[DIR_0MP  ])[kts  ];
+      f1_E    = (D.f[DIR_P00])[ke   ];
+      f1_W    = (D.f[DIR_M00])[kw   ];
+      f1_N    = (D.f[DIR_0P0])[kn   ];
+      f1_S    = (D.f[DIR_0M0])[ks   ];
+      f1_T    = (D.f[DIR_00P])[kt   ];
+      f1_B    = (D.f[DIR_00M])[kb   ];
+      f1_NE   = (D.f[DIR_PP0])[kne  ];
+      f1_SW   = (D.f[DIR_MM0])[ksw  ];
+      f1_SE   = (D.f[DIR_PM0])[kse  ];
+      f1_NW   = (D.f[DIR_MP0])[knw  ];
+      f1_TE   = (D.f[DIR_P0P])[kte  ];
+      f1_BW   = (D.f[DIR_M0M])[kbw  ];
+      f1_BE   = (D.f[DIR_P0M])[kbe  ];
+      f1_TW   = (D.f[DIR_M0P])[ktw  ];
+      f1_TN   = (D.f[DIR_0PP])[ktn  ];
+      f1_BS   = (D.f[DIR_0MM])[kbs  ];
+      f1_BN   = (D.f[DIR_0PM])[kbn  ];
+      f1_TS   = (D.f[DIR_0MP])[kts  ];
       f1_ZERO = (D.f[DIR_000])[kzero];
-      f1_TNE  = (D.f[DIR_PPP ])[ktne ];
-      f1_TSW  = (D.f[DIR_MMP ])[ktsw ];
-      f1_TSE  = (D.f[DIR_PMP ])[ktse ];
-      f1_TNW  = (D.f[DIR_MPP ])[ktnw ];
-      f1_BNE  = (D.f[DIR_PPM ])[kbne ];
-      f1_BSW  = (D.f[DIR_MMM ])[kbsw ];
-      f1_BSE  = (D.f[DIR_PMM ])[kbse ];
-      f1_BNW  = (D.f[DIR_MPM ])[kbnw ];
+      f1_TNE  = (D.f[DIR_PPP])[ktne ];
+      f1_TSW  = (D.f[DIR_MMP])[ktsw ];
+      f1_TSE  = (D.f[DIR_PMP])[ktse ];
+      f1_TNW  = (D.f[DIR_MPP])[ktnw ];
+      f1_BNE  = (D.f[DIR_PPM])[kbne ];
+      f1_BSW  = (D.f[DIR_MMM])[kbsw ];
+      f1_BSE  = (D.f[DIR_PMM])[kbse ];
+      f1_BNW  = (D.f[DIR_MPM])[kbnw ];
       //////////////////////////////////////////////////////////////////////////
       real cs       = c1o1/sqrt(c3o1);
       real csp1     = cs + c1o1;
@@ -222,15 +222,15 @@ __global__ void PressSchlaff27(real* rhoBC,
 
       deltaVz0[k] = tempDeltaV;
 
-      (D.f[DIR_00M   ])[kb   ] = f1_B   ;
-      (D.f[DIR_M0M  ])[kbw  ] = f1_BW  ;
-      (D.f[DIR_P0M  ])[kbe  ] = f1_BE  ;
-      (D.f[DIR_0MM  ])[kbs  ] = f1_BS  ;
-      (D.f[DIR_0PM  ])[kbn  ] = f1_BN  ;
-      (D.f[DIR_PPM ])[kbne ] = f1_BNE ;
-      (D.f[DIR_MMM ])[kbsw ] = f1_BSW ;
-      (D.f[DIR_PMM ])[kbse ] = f1_BSE ;
-      (D.f[DIR_MPM ])[kbnw ] = f1_BNW ;
+      (D.f[DIR_00M])[kb   ] = f1_B   ;
+      (D.f[DIR_M0M])[kbw  ] = f1_BW  ;
+      (D.f[DIR_P0M])[kbe  ] = f1_BE  ;
+      (D.f[DIR_0MM])[kbs  ] = f1_BS  ;
+      (D.f[DIR_0PM])[kbn  ] = f1_BN  ;
+      (D.f[DIR_PPM])[kbne ] = f1_BNE ;
+      (D.f[DIR_MMM])[kbsw ] = f1_BSW ;
+      (D.f[DIR_PMM])[kbse ] = f1_BSE ;
+      (D.f[DIR_MPM])[kbnw ] = f1_BNW ;
    }
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -285,7 +285,7 @@ __global__ void VelSchlaff27(  int t,
                                           unsigned int* neighborX,
                                           unsigned int* neighborY,
                                           unsigned int* neighborZ,
-                                          unsigned int size_Mat,
+                                          unsigned long long numberOfLBnodes,
                                           bool isEvenTimestep)
 {
    ////////////////////////////////////////////////////////////////////////////////
@@ -335,122 +335,122 @@ __global__ void VelSchlaff27(  int t,
       Distributions27 D;
       if (isEvenTimestep==true)
       {
-         D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+         D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
       }
       else
       {
-         D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+         D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
       }
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       real        f1_E,f1_W,f1_N,f1_S,f1_T,f1_B,f1_NE,f1_SW,f1_SE,f1_NW,f1_TE,f1_BW,f1_BE,f1_TW,f1_TN,f1_BS,f1_BN,f1_TS,f1_ZERO,
                      f1_TNE,f1_TSW,f1_TSE,f1_TNW,f1_BNE,f1_BSW,f1_BSE,f1_BNW;
 
-      f1_E    = (D.f[DIR_P00   ])[ke   ];
-      f1_W    = (D.f[DIR_M00   ])[kw   ];
-      f1_N    = (D.f[DIR_0P0   ])[kn   ];
-      f1_S    = (D.f[DIR_0M0   ])[ks   ];
-      f1_T    = (D.f[DIR_00P   ])[kt   ];
-      f1_B    = (D.f[DIR_00M   ])[kb   ];
-      f1_NE   = (D.f[DIR_PP0  ])[kne  ];
-      f1_SW   = (D.f[DIR_MM0  ])[ksw  ];
-      f1_SE   = (D.f[DIR_PM0  ])[kse  ];
-      f1_NW   = (D.f[DIR_MP0  ])[knw  ];
-      f1_TE   = (D.f[DIR_P0P  ])[kte  ];
-      f1_BW   = (D.f[DIR_M0M  ])[kbw  ];
-      f1_BE   = (D.f[DIR_P0M  ])[kbe  ];
-      f1_TW   = (D.f[DIR_M0P  ])[ktw  ];
-      f1_TN   = (D.f[DIR_0PP  ])[ktn  ];
-      f1_BS   = (D.f[DIR_0MM  ])[kbs  ];
-      f1_BN   = (D.f[DIR_0PM  ])[kbn  ];
-      f1_TS   = (D.f[DIR_0MP  ])[kts  ];
+      f1_E    = (D.f[DIR_P00])[ke   ];
+      f1_W    = (D.f[DIR_M00])[kw   ];
+      f1_N    = (D.f[DIR_0P0])[kn   ];
+      f1_S    = (D.f[DIR_0M0])[ks   ];
+      f1_T    = (D.f[DIR_00P])[kt   ];
+      f1_B    = (D.f[DIR_00M])[kb   ];
+      f1_NE   = (D.f[DIR_PP0])[kne  ];
+      f1_SW   = (D.f[DIR_MM0])[ksw  ];
+      f1_SE   = (D.f[DIR_PM0])[kse  ];
+      f1_NW   = (D.f[DIR_MP0])[knw  ];
+      f1_TE   = (D.f[DIR_P0P])[kte  ];
+      f1_BW   = (D.f[DIR_M0M])[kbw  ];
+      f1_BE   = (D.f[DIR_P0M])[kbe  ];
+      f1_TW   = (D.f[DIR_M0P])[ktw  ];
+      f1_TN   = (D.f[DIR_0PP])[ktn  ];
+      f1_BS   = (D.f[DIR_0MM])[kbs  ];
+      f1_BN   = (D.f[DIR_0PM])[kbn  ];
+      f1_TS   = (D.f[DIR_0MP])[kts  ];
       f1_ZERO = (D.f[DIR_000])[kzero];
-      f1_TNE  = (D.f[DIR_PPP ])[ktne ];
-      f1_TSW  = (D.f[DIR_MMP ])[ktsw ];
-      f1_TSE  = (D.f[DIR_PMP ])[ktse ];
-      f1_TNW  = (D.f[DIR_MPP ])[ktnw ];
-      f1_BNE  = (D.f[DIR_PPM ])[kbne ];
-      f1_BSW  = (D.f[DIR_MMM ])[kbsw ];
-      f1_BSE  = (D.f[DIR_PMM ])[kbse ];
-      f1_BNW  = (D.f[DIR_MPM ])[kbnw ];
-      //f1_W    = (D.f[DIR_P00   ])[ke   ];
-      //f1_E    = (D.f[DIR_M00   ])[kw   ];
-      //f1_S    = (D.f[DIR_0P0   ])[kn   ];
-      //f1_N    = (D.f[DIR_0M0   ])[ks   ];
-      //f1_B    = (D.f[DIR_00P   ])[kt   ];
-      //f1_T    = (D.f[DIR_00M   ])[kb   ];
-      //f1_SW   = (D.f[DIR_PP0  ])[kne  ];
-      //f1_NE   = (D.f[DIR_MM0  ])[ksw  ];
-      //f1_NW   = (D.f[DIR_PM0  ])[kse  ];
-      //f1_SE   = (D.f[DIR_MP0  ])[knw  ];
-      //f1_BW   = (D.f[DIR_P0P  ])[kte  ];
-      //f1_TE   = (D.f[DIR_M0M  ])[kbw  ];
-      //f1_TW   = (D.f[DIR_P0M  ])[kbe  ];
-      //f1_BE   = (D.f[DIR_M0P  ])[ktw  ];
-      //f1_BS   = (D.f[DIR_0PP  ])[ktn  ];
-      //f1_TN   = (D.f[DIR_0MM  ])[kbs  ];
-      //f1_TS   = (D.f[DIR_0PM  ])[kbn  ];
-      //f1_BN   = (D.f[DIR_0MP  ])[kts  ];
+      f1_TNE  = (D.f[DIR_PPP])[ktne ];
+      f1_TSW  = (D.f[DIR_MMP])[ktsw ];
+      f1_TSE  = (D.f[DIR_PMP])[ktse ];
+      f1_TNW  = (D.f[DIR_MPP])[ktnw ];
+      f1_BNE  = (D.f[DIR_PPM])[kbne ];
+      f1_BSW  = (D.f[DIR_MMM])[kbsw ];
+      f1_BSE  = (D.f[DIR_PMM])[kbse ];
+      f1_BNW  = (D.f[DIR_MPM])[kbnw ];
+      //f1_W    = (D.f[DIR_P00])[ke   ];
+      //f1_E    = (D.f[DIR_M00])[kw   ];
+      //f1_S    = (D.f[DIR_0P0])[kn   ];
+      //f1_N    = (D.f[DIR_0M0])[ks   ];
+      //f1_B    = (D.f[DIR_00P])[kt   ];
+      //f1_T    = (D.f[DIR_00M])[kb   ];
+      //f1_SW   = (D.f[DIR_PP0])[kne  ];
+      //f1_NE   = (D.f[DIR_MM0])[ksw  ];
+      //f1_NW   = (D.f[DIR_PM0])[kse  ];
+      //f1_SE   = (D.f[DIR_MP0])[knw  ];
+      //f1_BW   = (D.f[DIR_P0P])[kte  ];
+      //f1_TE   = (D.f[DIR_M0M])[kbw  ];
+      //f1_TW   = (D.f[DIR_P0M])[kbe  ];
+      //f1_BE   = (D.f[DIR_M0P])[ktw  ];
+      //f1_BS   = (D.f[DIR_0PP])[ktn  ];
+      //f1_TN   = (D.f[DIR_0MM])[kbs  ];
+      //f1_TS   = (D.f[DIR_0PM])[kbn  ];
+      //f1_BN   = (D.f[DIR_0MP])[kts  ];
       //f1_ZERO = (D.f[DIR_000])[kzero];
-      //f1_BSW  = (D.f[DIR_PPP ])[ktne ];
-      //f1_BNE  = (D.f[DIR_MMP ])[ktsw ];
-      //f1_BNW  = (D.f[DIR_PMP ])[ktse ];
-      //f1_BSE  = (D.f[DIR_MPP ])[ktnw ];
-      //f1_TSW  = (D.f[DIR_PPM ])[kbne ];
-      //f1_TNE  = (D.f[DIR_MMM ])[kbsw ];
-      //f1_TNW  = (D.f[DIR_PMM ])[kbse ];
-      //f1_TSE  = (D.f[DIR_MPM ])[kbnw ];
+      //f1_BSW  = (D.f[DIR_PPP])[ktne ];
+      //f1_BNE  = (D.f[DIR_MMP])[ktsw ];
+      //f1_BNW  = (D.f[DIR_PMP])[ktse ];
+      //f1_BSE  = (D.f[DIR_MPP])[ktnw ];
+      //f1_TSW  = (D.f[DIR_PPM])[kbne ];
+      //f1_TNE  = (D.f[DIR_MMM])[kbsw ];
+      //f1_TNW  = (D.f[DIR_PMM])[kbse ];
+      //f1_TSE  = (D.f[DIR_MPM])[kbnw ];
       //////////////////////////////////////////////////////////////////////////
       real cs       = c1o1/sqrt(c3o1);
       real csp1     = cs + c1o1;
@@ -522,64 +522,64 @@ __global__ void VelSchlaff27(  int t,
       f1_TNW = f1_BSE - c1o36 * (VX - VY - VZ);
 
       deltaVz0[k] = tempDeltaV;
-      (D.f[DIR_00P   ])[kt   ] = f1_T  ;
-      (D.f[DIR_P0P  ])[kte  ] = f1_TE ;
-      (D.f[DIR_M0P  ])[ktw  ] = f1_TW ;
-      (D.f[DIR_0PP  ])[ktn  ] = f1_TN ;
-      (D.f[DIR_0MP  ])[kts  ] = f1_TS ;
-      (D.f[DIR_PPP ])[ktne ] = f1_TNE;
-      (D.f[DIR_MMP ])[ktsw ] = f1_TSW;
-      (D.f[DIR_PMP ])[ktse ] = f1_TSE;
-      (D.f[DIR_MPP ])[ktnw ] = f1_TNW;
-
-      //(D.f[DIR_00M   ])[kb   ] = f1_B   ;
-      //(D.f[DIR_M0M  ])[kbw  ] = f1_BW  ;
-      //(D.f[DIR_P0M  ])[kbe  ] = f1_BE  ;
-      //(D.f[DIR_0MM  ])[kbs  ] = f1_BS  ;
-      //(D.f[DIR_0PM  ])[kbn  ] = f1_BN  ;
-      //(D.f[DIR_PPM ])[kbne ] = f1_BNE ;
-      //(D.f[DIR_MMM ])[kbsw ] = f1_BSW ;
-      //(D.f[DIR_PMM ])[kbse ] = f1_BSE ;
-      //(D.f[DIR_MPM ])[kbnw ] = f1_BNW ;
-
-
-      //(D.f[DIR_00P   ])[kt   ] = f1_B  ;
-      //(D.f[DIR_P0P  ])[kte  ] = f1_BW ;
-      //(D.f[DIR_M0P  ])[ktw  ] = f1_BE ;
-      //(D.f[DIR_0PP  ])[ktn  ] = f1_BS ;
-      //(D.f[DIR_0MP  ])[kts  ] = f1_BN ;
-      //(D.f[DIR_PPP ])[ktne ] = f1_BSW;
-      //(D.f[DIR_MMP ])[ktsw ] = f1_BNE;
-      //(D.f[DIR_PMP ])[ktse ] = f1_BNW;
-      //(D.f[DIR_MPP ])[ktnw ] = f1_BSE;
-
-      //(D.f[DIR_P00   ])[ke   ] = f1_W   -c2over27*drho1;
-      //(D.f[DIR_M00   ])[kw   ] = f1_E   -c2over27*drho1;
-      //(D.f[DIR_0P0   ])[kn   ] = f1_S   -c2over27*drho1;
-      //(D.f[DIR_0M0   ])[ks   ] = f1_N   -c2over27*drho1;
-      //(D.f[DIR_00P   ])[kt   ] = f1_B   -c2over27*drho1;
-      //(D.f[DIR_00M   ])[kb   ] = f1_T   -c2over27*drho1;
-      //(D.f[DIR_PP0  ])[kne  ] = f1_SW  -c1over54*drho1;
-      //(D.f[DIR_MM0  ])[ksw  ] = f1_NE  -c1over54*drho1;
-      //(D.f[DIR_PM0  ])[kse  ] = f1_NW  -c1over54*drho1;
-      //(D.f[DIR_MP0  ])[knw  ] = f1_SE  -c1over54*drho1;
-      //(D.f[DIR_P0P  ])[kte  ] = f1_BW  -c1over54*drho1;
-      //(D.f[DIR_M0M  ])[kbw  ] = f1_TE  -c1over54*drho1;
-      //(D.f[DIR_P0M  ])[kbe  ] = f1_TW  -c1over54*drho1;
-      //(D.f[DIR_M0P  ])[ktw  ] = f1_BE  -c1over54*drho1;
-      //(D.f[DIR_0PP  ])[ktn  ] = f1_BS  -c1over54*drho1;
-      //(D.f[DIR_0MM  ])[kbs  ] = f1_TN  -c1over54*drho1;
-      //(D.f[DIR_0PM  ])[kbn  ] = f1_TS  -c1over54*drho1;
-      //(D.f[DIR_0MP  ])[kts  ] = f1_BN  -c1over54*drho1;
+      (D.f[DIR_00P])[kt   ] = f1_T  ;
+      (D.f[DIR_P0P])[kte  ] = f1_TE ;
+      (D.f[DIR_M0P])[ktw  ] = f1_TW ;
+      (D.f[DIR_0PP])[ktn  ] = f1_TN ;
+      (D.f[DIR_0MP])[kts  ] = f1_TS ;
+      (D.f[DIR_PPP])[ktne ] = f1_TNE;
+      (D.f[DIR_MMP])[ktsw ] = f1_TSW;
+      (D.f[DIR_PMP])[ktse ] = f1_TSE;
+      (D.f[DIR_MPP])[ktnw ] = f1_TNW;
+
+      //(D.f[DIR_00M])[kb   ] = f1_B   ;
+      //(D.f[DIR_M0M])[kbw  ] = f1_BW  ;
+      //(D.f[DIR_P0M])[kbe  ] = f1_BE  ;
+      //(D.f[DIR_0MM])[kbs  ] = f1_BS  ;
+      //(D.f[DIR_0PM])[kbn  ] = f1_BN  ;
+      //(D.f[DIR_PPM])[kbne ] = f1_BNE ;
+      //(D.f[DIR_MMM])[kbsw ] = f1_BSW ;
+      //(D.f[DIR_PMM])[kbse ] = f1_BSE ;
+      //(D.f[DIR_MPM])[kbnw ] = f1_BNW ;
+
+
+      //(D.f[DIR_00P])[kt   ] = f1_B  ;
+      //(D.f[DIR_P0P])[kte  ] = f1_BW ;
+      //(D.f[DIR_M0P])[ktw  ] = f1_BE ;
+      //(D.f[DIR_0PP])[ktn  ] = f1_BS ;
+      //(D.f[DIR_0MP])[kts  ] = f1_BN ;
+      //(D.f[DIR_PPP])[ktne ] = f1_BSW;
+      //(D.f[DIR_MMP])[ktsw ] = f1_BNE;
+      //(D.f[DIR_PMP])[ktse ] = f1_BNW;
+      //(D.f[DIR_MPP])[ktnw ] = f1_BSE;
+
+      //(D.f[DIR_P00])[ke   ] = f1_W   -c2over27*drho1;
+      //(D.f[DIR_M00])[kw   ] = f1_E   -c2over27*drho1;
+      //(D.f[DIR_0P0])[kn   ] = f1_S   -c2over27*drho1;
+      //(D.f[DIR_0M0])[ks   ] = f1_N   -c2over27*drho1;
+      //(D.f[DIR_00P])[kt   ] = f1_B   -c2over27*drho1;
+      //(D.f[DIR_00M])[kb   ] = f1_T   -c2over27*drho1;
+      //(D.f[DIR_PP0])[kne  ] = f1_SW  -c1over54*drho1;
+      //(D.f[DIR_MM0])[ksw  ] = f1_NE  -c1over54*drho1;
+      //(D.f[DIR_PM0])[kse  ] = f1_NW  -c1over54*drho1;
+      //(D.f[DIR_MP0])[knw  ] = f1_SE  -c1over54*drho1;
+      //(D.f[DIR_P0P])[kte  ] = f1_BW  -c1over54*drho1;
+      //(D.f[DIR_M0M])[kbw  ] = f1_TE  -c1over54*drho1;
+      //(D.f[DIR_P0M])[kbe  ] = f1_TW  -c1over54*drho1;
+      //(D.f[DIR_M0P])[ktw  ] = f1_BE  -c1over54*drho1;
+      //(D.f[DIR_0PP])[ktn  ] = f1_BS  -c1over54*drho1;
+      //(D.f[DIR_0MM])[kbs  ] = f1_TN  -c1over54*drho1;
+      //(D.f[DIR_0PM])[kbn  ] = f1_TS  -c1over54*drho1;
+      //(D.f[DIR_0MP])[kts  ] = f1_BN  -c1over54*drho1;
       //(D.f[DIR_000])[kzero] = f1_ZERO-c8over27*drho1;
-      //(D.f[DIR_PPP ])[ktne ] = f1_BSW -c1over216*drho1;
-      //(D.f[DIR_MMP ])[ktsw ] = f1_BNE -c1over216*drho1;
-      //(D.f[DIR_PMP ])[ktse ] = f1_BNW -c1over216*drho1;
-      //(D.f[DIR_MPP ])[ktnw ] = f1_BSE -c1over216*drho1;
-      //(D.f[DIR_PPM ])[kbne ] = f1_TSW -c1over216*drho1;
-      //(D.f[DIR_MMM ])[kbsw ] = f1_TNE -c1over216*drho1;
-      //(D.f[DIR_PMM ])[kbse ] = f1_TNW -c1over216*drho1;
-      //(D.f[DIR_MPM ])[kbnw ] = f1_TSE -c1over216*drho1;
+      //(D.f[DIR_PPP])[ktne ] = f1_BSW -c1over216*drho1;
+      //(D.f[DIR_MMP])[ktsw ] = f1_BNE -c1over216*drho1;
+      //(D.f[DIR_PMP])[ktse ] = f1_BNW -c1over216*drho1;
+      //(D.f[DIR_MPP])[ktnw ] = f1_BSE -c1over216*drho1;
+      //(D.f[DIR_PPM])[kbne ] = f1_TSW -c1over216*drho1;
+      //(D.f[DIR_MMM])[kbsw ] = f1_TNE -c1over216*drho1;
+      //(D.f[DIR_PMM])[kbse ] = f1_TNW -c1over216*drho1;
+      //(D.f[DIR_MPM])[kbnw ] = f1_TSE -c1over216*drho1;
    }
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/src/gpu/VirtualFluids_GPU/GPU/SetForcing27.cu b/src/gpu/VirtualFluids_GPU/GPU/SetForcing27.cu
index 8dbf2c670a549f9a6afe581510205c31246b50cb..07fc5853eb7042d5567c38a03cb27418142bf642 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/SetForcing27.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/SetForcing27.cu
@@ -16,69 +16,69 @@ __global__ void GetVeloforForcing27( real* DD,
 												unsigned int* neighborX,
 												unsigned int* neighborY,
 												unsigned int* neighborZ,
-												unsigned int size_Mat, 
+												unsigned long long numberOfLBnodes, 
 												bool isEvenTimestep)
 {
 	Distributions27 D;
 	if (isEvenTimestep==false)
 	{
-		D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-		D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-		D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-		D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-		D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-		D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-		D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-		D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-		D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-		D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-		D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-		D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-		D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-		D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-		D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-		D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-		D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-		D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-		D.f[DIR_000] = &DD[DIR_000*size_Mat];
-		D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-		D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-		D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-		D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-		D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-		D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-		D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-		D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+		D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+		D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+		D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+		D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+		D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+		D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+		D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+		D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+		D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+		D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+		D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+		D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+		D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+		D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+		D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+		D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+		D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+		D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+		D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+		D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+		D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+		D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+		D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+		D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+		D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+		D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+		D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
 	} 
 	else
 	{
-		D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-		D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-		D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-		D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-		D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-		D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-		D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-		D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-		D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-		D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-		D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-		D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-		D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-		D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-		D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-		D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-		D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-		D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-		D.f[DIR_000] = &DD[DIR_000*size_Mat];
-		D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-		D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-		D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-		D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-		D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-		D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-		D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-		D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+		D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+		D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+		D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+		D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+		D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+		D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+		D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+		D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+		D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+		D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+		D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+		D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+		D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+		D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+		D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+		D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+		D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+		D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+		D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+		D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+		D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+		D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+		D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+		D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+		D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+		D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+		D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
 	}
 	////////////////////////////////////////////////////////////////////////////////
 	const unsigned  x = threadIdx.x;  // Globaler x-Index 
@@ -123,33 +123,33 @@ __global__ void GetVeloforForcing27( real* DD,
 		unsigned int ktne = KQK;
 		unsigned int kbsw = neighborZ[ksw];
 		////////////////////////////////////////////////////////////////////////////////
-		real mfcbb = (D.f[DIR_P00   ])[ke   ];
-		real mfabb = (D.f[DIR_M00   ])[kw   ];
-		real mfbcb = (D.f[DIR_0P0   ])[kn   ];
-		real mfbab = (D.f[DIR_0M0   ])[ks   ];
-		real mfbbc = (D.f[DIR_00P   ])[kt   ];
-		real mfbba = (D.f[DIR_00M   ])[kb   ];
-		real mfccb = (D.f[DIR_PP0  ])[kne  ];
-		real mfaab = (D.f[DIR_MM0  ])[ksw  ];
-		real mfcab = (D.f[DIR_PM0  ])[kse  ];
-		real mfacb = (D.f[DIR_MP0  ])[knw  ];
-		real mfcbc = (D.f[DIR_P0P  ])[kte  ];
-		real mfaba = (D.f[DIR_M0M  ])[kbw  ];
-		real mfcba = (D.f[DIR_P0M  ])[kbe  ];
-		real mfabc = (D.f[DIR_M0P  ])[ktw  ];
-		real mfbcc = (D.f[DIR_0PP  ])[ktn  ];
-		real mfbaa = (D.f[DIR_0MM  ])[kbs  ];
-		real mfbca = (D.f[DIR_0PM  ])[kbn  ];
-		real mfbac = (D.f[DIR_0MP  ])[kts  ];
+		real mfcbb = (D.f[DIR_P00])[ke   ];
+		real mfabb = (D.f[DIR_M00])[kw   ];
+		real mfbcb = (D.f[DIR_0P0])[kn   ];
+		real mfbab = (D.f[DIR_0M0])[ks   ];
+		real mfbbc = (D.f[DIR_00P])[kt   ];
+		real mfbba = (D.f[DIR_00M])[kb   ];
+		real mfccb = (D.f[DIR_PP0])[kne  ];
+		real mfaab = (D.f[DIR_MM0])[ksw  ];
+		real mfcab = (D.f[DIR_PM0])[kse  ];
+		real mfacb = (D.f[DIR_MP0])[knw  ];
+		real mfcbc = (D.f[DIR_P0P])[kte  ];
+		real mfaba = (D.f[DIR_M0M])[kbw  ];
+		real mfcba = (D.f[DIR_P0M])[kbe  ];
+		real mfabc = (D.f[DIR_M0P])[ktw  ];
+		real mfbcc = (D.f[DIR_0PP])[ktn  ];
+		real mfbaa = (D.f[DIR_0MM])[kbs  ];
+		real mfbca = (D.f[DIR_0PM])[kbn  ];
+		real mfbac = (D.f[DIR_0MP])[kts  ];
 		real mfbbb = (D.f[DIR_000])[kzero];
-		real mfccc = (D.f[DIR_PPP ])[ktne ];
-		real mfaac = (D.f[DIR_MMP ])[ktsw ];
-		real mfcac = (D.f[DIR_PMP ])[ktse ];
-		real mfacc = (D.f[DIR_MPP ])[ktnw ];
-		real mfcca = (D.f[DIR_PPM ])[kbne ];
-		real mfaaa = (D.f[DIR_MMM ])[kbsw ];
-		real mfcaa = (D.f[DIR_PMM ])[kbse ];
-		real mfaca = (D.f[DIR_MPM ])[kbnw ];
+		real mfccc = (D.f[DIR_PPP])[ktne ];
+		real mfaac = (D.f[DIR_MMP])[ktsw ];
+		real mfcac = (D.f[DIR_PMP])[ktse ];
+		real mfacc = (D.f[DIR_MPP])[ktnw ];
+		real mfcca = (D.f[DIR_PPM])[kbne ];
+		real mfaaa = (D.f[DIR_MMM])[kbsw ];
+		real mfcaa = (D.f[DIR_PMM])[kbse ];
+		real mfaca = (D.f[DIR_MPM])[kbnw ];
 		////////////////////////////////////////////////////////////////////////////////////
 		real rho   = (mfccc+mfaaa + mfaca+mfcac + mfacc+mfcaa + mfaac+mfcca + 
 					 	 mfbac+mfbca + mfbaa+mfbcc + mfabc+mfcba + mfaba+mfcbc + mfacb+mfcab + mfaab+mfccb +
diff --git a/src/gpu/VirtualFluids_GPU/GPU/SlipBCs27.cu b/src/gpu/VirtualFluids_GPU/GPU/SlipBCs27.cu
index 0079c927373e90c1e408d2c57ace0595bcfdff15..cc8ca53d15ac02686b850a70ab181bb47285a7d1 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/SlipBCs27.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/SlipBCs27.cu
@@ -1,84 +1,117 @@
-/* Device code */
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file SlipBCs27.cu
+//! \ingroup GPU
+//! \author Martin Schoenherr, Anna Wellmann
+//======================================================================================
 #include "LBM/LB.h" 
 #include "lbm/constants/D3Q27.h"
 #include "lbm/constants/NumericConstants.h"
-#include "KernelUtilities.h"
+#include "LBM/GPUHelperFunctions/KernelUtilities.h"
 
 using namespace vf::lbm::constant;
 using namespace vf::lbm::dir;
+using namespace vf::gpu;
 
 //////////////////////////////////////////////////////////////////////////////
-__global__ void QSlipDevice27(real* DD, 
-                                         int* k_Q, 
-                                         real* QQ,
-                                         unsigned int numberOfBCnodes,
-                                         real om1, 
-                                         unsigned int* neighborX,
-                                         unsigned int* neighborY,
-                                         unsigned int* neighborZ,
-                                         unsigned int size_Mat, 
-                                         bool isEvenTimestep)
+__global__ void QSlipDevice27(
+    real* DD, 
+    int* k_Q, 
+    real* QQ,
+    unsigned int numberOfBCnodes,
+    real om1, 
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes, 
+    bool isEvenTimestep)
 {
    Distributions27 D;
    if (isEvenTimestep==true)
    {
-      D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+      D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
    } 
    else
    {
-      D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+      D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
    }
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  x = threadIdx.x;  // Globaler x-Index 
@@ -99,24 +132,24 @@ __global__ void QSlipDevice27(real* DD,
             *q_dirBE,  *q_dirTW,  *q_dirTN,  *q_dirBS,  *q_dirBN,  *q_dirTS,
             *q_dirTNE, *q_dirTSW, *q_dirTSE, *q_dirTNW, *q_dirBNE, *q_dirBSW,
             *q_dirBSE, *q_dirBNW; 
-      q_dirE   = &QQ[DIR_P00   * numberOfBCnodes];
-      q_dirW   = &QQ[DIR_M00   * numberOfBCnodes];
-      q_dirN   = &QQ[DIR_0P0   * numberOfBCnodes];
-      q_dirS   = &QQ[DIR_0M0   * numberOfBCnodes];
-      q_dirT   = &QQ[DIR_00P   * numberOfBCnodes];
-      q_dirB   = &QQ[DIR_00M   * numberOfBCnodes];
-      q_dirNE  = &QQ[DIR_PP0  * numberOfBCnodes];
-      q_dirSW  = &QQ[DIR_MM0  * numberOfBCnodes];
-      q_dirSE  = &QQ[DIR_PM0  * numberOfBCnodes];
-      q_dirNW  = &QQ[DIR_MP0  * numberOfBCnodes];
-      q_dirTE  = &QQ[DIR_P0P  * numberOfBCnodes];
-      q_dirBW  = &QQ[DIR_M0M  * numberOfBCnodes];
-      q_dirBE  = &QQ[DIR_P0M  * numberOfBCnodes];
-      q_dirTW  = &QQ[DIR_M0P  * numberOfBCnodes];
-      q_dirTN  = &QQ[DIR_0PP  * numberOfBCnodes];
-      q_dirBS  = &QQ[DIR_0MM  * numberOfBCnodes];
-      q_dirBN  = &QQ[DIR_0PM  * numberOfBCnodes];
-      q_dirTS  = &QQ[DIR_0MP  * numberOfBCnodes];
+      q_dirE   = &QQ[DIR_P00 * numberOfBCnodes];
+      q_dirW   = &QQ[DIR_M00 * numberOfBCnodes];
+      q_dirN   = &QQ[DIR_0P0 * numberOfBCnodes];
+      q_dirS   = &QQ[DIR_0M0 * numberOfBCnodes];
+      q_dirT   = &QQ[DIR_00P * numberOfBCnodes];
+      q_dirB   = &QQ[DIR_00M * numberOfBCnodes];
+      q_dirNE  = &QQ[DIR_PP0 * numberOfBCnodes];
+      q_dirSW  = &QQ[DIR_MM0 * numberOfBCnodes];
+      q_dirSE  = &QQ[DIR_PM0 * numberOfBCnodes];
+      q_dirNW  = &QQ[DIR_MP0 * numberOfBCnodes];
+      q_dirTE  = &QQ[DIR_P0P * numberOfBCnodes];
+      q_dirBW  = &QQ[DIR_M0M * numberOfBCnodes];
+      q_dirBE  = &QQ[DIR_P0M * numberOfBCnodes];
+      q_dirTW  = &QQ[DIR_M0P * numberOfBCnodes];
+      q_dirTN  = &QQ[DIR_0PP * numberOfBCnodes];
+      q_dirBS  = &QQ[DIR_0MM * numberOfBCnodes];
+      q_dirBN  = &QQ[DIR_0PM * numberOfBCnodes];
+      q_dirTS  = &QQ[DIR_0MP * numberOfBCnodes];
       q_dirTNE = &QQ[DIR_PPP * numberOfBCnodes];
       q_dirTSW = &QQ[DIR_MMP * numberOfBCnodes];
       q_dirTSE = &QQ[DIR_PMP * numberOfBCnodes];
@@ -156,32 +189,32 @@ __global__ void QSlipDevice27(real* DD,
       unsigned int ktne = KQK;
       unsigned int kbsw = neighborZ[ksw];
       ////////////////////////////////////////////////////////////////////////////////
-      real f_W    = (D.f[DIR_P00   ])[ke   ];
-      real f_E    = (D.f[DIR_M00   ])[kw   ];
-      real f_S    = (D.f[DIR_0P0   ])[kn   ];
-      real f_N    = (D.f[DIR_0M0   ])[ks   ];
-      real f_B    = (D.f[DIR_00P   ])[kt   ];
-      real f_T    = (D.f[DIR_00M   ])[kb   ];
-      real f_SW   = (D.f[DIR_PP0  ])[kne  ];
-      real f_NE   = (D.f[DIR_MM0  ])[ksw  ];
-      real f_NW   = (D.f[DIR_PM0  ])[kse  ];
-      real f_SE   = (D.f[DIR_MP0  ])[knw  ];
-      real f_BW   = (D.f[DIR_P0P  ])[kte  ];
-      real f_TE   = (D.f[DIR_M0M  ])[kbw  ];
-      real f_TW   = (D.f[DIR_P0M  ])[kbe  ];
-      real f_BE   = (D.f[DIR_M0P  ])[ktw  ];
-      real f_BS   = (D.f[DIR_0PP  ])[ktn  ];
-      real f_TN   = (D.f[DIR_0MM  ])[kbs  ];
-      real f_TS   = (D.f[DIR_0PM  ])[kbn  ];
-      real f_BN   = (D.f[DIR_0MP  ])[kts  ];
-      real f_BSW  = (D.f[DIR_PPP ])[ktne ];
-      real f_BNE  = (D.f[DIR_MMP ])[ktsw ];
-      real f_BNW  = (D.f[DIR_PMP ])[ktse ];
-      real f_BSE  = (D.f[DIR_MPP ])[ktnw ];
-      real f_TSW  = (D.f[DIR_PPM ])[kbne ];
-      real f_TNE  = (D.f[DIR_MMM ])[kbsw ];
-      real f_TNW  = (D.f[DIR_PMM ])[kbse ];
-      real f_TSE  = (D.f[DIR_MPM ])[kbnw ];
+      real f_W    = (D.f[DIR_P00])[ke   ];
+      real f_E    = (D.f[DIR_M00])[kw   ];
+      real f_S    = (D.f[DIR_0P0])[kn   ];
+      real f_N    = (D.f[DIR_0M0])[ks   ];
+      real f_B    = (D.f[DIR_00P])[kt   ];
+      real f_T    = (D.f[DIR_00M])[kb   ];
+      real f_SW   = (D.f[DIR_PP0])[kne  ];
+      real f_NE   = (D.f[DIR_MM0])[ksw  ];
+      real f_NW   = (D.f[DIR_PM0])[kse  ];
+      real f_SE   = (D.f[DIR_MP0])[knw  ];
+      real f_BW   = (D.f[DIR_P0P])[kte  ];
+      real f_TE   = (D.f[DIR_M0M])[kbw  ];
+      real f_TW   = (D.f[DIR_P0M])[kbe  ];
+      real f_BE   = (D.f[DIR_M0P])[ktw  ];
+      real f_BS   = (D.f[DIR_0PP])[ktn  ];
+      real f_TN   = (D.f[DIR_0MM])[kbs  ];
+      real f_TS   = (D.f[DIR_0PM])[kbn  ];
+      real f_BN   = (D.f[DIR_0MP])[kts  ];
+      real f_BSW  = (D.f[DIR_PPP])[ktne ];
+      real f_BNE  = (D.f[DIR_MMP])[ktsw ];
+      real f_BNW  = (D.f[DIR_PMP])[ktse ];
+      real f_BSE  = (D.f[DIR_MPP])[ktnw ];
+      real f_TSW  = (D.f[DIR_PPM])[kbne ];
+      real f_TNE  = (D.f[DIR_MMM])[kbsw ];
+      real f_TNW  = (D.f[DIR_PMM])[kbse ];
+      real f_TSE  = (D.f[DIR_MPM])[kbnw ];
       ////////////////////////////////////////////////////////////////////////////////
       real vx1, vx2, vx3, drho, feq, q;
       drho   =  f_TSE + f_TNW + f_TNE + f_TSW + f_BSE + f_BNW + f_BNE + f_BSW +
@@ -206,63 +239,63 @@ __global__ void QSlipDevice27(real* DD,
       //////////////////////////////////////////////////////////////////////////
       if (isEvenTimestep==false)
       {
-         D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+         D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
       } 
       else
       {
-         D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+         D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
       }
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       //Test
@@ -659,32 +692,26 @@ __global__ void QSlipDevice27(real* DD,
 
 //////////////////////////////////////////////////////////////////////////////
 __global__ void QSlipDeviceComp27(
-                                    real* distributions, 
-                                    int* subgridDistanceIndices, 
-                                    real* subgridDistances,
-                                    unsigned int numberOfBCnodes,
-                                    real omega, 
-                                    unsigned int* neighborX,
-                                    unsigned int* neighborY,
-                                    unsigned int* neighborZ,
-                                    unsigned int numberOfLBnodes, 
-                                    bool isEvenTimestep)
+    real* distributions, 
+    int* subgridDistanceIndices, 
+    real* subgridDistances,
+    unsigned int numberOfBCnodes,
+    real omega, 
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes, 
+    bool isEvenTimestep)
 {
    //! The slip boundary condition is executed in the following steps
    //!
+
    ////////////////////////////////////////////////////////////////////////////////
    //! - Get node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
    //!
-   const unsigned  x = threadIdx.x;  // global x-index 
-   const unsigned  y = blockIdx.x;   // global y-index 
-   const unsigned  z = blockIdx.y;   // global z-index 
-
-   const unsigned nx = blockDim.x;
-   const unsigned ny = gridDim.x;
+   const unsigned nodeIndex = getNodeIndex();
 
-   const unsigned k = nx*(ny*z + y) + x;
-
-   if(k < numberOfBCnodes)
+   if(nodeIndex < numberOfBCnodes)
    {
       //////////////////////////////////////////////////////////////////////////
       //! - Read distributions: style of reading and writing the distributions from/to stored arrays dependent on timestep is based on the esoteric twist algorithm \ref
@@ -702,7 +729,7 @@ __global__ void QSlipDeviceComp27(
       ////////////////////////////////////////////////////////////////////////////////
       //! - Set neighbor indices (necessary for indirect addressing)
       //!
-      unsigned int indexOfBCnode  = subgridDistanceIndices[k];
+      unsigned int indexOfBCnode  = subgridDistanceIndices[nodeIndex];
       unsigned int kzero= indexOfBCnode;
       unsigned int ke   = indexOfBCnode;
       unsigned int kw   = neighborX[indexOfBCnode];
@@ -734,32 +761,32 @@ __global__ void QSlipDeviceComp27(
       ////////////////////////////////////////////////////////////////////////////////
       //! - Set local distributions
       //!
-      real f_W    = (dist.f[DIR_P00   ])[ke   ];
-      real f_E    = (dist.f[DIR_M00   ])[kw   ];
-      real f_S    = (dist.f[DIR_0P0   ])[kn   ];
-      real f_N    = (dist.f[DIR_0M0   ])[ks   ];
-      real f_B    = (dist.f[DIR_00P   ])[kt   ];
-      real f_T    = (dist.f[DIR_00M   ])[kb   ];
-      real f_SW   = (dist.f[DIR_PP0  ])[kne  ];
-      real f_NE   = (dist.f[DIR_MM0  ])[ksw  ];
-      real f_NW   = (dist.f[DIR_PM0  ])[kse  ];
-      real f_SE   = (dist.f[DIR_MP0  ])[knw  ];
-      real f_BW   = (dist.f[DIR_P0P  ])[kte  ];
-      real f_TE   = (dist.f[DIR_M0M  ])[kbw  ];
-      real f_TW   = (dist.f[DIR_P0M  ])[kbe  ];
-      real f_BE   = (dist.f[DIR_M0P  ])[ktw  ];
-      real f_BS   = (dist.f[DIR_0PP  ])[ktn  ];
-      real f_TN   = (dist.f[DIR_0MM  ])[kbs  ];
-      real f_TS   = (dist.f[DIR_0PM  ])[kbn  ];
-      real f_BN   = (dist.f[DIR_0MP  ])[kts  ];
-      real f_BSW  = (dist.f[DIR_PPP ])[ktne ];
-      real f_BNE  = (dist.f[DIR_MMP ])[ktsw ];
-      real f_BNW  = (dist.f[DIR_PMP ])[ktse ];
-      real f_BSE  = (dist.f[DIR_MPP ])[ktnw ];
-      real f_TSW  = (dist.f[DIR_PPM ])[kbne ];
-      real f_TNE  = (dist.f[DIR_MMM ])[kbsw ];
-      real f_TNW  = (dist.f[DIR_PMM ])[kbse ];
-      real f_TSE  = (dist.f[DIR_MPM ])[kbnw ];
+      real f_W    = (dist.f[DIR_P00])[ke   ];
+      real f_E    = (dist.f[DIR_M00])[kw   ];
+      real f_S    = (dist.f[DIR_0P0])[kn   ];
+      real f_N    = (dist.f[DIR_0M0])[ks   ];
+      real f_B    = (dist.f[DIR_00P])[kt   ];
+      real f_T    = (dist.f[DIR_00M])[kb   ];
+      real f_SW   = (dist.f[DIR_PP0])[kne  ];
+      real f_NE   = (dist.f[DIR_MM0])[ksw  ];
+      real f_NW   = (dist.f[DIR_PM0])[kse  ];
+      real f_SE   = (dist.f[DIR_MP0])[knw  ];
+      real f_BW   = (dist.f[DIR_P0P])[kte  ];
+      real f_TE   = (dist.f[DIR_M0M])[kbw  ];
+      real f_TW   = (dist.f[DIR_P0M])[kbe  ];
+      real f_BE   = (dist.f[DIR_M0P])[ktw  ];
+      real f_BS   = (dist.f[DIR_0PP])[ktn  ];
+      real f_TN   = (dist.f[DIR_0MM])[kbs  ];
+      real f_TS   = (dist.f[DIR_0PM])[kbn  ];
+      real f_BN   = (dist.f[DIR_0MP])[kts  ];
+      real f_BSW  = (dist.f[DIR_PPP])[ktne ];
+      real f_BNE  = (dist.f[DIR_MMP])[ktsw ];
+      real f_BNW  = (dist.f[DIR_PMP])[ktse ];
+      real f_BSE  = (dist.f[DIR_MPP])[ktnw ];
+      real f_TSW  = (dist.f[DIR_PPM])[kbne ];
+      real f_TNE  = (dist.f[DIR_MMM])[kbsw ];
+      real f_TNW  = (dist.f[DIR_PMM])[kbse ];
+      real f_TSE  = (dist.f[DIR_MPM])[kbnw ];
 
       ////////////////////////////////////////////////////////////////////////////////
       //! - Calculate macroscopic quantities
@@ -804,7 +831,7 @@ __global__ void QSlipDeviceComp27(
       bool y = false;
       bool z = false;
 
-      q = (subgridD.q[DIR_P00])[k];
+      q = (subgridD.q[DIR_P00])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)  // only update distribution for q between zero and one
       {
          VeloX = c0o1;
@@ -816,7 +843,7 @@ __global__ void QSlipDeviceComp27(
          (dist.f[DIR_M00])[kw] = getInterpolatedDistributionForVeloBC(q, f_E, f_W, feq, omega, velocityBC, c2o27);
       }
 
-      q = (subgridD.q[DIR_M00])[k];
+      q = (subgridD.q[DIR_M00])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = c0o1;
@@ -828,7 +855,7 @@ __global__ void QSlipDeviceComp27(
          (dist.f[DIR_P00])[ke] = getInterpolatedDistributionForVeloBC(q, f_W, f_E, feq, omega, velocityBC, c2o27);
       }
 
-      q = (subgridD.q[DIR_0P0])[k];
+      q = (subgridD.q[DIR_0P0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloY = c0o1;
@@ -840,7 +867,7 @@ __global__ void QSlipDeviceComp27(
          (dist.f[DIR_0M0])[ks] = getInterpolatedDistributionForVeloBC(q, f_N, f_S, feq, omega, velocityBC, c2o27);
       }
 
-      q = (subgridD.q[DIR_0M0])[k];
+      q = (subgridD.q[DIR_0M0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloY = c0o1;
@@ -852,7 +879,7 @@ __global__ void QSlipDeviceComp27(
          (dist.f[DIR_0P0])[kn] = getInterpolatedDistributionForVeloBC(q, f_S, f_N, feq, omega, velocityBC, c2o27);
       }
 
-      q = (subgridD.q[DIR_00P])[k];
+      q = (subgridD.q[DIR_00P])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloZ = c0o1;
@@ -864,7 +891,7 @@ __global__ void QSlipDeviceComp27(
          (dist.f[DIR_00M])[kb] = getInterpolatedDistributionForVeloBC(q, f_T, f_B, feq, omega, velocityBC, c2o27);
       }
 
-      q = (subgridD.q[DIR_00M])[k];
+      q = (subgridD.q[DIR_00M])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloZ = c0o1;
@@ -876,7 +903,7 @@ __global__ void QSlipDeviceComp27(
          (dist.f[DIR_00P])[kt] = getInterpolatedDistributionForVeloBC(q, f_B, f_T, feq, omega, velocityBC, c2o27);
       }
 
-      q = (subgridD.q[DIR_PP0])[k];
+      q = (subgridD.q[DIR_PP0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -890,7 +917,7 @@ __global__ void QSlipDeviceComp27(
          (dist.f[DIR_MM0])[ksw] = getInterpolatedDistributionForVeloBC(q, f_NE, f_SW, feq, omega, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_MM0])[k];
+      q = (subgridD.q[DIR_MM0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -904,7 +931,7 @@ __global__ void QSlipDeviceComp27(
          (dist.f[DIR_PP0])[kne] = getInterpolatedDistributionForVeloBC(q, f_SW, f_NE, feq, omega, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_PM0])[k];
+      q = (subgridD.q[DIR_PM0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -918,7 +945,7 @@ __global__ void QSlipDeviceComp27(
          (dist.f[DIR_MP0])[knw] = getInterpolatedDistributionForVeloBC(q, f_SE, f_NW, feq, omega, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_MP0])[k];
+      q = (subgridD.q[DIR_MP0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -932,7 +959,7 @@ __global__ void QSlipDeviceComp27(
          (dist.f[DIR_PM0])[kse] = getInterpolatedDistributionForVeloBC(q, f_NW, f_SE, feq, omega, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_P0P])[k];
+      q = (subgridD.q[DIR_P0P])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -946,7 +973,7 @@ __global__ void QSlipDeviceComp27(
          (dist.f[DIR_M0M])[kbw] = getInterpolatedDistributionForVeloBC(q, f_TE, f_BW, feq, omega, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_M0M])[k];
+      q = (subgridD.q[DIR_M0M])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
         VeloX = slipLength*vx1;
@@ -955,12 +982,12 @@ __global__ void QSlipDeviceComp27(
         if (z == true) VeloZ = c0o1;
 
          velocityLB = -vx1 - vx3;
-         feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
          velocityBC = -VeloX - VeloZ;
          (dist.f[DIR_P0P])[kte] = getInterpolatedDistributionForVeloBC(q, f_BW, f_TE, feq, omega, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_P0M])[k];
+      q = (subgridD.q[DIR_P0M])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -974,7 +1001,7 @@ __global__ void QSlipDeviceComp27(
          (dist.f[DIR_M0P])[ktw] = getInterpolatedDistributionForVeloBC(q, f_BE, f_TW, feq, omega, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_M0P])[k];
+      q = (subgridD.q[DIR_M0P])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -988,7 +1015,7 @@ __global__ void QSlipDeviceComp27(
          (dist.f[DIR_P0M])[kbe] = getInterpolatedDistributionForVeloBC(q, f_TW, f_BE, feq, omega, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_0PP])[k];
+      q = (subgridD.q[DIR_0PP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloY = slipLength*vx2;
@@ -1002,7 +1029,7 @@ __global__ void QSlipDeviceComp27(
          (dist.f[DIR_0MM])[kbs] = getInterpolatedDistributionForVeloBC(q, f_TN, f_BS, feq, omega, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_0MM])[k];
+      q = (subgridD.q[DIR_0MM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloY = slipLength*vx2;
@@ -1017,7 +1044,7 @@ __global__ void QSlipDeviceComp27(
       }
 
 
-      q = (subgridD.q[DIR_0PM])[k];
+      q = (subgridD.q[DIR_0PM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloY = slipLength*vx2;
@@ -1031,7 +1058,7 @@ __global__ void QSlipDeviceComp27(
          (dist.f[DIR_0MP])[kts] = getInterpolatedDistributionForVeloBC(q, f_BN, f_TS, feq, omega, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_0MP])[k];
+      q = (subgridD.q[DIR_0MP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloY = slipLength*vx2;
@@ -1045,7 +1072,7 @@ __global__ void QSlipDeviceComp27(
          (dist.f[DIR_0PM])[kbn] = getInterpolatedDistributionForVeloBC(q, f_TS, f_BN, feq, omega, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_PPP])[k];
+      q = (subgridD.q[DIR_PPP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -1060,7 +1087,7 @@ __global__ void QSlipDeviceComp27(
          (dist.f[DIR_MMM])[kbsw] = getInterpolatedDistributionForVeloBC(q, f_TNE, f_BSW, feq, omega, velocityBC, c1o216);
       }
 
-      q = (subgridD.q[DIR_MMM])[k];
+      q = (subgridD.q[DIR_MMM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -1076,7 +1103,7 @@ __global__ void QSlipDeviceComp27(
       }
 
 
-      q = (subgridD.q[DIR_PPM])[k];
+      q = (subgridD.q[DIR_PPM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -1091,7 +1118,7 @@ __global__ void QSlipDeviceComp27(
          (dist.f[DIR_MMP])[ktsw] = getInterpolatedDistributionForVeloBC(q, f_BNE, f_TSW, feq, omega, velocityBC, c1o216);
       }
 
-      q = (subgridD.q[DIR_MMP])[k];
+      q = (subgridD.q[DIR_MMP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -1106,7 +1133,7 @@ __global__ void QSlipDeviceComp27(
          (dist.f[DIR_PPM])[kbne] = getInterpolatedDistributionForVeloBC(q, f_TSW, f_BNE, feq, omega, velocityBC, c1o216);
       }
 
-      q = (subgridD.q[DIR_PMP])[k];
+      q = (subgridD.q[DIR_PMP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -1121,7 +1148,7 @@ __global__ void QSlipDeviceComp27(
          (dist.f[DIR_MPM])[kbnw] = getInterpolatedDistributionForVeloBC(q, f_TSE, f_BNW, feq, omega, velocityBC, c1o216);
       }
 
-      q = (subgridD.q[DIR_MPM])[k];
+      q = (subgridD.q[DIR_MPM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -1136,7 +1163,7 @@ __global__ void QSlipDeviceComp27(
          (dist.f[DIR_PMP])[ktse] = getInterpolatedDistributionForVeloBC(q, f_BNW, f_TSE, feq, omega, velocityBC, c1o216);
       }
 
-      q = (subgridD.q[DIR_PMM])[k];
+      q = (subgridD.q[DIR_PMM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -1151,7 +1178,7 @@ __global__ void QSlipDeviceComp27(
          (dist.f[DIR_MPP])[ktnw] = getInterpolatedDistributionForVeloBC(q, f_BSE, f_TNW, feq, omega, velocityBC, c1o216);
       }
 
-      q = (subgridD.q[DIR_MPP])[k];
+      q = (subgridD.q[DIR_MPP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -1169,34 +1196,53 @@ __global__ void QSlipDeviceComp27(
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 //////////////////////////////////////////////////////////////////////////////
 __global__ void BBSlipDeviceComp27(
-                                    real* distributions, 
-                                    int* subgridDistanceIndices, 
-                                    real* subgridDistances,
-                                    unsigned int numberOfBCnodes,
-                                    real omega, 
-                                    unsigned int* neighborX,
-                                    unsigned int* neighborY,
-                                    unsigned int* neighborZ,
-                                    unsigned int numberOfLBnodes, 
-                                    bool isEvenTimestep)
+    real* distributions, 
+    int* subgridDistanceIndices, 
+    real* subgridDistances,
+    unsigned int numberOfBCnodes,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes, 
+    bool isEvenTimestep)
 {
    //! The slip boundary condition is executed in the following steps
    //!
+
    ////////////////////////////////////////////////////////////////////////////////
    //! - Get node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
    //!
-   const unsigned  x = threadIdx.x;  // global x-index 
-   const unsigned  y = blockIdx.x;   // global y-index 
-   const unsigned  z = blockIdx.y;   // global z-index 
-
-   const unsigned nx = blockDim.x;
-   const unsigned ny = gridDim.x;
-
-   const unsigned k = nx*(ny*z + y) + x;
+   const unsigned nodeIndex = getNodeIndex();
 
-   if(k < numberOfBCnodes)
+   if(nodeIndex < numberOfBCnodes)
    {
       //////////////////////////////////////////////////////////////////////////
       //! - Read distributions: style of reading and writing the distributions from/to stored arrays dependent on timestep is based on the esoteric twist algorithm \ref
@@ -1214,7 +1260,7 @@ __global__ void BBSlipDeviceComp27(
       ////////////////////////////////////////////////////////////////////////////////
       //! - Set neighbor indices (necessary for indirect addressing)
       //!
-      unsigned int indexOfBCnode  = subgridDistanceIndices[k];
+      unsigned int indexOfBCnode  = subgridDistanceIndices[nodeIndex];
       unsigned int kzero= indexOfBCnode;
       unsigned int ke   = indexOfBCnode;
       unsigned int kw   = neighborX[indexOfBCnode];
@@ -1246,32 +1292,32 @@ __global__ void BBSlipDeviceComp27(
       ////////////////////////////////////////////////////////////////////////////////
       //! - Set local distributions
       //!
-      real f_W    = (dist.f[DIR_P00   ])[ke   ];
-      real f_E    = (dist.f[DIR_M00   ])[kw   ];
-      real f_S    = (dist.f[DIR_0P0   ])[kn   ];
-      real f_N    = (dist.f[DIR_0M0   ])[ks   ];
-      real f_B    = (dist.f[DIR_00P   ])[kt   ];
-      real f_T    = (dist.f[DIR_00M   ])[kb   ];
-      real f_SW   = (dist.f[DIR_PP0  ])[kne  ];
-      real f_NE   = (dist.f[DIR_MM0  ])[ksw  ];
-      real f_NW   = (dist.f[DIR_PM0  ])[kse  ];
-      real f_SE   = (dist.f[DIR_MP0  ])[knw  ];
-      real f_BW   = (dist.f[DIR_P0P  ])[kte  ];
-      real f_TE   = (dist.f[DIR_M0M  ])[kbw  ];
-      real f_TW   = (dist.f[DIR_P0M  ])[kbe  ];
-      real f_BE   = (dist.f[DIR_M0P  ])[ktw  ];
-      real f_BS   = (dist.f[DIR_0PP  ])[ktn  ];
-      real f_TN   = (dist.f[DIR_0MM  ])[kbs  ];
-      real f_TS   = (dist.f[DIR_0PM  ])[kbn  ];
-      real f_BN   = (dist.f[DIR_0MP  ])[kts  ];
-      real f_BSW  = (dist.f[DIR_PPP ])[ktne ];
-      real f_BNE  = (dist.f[DIR_MMP ])[ktsw ];
-      real f_BNW  = (dist.f[DIR_PMP ])[ktse ];
-      real f_BSE  = (dist.f[DIR_MPP ])[ktnw ];
-      real f_TSW  = (dist.f[DIR_PPM ])[kbne ];
-      real f_TNE  = (dist.f[DIR_MMM ])[kbsw ];
-      real f_TNW  = (dist.f[DIR_PMM ])[kbse ];
-      real f_TSE  = (dist.f[DIR_MPM ])[kbnw ];
+      real f_W    = (dist.f[DIR_P00])[ke   ];
+      real f_E    = (dist.f[DIR_M00])[kw   ];
+      real f_S    = (dist.f[DIR_0P0])[kn   ];
+      real f_N    = (dist.f[DIR_0M0])[ks   ];
+      real f_B    = (dist.f[DIR_00P])[kt   ];
+      real f_T    = (dist.f[DIR_00M])[kb   ];
+      real f_SW   = (dist.f[DIR_PP0])[kne  ];
+      real f_NE   = (dist.f[DIR_MM0])[ksw  ];
+      real f_NW   = (dist.f[DIR_PM0])[kse  ];
+      real f_SE   = (dist.f[DIR_MP0])[knw  ];
+      real f_BW   = (dist.f[DIR_P0P])[kte  ];
+      real f_TE   = (dist.f[DIR_M0M])[kbw  ];
+      real f_TW   = (dist.f[DIR_P0M])[kbe  ];
+      real f_BE   = (dist.f[DIR_M0P])[ktw  ];
+      real f_BS   = (dist.f[DIR_0PP])[ktn  ];
+      real f_TN   = (dist.f[DIR_0MM])[kbs  ];
+      real f_TS   = (dist.f[DIR_0PM])[kbn  ];
+      real f_BN   = (dist.f[DIR_0MP])[kts  ];
+      real f_BSW  = (dist.f[DIR_PPP])[ktne ];
+      real f_BNE  = (dist.f[DIR_MMP])[ktsw ];
+      real f_BNW  = (dist.f[DIR_PMP])[ktse ];
+      real f_BSE  = (dist.f[DIR_MPP])[ktnw ];
+      real f_TSW  = (dist.f[DIR_PPM])[kbne ];
+      real f_TNE  = (dist.f[DIR_MMM])[kbsw ];
+      real f_TNW  = (dist.f[DIR_PMM])[kbse ];
+      real f_TSE  = (dist.f[DIR_MPM])[kbnw ];
 
       ////////////////////////////////////////////////////////////////////////////////
       //! - Calculate macroscopic quantities
@@ -1316,7 +1362,7 @@ __global__ void BBSlipDeviceComp27(
       bool y = false;
       bool z = false;
 
-      q = (subgridD.q[DIR_P00])[k];
+      q = (subgridD.q[DIR_P00])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)  // only update distribution for q between zero and one
       {
          VeloX = c0o1;
@@ -1326,7 +1372,7 @@ __global__ void BBSlipDeviceComp27(
          (dist.f[DIR_M00])[kw] = getBounceBackDistributionForVeloBC(f_W, velocityBC, c2o27);
       }
 
-      q = (subgridD.q[DIR_M00])[k];
+      q = (subgridD.q[DIR_M00])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = c0o1;
@@ -1336,7 +1382,7 @@ __global__ void BBSlipDeviceComp27(
          (dist.f[DIR_P00])[ke] = getBounceBackDistributionForVeloBC(f_E, velocityBC, c2o27);
       }
 
-      q = (subgridD.q[DIR_0P0])[k];
+      q = (subgridD.q[DIR_0P0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloY = c0o1;
@@ -1346,7 +1392,7 @@ __global__ void BBSlipDeviceComp27(
          (dist.f[DIR_0M0])[ks] = getBounceBackDistributionForVeloBC(f_S, velocityBC, c2o27);
       }
 
-      q = (subgridD.q[DIR_0M0])[k];
+      q = (subgridD.q[DIR_0M0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloY = c0o1;
@@ -1356,7 +1402,7 @@ __global__ void BBSlipDeviceComp27(
          (dist.f[DIR_0P0])[kn] = getBounceBackDistributionForVeloBC(f_N, velocityBC, c2o27);
       }
 
-      q = (subgridD.q[DIR_00P])[k];
+      q = (subgridD.q[DIR_00P])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloZ = c0o1;
@@ -1366,7 +1412,7 @@ __global__ void BBSlipDeviceComp27(
          (dist.f[DIR_00M])[kb] = getBounceBackDistributionForVeloBC(f_B, velocityBC, c2o27);
       }
 
-      q = (subgridD.q[DIR_00M])[k];
+      q = (subgridD.q[DIR_00M])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloZ = c0o1;
@@ -1376,7 +1422,7 @@ __global__ void BBSlipDeviceComp27(
          (dist.f[DIR_00P])[kt] = getBounceBackDistributionForVeloBC(f_T, velocityBC, c2o27);
       }
 
-      q = (subgridD.q[DIR_PP0])[k];
+      q = (subgridD.q[DIR_PP0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -1388,7 +1434,7 @@ __global__ void BBSlipDeviceComp27(
          (dist.f[DIR_MM0])[ksw] = getBounceBackDistributionForVeloBC(f_SW, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_MM0])[k];
+      q = (subgridD.q[DIR_MM0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -1400,7 +1446,7 @@ __global__ void BBSlipDeviceComp27(
          (dist.f[DIR_PP0])[kne] = getBounceBackDistributionForVeloBC(f_NE, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_PM0])[k];
+      q = (subgridD.q[DIR_PM0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -1412,7 +1458,7 @@ __global__ void BBSlipDeviceComp27(
          (dist.f[DIR_MP0])[knw] = getBounceBackDistributionForVeloBC(f_NW, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_MP0])[k];
+      q = (subgridD.q[DIR_MP0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -1424,7 +1470,7 @@ __global__ void BBSlipDeviceComp27(
          (dist.f[DIR_PM0])[kse] = getBounceBackDistributionForVeloBC(f_SE, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_P0P])[k];
+      q = (subgridD.q[DIR_P0P])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -1436,7 +1482,7 @@ __global__ void BBSlipDeviceComp27(
          (dist.f[DIR_M0M])[kbw] = getBounceBackDistributionForVeloBC(f_BW, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_M0M])[k];
+      q = (subgridD.q[DIR_M0M])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
         VeloX = slipLength*vx1;
@@ -1444,11 +1490,11 @@ __global__ void BBSlipDeviceComp27(
         if (x == true) VeloX = c0o1;
         if (z == true) VeloZ = c0o1;
 
-         velocityBC = -VeloX - VeloZ;
-         (dist.f[DIR_P0P])[kte] = getBounceBackDistributionForVeloBC(f_TE, velocityBC, c1o54);
+        velocityBC = -VeloX - VeloZ;
+        (dist.f[DIR_P0P])[kte] = getBounceBackDistributionForVeloBC(f_TE, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_P0M])[k];
+      q = (subgridD.q[DIR_P0M])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -1460,7 +1506,7 @@ __global__ void BBSlipDeviceComp27(
          (dist.f[DIR_M0P])[ktw] = getBounceBackDistributionForVeloBC(f_TW, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_M0P])[k];
+      q = (subgridD.q[DIR_M0P])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -1472,7 +1518,7 @@ __global__ void BBSlipDeviceComp27(
          (dist.f[DIR_P0M])[kbe] = getBounceBackDistributionForVeloBC(f_BE, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_0PP])[k];
+      q = (subgridD.q[DIR_0PP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloY = slipLength*vx2;
@@ -1484,7 +1530,7 @@ __global__ void BBSlipDeviceComp27(
          (dist.f[DIR_0MM])[kbs] = getBounceBackDistributionForVeloBC(f_BS, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_0MM])[k];
+      q = (subgridD.q[DIR_0MM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloY = slipLength*vx2;
@@ -1497,7 +1543,7 @@ __global__ void BBSlipDeviceComp27(
       }
 
 
-      q = (subgridD.q[DIR_0PM])[k];
+      q = (subgridD.q[DIR_0PM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloY = slipLength*vx2;
@@ -1509,7 +1555,7 @@ __global__ void BBSlipDeviceComp27(
          (dist.f[DIR_0MP])[kts] = getBounceBackDistributionForVeloBC(f_TS, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_0MP])[k];
+      q = (subgridD.q[DIR_0MP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloY = slipLength*vx2;
@@ -1521,7 +1567,7 @@ __global__ void BBSlipDeviceComp27(
          (dist.f[DIR_0PM])[kbn] = getBounceBackDistributionForVeloBC(f_BN, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_PPP])[k];
+      q = (subgridD.q[DIR_PPP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -1535,7 +1581,7 @@ __global__ void BBSlipDeviceComp27(
          (dist.f[DIR_MMM])[kbsw] = getBounceBackDistributionForVeloBC(f_TNE, velocityBC, c1o216);
       }
 
-      q = (subgridD.q[DIR_MMM])[k];
+      q = (subgridD.q[DIR_MMM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -1550,7 +1596,7 @@ __global__ void BBSlipDeviceComp27(
       }
 
 
-      q = (subgridD.q[DIR_PPM])[k];
+      q = (subgridD.q[DIR_PPM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -1564,7 +1610,7 @@ __global__ void BBSlipDeviceComp27(
          (dist.f[DIR_MMP])[ktsw] = getBounceBackDistributionForVeloBC(f_TSW, velocityBC, c1o216);
       }
 
-      q = (subgridD.q[DIR_MMP])[k];
+      q = (subgridD.q[DIR_MMP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -1578,7 +1624,7 @@ __global__ void BBSlipDeviceComp27(
          (dist.f[DIR_PPM])[kbne] = getBounceBackDistributionForVeloBC(f_BNE, velocityBC, c1o216);
       }
 
-      q = (subgridD.q[DIR_PMP])[k];
+      q = (subgridD.q[DIR_PMP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -1592,7 +1638,7 @@ __global__ void BBSlipDeviceComp27(
          (dist.f[DIR_MPM])[kbnw] = getBounceBackDistributionForVeloBC(f_BNW, velocityBC, c1o216);
       }
 
-      q = (subgridD.q[DIR_MPM])[k];
+      q = (subgridD.q[DIR_MPM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -1606,7 +1652,7 @@ __global__ void BBSlipDeviceComp27(
          (dist.f[DIR_PMP])[ktse] = getBounceBackDistributionForVeloBC(f_TSE, velocityBC, c1o216);
       }
 
-      q = (subgridD.q[DIR_PMM])[k];
+      q = (subgridD.q[DIR_PMM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -1620,7 +1666,7 @@ __global__ void BBSlipDeviceComp27(
          (dist.f[DIR_MPP])[ktnw] = getBounceBackDistributionForVeloBC(f_TNW, velocityBC, c1o216);
       }
 
-      q = (subgridD.q[DIR_MPP])[k];
+      q = (subgridD.q[DIR_MPP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -1638,35 +1684,55 @@ __global__ void BBSlipDeviceComp27(
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
 
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 ////////////////////////////////////////////////////////////////////////////
 __global__ void QSlipDeviceComp27TurbViscosity(
-                                    real* distributions, 
-                                    int* subgridDistanceIndices, 
-                                    real* subgridDistances,
-                                    unsigned int numberOfBCnodes,
-                                    real omega, 
-                                    unsigned int* neighborX,
-                                    unsigned int* neighborY,
-                                    unsigned int* neighborZ,
-                                    real* turbViscosity,
-                                    unsigned int numberOfLBnodes, 
-                                    bool isEvenTimestep)
+    real* distributions, 
+    int* subgridDistanceIndices, 
+    real* subgridDistances,
+    unsigned int numberOfBCnodes,
+    real omega, 
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    real* turbViscosity,
+    unsigned long long numberOfLBnodes, 
+    bool isEvenTimestep)
 {
    //! The slip boundary condition is executed in the following steps
    //!
+
    ////////////////////////////////////////////////////////////////////////////////
    //! - Get node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
    //!
-   const unsigned  x = threadIdx.x;  // global x-index 
-   const unsigned  y = blockIdx.x;   // global y-index 
-   const unsigned  z = blockIdx.y;   // global z-index 
-
-   const unsigned nx = blockDim.x;
-   const unsigned ny = gridDim.x;
+   const unsigned nodeIndex = getNodeIndex();
 
-   const unsigned k = nx*(ny*z + y) + x;
-
-   if(k < numberOfBCnodes)
+   if(nodeIndex < numberOfBCnodes)
    {
       //////////////////////////////////////////////////////////////////////////
       //! - Read distributions: style of reading and writing the distributions from/to stored arrays dependent on timestep is based on the esoteric twist algorithm \ref
@@ -1684,7 +1750,7 @@ __global__ void QSlipDeviceComp27TurbViscosity(
       ////////////////////////////////////////////////////////////////////////////////
       //! - Set neighbor indices (necessary for indirect addressing)
       //!
-      unsigned int indexOfBCnode  = subgridDistanceIndices[k];
+      unsigned int indexOfBCnode  = subgridDistanceIndices[nodeIndex];
       unsigned int kzero= indexOfBCnode;
       unsigned int ke   = indexOfBCnode;
       unsigned int kw   = neighborX[indexOfBCnode];
@@ -1716,32 +1782,32 @@ __global__ void QSlipDeviceComp27TurbViscosity(
       ////////////////////////////////////////////////////////////////////////////////
       //! - Set local distributions
       //!
-      real f_W    = (dist.f[DIR_P00   ])[ke   ];
-      real f_E    = (dist.f[DIR_M00   ])[kw   ];
-      real f_S    = (dist.f[DIR_0P0   ])[kn   ];
-      real f_N    = (dist.f[DIR_0M0   ])[ks   ];
-      real f_B    = (dist.f[DIR_00P   ])[kt   ];
-      real f_T    = (dist.f[DIR_00M   ])[kb   ];
-      real f_SW   = (dist.f[DIR_PP0  ])[kne  ];
-      real f_NE   = (dist.f[DIR_MM0  ])[ksw  ];
-      real f_NW   = (dist.f[DIR_PM0  ])[kse  ];
-      real f_SE   = (dist.f[DIR_MP0  ])[knw  ];
-      real f_BW   = (dist.f[DIR_P0P  ])[kte  ];
-      real f_TE   = (dist.f[DIR_M0M  ])[kbw  ];
-      real f_TW   = (dist.f[DIR_P0M  ])[kbe  ];
-      real f_BE   = (dist.f[DIR_M0P  ])[ktw  ];
-      real f_BS   = (dist.f[DIR_0PP  ])[ktn  ];
-      real f_TN   = (dist.f[DIR_0MM  ])[kbs  ];
-      real f_TS   = (dist.f[DIR_0PM  ])[kbn  ];
-      real f_BN   = (dist.f[DIR_0MP  ])[kts  ];
-      real f_BSW  = (dist.f[DIR_PPP ])[ktne ];
-      real f_BNE  = (dist.f[DIR_MMP ])[ktsw ];
-      real f_BNW  = (dist.f[DIR_PMP ])[ktse ];
-      real f_BSE  = (dist.f[DIR_MPP ])[ktnw ];
-      real f_TSW  = (dist.f[DIR_PPM ])[kbne ];
-      real f_TNE  = (dist.f[DIR_MMM ])[kbsw ];
-      real f_TNW  = (dist.f[DIR_PMM ])[kbse ];
-      real f_TSE  = (dist.f[DIR_MPM ])[kbnw ];
+      real f_W    = (dist.f[DIR_P00])[ke   ];
+      real f_E    = (dist.f[DIR_M00])[kw   ];
+      real f_S    = (dist.f[DIR_0P0])[kn   ];
+      real f_N    = (dist.f[DIR_0M0])[ks   ];
+      real f_B    = (dist.f[DIR_00P])[kt   ];
+      real f_T    = (dist.f[DIR_00M])[kb   ];
+      real f_SW   = (dist.f[DIR_PP0])[kne  ];
+      real f_NE   = (dist.f[DIR_MM0])[ksw  ];
+      real f_NW   = (dist.f[DIR_PM0])[kse  ];
+      real f_SE   = (dist.f[DIR_MP0])[knw  ];
+      real f_BW   = (dist.f[DIR_P0P])[kte  ];
+      real f_TE   = (dist.f[DIR_M0M])[kbw  ];
+      real f_TW   = (dist.f[DIR_P0M])[kbe  ];
+      real f_BE   = (dist.f[DIR_M0P])[ktw  ];
+      real f_BS   = (dist.f[DIR_0PP])[ktn  ];
+      real f_TN   = (dist.f[DIR_0MM])[kbs  ];
+      real f_TS   = (dist.f[DIR_0PM])[kbn  ];
+      real f_BN   = (dist.f[DIR_0MP])[kts  ];
+      real f_BSW  = (dist.f[DIR_PPP])[ktne ];
+      real f_BNE  = (dist.f[DIR_MMP])[ktsw ];
+      real f_BNW  = (dist.f[DIR_PMP])[ktse ];
+      real f_BSE  = (dist.f[DIR_MPP])[ktnw ];
+      real f_TSW  = (dist.f[DIR_PPM])[kbne ];
+      real f_TNE  = (dist.f[DIR_MMM])[kbsw ];
+      real f_TNW  = (dist.f[DIR_PMM])[kbse ];
+      real f_TSE  = (dist.f[DIR_MPM])[kbnw ];
 
       ////////////////////////////////////////////////////////////////////////////////
       //! - Calculate macroscopic quantities
@@ -1791,7 +1857,7 @@ __global__ void QSlipDeviceComp27TurbViscosity(
       bool y = false;
       bool z = false;
 
-      q = (subgridD.q[DIR_P00])[k];
+      q = (subgridD.q[DIR_P00])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)  // only update distribution for q between zero and one
       {
          VeloX = c0o1;
@@ -1803,7 +1869,7 @@ __global__ void QSlipDeviceComp27TurbViscosity(
          (dist.f[DIR_M00])[kw] = getInterpolatedDistributionForVeloBC(q, f_E, f_W, feq, om_turb, velocityBC, c2o27);
       }
 
-      q = (subgridD.q[DIR_M00])[k];
+      q = (subgridD.q[DIR_M00])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = c0o1;
@@ -1815,7 +1881,7 @@ __global__ void QSlipDeviceComp27TurbViscosity(
          (dist.f[DIR_P00])[ke] = getInterpolatedDistributionForVeloBC(q, f_W, f_E, feq, om_turb, velocityBC, c2o27);
       }
 
-      q = (subgridD.q[DIR_0P0])[k];
+      q = (subgridD.q[DIR_0P0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloY = c0o1;
@@ -1827,7 +1893,7 @@ __global__ void QSlipDeviceComp27TurbViscosity(
          (dist.f[DIR_0M0])[ks] = getInterpolatedDistributionForVeloBC(q, f_N, f_S, feq, om_turb, velocityBC, c2o27);
       }
 
-      q = (subgridD.q[DIR_0M0])[k];
+      q = (subgridD.q[DIR_0M0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloY = c0o1;
@@ -1839,7 +1905,7 @@ __global__ void QSlipDeviceComp27TurbViscosity(
          (dist.f[DIR_0P0])[kn] = getInterpolatedDistributionForVeloBC(q, f_S, f_N, feq, om_turb, velocityBC, c2o27);
       }
 
-      q = (subgridD.q[DIR_00P])[k];
+      q = (subgridD.q[DIR_00P])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloZ = c0o1;
@@ -1851,7 +1917,7 @@ __global__ void QSlipDeviceComp27TurbViscosity(
          (dist.f[DIR_00M])[kb] = getInterpolatedDistributionForVeloBC(q, f_T, f_B, feq, om_turb, velocityBC, c2o27);
       }
 
-      q = (subgridD.q[DIR_00M])[k];
+      q = (subgridD.q[DIR_00M])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloZ = c0o1;
@@ -1863,7 +1929,7 @@ __global__ void QSlipDeviceComp27TurbViscosity(
          (dist.f[DIR_00P])[kt] = getInterpolatedDistributionForVeloBC(q, f_B, f_T, feq, om_turb, velocityBC, c2o27);
       }
 
-      q = (subgridD.q[DIR_PP0])[k];
+      q = (subgridD.q[DIR_PP0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -1877,7 +1943,7 @@ __global__ void QSlipDeviceComp27TurbViscosity(
          (dist.f[DIR_MM0])[ksw] = getInterpolatedDistributionForVeloBC(q, f_NE, f_SW, feq, om_turb, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_MM0])[k];
+      q = (subgridD.q[DIR_MM0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -1891,7 +1957,7 @@ __global__ void QSlipDeviceComp27TurbViscosity(
          (dist.f[DIR_PP0])[kne] = getInterpolatedDistributionForVeloBC(q, f_SW, f_NE, feq, om_turb, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_PM0])[k];
+      q = (subgridD.q[DIR_PM0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -1905,7 +1971,7 @@ __global__ void QSlipDeviceComp27TurbViscosity(
          (dist.f[DIR_MP0])[knw] = getInterpolatedDistributionForVeloBC(q, f_SE, f_NW, feq, om_turb, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_MP0])[k];
+      q = (subgridD.q[DIR_MP0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -1919,7 +1985,7 @@ __global__ void QSlipDeviceComp27TurbViscosity(
          (dist.f[DIR_PM0])[kse] = getInterpolatedDistributionForVeloBC(q, f_NW, f_SE, feq, om_turb, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_P0P])[k];
+      q = (subgridD.q[DIR_P0P])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -1933,7 +1999,7 @@ __global__ void QSlipDeviceComp27TurbViscosity(
          (dist.f[DIR_M0M])[kbw] = getInterpolatedDistributionForVeloBC(q, f_TE, f_BW, feq, om_turb, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_M0M])[k];
+      q = (subgridD.q[DIR_M0M])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
         VeloX = slipLength*vx1;
@@ -1941,13 +2007,13 @@ __global__ void QSlipDeviceComp27TurbViscosity(
         if (x == true) VeloX = c0o1;
         if (z == true) VeloZ = c0o1;
 
-         velocityLB = -vx1 - vx3;
-         feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
-         velocityBC = -VeloX - VeloZ;
-         (dist.f[DIR_P0P])[kte] = getInterpolatedDistributionForVeloBC(q, f_BW, f_TE, feq, om_turb, velocityBC, c1o54);
+        velocityLB = -vx1 - vx3;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
+        velocityBC = -VeloX - VeloZ;
+        (dist.f[DIR_P0P])[kte] = getInterpolatedDistributionForVeloBC(q, f_BW, f_TE, feq, om_turb, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_P0M])[k];
+      q = (subgridD.q[DIR_P0M])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -1961,7 +2027,7 @@ __global__ void QSlipDeviceComp27TurbViscosity(
          (dist.f[DIR_M0P])[ktw] = getInterpolatedDistributionForVeloBC(q, f_BE, f_TW, feq, om_turb, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_M0P])[k];
+      q = (subgridD.q[DIR_M0P])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -1975,7 +2041,7 @@ __global__ void QSlipDeviceComp27TurbViscosity(
          (dist.f[DIR_P0M])[kbe] = getInterpolatedDistributionForVeloBC(q, f_TW, f_BE, feq, om_turb, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_0PP])[k];
+      q = (subgridD.q[DIR_0PP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloY = slipLength*vx2;
@@ -1989,7 +2055,7 @@ __global__ void QSlipDeviceComp27TurbViscosity(
          (dist.f[DIR_0MM])[kbs] = getInterpolatedDistributionForVeloBC(q, f_TN, f_BS, feq, om_turb, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_0MM])[k];
+      q = (subgridD.q[DIR_0MM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloY = slipLength*vx2;
@@ -2004,7 +2070,7 @@ __global__ void QSlipDeviceComp27TurbViscosity(
       }
 
 
-      q = (subgridD.q[DIR_0PM])[k];
+      q = (subgridD.q[DIR_0PM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloY = slipLength*vx2;
@@ -2018,7 +2084,7 @@ __global__ void QSlipDeviceComp27TurbViscosity(
          (dist.f[DIR_0MP])[kts] = getInterpolatedDistributionForVeloBC(q, f_BN, f_TS, feq, om_turb, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_0MP])[k];
+      q = (subgridD.q[DIR_0MP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloY = slipLength*vx2;
@@ -2032,7 +2098,7 @@ __global__ void QSlipDeviceComp27TurbViscosity(
          (dist.f[DIR_0PM])[kbn] = getInterpolatedDistributionForVeloBC(q, f_TS, f_BN, feq, om_turb, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_PPP])[k];
+      q = (subgridD.q[DIR_PPP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -2047,7 +2113,7 @@ __global__ void QSlipDeviceComp27TurbViscosity(
          (dist.f[DIR_MMM])[kbsw] = getInterpolatedDistributionForVeloBC(q, f_TNE, f_BSW, feq, om_turb, velocityBC, c1o216);
       }
 
-      q = (subgridD.q[DIR_MMM])[k];
+      q = (subgridD.q[DIR_MMM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -2063,7 +2129,7 @@ __global__ void QSlipDeviceComp27TurbViscosity(
       }
 
 
-      q = (subgridD.q[DIR_PPM])[k];
+      q = (subgridD.q[DIR_PPM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -2078,7 +2144,7 @@ __global__ void QSlipDeviceComp27TurbViscosity(
          (dist.f[DIR_MMP])[ktsw] = getInterpolatedDistributionForVeloBC(q, f_BNE, f_TSW, feq, om_turb, velocityBC, c1o216);
       }
 
-      q = (subgridD.q[DIR_MMP])[k];
+      q = (subgridD.q[DIR_MMP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -2093,7 +2159,7 @@ __global__ void QSlipDeviceComp27TurbViscosity(
          (dist.f[DIR_PPM])[kbne] = getInterpolatedDistributionForVeloBC(q, f_TSW, f_BNE, feq, om_turb, velocityBC, c1o216);
       }
 
-      q = (subgridD.q[DIR_PMP])[k];
+      q = (subgridD.q[DIR_PMP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -2108,7 +2174,7 @@ __global__ void QSlipDeviceComp27TurbViscosity(
          (dist.f[DIR_MPM])[kbnw] = getInterpolatedDistributionForVeloBC(q, f_TSE, f_BNW, feq, om_turb, velocityBC, c1o216);
       }
 
-      q = (subgridD.q[DIR_MPM])[k];
+      q = (subgridD.q[DIR_MPM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -2123,7 +2189,7 @@ __global__ void QSlipDeviceComp27TurbViscosity(
          (dist.f[DIR_PMP])[ktse] = getInterpolatedDistributionForVeloBC(q, f_BNW, f_TSE, feq, om_turb, velocityBC, c1o216);
       }
 
-      q = (subgridD.q[DIR_PMM])[k];
+      q = (subgridD.q[DIR_PMM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -2138,7 +2204,7 @@ __global__ void QSlipDeviceComp27TurbViscosity(
          (dist.f[DIR_MPP])[ktnw] = getInterpolatedDistributionForVeloBC(q, f_BSE, f_TNW, feq, om_turb, velocityBC, c1o216);
       }
 
-      q = (subgridD.q[DIR_MPP])[k];
+      q = (subgridD.q[DIR_MPP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -2154,37 +2220,59 @@ __global__ void QSlipDeviceComp27TurbViscosity(
       }
    }
 }
+////////////////////////////////////////////////////////////////////////////
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 
 
 ////////////////////////////////////////////////////////////////////////////
 __global__ void QSlipPressureDeviceComp27TurbViscosity(
-                                    real* distributions, 
-                                    int* subgridDistanceIndices, 
-                                    real* subgridDistances,
-                                    unsigned int numberOfBCnodes,
-                                    real omega, 
-                                    unsigned int* neighborX,
-                                    unsigned int* neighborY,
-                                    unsigned int* neighborZ,
-                                    real* turbViscosity,
-                                    unsigned int numberOfLBnodes, 
-                                    bool isEvenTimestep)
+    real* distributions, 
+    int* subgridDistanceIndices, 
+    real* subgridDistances,
+    unsigned int numberOfBCnodes,
+    real omega, 
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    real* turbViscosity,
+    unsigned long long numberOfLBnodes, 
+    bool isEvenTimestep)
 {
    //! The slip boundary condition is executed in the following steps
    //!
    ////////////////////////////////////////////////////////////////////////////////
    //! - Get node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
    //!
-   const unsigned  x = threadIdx.x;  // global x-index 
-   const unsigned  y = blockIdx.x;   // global y-index 
-   const unsigned  z = blockIdx.y;   // global z-index 
-
-   const unsigned nx = blockDim.x;
-   const unsigned ny = gridDim.x;
-
-   const unsigned k = nx*(ny*z + y) + x;
+   const unsigned nodeIndex = getNodeIndex();
 
-   if(k < numberOfBCnodes)
+   if(nodeIndex < numberOfBCnodes)
    {
       //////////////////////////////////////////////////////////////////////////
       //! - Read distributions: style of reading and writing the distributions from/to stored arrays dependent on timestep is based on the esoteric twist algorithm \ref
@@ -2202,7 +2290,7 @@ __global__ void QSlipPressureDeviceComp27TurbViscosity(
       ////////////////////////////////////////////////////////////////////////////////
       //! - Set neighbor indices (necessary for indirect addressing)
       //!
-      unsigned int indexOfBCnode  = subgridDistanceIndices[k];
+      unsigned int indexOfBCnode  = subgridDistanceIndices[nodeIndex];
       unsigned int kzero= indexOfBCnode;
       unsigned int ke   = indexOfBCnode;
       unsigned int kw   = neighborX[indexOfBCnode];
@@ -2234,32 +2322,32 @@ __global__ void QSlipPressureDeviceComp27TurbViscosity(
       ////////////////////////////////////////////////////////////////////////////////
       //! - Set local distributions
       //!
-      real f_W    = (dist.f[DIR_P00   ])[ke   ];
-      real f_E    = (dist.f[DIR_M00   ])[kw   ];
-      real f_S    = (dist.f[DIR_0P0   ])[kn   ];
-      real f_N    = (dist.f[DIR_0M0   ])[ks   ];
-      real f_B    = (dist.f[DIR_00P   ])[kt   ];
-      real f_T    = (dist.f[DIR_00M   ])[kb   ];
-      real f_SW   = (dist.f[DIR_PP0  ])[kne  ];
-      real f_NE   = (dist.f[DIR_MM0  ])[ksw  ];
-      real f_NW   = (dist.f[DIR_PM0  ])[kse  ];
-      real f_SE   = (dist.f[DIR_MP0  ])[knw  ];
-      real f_BW   = (dist.f[DIR_P0P  ])[kte  ];
-      real f_TE   = (dist.f[DIR_M0M  ])[kbw  ];
-      real f_TW   = (dist.f[DIR_P0M  ])[kbe  ];
-      real f_BE   = (dist.f[DIR_M0P  ])[ktw  ];
-      real f_BS   = (dist.f[DIR_0PP  ])[ktn  ];
-      real f_TN   = (dist.f[DIR_0MM  ])[kbs  ];
-      real f_TS   = (dist.f[DIR_0PM  ])[kbn  ];
-      real f_BN   = (dist.f[DIR_0MP  ])[kts  ];
-      real f_BSW  = (dist.f[DIR_PPP ])[ktne ];
-      real f_BNE  = (dist.f[DIR_MMP ])[ktsw ];
-      real f_BNW  = (dist.f[DIR_PMP ])[ktse ];
-      real f_BSE  = (dist.f[DIR_MPP ])[ktnw ];
-      real f_TSW  = (dist.f[DIR_PPM ])[kbne ];
-      real f_TNE  = (dist.f[DIR_MMM ])[kbsw ];
-      real f_TNW  = (dist.f[DIR_PMM ])[kbse ];
-      real f_TSE  = (dist.f[DIR_MPM ])[kbnw ];
+      real f_W    = (dist.f[DIR_P00])[ke   ];
+      real f_E    = (dist.f[DIR_M00])[kw   ];
+      real f_S    = (dist.f[DIR_0P0])[kn   ];
+      real f_N    = (dist.f[DIR_0M0])[ks   ];
+      real f_B    = (dist.f[DIR_00P])[kt   ];
+      real f_T    = (dist.f[DIR_00M])[kb   ];
+      real f_SW   = (dist.f[DIR_PP0])[kne  ];
+      real f_NE   = (dist.f[DIR_MM0])[ksw  ];
+      real f_NW   = (dist.f[DIR_PM0])[kse  ];
+      real f_SE   = (dist.f[DIR_MP0])[knw  ];
+      real f_BW   = (dist.f[DIR_P0P])[kte  ];
+      real f_TE   = (dist.f[DIR_M0M])[kbw  ];
+      real f_TW   = (dist.f[DIR_P0M])[kbe  ];
+      real f_BE   = (dist.f[DIR_M0P])[ktw  ];
+      real f_BS   = (dist.f[DIR_0PP])[ktn  ];
+      real f_TN   = (dist.f[DIR_0MM])[kbs  ];
+      real f_TS   = (dist.f[DIR_0PM])[kbn  ];
+      real f_BN   = (dist.f[DIR_0MP])[kts  ];
+      real f_BSW  = (dist.f[DIR_PPP])[ktne ];
+      real f_BNE  = (dist.f[DIR_MMP])[ktsw ];
+      real f_BNW  = (dist.f[DIR_PMP])[ktse ];
+      real f_BSE  = (dist.f[DIR_MPP])[ktnw ];
+      real f_TSW  = (dist.f[DIR_PPM])[kbne ];
+      real f_TNE  = (dist.f[DIR_MMM])[kbsw ];
+      real f_TNW  = (dist.f[DIR_PMM])[kbse ];
+      real f_TSE  = (dist.f[DIR_MPM])[kbnw ];
 
       ////////////////////////////////////////////////////////////////////////////////
       //! - Calculate macroscopic quantities
@@ -2309,7 +2397,7 @@ __global__ void QSlipPressureDeviceComp27TurbViscosity(
       bool y = false;
       bool z = false;
 
-      q = (subgridD.q[DIR_P00])[k];
+      q = (subgridD.q[DIR_P00])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)  // only update distribution for q between zero and one
       {
          VeloX = c0o1;
@@ -2321,7 +2409,7 @@ __global__ void QSlipPressureDeviceComp27TurbViscosity(
          (dist.f[DIR_M00])[kw] = getInterpolatedDistributionForVeloWithPressureBC(q, f_E, f_W, feq, om_turb, drho, velocityBC, c2o27);
       }
 
-      q = (subgridD.q[DIR_M00])[k];
+      q = (subgridD.q[DIR_M00])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = c0o1;
@@ -2333,7 +2421,7 @@ __global__ void QSlipPressureDeviceComp27TurbViscosity(
          (dist.f[DIR_P00])[ke] = getInterpolatedDistributionForVeloWithPressureBC(q, f_W, f_E, feq, om_turb, drho, velocityBC, c2o27);
       }
 
-      q = (subgridD.q[DIR_0P0])[k];
+      q = (subgridD.q[DIR_0P0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloY = c0o1;
@@ -2345,7 +2433,7 @@ __global__ void QSlipPressureDeviceComp27TurbViscosity(
          (dist.f[DIR_0M0])[ks] = getInterpolatedDistributionForVeloWithPressureBC(q, f_N, f_S, feq, om_turb, drho, velocityBC, c2o27);
       }
 
-      q = (subgridD.q[DIR_0M0])[k];
+      q = (subgridD.q[DIR_0M0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloY = c0o1;
@@ -2357,7 +2445,7 @@ __global__ void QSlipPressureDeviceComp27TurbViscosity(
          (dist.f[DIR_0P0])[kn] = getInterpolatedDistributionForVeloWithPressureBC(q, f_S, f_N, feq, om_turb, drho, velocityBC, c2o27);
       }
 
-      q = (subgridD.q[DIR_00P])[k];
+      q = (subgridD.q[DIR_00P])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloZ = c0o1;
@@ -2369,7 +2457,7 @@ __global__ void QSlipPressureDeviceComp27TurbViscosity(
          (dist.f[DIR_00M])[kb] = getInterpolatedDistributionForVeloWithPressureBC(q, f_T, f_B, feq, om_turb, drho, velocityBC, c2o27);
       }
 
-      q = (subgridD.q[DIR_00M])[k];
+      q = (subgridD.q[DIR_00M])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloZ = c0o1;
@@ -2381,7 +2469,7 @@ __global__ void QSlipPressureDeviceComp27TurbViscosity(
          (dist.f[DIR_00P])[kt] = getInterpolatedDistributionForVeloWithPressureBC(q, f_B, f_T, feq, om_turb, drho, velocityBC, c2o27);
       }
 
-      q = (subgridD.q[DIR_PP0])[k];
+      q = (subgridD.q[DIR_PP0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -2395,7 +2483,7 @@ __global__ void QSlipPressureDeviceComp27TurbViscosity(
          (dist.f[DIR_MM0])[ksw] = getInterpolatedDistributionForVeloWithPressureBC(q, f_NE, f_SW, feq, om_turb, drho, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_MM0])[k];
+      q = (subgridD.q[DIR_MM0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -2409,7 +2497,7 @@ __global__ void QSlipPressureDeviceComp27TurbViscosity(
          (dist.f[DIR_PP0])[kne] = getInterpolatedDistributionForVeloWithPressureBC(q, f_SW, f_NE, feq, om_turb, drho, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_PM0])[k];
+      q = (subgridD.q[DIR_PM0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -2423,7 +2511,7 @@ __global__ void QSlipPressureDeviceComp27TurbViscosity(
          (dist.f[DIR_MP0])[knw] = getInterpolatedDistributionForVeloWithPressureBC(q, f_SE, f_NW, feq, om_turb, drho, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_MP0])[k];
+      q = (subgridD.q[DIR_MP0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -2437,7 +2525,7 @@ __global__ void QSlipPressureDeviceComp27TurbViscosity(
          (dist.f[DIR_PM0])[kse] = getInterpolatedDistributionForVeloWithPressureBC(q, f_NW, f_SE, feq, om_turb, drho, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_P0P])[k];
+      q = (subgridD.q[DIR_P0P])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -2451,7 +2539,7 @@ __global__ void QSlipPressureDeviceComp27TurbViscosity(
          (dist.f[DIR_M0M])[kbw] = getInterpolatedDistributionForVeloWithPressureBC(q, f_TE, f_BW, feq, om_turb, drho, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_M0M])[k];
+      q = (subgridD.q[DIR_M0M])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
         VeloX = slipLength*vx1;
@@ -2459,13 +2547,13 @@ __global__ void QSlipPressureDeviceComp27TurbViscosity(
         if (x == true) VeloX = c0o1;
         if (z == true) VeloZ = c0o1;
 
-         velocityLB = -vx1 - vx3;
-         feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
-         velocityBC = -VeloX - VeloZ;
-         (dist.f[DIR_P0P])[kte] = getInterpolatedDistributionForVeloWithPressureBC(q, f_BW, f_TE, feq, om_turb, drho, velocityBC, c1o54);
+        velocityLB = -vx1 - vx3;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
+        velocityBC = -VeloX - VeloZ;
+        (dist.f[DIR_P0P])[kte] = getInterpolatedDistributionForVeloWithPressureBC(q, f_BW, f_TE, feq, om_turb, drho, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_P0M])[k];
+      q = (subgridD.q[DIR_P0M])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -2479,7 +2567,7 @@ __global__ void QSlipPressureDeviceComp27TurbViscosity(
          (dist.f[DIR_M0P])[ktw] = getInterpolatedDistributionForVeloWithPressureBC(q, f_BE, f_TW, feq, om_turb, drho, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_M0P])[k];
+      q = (subgridD.q[DIR_M0P])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -2493,7 +2581,7 @@ __global__ void QSlipPressureDeviceComp27TurbViscosity(
          (dist.f[DIR_P0M])[kbe] = getInterpolatedDistributionForVeloWithPressureBC(q, f_TW, f_BE, feq, om_turb, drho, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_0PP])[k];
+      q = (subgridD.q[DIR_0PP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloY = slipLength*vx2;
@@ -2507,7 +2595,7 @@ __global__ void QSlipPressureDeviceComp27TurbViscosity(
          (dist.f[DIR_0MM])[kbs] = getInterpolatedDistributionForVeloWithPressureBC(q, f_TN, f_BS, feq, om_turb, drho, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_0MM])[k];
+      q = (subgridD.q[DIR_0MM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloY = slipLength*vx2;
@@ -2522,7 +2610,7 @@ __global__ void QSlipPressureDeviceComp27TurbViscosity(
       }
 
 
-      q = (subgridD.q[DIR_0PM])[k];
+      q = (subgridD.q[DIR_0PM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloY = slipLength*vx2;
@@ -2536,7 +2624,7 @@ __global__ void QSlipPressureDeviceComp27TurbViscosity(
          (dist.f[DIR_0MP])[kts] = getInterpolatedDistributionForVeloWithPressureBC(q, f_BN, f_TS, feq, om_turb, drho, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_0MP])[k];
+      q = (subgridD.q[DIR_0MP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloY = slipLength*vx2;
@@ -2550,7 +2638,7 @@ __global__ void QSlipPressureDeviceComp27TurbViscosity(
          (dist.f[DIR_0PM])[kbn] = getInterpolatedDistributionForVeloWithPressureBC(q, f_TS, f_BN, feq, om_turb, drho, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_PPP])[k];
+      q = (subgridD.q[DIR_PPP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -2565,7 +2653,7 @@ __global__ void QSlipPressureDeviceComp27TurbViscosity(
          (dist.f[DIR_MMM])[kbsw] = getInterpolatedDistributionForVeloWithPressureBC(q, f_TNE, f_BSW, feq, om_turb, drho, velocityBC, c1o216);
       }
 
-      q = (subgridD.q[DIR_MMM])[k];
+      q = (subgridD.q[DIR_MMM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -2581,7 +2669,7 @@ __global__ void QSlipPressureDeviceComp27TurbViscosity(
       }
 
 
-      q = (subgridD.q[DIR_PPM])[k];
+      q = (subgridD.q[DIR_PPM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -2596,7 +2684,7 @@ __global__ void QSlipPressureDeviceComp27TurbViscosity(
          (dist.f[DIR_MMP])[ktsw] = getInterpolatedDistributionForVeloWithPressureBC(q, f_BNE, f_TSW, feq, om_turb, drho, velocityBC, c1o216);
       }
 
-      q = (subgridD.q[DIR_MMP])[k];
+      q = (subgridD.q[DIR_MMP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -2611,7 +2699,7 @@ __global__ void QSlipPressureDeviceComp27TurbViscosity(
          (dist.f[DIR_PPM])[kbne] = getInterpolatedDistributionForVeloWithPressureBC(q, f_TSW, f_BNE, feq, om_turb, drho, velocityBC, c1o216);
       }
 
-      q = (subgridD.q[DIR_PMP])[k];
+      q = (subgridD.q[DIR_PMP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -2626,7 +2714,7 @@ __global__ void QSlipPressureDeviceComp27TurbViscosity(
          (dist.f[DIR_MPM])[kbnw] = getInterpolatedDistributionForVeloWithPressureBC(q, f_TSE, f_BNW, feq, om_turb, drho, velocityBC, c1o216);
       }
 
-      q = (subgridD.q[DIR_MPM])[k];
+      q = (subgridD.q[DIR_MPM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -2641,7 +2729,7 @@ __global__ void QSlipPressureDeviceComp27TurbViscosity(
          (dist.f[DIR_PMP])[ktse] = getInterpolatedDistributionForVeloWithPressureBC(q, f_BNW, f_TSE, feq, om_turb, drho, velocityBC, c1o216);
       }
 
-      q = (subgridD.q[DIR_PMM])[k];
+      q = (subgridD.q[DIR_PMM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -2656,7 +2744,7 @@ __global__ void QSlipPressureDeviceComp27TurbViscosity(
          (dist.f[DIR_MPP])[ktnw] = getInterpolatedDistributionForVeloWithPressureBC(q, f_BSE, f_TNW, feq, om_turb, drho, velocityBC, c1o216);
       }
 
-      q = (subgridD.q[DIR_MPP])[k];
+      q = (subgridD.q[DIR_MPP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          VeloX = slipLength*vx1;
@@ -2688,63 +2776,63 @@ __global__ void QSlipPressureDeviceComp27TurbViscosity(
 //    Distributions27 D;
 //    if (isEvenTimestep==true)
 //    {
-//       D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-//       D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-//       D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-//       D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-//       D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-//       D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-//       D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-//       D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-//       D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-//       D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-//       D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-//       D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-//       D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-//       D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-//       D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-//       D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-//       D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-//       D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-//       D.f[DIR_000] = &DD[DIR_000*size_Mat];
-//       D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-//       D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-//       D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-//       D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-//       D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-//       D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-//       D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-//       D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+//       D.f[DIR_P00] = &DD[DIR_P00 * size_Mat];
+//       D.f[DIR_M00] = &DD[DIR_M00 * size_Mat];
+//       D.f[DIR_0P0] = &DD[DIR_0P0 * size_Mat];
+//       D.f[DIR_0M0] = &DD[DIR_0M0 * size_Mat];
+//       D.f[DIR_00P] = &DD[DIR_00P * size_Mat];
+//       D.f[DIR_00M] = &DD[DIR_00M * size_Mat];
+//       D.f[DIR_PP0] = &DD[DIR_PP0 * size_Mat];
+//       D.f[DIR_MM0] = &DD[DIR_MM0 * size_Mat];
+//       D.f[DIR_PM0] = &DD[DIR_PM0 * size_Mat];
+//       D.f[DIR_MP0] = &DD[DIR_MP0 * size_Mat];
+//       D.f[DIR_P0P] = &DD[DIR_P0P * size_Mat];
+//       D.f[DIR_M0M] = &DD[DIR_M0M * size_Mat];
+//       D.f[DIR_P0M] = &DD[DIR_P0M * size_Mat];
+//       D.f[DIR_M0P] = &DD[DIR_M0P * size_Mat];
+//       D.f[DIR_0PP] = &DD[DIR_0PP * size_Mat];
+//       D.f[DIR_0MM] = &DD[DIR_0MM * size_Mat];
+//       D.f[DIR_0PM] = &DD[DIR_0PM * size_Mat];
+//       D.f[DIR_0MP] = &DD[DIR_0MP * size_Mat];
+//       D.f[DIR_000] = &DD[DIR_000 * size_Mat];
+//       D.f[DIR_PPP] = &DD[DIR_PPP * size_Mat];
+//       D.f[DIR_MMP] = &DD[DIR_MMP * size_Mat];
+//       D.f[DIR_PMP] = &DD[DIR_PMP * size_Mat];
+//       D.f[DIR_MPP] = &DD[DIR_MPP * size_Mat];
+//       D.f[DIR_PPM] = &DD[DIR_PPM * size_Mat];
+//       D.f[DIR_MMM] = &DD[DIR_MMM * size_Mat];
+//       D.f[DIR_PMM] = &DD[DIR_PMM * size_Mat];
+//       D.f[DIR_MPM] = &DD[DIR_MPM * size_Mat];
 //    } 
 //    else
 //    {
-//       D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-//       D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-//       D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-//       D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-//       D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-//       D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-//       D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-//       D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-//       D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-//       D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-//       D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-//       D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-//       D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-//       D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-//       D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-//       D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-//       D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-//       D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-//       D.f[DIR_000] = &DD[DIR_000*size_Mat];
-//       D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-//       D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-//       D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-//       D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-//       D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-//       D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-//       D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-//       D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+//       D.f[DIR_M00] = &DD[DIR_P00 * size_Mat];
+//       D.f[DIR_P00] = &DD[DIR_M00 * size_Mat];
+//       D.f[DIR_0M0] = &DD[DIR_0P0 * size_Mat];
+//       D.f[DIR_0P0] = &DD[DIR_0M0 * size_Mat];
+//       D.f[DIR_00M] = &DD[DIR_00P * size_Mat];
+//       D.f[DIR_00P] = &DD[DIR_00M * size_Mat];
+//       D.f[DIR_MM0] = &DD[DIR_PP0 * size_Mat];
+//       D.f[DIR_PP0] = &DD[DIR_MM0 * size_Mat];
+//       D.f[DIR_MP0] = &DD[DIR_PM0 * size_Mat];
+//       D.f[DIR_PM0] = &DD[DIR_MP0 * size_Mat];
+//       D.f[DIR_M0M] = &DD[DIR_P0P * size_Mat];
+//       D.f[DIR_P0P] = &DD[DIR_M0M * size_Mat];
+//       D.f[DIR_M0P] = &DD[DIR_P0M * size_Mat];
+//       D.f[DIR_P0M] = &DD[DIR_M0P * size_Mat];
+//       D.f[DIR_0MM] = &DD[DIR_0PP * size_Mat];
+//       D.f[DIR_0PP] = &DD[DIR_0MM * size_Mat];
+//       D.f[DIR_0MP] = &DD[DIR_0PM * size_Mat];
+//       D.f[DIR_0PM] = &DD[DIR_0MP * size_Mat];
+//       D.f[DIR_000] = &DD[DIR_000 * size_Mat];
+//       D.f[DIR_PPP] = &DD[DIR_MMM * size_Mat];
+//       D.f[DIR_MMP] = &DD[DIR_PPM * size_Mat];
+//       D.f[DIR_PMP] = &DD[DIR_MPM * size_Mat];
+//       D.f[DIR_MPP] = &DD[DIR_PMM * size_Mat];
+//       D.f[DIR_PPM] = &DD[DIR_MMP * size_Mat];
+//       D.f[DIR_MMM] = &DD[DIR_PPP * size_Mat];
+//       D.f[DIR_PMM] = &DD[DIR_MPP * size_Mat];
+//       D.f[DIR_MPM] = &DD[DIR_PMP * size_Mat];
 //    }
 //    ////////////////////////////////////////////////////////////////////////////////
 //    const unsigned  x = threadIdx.x;  // Globaler x-Index 
@@ -2765,24 +2853,24 @@ __global__ void QSlipPressureDeviceComp27TurbViscosity(
 //             *q_dirBE,  *q_dirTW,  *q_dirTN,  *q_dirBS,  *q_dirBN,  *q_dirTS,
 //             *q_dirTNE, *q_dirTSW, *q_dirTSE, *q_dirTNW, *q_dirBNE, *q_dirBSW,
 //             *q_dirBSE, *q_dirBNW; 
-//       q_dirE   = &QQ[DIR_P00   * numberOfBCnodes];
-//       q_dirW   = &QQ[DIR_M00   * numberOfBCnodes];
-//       q_dirN   = &QQ[DIR_0P0   * numberOfBCnodes];
-//       q_dirS   = &QQ[DIR_0M0   * numberOfBCnodes];
-//       q_dirT   = &QQ[DIR_00P   * numberOfBCnodes];
-//       q_dirB   = &QQ[DIR_00M   * numberOfBCnodes];
-//       q_dirNE  = &QQ[DIR_PP0  * numberOfBCnodes];
-//       q_dirSW  = &QQ[DIR_MM0  * numberOfBCnodes];
-//       q_dirSE  = &QQ[DIR_PM0  * numberOfBCnodes];
-//       q_dirNW  = &QQ[DIR_MP0  * numberOfBCnodes];
-//       q_dirTE  = &QQ[DIR_P0P  * numberOfBCnodes];
-//       q_dirBW  = &QQ[DIR_M0M  * numberOfBCnodes];
-//       q_dirBE  = &QQ[DIR_P0M  * numberOfBCnodes];
-//       q_dirTW  = &QQ[DIR_M0P  * numberOfBCnodes];
-//       q_dirTN  = &QQ[DIR_0PP  * numberOfBCnodes];
-//       q_dirBS  = &QQ[DIR_0MM  * numberOfBCnodes];
-//       q_dirBN  = &QQ[DIR_0PM  * numberOfBCnodes];
-//       q_dirTS  = &QQ[DIR_0MP  * numberOfBCnodes];
+//       q_dirE   = &QQ[DIR_P00 * numberOfBCnodes];
+//       q_dirW   = &QQ[DIR_M00 * numberOfBCnodes];
+//       q_dirN   = &QQ[DIR_0P0 * numberOfBCnodes];
+//       q_dirS   = &QQ[DIR_0M0 * numberOfBCnodes];
+//       q_dirT   = &QQ[DIR_00P * numberOfBCnodes];
+//       q_dirB   = &QQ[DIR_00M * numberOfBCnodes];
+//       q_dirNE  = &QQ[DIR_PP0 * numberOfBCnodes];
+//       q_dirSW  = &QQ[DIR_MM0 * numberOfBCnodes];
+//       q_dirSE  = &QQ[DIR_PM0 * numberOfBCnodes];
+//       q_dirNW  = &QQ[DIR_MP0 * numberOfBCnodes];
+//       q_dirTE  = &QQ[DIR_P0P * numberOfBCnodes];
+//       q_dirBW  = &QQ[DIR_M0M * numberOfBCnodes];
+//       q_dirBE  = &QQ[DIR_P0M * numberOfBCnodes];
+//       q_dirTW  = &QQ[DIR_M0P * numberOfBCnodes];
+//       q_dirTN  = &QQ[DIR_0PP * numberOfBCnodes];
+//       q_dirBS  = &QQ[DIR_0MM * numberOfBCnodes];
+//       q_dirBN  = &QQ[DIR_0PM * numberOfBCnodes];
+//       q_dirTS  = &QQ[DIR_0MP * numberOfBCnodes];
 //       q_dirTNE = &QQ[DIR_PPP * numberOfBCnodes];
 //       q_dirTSW = &QQ[DIR_MMP * numberOfBCnodes];
 //       q_dirTSE = &QQ[DIR_PMP * numberOfBCnodes];
@@ -2823,32 +2911,32 @@ __global__ void QSlipPressureDeviceComp27TurbViscosity(
 //       unsigned int kbsw = neighborZ[ksw];
       
 //       ////////////////////////////////////////////////////////////////////////////////
-//       real f_W    = (D.f[DIR_P00   ])[ke   ];
-//       real f_E    = (D.f[DIR_M00   ])[kw   ];
-//       real f_S    = (D.f[DIR_0P0   ])[kn   ];
-//       real f_N    = (D.f[DIR_0M0   ])[ks   ];
-//       real f_B    = (D.f[DIR_00P   ])[kt   ];
-//       real f_T    = (D.f[DIR_00M   ])[kb   ];
-//       real f_SW   = (D.f[DIR_PP0  ])[kne  ];
-//       real f_NE   = (D.f[DIR_MM0  ])[ksw  ];
-//       real f_NW   = (D.f[DIR_PM0  ])[kse  ];
-//       real f_SE   = (D.f[DIR_MP0  ])[knw  ];
-//       real f_BW   = (D.f[DIR_P0P  ])[kte  ];
-//       real f_TE   = (D.f[DIR_M0M  ])[kbw  ];
-//       real f_TW   = (D.f[DIR_P0M  ])[kbe  ];
-//       real f_BE   = (D.f[DIR_M0P  ])[ktw  ];
-//       real f_BS   = (D.f[DIR_0PP  ])[ktn  ];
-//       real f_TN   = (D.f[DIR_0MM  ])[kbs  ];
-//       real f_TS   = (D.f[DIR_0PM  ])[kbn  ];
-//       real f_BN   = (D.f[DIR_0MP  ])[kts  ];
-//       real f_BSW  = (D.f[DIR_PPP ])[ktne ];
-//       real f_BNE  = (D.f[DIR_MMP ])[ktsw ];
-//       real f_BNW  = (D.f[DIR_PMP ])[ktse ];
-//       real f_BSE  = (D.f[DIR_MPP ])[ktnw ];
-//       real f_TSW  = (D.f[DIR_PPM ])[kbne ];
-//       real f_TNE  = (D.f[DIR_MMM ])[kbsw ];
-//       real f_TNW  = (D.f[DIR_PMM ])[kbse ];
-//       real f_TSE  = (D.f[DIR_MPM ])[kbnw ];
+//       real f_W    = (D.f[DIR_P00])[ke   ];
+//       real f_E    = (D.f[DIR_M00])[kw   ];
+//       real f_S    = (D.f[DIR_0P0])[kn   ];
+//       real f_N    = (D.f[DIR_0M0])[ks   ];
+//       real f_B    = (D.f[DIR_00P])[kt   ];
+//       real f_T    = (D.f[DIR_00M])[kb   ];
+//       real f_SW   = (D.f[DIR_PP0])[kne  ];
+//       real f_NE   = (D.f[DIR_MM0])[ksw  ];
+//       real f_NW   = (D.f[DIR_PM0])[kse  ];
+//       real f_SE   = (D.f[DIR_MP0])[knw  ];
+//       real f_BW   = (D.f[DIR_P0P])[kte  ];
+//       real f_TE   = (D.f[DIR_M0M])[kbw  ];
+//       real f_TW   = (D.f[DIR_P0M])[kbe  ];
+//       real f_BE   = (D.f[DIR_M0P])[ktw  ];
+//       real f_BS   = (D.f[DIR_0PP])[ktn  ];
+//       real f_TN   = (D.f[DIR_0MM])[kbs  ];
+//       real f_TS   = (D.f[DIR_0PM])[kbn  ];
+//       real f_BN   = (D.f[DIR_0MP])[kts  ];
+//       real f_BSW  = (D.f[DIR_PPP])[ktne ];
+//       real f_BNE  = (D.f[DIR_MMP])[ktsw ];
+//       real f_BNW  = (D.f[DIR_PMP])[ktse ];
+//       real f_BSE  = (D.f[DIR_MPP])[ktnw ];
+//       real f_TSW  = (D.f[DIR_PPM])[kbne ];
+//       real f_TNE  = (D.f[DIR_MMM])[kbsw ];
+//       real f_TNW  = (D.f[DIR_PMM])[kbse ];
+//       real f_TSE  = (D.f[DIR_MPM])[kbnw ];
 //       ////////////////////////////////////////////////////////////////////////////////
 //       real vx1, vx2, vx3, drho, feq, q;
 //       drho   =  f_TSE + f_TNW + f_TNE + f_TSW + f_BSE + f_BNW + f_BNE + f_BSW +
@@ -2873,63 +2961,63 @@ __global__ void QSlipPressureDeviceComp27TurbViscosity(
 //       //////////////////////////////////////////////////////////////////////////
 //       if (isEvenTimestep==false)
 //       {
-//          D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-//          D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-//          D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-//          D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-//          D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-//          D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-//          D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-//          D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-//          D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-//          D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-//          D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-//          D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-//          D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-//          D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-//          D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-//          D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-//          D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-//          D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-//          D.f[DIR_000] = &DD[DIR_000*size_Mat];
-//          D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-//          D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-//          D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-//          D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-//          D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-//          D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-//          D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-//          D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+//          D.f[DIR_P00] = &DD[DIR_P00 * size_Mat];
+//          D.f[DIR_M00] = &DD[DIR_M00 * size_Mat];
+//          D.f[DIR_0P0] = &DD[DIR_0P0 * size_Mat];
+//          D.f[DIR_0M0] = &DD[DIR_0M0 * size_Mat];
+//          D.f[DIR_00P] = &DD[DIR_00P * size_Mat];
+//          D.f[DIR_00M] = &DD[DIR_00M * size_Mat];
+//          D.f[DIR_PP0] = &DD[DIR_PP0 * size_Mat];
+//          D.f[DIR_MM0] = &DD[DIR_MM0 * size_Mat];
+//          D.f[DIR_PM0] = &DD[DIR_PM0 * size_Mat];
+//          D.f[DIR_MP0] = &DD[DIR_MP0 * size_Mat];
+//          D.f[DIR_P0P] = &DD[DIR_P0P * size_Mat];
+//          D.f[DIR_M0M] = &DD[DIR_M0M * size_Mat];
+//          D.f[DIR_P0M] = &DD[DIR_P0M * size_Mat];
+//          D.f[DIR_M0P] = &DD[DIR_M0P * size_Mat];
+//          D.f[DIR_0PP] = &DD[DIR_0PP * size_Mat];
+//          D.f[DIR_0MM] = &DD[DIR_0MM * size_Mat];
+//          D.f[DIR_0PM] = &DD[DIR_0PM * size_Mat];
+//          D.f[DIR_0MP] = &DD[DIR_0MP * size_Mat];
+//          D.f[DIR_000] = &DD[DIR_000 * size_Mat];
+//          D.f[DIR_PPP] = &DD[DIR_PPP * size_Mat];
+//          D.f[DIR_MMP] = &DD[DIR_MMP * size_Mat];
+//          D.f[DIR_PMP] = &DD[DIR_PMP * size_Mat];
+//          D.f[DIR_MPP] = &DD[DIR_MPP * size_Mat];
+//          D.f[DIR_PPM] = &DD[DIR_PPM * size_Mat];
+//          D.f[DIR_MMM] = &DD[DIR_MMM * size_Mat];
+//          D.f[DIR_PMM] = &DD[DIR_PMM * size_Mat];
+//          D.f[DIR_MPM] = &DD[DIR_MPM * size_Mat];
 //       } 
 //       else
 //       {
-//          D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-//          D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-//          D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-//          D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-//          D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-//          D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-//          D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-//          D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-//          D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-//          D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-//          D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-//          D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-//          D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-//          D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-//          D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-//          D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-//          D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-//          D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-//          D.f[DIR_000] = &DD[DIR_000*size_Mat];
-//          D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-//          D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-//          D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-//          D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-//          D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-//          D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-//          D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-//          D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+//          D.f[DIR_M00] = &DD[DIR_P00 * size_Mat];
+//          D.f[DIR_P00] = &DD[DIR_M00 * size_Mat];
+//          D.f[DIR_0M0] = &DD[DIR_0P0 * size_Mat];
+//          D.f[DIR_0P0] = &DD[DIR_0M0 * size_Mat];
+//          D.f[DIR_00M] = &DD[DIR_00P * size_Mat];
+//          D.f[DIR_00P] = &DD[DIR_00M * size_Mat];
+//          D.f[DIR_MM0] = &DD[DIR_PP0 * size_Mat];
+//          D.f[DIR_PP0] = &DD[DIR_MM0 * size_Mat];
+//          D.f[DIR_MP0] = &DD[DIR_PM0 * size_Mat];
+//          D.f[DIR_PM0] = &DD[DIR_MP0 * size_Mat];
+//          D.f[DIR_M0M] = &DD[DIR_P0P * size_Mat];
+//          D.f[DIR_P0P] = &DD[DIR_M0M * size_Mat];
+//          D.f[DIR_M0P] = &DD[DIR_P0M * size_Mat];
+//          D.f[DIR_P0M] = &DD[DIR_M0P * size_Mat];
+//          D.f[DIR_0MM] = &DD[DIR_0PP * size_Mat];
+//          D.f[DIR_0PP] = &DD[DIR_0MM * size_Mat];
+//          D.f[DIR_0MP] = &DD[DIR_0PM * size_Mat];
+//          D.f[DIR_0PM] = &DD[DIR_0MP * size_Mat];
+//          D.f[DIR_000] = &DD[DIR_000 * size_Mat];
+//          D.f[DIR_PPP] = &DD[DIR_MMM * size_Mat];
+//          D.f[DIR_MMP] = &DD[DIR_PPM * size_Mat];
+//          D.f[DIR_PMP] = &DD[DIR_MPM * size_Mat];
+//          D.f[DIR_MPP] = &DD[DIR_PMM * size_Mat];
+//          D.f[DIR_PPM] = &DD[DIR_MMP * size_Mat];
+//          D.f[DIR_MMM] = &DD[DIR_PPP * size_Mat];
+//          D.f[DIR_PMM] = &DD[DIR_MPP * size_Mat];
+//          D.f[DIR_MPM] = &DD[DIR_PMP * size_Mat];
 //       }
 //       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 //       //Test
@@ -3378,80 +3466,81 @@ __global__ void QSlipPressureDeviceComp27TurbViscosity(
 
 
 //////////////////////////////////////////////////////////////////////////////
-__global__ void QSlipGeomDeviceComp27(real* DD, 
-												 int* k_Q, 
-												 real* QQ,
-												 unsigned int  numberOfBCnodes,
-												 real om1, 
-												 real* NormalX,
-												 real* NormalY,
-												 real* NormalZ,
-												 unsigned int* neighborX,
-												 unsigned int* neighborY,
-												 unsigned int* neighborZ,
-												 unsigned int size_Mat, 
-												 bool isEvenTimestep)
+__global__ void QSlipGeomDeviceComp27(
+    real* DD, 
+    int* k_Q, 
+    real* QQ,
+    unsigned int  numberOfBCnodes,
+    real om1, 
+    real* NormalX,
+    real* NormalY,
+    real* NormalZ,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes, 
+    bool isEvenTimestep)
 {
    Distributions27 D;
    if (isEvenTimestep==true)
    {
-      D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+      D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
    } 
    else
    {
-      D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+      D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
    }
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  x = threadIdx.x;  // Globaler x-Index 
@@ -3472,24 +3561,24 @@ __global__ void QSlipGeomDeviceComp27(real* DD,
             *q_dirBE,  *q_dirTW,  *q_dirTN,  *q_dirBS,  *q_dirBN,  *q_dirTS,
             *q_dirTNE, *q_dirTSW, *q_dirTSE, *q_dirTNW, *q_dirBNE, *q_dirBSW,
             *q_dirBSE, *q_dirBNW; 
-      q_dirE   = &QQ[DIR_P00   * numberOfBCnodes];
-      q_dirW   = &QQ[DIR_M00   * numberOfBCnodes];
-      q_dirN   = &QQ[DIR_0P0   * numberOfBCnodes];
-      q_dirS   = &QQ[DIR_0M0   * numberOfBCnodes];
-      q_dirT   = &QQ[DIR_00P   * numberOfBCnodes];
-      q_dirB   = &QQ[DIR_00M   * numberOfBCnodes];
-      q_dirNE  = &QQ[DIR_PP0  * numberOfBCnodes];
-      q_dirSW  = &QQ[DIR_MM0  * numberOfBCnodes];
-      q_dirSE  = &QQ[DIR_PM0  * numberOfBCnodes];
-      q_dirNW  = &QQ[DIR_MP0  * numberOfBCnodes];
-      q_dirTE  = &QQ[DIR_P0P  * numberOfBCnodes];
-      q_dirBW  = &QQ[DIR_M0M  * numberOfBCnodes];
-      q_dirBE  = &QQ[DIR_P0M  * numberOfBCnodes];
-      q_dirTW  = &QQ[DIR_M0P  * numberOfBCnodes];
-      q_dirTN  = &QQ[DIR_0PP  * numberOfBCnodes];
-      q_dirBS  = &QQ[DIR_0MM  * numberOfBCnodes];
-      q_dirBN  = &QQ[DIR_0PM  * numberOfBCnodes];
-      q_dirTS  = &QQ[DIR_0MP  * numberOfBCnodes];
+      q_dirE   = &QQ[DIR_P00 * numberOfBCnodes];
+      q_dirW   = &QQ[DIR_M00 * numberOfBCnodes];
+      q_dirN   = &QQ[DIR_0P0 * numberOfBCnodes];
+      q_dirS   = &QQ[DIR_0M0 * numberOfBCnodes];
+      q_dirT   = &QQ[DIR_00P * numberOfBCnodes];
+      q_dirB   = &QQ[DIR_00M * numberOfBCnodes];
+      q_dirNE  = &QQ[DIR_PP0 * numberOfBCnodes];
+      q_dirSW  = &QQ[DIR_MM0 * numberOfBCnodes];
+      q_dirSE  = &QQ[DIR_PM0 * numberOfBCnodes];
+      q_dirNW  = &QQ[DIR_MP0 * numberOfBCnodes];
+      q_dirTE  = &QQ[DIR_P0P * numberOfBCnodes];
+      q_dirBW  = &QQ[DIR_M0M * numberOfBCnodes];
+      q_dirBE  = &QQ[DIR_P0M * numberOfBCnodes];
+      q_dirTW  = &QQ[DIR_M0P * numberOfBCnodes];
+      q_dirTN  = &QQ[DIR_0PP * numberOfBCnodes];
+      q_dirBS  = &QQ[DIR_0MM * numberOfBCnodes];
+      q_dirBN  = &QQ[DIR_0PM * numberOfBCnodes];
+      q_dirTS  = &QQ[DIR_0MP * numberOfBCnodes];
       q_dirTNE = &QQ[DIR_PPP * numberOfBCnodes];
       q_dirTSW = &QQ[DIR_MMP * numberOfBCnodes];
       q_dirTSE = &QQ[DIR_PMP * numberOfBCnodes];
@@ -3504,24 +3593,24 @@ __global__ void QSlipGeomDeviceComp27(real* DD,
               *nx_dirBE,  *nx_dirTW,  *nx_dirTN,  *nx_dirBS,  *nx_dirBN,  *nx_dirTS,
               *nx_dirTNE, *nx_dirTSW, *nx_dirTSE, *nx_dirTNW, *nx_dirBNE, *nx_dirBSW,
               *nx_dirBSE, *nx_dirBNW; 
-      nx_dirE   = &NormalX[DIR_P00   * numberOfBCnodes];
-      nx_dirW   = &NormalX[DIR_M00   * numberOfBCnodes];
-      nx_dirN   = &NormalX[DIR_0P0   * numberOfBCnodes];
-      nx_dirS   = &NormalX[DIR_0M0   * numberOfBCnodes];
-      nx_dirT   = &NormalX[DIR_00P   * numberOfBCnodes];
-      nx_dirB   = &NormalX[DIR_00M   * numberOfBCnodes];
-      nx_dirNE  = &NormalX[DIR_PP0  * numberOfBCnodes];
-      nx_dirSW  = &NormalX[DIR_MM0  * numberOfBCnodes];
-      nx_dirSE  = &NormalX[DIR_PM0  * numberOfBCnodes];
-      nx_dirNW  = &NormalX[DIR_MP0  * numberOfBCnodes];
-      nx_dirTE  = &NormalX[DIR_P0P  * numberOfBCnodes];
-      nx_dirBW  = &NormalX[DIR_M0M  * numberOfBCnodes];
-      nx_dirBE  = &NormalX[DIR_P0M  * numberOfBCnodes];
-      nx_dirTW  = &NormalX[DIR_M0P  * numberOfBCnodes];
-      nx_dirTN  = &NormalX[DIR_0PP  * numberOfBCnodes];
-      nx_dirBS  = &NormalX[DIR_0MM  * numberOfBCnodes];
-      nx_dirBN  = &NormalX[DIR_0PM  * numberOfBCnodes];
-      nx_dirTS  = &NormalX[DIR_0MP  * numberOfBCnodes];
+      nx_dirE   = &NormalX[DIR_P00 * numberOfBCnodes];
+      nx_dirW   = &NormalX[DIR_M00 * numberOfBCnodes];
+      nx_dirN   = &NormalX[DIR_0P0 * numberOfBCnodes];
+      nx_dirS   = &NormalX[DIR_0M0 * numberOfBCnodes];
+      nx_dirT   = &NormalX[DIR_00P * numberOfBCnodes];
+      nx_dirB   = &NormalX[DIR_00M * numberOfBCnodes];
+      nx_dirNE  = &NormalX[DIR_PP0 * numberOfBCnodes];
+      nx_dirSW  = &NormalX[DIR_MM0 * numberOfBCnodes];
+      nx_dirSE  = &NormalX[DIR_PM0 * numberOfBCnodes];
+      nx_dirNW  = &NormalX[DIR_MP0 * numberOfBCnodes];
+      nx_dirTE  = &NormalX[DIR_P0P * numberOfBCnodes];
+      nx_dirBW  = &NormalX[DIR_M0M * numberOfBCnodes];
+      nx_dirBE  = &NormalX[DIR_P0M * numberOfBCnodes];
+      nx_dirTW  = &NormalX[DIR_M0P * numberOfBCnodes];
+      nx_dirTN  = &NormalX[DIR_0PP * numberOfBCnodes];
+      nx_dirBS  = &NormalX[DIR_0MM * numberOfBCnodes];
+      nx_dirBN  = &NormalX[DIR_0PM * numberOfBCnodes];
+      nx_dirTS  = &NormalX[DIR_0MP * numberOfBCnodes];
       nx_dirTNE = &NormalX[DIR_PPP * numberOfBCnodes];
       nx_dirTSW = &NormalX[DIR_MMP * numberOfBCnodes];
       nx_dirTSE = &NormalX[DIR_PMP * numberOfBCnodes];
@@ -3536,24 +3625,24 @@ __global__ void QSlipGeomDeviceComp27(real* DD,
               *ny_dirBE,  *ny_dirTW,  *ny_dirTN,  *ny_dirBS,  *ny_dirBN,  *ny_dirTS,
               *ny_dirTNE, *ny_dirTSW, *ny_dirTSE, *ny_dirTNW, *ny_dirBNE, *ny_dirBSW,
               *ny_dirBSE, *ny_dirBNW; 
-      ny_dirE   = &NormalY[DIR_P00   * numberOfBCnodes];
-      ny_dirW   = &NormalY[DIR_M00   * numberOfBCnodes];
-      ny_dirN   = &NormalY[DIR_0P0   * numberOfBCnodes];
-      ny_dirS   = &NormalY[DIR_0M0   * numberOfBCnodes];
-      ny_dirT   = &NormalY[DIR_00P   * numberOfBCnodes];
-      ny_dirB   = &NormalY[DIR_00M   * numberOfBCnodes];
-      ny_dirNE  = &NormalY[DIR_PP0  * numberOfBCnodes];
-      ny_dirSW  = &NormalY[DIR_MM0  * numberOfBCnodes];
-      ny_dirSE  = &NormalY[DIR_PM0  * numberOfBCnodes];
-      ny_dirNW  = &NormalY[DIR_MP0  * numberOfBCnodes];
-      ny_dirTE  = &NormalY[DIR_P0P  * numberOfBCnodes];
-      ny_dirBW  = &NormalY[DIR_M0M  * numberOfBCnodes];
-      ny_dirBE  = &NormalY[DIR_P0M  * numberOfBCnodes];
-      ny_dirTW  = &NormalY[DIR_M0P  * numberOfBCnodes];
-      ny_dirTN  = &NormalY[DIR_0PP  * numberOfBCnodes];
-      ny_dirBS  = &NormalY[DIR_0MM  * numberOfBCnodes];
-      ny_dirBN  = &NormalY[DIR_0PM  * numberOfBCnodes];
-      ny_dirTS  = &NormalY[DIR_0MP  * numberOfBCnodes];
+      ny_dirE   = &NormalY[DIR_P00 * numberOfBCnodes];
+      ny_dirW   = &NormalY[DIR_M00 * numberOfBCnodes];
+      ny_dirN   = &NormalY[DIR_0P0 * numberOfBCnodes];
+      ny_dirS   = &NormalY[DIR_0M0 * numberOfBCnodes];
+      ny_dirT   = &NormalY[DIR_00P * numberOfBCnodes];
+      ny_dirB   = &NormalY[DIR_00M * numberOfBCnodes];
+      ny_dirNE  = &NormalY[DIR_PP0 * numberOfBCnodes];
+      ny_dirSW  = &NormalY[DIR_MM0 * numberOfBCnodes];
+      ny_dirSE  = &NormalY[DIR_PM0 * numberOfBCnodes];
+      ny_dirNW  = &NormalY[DIR_MP0 * numberOfBCnodes];
+      ny_dirTE  = &NormalY[DIR_P0P * numberOfBCnodes];
+      ny_dirBW  = &NormalY[DIR_M0M * numberOfBCnodes];
+      ny_dirBE  = &NormalY[DIR_P0M * numberOfBCnodes];
+      ny_dirTW  = &NormalY[DIR_M0P * numberOfBCnodes];
+      ny_dirTN  = &NormalY[DIR_0PP * numberOfBCnodes];
+      ny_dirBS  = &NormalY[DIR_0MM * numberOfBCnodes];
+      ny_dirBN  = &NormalY[DIR_0PM * numberOfBCnodes];
+      ny_dirTS  = &NormalY[DIR_0MP * numberOfBCnodes];
       ny_dirTNE = &NormalY[DIR_PPP * numberOfBCnodes];
       ny_dirTSW = &NormalY[DIR_MMP * numberOfBCnodes];
       ny_dirTSE = &NormalY[DIR_PMP * numberOfBCnodes];
@@ -3568,24 +3657,24 @@ __global__ void QSlipGeomDeviceComp27(real* DD,
               *nz_dirBE,  *nz_dirTW,  *nz_dirTN,  *nz_dirBS,  *nz_dirBN,  *nz_dirTS,
               *nz_dirTNE, *nz_dirTSW, *nz_dirTSE, *nz_dirTNW, *nz_dirBNE, *nz_dirBSW,
               *nz_dirBSE, *nz_dirBNW; 
-      nz_dirE   = &NormalZ[DIR_P00   * numberOfBCnodes];
-      nz_dirW   = &NormalZ[DIR_M00   * numberOfBCnodes];
-      nz_dirN   = &NormalZ[DIR_0P0   * numberOfBCnodes];
-      nz_dirS   = &NormalZ[DIR_0M0   * numberOfBCnodes];
-      nz_dirT   = &NormalZ[DIR_00P   * numberOfBCnodes];
-      nz_dirB   = &NormalZ[DIR_00M   * numberOfBCnodes];
-      nz_dirNE  = &NormalZ[DIR_PP0  * numberOfBCnodes];
-      nz_dirSW  = &NormalZ[DIR_MM0  * numberOfBCnodes];
-      nz_dirSE  = &NormalZ[DIR_PM0  * numberOfBCnodes];
-      nz_dirNW  = &NormalZ[DIR_MP0  * numberOfBCnodes];
-      nz_dirTE  = &NormalZ[DIR_P0P  * numberOfBCnodes];
-      nz_dirBW  = &NormalZ[DIR_M0M  * numberOfBCnodes];
-      nz_dirBE  = &NormalZ[DIR_P0M  * numberOfBCnodes];
-      nz_dirTW  = &NormalZ[DIR_M0P  * numberOfBCnodes];
-      nz_dirTN  = &NormalZ[DIR_0PP  * numberOfBCnodes];
-      nz_dirBS  = &NormalZ[DIR_0MM  * numberOfBCnodes];
-      nz_dirBN  = &NormalZ[DIR_0PM  * numberOfBCnodes];
-      nz_dirTS  = &NormalZ[DIR_0MP  * numberOfBCnodes];
+      nz_dirE   = &NormalZ[DIR_P00 * numberOfBCnodes];
+      nz_dirW   = &NormalZ[DIR_M00 * numberOfBCnodes];
+      nz_dirN   = &NormalZ[DIR_0P0 * numberOfBCnodes];
+      nz_dirS   = &NormalZ[DIR_0M0 * numberOfBCnodes];
+      nz_dirT   = &NormalZ[DIR_00P * numberOfBCnodes];
+      nz_dirB   = &NormalZ[DIR_00M * numberOfBCnodes];
+      nz_dirNE  = &NormalZ[DIR_PP0 * numberOfBCnodes];
+      nz_dirSW  = &NormalZ[DIR_MM0 * numberOfBCnodes];
+      nz_dirSE  = &NormalZ[DIR_PM0 * numberOfBCnodes];
+      nz_dirNW  = &NormalZ[DIR_MP0 * numberOfBCnodes];
+      nz_dirTE  = &NormalZ[DIR_P0P * numberOfBCnodes];
+      nz_dirBW  = &NormalZ[DIR_M0M * numberOfBCnodes];
+      nz_dirBE  = &NormalZ[DIR_P0M * numberOfBCnodes];
+      nz_dirTW  = &NormalZ[DIR_M0P * numberOfBCnodes];
+      nz_dirTN  = &NormalZ[DIR_0PP * numberOfBCnodes];
+      nz_dirBS  = &NormalZ[DIR_0MM * numberOfBCnodes];
+      nz_dirBN  = &NormalZ[DIR_0PM * numberOfBCnodes];
+      nz_dirTS  = &NormalZ[DIR_0MP * numberOfBCnodes];
       nz_dirTNE = &NormalZ[DIR_PPP * numberOfBCnodes];
       nz_dirTSW = &NormalZ[DIR_MMP * numberOfBCnodes];
       nz_dirTSE = &NormalZ[DIR_PMP * numberOfBCnodes];
@@ -3625,32 +3714,32 @@ __global__ void QSlipGeomDeviceComp27(real* DD,
       unsigned int ktne = KQK;
       unsigned int kbsw = neighborZ[ksw];
       ////////////////////////////////////////////////////////////////////////////////
-      real f_W    = (D.f[DIR_P00   ])[ke   ];
-      real f_E    = (D.f[DIR_M00   ])[kw   ];
-      real f_S    = (D.f[DIR_0P0   ])[kn   ];
-      real f_N    = (D.f[DIR_0M0   ])[ks   ];
-      real f_B    = (D.f[DIR_00P   ])[kt   ];
-      real f_T    = (D.f[DIR_00M   ])[kb   ];
-      real f_SW   = (D.f[DIR_PP0  ])[kne  ];
-      real f_NE   = (D.f[DIR_MM0  ])[ksw  ];
-      real f_NW   = (D.f[DIR_PM0  ])[kse  ];
-      real f_SE   = (D.f[DIR_MP0  ])[knw  ];
-      real f_BW   = (D.f[DIR_P0P  ])[kte  ];
-      real f_TE   = (D.f[DIR_M0M  ])[kbw  ];
-      real f_TW   = (D.f[DIR_P0M  ])[kbe  ];
-      real f_BE   = (D.f[DIR_M0P  ])[ktw  ];
-      real f_BS   = (D.f[DIR_0PP  ])[ktn  ];
-      real f_TN   = (D.f[DIR_0MM  ])[kbs  ];
-      real f_TS   = (D.f[DIR_0PM  ])[kbn  ];
-      real f_BN   = (D.f[DIR_0MP  ])[kts  ];
-      real f_BSW  = (D.f[DIR_PPP ])[ktne ];
-      real f_BNE  = (D.f[DIR_MMP ])[ktsw ];
-      real f_BNW  = (D.f[DIR_PMP ])[ktse ];
-      real f_BSE  = (D.f[DIR_MPP ])[ktnw ];
-      real f_TSW  = (D.f[DIR_PPM ])[kbne ];
-      real f_TNE  = (D.f[DIR_MMM ])[kbsw ];
-      real f_TNW  = (D.f[DIR_PMM ])[kbse ];
-      real f_TSE  = (D.f[DIR_MPM ])[kbnw ];
+      real f_W    = (D.f[DIR_P00])[ke   ];
+      real f_E    = (D.f[DIR_M00])[kw   ];
+      real f_S    = (D.f[DIR_0P0])[kn   ];
+      real f_N    = (D.f[DIR_0M0])[ks   ];
+      real f_B    = (D.f[DIR_00P])[kt   ];
+      real f_T    = (D.f[DIR_00M])[kb   ];
+      real f_SW   = (D.f[DIR_PP0])[kne  ];
+      real f_NE   = (D.f[DIR_MM0])[ksw  ];
+      real f_NW   = (D.f[DIR_PM0])[kse  ];
+      real f_SE   = (D.f[DIR_MP0])[knw  ];
+      real f_BW   = (D.f[DIR_P0P])[kte  ];
+      real f_TE   = (D.f[DIR_M0M])[kbw  ];
+      real f_TW   = (D.f[DIR_P0M])[kbe  ];
+      real f_BE   = (D.f[DIR_M0P])[ktw  ];
+      real f_BS   = (D.f[DIR_0PP])[ktn  ];
+      real f_TN   = (D.f[DIR_0MM])[kbs  ];
+      real f_TS   = (D.f[DIR_0PM])[kbn  ];
+      real f_BN   = (D.f[DIR_0MP])[kts  ];
+      real f_BSW  = (D.f[DIR_PPP])[ktne ];
+      real f_BNE  = (D.f[DIR_MMP])[ktsw ];
+      real f_BNW  = (D.f[DIR_PMP])[ktse ];
+      real f_BSE  = (D.f[DIR_MPP])[ktnw ];
+      real f_TSW  = (D.f[DIR_PPM])[kbne ];
+      real f_TNE  = (D.f[DIR_MMM])[kbsw ];
+      real f_TNW  = (D.f[DIR_PMM])[kbse ];
+      real f_TSE  = (D.f[DIR_MPM])[kbnw ];
       ////////////////////////////////////////////////////////////////////////////////
       real vx1, vx2, vx3, drho, feq, q;
       drho   =  f_TSE + f_TNW + f_TNE + f_TSW + f_BSE + f_BNW + f_BNE + f_BSW +
@@ -3675,63 +3764,63 @@ __global__ void QSlipGeomDeviceComp27(real* DD,
       //////////////////////////////////////////////////////////////////////////
       if (isEvenTimestep==false)
       {
-         D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+         D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
       } 
       else
       {
-         D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+         D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
       }
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 	  real VeloX = vx1;
@@ -4264,80 +4353,81 @@ __global__ void QSlipGeomDeviceComp27(real* DD,
 
 
 //////////////////////////////////////////////////////////////////////////////
-__global__ void QSlipNormDeviceComp27(real* DD, 
-												 int* k_Q, 
-												 real* QQ,
-												 unsigned int  numberOfBCnodes,
-												 real om1, 
-												 real* NormalX,
-												 real* NormalY,
-												 real* NormalZ,
-												 unsigned int* neighborX,
-												 unsigned int* neighborY,
-												 unsigned int* neighborZ,
-												 unsigned int size_Mat, 
-												 bool isEvenTimestep)
+__global__ void QSlipNormDeviceComp27(
+    real* DD, 
+    int* k_Q, 
+    real* QQ,
+    unsigned int  numberOfBCnodes,
+    real om1, 
+    real* NormalX,
+    real* NormalY,
+    real* NormalZ,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes, 
+    bool isEvenTimestep)
 {
    Distributions27 D;
    if (isEvenTimestep==true)
    {
-      D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+      D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
    } 
    else
    {
-      D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+      D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
    }
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  x = threadIdx.x;  // Globaler x-Index 
@@ -4358,24 +4448,24 @@ __global__ void QSlipNormDeviceComp27(real* DD,
             *q_dirBE,  *q_dirTW,  *q_dirTN,  *q_dirBS,  *q_dirBN,  *q_dirTS,
             *q_dirTNE, *q_dirTSW, *q_dirTSE, *q_dirTNW, *q_dirBNE, *q_dirBSW,
             *q_dirBSE, *q_dirBNW; 
-      q_dirE   = &QQ[DIR_P00   * numberOfBCnodes];
-      q_dirW   = &QQ[DIR_M00   * numberOfBCnodes];
-      q_dirN   = &QQ[DIR_0P0   * numberOfBCnodes];
-      q_dirS   = &QQ[DIR_0M0   * numberOfBCnodes];
-      q_dirT   = &QQ[DIR_00P   * numberOfBCnodes];
-      q_dirB   = &QQ[DIR_00M   * numberOfBCnodes];
-      q_dirNE  = &QQ[DIR_PP0  * numberOfBCnodes];
-      q_dirSW  = &QQ[DIR_MM0  * numberOfBCnodes];
-      q_dirSE  = &QQ[DIR_PM0  * numberOfBCnodes];
-      q_dirNW  = &QQ[DIR_MP0  * numberOfBCnodes];
-      q_dirTE  = &QQ[DIR_P0P  * numberOfBCnodes];
-      q_dirBW  = &QQ[DIR_M0M  * numberOfBCnodes];
-      q_dirBE  = &QQ[DIR_P0M  * numberOfBCnodes];
-      q_dirTW  = &QQ[DIR_M0P  * numberOfBCnodes];
-      q_dirTN  = &QQ[DIR_0PP  * numberOfBCnodes];
-      q_dirBS  = &QQ[DIR_0MM  * numberOfBCnodes];
-      q_dirBN  = &QQ[DIR_0PM  * numberOfBCnodes];
-      q_dirTS  = &QQ[DIR_0MP  * numberOfBCnodes];
+      q_dirE   = &QQ[DIR_P00 * numberOfBCnodes];
+      q_dirW   = &QQ[DIR_M00 * numberOfBCnodes];
+      q_dirN   = &QQ[DIR_0P0 * numberOfBCnodes];
+      q_dirS   = &QQ[DIR_0M0 * numberOfBCnodes];
+      q_dirT   = &QQ[DIR_00P * numberOfBCnodes];
+      q_dirB   = &QQ[DIR_00M * numberOfBCnodes];
+      q_dirNE  = &QQ[DIR_PP0 * numberOfBCnodes];
+      q_dirSW  = &QQ[DIR_MM0 * numberOfBCnodes];
+      q_dirSE  = &QQ[DIR_PM0 * numberOfBCnodes];
+      q_dirNW  = &QQ[DIR_MP0 * numberOfBCnodes];
+      q_dirTE  = &QQ[DIR_P0P * numberOfBCnodes];
+      q_dirBW  = &QQ[DIR_M0M * numberOfBCnodes];
+      q_dirBE  = &QQ[DIR_P0M * numberOfBCnodes];
+      q_dirTW  = &QQ[DIR_M0P * numberOfBCnodes];
+      q_dirTN  = &QQ[DIR_0PP * numberOfBCnodes];
+      q_dirBS  = &QQ[DIR_0MM * numberOfBCnodes];
+      q_dirBN  = &QQ[DIR_0PM * numberOfBCnodes];
+      q_dirTS  = &QQ[DIR_0MP * numberOfBCnodes];
       q_dirTNE = &QQ[DIR_PPP * numberOfBCnodes];
       q_dirTSW = &QQ[DIR_MMP * numberOfBCnodes];
       q_dirTSE = &QQ[DIR_PMP * numberOfBCnodes];
@@ -4390,24 +4480,24 @@ __global__ void QSlipNormDeviceComp27(real* DD,
               *nx_dirBE,  *nx_dirTW,  *nx_dirTN,  *nx_dirBS,  *nx_dirBN,  *nx_dirTS,
               *nx_dirTNE, *nx_dirTSW, *nx_dirTSE, *nx_dirTNW, *nx_dirBNE, *nx_dirBSW,
               *nx_dirBSE, *nx_dirBNW; 
-      nx_dirE   = &NormalX[DIR_P00   * numberOfBCnodes];
-      nx_dirW   = &NormalX[DIR_M00   * numberOfBCnodes];
-      nx_dirN   = &NormalX[DIR_0P0   * numberOfBCnodes];
-      nx_dirS   = &NormalX[DIR_0M0   * numberOfBCnodes];
-      nx_dirT   = &NormalX[DIR_00P   * numberOfBCnodes];
-      nx_dirB   = &NormalX[DIR_00M   * numberOfBCnodes];
-      nx_dirNE  = &NormalX[DIR_PP0  * numberOfBCnodes];
-      nx_dirSW  = &NormalX[DIR_MM0  * numberOfBCnodes];
-      nx_dirSE  = &NormalX[DIR_PM0  * numberOfBCnodes];
-      nx_dirNW  = &NormalX[DIR_MP0  * numberOfBCnodes];
-      nx_dirTE  = &NormalX[DIR_P0P  * numberOfBCnodes];
-      nx_dirBW  = &NormalX[DIR_M0M  * numberOfBCnodes];
-      nx_dirBE  = &NormalX[DIR_P0M  * numberOfBCnodes];
-      nx_dirTW  = &NormalX[DIR_M0P  * numberOfBCnodes];
-      nx_dirTN  = &NormalX[DIR_0PP  * numberOfBCnodes];
-      nx_dirBS  = &NormalX[DIR_0MM  * numberOfBCnodes];
-      nx_dirBN  = &NormalX[DIR_0PM  * numberOfBCnodes];
-      nx_dirTS  = &NormalX[DIR_0MP  * numberOfBCnodes];
+      nx_dirE   = &NormalX[DIR_P00 * numberOfBCnodes];
+      nx_dirW   = &NormalX[DIR_M00 * numberOfBCnodes];
+      nx_dirN   = &NormalX[DIR_0P0 * numberOfBCnodes];
+      nx_dirS   = &NormalX[DIR_0M0 * numberOfBCnodes];
+      nx_dirT   = &NormalX[DIR_00P * numberOfBCnodes];
+      nx_dirB   = &NormalX[DIR_00M * numberOfBCnodes];
+      nx_dirNE  = &NormalX[DIR_PP0 * numberOfBCnodes];
+      nx_dirSW  = &NormalX[DIR_MM0 * numberOfBCnodes];
+      nx_dirSE  = &NormalX[DIR_PM0 * numberOfBCnodes];
+      nx_dirNW  = &NormalX[DIR_MP0 * numberOfBCnodes];
+      nx_dirTE  = &NormalX[DIR_P0P * numberOfBCnodes];
+      nx_dirBW  = &NormalX[DIR_M0M * numberOfBCnodes];
+      nx_dirBE  = &NormalX[DIR_P0M * numberOfBCnodes];
+      nx_dirTW  = &NormalX[DIR_M0P * numberOfBCnodes];
+      nx_dirTN  = &NormalX[DIR_0PP * numberOfBCnodes];
+      nx_dirBS  = &NormalX[DIR_0MM * numberOfBCnodes];
+      nx_dirBN  = &NormalX[DIR_0PM * numberOfBCnodes];
+      nx_dirTS  = &NormalX[DIR_0MP * numberOfBCnodes];
       nx_dirTNE = &NormalX[DIR_PPP * numberOfBCnodes];
       nx_dirTSW = &NormalX[DIR_MMP * numberOfBCnodes];
       nx_dirTSE = &NormalX[DIR_PMP * numberOfBCnodes];
@@ -4422,24 +4512,24 @@ __global__ void QSlipNormDeviceComp27(real* DD,
               *ny_dirBE,  *ny_dirTW,  *ny_dirTN,  *ny_dirBS,  *ny_dirBN,  *ny_dirTS,
               *ny_dirTNE, *ny_dirTSW, *ny_dirTSE, *ny_dirTNW, *ny_dirBNE, *ny_dirBSW,
               *ny_dirBSE, *ny_dirBNW; 
-      ny_dirE   = &NormalY[DIR_P00   * numberOfBCnodes];
-      ny_dirW   = &NormalY[DIR_M00   * numberOfBCnodes];
-      ny_dirN   = &NormalY[DIR_0P0   * numberOfBCnodes];
-      ny_dirS   = &NormalY[DIR_0M0   * numberOfBCnodes];
-      ny_dirT   = &NormalY[DIR_00P   * numberOfBCnodes];
-      ny_dirB   = &NormalY[DIR_00M   * numberOfBCnodes];
-      ny_dirNE  = &NormalY[DIR_PP0  * numberOfBCnodes];
-      ny_dirSW  = &NormalY[DIR_MM0  * numberOfBCnodes];
-      ny_dirSE  = &NormalY[DIR_PM0  * numberOfBCnodes];
-      ny_dirNW  = &NormalY[DIR_MP0  * numberOfBCnodes];
-      ny_dirTE  = &NormalY[DIR_P0P  * numberOfBCnodes];
-      ny_dirBW  = &NormalY[DIR_M0M  * numberOfBCnodes];
-      ny_dirBE  = &NormalY[DIR_P0M  * numberOfBCnodes];
-      ny_dirTW  = &NormalY[DIR_M0P  * numberOfBCnodes];
-      ny_dirTN  = &NormalY[DIR_0PP  * numberOfBCnodes];
-      ny_dirBS  = &NormalY[DIR_0MM  * numberOfBCnodes];
-      ny_dirBN  = &NormalY[DIR_0PM  * numberOfBCnodes];
-      ny_dirTS  = &NormalY[DIR_0MP  * numberOfBCnodes];
+      ny_dirE   = &NormalY[DIR_P00 * numberOfBCnodes];
+      ny_dirW   = &NormalY[DIR_M00 * numberOfBCnodes];
+      ny_dirN   = &NormalY[DIR_0P0 * numberOfBCnodes];
+      ny_dirS   = &NormalY[DIR_0M0 * numberOfBCnodes];
+      ny_dirT   = &NormalY[DIR_00P * numberOfBCnodes];
+      ny_dirB   = &NormalY[DIR_00M * numberOfBCnodes];
+      ny_dirNE  = &NormalY[DIR_PP0 * numberOfBCnodes];
+      ny_dirSW  = &NormalY[DIR_MM0 * numberOfBCnodes];
+      ny_dirSE  = &NormalY[DIR_PM0 * numberOfBCnodes];
+      ny_dirNW  = &NormalY[DIR_MP0 * numberOfBCnodes];
+      ny_dirTE  = &NormalY[DIR_P0P * numberOfBCnodes];
+      ny_dirBW  = &NormalY[DIR_M0M * numberOfBCnodes];
+      ny_dirBE  = &NormalY[DIR_P0M * numberOfBCnodes];
+      ny_dirTW  = &NormalY[DIR_M0P * numberOfBCnodes];
+      ny_dirTN  = &NormalY[DIR_0PP * numberOfBCnodes];
+      ny_dirBS  = &NormalY[DIR_0MM * numberOfBCnodes];
+      ny_dirBN  = &NormalY[DIR_0PM * numberOfBCnodes];
+      ny_dirTS  = &NormalY[DIR_0MP * numberOfBCnodes];
       ny_dirTNE = &NormalY[DIR_PPP * numberOfBCnodes];
       ny_dirTSW = &NormalY[DIR_MMP * numberOfBCnodes];
       ny_dirTSE = &NormalY[DIR_PMP * numberOfBCnodes];
@@ -4454,24 +4544,24 @@ __global__ void QSlipNormDeviceComp27(real* DD,
               *nz_dirBE,  *nz_dirTW,  *nz_dirTN,  *nz_dirBS,  *nz_dirBN,  *nz_dirTS,
               *nz_dirTNE, *nz_dirTSW, *nz_dirTSE, *nz_dirTNW, *nz_dirBNE, *nz_dirBSW,
               *nz_dirBSE, *nz_dirBNW; 
-      nz_dirE   = &NormalZ[DIR_P00   * numberOfBCnodes];
-      nz_dirW   = &NormalZ[DIR_M00   * numberOfBCnodes];
-      nz_dirN   = &NormalZ[DIR_0P0   * numberOfBCnodes];
-      nz_dirS   = &NormalZ[DIR_0M0   * numberOfBCnodes];
-      nz_dirT   = &NormalZ[DIR_00P   * numberOfBCnodes];
-      nz_dirB   = &NormalZ[DIR_00M   * numberOfBCnodes];
-      nz_dirNE  = &NormalZ[DIR_PP0  * numberOfBCnodes];
-      nz_dirSW  = &NormalZ[DIR_MM0  * numberOfBCnodes];
-      nz_dirSE  = &NormalZ[DIR_PM0  * numberOfBCnodes];
-      nz_dirNW  = &NormalZ[DIR_MP0  * numberOfBCnodes];
-      nz_dirTE  = &NormalZ[DIR_P0P  * numberOfBCnodes];
-      nz_dirBW  = &NormalZ[DIR_M0M  * numberOfBCnodes];
-      nz_dirBE  = &NormalZ[DIR_P0M  * numberOfBCnodes];
-      nz_dirTW  = &NormalZ[DIR_M0P  * numberOfBCnodes];
-      nz_dirTN  = &NormalZ[DIR_0PP  * numberOfBCnodes];
-      nz_dirBS  = &NormalZ[DIR_0MM  * numberOfBCnodes];
-      nz_dirBN  = &NormalZ[DIR_0PM  * numberOfBCnodes];
-      nz_dirTS  = &NormalZ[DIR_0MP  * numberOfBCnodes];
+      nz_dirE   = &NormalZ[DIR_P00 * numberOfBCnodes];
+      nz_dirW   = &NormalZ[DIR_M00 * numberOfBCnodes];
+      nz_dirN   = &NormalZ[DIR_0P0 * numberOfBCnodes];
+      nz_dirS   = &NormalZ[DIR_0M0 * numberOfBCnodes];
+      nz_dirT   = &NormalZ[DIR_00P * numberOfBCnodes];
+      nz_dirB   = &NormalZ[DIR_00M * numberOfBCnodes];
+      nz_dirNE  = &NormalZ[DIR_PP0 * numberOfBCnodes];
+      nz_dirSW  = &NormalZ[DIR_MM0 * numberOfBCnodes];
+      nz_dirSE  = &NormalZ[DIR_PM0 * numberOfBCnodes];
+      nz_dirNW  = &NormalZ[DIR_MP0 * numberOfBCnodes];
+      nz_dirTE  = &NormalZ[DIR_P0P * numberOfBCnodes];
+      nz_dirBW  = &NormalZ[DIR_M0M * numberOfBCnodes];
+      nz_dirBE  = &NormalZ[DIR_P0M * numberOfBCnodes];
+      nz_dirTW  = &NormalZ[DIR_M0P * numberOfBCnodes];
+      nz_dirTN  = &NormalZ[DIR_0PP * numberOfBCnodes];
+      nz_dirBS  = &NormalZ[DIR_0MM * numberOfBCnodes];
+      nz_dirBN  = &NormalZ[DIR_0PM * numberOfBCnodes];
+      nz_dirTS  = &NormalZ[DIR_0MP * numberOfBCnodes];
       nz_dirTNE = &NormalZ[DIR_PPP * numberOfBCnodes];
       nz_dirTSW = &NormalZ[DIR_MMP * numberOfBCnodes];
       nz_dirTSE = &NormalZ[DIR_PMP * numberOfBCnodes];
@@ -4511,32 +4601,32 @@ __global__ void QSlipNormDeviceComp27(real* DD,
       unsigned int ktne = KQK;
       unsigned int kbsw = neighborZ[ksw];
       ////////////////////////////////////////////////////////////////////////////////
-      real f_W    = (D.f[DIR_P00   ])[ke   ];
-      real f_E    = (D.f[DIR_M00   ])[kw   ];
-      real f_S    = (D.f[DIR_0P0   ])[kn   ];
-      real f_N    = (D.f[DIR_0M0   ])[ks   ];
-      real f_B    = (D.f[DIR_00P   ])[kt   ];
-      real f_T    = (D.f[DIR_00M   ])[kb   ];
-      real f_SW   = (D.f[DIR_PP0  ])[kne  ];
-      real f_NE   = (D.f[DIR_MM0  ])[ksw  ];
-      real f_NW   = (D.f[DIR_PM0  ])[kse  ];
-      real f_SE   = (D.f[DIR_MP0  ])[knw  ];
-      real f_BW   = (D.f[DIR_P0P  ])[kte  ];
-      real f_TE   = (D.f[DIR_M0M  ])[kbw  ];
-      real f_TW   = (D.f[DIR_P0M  ])[kbe  ];
-      real f_BE   = (D.f[DIR_M0P  ])[ktw  ];
-      real f_BS   = (D.f[DIR_0PP  ])[ktn  ];
-      real f_TN   = (D.f[DIR_0MM  ])[kbs  ];
-      real f_TS   = (D.f[DIR_0PM  ])[kbn  ];
-      real f_BN   = (D.f[DIR_0MP  ])[kts  ];
-      real f_BSW  = (D.f[DIR_PPP ])[ktne ];
-      real f_BNE  = (D.f[DIR_MMP ])[ktsw ];
-      real f_BNW  = (D.f[DIR_PMP ])[ktse ];
-      real f_BSE  = (D.f[DIR_MPP ])[ktnw ];
-      real f_TSW  = (D.f[DIR_PPM ])[kbne ];
-      real f_TNE  = (D.f[DIR_MMM ])[kbsw ];
-      real f_TNW  = (D.f[DIR_PMM ])[kbse ];
-      real f_TSE  = (D.f[DIR_MPM ])[kbnw ];
+      real f_W    = (D.f[DIR_P00])[ke   ];
+      real f_E    = (D.f[DIR_M00])[kw   ];
+      real f_S    = (D.f[DIR_0P0])[kn   ];
+      real f_N    = (D.f[DIR_0M0])[ks   ];
+      real f_B    = (D.f[DIR_00P])[kt   ];
+      real f_T    = (D.f[DIR_00M])[kb   ];
+      real f_SW   = (D.f[DIR_PP0])[kne  ];
+      real f_NE   = (D.f[DIR_MM0])[ksw  ];
+      real f_NW   = (D.f[DIR_PM0])[kse  ];
+      real f_SE   = (D.f[DIR_MP0])[knw  ];
+      real f_BW   = (D.f[DIR_P0P])[kte  ];
+      real f_TE   = (D.f[DIR_M0M])[kbw  ];
+      real f_TW   = (D.f[DIR_P0M])[kbe  ];
+      real f_BE   = (D.f[DIR_M0P])[ktw  ];
+      real f_BS   = (D.f[DIR_0PP])[ktn  ];
+      real f_TN   = (D.f[DIR_0MM])[kbs  ];
+      real f_TS   = (D.f[DIR_0PM])[kbn  ];
+      real f_BN   = (D.f[DIR_0MP])[kts  ];
+      real f_BSW  = (D.f[DIR_PPP])[ktne ];
+      real f_BNE  = (D.f[DIR_MMP])[ktsw ];
+      real f_BNW  = (D.f[DIR_PMP])[ktse ];
+      real f_BSE  = (D.f[DIR_MPP])[ktnw ];
+      real f_TSW  = (D.f[DIR_PPM])[kbne ];
+      real f_TNE  = (D.f[DIR_MMM])[kbsw ];
+      real f_TNW  = (D.f[DIR_PMM])[kbse ];
+      real f_TSE  = (D.f[DIR_MPM])[kbnw ];
       ////////////////////////////////////////////////////////////////////////////////
       real vx1, vx2, vx3, drho, feq, q;
       drho   =  f_TSE + f_TNW + f_TNE + f_TSW + f_BSE + f_BNW + f_BNE + f_BSW +
@@ -4561,63 +4651,63 @@ __global__ void QSlipNormDeviceComp27(real* DD,
       //////////////////////////////////////////////////////////////////////////
       if (isEvenTimestep==false)
       {
-         D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+         D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
       } 
       else
       {
-         D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+         D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
       }
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 	  real VeloX = vx1;
diff --git a/src/gpu/VirtualFluids_GPU/GPU/StressBCs27.cu b/src/gpu/VirtualFluids_GPU/GPU/StressBCs27.cu
index 74e2faa38638228aa5d499aa74226405ab109f7d..3208299e93940dabe52faa7d0b3c684c45596660 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/StressBCs27.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/StressBCs27.cu
@@ -43,28 +43,30 @@
 #include "LBM/LB.h"
 #include "lbm/constants/D3Q27.h"
 #include <lbm/constants/NumericConstants.h>
-#include "KernelUtilities.h"
+#include "LBM/GPUHelperFunctions/KernelUtilities.h"
 
 using namespace vf::lbm::constant;
 using namespace vf::lbm::dir;
+using namespace vf::gpu;
 
 //////////////////////////////////////////////////////////////////////////////
-__host__ __device__ __forceinline__ void iMEM(uint k, uint kN,
-                                                         real* _wallNormalX, real* _wallNormalY, real* _wallNormalZ,
-                                                         real* vx, real* vy, real* vz,
-                                                         real* vx_el,      real* vy_el,      real* vz_el,      //!>mean (temporally filtered) velocities at exchange location
-                                                         real* vx_w_mean,  real* vy_w_mean,  real* vz_w_mean,  //!>mean (temporally filtered) velocities at wall-adjactent node
-                                                         real  vx_w_inst,  real  vy_w_inst,  real  vz_w_inst,  //!>instantaneous velocities at wall-adjactent node
-                                                         real  rho,
-                                                         int* samplingOffset,
-                                                         real q,
-                                                         real forceFactor,                                     //!>e.g., 1.0 for simple-bounce back, or (1+q) for interpolated single-node bounce-back as in Geier et al (2015)
-                                                         real eps,                                             //!>filter constant in temporal averaging
-                                                         real* z0,                                             //!>aerodynamic roughness length
-                                                         bool  hasWallModelMonitor,
-                                                         real* u_star_monitor,
-                                                         real wallMomentumX, real wallMomentumY, real wallMomentumZ,
-                                                         real& wallVelocityX, real& wallVelocityY, real&wallVelocityZ)
+__host__ __device__ __forceinline__ void iMEM(
+    uint k, uint kN,
+    real* _wallNormalX, real* _wallNormalY, real* _wallNormalZ,
+    real* vx, real* vy, real* vz,
+    real* vx_el,      real* vy_el,      real* vz_el,      //!>mean (temporally filtered) velocities at exchange location
+    real* vx_w_mean,  real* vy_w_mean,  real* vz_w_mean,  //!>mean (temporally filtered) velocities at wall-adjacent node
+    real  vx_w_inst,  real  vy_w_inst,  real  vz_w_inst,  //!>instantaneous velocities at wall-adjacent node
+    real  rho,
+    int* samplingOffset,
+    real q,
+    real forceFactor,                                     //!>e.g., 1.0 for simple-bounce back, or (1+q) for interpolated single-node bounce-back as in Geier et al (2015)
+    real eps,                                             //!>filter constant in temporal averaging
+    real* z0,                                             //!>aerodynamic roughness length
+    bool  hasWallModelMonitor,
+    real* u_star_monitor,
+    real wallMomentumX, real wallMomentumY, real wallMomentumZ,
+    real& wallVelocityX, real& wallVelocityY, real&wallVelocityZ)
 {
       real wallNormalX = _wallNormalX[k];
       real wallNormalY = _wallNormalY[k];
@@ -136,99 +138,100 @@ __host__ __device__ __forceinline__ void iMEM(uint k, uint kN,
 }
 
 //////////////////////////////////////////////////////////////////////////////
-__global__ void QStressDeviceComp27(real* DD,
-											   int* k_Q,
-                                    int* k_N,
-											   real* QQ,
-                                    unsigned int numberOfBCnodes,
-                                    real om1,
-                                    real* turbViscosity,
-                                    real* vx,
-                                    real* vy,
-                                    real* vz,
-                                    real* normalX,
-                                    real* normalY,
-                                    real* normalZ,
-                                    real* vx_el,
-                                    real* vy_el,
-                                    real* vz_el,
-                                    real* vx_w_mean,
-                                    real* vy_w_mean,
-                                    real* vz_w_mean,
-                                    int* samplingOffset,
-                                    real* z0,
-                                    bool  hasWallModelMonitor,
-                                    real* u_star_monitor,
-                                    real* Fx_monitor,
-                                    real* Fy_monitor,
-                                    real* Fz_monitor,
-											   unsigned int* neighborX,
-                                    unsigned int* neighborY,
-                                    unsigned int* neighborZ,
-                                    unsigned int size_Mat,
-                                    bool isEvenTimestep)
+__global__ void QStressDeviceComp27(
+    real* DD,
+    int* k_Q,
+    int* k_N,
+    real* QQ,
+    unsigned int numberOfBCnodes,
+    real om1,
+    real* turbViscosity,
+    real* vx,
+    real* vy,
+    real* vz,
+    real* normalX,
+    real* normalY,
+    real* normalZ,
+    real* vx_el,
+    real* vy_el,
+    real* vz_el,
+    real* vx_w_mean,
+    real* vy_w_mean,
+    real* vz_w_mean,
+    int* samplingOffset,
+    real* z0,
+    bool  hasWallModelMonitor,
+    real* u_star_monitor,
+    real* Fx_monitor,
+    real* Fy_monitor,
+    real* Fz_monitor,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
 {
 
    Distributions27 D;
    if (isEvenTimestep==true)//get right array of post coll f's
    {
-      D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+      D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
    }
    else
    {
-      D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+      D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
    }
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  x = threadIdx.x;  // Globaler x-Index
@@ -249,24 +252,24 @@ __global__ void QStressDeviceComp27(real* DD,
             *q_dirBE,  *q_dirTW,  *q_dirTN,  *q_dirBS,  *q_dirBN,  *q_dirTS,
             *q_dirTNE, *q_dirTSW, *q_dirTSE, *q_dirTNW, *q_dirBNE, *q_dirBSW,
             *q_dirBSE, *q_dirBNW;
-      q_dirE   = &QQ[DIR_P00   * numberOfBCnodes];
-      q_dirW   = &QQ[DIR_M00   * numberOfBCnodes];
-      q_dirN   = &QQ[DIR_0P0   * numberOfBCnodes];
-      q_dirS   = &QQ[DIR_0M0   * numberOfBCnodes];
-      q_dirT   = &QQ[DIR_00P   * numberOfBCnodes];
-      q_dirB   = &QQ[DIR_00M   * numberOfBCnodes];
-      q_dirNE  = &QQ[DIR_PP0  * numberOfBCnodes];
-      q_dirSW  = &QQ[DIR_MM0  * numberOfBCnodes];
-      q_dirSE  = &QQ[DIR_PM0  * numberOfBCnodes];
-      q_dirNW  = &QQ[DIR_MP0  * numberOfBCnodes];
-      q_dirTE  = &QQ[DIR_P0P  * numberOfBCnodes];
-      q_dirBW  = &QQ[DIR_M0M  * numberOfBCnodes];
-      q_dirBE  = &QQ[DIR_P0M  * numberOfBCnodes];
-      q_dirTW  = &QQ[DIR_M0P  * numberOfBCnodes];
-      q_dirTN  = &QQ[DIR_0PP  * numberOfBCnodes];
-      q_dirBS  = &QQ[DIR_0MM  * numberOfBCnodes];
-      q_dirBN  = &QQ[DIR_0PM  * numberOfBCnodes];
-      q_dirTS  = &QQ[DIR_0MP  * numberOfBCnodes];
+      q_dirE   = &QQ[DIR_P00 * numberOfBCnodes];
+      q_dirW   = &QQ[DIR_M00 * numberOfBCnodes];
+      q_dirN   = &QQ[DIR_0P0 * numberOfBCnodes];
+      q_dirS   = &QQ[DIR_0M0 * numberOfBCnodes];
+      q_dirT   = &QQ[DIR_00P * numberOfBCnodes];
+      q_dirB   = &QQ[DIR_00M * numberOfBCnodes];
+      q_dirNE  = &QQ[DIR_PP0 * numberOfBCnodes];
+      q_dirSW  = &QQ[DIR_MM0 * numberOfBCnodes];
+      q_dirSE  = &QQ[DIR_PM0 * numberOfBCnodes];
+      q_dirNW  = &QQ[DIR_MP0 * numberOfBCnodes];
+      q_dirTE  = &QQ[DIR_P0P * numberOfBCnodes];
+      q_dirBW  = &QQ[DIR_M0M * numberOfBCnodes];
+      q_dirBE  = &QQ[DIR_P0M * numberOfBCnodes];
+      q_dirTW  = &QQ[DIR_M0P * numberOfBCnodes];
+      q_dirTN  = &QQ[DIR_0PP * numberOfBCnodes];
+      q_dirBS  = &QQ[DIR_0MM * numberOfBCnodes];
+      q_dirBN  = &QQ[DIR_0PM * numberOfBCnodes];
+      q_dirTS  = &QQ[DIR_0MP * numberOfBCnodes];
       q_dirTNE = &QQ[DIR_PPP * numberOfBCnodes];
       q_dirTSW = &QQ[DIR_MMP * numberOfBCnodes];
       q_dirTSE = &QQ[DIR_PMP * numberOfBCnodes];
@@ -309,32 +312,32 @@ __global__ void QStressDeviceComp27(real* DD,
       real f_E,  f_W,  f_N,  f_S,  f_T,  f_B,   f_NE,  f_SW,  f_SE,  f_NW,  f_TE,  f_BW,  f_BE,
          f_TW, f_TN, f_BS, f_BN, f_TS, f_TNE, f_TSW, f_TSE, f_TNW, f_BNE, f_BSW, f_BSE, f_BNW;
 
-      f_W    = (D.f[DIR_P00   ])[ke   ];     //post-coll f's
-      f_E    = (D.f[DIR_M00   ])[kw   ];
-      f_S    = (D.f[DIR_0P0   ])[kn   ];
-      f_N    = (D.f[DIR_0M0   ])[ks   ];
-      f_B    = (D.f[DIR_00P   ])[kt   ];
-      f_T    = (D.f[DIR_00M   ])[kb   ];
-      f_SW   = (D.f[DIR_PP0  ])[kne  ];
-      f_NE   = (D.f[DIR_MM0  ])[ksw  ];
-      f_NW   = (D.f[DIR_PM0  ])[kse  ];
-      f_SE   = (D.f[DIR_MP0  ])[knw  ];
-      f_BW   = (D.f[DIR_P0P  ])[kte  ];
-      f_TE   = (D.f[DIR_M0M  ])[kbw  ];
-      f_TW   = (D.f[DIR_P0M  ])[kbe  ];
-      f_BE   = (D.f[DIR_M0P  ])[ktw  ];
-      f_BS   = (D.f[DIR_0PP  ])[ktn  ];
-      f_TN   = (D.f[DIR_0MM  ])[kbs  ];
-      f_TS   = (D.f[DIR_0PM  ])[kbn  ];
-      f_BN   = (D.f[DIR_0MP  ])[kts  ];
-      f_BSW  = (D.f[DIR_PPP ])[ktne ];
-      f_BNE  = (D.f[DIR_MMP ])[ktsw ];
-      f_BNW  = (D.f[DIR_PMP ])[ktse ];
-      f_BSE  = (D.f[DIR_MPP ])[ktnw ];
-      f_TSW  = (D.f[DIR_PPM ])[kbne ];
-      f_TNE  = (D.f[DIR_MMM ])[kbsw ];
-      f_TNW  = (D.f[DIR_PMM ])[kbse ];
-      f_TSE  = (D.f[DIR_MPM ])[kbnw ];
+      f_W    = (D.f[DIR_P00])[ke   ];     //post-coll f's
+      f_E    = (D.f[DIR_M00])[kw   ];
+      f_S    = (D.f[DIR_0P0])[kn   ];
+      f_N    = (D.f[DIR_0M0])[ks   ];
+      f_B    = (D.f[DIR_00P])[kt   ];
+      f_T    = (D.f[DIR_00M])[kb   ];
+      f_SW   = (D.f[DIR_PP0])[kne  ];
+      f_NE   = (D.f[DIR_MM0])[ksw  ];
+      f_NW   = (D.f[DIR_PM0])[kse  ];
+      f_SE   = (D.f[DIR_MP0])[knw  ];
+      f_BW   = (D.f[DIR_P0P])[kte  ];
+      f_TE   = (D.f[DIR_M0M])[kbw  ];
+      f_TW   = (D.f[DIR_P0M])[kbe  ];
+      f_BE   = (D.f[DIR_M0P])[ktw  ];
+      f_BS   = (D.f[DIR_0PP])[ktn  ];
+      f_TN   = (D.f[DIR_0MM])[kbs  ];
+      f_TS   = (D.f[DIR_0PM])[kbn  ];
+      f_BN   = (D.f[DIR_0MP])[kts  ];
+      f_BSW  = (D.f[DIR_PPP])[ktne ];
+      f_BNE  = (D.f[DIR_MMP])[ktsw ];
+      f_BNW  = (D.f[DIR_PMP])[ktse ];
+      f_BSE  = (D.f[DIR_MPP])[ktnw ];
+      f_TSW  = (D.f[DIR_PPM])[kbne ];
+      f_TNE  = (D.f[DIR_MMM])[kbsw ];
+      f_TNW  = (D.f[DIR_PMM])[kbse ];
+      f_TSE  = (D.f[DIR_MPM])[kbnw ];
 
       ////////////////////////////////////////////////////////////////////////////////
       real vx1, vx2, vx3, drho, feq, q;
@@ -361,63 +364,63 @@ __global__ void QStressDeviceComp27(real* DD,
       //////////////////////////////////////////////////////////////////////////
       if (isEvenTimestep==false)      //get adress where incoming f's should be written to
       {
-         D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+         D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
       }
       else
       {
-         D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+         D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
       }
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       //Compute incoming f's with zero wall velocity
@@ -968,69 +971,69 @@ __global__ void BBStressDevice27( real* DD,
                                              unsigned int* neighborX,
                                              unsigned int* neighborY,
                                              unsigned int* neighborZ,
-                                             unsigned int size_Mat,
+                                             unsigned long long numberOfLBnodes,
                                              bool isEvenTimestep)
 {
    Distributions27 D;
    if (isEvenTimestep==true)
    {
-      D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+      D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
    }
    else
    {
-      D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+      D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
    }
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  x = threadIdx.x;  // Globaler x-Index
@@ -1051,24 +1054,24 @@ __global__ void BBStressDevice27( real* DD,
          *q_dirBE,  *q_dirTW,  *q_dirTN,  *q_dirBS,  *q_dirBN,  *q_dirTS,
          *q_dirTNE, *q_dirTSW, *q_dirTSE, *q_dirTNW, *q_dirBNE, *q_dirBSW,
          *q_dirBSE, *q_dirBNW;
-      q_dirE   = &QQ[DIR_P00   * numberOfBCnodes];
-      q_dirW   = &QQ[DIR_M00   * numberOfBCnodes];
-      q_dirN   = &QQ[DIR_0P0   * numberOfBCnodes];
-      q_dirS   = &QQ[DIR_0M0   * numberOfBCnodes];
-      q_dirT   = &QQ[DIR_00P   * numberOfBCnodes];
-      q_dirB   = &QQ[DIR_00M   * numberOfBCnodes];
-      q_dirNE  = &QQ[DIR_PP0  * numberOfBCnodes];
-      q_dirSW  = &QQ[DIR_MM0  * numberOfBCnodes];
-      q_dirSE  = &QQ[DIR_PM0  * numberOfBCnodes];
-      q_dirNW  = &QQ[DIR_MP0  * numberOfBCnodes];
-      q_dirTE  = &QQ[DIR_P0P  * numberOfBCnodes];
-      q_dirBW  = &QQ[DIR_M0M  * numberOfBCnodes];
-      q_dirBE  = &QQ[DIR_P0M  * numberOfBCnodes];
-      q_dirTW  = &QQ[DIR_M0P  * numberOfBCnodes];
-      q_dirTN  = &QQ[DIR_0PP  * numberOfBCnodes];
-      q_dirBS  = &QQ[DIR_0MM  * numberOfBCnodes];
-      q_dirBN  = &QQ[DIR_0PM  * numberOfBCnodes];
-      q_dirTS  = &QQ[DIR_0MP  * numberOfBCnodes];
+      q_dirE   = &QQ[DIR_P00 * numberOfBCnodes];
+      q_dirW   = &QQ[DIR_M00 * numberOfBCnodes];
+      q_dirN   = &QQ[DIR_0P0 * numberOfBCnodes];
+      q_dirS   = &QQ[DIR_0M0 * numberOfBCnodes];
+      q_dirT   = &QQ[DIR_00P * numberOfBCnodes];
+      q_dirB   = &QQ[DIR_00M * numberOfBCnodes];
+      q_dirNE  = &QQ[DIR_PP0 * numberOfBCnodes];
+      q_dirSW  = &QQ[DIR_MM0 * numberOfBCnodes];
+      q_dirSE  = &QQ[DIR_PM0 * numberOfBCnodes];
+      q_dirNW  = &QQ[DIR_MP0 * numberOfBCnodes];
+      q_dirTE  = &QQ[DIR_P0P * numberOfBCnodes];
+      q_dirBW  = &QQ[DIR_M0M * numberOfBCnodes];
+      q_dirBE  = &QQ[DIR_P0M * numberOfBCnodes];
+      q_dirTW  = &QQ[DIR_M0P * numberOfBCnodes];
+      q_dirTN  = &QQ[DIR_0PP * numberOfBCnodes];
+      q_dirBS  = &QQ[DIR_0MM * numberOfBCnodes];
+      q_dirBN  = &QQ[DIR_0PM * numberOfBCnodes];
+      q_dirTS  = &QQ[DIR_0MP * numberOfBCnodes];
       q_dirTNE = &QQ[DIR_PPP * numberOfBCnodes];
       q_dirTSW = &QQ[DIR_MMP * numberOfBCnodes];
       q_dirTSE = &QQ[DIR_PMP * numberOfBCnodes];
@@ -1112,32 +1115,32 @@ __global__ void BBStressDevice27( real* DD,
       real f_E,  f_W,  f_N,  f_S,  f_T,  f_B,   f_NE,  f_SW,  f_SE,  f_NW,  f_TE,  f_BW,  f_BE,
          f_TW, f_TN, f_BS, f_BN, f_TS, f_TNE, f_TSW, f_TSE, f_TNW, f_BNE, f_BSW, f_BSE, f_BNW;
 
-      f_W    = (D.f[DIR_P00   ])[ke   ];
-      f_E    = (D.f[DIR_M00   ])[kw   ];
-      f_S    = (D.f[DIR_0P0   ])[kn   ];
-      f_N    = (D.f[DIR_0M0   ])[ks   ];
-      f_B    = (D.f[DIR_00P   ])[kt   ];
-      f_T    = (D.f[DIR_00M   ])[kb   ];
-      f_SW   = (D.f[DIR_PP0  ])[kne  ];
-      f_NE   = (D.f[DIR_MM0  ])[ksw  ];
-      f_NW   = (D.f[DIR_PM0  ])[kse  ];
-      f_SE   = (D.f[DIR_MP0  ])[knw  ];
-      f_BW   = (D.f[DIR_P0P  ])[kte  ];
-      f_TE   = (D.f[DIR_M0M  ])[kbw  ];
-      f_TW   = (D.f[DIR_P0M  ])[kbe  ];
-      f_BE   = (D.f[DIR_M0P  ])[ktw  ];
-      f_BS   = (D.f[DIR_0PP  ])[ktn  ];
-      f_TN   = (D.f[DIR_0MM  ])[kbs  ];
-      f_TS   = (D.f[DIR_0PM  ])[kbn  ];
-      f_BN   = (D.f[DIR_0MP  ])[kts  ];
-      f_BSW  = (D.f[DIR_PPP ])[ktne ];
-      f_BNE  = (D.f[DIR_MMP ])[ktsw ];
-      f_BNW  = (D.f[DIR_PMP ])[ktse ];
-      f_BSE  = (D.f[DIR_MPP ])[ktnw ];
-      f_TSW  = (D.f[DIR_PPM ])[kbne ];
-      f_TNE  = (D.f[DIR_MMM ])[kbsw ];
-      f_TNW  = (D.f[DIR_PMM ])[kbse ];
-      f_TSE  = (D.f[DIR_MPM ])[kbnw ];
+      f_W    = (D.f[DIR_P00])[ke   ];
+      f_E    = (D.f[DIR_M00])[kw   ];
+      f_S    = (D.f[DIR_0P0])[kn   ];
+      f_N    = (D.f[DIR_0M0])[ks   ];
+      f_B    = (D.f[DIR_00P])[kt   ];
+      f_T    = (D.f[DIR_00M])[kb   ];
+      f_SW   = (D.f[DIR_PP0])[kne  ];
+      f_NE   = (D.f[DIR_MM0])[ksw  ];
+      f_NW   = (D.f[DIR_PM0])[kse  ];
+      f_SE   = (D.f[DIR_MP0])[knw  ];
+      f_BW   = (D.f[DIR_P0P])[kte  ];
+      f_TE   = (D.f[DIR_M0M])[kbw  ];
+      f_TW   = (D.f[DIR_P0M])[kbe  ];
+      f_BE   = (D.f[DIR_M0P])[ktw  ];
+      f_BS   = (D.f[DIR_0PP])[ktn  ];
+      f_TN   = (D.f[DIR_0MM])[kbs  ];
+      f_TS   = (D.f[DIR_0PM])[kbn  ];
+      f_BN   = (D.f[DIR_0MP])[kts  ];
+      f_BSW  = (D.f[DIR_PPP])[ktne ];
+      f_BNE  = (D.f[DIR_MMP])[ktsw ];
+      f_BNW  = (D.f[DIR_PMP])[ktse ];
+      f_BSE  = (D.f[DIR_MPP])[ktnw ];
+      f_TSW  = (D.f[DIR_PPM])[kbne ];
+      f_TNE  = (D.f[DIR_MMM])[kbsw ];
+      f_TNW  = (D.f[DIR_PMM])[kbse ];
+      f_TSE  = (D.f[DIR_MPM])[kbnw ];
 
       ////////////////////////////////////////////////////////////////////////////////
       real vx1, vx2, vx3, drho;
@@ -1161,63 +1164,63 @@ __global__ void BBStressDevice27( real* DD,
       //////////////////////////////////////////////////////////////////////////
       if (isEvenTimestep==false)
       {
-         D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+         D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
       }
       else
       {
-         D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+         D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
       }
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       real f_E_in,  f_W_in,  f_N_in,  f_S_in,  f_T_in,  f_B_in,   f_NE_in,  f_SW_in,  f_SE_in,  f_NW_in,  f_TE_in,  f_BW_in,  f_BE_in,
@@ -1715,69 +1718,69 @@ __global__ void BBStressPressureDevice27( real* DD,
                                              unsigned int* neighborX,
                                              unsigned int* neighborY,
                                              unsigned int* neighborZ,
-                                             unsigned int size_Mat,
+                                             unsigned long long numberOfLBnodes,
                                              bool isEvenTimestep)
 {
    Distributions27 D;
    if (isEvenTimestep==true)
    {
-      D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+      D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
    }
    else
    {
-      D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+      D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
    }
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  x = threadIdx.x;  // Globaler x-Index
@@ -1798,24 +1801,24 @@ __global__ void BBStressPressureDevice27( real* DD,
          *q_dirBE,  *q_dirTW,  *q_dirTN,  *q_dirBS,  *q_dirBN,  *q_dirTS,
          *q_dirTNE, *q_dirTSW, *q_dirTSE, *q_dirTNW, *q_dirBNE, *q_dirBSW,
          *q_dirBSE, *q_dirBNW;
-      q_dirE   = &QQ[DIR_P00   * numberOfBCnodes];
-      q_dirW   = &QQ[DIR_M00   * numberOfBCnodes];
-      q_dirN   = &QQ[DIR_0P0   * numberOfBCnodes];
-      q_dirS   = &QQ[DIR_0M0   * numberOfBCnodes];
-      q_dirT   = &QQ[DIR_00P   * numberOfBCnodes];
-      q_dirB   = &QQ[DIR_00M   * numberOfBCnodes];
-      q_dirNE  = &QQ[DIR_PP0  * numberOfBCnodes];
-      q_dirSW  = &QQ[DIR_MM0  * numberOfBCnodes];
-      q_dirSE  = &QQ[DIR_PM0  * numberOfBCnodes];
-      q_dirNW  = &QQ[DIR_MP0  * numberOfBCnodes];
-      q_dirTE  = &QQ[DIR_P0P  * numberOfBCnodes];
-      q_dirBW  = &QQ[DIR_M0M  * numberOfBCnodes];
-      q_dirBE  = &QQ[DIR_P0M  * numberOfBCnodes];
-      q_dirTW  = &QQ[DIR_M0P  * numberOfBCnodes];
-      q_dirTN  = &QQ[DIR_0PP  * numberOfBCnodes];
-      q_dirBS  = &QQ[DIR_0MM  * numberOfBCnodes];
-      q_dirBN  = &QQ[DIR_0PM  * numberOfBCnodes];
-      q_dirTS  = &QQ[DIR_0MP  * numberOfBCnodes];
+      q_dirE   = &QQ[DIR_P00 * numberOfBCnodes];
+      q_dirW   = &QQ[DIR_M00 * numberOfBCnodes];
+      q_dirN   = &QQ[DIR_0P0 * numberOfBCnodes];
+      q_dirS   = &QQ[DIR_0M0 * numberOfBCnodes];
+      q_dirT   = &QQ[DIR_00P * numberOfBCnodes];
+      q_dirB   = &QQ[DIR_00M * numberOfBCnodes];
+      q_dirNE  = &QQ[DIR_PP0 * numberOfBCnodes];
+      q_dirSW  = &QQ[DIR_MM0 * numberOfBCnodes];
+      q_dirSE  = &QQ[DIR_PM0 * numberOfBCnodes];
+      q_dirNW  = &QQ[DIR_MP0 * numberOfBCnodes];
+      q_dirTE  = &QQ[DIR_P0P * numberOfBCnodes];
+      q_dirBW  = &QQ[DIR_M0M * numberOfBCnodes];
+      q_dirBE  = &QQ[DIR_P0M * numberOfBCnodes];
+      q_dirTW  = &QQ[DIR_M0P * numberOfBCnodes];
+      q_dirTN  = &QQ[DIR_0PP * numberOfBCnodes];
+      q_dirBS  = &QQ[DIR_0MM * numberOfBCnodes];
+      q_dirBN  = &QQ[DIR_0PM * numberOfBCnodes];
+      q_dirTS  = &QQ[DIR_0MP * numberOfBCnodes];
       q_dirTNE = &QQ[DIR_PPP * numberOfBCnodes];
       q_dirTSW = &QQ[DIR_MMP * numberOfBCnodes];
       q_dirTSE = &QQ[DIR_PMP * numberOfBCnodes];
@@ -1859,32 +1862,32 @@ __global__ void BBStressPressureDevice27( real* DD,
       real f_E,  f_W,  f_N,  f_S,  f_T,  f_B,   f_NE,  f_SW,  f_SE,  f_NW,  f_TE,  f_BW,  f_BE,
          f_TW, f_TN, f_BS, f_BN, f_TS, f_TNE, f_TSW, f_TSE, f_TNW, f_BNE, f_BSW, f_BSE, f_BNW;
 
-      f_W    = (D.f[DIR_P00   ])[ke   ];
-      f_E    = (D.f[DIR_M00   ])[kw   ];
-      f_S    = (D.f[DIR_0P0   ])[kn   ];
-      f_N    = (D.f[DIR_0M0   ])[ks   ];
-      f_B    = (D.f[DIR_00P   ])[kt   ];
-      f_T    = (D.f[DIR_00M   ])[kb   ];
-      f_SW   = (D.f[DIR_PP0  ])[kne  ];
-      f_NE   = (D.f[DIR_MM0  ])[ksw  ];
-      f_NW   = (D.f[DIR_PM0  ])[kse  ];
-      f_SE   = (D.f[DIR_MP0  ])[knw  ];
-      f_BW   = (D.f[DIR_P0P  ])[kte  ];
-      f_TE   = (D.f[DIR_M0M  ])[kbw  ];
-      f_TW   = (D.f[DIR_P0M  ])[kbe  ];
-      f_BE   = (D.f[DIR_M0P  ])[ktw  ];
-      f_BS   = (D.f[DIR_0PP  ])[ktn  ];
-      f_TN   = (D.f[DIR_0MM  ])[kbs  ];
-      f_TS   = (D.f[DIR_0PM  ])[kbn  ];
-      f_BN   = (D.f[DIR_0MP  ])[kts  ];
-      f_BSW  = (D.f[DIR_PPP ])[ktne ];
-      f_BNE  = (D.f[DIR_MMP ])[ktsw ];
-      f_BNW  = (D.f[DIR_PMP ])[ktse ];
-      f_BSE  = (D.f[DIR_MPP ])[ktnw ];
-      f_TSW  = (D.f[DIR_PPM ])[kbne ];
-      f_TNE  = (D.f[DIR_MMM ])[kbsw ];
-      f_TNW  = (D.f[DIR_PMM ])[kbse ];
-      f_TSE  = (D.f[DIR_MPM ])[kbnw ];
+      f_W    = (D.f[DIR_P00])[ke   ];
+      f_E    = (D.f[DIR_M00])[kw   ];
+      f_S    = (D.f[DIR_0P0])[kn   ];
+      f_N    = (D.f[DIR_0M0])[ks   ];
+      f_B    = (D.f[DIR_00P])[kt   ];
+      f_T    = (D.f[DIR_00M])[kb   ];
+      f_SW   = (D.f[DIR_PP0])[kne  ];
+      f_NE   = (D.f[DIR_MM0])[ksw  ];
+      f_NW   = (D.f[DIR_PM0])[kse  ];
+      f_SE   = (D.f[DIR_MP0])[knw  ];
+      f_BW   = (D.f[DIR_P0P])[kte  ];
+      f_TE   = (D.f[DIR_M0M])[kbw  ];
+      f_TW   = (D.f[DIR_P0M])[kbe  ];
+      f_BE   = (D.f[DIR_M0P])[ktw  ];
+      f_BS   = (D.f[DIR_0PP])[ktn  ];
+      f_TN   = (D.f[DIR_0MM])[kbs  ];
+      f_TS   = (D.f[DIR_0PM])[kbn  ];
+      f_BN   = (D.f[DIR_0MP])[kts  ];
+      f_BSW  = (D.f[DIR_PPP])[ktne ];
+      f_BNE  = (D.f[DIR_MMP])[ktsw ];
+      f_BNW  = (D.f[DIR_PMP])[ktse ];
+      f_BSE  = (D.f[DIR_MPP])[ktnw ];
+      f_TSW  = (D.f[DIR_PPM])[kbne ];
+      f_TNE  = (D.f[DIR_MMM])[kbsw ];
+      f_TNW  = (D.f[DIR_PMM])[kbse ];
+      f_TSE  = (D.f[DIR_MPM])[kbnw ];
 
       ////////////////////////////////////////////////////////////////////////////////
       real vx1, vx2, vx3, drho;
@@ -1908,63 +1911,63 @@ __global__ void BBStressPressureDevice27( real* DD,
       //////////////////////////////////////////////////////////////////////////
       if (isEvenTimestep==false)
       {
-         D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+         D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
       }
       else
       {
-         D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+         D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
       }
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       real f_E_in,  f_W_in,  f_N_in,  f_S_in,  f_T_in,  f_B_in,   f_NE_in,  f_SW_in,  f_SE_in,  f_NW_in,  f_TE_in,  f_BW_in,  f_BE_in,
diff --git a/src/gpu/VirtualFluids_GPU/GPU/ThinWallBCs27.cu b/src/gpu/VirtualFluids_GPU/GPU/ThinWallBCs27.cu
index 55f810628f370976289d1492e9916d5d3fa0dbb8..b96d961c9b92ae5d041beeb23482d7144e7a8acb 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/ThinWallBCs27.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/ThinWallBCs27.cu
@@ -27,69 +27,69 @@ __global__ void QVelDeviceCompThinWallsPartOne27(
 	uint* neighborX,
 	uint* neighborY,
 	uint* neighborZ,
-	uint size_Mat, 
+	unsigned long long numberOfLBnodes, 
 	bool isEvenTimestep)
 {
    Distributions27 D;
    if (isEvenTimestep==true)
    {
-      D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+      D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
    } 
    else
    {
-      D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+      D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
    }
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  x = threadIdx.x;  // Globaler x-Index 
@@ -114,24 +114,24 @@ __global__ void QVelDeviceCompThinWallsPartOne27(
             *q_dirBE,  *q_dirTW,  *q_dirTN,  *q_dirBS,  *q_dirBN,  *q_dirTS,
             *q_dirTNE, *q_dirTSW, *q_dirTSE, *q_dirTNW, *q_dirBNE, *q_dirBSW,
             *q_dirBSE, *q_dirBNW; 
-      q_dirE   = &QQ[DIR_P00   * numberOfBCnodes];
-      q_dirW   = &QQ[DIR_M00   * numberOfBCnodes];
-      q_dirN   = &QQ[DIR_0P0   * numberOfBCnodes];
-      q_dirS   = &QQ[DIR_0M0   * numberOfBCnodes];
-      q_dirT   = &QQ[DIR_00P   * numberOfBCnodes];
-      q_dirB   = &QQ[DIR_00M   * numberOfBCnodes];
-      q_dirNE  = &QQ[DIR_PP0  * numberOfBCnodes];
-      q_dirSW  = &QQ[DIR_MM0  * numberOfBCnodes];
-      q_dirSE  = &QQ[DIR_PM0  * numberOfBCnodes];
-      q_dirNW  = &QQ[DIR_MP0  * numberOfBCnodes];
-      q_dirTE  = &QQ[DIR_P0P  * numberOfBCnodes];
-      q_dirBW  = &QQ[DIR_M0M  * numberOfBCnodes];
-      q_dirBE  = &QQ[DIR_P0M  * numberOfBCnodes];
-      q_dirTW  = &QQ[DIR_M0P  * numberOfBCnodes];
-      q_dirTN  = &QQ[DIR_0PP  * numberOfBCnodes];
-      q_dirBS  = &QQ[DIR_0MM  * numberOfBCnodes];
-      q_dirBN  = &QQ[DIR_0PM  * numberOfBCnodes];
-      q_dirTS  = &QQ[DIR_0MP  * numberOfBCnodes];
+      q_dirE   = &QQ[DIR_P00 * numberOfBCnodes];
+      q_dirW   = &QQ[DIR_M00 * numberOfBCnodes];
+      q_dirN   = &QQ[DIR_0P0 * numberOfBCnodes];
+      q_dirS   = &QQ[DIR_0M0 * numberOfBCnodes];
+      q_dirT   = &QQ[DIR_00P * numberOfBCnodes];
+      q_dirB   = &QQ[DIR_00M * numberOfBCnodes];
+      q_dirNE  = &QQ[DIR_PP0 * numberOfBCnodes];
+      q_dirSW  = &QQ[DIR_MM0 * numberOfBCnodes];
+      q_dirSE  = &QQ[DIR_PM0 * numberOfBCnodes];
+      q_dirNW  = &QQ[DIR_MP0 * numberOfBCnodes];
+      q_dirTE  = &QQ[DIR_P0P * numberOfBCnodes];
+      q_dirBW  = &QQ[DIR_M0M * numberOfBCnodes];
+      q_dirBE  = &QQ[DIR_P0M * numberOfBCnodes];
+      q_dirTW  = &QQ[DIR_M0P * numberOfBCnodes];
+      q_dirTN  = &QQ[DIR_0PP * numberOfBCnodes];
+      q_dirBS  = &QQ[DIR_0MM * numberOfBCnodes];
+      q_dirBN  = &QQ[DIR_0PM * numberOfBCnodes];
+      q_dirTS  = &QQ[DIR_0MP * numberOfBCnodes];
       q_dirTNE = &QQ[DIR_PPP * numberOfBCnodes];
       q_dirTSW = &QQ[DIR_MMP * numberOfBCnodes];
       q_dirTSE = &QQ[DIR_PMP * numberOfBCnodes];
@@ -174,32 +174,32 @@ __global__ void QVelDeviceCompThinWallsPartOne27(
       real f_E,  f_W,  f_N,  f_S,  f_T,  f_B,   f_NE,  f_SW,  f_SE,  f_NW,  f_TE,  f_BW,  f_BE,
          f_TW, f_TN, f_BS, f_BN, f_TS, f_TNE, f_TSW, f_TSE, f_TNW, f_BNE, f_BSW, f_BSE, f_BNW;
 
-      f_W    = (D.f[DIR_P00   ])[ke   ];
-      f_E    = (D.f[DIR_M00   ])[kw   ];
-      f_S    = (D.f[DIR_0P0   ])[kn   ];
-      f_N    = (D.f[DIR_0M0   ])[ks   ];
-      f_B    = (D.f[DIR_00P   ])[kt   ];
-      f_T    = (D.f[DIR_00M   ])[kb   ];
-      f_SW   = (D.f[DIR_PP0  ])[kne  ];
-      f_NE   = (D.f[DIR_MM0  ])[ksw  ];
-      f_NW   = (D.f[DIR_PM0  ])[kse  ];
-      f_SE   = (D.f[DIR_MP0  ])[knw  ];
-      f_BW   = (D.f[DIR_P0P  ])[kte  ];
-      f_TE   = (D.f[DIR_M0M  ])[kbw  ];
-      f_TW   = (D.f[DIR_P0M  ])[kbe  ];
-      f_BE   = (D.f[DIR_M0P  ])[ktw  ];
-      f_BS   = (D.f[DIR_0PP  ])[ktn  ];
-      f_TN   = (D.f[DIR_0MM  ])[kbs  ];
-      f_TS   = (D.f[DIR_0PM  ])[kbn  ];
-      f_BN   = (D.f[DIR_0MP  ])[kts  ];
-      f_BSW  = (D.f[DIR_PPP ])[ktne ];
-      f_BNE  = (D.f[DIR_MMP ])[ktsw ];
-      f_BNW  = (D.f[DIR_PMP ])[ktse ];
-      f_BSE  = (D.f[DIR_MPP ])[ktnw ];
-      f_TSW  = (D.f[DIR_PPM ])[kbne ];
-      f_TNE  = (D.f[DIR_MMM ])[kbsw ];
-      f_TNW  = (D.f[DIR_PMM ])[kbse ];
-      f_TSE  = (D.f[DIR_MPM ])[kbnw ];
+      f_W    = (D.f[DIR_P00])[ke   ];
+      f_E    = (D.f[DIR_M00])[kw   ];
+      f_S    = (D.f[DIR_0P0])[kn   ];
+      f_N    = (D.f[DIR_0M0])[ks   ];
+      f_B    = (D.f[DIR_00P])[kt   ];
+      f_T    = (D.f[DIR_00M])[kb   ];
+      f_SW   = (D.f[DIR_PP0])[kne  ];
+      f_NE   = (D.f[DIR_MM0])[ksw  ];
+      f_NW   = (D.f[DIR_PM0])[kse  ];
+      f_SE   = (D.f[DIR_MP0])[knw  ];
+      f_BW   = (D.f[DIR_P0P])[kte  ];
+      f_TE   = (D.f[DIR_M0M])[kbw  ];
+      f_TW   = (D.f[DIR_P0M])[kbe  ];
+      f_BE   = (D.f[DIR_M0P])[ktw  ];
+      f_BS   = (D.f[DIR_0PP])[ktn  ];
+      f_TN   = (D.f[DIR_0MM])[kbs  ];
+      f_TS   = (D.f[DIR_0PM])[kbn  ];
+      f_BN   = (D.f[DIR_0MP])[kts  ];
+      f_BSW  = (D.f[DIR_PPP])[ktne ];
+      f_BNE  = (D.f[DIR_MMP])[ktsw ];
+      f_BNW  = (D.f[DIR_PMP])[ktse ];
+      f_BSE  = (D.f[DIR_MPP])[ktnw ];
+      f_TSW  = (D.f[DIR_PPM])[kbne ];
+      f_TNE  = (D.f[DIR_MMM])[kbsw ];
+      f_TNW  = (D.f[DIR_PMM])[kbse ];
+      f_TSE  = (D.f[DIR_MPM])[kbnw ];
       ////////////////////////////////////////////////////////////////////////////////
       real vx1, vx2, vx3, drho, feq, q;
       drho   =  f_TSE + f_TNW + f_TNE + f_TSW + f_BSE + f_BNW + f_BNE + f_BSW +
@@ -456,69 +456,69 @@ __global__ void QDeviceCompThinWallsPartOne27(
 	unsigned int* neighborX,
 	unsigned int* neighborY,
 	unsigned int* neighborZ,
-	unsigned int size_Mat,
+	unsigned long long numberOfLBnodes,
 	bool isEvenTimestep)
 {
 	Distributions27 D;
 	if (isEvenTimestep == true)
 	{
-		D.f[DIR_P00] = &DD[DIR_P00   *size_Mat];
-		D.f[DIR_M00] = &DD[DIR_M00   *size_Mat];
-		D.f[DIR_0P0] = &DD[DIR_0P0   *size_Mat];
-		D.f[DIR_0M0] = &DD[DIR_0M0   *size_Mat];
-		D.f[DIR_00P] = &DD[DIR_00P   *size_Mat];
-		D.f[DIR_00M] = &DD[DIR_00M   *size_Mat];
-		D.f[DIR_PP0] = &DD[DIR_PP0  *size_Mat];
-		D.f[DIR_MM0] = &DD[DIR_MM0  *size_Mat];
-		D.f[DIR_PM0] = &DD[DIR_PM0  *size_Mat];
-		D.f[DIR_MP0] = &DD[DIR_MP0  *size_Mat];
-		D.f[DIR_P0P] = &DD[DIR_P0P  *size_Mat];
-		D.f[DIR_M0M] = &DD[DIR_M0M  *size_Mat];
-		D.f[DIR_P0M] = &DD[DIR_P0M  *size_Mat];
-		D.f[DIR_M0P] = &DD[DIR_M0P  *size_Mat];
-		D.f[DIR_0PP] = &DD[DIR_0PP  *size_Mat];
-		D.f[DIR_0MM] = &DD[DIR_0MM  *size_Mat];
-		D.f[DIR_0PM] = &DD[DIR_0PM  *size_Mat];
-		D.f[DIR_0MP] = &DD[DIR_0MP  *size_Mat];
-		D.f[DIR_000] = &DD[DIR_000*size_Mat];
-		D.f[DIR_PPP] = &DD[DIR_PPP *size_Mat];
-		D.f[DIR_MMP] = &DD[DIR_MMP *size_Mat];
-		D.f[DIR_PMP] = &DD[DIR_PMP *size_Mat];
-		D.f[DIR_MPP] = &DD[DIR_MPP *size_Mat];
-		D.f[DIR_PPM] = &DD[DIR_PPM *size_Mat];
-		D.f[DIR_MMM] = &DD[DIR_MMM *size_Mat];
-		D.f[DIR_PMM] = &DD[DIR_PMM *size_Mat];
-		D.f[DIR_MPM] = &DD[DIR_MPM *size_Mat];
+		D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+		D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+		D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+		D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+		D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+		D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+		D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+		D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+		D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+		D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+		D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+		D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+		D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+		D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+		D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+		D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+		D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+		D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+		D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+		D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+		D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+		D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+		D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+		D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+		D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+		D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+		D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
 	}
 	else
 	{
-		D.f[DIR_M00] = &DD[DIR_P00   *size_Mat];
-		D.f[DIR_P00] = &DD[DIR_M00   *size_Mat];
-		D.f[DIR_0M0] = &DD[DIR_0P0   *size_Mat];
-		D.f[DIR_0P0] = &DD[DIR_0M0   *size_Mat];
-		D.f[DIR_00M] = &DD[DIR_00P   *size_Mat];
-		D.f[DIR_00P] = &DD[DIR_00M   *size_Mat];
-		D.f[DIR_MM0] = &DD[DIR_PP0  *size_Mat];
-		D.f[DIR_PP0] = &DD[DIR_MM0  *size_Mat];
-		D.f[DIR_MP0] = &DD[DIR_PM0  *size_Mat];
-		D.f[DIR_PM0] = &DD[DIR_MP0  *size_Mat];
-		D.f[DIR_M0M] = &DD[DIR_P0P  *size_Mat];
-		D.f[DIR_P0P] = &DD[DIR_M0M  *size_Mat];
-		D.f[DIR_M0P] = &DD[DIR_P0M  *size_Mat];
-		D.f[DIR_P0M] = &DD[DIR_M0P  *size_Mat];
-		D.f[DIR_0MM] = &DD[DIR_0PP  *size_Mat];
-		D.f[DIR_0PP] = &DD[DIR_0MM  *size_Mat];
-		D.f[DIR_0MP] = &DD[DIR_0PM  *size_Mat];
-		D.f[DIR_0PM] = &DD[DIR_0MP  *size_Mat];
-		D.f[DIR_000] = &DD[DIR_000*size_Mat];
-		D.f[DIR_PPP] = &DD[DIR_MMM *size_Mat];
-		D.f[DIR_MMP] = &DD[DIR_PPM *size_Mat];
-		D.f[DIR_PMP] = &DD[DIR_MPM *size_Mat];
-		D.f[DIR_MPP] = &DD[DIR_PMM *size_Mat];
-		D.f[DIR_PPM] = &DD[DIR_MMP *size_Mat];
-		D.f[DIR_MMM] = &DD[DIR_PPP *size_Mat];
-		D.f[DIR_PMM] = &DD[DIR_MPP *size_Mat];
-		D.f[DIR_MPM] = &DD[DIR_PMP *size_Mat];
+		D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+		D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+		D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+		D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+		D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+		D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+		D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+		D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+		D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+		D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+		D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+		D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+		D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+		D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+		D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+		D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+		D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+		D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+		D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+		D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+		D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+		D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+		D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+		D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+		D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+		D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+		D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
 	}
 	////////////////////////////////////////////////////////////////////////////////
 	const unsigned  x = threadIdx.x;  // Globaler x-Index 
@@ -539,24 +539,24 @@ __global__ void QDeviceCompThinWallsPartOne27(
 			*q_dirBE, *q_dirTW, *q_dirTN, *q_dirBS, *q_dirBN, *q_dirTS,
 			*q_dirTNE, *q_dirTSW, *q_dirTSE, *q_dirTNW, *q_dirBNE, *q_dirBSW,
 			*q_dirBSE, *q_dirBNW;
-		q_dirE = &QQ[DIR_P00   * numberOfBCnodes];
-		q_dirW = &QQ[DIR_M00   * numberOfBCnodes];
-		q_dirN = &QQ[DIR_0P0   * numberOfBCnodes];
-		q_dirS = &QQ[DIR_0M0   * numberOfBCnodes];
-		q_dirT = &QQ[DIR_00P   * numberOfBCnodes];
-		q_dirB = &QQ[DIR_00M   * numberOfBCnodes];
-		q_dirNE = &QQ[DIR_PP0  * numberOfBCnodes];
-		q_dirSW = &QQ[DIR_MM0  * numberOfBCnodes];
-		q_dirSE = &QQ[DIR_PM0  * numberOfBCnodes];
-		q_dirNW = &QQ[DIR_MP0  * numberOfBCnodes];
-		q_dirTE = &QQ[DIR_P0P  * numberOfBCnodes];
-		q_dirBW = &QQ[DIR_M0M  * numberOfBCnodes];
-		q_dirBE = &QQ[DIR_P0M  * numberOfBCnodes];
-		q_dirTW = &QQ[DIR_M0P  * numberOfBCnodes];
-		q_dirTN = &QQ[DIR_0PP  * numberOfBCnodes];
-		q_dirBS = &QQ[DIR_0MM  * numberOfBCnodes];
-		q_dirBN = &QQ[DIR_0PM  * numberOfBCnodes];
-		q_dirTS = &QQ[DIR_0MP  * numberOfBCnodes];
+		q_dirE = &QQ[DIR_P00 * numberOfBCnodes];
+		q_dirW = &QQ[DIR_M00 * numberOfBCnodes];
+		q_dirN = &QQ[DIR_0P0 * numberOfBCnodes];
+		q_dirS = &QQ[DIR_0M0 * numberOfBCnodes];
+		q_dirT = &QQ[DIR_00P * numberOfBCnodes];
+		q_dirB = &QQ[DIR_00M * numberOfBCnodes];
+		q_dirNE = &QQ[DIR_PP0 * numberOfBCnodes];
+		q_dirSW = &QQ[DIR_MM0 * numberOfBCnodes];
+		q_dirSE = &QQ[DIR_PM0 * numberOfBCnodes];
+		q_dirNW = &QQ[DIR_MP0 * numberOfBCnodes];
+		q_dirTE = &QQ[DIR_P0P * numberOfBCnodes];
+		q_dirBW = &QQ[DIR_M0M * numberOfBCnodes];
+		q_dirBE = &QQ[DIR_P0M * numberOfBCnodes];
+		q_dirTW = &QQ[DIR_M0P * numberOfBCnodes];
+		q_dirTN = &QQ[DIR_0PP * numberOfBCnodes];
+		q_dirBS = &QQ[DIR_0MM * numberOfBCnodes];
+		q_dirBN = &QQ[DIR_0PM * numberOfBCnodes];
+		q_dirTS = &QQ[DIR_0MP * numberOfBCnodes];
 		q_dirTNE = &QQ[DIR_PPP * numberOfBCnodes];
 		q_dirTSW = &QQ[DIR_MMP * numberOfBCnodes];
 		q_dirTSE = &QQ[DIR_PMP * numberOfBCnodes];
@@ -882,7 +882,7 @@ __global__ void QThinWallsPartTwo27(
 	uint* neighborY,
 	uint* neighborZ,
 	uint* neighborWSB,
-	uint size_Mat, 
+	unsigned long long numberOfLBnodes, 
 	bool isEvenTimestep)
 {
    ////////////////////////////////////////////////////////////////////////////////
@@ -904,24 +904,24 @@ __global__ void QThinWallsPartTwo27(
             *q_dirBE,  *q_dirTW,  *q_dirTN,  *q_dirBS,  *q_dirBN,  *q_dirTS,
             *q_dirTNE, *q_dirTSW, *q_dirTSE, *q_dirTNW, *q_dirBNE, *q_dirBSW,
             *q_dirBSE, *q_dirBNW; 
-      q_dirE   = &QQ[DIR_P00   * numberOfBCnodes];
-      q_dirW   = &QQ[DIR_M00   * numberOfBCnodes];
-      q_dirN   = &QQ[DIR_0P0   * numberOfBCnodes];
-      q_dirS   = &QQ[DIR_0M0   * numberOfBCnodes];
-      q_dirT   = &QQ[DIR_00P   * numberOfBCnodes];
-      q_dirB   = &QQ[DIR_00M   * numberOfBCnodes];
-      q_dirNE  = &QQ[DIR_PP0  * numberOfBCnodes];
-      q_dirSW  = &QQ[DIR_MM0  * numberOfBCnodes];
-      q_dirSE  = &QQ[DIR_PM0  * numberOfBCnodes];
-      q_dirNW  = &QQ[DIR_MP0  * numberOfBCnodes];
-      q_dirTE  = &QQ[DIR_P0P  * numberOfBCnodes];
-      q_dirBW  = &QQ[DIR_M0M  * numberOfBCnodes];
-      q_dirBE  = &QQ[DIR_P0M  * numberOfBCnodes];
-      q_dirTW  = &QQ[DIR_M0P  * numberOfBCnodes];
-      q_dirTN  = &QQ[DIR_0PP  * numberOfBCnodes];
-      q_dirBS  = &QQ[DIR_0MM  * numberOfBCnodes];
-      q_dirBN  = &QQ[DIR_0PM  * numberOfBCnodes];
-      q_dirTS  = &QQ[DIR_0MP  * numberOfBCnodes];
+      q_dirE   = &QQ[DIR_P00 * numberOfBCnodes];
+      q_dirW   = &QQ[DIR_M00 * numberOfBCnodes];
+      q_dirN   = &QQ[DIR_0P0 * numberOfBCnodes];
+      q_dirS   = &QQ[DIR_0M0 * numberOfBCnodes];
+      q_dirT   = &QQ[DIR_00P * numberOfBCnodes];
+      q_dirB   = &QQ[DIR_00M * numberOfBCnodes];
+      q_dirNE  = &QQ[DIR_PP0 * numberOfBCnodes];
+      q_dirSW  = &QQ[DIR_MM0 * numberOfBCnodes];
+      q_dirSE  = &QQ[DIR_PM0 * numberOfBCnodes];
+      q_dirNW  = &QQ[DIR_MP0 * numberOfBCnodes];
+      q_dirTE  = &QQ[DIR_P0P * numberOfBCnodes];
+      q_dirBW  = &QQ[DIR_M0M * numberOfBCnodes];
+      q_dirBE  = &QQ[DIR_P0M * numberOfBCnodes];
+      q_dirTW  = &QQ[DIR_M0P * numberOfBCnodes];
+      q_dirTN  = &QQ[DIR_0PP * numberOfBCnodes];
+      q_dirBS  = &QQ[DIR_0MM * numberOfBCnodes];
+      q_dirBN  = &QQ[DIR_0PM * numberOfBCnodes];
+      q_dirTS  = &QQ[DIR_0MP * numberOfBCnodes];
       q_dirTNE = &QQ[DIR_PPP * numberOfBCnodes];
       q_dirTSW = &QQ[DIR_MMP * numberOfBCnodes];
       q_dirTSE = &QQ[DIR_PMP * numberOfBCnodes];
@@ -980,123 +980,123 @@ __global__ void QThinWallsPartTwo27(
 	  Distributions27 D, DN;
 	  if (isEvenTimestep == true)
 	  {
-		  D.f[DIR_P00] = &DD[DIR_P00   *size_Mat];
-		  D.f[DIR_M00] = &DD[DIR_M00   *size_Mat];
-		  D.f[DIR_0P0] = &DD[DIR_0P0   *size_Mat];
-		  D.f[DIR_0M0] = &DD[DIR_0M0   *size_Mat];
-		  D.f[DIR_00P] = &DD[DIR_00P   *size_Mat];
-		  D.f[DIR_00M] = &DD[DIR_00M   *size_Mat];
-		  D.f[DIR_PP0] = &DD[DIR_PP0  *size_Mat];
-		  D.f[DIR_MM0] = &DD[DIR_MM0  *size_Mat];
-		  D.f[DIR_PM0] = &DD[DIR_PM0  *size_Mat];
-		  D.f[DIR_MP0] = &DD[DIR_MP0  *size_Mat];
-		  D.f[DIR_P0P] = &DD[DIR_P0P  *size_Mat];
-		  D.f[DIR_M0M] = &DD[DIR_M0M  *size_Mat];
-		  D.f[DIR_P0M] = &DD[DIR_P0M  *size_Mat];
-		  D.f[DIR_M0P] = &DD[DIR_M0P  *size_Mat];
-		  D.f[DIR_0PP] = &DD[DIR_0PP  *size_Mat];
-		  D.f[DIR_0MM] = &DD[DIR_0MM  *size_Mat];
-		  D.f[DIR_0PM] = &DD[DIR_0PM  *size_Mat];
-		  D.f[DIR_0MP] = &DD[DIR_0MP  *size_Mat];
-		  D.f[DIR_000] = &DD[DIR_000*size_Mat];
-		  D.f[DIR_PPP] = &DD[DIR_PPP *size_Mat];
-		  D.f[DIR_MMP] = &DD[DIR_MMP *size_Mat];
-		  D.f[DIR_PMP] = &DD[DIR_PMP *size_Mat];
-		  D.f[DIR_MPP] = &DD[DIR_MPP *size_Mat];
-		  D.f[DIR_PPM] = &DD[DIR_PPM *size_Mat];
-		  D.f[DIR_MMM] = &DD[DIR_MMM *size_Mat];
-		  D.f[DIR_PMM] = &DD[DIR_PMM *size_Mat];
-		  D.f[DIR_MPM] = &DD[DIR_MPM *size_Mat];
+		  D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+		  D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+		  D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+		  D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+		  D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+		  D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+		  D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+		  D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+		  D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+		  D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+		  D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+		  D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+		  D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+		  D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+		  D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+		  D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+		  D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+		  D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+		  D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+		  D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+		  D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+		  D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+		  D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+		  D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+		  D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+		  D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+		  D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
 	  }
 	  else
 	  {
-		  D.f[DIR_M00] = &DD[DIR_P00   *size_Mat];
-		  D.f[DIR_P00] = &DD[DIR_M00   *size_Mat];
-		  D.f[DIR_0M0] = &DD[DIR_0P0   *size_Mat];
-		  D.f[DIR_0P0] = &DD[DIR_0M0   *size_Mat];
-		  D.f[DIR_00M] = &DD[DIR_00P   *size_Mat];
-		  D.f[DIR_00P] = &DD[DIR_00M   *size_Mat];
-		  D.f[DIR_MM0] = &DD[DIR_PP0  *size_Mat];
-		  D.f[DIR_PP0] = &DD[DIR_MM0  *size_Mat];
-		  D.f[DIR_MP0] = &DD[DIR_PM0  *size_Mat];
-		  D.f[DIR_PM0] = &DD[DIR_MP0  *size_Mat];
-		  D.f[DIR_M0M] = &DD[DIR_P0P  *size_Mat];
-		  D.f[DIR_P0P] = &DD[DIR_M0M  *size_Mat];
-		  D.f[DIR_M0P] = &DD[DIR_P0M  *size_Mat];
-		  D.f[DIR_P0M] = &DD[DIR_M0P  *size_Mat];
-		  D.f[DIR_0MM] = &DD[DIR_0PP  *size_Mat];
-		  D.f[DIR_0PP] = &DD[DIR_0MM  *size_Mat];
-		  D.f[DIR_0MP] = &DD[DIR_0PM  *size_Mat];
-		  D.f[DIR_0PM] = &DD[DIR_0MP  *size_Mat];
-		  D.f[DIR_000] = &DD[DIR_000*size_Mat];
-		  D.f[DIR_PPP] = &DD[DIR_MMM *size_Mat];
-		  D.f[DIR_MMP] = &DD[DIR_PPM *size_Mat];
-		  D.f[DIR_PMP] = &DD[DIR_MPM *size_Mat];
-		  D.f[DIR_MPP] = &DD[DIR_PMM *size_Mat];
-		  D.f[DIR_PPM] = &DD[DIR_MMP *size_Mat];
-		  D.f[DIR_MMM] = &DD[DIR_PPP *size_Mat];
-		  D.f[DIR_PMM] = &DD[DIR_MPP *size_Mat];
-		  D.f[DIR_MPM] = &DD[DIR_PMP *size_Mat];
+		  D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+		  D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+		  D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+		  D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+		  D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+		  D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+		  D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+		  D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+		  D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+		  D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+		  D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+		  D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+		  D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+		  D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+		  D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+		  D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+		  D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+		  D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+		  D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+		  D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+		  D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+		  D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+		  D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+		  D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+		  D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+		  D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+		  D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
 	  }
 	  if (isEvenTimestep==false)
       {
-         DN.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-         DN.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-         DN.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-         DN.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-         DN.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-         DN.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-         DN.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-         DN.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-         DN.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-         DN.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-         DN.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-         DN.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-         DN.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-         DN.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-         DN.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-         DN.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-         DN.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-         DN.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-         DN.f[DIR_000] = &DD[DIR_000*size_Mat];
-         DN.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-         DN.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-         DN.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-         DN.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-         DN.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-         DN.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-         DN.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-         DN.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+         DN.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+         DN.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+         DN.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+         DN.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+         DN.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+         DN.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+         DN.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+         DN.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+         DN.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+         DN.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+         DN.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+         DN.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+         DN.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+         DN.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+         DN.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+         DN.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+         DN.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+         DN.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+         DN.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         DN.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+         DN.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+         DN.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+         DN.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+         DN.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+         DN.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+         DN.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+         DN.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
       } 
       else
       {
-         DN.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-         DN.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-         DN.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-         DN.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-         DN.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-         DN.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-         DN.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-         DN.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-         DN.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-         DN.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-         DN.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-         DN.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-         DN.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-         DN.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-         DN.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-         DN.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-         DN.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-         DN.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-         DN.f[DIR_000] = &DD[DIR_000*size_Mat];
-         DN.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-         DN.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-         DN.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-         DN.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-         DN.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-         DN.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-         DN.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-         DN.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+         DN.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+         DN.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+         DN.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+         DN.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+         DN.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+         DN.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+         DN.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+         DN.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+         DN.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+         DN.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+         DN.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+         DN.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+         DN.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+         DN.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+         DN.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+         DN.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+         DN.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+         DN.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+         DN.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         DN.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+         DN.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+         DN.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+         DN.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+         DN.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+         DN.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+         DN.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+         DN.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
       }
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 	  //directions allways exchange
@@ -1106,24 +1106,24 @@ __global__ void QThinWallsPartTwo27(
 	  //( 1  1  1) ( 1  0  0) ( 0  1  0) ( 0  0  1) ( 1  1  0) ( 1  0  1) ( 0  1  1) (-1 -1  1) (-1  1 -1) ( 1 -1 -1) (-1  1  0) (-1  0  1) ( 0 -1  1)
 	  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 	  real q, tmp;
-      q = q_dirE[k];   if (q>=c0o1 && q<=c1o1){ if (geom[kw  ] < GEO_FLUID){tmp = (DN.f[DIR_M00  ])[kw  ]; (DN.f[DIR_M00  ])[kw  ]=(D.f[DIR_M00  ])[kw  ]; (D.f[DIR_M00  ])[kw  ]=tmp;}}
-	  q = q_dirW[k];   if (q>=c0o1 && q<=c1o1){                            {tmp = (DN.f[DIR_P00  ])[ke  ]; (DN.f[DIR_P00  ])[ke  ]=(D.f[DIR_P00  ])[ke  ]; (D.f[DIR_P00  ])[ke  ]=tmp;}}
-      q = q_dirN[k];   if (q>=c0o1 && q<=c1o1){ if (geom[ks  ] < GEO_FLUID){tmp = (DN.f[DIR_0M0  ])[ks  ]; (DN.f[DIR_0M0  ])[ks  ]=(D.f[DIR_0M0  ])[ks  ]; (D.f[DIR_0M0  ])[ks  ]=tmp;}}
-      q = q_dirS[k];   if (q>=c0o1 && q<=c1o1){                            {tmp = (DN.f[DIR_0P0  ])[kn  ]; (DN.f[DIR_0P0  ])[kn  ]=(D.f[DIR_0P0  ])[kn  ]; (D.f[DIR_0P0  ])[kn  ]=tmp;}}
-      q = q_dirT[k];   if (q>=c0o1 && q<=c1o1){ if (geom[kb  ] < GEO_FLUID){tmp = (DN.f[DIR_00M  ])[kb  ]; (DN.f[DIR_00M  ])[kb  ]=(D.f[DIR_00M  ])[kb  ]; (D.f[DIR_00M  ])[kb  ]=tmp;}}
-      q = q_dirB[k];   if (q>=c0o1 && q<=c1o1){                            {tmp = (DN.f[DIR_00P  ])[kt  ]; (DN.f[DIR_00P  ])[kt  ]=(D.f[DIR_00P  ])[kt  ]; (D.f[DIR_00P  ])[kt  ]=tmp;}}
-      q = q_dirNE[k];  if (q>=c0o1 && q<=c1o1){ if (geom[ksw ] < GEO_FLUID){tmp = (DN.f[DIR_MM0 ])[ksw ]; (DN.f[DIR_MM0 ])[ksw ]=(D.f[DIR_MM0 ])[ksw ]; (D.f[DIR_MM0 ])[ksw ]=tmp;}}
-      q = q_dirSW[k];  if (q>=c0o1 && q<=c1o1){                            {tmp = (DN.f[DIR_PP0 ])[kne ]; (DN.f[DIR_PP0 ])[kne ]=(D.f[DIR_PP0 ])[kne ]; (D.f[DIR_PP0 ])[kne ]=tmp;}}
-      q = q_dirSE[k];  if (q>=c0o1 && q<=c1o1){                            {tmp = (DN.f[DIR_MP0 ])[knw ]; (DN.f[DIR_MP0 ])[knw ]=(D.f[DIR_MP0 ])[knw ]; (D.f[DIR_MP0 ])[knw ]=tmp;}}
-      q = q_dirNW[k];  if (q>=c0o1 && q<=c1o1){ if (geom[kmp0] < GEO_FLUID){tmp = (DN.f[DIR_PM0 ])[kse ]; (DN.f[DIR_PM0 ])[kse ]=(D.f[DIR_PM0 ])[kse ]; (D.f[DIR_PM0 ])[kse ]=tmp;}}
-      q = q_dirTE[k];  if (q>=c0o1 && q<=c1o1){ if (geom[kbw ] < GEO_FLUID){tmp = (DN.f[DIR_M0M ])[kbw ]; (DN.f[DIR_M0M ])[kbw ]=(D.f[DIR_M0M ])[kbw ]; (D.f[DIR_M0M ])[kbw ]=tmp;}}
-      q = q_dirBW[k];  if (q>=c0o1 && q<=c1o1){                            {tmp = (DN.f[DIR_P0P ])[kte ]; (DN.f[DIR_P0P ])[kte ]=(D.f[DIR_P0P ])[kte ]; (D.f[DIR_P0P ])[kte ]=tmp;}}
-      q = q_dirBE[k];  if (q>=c0o1 && q<=c1o1){                            {tmp = (DN.f[DIR_M0P ])[ktw ]; (DN.f[DIR_M0P ])[ktw ]=(D.f[DIR_M0P ])[ktw ]; (D.f[DIR_M0P ])[ktw ]=tmp;}}
-      q = q_dirTW[k];  if (q>=c0o1 && q<=c1o1){ if (geom[km0p] < GEO_FLUID){tmp = (DN.f[DIR_P0M ])[kbe ]; (DN.f[DIR_P0M ])[kbe ]=(D.f[DIR_P0M ])[kbe ]; (D.f[DIR_P0M ])[kbe ]=tmp;}}
-      q = q_dirTN[k];  if (q>=c0o1 && q<=c1o1){ if (geom[kbs ] < GEO_FLUID){tmp = (DN.f[DIR_0MM ])[kbs ]; (DN.f[DIR_0MM ])[kbs ]=(D.f[DIR_0MM ])[kbs ]; (D.f[DIR_0MM ])[kbs ]=tmp;}}
-      q = q_dirBS[k];  if (q>=c0o1 && q<=c1o1){                            {tmp = (DN.f[DIR_0PP ])[ktn ]; (DN.f[DIR_0PP ])[ktn ]=(D.f[DIR_0PP ])[ktn ]; (D.f[DIR_0PP ])[ktn ]=tmp;}}
-      q = q_dirBN[k];  if (q>=c0o1 && q<=c1o1){                            {tmp = (DN.f[DIR_0MP ])[kts ]; (DN.f[DIR_0MP ])[kts ]=(D.f[DIR_0MP ])[kts ]; (D.f[DIR_0MP ])[kts ]=tmp;}}
-      q = q_dirTS[k];  if (q>=c0o1 && q<=c1o1){ if (geom[k0mp] < GEO_FLUID){tmp = (DN.f[DIR_0PM ])[kbn ]; (DN.f[DIR_0PM ])[kbn ]=(D.f[DIR_0PM ])[kbn ]; (D.f[DIR_0PM ])[kbn ]=tmp;}}
+      q = q_dirE[k];   if (q>=c0o1 && q<=c1o1){ if (geom[kw  ] < GEO_FLUID){tmp = (DN.f[DIR_M00])[kw  ]; (DN.f[DIR_M00])[kw  ]=(D.f[DIR_M00])[kw  ]; (D.f[DIR_M00])[kw  ]=tmp;}}
+	  q = q_dirW[k];   if (q>=c0o1 && q<=c1o1){                            {tmp = (DN.f[DIR_P00])[ke  ]; (DN.f[DIR_P00])[ke  ]=(D.f[DIR_P00])[ke  ]; (D.f[DIR_P00])[ke  ]=tmp;}}
+      q = q_dirN[k];   if (q>=c0o1 && q<=c1o1){ if (geom[ks  ] < GEO_FLUID){tmp = (DN.f[DIR_0M0])[ks  ]; (DN.f[DIR_0M0])[ks  ]=(D.f[DIR_0M0])[ks  ]; (D.f[DIR_0M0])[ks  ]=tmp;}}
+      q = q_dirS[k];   if (q>=c0o1 && q<=c1o1){                            {tmp = (DN.f[DIR_0P0])[kn  ]; (DN.f[DIR_0P0])[kn  ]=(D.f[DIR_0P0])[kn  ]; (D.f[DIR_0P0])[kn  ]=tmp;}}
+      q = q_dirT[k];   if (q>=c0o1 && q<=c1o1){ if (geom[kb  ] < GEO_FLUID){tmp = (DN.f[DIR_00M])[kb  ]; (DN.f[DIR_00M])[kb  ]=(D.f[DIR_00M])[kb  ]; (D.f[DIR_00M])[kb  ]=tmp;}}
+      q = q_dirB[k];   if (q>=c0o1 && q<=c1o1){                            {tmp = (DN.f[DIR_00P])[kt  ]; (DN.f[DIR_00P])[kt  ]=(D.f[DIR_00P])[kt  ]; (D.f[DIR_00P])[kt  ]=tmp;}}
+      q = q_dirNE[k];  if (q>=c0o1 && q<=c1o1){ if (geom[ksw ] < GEO_FLUID){tmp = (DN.f[DIR_MM0])[ksw ]; (DN.f[DIR_MM0])[ksw ]=(D.f[DIR_MM0])[ksw ]; (D.f[DIR_MM0])[ksw ]=tmp;}}
+      q = q_dirSW[k];  if (q>=c0o1 && q<=c1o1){                            {tmp = (DN.f[DIR_PP0])[kne ]; (DN.f[DIR_PP0])[kne ]=(D.f[DIR_PP0])[kne ]; (D.f[DIR_PP0])[kne ]=tmp;}}
+      q = q_dirSE[k];  if (q>=c0o1 && q<=c1o1){                            {tmp = (DN.f[DIR_MP0])[knw ]; (DN.f[DIR_MP0])[knw ]=(D.f[DIR_MP0])[knw ]; (D.f[DIR_MP0])[knw ]=tmp;}}
+      q = q_dirNW[k];  if (q>=c0o1 && q<=c1o1){ if (geom[kmp0] < GEO_FLUID){tmp = (DN.f[DIR_PM0])[kse ]; (DN.f[DIR_PM0])[kse ]=(D.f[DIR_PM0])[kse ]; (D.f[DIR_PM0])[kse ]=tmp;}}
+      q = q_dirTE[k];  if (q>=c0o1 && q<=c1o1){ if (geom[kbw ] < GEO_FLUID){tmp = (DN.f[DIR_M0M])[kbw ]; (DN.f[DIR_M0M])[kbw ]=(D.f[DIR_M0M])[kbw ]; (D.f[DIR_M0M])[kbw ]=tmp;}}
+      q = q_dirBW[k];  if (q>=c0o1 && q<=c1o1){                            {tmp = (DN.f[DIR_P0P])[kte ]; (DN.f[DIR_P0P])[kte ]=(D.f[DIR_P0P])[kte ]; (D.f[DIR_P0P])[kte ]=tmp;}}
+      q = q_dirBE[k];  if (q>=c0o1 && q<=c1o1){                            {tmp = (DN.f[DIR_M0P])[ktw ]; (DN.f[DIR_M0P])[ktw ]=(D.f[DIR_M0P])[ktw ]; (D.f[DIR_M0P])[ktw ]=tmp;}}
+      q = q_dirTW[k];  if (q>=c0o1 && q<=c1o1){ if (geom[km0p] < GEO_FLUID){tmp = (DN.f[DIR_P0M])[kbe ]; (DN.f[DIR_P0M])[kbe ]=(D.f[DIR_P0M])[kbe ]; (D.f[DIR_P0M])[kbe ]=tmp;}}
+      q = q_dirTN[k];  if (q>=c0o1 && q<=c1o1){ if (geom[kbs ] < GEO_FLUID){tmp = (DN.f[DIR_0MM])[kbs ]; (DN.f[DIR_0MM])[kbs ]=(D.f[DIR_0MM])[kbs ]; (D.f[DIR_0MM])[kbs ]=tmp;}}
+      q = q_dirBS[k];  if (q>=c0o1 && q<=c1o1){                            {tmp = (DN.f[DIR_0PP])[ktn ]; (DN.f[DIR_0PP])[ktn ]=(D.f[DIR_0PP])[ktn ]; (D.f[DIR_0PP])[ktn ]=tmp;}}
+      q = q_dirBN[k];  if (q>=c0o1 && q<=c1o1){                            {tmp = (DN.f[DIR_0MP])[kts ]; (DN.f[DIR_0MP])[kts ]=(D.f[DIR_0MP])[kts ]; (D.f[DIR_0MP])[kts ]=tmp;}}
+      q = q_dirTS[k];  if (q>=c0o1 && q<=c1o1){ if (geom[k0mp] < GEO_FLUID){tmp = (DN.f[DIR_0PM])[kbn ]; (DN.f[DIR_0PM])[kbn ]=(D.f[DIR_0PM])[kbn ]; (D.f[DIR_0PM])[kbn ]=tmp;}}
       q = q_dirTNE[k]; if (q>=c0o1 && q<=c1o1){ if (geom[kbsw] < GEO_FLUID){tmp = (DN.f[DIR_MMM])[kbsw]; (DN.f[DIR_MMM])[kbsw]=(D.f[DIR_MMM])[kbsw]; (D.f[DIR_MMM])[kbsw]=tmp;}}
       q = q_dirBSW[k]; if (q>=c0o1 && q<=c1o1){                            {tmp = (DN.f[DIR_PPP])[ktne]; (DN.f[DIR_PPP])[ktne]=(D.f[DIR_PPP])[ktne]; (D.f[DIR_PPP])[ktne]=tmp;}}
       q = q_dirBNE[k]; if (q>=c0o1 && q<=c1o1){                            {tmp = (DN.f[DIR_MMP])[ktsw]; (DN.f[DIR_MMP])[ktsw]=(D.f[DIR_MMP])[ktsw]; (D.f[DIR_MMP])[ktsw]=tmp;}}
diff --git a/src/gpu/VirtualFluids_GPU/GPU/TurbulenceIntensity.cu b/src/gpu/VirtualFluids_GPU/GPU/TurbulenceIntensity.cu
index f8cf8ab13c39d55477bf006cd27f7943dcb5b53a..3f440454ef272b13c24fe2a2882d67d32d32a841 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/TurbulenceIntensity.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/TurbulenceIntensity.cu
@@ -9,14 +9,16 @@
 /* Device code */
 #include "LBM/LB.h" 
 #include "lbm/constants/D3Q27.h"
-#include <lbm/constants/NumericConstants.h>
+#include "lbm/constants/NumericConstants.h"
 
 #include "lbm/MacroscopicQuantities.h"
 #include "../Kernel/Utilities/DistributionHelper.cuh"
+#include "LBM/GPUHelperFunctions/KernelUtilities.h"
 
 
 using namespace vf::lbm::constant;
 using namespace vf::lbm::dir;
+using namespace vf::gpu;
 
 //////////////////////////////////////////////////////////////////////////////
 __global__ void CalcTurbulenceIntensity(
@@ -34,19 +36,21 @@ __global__ void CalcTurbulenceIntensity(
    unsigned int* neighborX,
    unsigned int* neighborY,
    unsigned int* neighborZ,
-   unsigned int size_Mat, 
+   unsigned long long numberOfLBnodes, 
    bool isEvenTimestep)
 {
-   const unsigned k = vf::gpu::getNodeIndex();
+    ////////////////////////////////////////////////////////////////////////////////
+    //! - Get the node index from threadIdx, blockIdx, blockDim and gridDim.
+    //!
+    const unsigned nodeIndex = getNodeIndex();
 
-   if (k >= size_Mat)
+   if (nodeIndex >= numberOfLBnodes)
        return;
 
-   if (!vf::gpu::isValidFluidNode(typeOfGridNode[k]))
+   if (!isValidFluidNode(typeOfGridNode[nodeIndex]))
        return;
 
-   vf::gpu::DistributionWrapper distr_wrapper(distributions, size_Mat, isEvenTimestep, k, neighborX, neighborY,
-                                              neighborZ);
+   DistributionWrapper distr_wrapper(distributions, numberOfLBnodes, isEvenTimestep, nodeIndex, neighborX, neighborY, neighborZ);
    const auto &distribution = distr_wrapper.distribution;
 
    // analogue to LBCalcMacCompSP27
@@ -58,16 +62,16 @@ __global__ void CalcTurbulenceIntensity(
 
    // compute subtotals:
    // fluctuations
-   vxx[k] = vxx[k] + vx * vx;
-   vyy[k] = vyy[k] + vy * vy;
-   vzz[k] = vzz[k] + vz * vz;
-   vxy[k] = vxy[k] + vx * vy;
-   vxz[k] = vxz[k] + vx * vz;
-   vyz[k] = vyz[k] + vy * vz;
+   vxx[nodeIndex] = vxx[nodeIndex] + vx * vx;
+   vyy[nodeIndex] = vyy[nodeIndex] + vy * vy;
+   vzz[nodeIndex] = vzz[nodeIndex] + vz * vz;
+   vxy[nodeIndex] = vxy[nodeIndex] + vx * vy;
+   vxz[nodeIndex] = vxz[nodeIndex] + vx * vz;
+   vyz[nodeIndex] = vyz[nodeIndex] + vy * vz;
 
    // velocity (for mean velocity)
-   vx_mean[k] = vx_mean[k] + vx;
-   vy_mean[k] = vy_mean[k] + vy;
-   vz_mean[k] = vz_mean[k] + vz; 
+   vx_mean[nodeIndex] = vx_mean[nodeIndex] + vx;
+   vy_mean[nodeIndex] = vy_mean[nodeIndex] + vy;
+   vz_mean[nodeIndex] = vz_mean[nodeIndex] + vz; 
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/src/gpu/VirtualFluids_GPU/GPU/TurbulentViscosityInlines.cuh b/src/gpu/VirtualFluids_GPU/GPU/TurbulentViscosityInlines.cuh
index eb301515527a9e8a3056676b0d4dffe8197c7dbe..58856f624fa1dfd2488c3061721e9dac53a67d07 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/TurbulentViscosityInlines.cuh
+++ b/src/gpu/VirtualFluids_GPU/GPU/TurbulentViscosityInlines.cuh
@@ -53,8 +53,8 @@ __inline__ __device__ real calcTurbulentViscosityQR(real C, real dxux, real dyuy
         //! Second invariant of the strain-rate tensor
         real Q = c1o2*( dxux*dxux + dyuy*dyuy + dzuz*dzuz ) + c1o4*( Dxy*Dxy + Dxz*Dxz + Dyz*Dyz);
         //! Third invariant of the strain-rate tensor (determinant)
-        real R = - dxux*dyuy*dzuz - c1o4*( Dxy*Dxz*Dyz + dxux*Dyz*Dyz + dyuy*Dxz*Dxz + dzuz*Dxy*Dxy );
-        
+        // Superseded formulation, kept for reference: R = - dxux*dyuy*dzuz - c1o4*( Dxy*Dxz*Dyz + dxux*Dyz*Dyz + dyuy*Dxz*Dxz + dzuz*Dxy*Dxy );
+        real R = - dxux*dyuy*dzuz + c1o4*( -Dxy*Dxz*Dyz + dxux*Dyz*Dyz + dyuy*Dxz*Dxz + dzuz*Dxy*Dxy );
         return C * max(R, c0o1) / Q;
 }
 
diff --git a/src/gpu/VirtualFluids_GPU/GPU/TurbulentViscosityKernels.cu b/src/gpu/VirtualFluids_GPU/GPU/TurbulentViscosityKernels.cu
index 3719ca3712e6f63a77f62bf314af7d19eea01f4c..7147629c448b8b730e4ae8c4eff8a0a400863de9 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/TurbulentViscosityKernels.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/TurbulentViscosityKernels.cu
@@ -38,6 +38,7 @@
 #include <cuda_runtime.h>
 #include <helper_cuda.h>
 #include "LBM/LB.h"
+#include "LBM/GPUHelperFunctions/KernelUtilities.h"
 
 using namespace vf::lbm::constant;
 
@@ -52,34 +53,31 @@ __host__ __device__ __forceinline__ void calcDerivatives(const uint& k, uint& kM
     dvz = ((fluidP ? vz[kP] : vz[k])-(fluidM ? vz[kM] : vz[k]))*div;
 }
 
-__global__ void calcAMD(real* vx,
-                        real* vy,
-                        real* vz,
-                        real* turbulentViscosity,
-                        uint* neighborX,
-                        uint* neighborY,
-                        uint* neighborZ,
-                        uint* neighborWSB,
-                        uint* typeOfGridNode,
-                        uint size_Mat,
-                        real SGSConstant)
+__global__ void calcAMD(
+    real* vx,
+    real* vy,
+    real* vz,
+    real* turbulentViscosity,
+    uint* neighborX,
+    uint* neighborY,
+    uint* neighborZ,
+    uint* neighborWSB,
+    uint* typeOfGridNode,
+    unsigned long long numberOfLBnodes,
+    real SGSConstant)
 {
+    ////////////////////////////////////////////////////////////////////////////////
+    //! - Get the node index from threadIdx, blockIdx, blockDim and gridDim.
+    //!
+    const unsigned nodeIndex = vf::gpu::getNodeIndex();
 
-    const uint x = threadIdx.x; 
-    const uint y = blockIdx.x; 
-    const uint z = blockIdx.y; 
+    if(nodeIndex >= numberOfLBnodes) return;
+    if(typeOfGridNode[nodeIndex] != GEO_FLUID) return;
 
-    const uint nx = blockDim.x;
-    const uint ny = gridDim.x;
-
-    const uint k = nx*(ny*z + y) + x;
-    if(k >= size_Mat) return;
-    if(typeOfGridNode[k] != GEO_FLUID) return;
-
-    uint kPx = neighborX[k];
-    uint kPy = neighborY[k];
-    uint kPz = neighborZ[k];
-    uint kMxyz = neighborWSB[k];
+    uint kPx = neighborX[nodeIndex];
+    uint kPy = neighborY[nodeIndex];
+    uint kPz = neighborZ[nodeIndex];
+    uint kMxyz = neighborWSB[nodeIndex];
     uint kMx = neighborZ[neighborY[kMxyz]];
     uint kMy = neighborZ[neighborX[kMxyz]];
     uint kMz = neighborY[neighborX[kMxyz]];
@@ -88,9 +86,9 @@ __global__ void calcAMD(real* vx,
          dvydx, dvydy, dvydz,
          dvzdx, dvzdy, dvzdz;
 
-    calcDerivatives(k, kMx, kPx, typeOfGridNode, vx, vy, vz, dvxdx, dvydx, dvzdx);
-    calcDerivatives(k, kMy, kPy, typeOfGridNode, vx, vy, vz, dvxdy, dvydy, dvzdy);
-    calcDerivatives(k, kMz, kPz, typeOfGridNode, vx, vy, vz, dvxdz, dvydz, dvzdz);
+    calcDerivatives(nodeIndex, kMx, kPx, typeOfGridNode, vx, vy, vz, dvxdx, dvydx, dvzdx);
+    calcDerivatives(nodeIndex, kMy, kPy, typeOfGridNode, vx, vy, vz, dvxdy, dvydy, dvzdy);
+    calcDerivatives(nodeIndex, kMz, kPz, typeOfGridNode, vx, vy, vz, dvxdz, dvydz, dvzdz);
 
     real denominator =  dvxdx*dvxdx + dvydx*dvydx + dvzdx*dvzdx + 
                         dvxdy*dvxdy + dvydy*dvydy + dvzdy*dvzdy +
@@ -102,7 +100,7 @@ __global__ void calcAMD(real* vx,
                         (dvxdx*dvzdx + dvxdy*dvzdy + dvxdz*dvzdz) * (dvxdz+dvzdx) + 
                         (dvydx*dvzdx + dvydy*dvzdy + dvydz*dvzdz) * (dvydz+dvzdy);
 
-    turbulentViscosity[k] = max(c0o1,-SGSConstant*enumerator)/denominator;
+    turbulentViscosity[nodeIndex] = denominator != c0o1 ? max(c0o1,-SGSConstant*enumerator)/denominator : c0o1;
 }
 
 void calcTurbulentViscosityAMD(Parameter* para, int level)
diff --git a/src/gpu/VirtualFluids_GPU/GPU/VelocityBCs27.cu b/src/gpu/VirtualFluids_GPU/GPU/VelocityBCs27.cu
index 05c85e8b546aeaa964b1dbb61cbf01dd9b82ca1a..ccf9d1771ec0e1895e5cb79fae63675429b02c73 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/VelocityBCs27.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/VelocityBCs27.cu
@@ -1,96 +1,120 @@
-//  _    ___      __              __________      _     __        ______________   __
-// | |  / (_)____/ /___  ______ _/ / ____/ /_  __(_)___/ /____   /  ___/ __  / /  / /
-// | | / / / ___/ __/ / / / __ `/ / /_  / / / / / / __  / ___/  / /___/ /_/ / /  / /
-// | |/ / / /  / /_/ /_/ / /_/ / / __/ / / /_/ / / /_/ (__  )  / /_) / ____/ /__/ / 
-// |___/_/_/   \__/\__,_/\__,_/_/_/   /_/\__,_/_/\__,_/____/   \____/_/    \_____/
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
 //
-//////////////////////////////////////////////////////////////////////////
-
-/* Device code */
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file VelocityBCs27.cu
+//! \ingroup GPU
+//! \author Martin Schoenherr, Anna Wellmann
+//======================================================================================
 #include "LBM/LB.h" 
 #include "lbm/constants/D3Q27.h"
 #include "lbm/constants/NumericConstants.h"
-#include "KernelUtilities.h"
+#include "LBM/GPUHelperFunctions/KernelUtilities.h"
 
 using namespace vf::lbm::constant;
 using namespace vf::lbm::dir;
+using namespace vf::gpu;
 
 //////////////////////////////////////////////////////////////////////////////
 __global__ void QVelDeviceCompPlusSlip27(
-													real* vx,
-													real* vy,
-													real* vz,
-													real* DD, 
-													int* k_Q, 
-													real* QQ,
-													unsigned int numberOfBCnodes, 
-													real om1, 
-													unsigned int* neighborX,
-													unsigned int* neighborY,
-													unsigned int* neighborZ,
-													unsigned int size_Mat, 
-													bool isEvenTimestep)
+    real* vx,
+    real* vy,
+    real* vz,
+    real* DD, 
+    int* k_Q, 
+    real* QQ,
+    unsigned int numberOfBCnodes, 
+    real om1, 
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes, 
+    bool isEvenTimestep)
 {
    Distributions27 D;
    if (isEvenTimestep==true)
    {
-      D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+      D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
    } 
    else
    {
-      D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+      D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
    }
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  x = threadIdx.x;  // Globaler x-Index 
@@ -115,24 +139,24 @@ __global__ void QVelDeviceCompPlusSlip27(
 		   *q_dirBE, *q_dirTW, *q_dirTN, *q_dirBS, *q_dirBN, *q_dirTS,
 		   *q_dirTNE, *q_dirTSW, *q_dirTSE, *q_dirTNW, *q_dirBNE, *q_dirBSW,
 		   *q_dirBSE, *q_dirBNW;
-	   q_dirE = &QQ[DIR_P00   * numberOfBCnodes];
-	   q_dirW = &QQ[DIR_M00   * numberOfBCnodes];
-	   q_dirN = &QQ[DIR_0P0   * numberOfBCnodes];
-	   q_dirS = &QQ[DIR_0M0   * numberOfBCnodes];
-	   q_dirT = &QQ[DIR_00P   * numberOfBCnodes];
-	   q_dirB = &QQ[DIR_00M   * numberOfBCnodes];
-	   q_dirNE = &QQ[DIR_PP0  * numberOfBCnodes];
-	   q_dirSW = &QQ[DIR_MM0  * numberOfBCnodes];
-	   q_dirSE = &QQ[DIR_PM0  * numberOfBCnodes];
-	   q_dirNW = &QQ[DIR_MP0  * numberOfBCnodes];
-	   q_dirTE = &QQ[DIR_P0P  * numberOfBCnodes];
-	   q_dirBW = &QQ[DIR_M0M  * numberOfBCnodes];
-	   q_dirBE = &QQ[DIR_P0M  * numberOfBCnodes];
-	   q_dirTW = &QQ[DIR_M0P  * numberOfBCnodes];
-	   q_dirTN = &QQ[DIR_0PP  * numberOfBCnodes];
-	   q_dirBS = &QQ[DIR_0MM  * numberOfBCnodes];
-	   q_dirBN = &QQ[DIR_0PM  * numberOfBCnodes];
-	   q_dirTS = &QQ[DIR_0MP  * numberOfBCnodes];
+	   q_dirE = &QQ[DIR_P00 * numberOfBCnodes];
+	   q_dirW = &QQ[DIR_M00 * numberOfBCnodes];
+	   q_dirN = &QQ[DIR_0P0 * numberOfBCnodes];
+	   q_dirS = &QQ[DIR_0M0 * numberOfBCnodes];
+	   q_dirT = &QQ[DIR_00P * numberOfBCnodes];
+	   q_dirB = &QQ[DIR_00M * numberOfBCnodes];
+	   q_dirNE = &QQ[DIR_PP0 * numberOfBCnodes];
+	   q_dirSW = &QQ[DIR_MM0 * numberOfBCnodes];
+	   q_dirSE = &QQ[DIR_PM0 * numberOfBCnodes];
+	   q_dirNW = &QQ[DIR_MP0 * numberOfBCnodes];
+	   q_dirTE = &QQ[DIR_P0P * numberOfBCnodes];
+	   q_dirBW = &QQ[DIR_M0M * numberOfBCnodes];
+	   q_dirBE = &QQ[DIR_P0M * numberOfBCnodes];
+	   q_dirTW = &QQ[DIR_M0P * numberOfBCnodes];
+	   q_dirTN = &QQ[DIR_0PP * numberOfBCnodes];
+	   q_dirBS = &QQ[DIR_0MM * numberOfBCnodes];
+	   q_dirBN = &QQ[DIR_0PM * numberOfBCnodes];
+	   q_dirTS = &QQ[DIR_0MP * numberOfBCnodes];
 	   q_dirTNE = &QQ[DIR_PPP * numberOfBCnodes];
 	   q_dirTSW = &QQ[DIR_MMP * numberOfBCnodes];
 	   q_dirTSE = &QQ[DIR_PMP * numberOfBCnodes];
@@ -225,63 +249,63 @@ __global__ void QVelDeviceCompPlusSlip27(
 	   //////////////////////////////////////////////////////////////////////////
 	   if (isEvenTimestep == false)
 	   {
-		   D.f[DIR_P00] = &DD[DIR_P00   *size_Mat];
-		   D.f[DIR_M00] = &DD[DIR_M00   *size_Mat];
-		   D.f[DIR_0P0] = &DD[DIR_0P0   *size_Mat];
-		   D.f[DIR_0M0] = &DD[DIR_0M0   *size_Mat];
-		   D.f[DIR_00P] = &DD[DIR_00P   *size_Mat];
-		   D.f[DIR_00M] = &DD[DIR_00M   *size_Mat];
-		   D.f[DIR_PP0] = &DD[DIR_PP0  *size_Mat];
-		   D.f[DIR_MM0] = &DD[DIR_MM0  *size_Mat];
-		   D.f[DIR_PM0] = &DD[DIR_PM0  *size_Mat];
-		   D.f[DIR_MP0] = &DD[DIR_MP0  *size_Mat];
-		   D.f[DIR_P0P] = &DD[DIR_P0P  *size_Mat];
-		   D.f[DIR_M0M] = &DD[DIR_M0M  *size_Mat];
-		   D.f[DIR_P0M] = &DD[DIR_P0M  *size_Mat];
-		   D.f[DIR_M0P] = &DD[DIR_M0P  *size_Mat];
-		   D.f[DIR_0PP] = &DD[DIR_0PP  *size_Mat];
-		   D.f[DIR_0MM] = &DD[DIR_0MM  *size_Mat];
-		   D.f[DIR_0PM] = &DD[DIR_0PM  *size_Mat];
-		   D.f[DIR_0MP] = &DD[DIR_0MP  *size_Mat];
-		   D.f[DIR_000] = &DD[DIR_000*size_Mat];
-		   D.f[DIR_PPP] = &DD[DIR_PPP *size_Mat];
-		   D.f[DIR_MMP] = &DD[DIR_MMP *size_Mat];
-		   D.f[DIR_PMP] = &DD[DIR_PMP *size_Mat];
-		   D.f[DIR_MPP] = &DD[DIR_MPP *size_Mat];
-		   D.f[DIR_PPM] = &DD[DIR_PPM *size_Mat];
-		   D.f[DIR_MMM] = &DD[DIR_MMM *size_Mat];
-		   D.f[DIR_PMM] = &DD[DIR_PMM *size_Mat];
-		   D.f[DIR_MPM] = &DD[DIR_MPM *size_Mat];
+		   D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+		   D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+		   D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+		   D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+		   D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+		   D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+		   D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+		   D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+		   D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+		   D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+		   D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+		   D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+		   D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+		   D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+		   D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+		   D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+		   D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+		   D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+		   D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+		   D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+		   D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+		   D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+		   D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+		   D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+		   D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+		   D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+		   D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
 	   }
 	   else
 	   {
-		   D.f[DIR_M00] = &DD[DIR_P00   *size_Mat];
-		   D.f[DIR_P00] = &DD[DIR_M00   *size_Mat];
-		   D.f[DIR_0M0] = &DD[DIR_0P0   *size_Mat];
-		   D.f[DIR_0P0] = &DD[DIR_0M0   *size_Mat];
-		   D.f[DIR_00M] = &DD[DIR_00P   *size_Mat];
-		   D.f[DIR_00P] = &DD[DIR_00M   *size_Mat];
-		   D.f[DIR_MM0] = &DD[DIR_PP0  *size_Mat];
-		   D.f[DIR_PP0] = &DD[DIR_MM0  *size_Mat];
-		   D.f[DIR_MP0] = &DD[DIR_PM0  *size_Mat];
-		   D.f[DIR_PM0] = &DD[DIR_MP0  *size_Mat];
-		   D.f[DIR_M0M] = &DD[DIR_P0P  *size_Mat];
-		   D.f[DIR_P0P] = &DD[DIR_M0M  *size_Mat];
-		   D.f[DIR_M0P] = &DD[DIR_P0M  *size_Mat];
-		   D.f[DIR_P0M] = &DD[DIR_M0P  *size_Mat];
-		   D.f[DIR_0MM] = &DD[DIR_0PP  *size_Mat];
-		   D.f[DIR_0PP] = &DD[DIR_0MM  *size_Mat];
-		   D.f[DIR_0MP] = &DD[DIR_0PM  *size_Mat];
-		   D.f[DIR_0PM] = &DD[DIR_0MP  *size_Mat];
-		   D.f[DIR_000] = &DD[DIR_000*size_Mat];
-		   D.f[DIR_PPP] = &DD[DIR_MMM *size_Mat];
-		   D.f[DIR_MMP] = &DD[DIR_PPM *size_Mat];
-		   D.f[DIR_PMP] = &DD[DIR_MPM *size_Mat];
-		   D.f[DIR_MPP] = &DD[DIR_PMM *size_Mat];
-		   D.f[DIR_PPM] = &DD[DIR_MMP *size_Mat];
-		   D.f[DIR_MMM] = &DD[DIR_PPP *size_Mat];
-		   D.f[DIR_PMM] = &DD[DIR_MPP *size_Mat];
-		   D.f[DIR_MPM] = &DD[DIR_PMP *size_Mat];
+		   D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+		   D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+		   D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+		   D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+		   D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+		   D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+		   D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+		   D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+		   D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+		   D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+		   D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+		   D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+		   D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+		   D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+		   D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+		   D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+		   D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+		   D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+		   D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+		   D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+		   D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+		   D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+		   D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+		   D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+		   D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+		   D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+		   D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
 	   }
 	   ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 	   //Test
@@ -553,18 +577,19 @@ __global__ void QVelDeviceCompPlusSlip27(
 
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-__global__ void QVeloDeviceEQ27(real* VeloX,
-										   real* VeloY,
-										   real* VeloZ,
-                                           real* DD, 
-                                           int* k_Q, 
-                                           int numberOfBCnodes, 
-                                           real om1, 
-                                           unsigned int* neighborX,
-                                           unsigned int* neighborY,
-                                           unsigned int* neighborZ,
-                                           unsigned int size_Mat, 
-                                           bool isEvenTimestep)
+__global__ void QVeloDeviceEQ27(
+    real* VeloX,
+    real* VeloY,
+    real* VeloZ,
+    real* DD, 
+    int* k_Q, 
+    int numberOfBCnodes, 
+    real om1, 
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes, 
+    bool isEvenTimestep)
 {
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  x = threadIdx.x;  // Globaler x-Index 
@@ -613,95 +638,95 @@ __global__ void QVeloDeviceEQ27(real* VeloX,
       Distributions27 D;
       if (isEvenTimestep==true)
       {
-         D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+         D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
       } 
       else
       {
-         D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+         D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
       }
 
 			//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
             // based on BGK Plus Comp
 			//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-			real mfcbb = (D.f[DIR_P00   ])[ke   ];
-			real mfabb = (D.f[DIR_M00   ])[kw   ];
-			real mfbcb = (D.f[DIR_0P0   ])[kn   ];
-			real mfbab = (D.f[DIR_0M0   ])[ks   ];
-			real mfbbc = (D.f[DIR_00P   ])[kt   ];
-			real mfbba = (D.f[DIR_00M   ])[kb   ];
-			real mfccb = (D.f[DIR_PP0  ])[kne  ];
-			real mfaab = (D.f[DIR_MM0  ])[ksw  ];
-			real mfcab = (D.f[DIR_PM0  ])[kse  ];
-			real mfacb = (D.f[DIR_MP0  ])[knw  ];
-			real mfcbc = (D.f[DIR_P0P  ])[kte  ];
-			real mfaba = (D.f[DIR_M0M  ])[kbw  ];
-			real mfcba = (D.f[DIR_P0M  ])[kbe  ];
-			real mfabc = (D.f[DIR_M0P  ])[ktw  ];
-			real mfbcc = (D.f[DIR_0PP  ])[ktn  ];
-			real mfbaa = (D.f[DIR_0MM  ])[kbs  ];
-			real mfbca = (D.f[DIR_0PM  ])[kbn  ];
-			real mfbac = (D.f[DIR_0MP  ])[kts  ];
+			real mfcbb = (D.f[DIR_P00])[ke   ];
+			real mfabb = (D.f[DIR_M00])[kw   ];
+			real mfbcb = (D.f[DIR_0P0])[kn   ];
+			real mfbab = (D.f[DIR_0M0])[ks   ];
+			real mfbbc = (D.f[DIR_00P])[kt   ];
+			real mfbba = (D.f[DIR_00M])[kb   ];
+			real mfccb = (D.f[DIR_PP0])[kne  ];
+			real mfaab = (D.f[DIR_MM0])[ksw  ];
+			real mfcab = (D.f[DIR_PM0])[kse  ];
+			real mfacb = (D.f[DIR_MP0])[knw  ];
+			real mfcbc = (D.f[DIR_P0P])[kte  ];
+			real mfaba = (D.f[DIR_M0M])[kbw  ];
+			real mfcba = (D.f[DIR_P0M])[kbe  ];
+			real mfabc = (D.f[DIR_M0P])[ktw  ];
+			real mfbcc = (D.f[DIR_0PP])[ktn  ];
+			real mfbaa = (D.f[DIR_0MM])[kbs  ];
+			real mfbca = (D.f[DIR_0PM])[kbn  ];
+			real mfbac = (D.f[DIR_0MP])[kts  ];
 			real mfbbb = (D.f[DIR_000])[kzero];
-			real mfccc = (D.f[DIR_PPP ])[ktne ];
-			real mfaac = (D.f[DIR_MMP ])[ktsw ];
-			real mfcac = (D.f[DIR_PMP ])[ktse ];
-			real mfacc = (D.f[DIR_MPP ])[ktnw ];
-			real mfcca = (D.f[DIR_PPM ])[kbne ];
-			real mfaaa = (D.f[DIR_MMM ])[kbsw ];
-			real mfcaa = (D.f[DIR_PMM ])[kbse ];
-			real mfaca = (D.f[DIR_MPM ])[kbnw ];
+			real mfccc = (D.f[DIR_PPP])[ktne ];
+			real mfaac = (D.f[DIR_MMP])[ktsw ];
+			real mfcac = (D.f[DIR_PMP])[ktse ];
+			real mfacc = (D.f[DIR_MPP])[ktnw ];
+			real mfcca = (D.f[DIR_PPM])[kbne ];
+			real mfaaa = (D.f[DIR_MMM])[kbsw ];
+			real mfcaa = (D.f[DIR_PMM])[kbse ];
+			real mfaca = (D.f[DIR_MPM])[kbnw ];
 			////////////////////////////////////////////////////////////////////////////////////
 			real rho   = (mfccc+mfaaa + mfaca+mfcac + mfacc+mfcaa + mfaac+mfcca + 
 							 mfbac+mfbca + mfbaa+mfbcc + mfabc+mfcba + mfaba+mfcbc + mfacb+mfcab + mfaab+mfccb +
@@ -763,33 +788,33 @@ __global__ void QVeloDeviceEQ27(real* VeloX,
 			mfcaa = -rho * XXc * YYa * ZZa - c1o216;
 			mfaca = -rho * XXa * YYc * ZZa - c1o216;
 			//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-			(D.f[DIR_P00   ])[ke   ] = mfabb;//mfcbb;
-			(D.f[DIR_M00   ])[kw   ] = mfcbb;//mfabb;
-			(D.f[DIR_0P0   ])[kn   ] = mfbab;//mfbcb;
-			(D.f[DIR_0M0   ])[ks   ] = mfbcb;//mfbab;
-			(D.f[DIR_00P   ])[kt   ] = mfbba;//mfbbc;
-			(D.f[DIR_00M   ])[kb   ] = mfbbc;//mfbba;
-			(D.f[DIR_PP0  ])[kne  ] = mfaab;//mfccb;
-			(D.f[DIR_MM0  ])[ksw  ] = mfccb;//mfaab;
-			(D.f[DIR_PM0  ])[kse  ] = mfacb;//mfcab;
-			(D.f[DIR_MP0  ])[knw  ] = mfcab;//mfacb;
-			(D.f[DIR_P0P  ])[kte  ] = mfaba;//mfcbc;
-			(D.f[DIR_M0M  ])[kbw  ] = mfcbc;//mfaba;
-			(D.f[DIR_P0M  ])[kbe  ] = mfabc;//mfcba;
-			(D.f[DIR_M0P  ])[ktw  ] = mfcba;//mfabc;
-			(D.f[DIR_0PP  ])[ktn  ] = mfbaa;//mfbcc;
-			(D.f[DIR_0MM  ])[kbs  ] = mfbcc;//mfbaa;
-			(D.f[DIR_0PM  ])[kbn  ] = mfbac;//mfbca;
-			(D.f[DIR_0MP  ])[kts  ] = mfbca;//mfbac;
+			(D.f[DIR_P00])[ke   ] = mfabb;//mfcbb;
+			(D.f[DIR_M00])[kw   ] = mfcbb;//mfabb;
+			(D.f[DIR_0P0])[kn   ] = mfbab;//mfbcb;
+			(D.f[DIR_0M0])[ks   ] = mfbcb;//mfbab;
+			(D.f[DIR_00P])[kt   ] = mfbba;//mfbbc;
+			(D.f[DIR_00M])[kb   ] = mfbbc;//mfbba;
+			(D.f[DIR_PP0])[kne  ] = mfaab;//mfccb;
+			(D.f[DIR_MM0])[ksw  ] = mfccb;//mfaab;
+			(D.f[DIR_PM0])[kse  ] = mfacb;//mfcab;
+			(D.f[DIR_MP0])[knw  ] = mfcab;//mfacb;
+			(D.f[DIR_P0P])[kte  ] = mfaba;//mfcbc;
+			(D.f[DIR_M0M])[kbw  ] = mfcbc;//mfaba;
+			(D.f[DIR_P0M])[kbe  ] = mfabc;//mfcba;
+			(D.f[DIR_M0P])[ktw  ] = mfcba;//mfabc;
+			(D.f[DIR_0PP])[ktn  ] = mfbaa;//mfbcc;
+			(D.f[DIR_0MM])[kbs  ] = mfbcc;//mfbaa;
+			(D.f[DIR_0PM])[kbn  ] = mfbac;//mfbca;
+			(D.f[DIR_0MP])[kts  ] = mfbca;//mfbac;
 			(D.f[DIR_000])[kzero] = mfbbb;//mfbbb;
-			(D.f[DIR_PPP ])[ktne ] = mfaaa;//mfccc;
-			(D.f[DIR_MMP ])[ktsw ] = mfcca;//mfaac;
-			(D.f[DIR_PMP ])[ktse ] = mfaca;//mfcac;
-			(D.f[DIR_MPP ])[ktnw ] = mfcaa;//mfacc;
-			(D.f[DIR_PPM ])[kbne ] = mfaac;//mfcca;
-			(D.f[DIR_MMM ])[kbsw ] = mfccc;//mfaaa;
-			(D.f[DIR_PMM ])[kbse ] = mfacc;//mfcaa;
-			(D.f[DIR_MPM ])[kbnw ] = mfcac;//mfaca;
+			(D.f[DIR_PPP])[ktne ] = mfaaa;//mfccc;
+			(D.f[DIR_MMP])[ktsw ] = mfcca;//mfaac;
+			(D.f[DIR_PMP])[ktse ] = mfaca;//mfcac;
+			(D.f[DIR_MPP])[ktnw ] = mfcaa;//mfacc;
+			(D.f[DIR_PPM])[kbne ] = mfaac;//mfcca;
+			(D.f[DIR_MMM])[kbsw ] = mfccc;//mfaaa;
+			(D.f[DIR_PMM])[kbse ] = mfacc;//mfcaa;
+			(D.f[DIR_MPM])[kbnw ] = mfcac;//mfaca;
    }
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -834,18 +859,18 @@ __global__ void QVeloDeviceEQ27(real* VeloX,
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 __global__ void QVeloStreetDeviceEQ27(
-	real* veloXfraction,
-	real* veloYfraction,
-	int*  naschVelo,
-	real* DD,
-	int*  naschIndex,
-	int   numberOfStreetNodes,
-	real  velocityRatio,
-	uint* neighborX,
-	uint* neighborY,
-	uint* neighborZ,
-	uint  size_Mat,
-	bool  isEvenTimestep)
+    real* veloXfraction,
+    real* veloYfraction,
+    int*  naschVelo,
+    real* DD,
+    int*  naschIndex,
+    int   numberOfStreetNodes,
+    real  velocityRatio,
+    uint* neighborX,
+    uint* neighborY,
+    uint* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool  isEvenTimestep)
 {
 	////////////////////////////////////////////////////////////////////////////////
 	const unsigned  x = threadIdx.x;  // Globaler x-Index 
@@ -894,95 +919,95 @@ __global__ void QVeloStreetDeviceEQ27(
 		Distributions27 D;
 		if (isEvenTimestep == true)
 		{
-			D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-			D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-			D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-			D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-			D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-			D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-			D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-			D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-			D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-			D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-			D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-			D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-			D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-			D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-			D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-			D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-			D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-			D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-			D.f[DIR_000] = &DD[DIR_000*size_Mat];
-			D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-			D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-			D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-			D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-			D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-			D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-			D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-			D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+			D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+			D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+			D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+			D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+			D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+			D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+			D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+			D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+			D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+			D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+			D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+			D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+			D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+			D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+			D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+			D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+			D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+			D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+			D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+			D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+			D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+			D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+			D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+			D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+			D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+			D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+			D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
 		}
 		else
 		{
-			D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-			D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-			D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-			D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-			D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-			D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-			D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-			D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-			D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-			D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-			D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-			D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-			D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-			D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-			D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-			D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-			D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-			D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-			D.f[DIR_000] = &DD[DIR_000*size_Mat];
-			D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-			D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-			D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-			D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-			D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-			D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-			D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-			D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+			D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+			D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+			D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+			D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+			D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+			D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+			D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+			D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+			D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+			D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+			D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+			D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+			D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+			D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+			D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+			D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+			D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+			D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+			D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+			D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+			D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+			D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+			D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+			D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+			D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+			D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+			D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
 		}
 
 		//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 		// based on BGK Plus Comp
 		//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-		real mfcbb = (D.f[DIR_P00   ])[ke   ];
-		real mfabb = (D.f[DIR_M00   ])[kw   ];
-		real mfbcb = (D.f[DIR_0P0   ])[kn   ];
-		real mfbab = (D.f[DIR_0M0   ])[ks   ];
-		real mfbbc = (D.f[DIR_00P   ])[kt   ];
-		real mfbba = (D.f[DIR_00M   ])[kb   ];
-		real mfccb = (D.f[DIR_PP0  ])[kne  ];
-		real mfaab = (D.f[DIR_MM0  ])[ksw  ];
-		real mfcab = (D.f[DIR_PM0  ])[kse  ];
-		real mfacb = (D.f[DIR_MP0  ])[knw  ];
-		real mfcbc = (D.f[DIR_P0P  ])[kte  ];
-		real mfaba = (D.f[DIR_M0M  ])[kbw  ];
-		real mfcba = (D.f[DIR_P0M  ])[kbe  ];
-		real mfabc = (D.f[DIR_M0P  ])[ktw  ];
-		real mfbcc = (D.f[DIR_0PP  ])[ktn  ];
-		real mfbaa = (D.f[DIR_0MM  ])[kbs  ];
-		real mfbca = (D.f[DIR_0PM  ])[kbn  ];
-		real mfbac = (D.f[DIR_0MP  ])[kts  ];
+		real mfcbb = (D.f[DIR_P00])[ke   ];
+		real mfabb = (D.f[DIR_M00])[kw   ];
+		real mfbcb = (D.f[DIR_0P0])[kn   ];
+		real mfbab = (D.f[DIR_0M0])[ks   ];
+		real mfbbc = (D.f[DIR_00P])[kt   ];
+		real mfbba = (D.f[DIR_00M])[kb   ];
+		real mfccb = (D.f[DIR_PP0])[kne  ];
+		real mfaab = (D.f[DIR_MM0])[ksw  ];
+		real mfcab = (D.f[DIR_PM0])[kse  ];
+		real mfacb = (D.f[DIR_MP0])[knw  ];
+		real mfcbc = (D.f[DIR_P0P])[kte  ];
+		real mfaba = (D.f[DIR_M0M])[kbw  ];
+		real mfcba = (D.f[DIR_P0M])[kbe  ];
+		real mfabc = (D.f[DIR_M0P])[ktw  ];
+		real mfbcc = (D.f[DIR_0PP])[ktn  ];
+		real mfbaa = (D.f[DIR_0MM])[kbs  ];
+		real mfbca = (D.f[DIR_0PM])[kbn  ];
+		real mfbac = (D.f[DIR_0MP])[kts  ];
 		real mfbbb = (D.f[DIR_000])[kzero];
-		real mfccc = (D.f[DIR_PPP ])[ktne ];
-		real mfaac = (D.f[DIR_MMP ])[ktsw ];
-		real mfcac = (D.f[DIR_PMP ])[ktse ];
-		real mfacc = (D.f[DIR_MPP ])[ktnw ];
-		real mfcca = (D.f[DIR_PPM ])[kbne ];
-		real mfaaa = (D.f[DIR_MMM ])[kbsw ];
-		real mfcaa = (D.f[DIR_PMM ])[kbse ];
-		real mfaca = (D.f[DIR_MPM ])[kbnw ];
+		real mfccc = (D.f[DIR_PPP])[ktne ];
+		real mfaac = (D.f[DIR_MMP])[ktsw ];
+		real mfcac = (D.f[DIR_PMP])[ktse ];
+		real mfacc = (D.f[DIR_MPP])[ktnw ];
+		real mfcca = (D.f[DIR_PPM])[kbne ];
+		real mfaaa = (D.f[DIR_MMM])[kbsw ];
+		real mfcaa = (D.f[DIR_PMM])[kbse ];
+		real mfaca = (D.f[DIR_MPM])[kbnw ];
 		////////////////////////////////////////////////////////////////////////////////////
 		real rho = (mfccc + mfaaa + mfaca + mfcac + mfacc + mfcaa + mfaac + mfcca +
 			        mfbac + mfbca + mfbaa + mfbcc + mfabc + mfcba + mfaba + mfcbc + mfacb + mfcab + mfaab + mfccb +
@@ -1049,33 +1074,33 @@ __global__ void QVeloStreetDeviceEQ27(
 		mfcaa = -rho * XXc * YYa * ZZa - c1o216;
 		mfaca = -rho * XXa * YYc * ZZa - c1o216;
 		//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-		(D.f[DIR_P00   ])[ke   ] = mfabb;//mfcbb;
-		(D.f[DIR_M00   ])[kw   ] = mfcbb;//mfabb;
-		(D.f[DIR_0P0   ])[kn   ] = mfbab;//mfbcb;
-		(D.f[DIR_0M0   ])[ks   ] = mfbcb;//mfbab;
-		(D.f[DIR_00P   ])[kt   ] = mfbba;//mfbbc;
-		(D.f[DIR_00M   ])[kb   ] = mfbbc;//mfbba;
-		(D.f[DIR_PP0  ])[kne  ] = mfaab;//mfccb;
-		(D.f[DIR_MM0  ])[ksw  ] = mfccb;//mfaab;
-		(D.f[DIR_PM0  ])[kse  ] = mfacb;//mfcab;
-		(D.f[DIR_MP0  ])[knw  ] = mfcab;//mfacb;
-		(D.f[DIR_P0P  ])[kte  ] = mfaba;//mfcbc;
-		(D.f[DIR_M0M  ])[kbw  ] = mfcbc;//mfaba;
-		(D.f[DIR_P0M  ])[kbe  ] = mfabc;//mfcba;
-		(D.f[DIR_M0P  ])[ktw  ] = mfcba;//mfabc;
-		(D.f[DIR_0PP  ])[ktn  ] = mfbaa;//mfbcc;
-		(D.f[DIR_0MM  ])[kbs  ] = mfbcc;//mfbaa;
-		(D.f[DIR_0PM  ])[kbn  ] = mfbac;//mfbca;
-		(D.f[DIR_0MP  ])[kts  ] = mfbca;//mfbac;
+		(D.f[DIR_P00])[ke   ] = mfabb;//mfcbb;
+		(D.f[DIR_M00])[kw   ] = mfcbb;//mfabb;
+		(D.f[DIR_0P0])[kn   ] = mfbab;//mfbcb;
+		(D.f[DIR_0M0])[ks   ] = mfbcb;//mfbab;
+		(D.f[DIR_00P])[kt   ] = mfbba;//mfbbc;
+		(D.f[DIR_00M])[kb   ] = mfbbc;//mfbba;
+		(D.f[DIR_PP0])[kne  ] = mfaab;//mfccb;
+		(D.f[DIR_MM0])[ksw  ] = mfccb;//mfaab;
+		(D.f[DIR_PM0])[kse  ] = mfacb;//mfcab;
+		(D.f[DIR_MP0])[knw  ] = mfcab;//mfacb;
+		(D.f[DIR_P0P])[kte  ] = mfaba;//mfcbc;
+		(D.f[DIR_M0M])[kbw  ] = mfcbc;//mfaba;
+		(D.f[DIR_P0M])[kbe  ] = mfabc;//mfcba;
+		(D.f[DIR_M0P])[ktw  ] = mfcba;//mfabc;
+		(D.f[DIR_0PP])[ktn  ] = mfbaa;//mfbcc;
+		(D.f[DIR_0MM])[kbs  ] = mfbcc;//mfbaa;
+		(D.f[DIR_0PM])[kbn  ] = mfbac;//mfbca;
+		(D.f[DIR_0MP])[kts  ] = mfbca;//mfbac;
 		(D.f[DIR_000])[kzero] = mfbbb;//mfbbb;
-		(D.f[DIR_PPP ])[ktne ] = mfaaa;//mfccc;
-		(D.f[DIR_MMP ])[ktsw ] = mfcca;//mfaac;
-		(D.f[DIR_PMP ])[ktse ] = mfaca;//mfcac;
-		(D.f[DIR_MPP ])[ktnw ] = mfcaa;//mfacc;
-		(D.f[DIR_PPM ])[kbne ] = mfaac;//mfcca;
-		(D.f[DIR_MMM ])[kbsw ] = mfccc;//mfaaa;
-		(D.f[DIR_PMM ])[kbse ] = mfacc;//mfcaa;
-		(D.f[DIR_MPM ])[kbnw ] = mfcac;//mfaca;
+		(D.f[DIR_PPP])[ktne ] = mfaaa;//mfccc;
+		(D.f[DIR_MMP])[ktsw ] = mfcca;//mfaac;
+		(D.f[DIR_PMP])[ktse ] = mfaca;//mfcac;
+		(D.f[DIR_MPP])[ktnw ] = mfcaa;//mfacc;
+		(D.f[DIR_PPM])[kbne ] = mfaac;//mfcca;
+		(D.f[DIR_MMM])[kbsw ] = mfccc;//mfaaa;
+		(D.f[DIR_PMM])[kbse ] = mfacc;//mfcaa;
+		(D.f[DIR_MPM])[kbnw ] = mfcac;//mfaca;
 	}
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -1120,80 +1145,80 @@ __global__ void QVeloStreetDeviceEQ27(
 
 //////////////////////////////////////////////////////////////////////////////
 __global__ void QVelDeviceIncompHighNu27(
-													real* vx,
-													real* vy,
-													real* vz,
-													real* DD, 
-													int* k_Q, 
-													real* QQ,
-													unsigned int numberOfBCnodes, 
-													real om1, 
-													unsigned int* neighborX,
-													unsigned int* neighborY,
-													unsigned int* neighborZ,
-													unsigned int size_Mat, 
-													bool isEvenTimestep)
+    real* vx,
+    real* vy,
+    real* vz,
+    real* DD, 
+    int* k_Q, 
+    real* QQ,
+    unsigned int numberOfBCnodes, 
+    real om1, 
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes, 
+    bool isEvenTimestep)
 {
    Distributions27 D;
    if (isEvenTimestep==true)
    {
-      D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+      D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
    } 
    else
    {
-      D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+      D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
    }
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  x = threadIdx.x;  // Globaler x-Index 
@@ -1218,24 +1243,24 @@ __global__ void QVelDeviceIncompHighNu27(
             *q_dirBE,  *q_dirTW,  *q_dirTN,  *q_dirBS,  *q_dirBN,  *q_dirTS,
             *q_dirTNE, *q_dirTSW, *q_dirTSE, *q_dirTNW, *q_dirBNE, *q_dirBSW,
             *q_dirBSE, *q_dirBNW; 
-      q_dirE   = &QQ[DIR_P00   * numberOfBCnodes];
-      q_dirW   = &QQ[DIR_M00   * numberOfBCnodes];
-      q_dirN   = &QQ[DIR_0P0   * numberOfBCnodes];
-      q_dirS   = &QQ[DIR_0M0   * numberOfBCnodes];
-      q_dirT   = &QQ[DIR_00P   * numberOfBCnodes];
-      q_dirB   = &QQ[DIR_00M   * numberOfBCnodes];
-      q_dirNE  = &QQ[DIR_PP0  * numberOfBCnodes];
-      q_dirSW  = &QQ[DIR_MM0  * numberOfBCnodes];
-      q_dirSE  = &QQ[DIR_PM0  * numberOfBCnodes];
-      q_dirNW  = &QQ[DIR_MP0  * numberOfBCnodes];
-      q_dirTE  = &QQ[DIR_P0P  * numberOfBCnodes];
-      q_dirBW  = &QQ[DIR_M0M  * numberOfBCnodes];
-      q_dirBE  = &QQ[DIR_P0M  * numberOfBCnodes];
-      q_dirTW  = &QQ[DIR_M0P  * numberOfBCnodes];
-      q_dirTN  = &QQ[DIR_0PP  * numberOfBCnodes];
-      q_dirBS  = &QQ[DIR_0MM  * numberOfBCnodes];
-      q_dirBN  = &QQ[DIR_0PM  * numberOfBCnodes];
-      q_dirTS  = &QQ[DIR_0MP  * numberOfBCnodes];
+      q_dirE   = &QQ[DIR_P00 * numberOfBCnodes];
+      q_dirW   = &QQ[DIR_M00 * numberOfBCnodes];
+      q_dirN   = &QQ[DIR_0P0 * numberOfBCnodes];
+      q_dirS   = &QQ[DIR_0M0 * numberOfBCnodes];
+      q_dirT   = &QQ[DIR_00P * numberOfBCnodes];
+      q_dirB   = &QQ[DIR_00M * numberOfBCnodes];
+      q_dirNE  = &QQ[DIR_PP0 * numberOfBCnodes];
+      q_dirSW  = &QQ[DIR_MM0 * numberOfBCnodes];
+      q_dirSE  = &QQ[DIR_PM0 * numberOfBCnodes];
+      q_dirNW  = &QQ[DIR_MP0 * numberOfBCnodes];
+      q_dirTE  = &QQ[DIR_P0P * numberOfBCnodes];
+      q_dirBW  = &QQ[DIR_M0M * numberOfBCnodes];
+      q_dirBE  = &QQ[DIR_P0M * numberOfBCnodes];
+      q_dirTW  = &QQ[DIR_M0P * numberOfBCnodes];
+      q_dirTN  = &QQ[DIR_0PP * numberOfBCnodes];
+      q_dirBS  = &QQ[DIR_0MM * numberOfBCnodes];
+      q_dirBN  = &QQ[DIR_0PM * numberOfBCnodes];
+      q_dirTS  = &QQ[DIR_0MP * numberOfBCnodes];
       q_dirTNE = &QQ[DIR_PPP * numberOfBCnodes];
       q_dirTSW = &QQ[DIR_MMP * numberOfBCnodes];
       q_dirTSE = &QQ[DIR_PMP * numberOfBCnodes];
@@ -1278,32 +1303,32 @@ __global__ void QVelDeviceIncompHighNu27(
       real f_E,  f_W,  f_N,  f_S,  f_T,  f_B,   f_NE,  f_SW,  f_SE,  f_NW,  f_TE,  f_BW,  f_BE,
          f_TW, f_TN, f_BS, f_BN, f_TS, f_TNE, f_TSW, f_TSE, f_TNW, f_BNE, f_BSW, f_BSE, f_BNW;
 
-      f_E   = (D.f[DIR_P00   ])[ke   ];
-      f_W   = (D.f[DIR_M00   ])[kw   ];
-      f_N   = (D.f[DIR_0P0   ])[kn   ];
-      f_S   = (D.f[DIR_0M0   ])[ks   ];
-      f_T   = (D.f[DIR_00P   ])[kt   ];
-      f_B   = (D.f[DIR_00M   ])[kb   ];
-      f_NE  = (D.f[DIR_PP0  ])[kne  ];
-      f_SW  = (D.f[DIR_MM0  ])[ksw  ];
-      f_SE  = (D.f[DIR_PM0  ])[kse  ];
-      f_NW  = (D.f[DIR_MP0  ])[knw  ];
-      f_TE  = (D.f[DIR_P0P  ])[kte  ];
-      f_BW  = (D.f[DIR_M0M  ])[kbw  ];
-      f_BE  = (D.f[DIR_P0M  ])[kbe  ];
-      f_TW  = (D.f[DIR_M0P  ])[ktw  ];
-      f_TN  = (D.f[DIR_0PP  ])[ktn  ];
-      f_BS  = (D.f[DIR_0MM  ])[kbs  ];
-      f_BN  = (D.f[DIR_0PM  ])[kbn  ];
-      f_TS  = (D.f[DIR_0MP  ])[kts  ];
-      f_TNE = (D.f[DIR_PPP ])[ktne ];
-      f_TSW = (D.f[DIR_MMP ])[ktsw ];
-      f_TSE = (D.f[DIR_PMP ])[ktse ];
-      f_TNW = (D.f[DIR_MPP ])[ktnw ];
-      f_BNE = (D.f[DIR_PPM ])[kbne ];
-      f_BSW = (D.f[DIR_MMM ])[kbsw ];
-      f_BSE = (D.f[DIR_PMM ])[kbse ];
-      f_BNW = (D.f[DIR_MPM ])[kbnw ];
+      f_E   = (D.f[DIR_P00])[ke   ];
+      f_W   = (D.f[DIR_M00])[kw   ];
+      f_N   = (D.f[DIR_0P0])[kn   ];
+      f_S   = (D.f[DIR_0M0])[ks   ];
+      f_T   = (D.f[DIR_00P])[kt   ];
+      f_B   = (D.f[DIR_00M])[kb   ];
+      f_NE  = (D.f[DIR_PP0])[kne  ];
+      f_SW  = (D.f[DIR_MM0])[ksw  ];
+      f_SE  = (D.f[DIR_PM0])[kse  ];
+      f_NW  = (D.f[DIR_MP0])[knw  ];
+      f_TE  = (D.f[DIR_P0P])[kte  ];
+      f_BW  = (D.f[DIR_M0M])[kbw  ];
+      f_BE  = (D.f[DIR_P0M])[kbe  ];
+      f_TW  = (D.f[DIR_M0P])[ktw  ];
+      f_TN  = (D.f[DIR_0PP])[ktn  ];
+      f_BS  = (D.f[DIR_0MM])[kbs  ];
+      f_BN  = (D.f[DIR_0PM])[kbn  ];
+      f_TS  = (D.f[DIR_0MP])[kts  ];
+      f_TNE = (D.f[DIR_PPP])[ktne ];
+      f_TSW = (D.f[DIR_MMP])[ktsw ];
+      f_TSE = (D.f[DIR_PMP])[ktse ];
+      f_TNW = (D.f[DIR_MPP])[ktnw ];
+      f_BNE = (D.f[DIR_PPM])[kbne ];
+      f_BSW = (D.f[DIR_MMM])[kbsw ];
+      f_BSE = (D.f[DIR_PMM])[kbse ];
+      f_BNW = (D.f[DIR_MPM])[kbnw ];
       ////////////////////////////////////////////////////////////////////////////////
       real vx1, vx2, vx3, drho, feq, q;
       drho   =  f_TSE + f_TNW + f_TNE + f_TSW + f_BSE + f_BNW + f_BNE + f_BSW +
@@ -1328,63 +1353,63 @@ __global__ void QVelDeviceIncompHighNu27(
       //////////////////////////////////////////////////////////////////////////
       if (isEvenTimestep==false)
       {
-         D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+         D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
       } 
       else
       {
-         D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+         D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
       }
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       //Test
@@ -1618,80 +1643,80 @@ __global__ void QVelDeviceIncompHighNu27(
 
 //////////////////////////////////////////////////////////////////////////////
 __global__ void QVelDeviceCompHighNu27(
-													real* vx,
-													real* vy,
-													real* vz,
-													real* DD,
-													int* k_Q,
-													real* QQ,
-													unsigned int numberOfBCnodes, 
-													real om1,
-													unsigned int* neighborX,
-													unsigned int* neighborY,
-													unsigned int* neighborZ,
-													unsigned int size_Mat, 
-													bool isEvenTimestep)
+    real* vx,
+    real* vy,
+    real* vz,
+    real* DD,
+    int* k_Q,
+    real* QQ,
+    unsigned int numberOfBCnodes, 
+    real om1,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes, 
+    bool isEvenTimestep)
 {
    Distributions27 D;
    if (isEvenTimestep==true)
    {
-      D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+      D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
    } 
    else
    {
-      D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+      D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
    }
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  x = threadIdx.x;  // Globaler x-Index 
@@ -1716,24 +1741,24 @@ __global__ void QVelDeviceCompHighNu27(
             *q_dirBE,  *q_dirTW,  *q_dirTN,  *q_dirBS,  *q_dirBN,  *q_dirTS,
             *q_dirTNE, *q_dirTSW, *q_dirTSE, *q_dirTNW, *q_dirBNE, *q_dirBSW,
             *q_dirBSE, *q_dirBNW; 
-      q_dirE   = &QQ[DIR_P00   * numberOfBCnodes];
-      q_dirW   = &QQ[DIR_M00   * numberOfBCnodes];
-      q_dirN   = &QQ[DIR_0P0   * numberOfBCnodes];
-      q_dirS   = &QQ[DIR_0M0   * numberOfBCnodes];
-      q_dirT   = &QQ[DIR_00P   * numberOfBCnodes];
-      q_dirB   = &QQ[DIR_00M   * numberOfBCnodes];
-      q_dirNE  = &QQ[DIR_PP0  * numberOfBCnodes];
-      q_dirSW  = &QQ[DIR_MM0  * numberOfBCnodes];
-      q_dirSE  = &QQ[DIR_PM0  * numberOfBCnodes];
-      q_dirNW  = &QQ[DIR_MP0  * numberOfBCnodes];
-      q_dirTE  = &QQ[DIR_P0P  * numberOfBCnodes];
-      q_dirBW  = &QQ[DIR_M0M  * numberOfBCnodes];
-      q_dirBE  = &QQ[DIR_P0M  * numberOfBCnodes];
-      q_dirTW  = &QQ[DIR_M0P  * numberOfBCnodes];
-      q_dirTN  = &QQ[DIR_0PP  * numberOfBCnodes];
-      q_dirBS  = &QQ[DIR_0MM  * numberOfBCnodes];
-      q_dirBN  = &QQ[DIR_0PM  * numberOfBCnodes];
-      q_dirTS  = &QQ[DIR_0MP  * numberOfBCnodes];
+      q_dirE   = &QQ[DIR_P00 * numberOfBCnodes];
+      q_dirW   = &QQ[DIR_M00 * numberOfBCnodes];
+      q_dirN   = &QQ[DIR_0P0 * numberOfBCnodes];
+      q_dirS   = &QQ[DIR_0M0 * numberOfBCnodes];
+      q_dirT   = &QQ[DIR_00P * numberOfBCnodes];
+      q_dirB   = &QQ[DIR_00M * numberOfBCnodes];
+      q_dirNE  = &QQ[DIR_PP0 * numberOfBCnodes];
+      q_dirSW  = &QQ[DIR_MM0 * numberOfBCnodes];
+      q_dirSE  = &QQ[DIR_PM0 * numberOfBCnodes];
+      q_dirNW  = &QQ[DIR_MP0 * numberOfBCnodes];
+      q_dirTE  = &QQ[DIR_P0P * numberOfBCnodes];
+      q_dirBW  = &QQ[DIR_M0M * numberOfBCnodes];
+      q_dirBE  = &QQ[DIR_P0M * numberOfBCnodes];
+      q_dirTW  = &QQ[DIR_M0P * numberOfBCnodes];
+      q_dirTN  = &QQ[DIR_0PP * numberOfBCnodes];
+      q_dirBS  = &QQ[DIR_0MM * numberOfBCnodes];
+      q_dirBN  = &QQ[DIR_0PM * numberOfBCnodes];
+      q_dirTS  = &QQ[DIR_0MP * numberOfBCnodes];
       q_dirTNE = &QQ[DIR_PPP * numberOfBCnodes];
       q_dirTSW = &QQ[DIR_MMP * numberOfBCnodes];
       q_dirTSE = &QQ[DIR_PMP * numberOfBCnodes];
@@ -1776,58 +1801,58 @@ __global__ void QVelDeviceCompHighNu27(
       real f_E,  f_W,  f_N,  f_S,  f_T,  f_B,   f_NE,  f_SW,  f_SE,  f_NW,  f_TE,  f_BW,  f_BE,
          f_TW, f_TN, f_BS, f_BN, f_TS, f_TNE, f_TSW, f_TSE, f_TNW, f_BNE, f_BSW, f_BSE, f_BNW;
 
-      f_E   = (D.f[DIR_P00   ])[ke   ];
-      f_W   = (D.f[DIR_M00   ])[kw   ];
-      f_N   = (D.f[DIR_0P0   ])[kn   ];
-      f_S   = (D.f[DIR_0M0   ])[ks   ];
-      f_T   = (D.f[DIR_00P   ])[kt   ];
-      f_B   = (D.f[DIR_00M   ])[kb   ];
-      f_NE  = (D.f[DIR_PP0  ])[kne  ];
-      f_SW  = (D.f[DIR_MM0  ])[ksw  ];
-      f_SE  = (D.f[DIR_PM0  ])[kse  ];
-      f_NW  = (D.f[DIR_MP0  ])[knw  ];
-      f_TE  = (D.f[DIR_P0P  ])[kte  ];
-      f_BW  = (D.f[DIR_M0M  ])[kbw  ];
-      f_BE  = (D.f[DIR_P0M  ])[kbe  ];
-      f_TW  = (D.f[DIR_M0P  ])[ktw  ];
-      f_TN  = (D.f[DIR_0PP  ])[ktn  ];
-      f_BS  = (D.f[DIR_0MM  ])[kbs  ];
-      f_BN  = (D.f[DIR_0PM  ])[kbn  ];
-      f_TS  = (D.f[DIR_0MP  ])[kts  ];
-      f_TNE = (D.f[DIR_PPP ])[ktne ];
-      f_TSW = (D.f[DIR_MMP ])[ktsw ];
-      f_TSE = (D.f[DIR_PMP ])[ktse ];
-      f_TNW = (D.f[DIR_MPP ])[ktnw ];
-      f_BNE = (D.f[DIR_PPM ])[kbne ];
-      f_BSW = (D.f[DIR_MMM ])[kbsw ];
-      f_BSE = (D.f[DIR_PMM ])[kbse ];
-      f_BNW = (D.f[DIR_MPM ])[kbnw ];
-      //f_W    = (D.f[DIR_P00   ])[ke   ];
-      //f_E    = (D.f[DIR_M00   ])[kw   ];
-      //f_S    = (D.f[DIR_0P0   ])[kn   ];
-      //f_N    = (D.f[DIR_0M0   ])[ks   ];
-      //f_B    = (D.f[DIR_00P   ])[kt   ];
-      //f_T    = (D.f[DIR_00M   ])[kb   ];
-      //f_SW   = (D.f[DIR_PP0  ])[kne  ];
-      //f_NE   = (D.f[DIR_MM0  ])[ksw  ];
-      //f_NW   = (D.f[DIR_PM0  ])[kse  ];
-      //f_SE   = (D.f[DIR_MP0  ])[knw  ];
-      //f_BW   = (D.f[DIR_P0P  ])[kte  ];
-      //f_TE   = (D.f[DIR_M0M  ])[kbw  ];
-      //f_TW   = (D.f[DIR_P0M  ])[kbe  ];
-      //f_BE   = (D.f[DIR_M0P  ])[ktw  ];
-      //f_BS   = (D.f[DIR_0PP  ])[ktn  ];
-      //f_TN   = (D.f[DIR_0MM  ])[kbs  ];
-      //f_TS   = (D.f[DIR_0PM  ])[kbn  ];
-      //f_BN   = (D.f[DIR_0MP  ])[kts  ];
-      //f_BSW  = (D.f[DIR_PPP ])[ktne ];
-      //f_BNE  = (D.f[DIR_MMP ])[ktsw ];
-      //f_BNW  = (D.f[DIR_PMP ])[ktse ];
-      //f_BSE  = (D.f[DIR_MPP ])[ktnw ];
-      //f_TSW  = (D.f[DIR_PPM ])[kbne ];
-      //f_TNE  = (D.f[DIR_MMM ])[kbsw ];
-      //f_TNW  = (D.f[DIR_PMM ])[kbse ];
-      //f_TSE  = (D.f[DIR_MPM ])[kbnw ];
+      f_E   = (D.f[DIR_P00])[ke   ];
+      f_W   = (D.f[DIR_M00])[kw   ];
+      f_N   = (D.f[DIR_0P0])[kn   ];
+      f_S   = (D.f[DIR_0M0])[ks   ];
+      f_T   = (D.f[DIR_00P])[kt   ];
+      f_B   = (D.f[DIR_00M])[kb   ];
+      f_NE  = (D.f[DIR_PP0])[kne  ];
+      f_SW  = (D.f[DIR_MM0])[ksw  ];
+      f_SE  = (D.f[DIR_PM0])[kse  ];
+      f_NW  = (D.f[DIR_MP0])[knw  ];
+      f_TE  = (D.f[DIR_P0P])[kte  ];
+      f_BW  = (D.f[DIR_M0M])[kbw  ];
+      f_BE  = (D.f[DIR_P0M])[kbe  ];
+      f_TW  = (D.f[DIR_M0P])[ktw  ];
+      f_TN  = (D.f[DIR_0PP])[ktn  ];
+      f_BS  = (D.f[DIR_0MM])[kbs  ];
+      f_BN  = (D.f[DIR_0PM])[kbn  ];
+      f_TS  = (D.f[DIR_0MP])[kts  ];
+      f_TNE = (D.f[DIR_PPP])[ktne ];
+      f_TSW = (D.f[DIR_MMP])[ktsw ];
+      f_TSE = (D.f[DIR_PMP])[ktse ];
+      f_TNW = (D.f[DIR_MPP])[ktnw ];
+      f_BNE = (D.f[DIR_PPM])[kbne ];
+      f_BSW = (D.f[DIR_MMM])[kbsw ];
+      f_BSE = (D.f[DIR_PMM])[kbse ];
+      f_BNW = (D.f[DIR_MPM])[kbnw ];
+      //f_W    = (D.f[DIR_P00])[ke   ];
+      //f_E    = (D.f[DIR_M00])[kw   ];
+      //f_S    = (D.f[DIR_0P0])[kn   ];
+      //f_N    = (D.f[DIR_0M0])[ks   ];
+      //f_B    = (D.f[DIR_00P])[kt   ];
+      //f_T    = (D.f[DIR_00M])[kb   ];
+      //f_SW   = (D.f[DIR_PP0])[kne  ];
+      //f_NE   = (D.f[DIR_MM0])[ksw  ];
+      //f_NW   = (D.f[DIR_PM0])[kse  ];
+      //f_SE   = (D.f[DIR_MP0])[knw  ];
+      //f_BW   = (D.f[DIR_P0P])[kte  ];
+      //f_TE   = (D.f[DIR_M0M])[kbw  ];
+      //f_TW   = (D.f[DIR_P0M])[kbe  ];
+      //f_BE   = (D.f[DIR_M0P])[ktw  ];
+      //f_BS   = (D.f[DIR_0PP])[ktn  ];
+      //f_TN   = (D.f[DIR_0MM])[kbs  ];
+      //f_TS   = (D.f[DIR_0PM])[kbn  ];
+      //f_BN   = (D.f[DIR_0MP])[kts  ];
+      //f_BSW  = (D.f[DIR_PPP])[ktne ];
+      //f_BNE  = (D.f[DIR_MMP])[ktsw ];
+      //f_BNW  = (D.f[DIR_PMP])[ktse ];
+      //f_BSE  = (D.f[DIR_MPP])[ktnw ];
+      //f_TSW  = (D.f[DIR_PPM])[kbne ];
+      //f_TNE  = (D.f[DIR_MMM])[kbsw ];
+      //f_TNW  = (D.f[DIR_PMM])[kbse ];
+      //f_TSE  = (D.f[DIR_MPM])[kbnw ];
       ////////////////////////////////////////////////////////////////////////////////
       real vx1, vx2, vx3, drho, feq, q;
       drho   =  f_TSE + f_TNW + f_TNE + f_TSW + f_BSE + f_BNW + f_BNE + f_BSW +
@@ -1852,63 +1877,63 @@ __global__ void QVelDeviceCompHighNu27(
       //////////////////////////////////////////////////////////////////////////
       if (isEvenTimestep==false)
       {
-         D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+         D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
       } 
       else
       {
-         D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+         D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
       }
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       //Test
@@ -2194,39 +2219,32 @@ __global__ void QVelDeviceCompHighNu27(
 
 //////////////////////////////////////////////////////////////////////////////
 __global__ void QVelDeviceCompZeroPress27(
-														real* velocityX,
-														real* velocityY,
-														real* velocityZ,
-														real* distribution, 
-														int* subgridDistanceIndices, 
-														real* subgridDistances,
-														unsigned int numberOfBCnodes, 
-														real omega, 
-														unsigned int* neighborX,
-														unsigned int* neighborY,
-														unsigned int* neighborZ,
-														unsigned int numberOfLBnodes, 
-														bool isEvenTimestep)
+    real* velocityX,
+    real* velocityY,
+    real* velocityZ,
+    real* distribution, 
+    int* subgridDistanceIndices, 
+    real* subgridDistances,
+    unsigned int numberOfBCnodes, 
+    real omega, 
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes, 
+    bool isEvenTimestep)
 {
    //////////////////////////////////////////////////////////////////////////
-	//! The velocity boundary condition is executed in the following steps
-	//!
-	////////////////////////////////////////////////////////////////////////////////
-	//! - Get node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
-	//!
-   const unsigned  x = threadIdx.x;  // global x-index 
-   const unsigned  y = blockIdx.x;   // global y-index 
-   const unsigned  z = blockIdx.y;   // global z-index 
-
-   const unsigned nx = blockDim.x;
-   const unsigned ny = gridDim.x;
-
-   const unsigned k = nx*(ny*z + y) + x;
+   //! The velocity boundary condition is executed in the following steps
+   //!
+   ////////////////////////////////////////////////////////////////////////////////
+   //! - Get node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
+   //!
+   const unsigned nodeIndex = getNodeIndex();
 
    //////////////////////////////////////////////////////////////////////////
    //! - Run for all indices in size of boundary condition (numberOfBCnodes)
    //!
-   if(k < numberOfBCnodes)
+   if(nodeIndex < numberOfBCnodes)
    {
 
       //////////////////////////////////////////////////////////////////////////
@@ -2239,9 +2257,9 @@ __global__ void QVelDeviceCompZeroPress27(
       ////////////////////////////////////////////////////////////////////////////////
       //! - Set local velocities
       //!
-      real VeloX = velocityX[k];
-      real VeloY = velocityY[k];
-      real VeloZ = velocityZ[k];
+      real VeloX = velocityX[nodeIndex];
+      real VeloY = velocityY[nodeIndex];
+      real VeloZ = velocityZ[nodeIndex];
 
 
       ////////////////////////////////////////////////////////////////////////////////
@@ -2253,7 +2271,7 @@ __global__ void QVelDeviceCompZeroPress27(
       ////////////////////////////////////////////////////////////////////////////////
       //! - Set neighbor indices (necessary for indirect addressing)
       //!
-      unsigned int KQK  = subgridDistanceIndices[k];
+      unsigned int KQK  = subgridDistanceIndices[nodeIndex];
       unsigned int kzero= KQK;
       unsigned int ke   = KQK;
       unsigned int kw   = neighborX[KQK];
@@ -2285,32 +2303,32 @@ __global__ void QVelDeviceCompZeroPress27(
       ////////////////////////////////////////////////////////////////////////////////
       //! - Set local distributions
       //!
-      real f_W    = (dist.f[DIR_P00   ])[ke   ];
-      real f_E    = (dist.f[DIR_M00   ])[kw   ];
-      real f_S    = (dist.f[DIR_0P0   ])[kn   ];
-      real f_N    = (dist.f[DIR_0M0   ])[ks   ];
-      real f_B    = (dist.f[DIR_00P   ])[kt   ];
-      real f_T    = (dist.f[DIR_00M   ])[kb   ];
-      real f_SW   = (dist.f[DIR_PP0  ])[kne  ];
-      real f_NE   = (dist.f[DIR_MM0  ])[ksw  ];
-      real f_NW   = (dist.f[DIR_PM0  ])[kse  ];
-      real f_SE   = (dist.f[DIR_MP0  ])[knw  ];
-      real f_BW   = (dist.f[DIR_P0P  ])[kte  ];
-      real f_TE   = (dist.f[DIR_M0M  ])[kbw  ];
-      real f_TW   = (dist.f[DIR_P0M  ])[kbe  ];
-      real f_BE   = (dist.f[DIR_M0P  ])[ktw  ];
-      real f_BS   = (dist.f[DIR_0PP  ])[ktn  ];
-      real f_TN   = (dist.f[DIR_0MM  ])[kbs  ];
-      real f_TS   = (dist.f[DIR_0PM  ])[kbn  ];
-      real f_BN   = (dist.f[DIR_0MP  ])[kts  ];
-      real f_BSW  = (dist.f[DIR_PPP ])[ktne ];
-      real f_BNE  = (dist.f[DIR_MMP ])[ktsw ];
-      real f_BNW  = (dist.f[DIR_PMP ])[ktse ];
-      real f_BSE  = (dist.f[DIR_MPP ])[ktnw ];
-      real f_TSW  = (dist.f[DIR_PPM ])[kbne ];
-      real f_TNE  = (dist.f[DIR_MMM ])[kbsw ];
-      real f_TNW  = (dist.f[DIR_PMM ])[kbse ];
-      real f_TSE  = (dist.f[DIR_MPM ])[kbnw ];
+      real f_W    = (dist.f[DIR_P00])[ke   ];
+      real f_E    = (dist.f[DIR_M00])[kw   ];
+      real f_S    = (dist.f[DIR_0P0])[kn   ];
+      real f_N    = (dist.f[DIR_0M0])[ks   ];
+      real f_B    = (dist.f[DIR_00P])[kt   ];
+      real f_T    = (dist.f[DIR_00M])[kb   ];
+      real f_SW   = (dist.f[DIR_PP0])[kne  ];
+      real f_NE   = (dist.f[DIR_MM0])[ksw  ];
+      real f_NW   = (dist.f[DIR_PM0])[kse  ];
+      real f_SE   = (dist.f[DIR_MP0])[knw  ];
+      real f_BW   = (dist.f[DIR_P0P])[kte  ];
+      real f_TE   = (dist.f[DIR_M0M])[kbw  ];
+      real f_TW   = (dist.f[DIR_P0M])[kbe  ];
+      real f_BE   = (dist.f[DIR_M0P])[ktw  ];
+      real f_BS   = (dist.f[DIR_0PP])[ktn  ];
+      real f_TN   = (dist.f[DIR_0MM])[kbs  ];
+      real f_TS   = (dist.f[DIR_0PM])[kbn  ];
+      real f_BN   = (dist.f[DIR_0MP])[kts  ];
+      real f_BSW  = (dist.f[DIR_PPP])[ktne ];
+      real f_BNE  = (dist.f[DIR_MMP])[ktsw ];
+      real f_BNW  = (dist.f[DIR_PMP])[ktse ];
+      real f_BSE  = (dist.f[DIR_MPP])[ktnw ];
+      real f_TSW  = (dist.f[DIR_PPM])[kbne ];
+      real f_TNE  = (dist.f[DIR_MMM])[kbsw ];
+      real f_TNW  = (dist.f[DIR_PMM])[kbse ];
+      real f_TSE  = (dist.f[DIR_MPM])[kbnw ];
 
       ////////////////////////////////////////////////////////////////////////////////
       //! - Calculate macroscopic quantities
@@ -2342,7 +2360,7 @@ __global__ void QVelDeviceCompZeroPress27(
       ////////////////////////////////////////////////////////////////////////////////
       //! - Update distributions with subgrid distance (q) between zero and one
       real feq, q, velocityLB, velocityBC;
-      q = (subgridD.q[DIR_P00])[k];
+      q = (subgridD.q[DIR_P00])[nodeIndex];
       if (q>=c0o1 && q<=c1o1) // only update distribution for q between zero and one
       {
          velocityLB = vx1;
@@ -2351,7 +2369,7 @@ __global__ void QVelDeviceCompZeroPress27(
          (dist.f[DIR_M00])[kw] = getInterpolatedDistributionForVeloWithPressureBC(q, f_E, f_W, feq, omega, drho, velocityBC, c2o27);
       }
 
-      q = (subgridD.q[DIR_M00])[k];
+      q = (subgridD.q[DIR_M00])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx1;
@@ -2360,7 +2378,7 @@ __global__ void QVelDeviceCompZeroPress27(
          (dist.f[DIR_P00])[ke] = getInterpolatedDistributionForVeloWithPressureBC(q, f_W, f_E, feq, omega, drho, velocityBC, c2o27);
       }
 
-      q = (subgridD.q[DIR_0P0])[k];
+      q = (subgridD.q[DIR_0P0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx2;
@@ -2369,7 +2387,7 @@ __global__ void QVelDeviceCompZeroPress27(
          (dist.f[DIR_0M0])[ks] = getInterpolatedDistributionForVeloWithPressureBC(q, f_N, f_S, feq, omega, drho, velocityBC, c2o27);
       }
 
-      q = (subgridD.q[DIR_0M0])[k];
+      q = (subgridD.q[DIR_0M0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx2;
@@ -2378,7 +2396,7 @@ __global__ void QVelDeviceCompZeroPress27(
          (dist.f[DIR_0P0])[kn] = getInterpolatedDistributionForVeloWithPressureBC(q, f_S, f_N, feq, omega, drho, velocityBC, c2o27);
       }
 
-      q = (subgridD.q[DIR_00P])[k];
+      q = (subgridD.q[DIR_00P])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx3;
@@ -2387,7 +2405,7 @@ __global__ void QVelDeviceCompZeroPress27(
          (dist.f[DIR_00M])[kb] = getInterpolatedDistributionForVeloWithPressureBC(q, f_T, f_B, feq, omega, drho, velocityBC, c2o27);
       }
 
-      q = (subgridD.q[DIR_00M])[k];
+      q = (subgridD.q[DIR_00M])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx3;
@@ -2396,7 +2414,7 @@ __global__ void QVelDeviceCompZeroPress27(
          (dist.f[DIR_00P])[kt] = getInterpolatedDistributionForVeloWithPressureBC(q, f_B, f_T, feq, omega, drho, velocityBC, c2o27);
       }
 
-      q = (subgridD.q[DIR_PP0])[k];
+      q = (subgridD.q[DIR_PP0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx1 + vx2;
@@ -2405,7 +2423,7 @@ __global__ void QVelDeviceCompZeroPress27(
          (dist.f[DIR_MM0])[ksw] = getInterpolatedDistributionForVeloWithPressureBC(q, f_NE, f_SW, feq, omega, drho, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_MM0])[k];
+      q = (subgridD.q[DIR_MM0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx1 - vx2;
@@ -2414,7 +2432,7 @@ __global__ void QVelDeviceCompZeroPress27(
          (dist.f[DIR_PP0])[kne] = getInterpolatedDistributionForVeloWithPressureBC(q, f_SW, f_NE, feq, omega, drho, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_PM0])[k];
+      q = (subgridD.q[DIR_PM0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx1 - vx2;
@@ -2423,7 +2441,7 @@ __global__ void QVelDeviceCompZeroPress27(
          (dist.f[DIR_MP0])[knw] = getInterpolatedDistributionForVeloWithPressureBC(q, f_SE, f_NW, feq, omega, drho, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_MP0])[k];
+      q = (subgridD.q[DIR_MP0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx1 + vx2;
@@ -2432,7 +2450,7 @@ __global__ void QVelDeviceCompZeroPress27(
          (dist.f[DIR_PM0])[kse] = getInterpolatedDistributionForVeloWithPressureBC(q, f_NW, f_SE, feq, omega, drho, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_P0P])[k];
+      q = (subgridD.q[DIR_P0P])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx1 + vx3;
@@ -2441,7 +2459,7 @@ __global__ void QVelDeviceCompZeroPress27(
          (dist.f[DIR_M0M])[kbw] = getInterpolatedDistributionForVeloWithPressureBC(q, f_TE, f_BW, feq, omega, drho, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_M0M])[k];
+      q = (subgridD.q[DIR_M0M])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx1 - vx3;
@@ -2450,7 +2468,7 @@ __global__ void QVelDeviceCompZeroPress27(
          (dist.f[DIR_P0P])[kte] = getInterpolatedDistributionForVeloWithPressureBC(q, f_BW, f_TE, feq, omega, drho, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_P0M])[k];
+      q = (subgridD.q[DIR_P0M])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx1 - vx3;
@@ -2459,7 +2477,7 @@ __global__ void QVelDeviceCompZeroPress27(
          (dist.f[DIR_M0P])[ktw] = getInterpolatedDistributionForVeloWithPressureBC(q, f_BE, f_TW, feq, omega, drho, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_M0P])[k];
+      q = (subgridD.q[DIR_M0P])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx1 + vx3;
@@ -2468,7 +2486,7 @@ __global__ void QVelDeviceCompZeroPress27(
          (dist.f[DIR_P0M])[kbe] = getInterpolatedDistributionForVeloWithPressureBC(q, f_TW, f_BE, feq, omega, drho, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_0PP])[k];
+      q = (subgridD.q[DIR_0PP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx2 + vx3;
@@ -2477,7 +2495,7 @@ __global__ void QVelDeviceCompZeroPress27(
          (dist.f[DIR_0MM])[kbs] = getInterpolatedDistributionForVeloWithPressureBC(q, f_TN, f_BS, feq, omega, drho, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_0MM])[k];
+      q = (subgridD.q[DIR_0MM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx2 - vx3;
@@ -2486,7 +2504,7 @@ __global__ void QVelDeviceCompZeroPress27(
          (dist.f[DIR_0PP])[ktn] = getInterpolatedDistributionForVeloWithPressureBC(q, f_BS, f_TN, feq, omega, drho, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_0PM])[k];
+      q = (subgridD.q[DIR_0PM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx2 - vx3;
@@ -2495,7 +2513,7 @@ __global__ void QVelDeviceCompZeroPress27(
          (dist.f[DIR_0MP])[kts] = getInterpolatedDistributionForVeloWithPressureBC(q, f_BN, f_TS, feq, omega, drho, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_0MP])[k];
+      q = (subgridD.q[DIR_0MP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx2 + vx3;
@@ -2504,7 +2522,7 @@ __global__ void QVelDeviceCompZeroPress27(
          (dist.f[DIR_0PM])[kbn] = getInterpolatedDistributionForVeloWithPressureBC(q, f_TS, f_BN, feq, omega, drho, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_PPP])[k];
+      q = (subgridD.q[DIR_PPP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx1 + vx2 + vx3;
@@ -2513,7 +2531,7 @@ __global__ void QVelDeviceCompZeroPress27(
          (dist.f[DIR_MMM])[kbsw] = getInterpolatedDistributionForVeloWithPressureBC(q, f_TNE, f_BSW, feq, omega, drho, velocityBC, c1o216);
       }
 
-      q = (subgridD.q[DIR_MMM])[k];
+      q = (subgridD.q[DIR_MMM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx1 - vx2 - vx3;
@@ -2522,7 +2540,7 @@ __global__ void QVelDeviceCompZeroPress27(
          (dist.f[DIR_PPP])[ktne] = getInterpolatedDistributionForVeloWithPressureBC(q, f_BSW, f_TNE, feq, omega, drho, velocityBC, c1o216);
       }
 
-      q = (subgridD.q[DIR_PPM])[k];
+      q = (subgridD.q[DIR_PPM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx1 + vx2 - vx3;
@@ -2531,7 +2549,7 @@ __global__ void QVelDeviceCompZeroPress27(
          (dist.f[DIR_MMP])[ktsw] = getInterpolatedDistributionForVeloWithPressureBC(q, f_BNE, f_TSW, feq, omega, drho, velocityBC, c1o216);
       }
 
-      q = (subgridD.q[DIR_MMP])[k];
+      q = (subgridD.q[DIR_MMP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx1 - vx2 + vx3;
@@ -2540,7 +2558,7 @@ __global__ void QVelDeviceCompZeroPress27(
          (dist.f[DIR_PPM])[kbne] = getInterpolatedDistributionForVeloWithPressureBC(q, f_TSW, f_BNE, feq, omega, drho, velocityBC, c1o216);
       }
 
-      q = (subgridD.q[DIR_PMP])[k];
+      q = (subgridD.q[DIR_PMP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx1 - vx2 + vx3;
@@ -2549,7 +2567,7 @@ __global__ void QVelDeviceCompZeroPress27(
          (dist.f[DIR_MPM])[kbnw] = getInterpolatedDistributionForVeloWithPressureBC(q, f_TSE, f_BNW, feq, omega, drho, velocityBC, c1o216);
       }
 
-      q = (subgridD.q[DIR_MPM])[k];
+      q = (subgridD.q[DIR_MPM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx1 + vx2 - vx3;
@@ -2558,7 +2576,7 @@ __global__ void QVelDeviceCompZeroPress27(
          (dist.f[DIR_PMP])[ktse] = getInterpolatedDistributionForVeloWithPressureBC(q, f_BNW, f_TSE, feq, omega, drho, velocityBC, c1o216);
       }
 
-      q = (subgridD.q[DIR_PMM])[k];
+      q = (subgridD.q[DIR_PMM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx1 - vx2 - vx3;
@@ -2567,7 +2585,7 @@ __global__ void QVelDeviceCompZeroPress27(
          (dist.f[DIR_MPP])[ktnw] = getInterpolatedDistributionForVeloWithPressureBC(q, f_BSE, f_TNW, feq, omega, drho, velocityBC, c1o216);
       }
 
-      q = (subgridD.q[DIR_MPP])[k];
+      q = (subgridD.q[DIR_MPP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx1 + vx2 + vx3;
@@ -2619,87 +2637,88 @@ __global__ void QVelDeviceCompZeroPress27(
 
 
 //////////////////////////////////////////////////////////////////////////////
-__global__ void QVelDeviceCompZeroPress1h27( int inx,
-														int iny,
-														real* vx,
-														real* vy,
-														real* vz,
-														real* DD, 
-														int* k_Q, 
-														real* QQ,
-														unsigned int numberOfBCnodes,
-														real om1, 
-														real Phi,
-														real angularVelocity,
-														unsigned int* neighborX,
-														unsigned int* neighborY,
-														unsigned int* neighborZ,
-														real* coordX,
-														real* coordY,
-														real* coordZ,
-														unsigned int size_Mat, 
-														bool isEvenTimestep)
+__global__ void QVelDeviceCompZeroPress1h27(
+    int inx,
+    int iny,
+    real* vx,
+    real* vy,
+    real* vz,
+    real* DD, 
+    int* k_Q, 
+    real* QQ,
+    unsigned int numberOfBCnodes,
+    real om1, 
+    real Phi,
+    real angularVelocity,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    real* coordX,
+    real* coordY,
+    real* coordZ,
+    unsigned long long numberOfLBnodes, 
+    bool isEvenTimestep)
 {
    Distributions27 D;
    if (isEvenTimestep==true)
    {
-      D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+      D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
    } 
    else
    {
-      D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+      D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
    }
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  x = threadIdx.x;  // Globaler x-Index 
@@ -2738,24 +2757,24 @@ __global__ void QVelDeviceCompZeroPress1h27( int inx,
             *q_dirBE,  *q_dirTW,  *q_dirTN,  *q_dirBS,  *q_dirBN,  *q_dirTS,
             *q_dirTNE, *q_dirTSW, *q_dirTSE, *q_dirTNW, *q_dirBNE, *q_dirBSW,
             *q_dirBSE, *q_dirBNW; 
-      q_dirE   = &QQ[DIR_P00   * numberOfBCnodes];
-      q_dirW   = &QQ[DIR_M00   * numberOfBCnodes];
-      q_dirN   = &QQ[DIR_0P0   * numberOfBCnodes];
-      q_dirS   = &QQ[DIR_0M0   * numberOfBCnodes];
-      q_dirT   = &QQ[DIR_00P   * numberOfBCnodes];
-      q_dirB   = &QQ[DIR_00M   * numberOfBCnodes];
-      q_dirNE  = &QQ[DIR_PP0  * numberOfBCnodes];
-      q_dirSW  = &QQ[DIR_MM0  * numberOfBCnodes];
-      q_dirSE  = &QQ[DIR_PM0  * numberOfBCnodes];
-      q_dirNW  = &QQ[DIR_MP0  * numberOfBCnodes];
-      q_dirTE  = &QQ[DIR_P0P  * numberOfBCnodes];
-      q_dirBW  = &QQ[DIR_M0M  * numberOfBCnodes];
-      q_dirBE  = &QQ[DIR_P0M  * numberOfBCnodes];
-      q_dirTW  = &QQ[DIR_M0P  * numberOfBCnodes];
-      q_dirTN  = &QQ[DIR_0PP  * numberOfBCnodes];
-      q_dirBS  = &QQ[DIR_0MM  * numberOfBCnodes];
-      q_dirBN  = &QQ[DIR_0PM  * numberOfBCnodes];
-      q_dirTS  = &QQ[DIR_0MP  * numberOfBCnodes];
+      q_dirE   = &QQ[DIR_P00 * numberOfBCnodes];
+      q_dirW   = &QQ[DIR_M00 * numberOfBCnodes];
+      q_dirN   = &QQ[DIR_0P0 * numberOfBCnodes];
+      q_dirS   = &QQ[DIR_0M0 * numberOfBCnodes];
+      q_dirT   = &QQ[DIR_00P * numberOfBCnodes];
+      q_dirB   = &QQ[DIR_00M * numberOfBCnodes];
+      q_dirNE  = &QQ[DIR_PP0 * numberOfBCnodes];
+      q_dirSW  = &QQ[DIR_MM0 * numberOfBCnodes];
+      q_dirSE  = &QQ[DIR_PM0 * numberOfBCnodes];
+      q_dirNW  = &QQ[DIR_MP0 * numberOfBCnodes];
+      q_dirTE  = &QQ[DIR_P0P * numberOfBCnodes];
+      q_dirBW  = &QQ[DIR_M0M * numberOfBCnodes];
+      q_dirBE  = &QQ[DIR_P0M * numberOfBCnodes];
+      q_dirTW  = &QQ[DIR_M0P * numberOfBCnodes];
+      q_dirTN  = &QQ[DIR_0PP * numberOfBCnodes];
+      q_dirBS  = &QQ[DIR_0MM * numberOfBCnodes];
+      q_dirBN  = &QQ[DIR_0PM * numberOfBCnodes];
+      q_dirTS  = &QQ[DIR_0MP * numberOfBCnodes];
       q_dirTNE = &QQ[DIR_PPP * numberOfBCnodes];
       q_dirTSW = &QQ[DIR_MMP * numberOfBCnodes];
       q_dirTSE = &QQ[DIR_PMP * numberOfBCnodes];
@@ -2797,63 +2816,63 @@ __global__ void QVelDeviceCompZeroPress1h27( int inx,
       //////////////////////////////////////////////////////////////////////////
       if (isEvenTimestep==false)
       {
-         D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+         D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
       } 
       else
       {
-         D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+         D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
       }
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 	  real vx1, vx2, vx3, drho, feq, q, cu_sq;
@@ -3090,21 +3109,22 @@ __global__ void QVelDeviceCompZeroPress1h27( int inx,
 
 
 //////////////////////////////////////////////////////////////////////////////
-__global__ void LB_BC_Vel_West_27( int nx, 
-                                              int ny, 
-                                              int nz, 
-                                              int itz, 
-                                              unsigned int* bcMatD, 
-                                              unsigned int* neighborX,
-                                              unsigned int* neighborY,
-                                              unsigned int* neighborZ,
-                                              real* DD, 
-                                              unsigned int size_Mat, 
-                                              bool isEvenTimestep, 
-                                              real u0x, 
-                                              unsigned int grid_nx, 
-                                              unsigned int grid_ny, 
-                                              real om) 
+__global__ void LB_BC_Vel_West_27(
+    int nx, 
+    int ny, 
+    int nz, 
+    int itz, 
+    unsigned int* bcMatD, 
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    real* DD, 
+    unsigned long long numberOfLBnodes, 
+    bool isEvenTimestep, 
+    real u0x, 
+    unsigned int grid_nx, 
+    unsigned int grid_ny, 
+    real om) 
 {
    //thread-index
    unsigned int ity = blockIdx.x;
@@ -3125,63 +3145,63 @@ __global__ void LB_BC_Vel_West_27( int nx,
       Distributions27 D;
       if (isEvenTimestep==true)
       {
-         D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+         D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
       } 
       else
       {
-         D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+         D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
       }
 
       ////////////////////////////////////////////////////////////////////////////////
@@ -3300,33 +3320,33 @@ __global__ void LB_BC_Vel_West_27( int nx,
       real        f1_E,f1_W,f1_N,f1_S,f1_T,f1_B,f1_NE,f1_SW,f1_SE,f1_NW,f1_TE,f1_BW,f1_BE,f1_TW,f1_TN,f1_BS,f1_BN,f1_TS,f1_ZERO,
          f1_TNE,f1_TSW,f1_TSE,f1_TNW,f1_BNE,f1_BSW,f1_BSE,f1_BNW;
 
-      f1_W    = (D.f[DIR_P00   ])[k1e   ];
-      f1_E    = (D.f[DIR_M00   ])[k1w   ];
-      f1_S    = (D.f[DIR_0P0   ])[k1n   ];
-      f1_N    = (D.f[DIR_0M0   ])[k1s   ];
-      f1_B    = (D.f[DIR_00P   ])[k1t   ];
-      f1_T    = (D.f[DIR_00M   ])[k1b   ];
-      f1_SW   = (D.f[DIR_PP0  ])[k1ne  ];
-      f1_NE   = (D.f[DIR_MM0  ])[k1sw  ];
-      f1_NW   = (D.f[DIR_PM0  ])[k1se  ];
-      f1_SE   = (D.f[DIR_MP0  ])[k1nw  ];
-      f1_BW   = (D.f[DIR_P0P  ])[k1te  ];
-      f1_TE   = (D.f[DIR_M0M  ])[k1bw  ];
-      f1_TW   = (D.f[DIR_P0M  ])[k1be  ];
-      f1_BE   = (D.f[DIR_M0P  ])[k1tw  ];
-      f1_BS   = (D.f[DIR_0PP  ])[k1tn  ];
-      f1_TN   = (D.f[DIR_0MM  ])[k1bs  ];
-      f1_TS   = (D.f[DIR_0PM  ])[k1bn  ];
-      f1_BN   = (D.f[DIR_0MP  ])[k1ts  ];
+      f1_W    = (D.f[DIR_P00])[k1e   ];
+      f1_E    = (D.f[DIR_M00])[k1w   ];
+      f1_S    = (D.f[DIR_0P0])[k1n   ];
+      f1_N    = (D.f[DIR_0M0])[k1s   ];
+      f1_B    = (D.f[DIR_00P])[k1t   ];
+      f1_T    = (D.f[DIR_00M])[k1b   ];
+      f1_SW   = (D.f[DIR_PP0])[k1ne  ];
+      f1_NE   = (D.f[DIR_MM0])[k1sw  ];
+      f1_NW   = (D.f[DIR_PM0])[k1se  ];
+      f1_SE   = (D.f[DIR_MP0])[k1nw  ];
+      f1_BW   = (D.f[DIR_P0P])[k1te  ];
+      f1_TE   = (D.f[DIR_M0M])[k1bw  ];
+      f1_TW   = (D.f[DIR_P0M])[k1be  ];
+      f1_BE   = (D.f[DIR_M0P])[k1tw  ];
+      f1_BS   = (D.f[DIR_0PP])[k1tn  ];
+      f1_TN   = (D.f[DIR_0MM])[k1bs  ];
+      f1_TS   = (D.f[DIR_0PM])[k1bn  ];
+      f1_BN   = (D.f[DIR_0MP])[k1ts  ];
       f1_ZERO = (D.f[DIR_000])[k1zero];
-      f1_BSW  = (D.f[DIR_PPP ])[k1tne ];
-      f1_BNE  = (D.f[DIR_MMP ])[k1tsw ];
-      f1_BNW  = (D.f[DIR_PMP ])[k1tse ];
-      f1_BSE  = (D.f[DIR_MPP ])[k1tnw ];
-      f1_TSW  = (D.f[DIR_PPM ])[k1bne ];
-      f1_TNE  = (D.f[DIR_MMM ])[k1bsw ];
-      f1_TNW  = (D.f[DIR_PMM ])[k1bse ];
-      f1_TSE  = (D.f[DIR_MPM ])[k1bnw ];
+      f1_BSW  = (D.f[DIR_PPP])[k1tne ];
+      f1_BNE  = (D.f[DIR_MMP])[k1tsw ];
+      f1_BNW  = (D.f[DIR_PMP])[k1tse ];
+      f1_BSE  = (D.f[DIR_MPP])[k1tnw ];
+      f1_TSW  = (D.f[DIR_PPM])[k1bne ];
+      f1_TNE  = (D.f[DIR_MMM])[k1bsw ];
+      f1_TNW  = (D.f[DIR_PMM])[k1bse ];
+      f1_TSE  = (D.f[DIR_MPM])[k1bnw ];
 
       real drho1    =  f1_ZERO+f1_E+f1_W+f1_N+f1_S+f1_T+f1_B+f1_NE+f1_SW+f1_SE+f1_NW+f1_TE+f1_BW+f1_BE+f1_TW+f1_TN+f1_BS+f1_BN+f1_TS+
          f1_TNE+f1_TSW+f1_TSE+f1_TNW+f1_BNE+f1_BSW+f1_BSE+f1_BNW;
@@ -3343,32 +3363,32 @@ __global__ void LB_BC_Vel_West_27( int nx,
       real cu_sq=c3o2*(vx1*vx1+vx2*vx2+vx3*vx3);
 
       (D.f[DIR_000])[kzero] =   c8o27* (drho-cu_sq);
-      (D.f[DIR_P00   ])[ke   ] =   c2o27* (drho+c3o1*( vx1        )+c9o2*( vx1        )*( vx1        )-cu_sq);
-      (D.f[DIR_M00   ])[kw   ] =   c2o27* (drho+c3o1*(-vx1        )+c9o2*(-vx1        )*(-vx1        )-cu_sq);
-      (D.f[DIR_0P0   ])[kn   ] =   c2o27* (drho+c3o1*(    vx2     )+c9o2*(     vx2    )*(     vx2    )-cu_sq);
-      (D.f[DIR_0M0   ])[ks   ] =   c2o27* (drho+c3o1*(   -vx2     )+c9o2*(    -vx2    )*(    -vx2    )-cu_sq);
-      (D.f[DIR_00P   ])[kt   ] =   c2o27* (drho+c3o1*(         vx3)+c9o2*(         vx3)*(         vx3)-cu_sq);
-      (D.f[DIR_00M   ])[kb   ] =   c2o27* (drho+c3o1*(        -vx3)+c9o2*(        -vx3)*(        -vx3)-cu_sq);
-      (D.f[DIR_PP0  ])[kne  ] =   c1o54* (drho+c3o1*( vx1+vx2    )+c9o2*( vx1+vx2    )*( vx1+vx2    )-cu_sq);
-      (D.f[DIR_MM0  ])[ksw  ] =   c1o54* (drho+c3o1*(-vx1-vx2    )+c9o2*(-vx1-vx2    )*(-vx1-vx2    )-cu_sq);
-      (D.f[DIR_PM0  ])[kse  ] =   c1o54* (drho+c3o1*( vx1-vx2    )+c9o2*( vx1-vx2    )*( vx1-vx2    )-cu_sq);
-      (D.f[DIR_MP0  ])[knw  ] =   c1o54* (drho+c3o1*(-vx1+vx2    )+c9o2*(-vx1+vx2    )*(-vx1+vx2    )-cu_sq);
-      (D.f[DIR_P0P  ])[kte  ] =   c1o54* (drho+c3o1*( vx1    +vx3)+c9o2*( vx1    +vx3)*( vx1    +vx3)-cu_sq);
-      (D.f[DIR_M0M  ])[kbw  ] =   c1o54* (drho+c3o1*(-vx1    -vx3)+c9o2*(-vx1    -vx3)*(-vx1    -vx3)-cu_sq);
-      (D.f[DIR_P0M  ])[kbe  ] =   c1o54* (drho+c3o1*( vx1    -vx3)+c9o2*( vx1    -vx3)*( vx1    -vx3)-cu_sq);
-      (D.f[DIR_M0P  ])[ktw  ] =   c1o54* (drho+c3o1*(-vx1    +vx3)+c9o2*(-vx1    +vx3)*(-vx1    +vx3)-cu_sq);
-      (D.f[DIR_0PP  ])[ktn  ] =   c1o54* (drho+c3o1*(     vx2+vx3)+c9o2*(     vx2+vx3)*(     vx2+vx3)-cu_sq);
-      (D.f[DIR_0MM  ])[kbs  ] =   c1o54* (drho+c3o1*(    -vx2-vx3)+c9o2*(    -vx2-vx3)*(    -vx2-vx3)-cu_sq);
-      (D.f[DIR_0PM  ])[kbn  ] =   c1o54* (drho+c3o1*(     vx2-vx3)+c9o2*(     vx2-vx3)*(     vx2-vx3)-cu_sq);
-      (D.f[DIR_0MP  ])[kts  ] =   c1o54* (drho+c3o1*(    -vx2+vx3)+c9o2*(    -vx2+vx3)*(    -vx2+vx3)-cu_sq);
-      (D.f[DIR_PPP ])[ktne ] =   c1o216*(drho+c3o1*( vx1+vx2+vx3)+c9o2*( vx1+vx2+vx3)*( vx1+vx2+vx3)-cu_sq);
-      (D.f[DIR_MMM ])[kbsw ] =   c1o216*(drho+c3o1*(-vx1-vx2-vx3)+c9o2*(-vx1-vx2-vx3)*(-vx1-vx2-vx3)-cu_sq);
-      (D.f[DIR_PPM ])[kbne ] =   c1o216*(drho+c3o1*( vx1+vx2-vx3)+c9o2*( vx1+vx2-vx3)*( vx1+vx2-vx3)-cu_sq);
-      (D.f[DIR_MMP ])[ktsw ] =   c1o216*(drho+c3o1*(-vx1-vx2+vx3)+c9o2*(-vx1-vx2+vx3)*(-vx1-vx2+vx3)-cu_sq);
-      (D.f[DIR_PMP ])[ktse ] =   c1o216*(drho+c3o1*( vx1-vx2+vx3)+c9o2*( vx1-vx2+vx3)*( vx1-vx2+vx3)-cu_sq);
-      (D.f[DIR_MPM ])[kbnw ] =   c1o216*(drho+c3o1*(-vx1+vx2-vx3)+c9o2*(-vx1+vx2-vx3)*(-vx1+vx2-vx3)-cu_sq);
-      (D.f[DIR_PMM ])[kbse ] =   c1o216*(drho+c3o1*( vx1-vx2-vx3)+c9o2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cu_sq);
-      (D.f[DIR_MPP ])[ktnw ] =   c1o216*(drho+c3o1*(-vx1+vx2+vx3)+c9o2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cu_sq);
+      (D.f[DIR_P00])[ke   ] =   c2o27* (drho+c3o1*( vx1        )+c9o2*( vx1        )*( vx1        )-cu_sq);
+      (D.f[DIR_M00])[kw   ] =   c2o27* (drho+c3o1*(-vx1        )+c9o2*(-vx1        )*(-vx1        )-cu_sq);
+      (D.f[DIR_0P0])[kn   ] =   c2o27* (drho+c3o1*(    vx2     )+c9o2*(     vx2    )*(     vx2    )-cu_sq);
+      (D.f[DIR_0M0])[ks   ] =   c2o27* (drho+c3o1*(   -vx2     )+c9o2*(    -vx2    )*(    -vx2    )-cu_sq);
+      (D.f[DIR_00P])[kt   ] =   c2o27* (drho+c3o1*(         vx3)+c9o2*(         vx3)*(         vx3)-cu_sq);
+      (D.f[DIR_00M])[kb   ] =   c2o27* (drho+c3o1*(        -vx3)+c9o2*(        -vx3)*(        -vx3)-cu_sq);
+      (D.f[DIR_PP0])[kne  ] =   c1o54* (drho+c3o1*( vx1+vx2    )+c9o2*( vx1+vx2    )*( vx1+vx2    )-cu_sq);
+      (D.f[DIR_MM0])[ksw  ] =   c1o54* (drho+c3o1*(-vx1-vx2    )+c9o2*(-vx1-vx2    )*(-vx1-vx2    )-cu_sq);
+      (D.f[DIR_PM0])[kse  ] =   c1o54* (drho+c3o1*( vx1-vx2    )+c9o2*( vx1-vx2    )*( vx1-vx2    )-cu_sq);
+      (D.f[DIR_MP0])[knw  ] =   c1o54* (drho+c3o1*(-vx1+vx2    )+c9o2*(-vx1+vx2    )*(-vx1+vx2    )-cu_sq);
+      (D.f[DIR_P0P])[kte  ] =   c1o54* (drho+c3o1*( vx1    +vx3)+c9o2*( vx1    +vx3)*( vx1    +vx3)-cu_sq);
+      (D.f[DIR_M0M])[kbw  ] =   c1o54* (drho+c3o1*(-vx1    -vx3)+c9o2*(-vx1    -vx3)*(-vx1    -vx3)-cu_sq);
+      (D.f[DIR_P0M])[kbe  ] =   c1o54* (drho+c3o1*( vx1    -vx3)+c9o2*( vx1    -vx3)*( vx1    -vx3)-cu_sq);
+      (D.f[DIR_M0P])[ktw  ] =   c1o54* (drho+c3o1*(-vx1    +vx3)+c9o2*(-vx1    +vx3)*(-vx1    +vx3)-cu_sq);
+      (D.f[DIR_0PP])[ktn  ] =   c1o54* (drho+c3o1*(     vx2+vx3)+c9o2*(     vx2+vx3)*(     vx2+vx3)-cu_sq);
+      (D.f[DIR_0MM])[kbs  ] =   c1o54* (drho+c3o1*(    -vx2-vx3)+c9o2*(    -vx2-vx3)*(    -vx2-vx3)-cu_sq);
+      (D.f[DIR_0PM])[kbn  ] =   c1o54* (drho+c3o1*(     vx2-vx3)+c9o2*(     vx2-vx3)*(     vx2-vx3)-cu_sq);
+      (D.f[DIR_0MP])[kts  ] =   c1o54* (drho+c3o1*(    -vx2+vx3)+c9o2*(    -vx2+vx3)*(    -vx2+vx3)-cu_sq);
+      (D.f[DIR_PPP])[ktne ] =   c1o216*(drho+c3o1*( vx1+vx2+vx3)+c9o2*( vx1+vx2+vx3)*( vx1+vx2+vx3)-cu_sq);
+      (D.f[DIR_MMM])[kbsw ] =   c1o216*(drho+c3o1*(-vx1-vx2-vx3)+c9o2*(-vx1-vx2-vx3)*(-vx1-vx2-vx3)-cu_sq);
+      (D.f[DIR_PPM])[kbne ] =   c1o216*(drho+c3o1*( vx1+vx2-vx3)+c9o2*( vx1+vx2-vx3)*( vx1+vx2-vx3)-cu_sq);
+      (D.f[DIR_MMP])[ktsw ] =   c1o216*(drho+c3o1*(-vx1-vx2+vx3)+c9o2*(-vx1-vx2+vx3)*(-vx1-vx2+vx3)-cu_sq);
+      (D.f[DIR_PMP])[ktse ] =   c1o216*(drho+c3o1*( vx1-vx2+vx3)+c9o2*( vx1-vx2+vx3)*( vx1-vx2+vx3)-cu_sq);
+      (D.f[DIR_MPM])[kbnw ] =   c1o216*(drho+c3o1*(-vx1+vx2-vx3)+c9o2*(-vx1+vx2-vx3)*(-vx1+vx2-vx3)-cu_sq);
+      (D.f[DIR_PMM])[kbse ] =   c1o216*(drho+c3o1*( vx1-vx2-vx3)+c9o2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cu_sq);
+      (D.f[DIR_MPP])[ktnw ] =   c1o216*(drho+c3o1*(-vx1+vx2+vx3)+c9o2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cu_sq);
    }
    __syncthreads();
 }          
@@ -3414,18 +3434,18 @@ __global__ void LB_BC_Vel_West_27( int nx,
 
 //////////////////////////////////////////////////////////////////////////////
 __global__ void QVelDevPlainBB27(
-   real* velocityX,
-   real* velocityY,
-   real* velocityZ,
-   real* distributions,
-   int* subgridDistanceIndices,
-   real* subgridDistances,
-   uint numberOfBCnodes,
-   uint* neighborX,
-   uint* neighborY,
-   uint* neighborZ,
-   uint numberOfLBnodes,
-   bool isEvenTimestep)
+    real* velocityX,
+    real* velocityY,
+    real* velocityZ,
+    real* distributions,
+    int* subgridDistanceIndices,
+    real* subgridDistances,
+    uint numberOfBCnodes,
+    uint* neighborX,
+    uint* neighborY,
+    uint* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
 {
    //////////////////////////////////////////////////////////////////////////
    //! The velocity boundary condition is executed in the following steps
@@ -3433,18 +3453,11 @@ __global__ void QVelDevPlainBB27(
    ////////////////////////////////////////////////////////////////////////////////
    //! - Get node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
    //!
-   const unsigned  x = threadIdx.x;   // global x-index
-   const unsigned  y = blockIdx.x;    // global y-index
-   const unsigned  z = blockIdx.y;    // global z-index
-
-   const unsigned nx = blockDim.x;
-   const unsigned ny = gridDim.x;
-
-   const unsigned k = nx*(ny*z + y) + x;
+   const unsigned nodeIndex = getNodeIndex();
 
    //////////////////////////////////////////////////////////////////////////
    // run for all indices in size of boundary condition (numberOfBCnodes)
-   if(k < numberOfBCnodes)
+   if(nodeIndex < numberOfBCnodes)
    {
        //////////////////////////////////////////////////////////////////////////
        //! - Read distributions: style of reading and writing the distributions from/to stored arrays dependent on timestep is based on the esoteric twist algorithm \ref
@@ -3456,9 +3469,9 @@ __global__ void QVelDevPlainBB27(
       ////////////////////////////////////////////////////////////////////////////////
       //! - Set local velocities
       //!
-      real VeloX = velocityX[k];
-      real VeloY = velocityY[k];
-      real VeloZ = velocityZ[k];
+      real VeloX = velocityX[nodeIndex];
+      real VeloY = velocityY[nodeIndex];
+      real VeloZ = velocityZ[nodeIndex];
 
       ////////////////////////////////////////////////////////////////////////////////
       //! - Set local subgrid distances (q's)
@@ -3469,7 +3482,7 @@ __global__ void QVelDevPlainBB27(
       ////////////////////////////////////////////////////////////////////////////////
       //! - Set neighbor indices (necessary for indirect addressing)
       //!
-      uint indexOfBCnode = subgridDistanceIndices[k];
+      uint indexOfBCnode = subgridDistanceIndices[nodeIndex];
       uint ke   = indexOfBCnode;
       uint kw   = neighborX[indexOfBCnode];
       uint kn   = indexOfBCnode;
@@ -3500,32 +3513,32 @@ __global__ void QVelDevPlainBB27(
       ////////////////////////////////////////////////////////////////////////////////
       //! - Set local distributions
       //!
-      real f_W    = (dist.f[DIR_P00   ])[ke   ];
-      real f_E    = (dist.f[DIR_M00   ])[kw   ];
-      real f_S    = (dist.f[DIR_0P0   ])[kn   ];
-      real f_N    = (dist.f[DIR_0M0   ])[ks   ];
-      real f_B    = (dist.f[DIR_00P   ])[kt   ];
-      real f_T    = (dist.f[DIR_00M   ])[kb   ];
-      real f_SW   = (dist.f[DIR_PP0  ])[kne  ];
-      real f_NE   = (dist.f[DIR_MM0  ])[ksw  ];
-      real f_NW   = (dist.f[DIR_PM0  ])[kse  ];
-      real f_SE   = (dist.f[DIR_MP0  ])[knw  ];
-      real f_BW   = (dist.f[DIR_P0P  ])[kte  ];
-      real f_TE   = (dist.f[DIR_M0M  ])[kbw  ];
-      real f_TW   = (dist.f[DIR_P0M  ])[kbe  ];
-      real f_BE   = (dist.f[DIR_M0P  ])[ktw  ];
-      real f_BS   = (dist.f[DIR_0PP  ])[ktn  ];
-      real f_TN   = (dist.f[DIR_0MM  ])[kbs  ];
-      real f_TS   = (dist.f[DIR_0PM  ])[kbn  ];
-      real f_BN   = (dist.f[DIR_0MP  ])[kts  ];
-      real f_BSW  = (dist.f[DIR_PPP ])[ktne ];
-      real f_BNE  = (dist.f[DIR_MMP ])[ktsw ];
-      real f_BNW  = (dist.f[DIR_PMP ])[ktse ];
-      real f_BSE  = (dist.f[DIR_MPP ])[ktnw ];
-      real f_TSW  = (dist.f[DIR_PPM ])[kbne ];
-      real f_TNE  = (dist.f[DIR_MMM ])[kbsw ];
-      real f_TNW  = (dist.f[DIR_PMM ])[kbse ];
-      real f_TSE  = (dist.f[DIR_MPM ])[kbnw ];
+      real f_W    = (dist.f[DIR_P00])[ke   ];
+      real f_E    = (dist.f[DIR_M00])[kw   ];
+      real f_S    = (dist.f[DIR_0P0])[kn   ];
+      real f_N    = (dist.f[DIR_0M0])[ks   ];
+      real f_B    = (dist.f[DIR_00P])[kt   ];
+      real f_T    = (dist.f[DIR_00M])[kb   ];
+      real f_SW   = (dist.f[DIR_PP0])[kne  ];
+      real f_NE   = (dist.f[DIR_MM0])[ksw  ];
+      real f_NW   = (dist.f[DIR_PM0])[kse  ];
+      real f_SE   = (dist.f[DIR_MP0])[knw  ];
+      real f_BW   = (dist.f[DIR_P0P])[kte  ];
+      real f_TE   = (dist.f[DIR_M0M])[kbw  ];
+      real f_TW   = (dist.f[DIR_P0M])[kbe  ];
+      real f_BE   = (dist.f[DIR_M0P])[ktw  ];
+      real f_BS   = (dist.f[DIR_0PP])[ktn  ];
+      real f_TN   = (dist.f[DIR_0MM])[kbs  ];
+      real f_TS   = (dist.f[DIR_0PM])[kbn  ];
+      real f_BN   = (dist.f[DIR_0MP])[kts  ];
+      real f_BSW  = (dist.f[DIR_PPP])[ktne ];
+      real f_BNE  = (dist.f[DIR_MMP])[ktsw ];
+      real f_BNW  = (dist.f[DIR_PMP])[ktse ];
+      real f_BSE  = (dist.f[DIR_MPP])[ktnw ];
+      real f_TSW  = (dist.f[DIR_PPM])[kbne ];
+      real f_TNE  = (dist.f[DIR_MMM])[kbsw ];
+      real f_TNW  = (dist.f[DIR_PMM])[kbse ];
+      real f_TSE  = (dist.f[DIR_MPM])[kbnw ];
 
       ////////////////////////////////////////////////////////////////////////////////
       //! - change the pointer to write the results in the correct array
@@ -3535,32 +3548,32 @@ __global__ void QVelDevPlainBB27(
       ////////////////////////////////////////////////////////////////////////////////
       //! - rewrite distributions if there is a sub-grid distance (q) in same direction
       real q;
-      q = (subgridD.q[DIR_P00  ])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_M00  ])[kw  ]=f_E   + c4o9  * (-VeloX);
-      q = (subgridD.q[DIR_M00  ])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_P00  ])[ke  ]=f_W   + c4o9  * ( VeloX);
-      q = (subgridD.q[DIR_0P0  ])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_0M0  ])[ks  ]=f_N   + c4o9  * (-VeloY);
-      q = (subgridD.q[DIR_0M0  ])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_0P0  ])[kn  ]=f_S   + c4o9  * ( VeloY);
-      q = (subgridD.q[DIR_00P  ])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_00M  ])[kb  ]=f_T   + c4o9  * (-VeloZ);
-      q = (subgridD.q[DIR_00M  ])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_00P  ])[kt  ]=f_B   + c4o9  * ( VeloZ);
-      q = (subgridD.q[DIR_PP0 ])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_MM0 ])[ksw ]=f_NE  + c1o9  * (-VeloX - VeloY);
-      q = (subgridD.q[DIR_MM0 ])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_PP0 ])[kne ]=f_SW  + c1o9  * ( VeloX + VeloY);
-      q = (subgridD.q[DIR_PM0 ])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_MP0 ])[knw ]=f_SE  + c1o9  * (-VeloX + VeloY);
-      q = (subgridD.q[DIR_MP0 ])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_PM0 ])[kse ]=f_NW  + c1o9  * ( VeloX - VeloY);
-      q = (subgridD.q[DIR_P0P ])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_M0M ])[kbw ]=f_TE  + c1o9  * (-VeloX - VeloZ);
-      q = (subgridD.q[DIR_M0M ])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_P0P ])[kte ]=f_BW  + c1o9  * ( VeloX + VeloZ);
-      q = (subgridD.q[DIR_P0M ])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_M0P ])[ktw ]=f_BE  + c1o9  * (-VeloX + VeloZ);
-      q = (subgridD.q[DIR_M0P ])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_P0M ])[kbe ]=f_TW  + c1o9  * ( VeloX - VeloZ);
-      q = (subgridD.q[DIR_0PP ])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_0MM ])[kbs ]=f_TN  + c1o9  * (-VeloY - VeloZ);
-      q = (subgridD.q[DIR_0MM ])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_0PP ])[ktn ]=f_BS  + c1o9  * ( VeloY + VeloZ);
-      q = (subgridD.q[DIR_0PM ])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_0MP ])[kts ]=f_BN  + c1o9  * (-VeloY + VeloZ);
-      q = (subgridD.q[DIR_0MP ])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_0PM ])[kbn ]=f_TS  + c1o9  * ( VeloY - VeloZ);
-      q = (subgridD.q[DIR_PPP])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_MMM])[kbsw]=f_TNE + c1o36 * (-VeloX - VeloY - VeloZ);
-      q = (subgridD.q[DIR_MMM])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_PPP])[ktne]=f_BSW + c1o36 * ( VeloX + VeloY + VeloZ);
-      q = (subgridD.q[DIR_PPM])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_MMP])[ktsw]=f_BNE + c1o36 * (-VeloX - VeloY + VeloZ);
-      q = (subgridD.q[DIR_MMP])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_PPM])[kbne]=f_TSW + c1o36 * ( VeloX + VeloY - VeloZ);
-      q = (subgridD.q[DIR_PMP])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_MPM])[kbnw]=f_TSE + c1o36 * (-VeloX + VeloY - VeloZ);
-      q = (subgridD.q[DIR_MPM])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_PMP])[ktse]=f_BNW + c1o36 * ( VeloX - VeloY + VeloZ);
-      q = (subgridD.q[DIR_PMM])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_MPP])[ktnw]=f_BSE + c1o36 * (-VeloX + VeloY + VeloZ);
-      q = (subgridD.q[DIR_MPP])[k];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_PMM])[kbse]=f_TNW + c1o36 * ( VeloX - VeloY - VeloZ);
+      q = (subgridD.q[DIR_P00])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_M00])[kw  ]=f_E   + c4o9  * (-VeloX);
+      q = (subgridD.q[DIR_M00])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_P00])[ke  ]=f_W   + c4o9  * ( VeloX);
+      q = (subgridD.q[DIR_0P0])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_0M0])[ks  ]=f_N   + c4o9  * (-VeloY);
+      q = (subgridD.q[DIR_0M0])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_0P0])[kn  ]=f_S   + c4o9  * ( VeloY);
+      q = (subgridD.q[DIR_00P])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_00M])[kb  ]=f_T   + c4o9  * (-VeloZ);
+      q = (subgridD.q[DIR_00M])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_00P])[kt  ]=f_B   + c4o9  * ( VeloZ);
+      q = (subgridD.q[DIR_PP0])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_MM0])[ksw ]=f_NE  + c1o9  * (-VeloX - VeloY);
+      q = (subgridD.q[DIR_MM0])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_PP0])[kne ]=f_SW  + c1o9  * ( VeloX + VeloY);
+      q = (subgridD.q[DIR_PM0])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_MP0])[knw ]=f_SE  + c1o9  * (-VeloX + VeloY);
+      q = (subgridD.q[DIR_MP0])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_PM0])[kse ]=f_NW  + c1o9  * ( VeloX - VeloY);
+      q = (subgridD.q[DIR_P0P])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_M0M])[kbw ]=f_TE  + c1o9  * (-VeloX - VeloZ);
+      q = (subgridD.q[DIR_M0M])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_P0P])[kte ]=f_BW  + c1o9  * ( VeloX + VeloZ);
+      q = (subgridD.q[DIR_P0M])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_M0P])[ktw ]=f_BE  + c1o9  * (-VeloX + VeloZ);
+      q = (subgridD.q[DIR_M0P])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_P0M])[kbe ]=f_TW  + c1o9  * ( VeloX - VeloZ);
+      q = (subgridD.q[DIR_0PP])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_0MM])[kbs ]=f_TN  + c1o9  * (-VeloY - VeloZ);
+      q = (subgridD.q[DIR_0MM])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_0PP])[ktn ]=f_BS  + c1o9  * ( VeloY + VeloZ);
+      q = (subgridD.q[DIR_0PM])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_0MP])[kts ]=f_BN  + c1o9  * (-VeloY + VeloZ);
+      q = (subgridD.q[DIR_0MP])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_0PM])[kbn ]=f_TS  + c1o9  * ( VeloY - VeloZ);
+      q = (subgridD.q[DIR_PPP])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_MMM])[kbsw]=f_TNE + c1o36 * (-VeloX - VeloY - VeloZ);
+      q = (subgridD.q[DIR_MMM])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_PPP])[ktne]=f_BSW + c1o36 * ( VeloX + VeloY + VeloZ);
+      q = (subgridD.q[DIR_PPM])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_MMP])[ktsw]=f_BNE + c1o36 * (-VeloX - VeloY + VeloZ);
+      q = (subgridD.q[DIR_MMP])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_PPM])[kbne]=f_TSW + c1o36 * ( VeloX + VeloY - VeloZ);
+      q = (subgridD.q[DIR_PMP])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_MPM])[kbnw]=f_TSE + c1o36 * (-VeloX + VeloY - VeloZ);
+      q = (subgridD.q[DIR_MPM])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_PMP])[ktse]=f_BNW + c1o36 * ( VeloX - VeloY + VeloZ);
+      q = (subgridD.q[DIR_PMM])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_MPP])[ktnw]=f_BSE + c1o36 * (-VeloX + VeloY + VeloZ);
+      q = (subgridD.q[DIR_MPP])[nodeIndex];   if (q>=c0o1 && q<=c1o1)    (dist.f[DIR_PMM])[kbse]=f_TNW + c1o36 * ( VeloX - VeloY - VeloZ);
    }
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -3604,80 +3617,81 @@ __global__ void QVelDevPlainBB27(
 
 
 //////////////////////////////////////////////////////////////////////////////
-__global__ void QVelDevCouette27(real* vx,
-											real* vy,
-	 										real* vz,
-											real* DD,
-											int* k_Q, 
-											real* QQ,
-											unsigned int numberOfBCnodes, 
-											real om1, 
-											unsigned int* neighborX,
-											unsigned int* neighborY,
-											unsigned int* neighborZ,
-											unsigned int size_Mat, 
-											bool isEvenTimestep)
+__global__ void QVelDevCouette27(
+    real* vx,
+    real* vy,
+    real* vz,
+    real* DD,
+    int* k_Q, 
+    real* QQ,
+    unsigned int numberOfBCnodes, 
+    real om1, 
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes, 
+    bool isEvenTimestep)
 {
    Distributions27 D;
    if (isEvenTimestep==true)
    {
-      D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+      D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
    } 
    else
    {
-      D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+      D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
    }
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  x = threadIdx.x;  // Globaler x-Index 
@@ -3702,24 +3716,24 @@ __global__ void QVelDevCouette27(real* vx,
 			 *q_dirBE,  *q_dirTW,  *q_dirTN,  *q_dirBS,  *q_dirBN,  *q_dirTS,
 			 *q_dirTNE, *q_dirTSW, *q_dirTSE, *q_dirTNW, *q_dirBNE, *q_dirBSW,
 			 *q_dirBSE, *q_dirBNW; 
-      q_dirE   = &QQ[DIR_P00   * numberOfBCnodes];
-      q_dirW   = &QQ[DIR_M00   * numberOfBCnodes];
-      q_dirN   = &QQ[DIR_0P0   * numberOfBCnodes];
-      q_dirS   = &QQ[DIR_0M0   * numberOfBCnodes];
-      q_dirT   = &QQ[DIR_00P   * numberOfBCnodes];
-      q_dirB   = &QQ[DIR_00M   * numberOfBCnodes];
-      q_dirNE  = &QQ[DIR_PP0  * numberOfBCnodes];
-      q_dirSW  = &QQ[DIR_MM0  * numberOfBCnodes];
-      q_dirSE  = &QQ[DIR_PM0  * numberOfBCnodes];
-      q_dirNW  = &QQ[DIR_MP0  * numberOfBCnodes];
-      q_dirTE  = &QQ[DIR_P0P  * numberOfBCnodes];
-      q_dirBW  = &QQ[DIR_M0M  * numberOfBCnodes];
-      q_dirBE  = &QQ[DIR_P0M  * numberOfBCnodes];
-      q_dirTW  = &QQ[DIR_M0P  * numberOfBCnodes];
-      q_dirTN  = &QQ[DIR_0PP  * numberOfBCnodes];
-      q_dirBS  = &QQ[DIR_0MM  * numberOfBCnodes];
-      q_dirBN  = &QQ[DIR_0PM  * numberOfBCnodes];
-      q_dirTS  = &QQ[DIR_0MP  * numberOfBCnodes];
+      q_dirE   = &QQ[DIR_P00 * numberOfBCnodes];
+      q_dirW   = &QQ[DIR_M00 * numberOfBCnodes];
+      q_dirN   = &QQ[DIR_0P0 * numberOfBCnodes];
+      q_dirS   = &QQ[DIR_0M0 * numberOfBCnodes];
+      q_dirT   = &QQ[DIR_00P * numberOfBCnodes];
+      q_dirB   = &QQ[DIR_00M * numberOfBCnodes];
+      q_dirNE  = &QQ[DIR_PP0 * numberOfBCnodes];
+      q_dirSW  = &QQ[DIR_MM0 * numberOfBCnodes];
+      q_dirSE  = &QQ[DIR_PM0 * numberOfBCnodes];
+      q_dirNW  = &QQ[DIR_MP0 * numberOfBCnodes];
+      q_dirTE  = &QQ[DIR_P0P * numberOfBCnodes];
+      q_dirBW  = &QQ[DIR_M0M * numberOfBCnodes];
+      q_dirBE  = &QQ[DIR_P0M * numberOfBCnodes];
+      q_dirTW  = &QQ[DIR_M0P * numberOfBCnodes];
+      q_dirTN  = &QQ[DIR_0PP * numberOfBCnodes];
+      q_dirBS  = &QQ[DIR_0MM * numberOfBCnodes];
+      q_dirBN  = &QQ[DIR_0PM * numberOfBCnodes];
+      q_dirTS  = &QQ[DIR_0MP * numberOfBCnodes];
       q_dirTNE = &QQ[DIR_PPP * numberOfBCnodes];
       q_dirTSW = &QQ[DIR_MMP * numberOfBCnodes];
       q_dirTSE = &QQ[DIR_PMP * numberOfBCnodes];
@@ -3761,94 +3775,94 @@ __global__ void QVelDevCouette27(real* vx,
       ////////////////////////////////////////////////////////////////////////////////
      
       ////////////////////////////////////////////////////////////////////////////////
-      real f_W    = (D.f[DIR_P00   ])[ke   ];
-      real f_E    = (D.f[DIR_M00   ])[kw   ];
-      real f_S    = (D.f[DIR_0P0   ])[kn   ];
-      real f_N    = (D.f[DIR_0M0   ])[ks   ];
-      real f_B    = (D.f[DIR_00P   ])[kt   ];
-      real f_T    = (D.f[DIR_00M   ])[kb   ];
-      real f_SW   = (D.f[DIR_PP0  ])[kne  ];
-      real f_NE   = (D.f[DIR_MM0  ])[ksw  ];
-      real f_NW   = (D.f[DIR_PM0  ])[kse  ];
-      real f_SE   = (D.f[DIR_MP0  ])[knw  ];
-      real f_BW   = (D.f[DIR_P0P  ])[kte  ];
-      real f_TE   = (D.f[DIR_M0M  ])[kbw  ];
-      real f_TW   = (D.f[DIR_P0M  ])[kbe  ];
-      real f_BE   = (D.f[DIR_M0P  ])[ktw  ];
-      real f_BS   = (D.f[DIR_0PP  ])[ktn  ];
-      real f_TN   = (D.f[DIR_0MM  ])[kbs  ];
-      real f_TS   = (D.f[DIR_0PM  ])[kbn  ];
-      real f_BN   = (D.f[DIR_0MP  ])[kts  ];
-      real f_BSW  = (D.f[DIR_PPP ])[ktne ];
-      real f_BNE  = (D.f[DIR_MMP ])[ktsw ];
-      real f_BNW  = (D.f[DIR_PMP ])[ktse ];
-      real f_BSE  = (D.f[DIR_MPP ])[ktnw ];
-      real f_TSW  = (D.f[DIR_PPM ])[kbne ];
-      real f_TNE  = (D.f[DIR_MMM ])[kbsw ];
-      real f_TNW  = (D.f[DIR_PMM ])[kbse ];
-      real f_TSE  = (D.f[DIR_MPM ])[kbnw ];
+      real f_W    = (D.f[DIR_P00])[ke   ];
+      real f_E    = (D.f[DIR_M00])[kw   ];
+      real f_S    = (D.f[DIR_0P0])[kn   ];
+      real f_N    = (D.f[DIR_0M0])[ks   ];
+      real f_B    = (D.f[DIR_00P])[kt   ];
+      real f_T    = (D.f[DIR_00M])[kb   ];
+      real f_SW   = (D.f[DIR_PP0])[kne  ];
+      real f_NE   = (D.f[DIR_MM0])[ksw  ];
+      real f_NW   = (D.f[DIR_PM0])[kse  ];
+      real f_SE   = (D.f[DIR_MP0])[knw  ];
+      real f_BW   = (D.f[DIR_P0P])[kte  ];
+      real f_TE   = (D.f[DIR_M0M])[kbw  ];
+      real f_TW   = (D.f[DIR_P0M])[kbe  ];
+      real f_BE   = (D.f[DIR_M0P])[ktw  ];
+      real f_BS   = (D.f[DIR_0PP])[ktn  ];
+      real f_TN   = (D.f[DIR_0MM])[kbs  ];
+      real f_TS   = (D.f[DIR_0PM])[kbn  ];
+      real f_BN   = (D.f[DIR_0MP])[kts  ];
+      real f_BSW  = (D.f[DIR_PPP])[ktne ];
+      real f_BNE  = (D.f[DIR_MMP])[ktsw ];
+      real f_BNW  = (D.f[DIR_PMP])[ktse ];
+      real f_BSE  = (D.f[DIR_MPP])[ktnw ];
+      real f_TSW  = (D.f[DIR_PPM])[kbne ];
+      real f_TNE  = (D.f[DIR_MMM])[kbsw ];
+      real f_TNW  = (D.f[DIR_PMM])[kbse ];
+      real f_TSE  = (D.f[DIR_MPM])[kbnw ];
 	  ////////////////////////////////////////////////////////////////////////////////
 
 	  ////////////////////////////////////////////////////////////////////////////////
       if (isEvenTimestep==false)
       {
-         D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+         D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
       } 
       else
       {
-         D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+         D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
       }
 	  //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 	  ///////               FlowDirection Y !!!!!!!!!!                                                           ///////////////////////////////////
@@ -3868,24 +3882,24 @@ __global__ void QVelDevCouette27(real* vx,
 	  //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 	  //set distributions
       real q;
-      q = q_dirE[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_M00  ])[kw  ]=f_E   + ms*c2o27  * VeloX;	
-      q = q_dirW[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_P00  ])[ke  ]=f_W   - ms*c2o27  * VeloX;	
-      q = q_dirN[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_0M0  ])[ks  ]=f_N   + ms*c2o27  * VeloY;	
-      q = q_dirS[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_0P0  ])[kn  ]=f_S   - ms*c2o27  * VeloY;	
-	  q = q_dirT[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_00M  ])[kb  ]=f_T   + ms*c2o27  * VeloZ - c3o2*c2o27*((c2o1*VeloY-vx2)*(c2o1*VeloY-vx2)-vx2*vx2)*on;
-      q = q_dirB[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_00P  ])[kt  ]=f_B   - ms*c2o27  * VeloZ;
-      q = q_dirNE[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_MM0 ])[ksw ]=f_NE  + ms*c1o54  * VeloX + ms*c1o54  * VeloY;
-	  q = q_dirSW[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_PP0 ])[kne ]=f_SW  - ms*c1o54  * VeloX - ms*c1o54  * VeloY;
-	  q = q_dirSE[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_MP0 ])[knw ]=f_SE  + ms*c1o54  * VeloX - ms*c1o54  * VeloY;
-	  q = q_dirNW[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_PM0 ])[kse ]=f_NW  - ms*c1o54  * VeloX + ms*c1o54  * VeloY;
-	  q = q_dirTE[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_M0M ])[kbw ]=f_TE  + ms*c1o54  * VeloX + ms*c1o54  * VeloZ - c3o2*c1o54*((c2o1*VeloY-vx2)*(c2o1*VeloY-vx2)-vx2*vx2)*on-c1o12*kxxMyyFromfcNEQ;
-	  q = q_dirBW[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_P0P ])[kte ]=f_BW  - ms*c1o54  * VeloX - ms*c1o54  * VeloZ;
-	  q = q_dirBE[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_M0P ])[ktw ]=f_BE  + ms*c1o54  * VeloX - ms*c1o54  * VeloZ;
-	  q = q_dirTW[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_P0M ])[kbe ]=f_TW  - ms*c1o54  * VeloX + ms*c1o54  * VeloZ - c3o2*c1o54*((c2o1*VeloY-vx2)*(c2o1*VeloY-vx2)-vx2*vx2)*on-c1o12*kxxMyyFromfcNEQ;
-	  q = q_dirTN[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_0MM ])[kbs ]=f_TN  + ms*c1o54  * VeloY + ms*c1o54  * VeloZ + c3o1*c1o54*((c2o1*VeloY-vx2)*(c2o1*VeloY-vx2)-vx2*vx2)*on+c1o12*kxxMyyFromfcNEQ;
-	  q = q_dirBS[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_0PP ])[ktn ]=f_BS  - ms*c1o54  * VeloY - ms*c1o54  * VeloZ;
-	  q = q_dirBN[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_0MP ])[kts ]=f_BN  + ms*c1o54  * VeloY - ms*c1o54  * VeloZ;
-	  q = q_dirTS[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_0PM ])[kbn ]=f_TS  - ms*c1o54  * VeloY + ms*c1o54  * VeloZ + c3o1*c1o54*((c2o1*VeloY-vx2)*(c2o1*VeloY-vx2)-vx2*vx2)*on+c1o12*kxxMyyFromfcNEQ;
+      q = q_dirE[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_M00])[kw  ]=f_E   + ms*c2o27  * VeloX;	
+      q = q_dirW[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_P00])[ke  ]=f_W   - ms*c2o27  * VeloX;	
+      q = q_dirN[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_0M0])[ks  ]=f_N   + ms*c2o27  * VeloY;	
+      q = q_dirS[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_0P0])[kn  ]=f_S   - ms*c2o27  * VeloY;	
+	  q = q_dirT[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_00M])[kb  ]=f_T   + ms*c2o27  * VeloZ - c3o2*c2o27*((c2o1*VeloY-vx2)*(c2o1*VeloY-vx2)-vx2*vx2)*on;
+      q = q_dirB[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_00P])[kt  ]=f_B   - ms*c2o27  * VeloZ;
+      q = q_dirNE[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_MM0])[ksw ]=f_NE  + ms*c1o54  * VeloX + ms*c1o54  * VeloY;
+	  q = q_dirSW[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_PP0])[kne ]=f_SW  - ms*c1o54  * VeloX - ms*c1o54  * VeloY;
+	  q = q_dirSE[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_MP0])[knw ]=f_SE  + ms*c1o54  * VeloX - ms*c1o54  * VeloY;
+	  q = q_dirNW[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_PM0])[kse ]=f_NW  - ms*c1o54  * VeloX + ms*c1o54  * VeloY;
+	  q = q_dirTE[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_M0M])[kbw ]=f_TE  + ms*c1o54  * VeloX + ms*c1o54  * VeloZ - c3o2*c1o54*((c2o1*VeloY-vx2)*(c2o1*VeloY-vx2)-vx2*vx2)*on-c1o12*kxxMyyFromfcNEQ;
+	  q = q_dirBW[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_P0P])[kte ]=f_BW  - ms*c1o54  * VeloX - ms*c1o54  * VeloZ;
+	  q = q_dirBE[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_M0P])[ktw ]=f_BE  + ms*c1o54  * VeloX - ms*c1o54  * VeloZ;
+	  q = q_dirTW[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_P0M])[kbe ]=f_TW  - ms*c1o54  * VeloX + ms*c1o54  * VeloZ - c3o2*c1o54*((c2o1*VeloY-vx2)*(c2o1*VeloY-vx2)-vx2*vx2)*on-c1o12*kxxMyyFromfcNEQ;
+	  q = q_dirTN[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_0MM])[kbs ]=f_TN  + ms*c1o54  * VeloY + ms*c1o54  * VeloZ + c3o1*c1o54*((c2o1*VeloY-vx2)*(c2o1*VeloY-vx2)-vx2*vx2)*on+c1o12*kxxMyyFromfcNEQ;
+	  q = q_dirBS[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_0PP])[ktn ]=f_BS  - ms*c1o54  * VeloY - ms*c1o54  * VeloZ;
+	  q = q_dirBN[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_0MP])[kts ]=f_BN  + ms*c1o54  * VeloY - ms*c1o54  * VeloZ;
+	  q = q_dirTS[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_0PM])[kbn ]=f_TS  - ms*c1o54  * VeloY + ms*c1o54  * VeloZ + c3o1*c1o54*((c2o1*VeloY-vx2)*(c2o1*VeloY-vx2)-vx2*vx2)*on+c1o12*kxxMyyFromfcNEQ;
       q = q_dirTNE[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_MMM])[kbsw]=f_TNE + ms*c1o216 * VeloX + ms*c1o216 * VeloY + ms*c1o216 * VeloZ + c3o1*c1o216*((c2o1*VeloY-vx2)*(c2o1*VeloY-vx2)-vx2*vx2)*on;
       q = q_dirBSW[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_PPP])[ktne]=f_BSW - ms*c1o216 * VeloX - ms*c1o216 * VeloY - ms*c1o216 * VeloZ;
       q = q_dirBNE[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_MMP])[ktsw]=f_BNE + ms*c1o216 * VeloX + ms*c1o216 * VeloY - ms*c1o216 * VeloZ;
@@ -3894,24 +3908,24 @@ __global__ void QVelDevCouette27(real* vx,
       q = q_dirBNW[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_PMP])[ktse]=f_BNW - ms*c1o216 * VeloX + ms*c1o216 * VeloY - ms*c1o216 * VeloZ;
       q = q_dirBSE[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_MPP])[ktnw]=f_BSE + ms*c1o216 * VeloX - ms*c1o216 * VeloY - ms*c1o216 * VeloZ;
       q = q_dirTNW[k];	if (q>=c0o1 && q<=c1o1)	(D.f[DIR_PMM])[kbse]=f_TNW - ms*c1o216 * VeloX + ms*c1o216 * VeloY + ms*c1o216 * VeloZ + c3o1*c1o216*((c2o1*VeloY-vx2)*(c2o1*VeloY-vx2)-vx2*vx2)*on;
-      //q = q_dirE[k];	if (q>=zero && q<=one)	(D.f[DIR_M00  ])[kw  ]=f_E   + ms*c2over27  * VeloX;	
-   //   q = q_dirW[k];	if (q>=zero && q<=one)	(D.f[DIR_P00  ])[ke  ]=f_W   - ms*c2over27  * VeloX;	
-   //   q = q_dirN[k];	if (q>=zero && q<=one)	(D.f[DIR_0M0  ])[ks  ]=f_N   + ms*c2over27  * VeloY;	
-   //   q = q_dirS[k];	if (q>=zero && q<=one)	(D.f[DIR_0P0  ])[kn  ]=f_S   - ms*c2over27  * VeloY;	
-	  //q = q_dirT[k];	if (q>=zero && q<=one)	(D.f[DIR_00M  ])[kb  ]=f_T   + ms*c2over27  * VeloZ - c1o9*((two*VeloY-vx2)*(two*VeloY-vx2)-vx2*vx2)*on;
-   //   q = q_dirB[k];	if (q>=zero && q<=one)	(D.f[DIR_00P  ])[kt  ]=f_B   - ms*c2over27  * VeloZ;
-   //   q = q_dirNE[k];	if (q>=zero && q<=one)	(D.f[DIR_MM0 ])[ksw ]=f_NE  + ms*c1over54  * VeloX + ms*c1over54  * VeloY;
-	  //q = q_dirSW[k];	if (q>=zero && q<=one)	(D.f[DIR_PP0 ])[kne ]=f_SW  - ms*c1over54  * VeloX - ms*c1over54  * VeloY;
-	  //q = q_dirSE[k];	if (q>=zero && q<=one)	(D.f[DIR_MP0 ])[knw ]=f_SE  + ms*c1over54  * VeloX - ms*c1over54  * VeloY;
-	  //q = q_dirNW[k];	if (q>=zero && q<=one)	(D.f[DIR_PM0 ])[kse ]=f_NW  - ms*c1over54  * VeloX + ms*c1over54  * VeloY;
-	  //q = q_dirTE[k];	if (q>=zero && q<=one)	(D.f[DIR_M0M ])[kbw ]=f_TE  + ms*c1over54  * VeloX + ms*c1over54  * VeloZ - c1o36*((two*VeloY-vx2)*(two*VeloY-vx2)-vx2*vx2)*on;
-	  //q = q_dirBW[k];	if (q>=zero && q<=one)	(D.f[DIR_P0P ])[kte ]=f_BW  - ms*c1over54  * VeloX - ms*c1over54  * VeloZ;
-	  //q = q_dirBE[k];	if (q>=zero && q<=one)	(D.f[DIR_M0P ])[ktw ]=f_BE  + ms*c1over54  * VeloX - ms*c1over54  * VeloZ;
-	  //q = q_dirTW[k];	if (q>=zero && q<=one)	(D.f[DIR_P0M ])[kbe ]=f_TW  - ms*c1over54  * VeloX + ms*c1over54  * VeloZ - c1o36*((two*VeloY-vx2)*(two*VeloY-vx2)-vx2*vx2)*on;
-	  //q = q_dirTN[k];	if (q>=zero && q<=one)	(D.f[DIR_0MM ])[kbs ]=f_TN  + ms*c1over54  * VeloY + ms*c1over54  * VeloZ + c1o2*c1o9*((two*VeloY-vx2)*(two*VeloY-vx2)-vx2*vx2)*on;
-	  //q = q_dirBS[k];	if (q>=zero && q<=one)	(D.f[DIR_0PP ])[ktn ]=f_BS  - ms*c1over54  * VeloY - ms*c1over54  * VeloZ;
-	  //q = q_dirBN[k];	if (q>=zero && q<=one)	(D.f[DIR_0MP ])[kts ]=f_BN  + ms*c1over54  * VeloY - ms*c1over54  * VeloZ;
-	  //q = q_dirTS[k];	if (q>=zero && q<=one)	(D.f[DIR_0PM ])[kbn ]=f_TS  - ms*c1over54  * VeloY + ms*c1over54  * VeloZ + c1o2*c1o9*((two*VeloY-vx2)*(two*VeloY-vx2)-vx2*vx2)*on;
+      //q = q_dirE[k];	if (q>=zero && q<=one)	(D.f[DIR_M00])[kw  ]=f_E   + ms*c2over27  * VeloX;	
+   //   q = q_dirW[k];	if (q>=zero && q<=one)	(D.f[DIR_P00])[ke  ]=f_W   - ms*c2over27  * VeloX;	
+   //   q = q_dirN[k];	if (q>=zero && q<=one)	(D.f[DIR_0M0])[ks  ]=f_N   + ms*c2over27  * VeloY;	
+   //   q = q_dirS[k];	if (q>=zero && q<=one)	(D.f[DIR_0P0])[kn  ]=f_S   - ms*c2over27  * VeloY;	
+	  //q = q_dirT[k];	if (q>=zero && q<=one)	(D.f[DIR_00M])[kb  ]=f_T   + ms*c2over27  * VeloZ - c1o9*((two*VeloY-vx2)*(two*VeloY-vx2)-vx2*vx2)*on;
+   //   q = q_dirB[k];	if (q>=zero && q<=one)	(D.f[DIR_00P])[kt  ]=f_B   - ms*c2over27  * VeloZ;
+   //   q = q_dirNE[k];	if (q>=zero && q<=one)	(D.f[DIR_MM0])[ksw ]=f_NE  + ms*c1over54  * VeloX + ms*c1over54  * VeloY;
+	  //q = q_dirSW[k];	if (q>=zero && q<=one)	(D.f[DIR_PP0])[kne ]=f_SW  - ms*c1over54  * VeloX - ms*c1over54  * VeloY;
+	  //q = q_dirSE[k];	if (q>=zero && q<=one)	(D.f[DIR_MP0])[knw ]=f_SE  + ms*c1over54  * VeloX - ms*c1over54  * VeloY;
+	  //q = q_dirNW[k];	if (q>=zero && q<=one)	(D.f[DIR_PM0])[kse ]=f_NW  - ms*c1over54  * VeloX + ms*c1over54  * VeloY;
+	  //q = q_dirTE[k];	if (q>=zero && q<=one)	(D.f[DIR_M0M])[kbw ]=f_TE  + ms*c1over54  * VeloX + ms*c1over54  * VeloZ - c1o36*((two*VeloY-vx2)*(two*VeloY-vx2)-vx2*vx2)*on;
+	  //q = q_dirBW[k];	if (q>=zero && q<=one)	(D.f[DIR_P0P])[kte ]=f_BW  - ms*c1over54  * VeloX - ms*c1over54  * VeloZ;
+	  //q = q_dirBE[k];	if (q>=zero && q<=one)	(D.f[DIR_M0P])[ktw ]=f_BE  + ms*c1over54  * VeloX - ms*c1over54  * VeloZ;
+	  //q = q_dirTW[k];	if (q>=zero && q<=one)	(D.f[DIR_P0M])[kbe ]=f_TW  - ms*c1over54  * VeloX + ms*c1over54  * VeloZ - c1o36*((two*VeloY-vx2)*(two*VeloY-vx2)-vx2*vx2)*on;
+	  //q = q_dirTN[k];	if (q>=zero && q<=one)	(D.f[DIR_0MM])[kbs ]=f_TN  + ms*c1over54  * VeloY + ms*c1over54  * VeloZ + c1o2*c1o9*((two*VeloY-vx2)*(two*VeloY-vx2)-vx2*vx2)*on;
+	  //q = q_dirBS[k];	if (q>=zero && q<=one)	(D.f[DIR_0PP])[ktn ]=f_BS  - ms*c1over54  * VeloY - ms*c1over54  * VeloZ;
+	  //q = q_dirBN[k];	if (q>=zero && q<=one)	(D.f[DIR_0MP])[kts ]=f_BN  + ms*c1over54  * VeloY - ms*c1over54  * VeloZ;
+	  //q = q_dirTS[k];	if (q>=zero && q<=one)	(D.f[DIR_0PM])[kbn ]=f_TS  - ms*c1over54  * VeloY + ms*c1over54  * VeloZ + c1o2*c1o9*((two*VeloY-vx2)*(two*VeloY-vx2)-vx2*vx2)*on;
    //   q = q_dirTNE[k];	if (q>=zero && q<=one)	(D.f[DIR_MMM])[kbsw]=f_TNE + ms*c1over216 * VeloX + ms*c1over216 * VeloY + ms*c1over216 * VeloZ + c1o2*c1o36*((two*VeloY-vx2)*(two*VeloY-vx2)-vx2*vx2)*on;
    //   q = q_dirBSW[k];	if (q>=zero && q<=one)	(D.f[DIR_PPP])[ktne]=f_BSW - ms*c1over216 * VeloX - ms*c1over216 * VeloY - ms*c1over216 * VeloZ;
    //   q = q_dirBNE[k];	if (q>=zero && q<=one)	(D.f[DIR_MMP])[ktsw]=f_BNE + ms*c1over216 * VeloX + ms*c1over216 * VeloY - ms*c1over216 * VeloZ;
@@ -3964,87 +3978,88 @@ __global__ void QVelDevCouette27(real* vx,
 
 
 //////////////////////////////////////////////////////////////////////////////
-__global__ void QVelDev1h27( int inx,
-										int iny,
-										real* vx,
-										real* vy,
-										real* vz,
-										real* DD, 
-										int* k_Q, 
-										real* QQ,
-										unsigned int numberOfBCnodes, 
-										real om1,
-										real Phi,
-										real angularVelocity,
-										unsigned int* neighborX,
-										unsigned int* neighborY,
-										unsigned int* neighborZ,
-										real* coordX,
-										real* coordY,
-										real* coordZ,
-										unsigned int size_Mat, 
-										bool isEvenTimestep)
+__global__ void QVelDev1h27(
+    int inx,
+    int iny,
+    real* vx,
+    real* vy,
+    real* vz,
+    real* DD, 
+    int* k_Q, 
+    real* QQ,
+    unsigned int numberOfBCnodes, 
+    real om1,
+    real Phi,
+    real angularVelocity,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    real* coordX,
+    real* coordY,
+    real* coordZ,
+    unsigned long long numberOfLBnodes, 
+    bool isEvenTimestep)
 {
 	Distributions27 D;
 	if (isEvenTimestep==true)
 	{
-		D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-		D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-		D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-		D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-		D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-		D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-		D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-		D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-		D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-		D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-		D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-		D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-		D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-		D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-		D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-		D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-		D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-		D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-		D.f[DIR_000] = &DD[DIR_000*size_Mat];
-		D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-		D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-		D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-		D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-		D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-		D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-		D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-		D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+		D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+		D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+		D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+		D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+		D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+		D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+		D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+		D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+		D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+		D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+		D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+		D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+		D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+		D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+		D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+		D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+		D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+		D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+		D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+		D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+		D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+		D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+		D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+		D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+		D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+		D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+		D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
 	} 
 	else
 	{
-		D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-		D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-		D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-		D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-		D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-		D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-		D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-		D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-		D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-		D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-		D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-		D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-		D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-		D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-		D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-		D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-		D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-		D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-		D.f[DIR_000] = &DD[DIR_000*size_Mat];
-		D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-		D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-		D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-		D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-		D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-		D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-		D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-		D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+		D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+		D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+		D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+		D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+		D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+		D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+		D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+		D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+		D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+		D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+		D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+		D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+		D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+		D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+		D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+		D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+		D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+		D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+		D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+		D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+		D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+		D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+		D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+		D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+		D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+		D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+		D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
 	}
 	////////////////////////////////////////////////////////////////////////////////
 	const unsigned  x = threadIdx.x;  // Globaler x-Index 
@@ -4079,24 +4094,24 @@ __global__ void QVelDev1h27( int inx,
 			*q_dirBE,  *q_dirTW,  *q_dirTN,  *q_dirBS,  *q_dirBN,  *q_dirTS,
 			*q_dirTNE, *q_dirTSW, *q_dirTSE, *q_dirTNW, *q_dirBNE, *q_dirBSW,
 			*q_dirBSE, *q_dirBNW; 
-		q_dirE   = &QQ[DIR_P00   * numberOfBCnodes];
-		q_dirW   = &QQ[DIR_M00   * numberOfBCnodes];
-		q_dirN   = &QQ[DIR_0P0   * numberOfBCnodes];
-		q_dirS   = &QQ[DIR_0M0   * numberOfBCnodes];
-		q_dirT   = &QQ[DIR_00P   * numberOfBCnodes];
-		q_dirB   = &QQ[DIR_00M   * numberOfBCnodes];
-		q_dirNE  = &QQ[DIR_PP0  * numberOfBCnodes];
-		q_dirSW  = &QQ[DIR_MM0  * numberOfBCnodes];
-		q_dirSE  = &QQ[DIR_PM0  * numberOfBCnodes];
-		q_dirNW  = &QQ[DIR_MP0  * numberOfBCnodes];
-		q_dirTE  = &QQ[DIR_P0P  * numberOfBCnodes];
-		q_dirBW  = &QQ[DIR_M0M  * numberOfBCnodes];
-		q_dirBE  = &QQ[DIR_P0M  * numberOfBCnodes];
-		q_dirTW  = &QQ[DIR_M0P  * numberOfBCnodes];
-		q_dirTN  = &QQ[DIR_0PP  * numberOfBCnodes];
-		q_dirBS  = &QQ[DIR_0MM  * numberOfBCnodes];
-		q_dirBN  = &QQ[DIR_0PM  * numberOfBCnodes];
-		q_dirTS  = &QQ[DIR_0MP  * numberOfBCnodes];
+		q_dirE   = &QQ[DIR_P00 * numberOfBCnodes];
+		q_dirW   = &QQ[DIR_M00 * numberOfBCnodes];
+		q_dirN   = &QQ[DIR_0P0 * numberOfBCnodes];
+		q_dirS   = &QQ[DIR_0M0 * numberOfBCnodes];
+		q_dirT   = &QQ[DIR_00P * numberOfBCnodes];
+		q_dirB   = &QQ[DIR_00M * numberOfBCnodes];
+		q_dirNE  = &QQ[DIR_PP0 * numberOfBCnodes];
+		q_dirSW  = &QQ[DIR_MM0 * numberOfBCnodes];
+		q_dirSE  = &QQ[DIR_PM0 * numberOfBCnodes];
+		q_dirNW  = &QQ[DIR_MP0 * numberOfBCnodes];
+		q_dirTE  = &QQ[DIR_P0P * numberOfBCnodes];
+		q_dirBW  = &QQ[DIR_M0M * numberOfBCnodes];
+		q_dirBE  = &QQ[DIR_P0M * numberOfBCnodes];
+		q_dirTW  = &QQ[DIR_M0P * numberOfBCnodes];
+		q_dirTN  = &QQ[DIR_0PP * numberOfBCnodes];
+		q_dirBS  = &QQ[DIR_0MM * numberOfBCnodes];
+		q_dirBN  = &QQ[DIR_0PM * numberOfBCnodes];
+		q_dirTS  = &QQ[DIR_0MP * numberOfBCnodes];
 		q_dirTNE = &QQ[DIR_PPP * numberOfBCnodes];
 		q_dirTSW = &QQ[DIR_MMP * numberOfBCnodes];
 		q_dirTSE = &QQ[DIR_PMP * numberOfBCnodes];
@@ -4167,32 +4182,32 @@ __global__ void QVelDev1h27( int inx,
 		//real f_E,  f_W,  f_N,  f_S,  f_T,  f_B,   f_NE,  f_SW,  f_SE,  f_NW,  f_TE,  f_BW,  f_BE,
 		//	f_TW, f_TN, f_BS, f_BN, f_TS, f_TNE, f_TSW, f_TSE, f_TNW, f_BNE, f_BSW, f_BSE, f_BNW;
 
-		//f_W    = (D.f[DIR_P00   ])[ke   ];
-		//f_E    = (D.f[DIR_M00   ])[kw   ];
-		//f_S    = (D.f[DIR_0P0   ])[kn   ];
-		//f_N    = (D.f[DIR_0M0   ])[ks   ];
-		//f_B    = (D.f[DIR_00P   ])[kt   ];
-		//f_T    = (D.f[DIR_00M   ])[kb   ];
-		//f_SW   = (D.f[DIR_PP0  ])[kne  ];
-		//f_NE   = (D.f[DIR_MM0  ])[ksw  ];
-		//f_NW   = (D.f[DIR_PM0  ])[kse  ];
-		//f_SE   = (D.f[DIR_MP0  ])[knw  ];
-		//f_BW   = (D.f[DIR_P0P  ])[kte  ];
-		//f_TE   = (D.f[DIR_M0M  ])[kbw  ];
-		//f_TW   = (D.f[DIR_P0M  ])[kbe  ];
-		//f_BE   = (D.f[DIR_M0P  ])[ktw  ];
-		//f_BS   = (D.f[DIR_0PP  ])[ktn  ];
-		//f_TN   = (D.f[DIR_0MM  ])[kbs  ];
-		//f_TS   = (D.f[DIR_0PM  ])[kbn  ];
-		//f_BN   = (D.f[DIR_0MP  ])[kts  ];
-		//f_BSW  = (D.f[DIR_PPP ])[ktne ];
-		//f_BNE  = (D.f[DIR_MMP ])[ktsw ];
-		//f_BNW  = (D.f[DIR_PMP ])[ktse ];
-		//f_BSE  = (D.f[DIR_MPP ])[ktnw ];
-		//f_TSW  = (D.f[DIR_PPM ])[kbne ];
-		//f_TNE  = (D.f[DIR_MMM ])[kbsw ];
-		//f_TNW  = (D.f[DIR_PMM ])[kbse ];
-		//f_TSE  = (D.f[DIR_MPM ])[kbnw ];
+		//f_W    = (D.f[DIR_P00])[ke   ];
+		//f_E    = (D.f[DIR_M00])[kw   ];
+		//f_S    = (D.f[DIR_0P0])[kn   ];
+		//f_N    = (D.f[DIR_0M0])[ks   ];
+		//f_B    = (D.f[DIR_00P])[kt   ];
+		//f_T    = (D.f[DIR_00M])[kb   ];
+		//f_SW   = (D.f[DIR_PP0])[kne  ];
+		//f_NE   = (D.f[DIR_MM0])[ksw  ];
+		//f_NW   = (D.f[DIR_PM0])[kse  ];
+		//f_SE   = (D.f[DIR_MP0])[knw  ];
+		//f_BW   = (D.f[DIR_P0P])[kte  ];
+		//f_TE   = (D.f[DIR_M0M])[kbw  ];
+		//f_TW   = (D.f[DIR_P0M])[kbe  ];
+		//f_BE   = (D.f[DIR_M0P])[ktw  ];
+		//f_BS   = (D.f[DIR_0PP])[ktn  ];
+		//f_TN   = (D.f[DIR_0MM])[kbs  ];
+		//f_TS   = (D.f[DIR_0PM])[kbn  ];
+		//f_BN   = (D.f[DIR_0MP])[kts  ];
+		//f_BSW  = (D.f[DIR_PPP])[ktne ];
+		//f_BNE  = (D.f[DIR_MMP])[ktsw ];
+		//f_BNW  = (D.f[DIR_PMP])[ktse ];
+		//f_BSE  = (D.f[DIR_MPP])[ktnw ];
+		//f_TSW  = (D.f[DIR_PPM])[kbne ];
+		//f_TNE  = (D.f[DIR_MMM])[kbsw ];
+		//f_TNW  = (D.f[DIR_PMM])[kbse ];
+		//f_TSE  = (D.f[DIR_MPM])[kbnw ];
 		////////////////////////////////////////////////////////////////////////////////
 		real /*vx1, vx2,*/ vx3, drho, feq, q, cu_sq;
 		//drho   =  f_TSE + f_TNW + f_TNE + f_TSW + f_BSE + f_BNW + f_BNE + f_BSW +
@@ -4217,63 +4232,63 @@ __global__ void QVelDev1h27( int inx,
 		//////////////////////////////////////////////////////////////////////////
 		if (isEvenTimestep==false)
 		{
-			D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-			D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-			D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-			D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-			D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-			D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-			D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-			D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-			D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-			D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-			D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-			D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-			D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-			D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-			D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-			D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-			D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-			D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-			D.f[DIR_000] = &DD[DIR_000*size_Mat];
-			D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-			D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-			D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-			D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-			D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-			D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-			D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-			D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+			D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+			D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+			D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+			D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+			D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+			D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+			D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+			D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+			D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+			D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+			D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+			D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+			D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+			D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+			D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+			D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+			D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+			D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+			D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+			D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+			D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+			D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+			D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+			D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+			D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+			D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+			D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
 		} 
 		else
 		{
-			D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-			D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-			D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-			D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-			D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-			D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-			D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-			D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-			D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-			D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-			D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-			D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-			D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-			D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-			D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-			D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-			D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-			D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-			D.f[DIR_000] = &DD[DIR_000*size_Mat];
-			D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-			D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-			D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-			D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-			D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-			D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-			D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-			D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+			D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+			D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+			D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+			D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+			D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+			D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+			D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+			D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+			D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+			D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+			D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+			D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+			D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+			D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+			D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+			D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+			D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+			D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+			D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+			D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+			D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+			D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+			D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+			D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+			D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+			D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+			D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
 		}
 		////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 		//Test
@@ -4748,39 +4763,32 @@ __global__ void QVelDev1h27( int inx,
 
 //////////////////////////////////////////////////////////////////////////////
 __global__ void QVelDeviceComp27(
-											real* velocityX,
-											real* velocityY,
-											real* velocityZ,
-											real* distributions,
-											int* subgridDistanceIndices,
-											real* subgridDistances,
-											unsigned int numberOfBCnodes,
-											real omega,
-											unsigned int* neighborX,
-											unsigned int* neighborY,
-											unsigned int* neighborZ,
-											unsigned int numberOfLBnodes,
-											bool isEvenTimestep)
+    real* velocityX,
+    real* velocityY,
+    real* velocityZ,
+    real* distributions,
+    int* subgridDistanceIndices,
+    real* subgridDistances,
+    unsigned int numberOfBCnodes,
+    real omega,
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes,
+    bool isEvenTimestep)
 {
    //////////////////////////////////////////////////////////////////////////
    //! The velocity boundary condition is executed in the following steps
    //!
-   ////////////////////////////////////////////////////////////////////////////////
-   //! - Get node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
-   //!
-   const unsigned  x = threadIdx.x;  // global x-index 
-   const unsigned  y = blockIdx.x;   // global y-index 
-   const unsigned  z = blockIdx.y;   // global z-index 
-
-   const unsigned nx = blockDim.x;
-   const unsigned ny = gridDim.x;
-
-   const unsigned k = nx*(ny*z + y) + x;
+    ////////////////////////////////////////////////////////////////////////////////
+    //! - Get node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
+    //!
+    const unsigned nodeIndex = getNodeIndex();
 
    //////////////////////////////////////////////////////////////////////////
    //! - Run for all indices in size of boundary condition (numberOfBCnodes)
    //!
-   if(k < numberOfBCnodes)
+   if(nodeIndex < numberOfBCnodes)
    {
       //////////////////////////////////////////////////////////////////////////
       //! - Read distributions: style of reading and writing the distributions from/to stored arrays dependent on timestep is based on the esoteric twist algorithm \ref
@@ -4792,9 +4800,9 @@ __global__ void QVelDeviceComp27(
       ////////////////////////////////////////////////////////////////////////////////
       //! - Set local velocities
       //!
-      real VeloX = velocityX[k];
-      real VeloY = velocityY[k];
-      real VeloZ = velocityZ[k];
+      real VeloX = velocityX[nodeIndex];
+      real VeloY = velocityY[nodeIndex];
+      real VeloZ = velocityZ[nodeIndex];
 
       ////////////////////////////////////////////////////////////////////////////////
       //! - Set local subgrid distances (q's)
@@ -4805,7 +4813,7 @@ __global__ void QVelDeviceComp27(
       ////////////////////////////////////////////////////////////////////////////////
       //! - Set neighbor indices (necessary for indirect addressing)
       //!
-      unsigned int indexOfBCnode  = subgridDistanceIndices[k];
+      unsigned int indexOfBCnode  = subgridDistanceIndices[nodeIndex];
       unsigned int kzero= indexOfBCnode;
       unsigned int ke   = indexOfBCnode;
       unsigned int kw   = neighborX[indexOfBCnode];
@@ -4837,32 +4845,32 @@ __global__ void QVelDeviceComp27(
       ////////////////////////////////////////////////////////////////////////////////
       //! - Set local distributions
       //!
-      real f_W    = (dist.f[DIR_P00   ])[ke   ];
-      real f_E    = (dist.f[DIR_M00   ])[kw   ];
-      real f_S    = (dist.f[DIR_0P0   ])[kn   ];
-      real f_N    = (dist.f[DIR_0M0   ])[ks   ];
-      real f_B    = (dist.f[DIR_00P   ])[kt   ];
-      real f_T    = (dist.f[DIR_00M   ])[kb   ];
-      real f_SW   = (dist.f[DIR_PP0  ])[kne  ];
-      real f_NE   = (dist.f[DIR_MM0  ])[ksw  ];
-      real f_NW   = (dist.f[DIR_PM0  ])[kse  ];
-      real f_SE   = (dist.f[DIR_MP0  ])[knw  ];
-      real f_BW   = (dist.f[DIR_P0P  ])[kte  ];
-      real f_TE   = (dist.f[DIR_M0M  ])[kbw  ];
-      real f_TW   = (dist.f[DIR_P0M  ])[kbe  ];
-      real f_BE   = (dist.f[DIR_M0P  ])[ktw  ];
-      real f_BS   = (dist.f[DIR_0PP  ])[ktn  ];
-      real f_TN   = (dist.f[DIR_0MM  ])[kbs  ];
-      real f_TS   = (dist.f[DIR_0PM  ])[kbn  ];
-      real f_BN   = (dist.f[DIR_0MP  ])[kts  ];
-      real f_BSW  = (dist.f[DIR_PPP ])[ktne ];
-      real f_BNE  = (dist.f[DIR_MMP ])[ktsw ];
-      real f_BNW  = (dist.f[DIR_PMP ])[ktse ];
-      real f_BSE  = (dist.f[DIR_MPP ])[ktnw ];
-      real f_TSW  = (dist.f[DIR_PPM ])[kbne ];
-      real f_TNE  = (dist.f[DIR_MMM ])[kbsw ];
-      real f_TNW  = (dist.f[DIR_PMM ])[kbse ];
-      real f_TSE  = (dist.f[DIR_MPM ])[kbnw ];
+      real f_W    = (dist.f[DIR_P00])[ke   ];
+      real f_E    = (dist.f[DIR_M00])[kw   ];
+      real f_S    = (dist.f[DIR_0P0])[kn   ];
+      real f_N    = (dist.f[DIR_0M0])[ks   ];
+      real f_B    = (dist.f[DIR_00P])[kt   ];
+      real f_T    = (dist.f[DIR_00M])[kb   ];
+      real f_SW   = (dist.f[DIR_PP0])[kne  ];
+      real f_NE   = (dist.f[DIR_MM0])[ksw  ];
+      real f_NW   = (dist.f[DIR_PM0])[kse  ];
+      real f_SE   = (dist.f[DIR_MP0])[knw  ];
+      real f_BW   = (dist.f[DIR_P0P])[kte  ];
+      real f_TE   = (dist.f[DIR_M0M])[kbw  ];
+      real f_TW   = (dist.f[DIR_P0M])[kbe  ];
+      real f_BE   = (dist.f[DIR_M0P])[ktw  ];
+      real f_BS   = (dist.f[DIR_0PP])[ktn  ];
+      real f_TN   = (dist.f[DIR_0MM])[kbs  ];
+      real f_TS   = (dist.f[DIR_0PM])[kbn  ];
+      real f_BN   = (dist.f[DIR_0MP])[kts  ];
+      real f_BSW  = (dist.f[DIR_PPP])[ktne ];
+      real f_BNE  = (dist.f[DIR_MMP])[ktsw ];
+      real f_BNW  = (dist.f[DIR_PMP])[ktse ];
+      real f_BSE  = (dist.f[DIR_MPP])[ktnw ];
+      real f_TSW  = (dist.f[DIR_PPM])[kbne ];
+      real f_TNE  = (dist.f[DIR_MMM])[kbsw ];
+      real f_TNW  = (dist.f[DIR_PMM])[kbse ];
+      real f_TSE  = (dist.f[DIR_MPM])[kbnw ];
 
       ////////////////////////////////////////////////////////////////////////////////
       //! - Calculate macroscopic quantities
@@ -4894,7 +4902,7 @@ __global__ void QVelDeviceComp27(
       //! - Update distributions with subgrid distance (q) between zero and one
       //!
       real feq, q, velocityLB, velocityBC;
-      q = (subgridD.q[DIR_P00])[k];
+      q = (subgridD.q[DIR_P00])[nodeIndex];
       if (q>=c0o1 && q<=c1o1) // only update distribution for q between zero and one
       {
          velocityLB = vx1;
@@ -4903,7 +4911,7 @@ __global__ void QVelDeviceComp27(
          (dist.f[DIR_M00])[kw] = getInterpolatedDistributionForVeloBC(q, f_E, f_W, feq, omega, velocityBC, c2o27);
       }
 
-      q = (subgridD.q[DIR_M00])[k];
+      q = (subgridD.q[DIR_M00])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx1;
@@ -4912,7 +4920,7 @@ __global__ void QVelDeviceComp27(
          (dist.f[DIR_P00])[ke] = getInterpolatedDistributionForVeloBC(q, f_W, f_E, feq, omega, velocityBC, c2o27);
       }
 
-      q = (subgridD.q[DIR_0P0])[k];
+      q = (subgridD.q[DIR_0P0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx2;
@@ -4921,7 +4929,7 @@ __global__ void QVelDeviceComp27(
          (dist.f[DIR_0M0])[ks] = getInterpolatedDistributionForVeloBC(q, f_N, f_S, feq, omega, velocityBC, c2o27);
       }
 
-      q = (subgridD.q[DIR_0M0])[k];
+      q = (subgridD.q[DIR_0M0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx2;
@@ -4930,7 +4938,7 @@ __global__ void QVelDeviceComp27(
          (dist.f[DIR_0P0])[kn] = getInterpolatedDistributionForVeloBC(q, f_S, f_N, feq, omega, velocityBC, c2o27);
       }
 
-      q = (subgridD.q[DIR_00P])[k];
+      q = (subgridD.q[DIR_00P])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx3;
@@ -4939,7 +4947,7 @@ __global__ void QVelDeviceComp27(
          (dist.f[DIR_00M])[kb] = getInterpolatedDistributionForVeloBC(q, f_T, f_B, feq, omega, velocityBC, c2o27);
       }
 
-      q = (subgridD.q[DIR_00M])[k];
+      q = (subgridD.q[DIR_00M])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx3;
@@ -4948,7 +4956,7 @@ __global__ void QVelDeviceComp27(
          (dist.f[DIR_00P])[kt] = getInterpolatedDistributionForVeloBC(q, f_B, f_T, feq, omega, velocityBC, c2o27);
       }
 
-      q = (subgridD.q[DIR_PP0])[k];
+      q = (subgridD.q[DIR_PP0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx1 + vx2;
@@ -4957,7 +4965,7 @@ __global__ void QVelDeviceComp27(
          (dist.f[DIR_MM0])[ksw] = getInterpolatedDistributionForVeloBC(q, f_NE, f_SW, feq, omega, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_MM0])[k];
+      q = (subgridD.q[DIR_MM0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx1 - vx2;
@@ -4966,7 +4974,7 @@ __global__ void QVelDeviceComp27(
          (dist.f[DIR_PP0])[kne] = getInterpolatedDistributionForVeloBC(q, f_SW, f_NE, feq, omega, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_PM0])[k];
+      q = (subgridD.q[DIR_PM0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx1 - vx2;
@@ -4975,7 +4983,7 @@ __global__ void QVelDeviceComp27(
          (dist.f[DIR_MP0])[knw] = getInterpolatedDistributionForVeloBC(q, f_SE, f_NW, feq, omega, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_MP0])[k];
+      q = (subgridD.q[DIR_MP0])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx1 + vx2;
@@ -4984,7 +4992,7 @@ __global__ void QVelDeviceComp27(
          (dist.f[DIR_PM0])[kse] = getInterpolatedDistributionForVeloBC(q, f_NW, f_SE, feq, omega, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_P0P])[k];
+      q = (subgridD.q[DIR_P0P])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx1 + vx3;
@@ -4993,7 +5001,7 @@ __global__ void QVelDeviceComp27(
          (dist.f[DIR_M0M])[kbw] = getInterpolatedDistributionForVeloBC(q, f_TE, f_BW, feq, omega, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_M0M])[k];
+      q = (subgridD.q[DIR_M0M])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx1 - vx3;
@@ -5002,7 +5010,7 @@ __global__ void QVelDeviceComp27(
          (dist.f[DIR_P0P])[kte] = getInterpolatedDistributionForVeloBC(q, f_BW, f_TE, feq, omega, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_P0M])[k];
+      q = (subgridD.q[DIR_P0M])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx1 - vx3;
@@ -5011,7 +5019,7 @@ __global__ void QVelDeviceComp27(
          (dist.f[DIR_M0P])[ktw] = getInterpolatedDistributionForVeloBC(q, f_BE, f_TW, feq, omega, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_M0P])[k];
+      q = (subgridD.q[DIR_M0P])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx1 + vx3;
@@ -5020,7 +5028,7 @@ __global__ void QVelDeviceComp27(
          (dist.f[DIR_P0M])[kbe] = getInterpolatedDistributionForVeloBC(q, f_TW, f_BE, feq, omega, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_0PP])[k];
+      q = (subgridD.q[DIR_0PP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx2 + vx3;
@@ -5029,7 +5037,7 @@ __global__ void QVelDeviceComp27(
          (dist.f[DIR_0MM])[kbs] = getInterpolatedDistributionForVeloBC(q, f_TN, f_BS, feq, omega, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_0MM])[k];
+      q = (subgridD.q[DIR_0MM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx2 - vx3;
@@ -5038,7 +5046,7 @@ __global__ void QVelDeviceComp27(
          (dist.f[DIR_0PP])[ktn] = getInterpolatedDistributionForVeloBC(q, f_BS, f_TN, feq, omega, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_0PM])[k];
+      q = (subgridD.q[DIR_0PM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx2 - vx3;
@@ -5047,7 +5055,7 @@ __global__ void QVelDeviceComp27(
          (dist.f[DIR_0MP])[kts] = getInterpolatedDistributionForVeloBC(q, f_BN, f_TS, feq, omega, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_0MP])[k];
+      q = (subgridD.q[DIR_0MP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx2 + vx3;
@@ -5056,7 +5064,7 @@ __global__ void QVelDeviceComp27(
          (dist.f[DIR_0PM])[kbn] = getInterpolatedDistributionForVeloBC(q, f_TS, f_BN, feq, omega, velocityBC, c1o54);
       }
 
-      q = (subgridD.q[DIR_PPP])[k];
+      q = (subgridD.q[DIR_PPP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx1 + vx2 + vx3;
@@ -5065,7 +5073,7 @@ __global__ void QVelDeviceComp27(
          (dist.f[DIR_MMM])[kbsw] = getInterpolatedDistributionForVeloBC(q, f_TNE, f_BSW, feq, omega, velocityBC, c1o216);
       }
 
-      q = (subgridD.q[DIR_MMM])[k];
+      q = (subgridD.q[DIR_MMM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx1 - vx2 - vx3;
@@ -5074,7 +5082,7 @@ __global__ void QVelDeviceComp27(
          (dist.f[DIR_PPP])[ktne] = getInterpolatedDistributionForVeloBC(q, f_BSW, f_TNE, feq, omega, velocityBC, c1o216);
       }
 
-      q = (subgridD.q[DIR_PPM])[k];
+      q = (subgridD.q[DIR_PPM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx1 + vx2 - vx3;
@@ -5083,7 +5091,7 @@ __global__ void QVelDeviceComp27(
          (dist.f[DIR_MMP])[ktsw] = getInterpolatedDistributionForVeloBC(q, f_BNE, f_TSW, feq, omega, velocityBC, c1o216);
       }
 
-      q = (subgridD.q[DIR_MMP])[k];
+      q = (subgridD.q[DIR_MMP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx1 - vx2 + vx3;
@@ -5092,7 +5100,7 @@ __global__ void QVelDeviceComp27(
          (dist.f[DIR_PPM])[kbne] = getInterpolatedDistributionForVeloBC(q, f_TSW, f_BNE, feq, omega, velocityBC, c1o216);
       }
 
-      q = (subgridD.q[DIR_PMP])[k];
+      q = (subgridD.q[DIR_PMP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx1 - vx2 + vx3;
@@ -5101,7 +5109,7 @@ __global__ void QVelDeviceComp27(
          (dist.f[DIR_MPM])[kbnw] = getInterpolatedDistributionForVeloBC(q, f_TSE, f_BNW, feq, omega, velocityBC, c1o216);
       }
 
-      q = (subgridD.q[DIR_MPM])[k];
+      q = (subgridD.q[DIR_MPM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx1 + vx2 - vx3;
@@ -5110,7 +5118,7 @@ __global__ void QVelDeviceComp27(
          (dist.f[DIR_PMP])[ktse] = getInterpolatedDistributionForVeloBC(q, f_BNW, f_TSE, feq, omega, velocityBC, c1o216);
       }
 
-      q = (subgridD.q[DIR_PMM])[k];
+      q = (subgridD.q[DIR_PMM])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = vx1 - vx2 - vx3;
@@ -5119,7 +5127,7 @@ __global__ void QVelDeviceComp27(
          (dist.f[DIR_MPP])[ktnw] = getInterpolatedDistributionForVeloBC(q, f_BSE, f_TNW, feq, omega, velocityBC, c1o216);
       }
 
-      q = (subgridD.q[DIR_MPP])[k];
+      q = (subgridD.q[DIR_MPP])[nodeIndex];
       if (q>=c0o1 && q<=c1o1)
       {
          velocityLB = -vx1 + vx2 + vx3;
@@ -5170,82 +5178,83 @@ __global__ void QVelDeviceComp27(
 
 
 //////////////////////////////////////////////////////////////////////////////
-__global__ void QVelDevice27(int inx,
-                                        int iny,
-                                        real* vx,
-                                        real* vy,
-                                        real* vz,
-                                        real* DD, 
-                                        int* k_Q, 
-                                        real* QQ,
-                                        unsigned int numberOfBCnodes, 
-                                        real om1, 
-                                        unsigned int* neighborX,
-                                        unsigned int* neighborY,
-                                        unsigned int* neighborZ,
-                                        unsigned int size_Mat, 
-                                        bool isEvenTimestep)
+__global__ void QVelDevice27(
+    int inx,
+    int iny,
+    real* vx,
+    real* vy,
+    real* vz,
+    real* DD, 
+    int* k_Q, 
+    real* QQ,
+    unsigned int numberOfBCnodes, 
+    real om1, 
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    unsigned long long numberOfLBnodes, 
+    bool isEvenTimestep)
 {
    Distributions27 D;
    if (isEvenTimestep==true)
    {
-      D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+      D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
    } 
    else
    {
-      D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+      D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
    }
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  x = threadIdx.x;  // Globaler x-Index 
@@ -5270,24 +5279,24 @@ __global__ void QVelDevice27(int inx,
             *q_dirBE,  *q_dirTW,  *q_dirTN,  *q_dirBS,  *q_dirBN,  *q_dirTS,
             *q_dirTNE, *q_dirTSW, *q_dirTSE, *q_dirTNW, *q_dirBNE, *q_dirBSW,
             *q_dirBSE, *q_dirBNW; 
-      q_dirE   = &QQ[DIR_P00   * numberOfBCnodes];
-      q_dirW   = &QQ[DIR_M00   * numberOfBCnodes];
-      q_dirN   = &QQ[DIR_0P0   * numberOfBCnodes];
-      q_dirS   = &QQ[DIR_0M0   * numberOfBCnodes];
-      q_dirT   = &QQ[DIR_00P   * numberOfBCnodes];
-      q_dirB   = &QQ[DIR_00M   * numberOfBCnodes];
-      q_dirNE  = &QQ[DIR_PP0  * numberOfBCnodes];
-      q_dirSW  = &QQ[DIR_MM0  * numberOfBCnodes];
-      q_dirSE  = &QQ[DIR_PM0  * numberOfBCnodes];
-      q_dirNW  = &QQ[DIR_MP0  * numberOfBCnodes];
-      q_dirTE  = &QQ[DIR_P0P  * numberOfBCnodes];
-      q_dirBW  = &QQ[DIR_M0M  * numberOfBCnodes];
-      q_dirBE  = &QQ[DIR_P0M  * numberOfBCnodes];
-      q_dirTW  = &QQ[DIR_M0P  * numberOfBCnodes];
-      q_dirTN  = &QQ[DIR_0PP  * numberOfBCnodes];
-      q_dirBS  = &QQ[DIR_0MM  * numberOfBCnodes];
-      q_dirBN  = &QQ[DIR_0PM  * numberOfBCnodes];
-      q_dirTS  = &QQ[DIR_0MP  * numberOfBCnodes];
+      q_dirE   = &QQ[DIR_P00 * numberOfBCnodes];
+      q_dirW   = &QQ[DIR_M00 * numberOfBCnodes];
+      q_dirN   = &QQ[DIR_0P0 * numberOfBCnodes];
+      q_dirS   = &QQ[DIR_0M0 * numberOfBCnodes];
+      q_dirT   = &QQ[DIR_00P * numberOfBCnodes];
+      q_dirB   = &QQ[DIR_00M * numberOfBCnodes];
+      q_dirNE  = &QQ[DIR_PP0 * numberOfBCnodes];
+      q_dirSW  = &QQ[DIR_MM0 * numberOfBCnodes];
+      q_dirSE  = &QQ[DIR_PM0 * numberOfBCnodes];
+      q_dirNW  = &QQ[DIR_MP0 * numberOfBCnodes];
+      q_dirTE  = &QQ[DIR_P0P * numberOfBCnodes];
+      q_dirBW  = &QQ[DIR_M0M * numberOfBCnodes];
+      q_dirBE  = &QQ[DIR_P0M * numberOfBCnodes];
+      q_dirTW  = &QQ[DIR_M0P * numberOfBCnodes];
+      q_dirTN  = &QQ[DIR_0PP * numberOfBCnodes];
+      q_dirBS  = &QQ[DIR_0MM * numberOfBCnodes];
+      q_dirBN  = &QQ[DIR_0PM * numberOfBCnodes];
+      q_dirTS  = &QQ[DIR_0MP * numberOfBCnodes];
       q_dirTNE = &QQ[DIR_PPP * numberOfBCnodes];
       q_dirTSW = &QQ[DIR_MMP * numberOfBCnodes];
       q_dirTSE = &QQ[DIR_PMP * numberOfBCnodes];
@@ -5358,32 +5367,32 @@ __global__ void QVelDevice27(int inx,
       real f_E,  f_W,  f_N,  f_S,  f_T,  f_B,   f_NE,  f_SW,  f_SE,  f_NW,  f_TE,  f_BW,  f_BE,
          f_TW, f_TN, f_BS, f_BN, f_TS, f_TNE, f_TSW, f_TSE, f_TNW, f_BNE, f_BSW, f_BSE, f_BNW;
 
-      f_W    = (D.f[DIR_P00   ])[ke   ];
-      f_E    = (D.f[DIR_M00   ])[kw   ];
-      f_S    = (D.f[DIR_0P0   ])[kn   ];
-      f_N    = (D.f[DIR_0M0   ])[ks   ];
-      f_B    = (D.f[DIR_00P   ])[kt   ];
-      f_T    = (D.f[DIR_00M   ])[kb   ];
-      f_SW   = (D.f[DIR_PP0  ])[kne  ];
-      f_NE   = (D.f[DIR_MM0  ])[ksw  ];
-      f_NW   = (D.f[DIR_PM0  ])[kse  ];
-      f_SE   = (D.f[DIR_MP0  ])[knw  ];
-      f_BW   = (D.f[DIR_P0P  ])[kte  ];
-      f_TE   = (D.f[DIR_M0M  ])[kbw  ];
-      f_TW   = (D.f[DIR_P0M  ])[kbe  ];
-      f_BE   = (D.f[DIR_M0P  ])[ktw  ];
-      f_BS   = (D.f[DIR_0PP  ])[ktn  ];
-      f_TN   = (D.f[DIR_0MM  ])[kbs  ];
-      f_TS   = (D.f[DIR_0PM  ])[kbn  ];
-      f_BN   = (D.f[DIR_0MP  ])[kts  ];
-      f_BSW  = (D.f[DIR_PPP ])[ktne ];
-      f_BNE  = (D.f[DIR_MMP ])[ktsw ];
-      f_BNW  = (D.f[DIR_PMP ])[ktse ];
-      f_BSE  = (D.f[DIR_MPP ])[ktnw ];
-      f_TSW  = (D.f[DIR_PPM ])[kbne ];
-      f_TNE  = (D.f[DIR_MMM ])[kbsw ];
-      f_TNW  = (D.f[DIR_PMM ])[kbse ];
-      f_TSE  = (D.f[DIR_MPM ])[kbnw ];
+      f_W    = (D.f[DIR_P00])[ke   ];
+      f_E    = (D.f[DIR_M00])[kw   ];
+      f_S    = (D.f[DIR_0P0])[kn   ];
+      f_N    = (D.f[DIR_0M0])[ks   ];
+      f_B    = (D.f[DIR_00P])[kt   ];
+      f_T    = (D.f[DIR_00M])[kb   ];
+      f_SW   = (D.f[DIR_PP0])[kne  ];
+      f_NE   = (D.f[DIR_MM0])[ksw  ];
+      f_NW   = (D.f[DIR_PM0])[kse  ];
+      f_SE   = (D.f[DIR_MP0])[knw  ];
+      f_BW   = (D.f[DIR_P0P])[kte  ];
+      f_TE   = (D.f[DIR_M0M])[kbw  ];
+      f_TW   = (D.f[DIR_P0M])[kbe  ];
+      f_BE   = (D.f[DIR_M0P])[ktw  ];
+      f_BS   = (D.f[DIR_0PP])[ktn  ];
+      f_TN   = (D.f[DIR_0MM])[kbs  ];
+      f_TS   = (D.f[DIR_0PM])[kbn  ];
+      f_BN   = (D.f[DIR_0MP])[kts  ];
+      f_BSW  = (D.f[DIR_PPP])[ktne ];
+      f_BNE  = (D.f[DIR_MMP])[ktsw ];
+      f_BNW  = (D.f[DIR_PMP])[ktse ];
+      f_BSE  = (D.f[DIR_MPP])[ktnw ];
+      f_TSW  = (D.f[DIR_PPM])[kbne ];
+      f_TNE  = (D.f[DIR_MMM])[kbsw ];
+      f_TNW  = (D.f[DIR_PMM])[kbse ];
+      f_TSE  = (D.f[DIR_MPM])[kbnw ];
       ////////////////////////////////////////////////////////////////////////////////
       real vx1, vx2, vx3, drho, feq, q;
       drho   =  f_TSE + f_TNW + f_TNE + f_TSW + f_BSE + f_BNW + f_BNE + f_BSW +
@@ -5408,63 +5417,63 @@ __global__ void QVelDevice27(int inx,
       //////////////////////////////////////////////////////////////////////////
       if (isEvenTimestep==false)
       {
-         D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+         D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
       } 
       else
       {
-         D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+         D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+         D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+         D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+         D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+         D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+         D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+         D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+         D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+         D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+         D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+         D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+         D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+         D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+         D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+         D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+         D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+         D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+         D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+         D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+         D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+         D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+         D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+         D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+         D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+         D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+         D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
       }
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       //Test
@@ -5723,19 +5732,20 @@ __global__ void QVelDevice27(int inx,
 
 
 ////////////////////////////////////////////////////////////////////////////////
-__global__ void PropellerBC(unsigned int* neighborX,
-                                       unsigned int* neighborY,
-                                       unsigned int* neighborZ,
-                                       real* rho,
-                                       real* ux,
-                                       real* uy,
-                                       real* uz,
-                                       int* k_Q, 
-									   unsigned int size_Prop,
-                                       unsigned int size_Mat,
-                                       unsigned int* bcMatD,
-                                       real* DD,
-                                       bool EvenOrOdd)
+__global__ void PropellerBC(
+    unsigned int* neighborX,
+    unsigned int* neighborY,
+    unsigned int* neighborZ,
+    real* rho,
+    real* ux,
+    real* uy,
+    real* uz,
+    int* k_Q, 
+    unsigned int size_Prop,
+    unsigned long long numberOfLBnodes,
+    unsigned int* bcMatD,
+    real* DD,
+    bool EvenOrOdd)
 {
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  x = threadIdx.x;  // Globaler x-Index 
@@ -5754,63 +5764,63 @@ __global__ void PropellerBC(unsigned int* neighborX,
         Distributions27 D;
         if (EvenOrOdd==true)
         {
-			D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-			D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-			D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-			D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-			D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-			D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-			D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-			D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-			D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-			D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-			D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-			D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-			D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-			D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-			D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-			D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-			D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-			D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-			D.f[DIR_000] = &DD[DIR_000*size_Mat];
-			D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-			D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-			D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-			D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-			D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-			D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-			D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-			D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+			D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+			D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+			D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+			D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+			D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+			D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+			D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+			D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+			D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+			D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+			D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+			D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+			D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+			D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+			D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+			D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+			D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+			D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+			D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+			D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+			D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+			D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+			D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+			D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+			D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+			D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+			D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
         }
         else
         {
-			D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-			D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-			D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-			D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-			D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-			D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-			D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-			D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-			D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-			D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-			D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-			D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-			D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-			D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-			D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-			D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-			D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-			D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-			D.f[DIR_000] = &DD[DIR_000*size_Mat];
-			D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-			D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-			D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
-			D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-			D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-			D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-			D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-			D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
+			D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+			D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+			D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+			D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+			D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+			D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+			D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+			D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+			D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+			D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+			D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+			D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+			D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+			D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+			D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+			D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+			D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+			D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+			D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+			D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+			D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+			D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
+			D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+			D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+			D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+			D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+			D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
         }
         //////////////////////////////////////////////////////////////////////////
 		unsigned int KQK = k_Q[k];
@@ -5859,58 +5869,58 @@ __global__ void PropellerBC(unsigned int* neighborX,
 		f_TW, f_TN, f_BS, f_BN, f_TS, f_TNE, f_TSW, f_TSE, f_TNW, f_BNE, f_BSW, f_BSE, f_BNW, f_ZERO;
 
 		f_ZERO= (D.f[DIR_000])[kzero];
-		f_E   = (D.f[DIR_P00   ])[ke   ];
-		f_W   = (D.f[DIR_M00   ])[kw   ];
-		f_N   = (D.f[DIR_0P0   ])[kn   ];
-		f_S   = (D.f[DIR_0M0   ])[ks   ];
-		f_T   = (D.f[DIR_00P   ])[kt   ];
-		f_B   = (D.f[DIR_00M   ])[kb   ];
-		f_NE  = (D.f[DIR_PP0  ])[kne  ];
-		f_SW  = (D.f[DIR_MM0  ])[ksw  ];
-		f_SE  = (D.f[DIR_PM0  ])[kse  ];
-		f_NW  = (D.f[DIR_MP0  ])[knw  ];
-		f_TE  = (D.f[DIR_P0P  ])[kte  ];
-		f_BW  = (D.f[DIR_M0M  ])[kbw  ];
-		f_BE  = (D.f[DIR_P0M  ])[kbe  ];
-		f_TW  = (D.f[DIR_M0P  ])[ktw  ];
-		f_TN  = (D.f[DIR_0PP  ])[ktn  ];
-		f_BS  = (D.f[DIR_0MM  ])[kbs  ];
-		f_BN  = (D.f[DIR_0PM  ])[kbn  ];
-		f_TS  = (D.f[DIR_0MP  ])[kts  ];
-		f_TNE = (D.f[DIR_PPP ])[ktne ];
-		f_BSW = (D.f[DIR_MMM ])[kbsw ];
-		f_BNE = (D.f[DIR_PPM ])[kbne ];
-		f_TSW = (D.f[DIR_MMP ])[ktsw ];
-		f_TSE = (D.f[DIR_PMP ])[ktse ];
-		f_BNW = (D.f[DIR_MPM ])[kbnw ];
-		f_BSE = (D.f[DIR_PMM ])[kbse ];
-		f_TNW = (D.f[DIR_MPP ])[ktnw ];
-		//f_W    = (D.f[DIR_P00   ])[ke   ];
-		//f_E    = (D.f[DIR_M00   ])[kw   ];
-		//f_S    = (D.f[DIR_0P0   ])[kn   ];
-		//f_N    = (D.f[DIR_0M0   ])[ks   ];
-		//f_B    = (D.f[DIR_00P   ])[kt   ];
-		//f_T    = (D.f[DIR_00M   ])[kb   ];
-		//f_SW   = (D.f[DIR_PP0  ])[kne  ];
-		//f_NE   = (D.f[DIR_MM0  ])[ksw  ];
-		//f_NW   = (D.f[DIR_PM0  ])[kse  ];
-		//f_SE   = (D.f[DIR_MP0  ])[knw  ];
-		//f_BW   = (D.f[DIR_P0P  ])[kte  ];
-		//f_TE   = (D.f[DIR_M0M  ])[kbw  ];
-		//f_TW   = (D.f[DIR_P0M  ])[kbe  ];
-		//f_BE   = (D.f[DIR_M0P  ])[ktw  ];
-		//f_BS   = (D.f[DIR_0PP  ])[ktn  ];
-		//f_TN   = (D.f[DIR_0MM  ])[kbs  ];
-		//f_TS   = (D.f[DIR_0PM  ])[kbn  ];
-		//f_BN   = (D.f[DIR_0MP  ])[kts  ];
-		//f_BSW  = (D.f[DIR_PPP ])[ktne ];
-		//f_TNE  = (D.f[DIR_MMM ])[kbsw ];
-		//f_TSW  = (D.f[DIR_PPM ])[kbne ];
-		//f_BNE  = (D.f[DIR_MMP ])[ktsw ];
-		//f_BNW  = (D.f[DIR_PMP ])[ktse ];
-		//f_TSE  = (D.f[DIR_MPM ])[kbnw ];
-		//f_TNW  = (D.f[DIR_PMM ])[kbse ];
-		//f_BSE  = (D.f[DIR_MPP ])[ktnw ];
+		f_E   = (D.f[DIR_P00])[ke   ];
+		f_W   = (D.f[DIR_M00])[kw   ];
+		f_N   = (D.f[DIR_0P0])[kn   ];
+		f_S   = (D.f[DIR_0M0])[ks   ];
+		f_T   = (D.f[DIR_00P])[kt   ];
+		f_B   = (D.f[DIR_00M])[kb   ];
+		f_NE  = (D.f[DIR_PP0])[kne  ];
+		f_SW  = (D.f[DIR_MM0])[ksw  ];
+		f_SE  = (D.f[DIR_PM0])[kse  ];
+		f_NW  = (D.f[DIR_MP0])[knw  ];
+		f_TE  = (D.f[DIR_P0P])[kte  ];
+		f_BW  = (D.f[DIR_M0M])[kbw  ];
+		f_BE  = (D.f[DIR_P0M])[kbe  ];
+		f_TW  = (D.f[DIR_M0P])[ktw  ];
+		f_TN  = (D.f[DIR_0PP])[ktn  ];
+		f_BS  = (D.f[DIR_0MM])[kbs  ];
+		f_BN  = (D.f[DIR_0PM])[kbn  ];
+		f_TS  = (D.f[DIR_0MP])[kts  ];
+		f_TNE = (D.f[DIR_PPP])[ktne ];
+		f_BSW = (D.f[DIR_MMM])[kbsw ];
+		f_BNE = (D.f[DIR_PPM])[kbne ];
+		f_TSW = (D.f[DIR_MMP])[ktsw ];
+		f_TSE = (D.f[DIR_PMP])[ktse ];
+		f_BNW = (D.f[DIR_MPM])[kbnw ];
+		f_BSE = (D.f[DIR_PMM])[kbse ];
+		f_TNW = (D.f[DIR_MPP])[ktnw ];
+		//f_W    = (D.f[DIR_P00])[ke   ];
+		//f_E    = (D.f[DIR_M00])[kw   ];
+		//f_S    = (D.f[DIR_0P0])[kn   ];
+		//f_N    = (D.f[DIR_0M0])[ks   ];
+		//f_B    = (D.f[DIR_00P])[kt   ];
+		//f_T    = (D.f[DIR_00M])[kb   ];
+		//f_SW   = (D.f[DIR_PP0])[kne  ];
+		//f_NE   = (D.f[DIR_MM0])[ksw  ];
+		//f_NW   = (D.f[DIR_PM0])[kse  ];
+		//f_SE   = (D.f[DIR_MP0])[knw  ];
+		//f_BW   = (D.f[DIR_P0P])[kte  ];
+		//f_TE   = (D.f[DIR_M0M])[kbw  ];
+		//f_TW   = (D.f[DIR_P0M])[kbe  ];
+		//f_BE   = (D.f[DIR_M0P])[ktw  ];
+		//f_BS   = (D.f[DIR_0PP])[ktn  ];
+		//f_TN   = (D.f[DIR_0MM])[kbs  ];
+		//f_TS   = (D.f[DIR_0PM])[kbn  ];
+		//f_BN   = (D.f[DIR_0MP])[kts  ];
+		//f_BSW  = (D.f[DIR_PPP])[ktne ];
+		//f_TNE  = (D.f[DIR_MMM])[kbsw ];
+		//f_TSW  = (D.f[DIR_PPM])[kbne ];
+		//f_BNE  = (D.f[DIR_MMP])[ktsw ];
+		//f_BNW  = (D.f[DIR_PMP])[ktse ];
+		//f_TSE  = (D.f[DIR_MPM])[kbnw ];
+		//f_TNW  = (D.f[DIR_PMM])[kbse ];
+		//f_BSE  = (D.f[DIR_MPP])[ktnw ];
 		//////////////////////////////////////////////////////////////////////////////////
 		real vxo1, vxo2, vxo3, drho;
 		drho   =  /*zero;*/f_TSE + f_TNW + f_TNE + f_TSW + f_BSE + f_BNW + f_BNE + f_BSW +
@@ -5992,88 +6002,88 @@ __global__ void PropellerBC(unsigned int* neighborX,
          f_TNW  = f_TNW  + ((c1o1+drho) * (-  c1o216*(c3o1*(-vxo1+vxo2+vxo3)+c9o2*(-vxo1+vxo2+vxo3)*(-vxo1+vxo2+vxo3)-cusq) +   c1o216*(c3o1*(-vx1+vx2+vx3)+c9o2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cusq2)));
 
 		(D.f[DIR_000])[kzero] =  f_ZERO;
-        (D.f[DIR_P00   ])[ke   ] =  f_E   ;	// f_W   ;//    	
-        (D.f[DIR_M00   ])[kw   ] =  f_W   ;	// f_E   ;//    	
-        (D.f[DIR_0P0   ])[kn   ] =  f_N   ;	// f_S   ;//    	
-        (D.f[DIR_0M0   ])[ks   ] =  f_S   ;	// f_N   ;//    	
-        (D.f[DIR_00P   ])[kt   ] =  f_T   ;	// f_B   ;//    	
-        (D.f[DIR_00M   ])[kb   ] =  f_B   ;	// f_T   ;//    	
-        (D.f[DIR_PP0  ])[kne  ] =  f_NE  ;	// f_SW  ;//    	
-        (D.f[DIR_MM0  ])[ksw  ] =  f_SW  ;	// f_NE  ;//    	
-        (D.f[DIR_PM0  ])[kse  ] =  f_SE  ;	// f_NW  ;//    	
-        (D.f[DIR_MP0  ])[knw  ] =  f_NW  ;	// f_SE  ;//    	
-        (D.f[DIR_P0P  ])[kte  ] =  f_TE  ;	// f_BW  ;//    	
-        (D.f[DIR_M0M  ])[kbw  ] =  f_BW  ;	// f_TE  ;//    	
-        (D.f[DIR_P0M  ])[kbe  ] =  f_BE  ;	// f_TW  ;//    	
-        (D.f[DIR_M0P  ])[ktw  ] =  f_TW  ;	// f_BE  ;//    	
-        (D.f[DIR_0PP  ])[ktn  ] =  f_TN  ;	// f_BS  ;//    	
-        (D.f[DIR_0MM  ])[kbs  ] =  f_BS  ;	// f_TN  ;//    	
-        (D.f[DIR_0PM  ])[kbn  ] =  f_BN  ;	// f_TS  ;//    	
-        (D.f[DIR_0MP  ])[kts  ] =  f_TS  ;	// f_BN  ;//    	
-        (D.f[DIR_PPP ])[ktne ] =  f_TNE ;	// f_BSW ;//    	
-        (D.f[DIR_MMM ])[kbsw ] =  f_BSW ;	// f_BNE ;//    	
-        (D.f[DIR_PPM ])[kbne ] =  f_BNE ;	// f_BNW ;//    	
-        (D.f[DIR_MMP ])[ktsw ] =  f_TSW ;	// f_BSE ;//    	
-        (D.f[DIR_PMP ])[ktse ] =  f_TSE ;	// f_TSW ;//    	
-        (D.f[DIR_MPM ])[kbnw ] =  f_BNW ;	// f_TNE ;//    	
-        (D.f[DIR_PMM ])[kbse ] =  f_BSE ;	// f_TNW ;//    	
-        (D.f[DIR_MPP ])[ktnw ] =  f_TNW ;	// f_TSE ;//    	
+        (D.f[DIR_P00])[ke   ] =  f_E   ;	// f_W   ;//    	
+        (D.f[DIR_M00])[kw   ] =  f_W   ;	// f_E   ;//    	
+        (D.f[DIR_0P0])[kn   ] =  f_N   ;	// f_S   ;//    	
+        (D.f[DIR_0M0])[ks   ] =  f_S   ;	// f_N   ;//    	
+        (D.f[DIR_00P])[kt   ] =  f_T   ;	// f_B   ;//    	
+        (D.f[DIR_00M])[kb   ] =  f_B   ;	// f_T   ;//    	
+        (D.f[DIR_PP0])[kne  ] =  f_NE  ;	// f_SW  ;//    	
+        (D.f[DIR_MM0])[ksw  ] =  f_SW  ;	// f_NE  ;//    	
+        (D.f[DIR_PM0])[kse  ] =  f_SE  ;	// f_NW  ;//    	
+        (D.f[DIR_MP0])[knw  ] =  f_NW  ;	// f_SE  ;//    	
+        (D.f[DIR_P0P])[kte  ] =  f_TE  ;	// f_BW  ;//    	
+        (D.f[DIR_M0M])[kbw  ] =  f_BW  ;	// f_TE  ;//    	
+        (D.f[DIR_P0M])[kbe  ] =  f_BE  ;	// f_TW  ;//    	
+        (D.f[DIR_M0P])[ktw  ] =  f_TW  ;	// f_BE  ;//    	
+        (D.f[DIR_0PP])[ktn  ] =  f_TN  ;	// f_BS  ;//    	
+        (D.f[DIR_0MM])[kbs  ] =  f_BS  ;	// f_TN  ;//    	
+        (D.f[DIR_0PM])[kbn  ] =  f_BN  ;	// f_TS  ;//    	
+        (D.f[DIR_0MP])[kts  ] =  f_TS  ;	// f_BN  ;//    	
+        (D.f[DIR_PPP])[ktne ] =  f_TNE ;	// f_BSW ;//    	
+        (D.f[DIR_MMM])[kbsw ] =  f_BSW ;	// f_BNE ;//    	
+        (D.f[DIR_PPM])[kbne ] =  f_BNE ;	// f_BNW ;//    	
+        (D.f[DIR_MMP])[ktsw ] =  f_TSW ;	// f_BSE ;//    	
+        (D.f[DIR_PMP])[ktse ] =  f_TSE ;	// f_TSW ;//    	
+        (D.f[DIR_MPM])[kbnw ] =  f_BNW ;	// f_TNE ;//    	
+        (D.f[DIR_PMM])[kbse ] =  f_BSE ;	// f_TNW ;//    	
+        (D.f[DIR_MPP])[ktnw ] =  f_TNW ;	// f_TSE ;//    	
 
 		//////////////////////////////////////////////////////////////////////////
         ////(D.f[DIR_000])[kzero] =   c8over27* (drho-cu_sq);
-        //(D.f[DIR_P00   ])[ke   ] =   three*c2over27* ( vx1        );		//six
-        //(D.f[DIR_M00   ])[kw   ] =   three*c2over27* (-vx1        );		//six
-        //(D.f[DIR_0P0   ])[kn   ] =   three*c2over27* (     vx2    );		//six
-        //(D.f[DIR_0M0   ])[ks   ] =   three*c2over27* (    -vx2    );		//six
-        //(D.f[DIR_00P   ])[kt   ] =   three*c2over27* (         vx3);		//six
-        //(D.f[DIR_00M   ])[kb   ] =   three*c2over27* (        -vx3);		//six
-        //(D.f[DIR_PP0  ])[kne  ] =   three*c1over54* ( vx1+vx2    );		//six
-        //(D.f[DIR_MM0  ])[ksw  ] =   three*c1over54* (-vx1-vx2    );		//six
-        //(D.f[DIR_PM0  ])[kse  ] =   three*c1over54* ( vx1-vx2    );		//six
-        //(D.f[DIR_MP0  ])[knw  ] =   three*c1over54* (-vx1+vx2    );		//six
-        //(D.f[DIR_P0P  ])[kte  ] =   three*c1over54* ( vx1    +vx3);		//six
-        //(D.f[DIR_M0M  ])[kbw  ] =   three*c1over54* (-vx1    -vx3);		//six
-        //(D.f[DIR_P0M  ])[kbe  ] =   three*c1over54* ( vx1    -vx3);		//six
-        //(D.f[DIR_M0P  ])[ktw  ] =   three*c1over54* (-vx1    +vx3);		//six
-        //(D.f[DIR_0PP  ])[ktn  ] =   three*c1over54* (     vx2+vx3);		//six
-        //(D.f[DIR_0MM  ])[kbs  ] =   three*c1over54* (    -vx2-vx3);		//six
-        //(D.f[DIR_0PM  ])[kbn  ] =   three*c1over54* (     vx2-vx3);		//six
-        //(D.f[DIR_0MP  ])[kts  ] =   three*c1over54* (    -vx2+vx3);		//six
-        //(D.f[DIR_PPP ])[ktne ] =   three*c1over216*( vx1+vx2+vx3);		//six
-        //(D.f[DIR_MMM ])[kbsw ] =   three*c1over216*(-vx1-vx2-vx3);		//six
-        //(D.f[DIR_PPM ])[kbne ] =   three*c1over216*( vx1+vx2-vx3);		//six
-        //(D.f[DIR_MMP ])[ktsw ] =   three*c1over216*(-vx1-vx2+vx3);		//six
-        //(D.f[DIR_PMP ])[ktse ] =   three*c1over216*( vx1-vx2+vx3);		//six
-        //(D.f[DIR_MPM ])[kbnw ] =   three*c1over216*(-vx1+vx2-vx3);		//six
-        //(D.f[DIR_PMM ])[kbse ] =   three*c1over216*( vx1-vx2-vx3);		//six
-        //(D.f[DIR_MPP ])[ktnw ] =   three*c1over216*(-vx1+vx2+vx3);		//six
+        //(D.f[DIR_P00])[ke   ] =   three*c2over27* ( vx1        );		//six
+        //(D.f[DIR_M00])[kw   ] =   three*c2over27* (-vx1        );		//six
+        //(D.f[DIR_0P0])[kn   ] =   three*c2over27* (     vx2    );		//six
+        //(D.f[DIR_0M0])[ks   ] =   three*c2over27* (    -vx2    );		//six
+        //(D.f[DIR_00P])[kt   ] =   three*c2over27* (         vx3);		//six
+        //(D.f[DIR_00M])[kb   ] =   three*c2over27* (        -vx3);		//six
+        //(D.f[DIR_PP0])[kne  ] =   three*c1over54* ( vx1+vx2    );		//six
+        //(D.f[DIR_MM0])[ksw  ] =   three*c1over54* (-vx1-vx2    );		//six
+        //(D.f[DIR_PM0])[kse  ] =   three*c1over54* ( vx1-vx2    );		//six
+        //(D.f[DIR_MP0])[knw  ] =   three*c1over54* (-vx1+vx2    );		//six
+        //(D.f[DIR_P0P])[kte  ] =   three*c1over54* ( vx1    +vx3);		//six
+        //(D.f[DIR_M0M])[kbw  ] =   three*c1over54* (-vx1    -vx3);		//six
+        //(D.f[DIR_P0M])[kbe  ] =   three*c1over54* ( vx1    -vx3);		//six
+        //(D.f[DIR_M0P])[ktw  ] =   three*c1over54* (-vx1    +vx3);		//six
+        //(D.f[DIR_0PP])[ktn  ] =   three*c1over54* (     vx2+vx3);		//six
+        //(D.f[DIR_0MM])[kbs  ] =   three*c1over54* (    -vx2-vx3);		//six
+        //(D.f[DIR_0PM])[kbn  ] =   three*c1over54* (     vx2-vx3);		//six
+        //(D.f[DIR_0MP])[kts  ] =   three*c1over54* (    -vx2+vx3);		//six
+        //(D.f[DIR_PPP])[ktne ] =   three*c1over216*( vx1+vx2+vx3);		//six
+        //(D.f[DIR_MMM])[kbsw ] =   three*c1over216*(-vx1-vx2-vx3);		//six
+        //(D.f[DIR_PPM])[kbne ] =   three*c1over216*( vx1+vx2-vx3);		//six
+        //(D.f[DIR_MMP])[ktsw ] =   three*c1over216*(-vx1-vx2+vx3);		//six
+        //(D.f[DIR_PMP])[ktse ] =   three*c1over216*( vx1-vx2+vx3);		//six
+        //(D.f[DIR_MPM])[kbnw ] =   three*c1over216*(-vx1+vx2-vx3);		//six
+        //(D.f[DIR_PMM])[kbse ] =   three*c1over216*( vx1-vx2-vx3);		//six
+        //(D.f[DIR_MPP])[ktnw ] =   three*c1over216*(-vx1+vx2+vx3);		//six
         //(D.f[DIR_000])[kzero] =   c8over27* (drho-cu_sq);
-        //(D.f[DIR_P00   ])[ke   ] =   c2over27* (drho+three*( vx1        )+c9over2*( vx1        )*( vx1        )-cu_sq);
-        //(D.f[DIR_M00   ])[kw   ] =   c2over27* (drho+three*(-vx1        )+c9over2*(-vx1        )*(-vx1        )-cu_sq);
-        //(D.f[DIR_0P0   ])[kn   ] =   c2over27* (drho+three*(    vx2     )+c9over2*(     vx2    )*(     vx2    )-cu_sq);
-        //(D.f[DIR_0M0   ])[ks   ] =   c2over27* (drho+three*(   -vx2     )+c9over2*(    -vx2    )*(    -vx2    )-cu_sq);
-        //(D.f[DIR_00P   ])[kt   ] =   c2over27* (drho+three*(         vx3)+c9over2*(         vx3)*(         vx3)-cu_sq);
-        //(D.f[DIR_00M   ])[kb   ] =   c2over27* (drho+three*(        -vx3)+c9over2*(        -vx3)*(        -vx3)-cu_sq);
-        //(D.f[DIR_PP0  ])[kne  ] =   c1over54* (drho+three*( vx1+vx2    )+c9over2*( vx1+vx2    )*( vx1+vx2    )-cu_sq);
-        //(D.f[DIR_MM0  ])[ksw  ] =   c1over54* (drho+three*(-vx1-vx2    )+c9over2*(-vx1-vx2    )*(-vx1-vx2    )-cu_sq);
-        //(D.f[DIR_PM0  ])[kse  ] =   c1over54* (drho+three*( vx1-vx2    )+c9over2*( vx1-vx2    )*( vx1-vx2    )-cu_sq);
-        //(D.f[DIR_MP0  ])[knw  ] =   c1over54* (drho+three*(-vx1+vx2    )+c9over2*(-vx1+vx2    )*(-vx1+vx2    )-cu_sq);
-        //(D.f[DIR_P0P  ])[kte  ] =   c1over54* (drho+three*( vx1    +vx3)+c9over2*( vx1    +vx3)*( vx1    +vx3)-cu_sq);
-        //(D.f[DIR_M0M  ])[kbw  ] =   c1over54* (drho+three*(-vx1    -vx3)+c9over2*(-vx1    -vx3)*(-vx1    -vx3)-cu_sq);
-        //(D.f[DIR_P0M  ])[kbe  ] =   c1over54* (drho+three*( vx1    -vx3)+c9over2*( vx1    -vx3)*( vx1    -vx3)-cu_sq);
-        //(D.f[DIR_M0P  ])[ktw  ] =   c1over54* (drho+three*(-vx1    +vx3)+c9over2*(-vx1    +vx3)*(-vx1    +vx3)-cu_sq);
-        //(D.f[DIR_0PP  ])[ktn  ] =   c1over54* (drho+three*(     vx2+vx3)+c9over2*(     vx2+vx3)*(     vx2+vx3)-cu_sq);
-        //(D.f[DIR_0MM  ])[kbs  ] =   c1over54* (drho+three*(    -vx2-vx3)+c9over2*(    -vx2-vx3)*(    -vx2-vx3)-cu_sq);
-        //(D.f[DIR_0PM  ])[kbn  ] =   c1over54* (drho+three*(     vx2-vx3)+c9over2*(     vx2-vx3)*(     vx2-vx3)-cu_sq);
-        //(D.f[DIR_0MP  ])[kts  ] =   c1over54* (drho+three*(    -vx2+vx3)+c9over2*(    -vx2+vx3)*(    -vx2+vx3)-cu_sq);
-        //(D.f[DIR_PPP ])[ktne ] =   c1over216*(drho+three*( vx1+vx2+vx3)+c9over2*( vx1+vx2+vx3)*( vx1+vx2+vx3)-cu_sq);
-        //(D.f[DIR_MMM ])[kbsw ] =   c1over216*(drho+three*(-vx1-vx2-vx3)+c9over2*(-vx1-vx2-vx3)*(-vx1-vx2-vx3)-cu_sq);
-        //(D.f[DIR_PPM ])[kbne ] =   c1over216*(drho+three*( vx1+vx2-vx3)+c9over2*( vx1+vx2-vx3)*( vx1+vx2-vx3)-cu_sq);
-        //(D.f[DIR_MMP ])[ktsw ] =   c1over216*(drho+three*(-vx1-vx2+vx3)+c9over2*(-vx1-vx2+vx3)*(-vx1-vx2+vx3)-cu_sq);
-        //(D.f[DIR_PMP ])[ktse ] =   c1over216*(drho+three*( vx1-vx2+vx3)+c9over2*( vx1-vx2+vx3)*( vx1-vx2+vx3)-cu_sq);
-        //(D.f[DIR_MPM ])[kbnw ] =   c1over216*(drho+three*(-vx1+vx2-vx3)+c9over2*(-vx1+vx2-vx3)*(-vx1+vx2-vx3)-cu_sq);
-        //(D.f[DIR_PMM ])[kbse ] =   c1over216*(drho+three*( vx1-vx2-vx3)+c9over2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cu_sq);
-        //(D.f[DIR_MPP ])[ktnw ] =   c1over216*(drho+three*(-vx1+vx2+vx3)+c9over2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cu_sq);
+        //(D.f[DIR_P00])[ke   ] =   c2over27* (drho+three*( vx1        )+c9over2*( vx1        )*( vx1        )-cu_sq);
+        //(D.f[DIR_M00])[kw   ] =   c2over27* (drho+three*(-vx1        )+c9over2*(-vx1        )*(-vx1        )-cu_sq);
+        //(D.f[DIR_0P0])[kn   ] =   c2over27* (drho+three*(    vx2     )+c9over2*(     vx2    )*(     vx2    )-cu_sq);
+        //(D.f[DIR_0M0])[ks   ] =   c2over27* (drho+three*(   -vx2     )+c9over2*(    -vx2    )*(    -vx2    )-cu_sq);
+        //(D.f[DIR_00P])[kt   ] =   c2over27* (drho+three*(         vx3)+c9over2*(         vx3)*(         vx3)-cu_sq);
+        //(D.f[DIR_00M])[kb   ] =   c2over27* (drho+three*(        -vx3)+c9over2*(        -vx3)*(        -vx3)-cu_sq);
+        //(D.f[DIR_PP0])[kne  ] =   c1over54* (drho+three*( vx1+vx2    )+c9over2*( vx1+vx2    )*( vx1+vx2    )-cu_sq);
+        //(D.f[DIR_MM0])[ksw  ] =   c1over54* (drho+three*(-vx1-vx2    )+c9over2*(-vx1-vx2    )*(-vx1-vx2    )-cu_sq);
+        //(D.f[DIR_PM0])[kse  ] =   c1over54* (drho+three*( vx1-vx2    )+c9over2*( vx1-vx2    )*( vx1-vx2    )-cu_sq);
+        //(D.f[DIR_MP0])[knw  ] =   c1over54* (drho+three*(-vx1+vx2    )+c9over2*(-vx1+vx2    )*(-vx1+vx2    )-cu_sq);
+        //(D.f[DIR_P0P])[kte  ] =   c1over54* (drho+three*( vx1    +vx3)+c9over2*( vx1    +vx3)*( vx1    +vx3)-cu_sq);
+        //(D.f[DIR_M0M])[kbw  ] =   c1over54* (drho+three*(-vx1    -vx3)+c9over2*(-vx1    -vx3)*(-vx1    -vx3)-cu_sq);
+        //(D.f[DIR_P0M])[kbe  ] =   c1over54* (drho+three*( vx1    -vx3)+c9over2*( vx1    -vx3)*( vx1    -vx3)-cu_sq);
+        //(D.f[DIR_M0P])[ktw  ] =   c1over54* (drho+three*(-vx1    +vx3)+c9over2*(-vx1    +vx3)*(-vx1    +vx3)-cu_sq);
+        //(D.f[DIR_0PP])[ktn  ] =   c1over54* (drho+three*(     vx2+vx3)+c9over2*(     vx2+vx3)*(     vx2+vx3)-cu_sq);
+        //(D.f[DIR_0MM])[kbs  ] =   c1over54* (drho+three*(    -vx2-vx3)+c9over2*(    -vx2-vx3)*(    -vx2-vx3)-cu_sq);
+        //(D.f[DIR_0PM])[kbn  ] =   c1over54* (drho+three*(     vx2-vx3)+c9over2*(     vx2-vx3)*(     vx2-vx3)-cu_sq);
+        //(D.f[DIR_0MP])[kts  ] =   c1over54* (drho+three*(    -vx2+vx3)+c9over2*(    -vx2+vx3)*(    -vx2+vx3)-cu_sq);
+        //(D.f[DIR_PPP])[ktne ] =   c1over216*(drho+three*( vx1+vx2+vx3)+c9over2*( vx1+vx2+vx3)*( vx1+vx2+vx3)-cu_sq);
+        //(D.f[DIR_MMM])[kbsw ] =   c1over216*(drho+three*(-vx1-vx2-vx3)+c9over2*(-vx1-vx2-vx3)*(-vx1-vx2-vx3)-cu_sq);
+        //(D.f[DIR_PPM])[kbne ] =   c1over216*(drho+three*( vx1+vx2-vx3)+c9over2*( vx1+vx2-vx3)*( vx1+vx2-vx3)-cu_sq);
+        //(D.f[DIR_MMP])[ktsw ] =   c1over216*(drho+three*(-vx1-vx2+vx3)+c9over2*(-vx1-vx2+vx3)*(-vx1-vx2+vx3)-cu_sq);
+        //(D.f[DIR_PMP])[ktse ] =   c1over216*(drho+three*( vx1-vx2+vx3)+c9over2*( vx1-vx2+vx3)*( vx1-vx2+vx3)-cu_sq);
+        //(D.f[DIR_MPM])[kbnw ] =   c1over216*(drho+three*(-vx1+vx2-vx3)+c9over2*(-vx1+vx2-vx3)*(-vx1+vx2-vx3)-cu_sq);
+        //(D.f[DIR_PMM])[kbse ] =   c1over216*(drho+three*( vx1-vx2-vx3)+c9over2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cu_sq);
+        //(D.f[DIR_MPP])[ktnw ] =   c1over216*(drho+three*(-vx1+vx2+vx3)+c9over2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cu_sq);
 		}
     }
 }
diff --git a/src/gpu/VirtualFluids_GPU/GPU/WaleCumulant27.cu b/src/gpu/VirtualFluids_GPU/GPU/WaleCumulant27.cu
index 16028e2f9f87716f43ed60f82ed513289e381b7c..cbb892296322bc164241ad18c8ab63201d34647e 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/WaleCumulant27.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/WaleCumulant27.cu
@@ -23,7 +23,7 @@ __global__ void LB_Kernel_WaleBySoniMalav_Cum_AA2016_Comp_SP_27(
 	real* veloZ,
 	real* DDStart,
 	real* turbulentViscosity,
-	int size_Mat,
+	unsigned long long numberOfLBnodes,
 	int level,
 	real* forces,
 	bool EvenOrOdd)
@@ -39,7 +39,7 @@ __global__ void LB_Kernel_WaleBySoniMalav_Cum_AA2016_Comp_SP_27(
 	const unsigned k = nx*(ny*z + y) + x;
 	//////////////////////////////////////////////////////////////////////////
 
-	if (k<size_Mat)
+	if (k<numberOfLBnodes)
 	{
 		////////////////////////////////////////////////////////////////////////////////
 		unsigned int BC;
@@ -50,63 +50,63 @@ __global__ void LB_Kernel_WaleBySoniMalav_Cum_AA2016_Comp_SP_27(
 			Distributions27 D;
 			if (EvenOrOdd == true)
 			{
-				D.f[DIR_P00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_M00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_P00 * numberOfLBnodes];
+				D.f[DIR_M00] = &DDStart[DIR_M00 * numberOfLBnodes];
+				D.f[DIR_0P0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+				D.f[DIR_0M0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+				D.f[DIR_00P] = &DDStart[DIR_00P * numberOfLBnodes];
+				D.f[DIR_00M] = &DDStart[DIR_00M * numberOfLBnodes];
+				D.f[DIR_PP0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+				D.f[DIR_MM0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+				D.f[DIR_PM0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+				D.f[DIR_MP0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+				D.f[DIR_P0P] = &DDStart[DIR_P0P * numberOfLBnodes];
+				D.f[DIR_M0M] = &DDStart[DIR_M0M * numberOfLBnodes];
+				D.f[DIR_P0M] = &DDStart[DIR_P0M * numberOfLBnodes];
+				D.f[DIR_M0P] = &DDStart[DIR_M0P * numberOfLBnodes];
+				D.f[DIR_0PP] = &DDStart[DIR_0PP * numberOfLBnodes];
+				D.f[DIR_0MM] = &DDStart[DIR_0MM * numberOfLBnodes];
+				D.f[DIR_0PM] = &DDStart[DIR_0PM * numberOfLBnodes];
+				D.f[DIR_0MP] = &DDStart[DIR_0MP * numberOfLBnodes];
+				D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+				D.f[DIR_PPP] = &DDStart[DIR_PPP * numberOfLBnodes];
+				D.f[DIR_MMP] = &DDStart[DIR_MMP * numberOfLBnodes];
+				D.f[DIR_PMP] = &DDStart[DIR_PMP * numberOfLBnodes];
+				D.f[DIR_MPP] = &DDStart[DIR_MPP * numberOfLBnodes];
+				D.f[DIR_PPM] = &DDStart[DIR_PPM * numberOfLBnodes];
+				D.f[DIR_MMM] = &DDStart[DIR_MMM * numberOfLBnodes];
+				D.f[DIR_PMM] = &DDStart[DIR_PMM * numberOfLBnodes];
+				D.f[DIR_MPM] = &DDStart[DIR_MPM * numberOfLBnodes];
 			}
 			else
 			{
-				D.f[DIR_M00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_P00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_P00 * numberOfLBnodes];
+				D.f[DIR_P00] = &DDStart[DIR_M00 * numberOfLBnodes];
+				D.f[DIR_0M0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+				D.f[DIR_0P0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+				D.f[DIR_00M] = &DDStart[DIR_00P * numberOfLBnodes];
+				D.f[DIR_00P] = &DDStart[DIR_00M * numberOfLBnodes];
+				D.f[DIR_MM0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+				D.f[DIR_PP0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+				D.f[DIR_MP0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+				D.f[DIR_PM0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+				D.f[DIR_M0M] = &DDStart[DIR_P0P * numberOfLBnodes];
+				D.f[DIR_P0P] = &DDStart[DIR_M0M * numberOfLBnodes];
+				D.f[DIR_M0P] = &DDStart[DIR_P0M * numberOfLBnodes];
+				D.f[DIR_P0M] = &DDStart[DIR_M0P * numberOfLBnodes];
+				D.f[DIR_0MM] = &DDStart[DIR_0PP * numberOfLBnodes];
+				D.f[DIR_0PP] = &DDStart[DIR_0MM * numberOfLBnodes];
+				D.f[DIR_0MP] = &DDStart[DIR_0PM * numberOfLBnodes];
+				D.f[DIR_0PM] = &DDStart[DIR_0MP * numberOfLBnodes];
+				D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+				D.f[DIR_MMM] = &DDStart[DIR_PPP * numberOfLBnodes];
+				D.f[DIR_PPM] = &DDStart[DIR_MMP * numberOfLBnodes];
+				D.f[DIR_MPM] = &DDStart[DIR_PMP * numberOfLBnodes];
+				D.f[DIR_PMM] = &DDStart[DIR_MPP * numberOfLBnodes];
+				D.f[DIR_MMP] = &DDStart[DIR_PPM * numberOfLBnodes];
+				D.f[DIR_PPP] = &DDStart[DIR_MMM * numberOfLBnodes];
+				D.f[DIR_MPP] = &DDStart[DIR_PMM * numberOfLBnodes];
+				D.f[DIR_PMP] = &DDStart[DIR_MPM * numberOfLBnodes];
 			}
 
 			////////////////////////////////////////////////////////////////////////////////
diff --git a/src/gpu/VirtualFluids_GPU/GPU/WallFunction.cu b/src/gpu/VirtualFluids_GPU/GPU/WallFunction.cu
index d48fa80fd14ce15f4a380ed46403654b43c805e8..d2fe5935af9b2d3ad78f492e3a9d182873d20808 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/WallFunction.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/WallFunction.cu
@@ -20,69 +20,69 @@ __global__ void WallFunction27(
 										  unsigned int* neighborX,
 										  unsigned int* neighborY,
 										  unsigned int* neighborZ,
-										  unsigned int size_Mat, 
+										  unsigned long long numberOfLBnodes, 
 										  bool isEvenTimestep)
 {
    Distributions27 D;
    if (isEvenTimestep==true)
    {
-      D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+      D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
    } 
    else
    {
-      D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-      D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-      D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-      D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-      D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-      D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-      D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-      D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-      D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-      D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-      D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-      D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-      D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-      D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-      D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-      D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-      D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-      D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-      D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-      D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-      D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-      D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-      D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-      D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-      D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-      D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+      D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
+      D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
+      D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
+      D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
+      D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
+      D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
+      D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
+      D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
+      D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
+      D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
+      D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
+      D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
+      D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
+      D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
+      D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
+      D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
+      D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
+      D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
+      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
+      D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
+      D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
+      D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
+      D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
+      D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
+      D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
+      D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
+      D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
    }
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  x = threadIdx.x;  // Globaler x-Index 
@@ -107,24 +107,24 @@ __global__ void WallFunction27(
       //      *q_dirBE,  *q_dirTW,  *q_dirTN,  *q_dirBS,  *q_dirBN,  *q_dirTS,
       //      *q_dirTNE, *q_dirTSW, *q_dirTSE, *q_dirTNW, *q_dirBNE, *q_dirBSW,
       //      *q_dirBSE, *q_dirBNW; 
-      //q_dirE   = &QQ[DIR_P00   * numberOfBCnodes];
-      //q_dirW   = &QQ[DIR_M00   * numberOfBCnodes];
-      //q_dirN   = &QQ[DIR_0P0   * numberOfBCnodes];
-      //q_dirS   = &QQ[DIR_0M0   * numberOfBCnodes];
-      //q_dirT   = &QQ[DIR_00P   * numberOfBCnodes];
-      //q_dirB   = &QQ[DIR_00M   * numberOfBCnodes];
-      //q_dirNE  = &QQ[DIR_PP0  * numberOfBCnodes];
-      //q_dirSW  = &QQ[DIR_MM0  * numberOfBCnodes];
-      //q_dirSE  = &QQ[DIR_PM0  * numberOfBCnodes];
-      //q_dirNW  = &QQ[DIR_MP0  * numberOfBCnodes];
-      //q_dirTE  = &QQ[DIR_P0P  * numberOfBCnodes];
-      //q_dirBW  = &QQ[DIR_M0M  * numberOfBCnodes];
-      //q_dirBE  = &QQ[DIR_P0M  * numberOfBCnodes];
-      //q_dirTW  = &QQ[DIR_M0P  * numberOfBCnodes];
-      //q_dirTN  = &QQ[DIR_0PP  * numberOfBCnodes];
-      //q_dirBS  = &QQ[DIR_0MM  * numberOfBCnodes];
-      //q_dirBN  = &QQ[DIR_0PM  * numberOfBCnodes];
-      //q_dirTS  = &QQ[DIR_0MP  * numberOfBCnodes];
+      //q_dirE   = &QQ[DIR_P00 * numberOfBCnodes];
+      //q_dirW   = &QQ[DIR_M00 * numberOfBCnodes];
+      //q_dirN   = &QQ[DIR_0P0 * numberOfBCnodes];
+      //q_dirS   = &QQ[DIR_0M0 * numberOfBCnodes];
+      //q_dirT   = &QQ[DIR_00P * numberOfBCnodes];
+      //q_dirB   = &QQ[DIR_00M * numberOfBCnodes];
+      //q_dirNE  = &QQ[DIR_PP0 * numberOfBCnodes];
+      //q_dirSW  = &QQ[DIR_MM0 * numberOfBCnodes];
+      //q_dirSE  = &QQ[DIR_PM0 * numberOfBCnodes];
+      //q_dirNW  = &QQ[DIR_MP0 * numberOfBCnodes];
+      //q_dirTE  = &QQ[DIR_P0P * numberOfBCnodes];
+      //q_dirBW  = &QQ[DIR_M0M * numberOfBCnodes];
+      //q_dirBE  = &QQ[DIR_P0M * numberOfBCnodes];
+      //q_dirTW  = &QQ[DIR_M0P * numberOfBCnodes];
+      //q_dirTN  = &QQ[DIR_0PP * numberOfBCnodes];
+      //q_dirBS  = &QQ[DIR_0MM * numberOfBCnodes];
+      //q_dirBN  = &QQ[DIR_0PM * numberOfBCnodes];
+      //q_dirTS  = &QQ[DIR_0MP * numberOfBCnodes];
       //q_dirTNE = &QQ[DIR_PPP * numberOfBCnodes];
       //q_dirTSW = &QQ[DIR_MMP * numberOfBCnodes];
       //q_dirTSE = &QQ[DIR_PMP * numberOfBCnodes];
@@ -167,32 +167,32 @@ __global__ void WallFunction27(
       real f_E,  f_W,  f_N,  f_S,  f_T,  f_B,   f_NE,  f_SW,  f_SE,  f_NW,  f_TE,  f_BW,  f_BE,
          f_TW, f_TN, f_BS, f_BN, f_TS, f_TNE, f_TSW, f_TSE, f_TNW, f_BNE, f_BSW, f_BSE, f_BNW;
 
-      f_W    = (D.f[DIR_P00   ])[ke   ];
-      f_E    = (D.f[DIR_M00   ])[kw   ];
-      f_S    = (D.f[DIR_0P0   ])[kn   ];
-      f_N    = (D.f[DIR_0M0   ])[ks   ];
-      f_B    = (D.f[DIR_00P   ])[kt   ];
-      f_T    = (D.f[DIR_00M   ])[kb   ];
-      f_SW   = (D.f[DIR_PP0  ])[kne  ];
-      f_NE   = (D.f[DIR_MM0  ])[ksw  ];
-      f_NW   = (D.f[DIR_PM0  ])[kse  ];
-      f_SE   = (D.f[DIR_MP0  ])[knw  ];
-      f_BW   = (D.f[DIR_P0P  ])[kte  ];
-      f_TE   = (D.f[DIR_M0M  ])[kbw  ];
-      f_TW   = (D.f[DIR_P0M  ])[kbe  ];
-      f_BE   = (D.f[DIR_M0P  ])[ktw  ];
-      f_BS   = (D.f[DIR_0PP  ])[ktn  ];
-      f_TN   = (D.f[DIR_0MM  ])[kbs  ];
-      f_TS   = (D.f[DIR_0PM  ])[kbn  ];
-      f_BN   = (D.f[DIR_0MP  ])[kts  ];
-      f_BSW  = (D.f[DIR_PPP ])[ktne ];
-      f_BNE  = (D.f[DIR_MMP ])[ktsw ];
-      f_BNW  = (D.f[DIR_PMP ])[ktse ];
-      f_BSE  = (D.f[DIR_MPP ])[ktnw ];
-      f_TSW  = (D.f[DIR_PPM ])[kbne ];
-      f_TNE  = (D.f[DIR_MMM ])[kbsw ];
-      f_TNW  = (D.f[DIR_PMM ])[kbse ];
-      f_TSE  = (D.f[DIR_MPM ])[kbnw ];
+      f_W    = (D.f[DIR_P00])[ke   ];
+      f_E    = (D.f[DIR_M00])[kw   ];
+      f_S    = (D.f[DIR_0P0])[kn   ];
+      f_N    = (D.f[DIR_0M0])[ks   ];
+      f_B    = (D.f[DIR_00P])[kt   ];
+      f_T    = (D.f[DIR_00M])[kb   ];
+      f_SW   = (D.f[DIR_PP0])[kne  ];
+      f_NE   = (D.f[DIR_MM0])[ksw  ];
+      f_NW   = (D.f[DIR_PM0])[kse  ];
+      f_SE   = (D.f[DIR_MP0])[knw  ];
+      f_BW   = (D.f[DIR_P0P])[kte  ];
+      f_TE   = (D.f[DIR_M0M])[kbw  ];
+      f_TW   = (D.f[DIR_P0M])[kbe  ];
+      f_BE   = (D.f[DIR_M0P])[ktw  ];
+      f_BS   = (D.f[DIR_0PP])[ktn  ];
+      f_TN   = (D.f[DIR_0MM])[kbs  ];
+      f_TS   = (D.f[DIR_0PM])[kbn  ];
+      f_BN   = (D.f[DIR_0MP])[kts  ];
+      f_BSW  = (D.f[DIR_PPP])[ktne ];
+      f_BNE  = (D.f[DIR_MMP])[ktsw ];
+      f_BNW  = (D.f[DIR_PMP])[ktse ];
+      f_BSE  = (D.f[DIR_MPP])[ktnw ];
+      f_TSW  = (D.f[DIR_PPM])[kbne ];
+      f_TNE  = (D.f[DIR_MMM])[kbsw ];
+      f_TNW  = (D.f[DIR_PMM])[kbse ];
+      f_TSE  = (D.f[DIR_MPM])[kbnw ];
       ////////////////////////////////////////////////////////////////////////////////
       // real vx2, vx3, feq, q;
       real vx1, drho;
@@ -234,63 +234,63 @@ __global__ void WallFunction27(
    //   //////////////////////////////////////////////////////////////////////////
    //   if (isEvenTimestep==false)
    //   {
-   //      D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-   //      D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-   //      D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-   //      D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-   //      D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-   //      D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-   //      D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-   //      D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-   //      D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-   //      D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-   //      D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-   //      D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-   //      D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-   //      D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-   //      D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-   //      D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-   //      D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-   //      D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-   //      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-   //      D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-   //      D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-   //      D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-   //      D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-   //      D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-   //      D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-   //      D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-   //      D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
+   //      D.f[DIR_P00] = &DD[DIR_P00 * size_Mat];
+   //      D.f[DIR_M00] = &DD[DIR_M00 * size_Mat];
+   //      D.f[DIR_0P0] = &DD[DIR_0P0 * size_Mat];
+   //      D.f[DIR_0M0] = &DD[DIR_0M0 * size_Mat];
+   //      D.f[DIR_00P] = &DD[DIR_00P * size_Mat];
+   //      D.f[DIR_00M] = &DD[DIR_00M * size_Mat];
+   //      D.f[DIR_PP0] = &DD[DIR_PP0 * size_Mat];
+   //      D.f[DIR_MM0] = &DD[DIR_MM0 * size_Mat];
+   //      D.f[DIR_PM0] = &DD[DIR_PM0 * size_Mat];
+   //      D.f[DIR_MP0] = &DD[DIR_MP0 * size_Mat];
+   //      D.f[DIR_P0P] = &DD[DIR_P0P * size_Mat];
+   //      D.f[DIR_M0M] = &DD[DIR_M0M * size_Mat];
+   //      D.f[DIR_P0M] = &DD[DIR_P0M * size_Mat];
+   //      D.f[DIR_M0P] = &DD[DIR_M0P * size_Mat];
+   //      D.f[DIR_0PP] = &DD[DIR_0PP * size_Mat];
+   //      D.f[DIR_0MM] = &DD[DIR_0MM * size_Mat];
+   //      D.f[DIR_0PM] = &DD[DIR_0PM * size_Mat];
+   //      D.f[DIR_0MP] = &DD[DIR_0MP * size_Mat];
+   //      D.f[DIR_000] = &DD[DIR_000 * size_Mat];
+   //      D.f[DIR_PPP] = &DD[DIR_PPP * size_Mat];
+   //      D.f[DIR_MMP] = &DD[DIR_MMP * size_Mat];
+   //      D.f[DIR_PMP] = &DD[DIR_PMP * size_Mat];
+   //      D.f[DIR_MPP] = &DD[DIR_MPP * size_Mat];
+   //      D.f[DIR_PPM] = &DD[DIR_PPM * size_Mat];
+   //      D.f[DIR_MMM] = &DD[DIR_MMM * size_Mat];
+   //      D.f[DIR_PMM] = &DD[DIR_PMM * size_Mat];
+   //      D.f[DIR_MPM] = &DD[DIR_MPM * size_Mat];
    //   } 
    //   else
    //   {
-   //      D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-   //      D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-   //      D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-   //      D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-   //      D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-   //      D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-   //      D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-   //      D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-   //      D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-   //      D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-   //      D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-   //      D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-   //      D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-   //      D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-   //      D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-   //      D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-   //      D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-   //      D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-   //      D.f[DIR_000] = &DD[DIR_000*size_Mat];
-   //      D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-   //      D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-   //      D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-   //      D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-   //      D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-   //      D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-   //      D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-   //      D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
+   //      D.f[DIR_M00] = &DD[DIR_P00 * size_Mat];
+   //      D.f[DIR_P00] = &DD[DIR_M00 * size_Mat];
+   //      D.f[DIR_0M0] = &DD[DIR_0P0 * size_Mat];
+   //      D.f[DIR_0P0] = &DD[DIR_0M0 * size_Mat];
+   //      D.f[DIR_00M] = &DD[DIR_00P * size_Mat];
+   //      D.f[DIR_00P] = &DD[DIR_00M * size_Mat];
+   //      D.f[DIR_MM0] = &DD[DIR_PP0 * size_Mat];
+   //      D.f[DIR_PP0] = &DD[DIR_MM0 * size_Mat];
+   //      D.f[DIR_MP0] = &DD[DIR_PM0 * size_Mat];
+   //      D.f[DIR_PM0] = &DD[DIR_MP0 * size_Mat];
+   //      D.f[DIR_M0M] = &DD[DIR_P0P * size_Mat];
+   //      D.f[DIR_P0P] = &DD[DIR_M0M * size_Mat];
+   //      D.f[DIR_M0P] = &DD[DIR_P0M * size_Mat];
+   //      D.f[DIR_P0M] = &DD[DIR_M0P * size_Mat];
+   //      D.f[DIR_0MM] = &DD[DIR_0PP * size_Mat];
+   //      D.f[DIR_0PP] = &DD[DIR_0MM * size_Mat];
+   //      D.f[DIR_0MP] = &DD[DIR_0PM * size_Mat];
+   //      D.f[DIR_0PM] = &DD[DIR_0MP * size_Mat];
+   //      D.f[DIR_000] = &DD[DIR_000 * size_Mat];
+   //      D.f[DIR_PPP] = &DD[DIR_MMM * size_Mat];
+   //      D.f[DIR_MMP] = &DD[DIR_PPM * size_Mat];
+   //      D.f[DIR_PMP] = &DD[DIR_MPM * size_Mat];
+   //      D.f[DIR_MPP] = &DD[DIR_PMM * size_Mat];
+   //      D.f[DIR_PPM] = &DD[DIR_MMP * size_Mat];
+   //      D.f[DIR_MMM] = &DD[DIR_PPP * size_Mat];
+   //      D.f[DIR_PMM] = &DD[DIR_MPP * size_Mat];
+   //      D.f[DIR_MPM] = &DD[DIR_PMP * size_Mat];
    //   }
    //   ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    //   //Test
diff --git a/src/gpu/VirtualFluids_GPU/Init/InitLattice.cpp b/src/gpu/VirtualFluids_GPU/Init/InitLattice.cpp
index 2c85de9e3ec57d50a66fde2c49d3e703676fbf04..508e4498c36d352761c3ecaf24abaa52a5f84bbe 100644
--- a/src/gpu/VirtualFluids_GPU/Init/InitLattice.cpp
+++ b/src/gpu/VirtualFluids_GPU/Init/InitLattice.cpp
@@ -45,26 +45,44 @@ void initLattice(SPtr<Parameter> para, SPtr<PreProcessor> preProcessor, SPtr<Cud
         preProcessor->init(para, lev);
 
         CalcMacCompSP27(
-            para->getParD(lev)->velocityX, para->getParD(lev)->velocityY, para->getParD(lev)->velocityZ, para->getParD(lev)->rho,
-            para->getParD(lev)->pressure, para->getParD(lev)->typeOfGridNode, para->getParD(lev)->neighborX,
-            para->getParD(lev)->neighborY, para->getParD(lev)->neighborZ, para->getParD(lev)->numberOfNodes,
-            para->getParD(lev)->numberofthreads, para->getParD(lev)->distributions.f[0], para->getParD(lev)->isEvenTimestep);
+            para->getParD(lev)->velocityX, 
+            para->getParD(lev)->velocityY, 
+            para->getParD(lev)->velocityZ, 
+            para->getParD(lev)->rho,
+            para->getParD(lev)->pressure, 
+            para->getParD(lev)->typeOfGridNode, 
+            para->getParD(lev)->neighborX,
+            para->getParD(lev)->neighborY, 
+            para->getParD(lev)->neighborZ, 
+            para->getParD(lev)->numberOfNodes,
+            para->getParD(lev)->numberofthreads, 
+            para->getParD(lev)->distributions.f[0], 
+            para->getParD(lev)->isEvenTimestep);
 
         if (para->getCalcMedian()) {
             constexpr uint tdiff = 1;
-            CalcMacMedSP27(para->getParD(lev)->vx_SP_Med, para->getParD(lev)->vy_SP_Med, para->getParD(lev)->vz_SP_Med,
-                           para->getParD(lev)->rho_SP_Med, para->getParD(lev)->press_SP_Med, para->getParD(lev)->typeOfGridNode,
-                           para->getParD(lev)->neighborX, para->getParD(lev)->neighborY,
-                           para->getParD(lev)->neighborZ, tdiff, para->getParD(lev)->numberOfNodes,
-                           para->getParD(lev)->numberofthreads, para->getParD(lev)->isEvenTimestep);
+            CalcMacMedSP27(
+                para->getParD(lev)->vx_SP_Med, 
+                para->getParD(lev)->vy_SP_Med, 
+                para->getParD(lev)->vz_SP_Med,
+                para->getParD(lev)->rho_SP_Med, 
+                para->getParD(lev)->press_SP_Med, 
+                para->getParD(lev)->typeOfGridNode,
+                para->getParD(lev)->neighborX, 
+                para->getParD(lev)->neighborY,
+                para->getParD(lev)->neighborZ, 
+                tdiff, 
+                para->getParD(lev)->numberOfNodes,
+                para->getParD(lev)->numberofthreads, 
+                para->getParD(lev)->isEvenTimestep);
         }
         // advection - diffusion
         if (para->getDiffOn()) {
 
             cudaMemoryManager->cudaAllocConcentration(lev);
 
-            for (unsigned int i = 0; i < para->getParH(lev)->numberOfNodes; i++) {
-                para->getParH(lev)->Conc[i] = para->getTemperatureInit();
+            for (size_t index = 0; index < para->getParH(lev)->numberOfNodes; index++) {
+                para->getParH(lev)->Conc[index] = para->getTemperatureInit();
             }
             initTemperatur(para.get(), cudaMemoryManager.get(), lev);
         }
diff --git a/src/gpu/VirtualFluids_GPU/Init/PositionReader.cpp b/src/gpu/VirtualFluids_GPU/Init/PositionReader.cpp
index 4e5a862d3fd1ed19109073aae0fe4c731f7f3e91..6eaa0b17653aaf5257c00e674c87e2844c26cf5d 100644
--- a/src/gpu/VirtualFluids_GPU/Init/PositionReader.cpp
+++ b/src/gpu/VirtualFluids_GPU/Init/PositionReader.cpp
@@ -3,6 +3,7 @@
 #include "Parameter/Parameter.h"
 
 #include <basics/utilities/UbFileInputASCII.h>
+
 using namespace vf::lbm::dir;
 
 //////////////////////////////////////////////////////////////////////////
@@ -169,7 +170,7 @@ void PositionReader::definePropellerQs(Parameter* para)
 	//////////////////////////////////////////////////////////////////
 	for(uint u=0; u<para->getParH(para->getFine())->propellerBC.numberOfBCnodes; u++)
 	{
-		for (int dir = DIR_P00; dir<=DIR_MMM; dir++)
+		for (size_t dir = DIR_P00; dir<=DIR_MMM; dir++)
 		{
 			if ((dir==DIR_P00)  || 
 				(dir==DIR_PP0) || (dir==DIR_PM0) || (dir==DIR_P0P) || (dir==DIR_P0M) ||
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernel.h b/src/gpu/VirtualFluids_GPU/Kernel/Kernel.h
index 9f9f7539bc5a1e28612d956ca32234c5a3589f8a..50b4460d774010ea7d7b98cfa6fa505cdfeb88c2 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernel.h
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernel.h
@@ -3,8 +3,11 @@
 
 #include <vector>
 
+#include "LBM/LB.h" 
+
 #include "Kernel/Utilities/KernelGroup.h"
 #include "PreProcessor/PreProcessorType.h"
+#include "Parameter/CudaStreamManager.h"
 
 #include <helper_cuda.h>
 
@@ -13,7 +16,7 @@ class Kernel
 public:
     virtual ~Kernel()  = default;
     virtual void run() = 0;
-    virtual void runOnIndices(const unsigned int *indices, unsigned int size_indices, int stream = -1) = 0; //if stream == -1: run on default stream
+    virtual void runOnIndices(const unsigned int *indices, unsigned int size_indices, CollisionTemplate collisionTemplate, CudaStreamIndex streamIdx=CudaStreamIndex::Legacy) = 0;
 
     virtual bool checkParameter()                                = 0;
     virtual std::vector<PreProcessorType> getPreProcessorTypes() = 0;
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/KernelImp.cpp b/src/gpu/VirtualFluids_GPU/Kernel/KernelImp.cpp
index 630aaf7339afc2907ab6bfbf65bd5fc55f75e215..9bd3945aa81147d03be2b1eac3ddec7c24d71532 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/KernelImp.cpp
+++ b/src/gpu/VirtualFluids_GPU/Kernel/KernelImp.cpp
@@ -1,9 +1,11 @@
 #include "KernelImp.h"
 
+#include "LBM/LB.h" 
+
 #include "Kernel/Utilities/CheckParameterStrategy/CheckParameterStrategy.h"
 
 
-void KernelImp::runOnIndices(const unsigned int *indices, unsigned int size_indices, int stream)
+void KernelImp::runOnIndices(const unsigned int *indices, unsigned int size_indices, CollisionTemplate collisionTemplate, CudaStreamIndex streamIndex)
 {
     printf("Method not implemented for this Kernel \n");
 }
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/KernelImp.h b/src/gpu/VirtualFluids_GPU/Kernel/KernelImp.h
index 0141ddda7e9579cc84148d26727ed81c084ea0c5..a96c2c123472ca33f635273e06a5bf36a745654d 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/KernelImp.h
+++ b/src/gpu/VirtualFluids_GPU/Kernel/KernelImp.h
@@ -1,6 +1,8 @@
 #ifndef KERNEL_IMP_H
 #define KERNEL_IMP_H
 
+#include "LBM/LB.h" 
+
 #include "Kernel.h"
 
 #include <memory>
@@ -9,12 +11,12 @@
 
 class CheckParameterStrategy;
 class Parameter;
-
+class CudaStreamManager; 
 class KernelImp : public Kernel
 {
 public:
     virtual void run() = 0;
-    virtual void runOnIndices(const unsigned int *indices, unsigned int size_indices, int stream = -1);
+    virtual void runOnIndices(const unsigned int *indices, unsigned int size_indices, CollisionTemplate collisionTemplate, CudaStreamIndex streamIndex=CudaStreamIndex::Legacy);
 
     bool checkParameter();
     std::vector<PreProcessorType> getPreProcessorTypes();
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/AdvectionDiffusion/Compressible/Mod27/ADComp27/ADComp27.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/AdvectionDiffusion/Compressible/Mod27/ADComp27/ADComp27.cu
index 51b9e4537fa0857e9302aa638ae7729fa9adcdbe..d4d6307f688da4c8fa37c54fb4958681d5ec4941 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/AdvectionDiffusion/Compressible/Mod27/ADComp27/ADComp27.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/AdvectionDiffusion/Compressible/Mod27/ADComp27/ADComp27.cu
@@ -2,6 +2,7 @@
 
 #include "ADComp27_Device.cuh"
 #include "Parameter/Parameter.h"
+#include "cuda/CudaGrid.h"
 
 std::shared_ptr<ADComp27> ADComp27::getNewInstance(std::shared_ptr<Parameter> para, int level)
 {
@@ -10,34 +11,19 @@ std::shared_ptr<ADComp27> ADComp27::getNewInstance(std::shared_ptr<Parameter> pa
 
 void ADComp27::run()
 {
-	int size_Mat = para->getParD(level)->numberOfNodes;
-	int numberOfThreads = para->getParD(level)->numberofthreads;
-
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	LB_KERNEL_AD_COMP_27 << < grid, threads >> >(	para->getParD(level)->diffusivity,
-												para->getParD(level)->typeOfGridNode,
-												para->getParD(level)->neighborX,
-												para->getParD(level)->neighborY,
-												para->getParD(level)->neighborZ,
-												para->getParD(level)->distributions.f[0],
-												para->getParD(level)->distributionsAD27.f[0],
-												para->getParD(level)->numberOfNodes,
-												para->getParD(level)->isEvenTimestep);
-	getLastCudaError("LB_Kernel_ThS27 execution failed");
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(para->getParD(level)->numberofthreads, para->getParD(level)->numberOfNodes);
+
+    LB_KERNEL_AD_COMP_27<<< grid.grid, grid.threads >>>(
+        para->getParD(level)->diffusivity,
+        para->getParD(level)->typeOfGridNode,
+        para->getParD(level)->neighborX,
+        para->getParD(level)->neighborY,
+        para->getParD(level)->neighborZ,
+        para->getParD(level)->distributions.f[0],
+        para->getParD(level)->distributionsAD27.f[0],
+        para->getParD(level)->numberOfNodes,
+        para->getParD(level)->isEvenTimestep);
+    getLastCudaError("LB_KERNEL_AD_COMP_27 execution failed");
 }
 
 ADComp27::ADComp27(std::shared_ptr<Parameter> para, int level)
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/AdvectionDiffusion/Compressible/Mod27/ADComp27/ADComp27_Device.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/AdvectionDiffusion/Compressible/Mod27/ADComp27/ADComp27_Device.cu
index b4c1236300bbb49fe2df1b3f458f506e989e142b..40adfff91713b7d6db1e861be9282d1f38516c22 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/AdvectionDiffusion/Compressible/Mod27/ADComp27/ADComp27_Device.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/AdvectionDiffusion/Compressible/Mod27/ADComp27/ADComp27_Device.cu
@@ -38,125 +38,125 @@ __global__ void LB_KERNEL_AD_COMP_27(real diffusivity,
 			Distributions27 D;
 			if (EvenOrOdd == true)
 			{
-				D.f[DIR_P00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_M00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_MPM * size_Mat];
 			}
 			else
 			{
-				D.f[DIR_M00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_P00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_MPM * size_Mat];
 			}
 
 			Distributions27 D27;
 			if (EvenOrOdd == true)
 			{
-				D27.f[DIR_P00] = &DD27[DIR_P00   *size_Mat];
-				D27.f[DIR_M00] = &DD27[DIR_M00   *size_Mat];
-				D27.f[DIR_0P0] = &DD27[DIR_0P0   *size_Mat];
-				D27.f[DIR_0M0] = &DD27[DIR_0M0   *size_Mat];
-				D27.f[DIR_00P] = &DD27[DIR_00P   *size_Mat];
-				D27.f[DIR_00M] = &DD27[DIR_00M   *size_Mat];
-				D27.f[DIR_PP0] = &DD27[DIR_PP0  *size_Mat];
-				D27.f[DIR_MM0] = &DD27[DIR_MM0  *size_Mat];
-				D27.f[DIR_PM0] = &DD27[DIR_PM0  *size_Mat];
-				D27.f[DIR_MP0] = &DD27[DIR_MP0  *size_Mat];
-				D27.f[DIR_P0P] = &DD27[DIR_P0P  *size_Mat];
-				D27.f[DIR_M0M] = &DD27[DIR_M0M  *size_Mat];
-				D27.f[DIR_P0M] = &DD27[DIR_P0M  *size_Mat];
-				D27.f[DIR_M0P] = &DD27[DIR_M0P  *size_Mat];
-				D27.f[DIR_0PP] = &DD27[DIR_0PP  *size_Mat];
-				D27.f[DIR_0MM] = &DD27[DIR_0MM  *size_Mat];
-				D27.f[DIR_0PM] = &DD27[DIR_0PM  *size_Mat];
-				D27.f[DIR_0MP] = &DD27[DIR_0MP  *size_Mat];
-				D27.f[DIR_000] = &DD27[DIR_000*size_Mat];
-				D27.f[DIR_PPP] = &DD27[DIR_PPP *size_Mat];
-				D27.f[DIR_MMP] = &DD27[DIR_MMP *size_Mat];
-				D27.f[DIR_PMP] = &DD27[DIR_PMP *size_Mat];
-				D27.f[DIR_MPP] = &DD27[DIR_MPP *size_Mat];
-				D27.f[DIR_PPM] = &DD27[DIR_PPM *size_Mat];
-				D27.f[DIR_MMM] = &DD27[DIR_MMM *size_Mat];
-				D27.f[DIR_PMM] = &DD27[DIR_PMM *size_Mat];
-				D27.f[DIR_MPM] = &DD27[DIR_MPM *size_Mat];
+				D27.f[DIR_P00] = &DD27[DIR_P00 * size_Mat];
+				D27.f[DIR_M00] = &DD27[DIR_M00 * size_Mat];
+				D27.f[DIR_0P0] = &DD27[DIR_0P0 * size_Mat];
+				D27.f[DIR_0M0] = &DD27[DIR_0M0 * size_Mat];
+				D27.f[DIR_00P] = &DD27[DIR_00P * size_Mat];
+				D27.f[DIR_00M] = &DD27[DIR_00M * size_Mat];
+				D27.f[DIR_PP0] = &DD27[DIR_PP0 * size_Mat];
+				D27.f[DIR_MM0] = &DD27[DIR_MM0 * size_Mat];
+				D27.f[DIR_PM0] = &DD27[DIR_PM0 * size_Mat];
+				D27.f[DIR_MP0] = &DD27[DIR_MP0 * size_Mat];
+				D27.f[DIR_P0P] = &DD27[DIR_P0P * size_Mat];
+				D27.f[DIR_M0M] = &DD27[DIR_M0M * size_Mat];
+				D27.f[DIR_P0M] = &DD27[DIR_P0M * size_Mat];
+				D27.f[DIR_M0P] = &DD27[DIR_M0P * size_Mat];
+				D27.f[DIR_0PP] = &DD27[DIR_0PP * size_Mat];
+				D27.f[DIR_0MM] = &DD27[DIR_0MM * size_Mat];
+				D27.f[DIR_0PM] = &DD27[DIR_0PM * size_Mat];
+				D27.f[DIR_0MP] = &DD27[DIR_0MP * size_Mat];
+				D27.f[DIR_000] = &DD27[DIR_000 * size_Mat];
+				D27.f[DIR_PPP] = &DD27[DIR_PPP * size_Mat];
+				D27.f[DIR_MMP] = &DD27[DIR_MMP * size_Mat];
+				D27.f[DIR_PMP] = &DD27[DIR_PMP * size_Mat];
+				D27.f[DIR_MPP] = &DD27[DIR_MPP * size_Mat];
+				D27.f[DIR_PPM] = &DD27[DIR_PPM * size_Mat];
+				D27.f[DIR_MMM] = &DD27[DIR_MMM * size_Mat];
+				D27.f[DIR_PMM] = &DD27[DIR_PMM * size_Mat];
+				D27.f[DIR_MPM] = &DD27[DIR_MPM * size_Mat];
 			}
 			else
 			{
-				D27.f[DIR_M00] = &DD27[DIR_P00   *size_Mat];
-				D27.f[DIR_P00] = &DD27[DIR_M00   *size_Mat];
-				D27.f[DIR_0M0] = &DD27[DIR_0P0   *size_Mat];
-				D27.f[DIR_0P0] = &DD27[DIR_0M0   *size_Mat];
-				D27.f[DIR_00M] = &DD27[DIR_00P   *size_Mat];
-				D27.f[DIR_00P] = &DD27[DIR_00M   *size_Mat];
-				D27.f[DIR_MM0] = &DD27[DIR_PP0  *size_Mat];
-				D27.f[DIR_PP0] = &DD27[DIR_MM0  *size_Mat];
-				D27.f[DIR_MP0] = &DD27[DIR_PM0  *size_Mat];
-				D27.f[DIR_PM0] = &DD27[DIR_MP0  *size_Mat];
-				D27.f[DIR_M0M] = &DD27[DIR_P0P  *size_Mat];
-				D27.f[DIR_P0P] = &DD27[DIR_M0M  *size_Mat];
-				D27.f[DIR_M0P] = &DD27[DIR_P0M  *size_Mat];
-				D27.f[DIR_P0M] = &DD27[DIR_M0P  *size_Mat];
-				D27.f[DIR_0MM] = &DD27[DIR_0PP  *size_Mat];
-				D27.f[DIR_0PP] = &DD27[DIR_0MM  *size_Mat];
-				D27.f[DIR_0MP] = &DD27[DIR_0PM  *size_Mat];
-				D27.f[DIR_0PM] = &DD27[DIR_0MP  *size_Mat];
-				D27.f[DIR_000] = &DD27[DIR_000*size_Mat];
-				D27.f[DIR_MMM] = &DD27[DIR_PPP *size_Mat];
-				D27.f[DIR_PPM] = &DD27[DIR_MMP *size_Mat];
-				D27.f[DIR_MPM] = &DD27[DIR_PMP *size_Mat];
-				D27.f[DIR_PMM] = &DD27[DIR_MPP *size_Mat];
-				D27.f[DIR_MMP] = &DD27[DIR_PPM *size_Mat];
-				D27.f[DIR_PPP] = &DD27[DIR_MMM *size_Mat];
-				D27.f[DIR_MPP] = &DD27[DIR_PMM *size_Mat];
-				D27.f[DIR_PMP] = &DD27[DIR_MPM *size_Mat];
+				D27.f[DIR_M00] = &DD27[DIR_P00 * size_Mat];
+				D27.f[DIR_P00] = &DD27[DIR_M00 * size_Mat];
+				D27.f[DIR_0M0] = &DD27[DIR_0P0 * size_Mat];
+				D27.f[DIR_0P0] = &DD27[DIR_0M0 * size_Mat];
+				D27.f[DIR_00M] = &DD27[DIR_00P * size_Mat];
+				D27.f[DIR_00P] = &DD27[DIR_00M * size_Mat];
+				D27.f[DIR_MM0] = &DD27[DIR_PP0 * size_Mat];
+				D27.f[DIR_PP0] = &DD27[DIR_MM0 * size_Mat];
+				D27.f[DIR_MP0] = &DD27[DIR_PM0 * size_Mat];
+				D27.f[DIR_PM0] = &DD27[DIR_MP0 * size_Mat];
+				D27.f[DIR_M0M] = &DD27[DIR_P0P * size_Mat];
+				D27.f[DIR_P0P] = &DD27[DIR_M0M * size_Mat];
+				D27.f[DIR_M0P] = &DD27[DIR_P0M * size_Mat];
+				D27.f[DIR_P0M] = &DD27[DIR_M0P * size_Mat];
+				D27.f[DIR_0MM] = &DD27[DIR_0PP * size_Mat];
+				D27.f[DIR_0PP] = &DD27[DIR_0MM * size_Mat];
+				D27.f[DIR_0MP] = &DD27[DIR_0PM * size_Mat];
+				D27.f[DIR_0PM] = &DD27[DIR_0MP * size_Mat];
+				D27.f[DIR_000] = &DD27[DIR_000 * size_Mat];
+				D27.f[DIR_MMM] = &DD27[DIR_PPP * size_Mat];
+				D27.f[DIR_PPM] = &DD27[DIR_MMP * size_Mat];
+				D27.f[DIR_MPM] = &DD27[DIR_PMP * size_Mat];
+				D27.f[DIR_PMM] = &DD27[DIR_MPP * size_Mat];
+				D27.f[DIR_MMP] = &DD27[DIR_PPM * size_Mat];
+				D27.f[DIR_PPP] = &DD27[DIR_MMM * size_Mat];
+				D27.f[DIR_MPP] = &DD27[DIR_PMM * size_Mat];
+				D27.f[DIR_PMP] = &DD27[DIR_MPM * size_Mat];
 			}
 
 			////////////////////////////////////////////////////////////////////////////////
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/AdvectionDiffusion/Compressible/Mod7/ADComp7/ADComp7.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/AdvectionDiffusion/Compressible/Mod7/ADComp7/ADComp7.cu
index ab9b0c444513455e0498d79614575e87c2afb6a0..3ee06a1e9ea77c8443d94f44ea54d11ffe7304ac 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/AdvectionDiffusion/Compressible/Mod7/ADComp7/ADComp7.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/AdvectionDiffusion/Compressible/Mod7/ADComp7/ADComp7.cu
@@ -2,6 +2,7 @@
 
 #include "ADComp7_Device.cuh"
 #include "Parameter/Parameter.h"
+#include "cuda/CudaGrid.h"
 
 std::shared_ptr<ADComp7> ADComp7::getNewInstance(std::shared_ptr<Parameter> para, int level)
 {
@@ -10,34 +11,19 @@ std::shared_ptr<ADComp7> ADComp7::getNewInstance(std::shared_ptr<Parameter> para
 
 void ADComp7::run()
 {
-	int size_Mat = para->getParD(level)->numberOfNodes;
-	int numberOfThreads = para->getParD(level)->numberofthreads;
-
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	LB_Kernel_AD_Comp_7 << < grid, threads >> >(	para->getParD(level)->diffusivity,
-											para->getParD(level)->typeOfGridNode,
-											para->getParD(level)->neighborX,
-											para->getParD(level)->neighborY,
-											para->getParD(level)->neighborZ,
-											para->getParD(level)->distributions.f[0], 
-											para->getParD(level)->distributionsAD7.f[0], 
-											para->getParD(level)->numberOfNodes,
-											para->getParD(level)->isEvenTimestep);
-	getLastCudaError("LB_Kernel_ThS7 execution failed");
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(para->getParD(level)->numberofthreads, para->getParD(level)->numberOfNodes);
+
+    LB_Kernel_AD_Comp_7<<< grid.grid, grid.threads >>>(
+        para->getParD(level)->diffusivity,
+        para->getParD(level)->typeOfGridNode,
+        para->getParD(level)->neighborX,
+        para->getParD(level)->neighborY,
+        para->getParD(level)->neighborZ,
+        para->getParD(level)->distributions.f[0], 
+        para->getParD(level)->distributionsAD7.f[0], 
+        para->getParD(level)->numberOfNodes,
+        para->getParD(level)->isEvenTimestep);
+    getLastCudaError("LB_Kernel_AD_Comp_7 execution failed");
 }
 
 ADComp7::ADComp7(std::shared_ptr<Parameter> para, int level)
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/AdvectionDiffusion/Compressible/Mod7/ADComp7/ADComp7_Device.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/AdvectionDiffusion/Compressible/Mod7/ADComp7/ADComp7_Device.cu
index 52ab9ba6e968ec2293f0a1c4959323c43f328206..ddaed84703640cd9c7d12d142ccc1bf8f9ea7efc 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/AdvectionDiffusion/Compressible/Mod7/ADComp7/ADComp7_Device.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/AdvectionDiffusion/Compressible/Mod7/ADComp7/ADComp7_Device.cu
@@ -39,63 +39,63 @@ __global__ void LB_Kernel_AD_Comp_7(real diffusivity,
 			Distributions27 D;
 			if (EvenOrOdd == true)
 			{
-				D.f[DIR_P00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_M00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_MPM * size_Mat];
 			}
 			else
 			{
-				D.f[DIR_M00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_P00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_MPM * size_Mat];
 			}
 
 			Distributions7 D7;
@@ -157,33 +157,33 @@ __global__ void LB_Kernel_AD_Comp_7(real diffusivity,
 			real fTNE = (D.f[DIR_MMM])[kbsw];
 			real fTNW = (D.f[DIR_PMM])[kbs];//kbse
 			real fTSE = (D.f[DIR_MPM])[kbw];//kbnw
-										   //real fE    =  (D.f[DIR_P00   ])[k  ];//ke
-										   //real fW    =  (D.f[DIR_M00   ])[kw ];
-										   //real fN    =  (D.f[DIR_0P0   ])[k  ];//kn
-										   //real fS    =  (D.f[DIR_0M0   ])[ks ];
-										   //real fT    =  (D.f[DIR_00P   ])[k  ];//kt
-										   //real fB    =  (D.f[DIR_00M   ])[kb ];
-										   //real fNE   =  (D.f[DIR_PP0  ])[k  ];//kne
-										   //real fSW   =  (D.f[DIR_MM0  ])[ksw];
-										   //real fSE   =  (D.f[DIR_PM0  ])[ks ];//kse
-										   //real fNW   =  (D.f[DIR_MP0  ])[kw ];//knw
-										   //real fTE   =  (D.f[DIR_P0P  ])[k  ];//kte
-										   //real fBW   =  (D.f[DIR_M0M  ])[kbw];
-										   //real fBE   =  (D.f[DIR_P0M  ])[kb ];//kbe
-										   //real fTW   =  (D.f[DIR_M0P  ])[kw ];//ktw
-										   //real fTN   =  (D.f[DIR_0PP  ])[k  ];//ktn
-										   //real fBS   =  (D.f[DIR_0MM  ])[kbs];
-										   //real fBN   =  (D.f[DIR_0PM  ])[kb ];//kbn
-										   //real fTS   =  (D.f[DIR_0MP  ])[ks ];//kts
+										   //real fE    =  (D.f[DIR_P00])[k  ];//ke
+										   //real fW    =  (D.f[DIR_M00])[kw ];
+										   //real fN    =  (D.f[DIR_0P0])[k  ];//kn
+										   //real fS    =  (D.f[DIR_0M0])[ks ];
+										   //real fT    =  (D.f[DIR_00P])[k  ];//kt
+										   //real fB    =  (D.f[DIR_00M])[kb ];
+										   //real fNE   =  (D.f[DIR_PP0])[k  ];//kne
+										   //real fSW   =  (D.f[DIR_MM0])[ksw];
+										   //real fSE   =  (D.f[DIR_PM0])[ks ];//kse
+										   //real fNW   =  (D.f[DIR_MP0])[kw ];//knw
+										   //real fTE   =  (D.f[DIR_P0P])[k  ];//kte
+										   //real fBW   =  (D.f[DIR_M0M])[kbw];
+										   //real fBE   =  (D.f[DIR_P0M])[kb ];//kbe
+										   //real fTW   =  (D.f[DIR_M0P])[kw ];//ktw
+										   //real fTN   =  (D.f[DIR_0PP])[k  ];//ktn
+										   //real fBS   =  (D.f[DIR_0MM])[kbs];
+										   //real fBN   =  (D.f[DIR_0PM])[kb ];//kbn
+										   //real fTS   =  (D.f[DIR_0MP])[ks ];//kts
 										   //real fZERO =  (D.f[DIR_000])[k  ];//kzero
-										   //real fTNE   = (D.f[DIR_PPP ])[k  ];//ktne
-										   //real fTSW   = (D.f[DIR_MMP ])[ksw];//ktsw
-										   //real fTSE   = (D.f[DIR_PMP ])[ks ];//ktse
-										   //real fTNW   = (D.f[DIR_MPP ])[kw ];//ktnw
-										   //real fBNE   = (D.f[DIR_PPM ])[kb ];//kbne
-										   //real fBSW   = (D.f[DIR_MMM ])[kbsw];
-										   //real fBSE   = (D.f[DIR_PMM ])[kbs];//kbse
-										   //real fBNW   = (D.f[DIR_MPM ])[kbw];//kbnw
+										   //real fTNE   = (D.f[DIR_PPP])[k  ];//ktne
+										   //real fTSW   = (D.f[DIR_MMP])[ksw];//ktsw
+										   //real fTSE   = (D.f[DIR_PMP])[ks ];//ktse
+										   //real fTNW   = (D.f[DIR_MPP])[kw ];//ktnw
+										   //real fBNE   = (D.f[DIR_PPM])[kb ];//kbne
+										   //real fBSW   = (D.f[DIR_MMM])[kbsw];
+										   //real fBSE   = (D.f[DIR_PMM])[kbs];//kbse
+										   //real fBNW   = (D.f[DIR_MPM])[kbw];//kbnw
 										   ////////////////////////////////////////////////////////////////////////////////
 			real f7ZERO = (D7.f[0])[k];
 			real f7E = (D7.f[1])[k];
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/AdvectionDiffusion/Incompressible/Mod27/ADIncomp27/ADIncomp27.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/AdvectionDiffusion/Incompressible/Mod27/ADIncomp27/ADIncomp27.cu
index 4ad8a4678ae2e4025a90f639ae366311a247e4b3..f2a9feaa998b628fb782844d1a7d946317e5af5f 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/AdvectionDiffusion/Incompressible/Mod27/ADIncomp27/ADIncomp27.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/AdvectionDiffusion/Incompressible/Mod27/ADIncomp27/ADIncomp27.cu
@@ -2,6 +2,7 @@
 
 #include "ADIncomp27_Device.cuh"
 #include "Parameter/Parameter.h"
+#include "cuda/CudaGrid.h"
 
 std::shared_ptr<ADIncomp27> ADIncomp27::getNewInstance(std::shared_ptr<Parameter> para, int level)
 {
@@ -10,34 +11,19 @@ std::shared_ptr<ADIncomp27> ADIncomp27::getNewInstance(std::shared_ptr<Parameter
 
 void ADIncomp27::run()
 {
-	int size_Mat = para->getParD(level)->numberOfNodes;
-	int numberOfThreads = para->getParD(level)->numberofthreads;
-
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	LB_Kernel_AD_Incomp_27 << < grid, threads >> >(	para->getParD(level)->diffusivity, 
-													para->getParD(level)->typeOfGridNode,
-													para->getParD(level)->neighborX, 
-													para->getParD(level)->neighborY, 
-													para->getParD(level)->neighborZ,
-													para->getParD(level)->distributions.f[0], 
-													para->getParD(level)->distributionsAD27.f[0], 
-													para->getParD(level)->numberOfNodes,
-													para->getParD(level)->isEvenTimestep);
-	getLastCudaError("LB_Kernel_AD_Incomp_27 execution failed");
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(para->getParD(level)->numberofthreads, para->getParD(level)->numberOfNodes);
+
+    LB_Kernel_AD_Incomp_27<<< grid.grid, grid.threads >>>(
+        para->getParD(level)->diffusivity, 
+        para->getParD(level)->typeOfGridNode,
+        para->getParD(level)->neighborX, 
+        para->getParD(level)->neighborY, 
+        para->getParD(level)->neighborZ,
+        para->getParD(level)->distributions.f[0], 
+        para->getParD(level)->distributionsAD27.f[0], 
+        para->getParD(level)->numberOfNodes,
+        para->getParD(level)->isEvenTimestep);
+    getLastCudaError("LB_Kernel_AD_Incomp_27 execution failed");
 }
 
 ADIncomp27::ADIncomp27(std::shared_ptr<Parameter> para, int level)
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/AdvectionDiffusion/Incompressible/Mod27/ADIncomp27/ADIncomp27_Device.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/AdvectionDiffusion/Incompressible/Mod27/ADIncomp27/ADIncomp27_Device.cu
index e686825ed100417110b02360876dec076553d7de..f9fdcee0f34106b05da0edc16e3fdd89f859752e 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/AdvectionDiffusion/Incompressible/Mod27/ADIncomp27/ADIncomp27_Device.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/AdvectionDiffusion/Incompressible/Mod27/ADIncomp27/ADIncomp27_Device.cu
@@ -13,7 +13,7 @@ __global__ void LB_Kernel_AD_Incomp_27(real diffusivity,
 	unsigned int* neighborZ,
 	real* DDStart,
 	real* DD27,
-	int size_Mat,
+	unsigned long long numberOfLBnodes,
 	bool EvenOrOdd)
 {
 	////////////////////////////////////////////////////////////////////////////////
@@ -27,7 +27,7 @@ __global__ void LB_Kernel_AD_Incomp_27(real diffusivity,
 	const unsigned k = nx*(ny*z + y) + x;
 	//////////////////////////////////////////////////////////////////////////
 
-	if (k<size_Mat)
+	if (k<numberOfLBnodes)
 	{
 		////////////////////////////////////////////////////////////////////////////////
 		unsigned int BC;
@@ -38,125 +38,125 @@ __global__ void LB_Kernel_AD_Incomp_27(real diffusivity,
 			Distributions27 D;
 			if (EvenOrOdd == true)
 			{
-				D.f[DIR_P00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_M00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_P00 * numberOfLBnodes];
+				D.f[DIR_M00] = &DDStart[DIR_M00 * numberOfLBnodes];
+				D.f[DIR_0P0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+				D.f[DIR_0M0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+				D.f[DIR_00P] = &DDStart[DIR_00P * numberOfLBnodes];
+				D.f[DIR_00M] = &DDStart[DIR_00M * numberOfLBnodes];
+				D.f[DIR_PP0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+				D.f[DIR_MM0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+				D.f[DIR_PM0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+				D.f[DIR_MP0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+				D.f[DIR_P0P] = &DDStart[DIR_P0P * numberOfLBnodes];
+				D.f[DIR_M0M] = &DDStart[DIR_M0M * numberOfLBnodes];
+				D.f[DIR_P0M] = &DDStart[DIR_P0M * numberOfLBnodes];
+				D.f[DIR_M0P] = &DDStart[DIR_M0P * numberOfLBnodes];
+				D.f[DIR_0PP] = &DDStart[DIR_0PP * numberOfLBnodes];
+				D.f[DIR_0MM] = &DDStart[DIR_0MM * numberOfLBnodes];
+				D.f[DIR_0PM] = &DDStart[DIR_0PM * numberOfLBnodes];
+				D.f[DIR_0MP] = &DDStart[DIR_0MP * numberOfLBnodes];
+				D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+				D.f[DIR_PPP] = &DDStart[DIR_PPP * numberOfLBnodes];
+				D.f[DIR_MMP] = &DDStart[DIR_MMP * numberOfLBnodes];
+				D.f[DIR_PMP] = &DDStart[DIR_PMP * numberOfLBnodes];
+				D.f[DIR_MPP] = &DDStart[DIR_MPP * numberOfLBnodes];
+				D.f[DIR_PPM] = &DDStart[DIR_PPM * numberOfLBnodes];
+				D.f[DIR_MMM] = &DDStart[DIR_MMM * numberOfLBnodes];
+				D.f[DIR_PMM] = &DDStart[DIR_PMM * numberOfLBnodes];
+				D.f[DIR_MPM] = &DDStart[DIR_MPM * numberOfLBnodes];
 			}
 			else
 			{
-				D.f[DIR_M00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_P00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_P00 * numberOfLBnodes];
+				D.f[DIR_P00] = &DDStart[DIR_M00 * numberOfLBnodes];
+				D.f[DIR_0M0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+				D.f[DIR_0P0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+				D.f[DIR_00M] = &DDStart[DIR_00P * numberOfLBnodes];
+				D.f[DIR_00P] = &DDStart[DIR_00M * numberOfLBnodes];
+				D.f[DIR_MM0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+				D.f[DIR_PP0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+				D.f[DIR_MP0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+				D.f[DIR_PM0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+				D.f[DIR_M0M] = &DDStart[DIR_P0P * numberOfLBnodes];
+				D.f[DIR_P0P] = &DDStart[DIR_M0M * numberOfLBnodes];
+				D.f[DIR_M0P] = &DDStart[DIR_P0M * numberOfLBnodes];
+				D.f[DIR_P0M] = &DDStart[DIR_M0P * numberOfLBnodes];
+				D.f[DIR_0MM] = &DDStart[DIR_0PP * numberOfLBnodes];
+				D.f[DIR_0PP] = &DDStart[DIR_0MM * numberOfLBnodes];
+				D.f[DIR_0MP] = &DDStart[DIR_0PM * numberOfLBnodes];
+				D.f[DIR_0PM] = &DDStart[DIR_0MP * numberOfLBnodes];
+				D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+				D.f[DIR_MMM] = &DDStart[DIR_PPP * numberOfLBnodes];
+				D.f[DIR_PPM] = &DDStart[DIR_MMP * numberOfLBnodes];
+				D.f[DIR_MPM] = &DDStart[DIR_PMP * numberOfLBnodes];
+				D.f[DIR_PMM] = &DDStart[DIR_MPP * numberOfLBnodes];
+				D.f[DIR_MMP] = &DDStart[DIR_PPM * numberOfLBnodes];
+				D.f[DIR_PPP] = &DDStart[DIR_MMM * numberOfLBnodes];
+				D.f[DIR_MPP] = &DDStart[DIR_PMM * numberOfLBnodes];
+				D.f[DIR_PMP] = &DDStart[DIR_MPM * numberOfLBnodes];
 			}
 
 			Distributions27 D27;
 			if (EvenOrOdd == true)
 			{
-				D27.f[DIR_P00] = &DD27[DIR_P00   *size_Mat];
-				D27.f[DIR_M00] = &DD27[DIR_M00   *size_Mat];
-				D27.f[DIR_0P0] = &DD27[DIR_0P0   *size_Mat];
-				D27.f[DIR_0M0] = &DD27[DIR_0M0   *size_Mat];
-				D27.f[DIR_00P] = &DD27[DIR_00P   *size_Mat];
-				D27.f[DIR_00M] = &DD27[DIR_00M   *size_Mat];
-				D27.f[DIR_PP0] = &DD27[DIR_PP0  *size_Mat];
-				D27.f[DIR_MM0] = &DD27[DIR_MM0  *size_Mat];
-				D27.f[DIR_PM0] = &DD27[DIR_PM0  *size_Mat];
-				D27.f[DIR_MP0] = &DD27[DIR_MP0  *size_Mat];
-				D27.f[DIR_P0P] = &DD27[DIR_P0P  *size_Mat];
-				D27.f[DIR_M0M] = &DD27[DIR_M0M  *size_Mat];
-				D27.f[DIR_P0M] = &DD27[DIR_P0M  *size_Mat];
-				D27.f[DIR_M0P] = &DD27[DIR_M0P  *size_Mat];
-				D27.f[DIR_0PP] = &DD27[DIR_0PP  *size_Mat];
-				D27.f[DIR_0MM] = &DD27[DIR_0MM  *size_Mat];
-				D27.f[DIR_0PM] = &DD27[DIR_0PM  *size_Mat];
-				D27.f[DIR_0MP] = &DD27[DIR_0MP  *size_Mat];
-				D27.f[DIR_000] = &DD27[DIR_000*size_Mat];
-				D27.f[DIR_PPP] = &DD27[DIR_PPP *size_Mat];
-				D27.f[DIR_MMP] = &DD27[DIR_MMP *size_Mat];
-				D27.f[DIR_PMP] = &DD27[DIR_PMP *size_Mat];
-				D27.f[DIR_MPP] = &DD27[DIR_MPP *size_Mat];
-				D27.f[DIR_PPM] = &DD27[DIR_PPM *size_Mat];
-				D27.f[DIR_MMM] = &DD27[DIR_MMM *size_Mat];
-				D27.f[DIR_PMM] = &DD27[DIR_PMM *size_Mat];
-				D27.f[DIR_MPM] = &DD27[DIR_MPM *size_Mat];
+				D27.f[DIR_P00] = &DD27[DIR_P00 * numberOfLBnodes];
+				D27.f[DIR_M00] = &DD27[DIR_M00 * numberOfLBnodes];
+				D27.f[DIR_0P0] = &DD27[DIR_0P0 * numberOfLBnodes];
+				D27.f[DIR_0M0] = &DD27[DIR_0M0 * numberOfLBnodes];
+				D27.f[DIR_00P] = &DD27[DIR_00P * numberOfLBnodes];
+				D27.f[DIR_00M] = &DD27[DIR_00M * numberOfLBnodes];
+				D27.f[DIR_PP0] = &DD27[DIR_PP0 * numberOfLBnodes];
+				D27.f[DIR_MM0] = &DD27[DIR_MM0 * numberOfLBnodes];
+				D27.f[DIR_PM0] = &DD27[DIR_PM0 * numberOfLBnodes];
+				D27.f[DIR_MP0] = &DD27[DIR_MP0 * numberOfLBnodes];
+				D27.f[DIR_P0P] = &DD27[DIR_P0P * numberOfLBnodes];
+				D27.f[DIR_M0M] = &DD27[DIR_M0M * numberOfLBnodes];
+				D27.f[DIR_P0M] = &DD27[DIR_P0M * numberOfLBnodes];
+				D27.f[DIR_M0P] = &DD27[DIR_M0P * numberOfLBnodes];
+				D27.f[DIR_0PP] = &DD27[DIR_0PP * numberOfLBnodes];
+				D27.f[DIR_0MM] = &DD27[DIR_0MM * numberOfLBnodes];
+				D27.f[DIR_0PM] = &DD27[DIR_0PM * numberOfLBnodes];
+				D27.f[DIR_0MP] = &DD27[DIR_0MP * numberOfLBnodes];
+				D27.f[DIR_000] = &DD27[DIR_000 * numberOfLBnodes];
+				D27.f[DIR_PPP] = &DD27[DIR_PPP * numberOfLBnodes];
+				D27.f[DIR_MMP] = &DD27[DIR_MMP * numberOfLBnodes];
+				D27.f[DIR_PMP] = &DD27[DIR_PMP * numberOfLBnodes];
+				D27.f[DIR_MPP] = &DD27[DIR_MPP * numberOfLBnodes];
+				D27.f[DIR_PPM] = &DD27[DIR_PPM * numberOfLBnodes];
+				D27.f[DIR_MMM] = &DD27[DIR_MMM * numberOfLBnodes];
+				D27.f[DIR_PMM] = &DD27[DIR_PMM * numberOfLBnodes];
+				D27.f[DIR_MPM] = &DD27[DIR_MPM * numberOfLBnodes];
 			}
 			else
 			{
-				D27.f[DIR_M00] = &DD27[DIR_P00   *size_Mat];
-				D27.f[DIR_P00] = &DD27[DIR_M00   *size_Mat];
-				D27.f[DIR_0M0] = &DD27[DIR_0P0   *size_Mat];
-				D27.f[DIR_0P0] = &DD27[DIR_0M0   *size_Mat];
-				D27.f[DIR_00M] = &DD27[DIR_00P   *size_Mat];
-				D27.f[DIR_00P] = &DD27[DIR_00M   *size_Mat];
-				D27.f[DIR_MM0] = &DD27[DIR_PP0  *size_Mat];
-				D27.f[DIR_PP0] = &DD27[DIR_MM0  *size_Mat];
-				D27.f[DIR_MP0] = &DD27[DIR_PM0  *size_Mat];
-				D27.f[DIR_PM0] = &DD27[DIR_MP0  *size_Mat];
-				D27.f[DIR_M0M] = &DD27[DIR_P0P  *size_Mat];
-				D27.f[DIR_P0P] = &DD27[DIR_M0M  *size_Mat];
-				D27.f[DIR_M0P] = &DD27[DIR_P0M  *size_Mat];
-				D27.f[DIR_P0M] = &DD27[DIR_M0P  *size_Mat];
-				D27.f[DIR_0MM] = &DD27[DIR_0PP  *size_Mat];
-				D27.f[DIR_0PP] = &DD27[DIR_0MM  *size_Mat];
-				D27.f[DIR_0MP] = &DD27[DIR_0PM  *size_Mat];
-				D27.f[DIR_0PM] = &DD27[DIR_0MP  *size_Mat];
-				D27.f[DIR_000] = &DD27[DIR_000*size_Mat];
-				D27.f[DIR_MMM] = &DD27[DIR_PPP *size_Mat];
-				D27.f[DIR_PPM] = &DD27[DIR_MMP *size_Mat];
-				D27.f[DIR_MPM] = &DD27[DIR_PMP *size_Mat];
-				D27.f[DIR_PMM] = &DD27[DIR_MPP *size_Mat];
-				D27.f[DIR_MMP] = &DD27[DIR_PPM *size_Mat];
-				D27.f[DIR_PPP] = &DD27[DIR_MMM *size_Mat];
-				D27.f[DIR_MPP] = &DD27[DIR_PMM *size_Mat];
-				D27.f[DIR_PMP] = &DD27[DIR_MPM *size_Mat];
+				D27.f[DIR_M00] = &DD27[DIR_P00 * numberOfLBnodes];
+				D27.f[DIR_P00] = &DD27[DIR_M00 * numberOfLBnodes];
+				D27.f[DIR_0M0] = &DD27[DIR_0P0 * numberOfLBnodes];
+				D27.f[DIR_0P0] = &DD27[DIR_0M0 * numberOfLBnodes];
+				D27.f[DIR_00M] = &DD27[DIR_00P * numberOfLBnodes];
+				D27.f[DIR_00P] = &DD27[DIR_00M * numberOfLBnodes];
+				D27.f[DIR_MM0] = &DD27[DIR_PP0 * numberOfLBnodes];
+				D27.f[DIR_PP0] = &DD27[DIR_MM0 * numberOfLBnodes];
+				D27.f[DIR_MP0] = &DD27[DIR_PM0 * numberOfLBnodes];
+				D27.f[DIR_PM0] = &DD27[DIR_MP0 * numberOfLBnodes];
+				D27.f[DIR_M0M] = &DD27[DIR_P0P * numberOfLBnodes];
+				D27.f[DIR_P0P] = &DD27[DIR_M0M * numberOfLBnodes];
+				D27.f[DIR_M0P] = &DD27[DIR_P0M * numberOfLBnodes];
+				D27.f[DIR_P0M] = &DD27[DIR_M0P * numberOfLBnodes];
+				D27.f[DIR_0MM] = &DD27[DIR_0PP * numberOfLBnodes];
+				D27.f[DIR_0PP] = &DD27[DIR_0MM * numberOfLBnodes];
+				D27.f[DIR_0MP] = &DD27[DIR_0PM * numberOfLBnodes];
+				D27.f[DIR_0PM] = &DD27[DIR_0MP * numberOfLBnodes];
+				D27.f[DIR_000] = &DD27[DIR_000 * numberOfLBnodes];
+				D27.f[DIR_MMM] = &DD27[DIR_PPP * numberOfLBnodes];
+				D27.f[DIR_PPM] = &DD27[DIR_MMP * numberOfLBnodes];
+				D27.f[DIR_MPM] = &DD27[DIR_PMP * numberOfLBnodes];
+				D27.f[DIR_PMM] = &DD27[DIR_MPP * numberOfLBnodes];
+				D27.f[DIR_MMP] = &DD27[DIR_PPM * numberOfLBnodes];
+				D27.f[DIR_PPP] = &DD27[DIR_MMM * numberOfLBnodes];
+				D27.f[DIR_MPP] = &DD27[DIR_PMM * numberOfLBnodes];
+				D27.f[DIR_PMP] = &DD27[DIR_MPM * numberOfLBnodes];
 			}
 
 			////////////////////////////////////////////////////////////////////////////////
@@ -197,33 +197,33 @@ __global__ void LB_Kernel_AD_Incomp_27(real diffusivity,
 			real fTNW = (D.f[DIR_PMM])[kbs];//kbse
 			real fTSE = (D.f[DIR_MPM])[kbw];//kbnw
 										   ////////////////////////////////////////////////////////////////////////////////
-										   //real f27E    =  (D27.f[DIR_P00   ])[k  ];//ke
-										   //real f27W    =  (D27.f[DIR_M00   ])[kw ];
-										   //real f27N    =  (D27.f[DIR_0P0   ])[k  ];//kn
-										   //real f27S    =  (D27.f[DIR_0M0   ])[ks ];
-										   //real f27T    =  (D27.f[DIR_00P   ])[k  ];//kt
-										   //real f27B    =  (D27.f[DIR_00M   ])[kb ];
-										   //real f27NE   =  (D27.f[DIR_PP0  ])[k  ];//kne
-										   //real f27SW   =  (D27.f[DIR_MM0  ])[ksw];
-										   //real f27SE   =  (D27.f[DIR_PM0  ])[ks ];//kse
-										   //real f27NW   =  (D27.f[DIR_MP0  ])[kw ];//knw
-										   //real f27TE   =  (D27.f[DIR_P0P  ])[k  ];//kte
-										   //real f27BW   =  (D27.f[DIR_M0M  ])[kbw];
-										   //real f27BE   =  (D27.f[DIR_P0M  ])[kb ];//kbe
-										   //real f27TW   =  (D27.f[DIR_M0P  ])[kw ];//ktw
-										   //real f27TN   =  (D27.f[DIR_0PP  ])[k  ];//ktn
-										   //real f27BS   =  (D27.f[DIR_0MM  ])[kbs];
-										   //real f27BN   =  (D27.f[DIR_0PM  ])[kb ];//kbn
-										   //real f27TS   =  (D27.f[DIR_0MP  ])[ks ];//kts
+										   //real f27E    =  (D27.f[DIR_P00])[k  ];//ke
+										   //real f27W    =  (D27.f[DIR_M00])[kw ];
+										   //real f27N    =  (D27.f[DIR_0P0])[k  ];//kn
+										   //real f27S    =  (D27.f[DIR_0M0])[ks ];
+										   //real f27T    =  (D27.f[DIR_00P])[k  ];//kt
+										   //real f27B    =  (D27.f[DIR_00M])[kb ];
+										   //real f27NE   =  (D27.f[DIR_PP0])[k  ];//kne
+										   //real f27SW   =  (D27.f[DIR_MM0])[ksw];
+										   //real f27SE   =  (D27.f[DIR_PM0])[ks ];//kse
+										   //real f27NW   =  (D27.f[DIR_MP0])[kw ];//knw
+										   //real f27TE   =  (D27.f[DIR_P0P])[k  ];//kte
+										   //real f27BW   =  (D27.f[DIR_M0M])[kbw];
+										   //real f27BE   =  (D27.f[DIR_P0M])[kb ];//kbe
+										   //real f27TW   =  (D27.f[DIR_M0P])[kw ];//ktw
+										   //real f27TN   =  (D27.f[DIR_0PP])[k  ];//ktn
+										   //real f27BS   =  (D27.f[DIR_0MM])[kbs];
+										   //real f27BN   =  (D27.f[DIR_0PM])[kb ];//kbn
+										   //real f27TS   =  (D27.f[DIR_0MP])[ks ];//kts
 										   //real f27ZERO =  (D27.f[DIR_000])[k  ];//kzero
-										   //real f27TNE  =  (D27.f[DIR_PPP ])[k  ];//ktne
-										   //real f27TSW  =  (D27.f[DIR_MMP ])[ksw];//ktsw
-										   //real f27TSE  =  (D27.f[DIR_PMP ])[ks ];//ktse
-										   //real f27TNW  =  (D27.f[DIR_MPP ])[kw ];//ktnw
-										   //real f27BNE  =  (D27.f[DIR_PPM ])[kb ];//kbne
-										   //real f27BSW  =  (D27.f[DIR_MMM ])[kbsw];
-										   //real f27BSE  =  (D27.f[DIR_PMM ])[kbs];//kbse
-										   //real f27BNW  =  (D27.f[DIR_MPM ])[kbw];//kbnw
+										   //real f27TNE  =  (D27.f[DIR_PPP])[k  ];//ktne
+										   //real f27TSW  =  (D27.f[DIR_MMP])[ksw];//ktsw
+										   //real f27TSE  =  (D27.f[DIR_PMP])[ks ];//ktse
+										   //real f27TNW  =  (D27.f[DIR_MPP])[kw ];//ktnw
+										   //real f27BNE  =  (D27.f[DIR_PPM])[kb ];//kbne
+										   //real f27BSW  =  (D27.f[DIR_MMM])[kbsw];
+										   //real f27BSE  =  (D27.f[DIR_PMM])[kbs];//kbse
+										   //real f27BNW  =  (D27.f[DIR_MPM])[kbw];//kbnw
 										   ////////////////////////////////////////////////////////////////////////////////
 										   //real vx1     =  ((fTNE-fBSW)+(fBNE-fTSW)+(fTSE-fBNW)+(fBSE-fTNW) +(fNE-fSW)+(fSE-fNW)+(fTE-fBW)+(fBE-fTW)+(fE-fW));
 										   //real vx2     =  ((fTNE-fBSW)+(fBNE-fTSW)+(fBNW-fTSE)+(fTNW-fBSE) +(fNE-fSW)+(fNW-fSE)+(fTN-fBS)+(fBN-fTS)+(fN-fS));
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/AdvectionDiffusion/Incompressible/Mod27/ADIncomp27/ADIncomp27_Device.cuh b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/AdvectionDiffusion/Incompressible/Mod27/ADIncomp27/ADIncomp27_Device.cuh
index a6d94de4fadb9a93a9e5fed63d87731b12ec2a07..3abee563f676910f422bba0930060c2a0b0c0e21 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/AdvectionDiffusion/Incompressible/Mod27/ADIncomp27/ADIncomp27_Device.cuh
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/AdvectionDiffusion/Incompressible/Mod27/ADIncomp27/ADIncomp27_Device.cuh
@@ -11,7 +11,7 @@ __global__ void LB_Kernel_AD_Incomp_27(real diffusivity,
 	unsigned int* neighborZ,
 	real* DDStart,
 	real* DD27,
-	int size_Mat,
+	unsigned long long numberOfLBnodes,
 	bool EvenOrOdd);
 
 #endif
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/AdvectionDiffusion/Incompressible/Mod7/ADIncomp7/ADIncomp7.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/AdvectionDiffusion/Incompressible/Mod7/ADIncomp7/ADIncomp7.cu
index 27da776eb7612307fa4f9af2886594fc0c75d90b..d0c6a6a24ab4d0ebebee9324bdafa1f9e3db51b9 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/AdvectionDiffusion/Incompressible/Mod7/ADIncomp7/ADIncomp7.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/AdvectionDiffusion/Incompressible/Mod7/ADIncomp7/ADIncomp7.cu
@@ -2,6 +2,7 @@
 
 #include "ADIncomp7_Device.cuh"
 #include "Parameter/Parameter.h"
+#include "cuda/CudaGrid.h"
 
 std::shared_ptr<ADIncomp7> ADIncomp7::getNewInstance(std::shared_ptr<Parameter> para, int level)
 {
@@ -10,34 +11,19 @@ std::shared_ptr<ADIncomp7> ADIncomp7::getNewInstance(std::shared_ptr<Parameter>
 
 void ADIncomp7::run()
 {
-	int size_Mat = para->getParD(level)->numberOfNodes;
-	int numberOfThreads = para->getParD(level)->numberofthreads;
-
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	LB_Kernel_AD_Incomp_7 << < grid, threads >> >(	para->getParD(level)->diffusivity, 
-													para->getParD(level)->typeOfGridNode,
-													para->getParD(level)->neighborX, 
-													para->getParD(level)->neighborY, 
-													para->getParD(level)->neighborZ,
-													para->getParD(level)->distributions.f[0],
-													para->getParD(level)->distributionsAD7.f[0], 
-													para->getParD(level)->numberOfNodes,
-													para->getParD(level)->isEvenTimestep);
-	getLastCudaError("LB_Kernel_AD_Incomp_7 execution failed");
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(para->getParD(level)->numberofthreads, para->getParD(level)->numberOfNodes); // launch configuration for this level, built from its thread count and node count (type comes from cuda/CudaGrid.h)
+
+    LB_Kernel_AD_Incomp_7<<< grid.grid, grid.threads >>>(
+        para->getParD(level)->diffusivity, 
+        para->getParD(level)->typeOfGridNode,
+        para->getParD(level)->neighborX, 
+        para->getParD(level)->neighborY, 
+        para->getParD(level)->neighborZ,
+        para->getParD(level)->distributions.f[0], // kernel parameter DDStart
+        para->getParD(level)->distributionsAD7.f[0], // kernel parameter DD7
+        para->getParD(level)->numberOfNodes, // kernel parameter numberOfLBnodes: index bound and per-direction stride inside the kernel
+        para->getParD(level)->isEvenTimestep); // kernel parameter EvenOrOdd: selects which direction-pointer layout the kernel sets up
+    getLastCudaError("LB_Kernel_AD_Incomp_7 execution failed"); // NOTE(review): presumably surfaces an error from the launch above; the helper is defined outside this file - confirm
 }
 
 ADIncomp7::ADIncomp7(std::shared_ptr<Parameter> para, int level)
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/AdvectionDiffusion/Incompressible/Mod7/ADIncomp7/ADIncomp7_Device.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/AdvectionDiffusion/Incompressible/Mod7/ADIncomp7/ADIncomp7_Device.cu
index d49b0b48d20d976076a52f804d485b68da55348e..e0bcc4e515b1b2ccf71f1050e2d572b60a40d94b 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/AdvectionDiffusion/Incompressible/Mod7/ADIncomp7/ADIncomp7_Device.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/AdvectionDiffusion/Incompressible/Mod7/ADIncomp7/ADIncomp7_Device.cu
@@ -13,7 +13,7 @@ __global__ void LB_Kernel_AD_Incomp_7(real diffusivity,
 	unsigned int* neighborZ,
 	real* DDStart,
 	real* DD7,
-	int size_Mat,
+	unsigned long long numberOfLBnodes, // node count: upper bound for the thread index k and element stride between the per-direction slices of DDStart and DD7
 	bool EvenOrOdd)
 {
 	////////////////////////////////////////////////////////////////////////////////
@@ -27,7 +27,7 @@ __global__ void LB_Kernel_AD_Incomp_7(real diffusivity,
 	const unsigned k = nx*(ny*z + y) + x;
 	//////////////////////////////////////////////////////////////////////////
 
-	if (k<size_Mat)
+	if (k<numberOfLBnodes) // skip threads whose index lies past the last node. NOTE(review): k is declared 'const unsigned' while numberOfLBnodes is unsigned long long; k is widened for this comparison, but node counts above UINT_MAX would not be reachable through k - confirm that is acceptable
 	{
 		////////////////////////////////////////////////////////////////////////////////
 		unsigned int BC;
@@ -38,85 +38,85 @@ __global__ void LB_Kernel_AD_Incomp_7(real diffusivity,
 			Distributions27 D;
 			if (EvenOrOdd == true)
 			{
-				D.f[DIR_P00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_M00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_P00 * numberOfLBnodes];
+				D.f[DIR_M00] = &DDStart[DIR_M00 * numberOfLBnodes];
+				D.f[DIR_0P0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+				D.f[DIR_0M0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+				D.f[DIR_00P] = &DDStart[DIR_00P * numberOfLBnodes];
+				D.f[DIR_00M] = &DDStart[DIR_00M * numberOfLBnodes];
+				D.f[DIR_PP0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+				D.f[DIR_MM0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+				D.f[DIR_PM0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+				D.f[DIR_MP0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+				D.f[DIR_P0P] = &DDStart[DIR_P0P * numberOfLBnodes];
+				D.f[DIR_M0M] = &DDStart[DIR_M0M * numberOfLBnodes];
+				D.f[DIR_P0M] = &DDStart[DIR_P0M * numberOfLBnodes];
+				D.f[DIR_M0P] = &DDStart[DIR_M0P * numberOfLBnodes];
+				D.f[DIR_0PP] = &DDStart[DIR_0PP * numberOfLBnodes];
+				D.f[DIR_0MM] = &DDStart[DIR_0MM * numberOfLBnodes];
+				D.f[DIR_0PM] = &DDStart[DIR_0PM * numberOfLBnodes];
+				D.f[DIR_0MP] = &DDStart[DIR_0MP * numberOfLBnodes];
+				D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+				D.f[DIR_PPP] = &DDStart[DIR_PPP * numberOfLBnodes];
+				D.f[DIR_MMP] = &DDStart[DIR_MMP * numberOfLBnodes];
+				D.f[DIR_PMP] = &DDStart[DIR_PMP * numberOfLBnodes];
+				D.f[DIR_MPP] = &DDStart[DIR_MPP * numberOfLBnodes];
+				D.f[DIR_PPM] = &DDStart[DIR_PPM * numberOfLBnodes];
+				D.f[DIR_MMM] = &DDStart[DIR_MMM * numberOfLBnodes];
+				D.f[DIR_PMM] = &DDStart[DIR_PMM * numberOfLBnodes];
+				D.f[DIR_MPM] = &DDStart[DIR_MPM * numberOfLBnodes];
 			}
 			else
 			{
-				D.f[DIR_M00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_P00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_P00 * numberOfLBnodes];
+				D.f[DIR_P00] = &DDStart[DIR_M00 * numberOfLBnodes];
+				D.f[DIR_0M0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+				D.f[DIR_0P0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+				D.f[DIR_00M] = &DDStart[DIR_00P * numberOfLBnodes];
+				D.f[DIR_00P] = &DDStart[DIR_00M * numberOfLBnodes];
+				D.f[DIR_MM0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+				D.f[DIR_PP0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+				D.f[DIR_MP0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+				D.f[DIR_PM0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+				D.f[DIR_M0M] = &DDStart[DIR_P0P * numberOfLBnodes];
+				D.f[DIR_P0P] = &DDStart[DIR_M0M * numberOfLBnodes];
+				D.f[DIR_M0P] = &DDStart[DIR_P0M * numberOfLBnodes];
+				D.f[DIR_P0M] = &DDStart[DIR_M0P * numberOfLBnodes];
+				D.f[DIR_0MM] = &DDStart[DIR_0PP * numberOfLBnodes];
+				D.f[DIR_0PP] = &DDStart[DIR_0MM * numberOfLBnodes];
+				D.f[DIR_0MP] = &DDStart[DIR_0PM * numberOfLBnodes];
+				D.f[DIR_0PM] = &DDStart[DIR_0MP * numberOfLBnodes];
+				D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+				D.f[DIR_MMM] = &DDStart[DIR_PPP * numberOfLBnodes];
+				D.f[DIR_PPM] = &DDStart[DIR_MMP * numberOfLBnodes];
+				D.f[DIR_MPM] = &DDStart[DIR_PMP * numberOfLBnodes];
+				D.f[DIR_PMM] = &DDStart[DIR_MPP * numberOfLBnodes];
+				D.f[DIR_MMP] = &DDStart[DIR_PPM * numberOfLBnodes];
+				D.f[DIR_PPP] = &DDStart[DIR_MMM * numberOfLBnodes];
+				D.f[DIR_MPP] = &DDStart[DIR_PMM * numberOfLBnodes];
+				D.f[DIR_PMP] = &DDStart[DIR_MPM * numberOfLBnodes];
 			}
 
 			Distributions7 D7;
 			if (EvenOrOdd == true)
 			{
-				D7.f[0] = &DD7[0 * size_Mat];
-				D7.f[1] = &DD7[1 * size_Mat];
-				D7.f[2] = &DD7[2 * size_Mat];
-				D7.f[3] = &DD7[3 * size_Mat];
-				D7.f[4] = &DD7[4 * size_Mat];
-				D7.f[5] = &DD7[5 * size_Mat];
-				D7.f[6] = &DD7[6 * size_Mat];
+				D7.f[0] = &DD7[0 * numberOfLBnodes];
+				D7.f[1] = &DD7[1 * numberOfLBnodes];
+				D7.f[2] = &DD7[2 * numberOfLBnodes];
+				D7.f[3] = &DD7[3 * numberOfLBnodes];
+				D7.f[4] = &DD7[4 * numberOfLBnodes];
+				D7.f[5] = &DD7[5 * numberOfLBnodes];
+				D7.f[6] = &DD7[6 * numberOfLBnodes];
 			}
 			else
 			{
-				D7.f[0] = &DD7[0 * size_Mat];
-				D7.f[2] = &DD7[1 * size_Mat];
-				D7.f[1] = &DD7[2 * size_Mat];
-				D7.f[4] = &DD7[3 * size_Mat];
-				D7.f[3] = &DD7[4 * size_Mat];
-				D7.f[6] = &DD7[5 * size_Mat];
-				D7.f[5] = &DD7[6 * size_Mat];
+				D7.f[0] = &DD7[0 * numberOfLBnodes];
+				D7.f[2] = &DD7[1 * numberOfLBnodes];
+				D7.f[1] = &DD7[2 * numberOfLBnodes];
+				D7.f[4] = &DD7[3 * numberOfLBnodes];
+				D7.f[3] = &DD7[4 * numberOfLBnodes];
+				D7.f[6] = &DD7[5 * numberOfLBnodes];
+				D7.f[5] = &DD7[6 * numberOfLBnodes];
 			}
 
 			////////////////////////////////////////////////////////////////////////////////
@@ -156,33 +156,33 @@ __global__ void LB_Kernel_AD_Incomp_7(real diffusivity,
 			real fTNE = (D.f[DIR_MMM])[kbsw];
 			real fTNW = (D.f[DIR_PMM])[kbs];//kbse
 			real fTSE = (D.f[DIR_MPM])[kbw];//kbnw
-										   //real fE    =  (D.f[DIR_P00   ])[k  ];//ke
-										   //real fW    =  (D.f[DIR_M00   ])[kw ];
-										   //real fN    =  (D.f[DIR_0P0   ])[k  ];//kn
-										   //real fS    =  (D.f[DIR_0M0   ])[ks ];
-										   //real fT    =  (D.f[DIR_00P   ])[k  ];//kt
-										   //real fB    =  (D.f[DIR_00M   ])[kb ];
-										   //real fNE   =  (D.f[DIR_PP0  ])[k  ];//kne
-										   //real fSW   =  (D.f[DIR_MM0  ])[ksw];
-										   //real fSE   =  (D.f[DIR_PM0  ])[ks ];//kse
-										   //real fNW   =  (D.f[DIR_MP0  ])[kw ];//knw
-										   //real fTE   =  (D.f[DIR_P0P  ])[k  ];//kte
-										   //real fBW   =  (D.f[DIR_M0M  ])[kbw];
-										   //real fBE   =  (D.f[DIR_P0M  ])[kb ];//kbe
-										   //real fTW   =  (D.f[DIR_M0P  ])[kw ];//ktw
-										   //real fTN   =  (D.f[DIR_0PP  ])[k  ];//ktn
-										   //real fBS   =  (D.f[DIR_0MM  ])[kbs];
-										   //real fBN   =  (D.f[DIR_0PM  ])[kb ];//kbn
-										   //real fTS   =  (D.f[DIR_0MP  ])[ks ];//kts
+										   //real fE    =  (D.f[DIR_P00])[k  ];//ke
+										   //real fW    =  (D.f[DIR_M00])[kw ];
+										   //real fN    =  (D.f[DIR_0P0])[k  ];//kn
+										   //real fS    =  (D.f[DIR_0M0])[ks ];
+										   //real fT    =  (D.f[DIR_00P])[k  ];//kt
+										   //real fB    =  (D.f[DIR_00M])[kb ];
+										   //real fNE   =  (D.f[DIR_PP0])[k  ];//kne
+										   //real fSW   =  (D.f[DIR_MM0])[ksw];
+										   //real fSE   =  (D.f[DIR_PM0])[ks ];//kse
+										   //real fNW   =  (D.f[DIR_MP0])[kw ];//knw
+										   //real fTE   =  (D.f[DIR_P0P])[k  ];//kte
+										   //real fBW   =  (D.f[DIR_M0M])[kbw];
+										   //real fBE   =  (D.f[DIR_P0M])[kb ];//kbe
+										   //real fTW   =  (D.f[DIR_M0P])[kw ];//ktw
+										   //real fTN   =  (D.f[DIR_0PP])[k  ];//ktn
+										   //real fBS   =  (D.f[DIR_0MM])[kbs];
+										   //real fBN   =  (D.f[DIR_0PM])[kb ];//kbn
+										   //real fTS   =  (D.f[DIR_0MP])[ks ];//kts
 										   //real fZERO =  (D.f[DIR_000])[k  ];//kzero
-										   //real fTNE   = (D.f[DIR_PPP ])[k  ];//ktne
-										   //real fTSW   = (D.f[DIR_MMP ])[ksw];//ktsw
-										   //real fTSE   = (D.f[DIR_PMP ])[ks ];//ktse
-										   //real fTNW   = (D.f[DIR_MPP ])[kw ];//ktnw
-										   //real fBNE   = (D.f[DIR_PPM ])[kb ];//kbne
-										   //real fBSW   = (D.f[DIR_MMM ])[kbsw];
-										   //real fBSE   = (D.f[DIR_PMM ])[kbs];//kbse
-										   //real fBNW   = (D.f[DIR_MPM ])[kbw];//kbnw
+										   //real fTNE   = (D.f[DIR_PPP])[k  ];//ktne
+										   //real fTSW   = (D.f[DIR_MMP])[ksw];//ktsw
+										   //real fTSE   = (D.f[DIR_PMP])[ks ];//ktse
+										   //real fTNW   = (D.f[DIR_MPP])[kw ];//ktnw
+										   //real fBNE   = (D.f[DIR_PPM])[kb ];//kbne
+										   //real fBSW   = (D.f[DIR_MMM])[kbsw];
+										   //real fBSE   = (D.f[DIR_PMM])[kbs];//kbse
+										   //real fBNW   = (D.f[DIR_MPM])[kbw];//kbnw
 										   ////////////////////////////////////////////////////////////////////////////////
 			real f7ZERO = (D7.f[0])[k];
 			real f7E = (D7.f[1])[k];
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/AdvectionDiffusion/Incompressible/Mod7/ADIncomp7/ADIncomp7_Device.cuh b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/AdvectionDiffusion/Incompressible/Mod7/ADIncomp7/ADIncomp7_Device.cuh
index 25a17ddbd7038635a2beb2c39212822cbf762034..845ecda946a4e45678082b72b5c74dc96e5810c5 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/AdvectionDiffusion/Incompressible/Mod7/ADIncomp7/ADIncomp7_Device.cuh
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/AdvectionDiffusion/Incompressible/Mod7/ADIncomp7/ADIncomp7_Device.cuh
@@ -11,7 +11,7 @@ __global__ void LB_Kernel_AD_Incomp_7(real diffusivity,
 	unsigned int* neighborZ,
 	real* DDStart,
 	real* DD7,
-	int size_Mat,
+	unsigned long long numberOfLBnodes,
 	bool EvenOrOdd);
 
 #endif
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/BGK/BGKCompSP27.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/BGK/BGKCompSP27.cu
index d2f9f60890379d07ecc3d04f4a54d59a0754907a..8c99f3b030984aef6215d5479be4b321145ee54f 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/BGK/BGKCompSP27.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/BGK/BGKCompSP27.cu
@@ -2,6 +2,7 @@
 
 #include "BGKCompSP27_Device.cuh"
 #include "Parameter/Parameter.h"
+#include "cuda/CudaGrid.h"
 
 std::shared_ptr<BGKCompSP27> BGKCompSP27::getNewInstance(std::shared_ptr<Parameter> para, int level)
 {
@@ -10,33 +11,18 @@ std::shared_ptr<BGKCompSP27> BGKCompSP27::getNewInstance(std::shared_ptr<Paramet
 
 void BGKCompSP27::run()
 {
-	int numberOfThreads = para->getParD(level)->numberofthreads;
-	int size_Mat = para->getParD(level)->numberOfNodes;
-
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	LB_Kernel_BGK_Comp_SP_27 << < grid, threads >> >(	para->getParD(level)->omega,
-														para->getParD(level)->typeOfGridNode,
-														para->getParD(level)->neighborX,
-														para->getParD(level)->neighborY,
-														para->getParD(level)->neighborZ,
-														para->getParD(level)->distributions.f[0],
-														para->getParD(level)->numberOfNodes,
-														para->getParD(level)->isEvenTimestep);
-	getLastCudaError("LB_Kernel_BGK_Comp_SP_27 execution failed");
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(para->getParD(level)->numberofthreads, para->getParD(level)->numberOfNodes);
+
+    LB_Kernel_BGK_Comp_SP_27<<< grid.grid, grid.threads >>>(
+        para->getParD(level)->omega,
+        para->getParD(level)->typeOfGridNode,
+        para->getParD(level)->neighborX,
+        para->getParD(level)->neighborY,
+        para->getParD(level)->neighborZ,
+        para->getParD(level)->distributions.f[0],
+        para->getParD(level)->numberOfNodes,
+        para->getParD(level)->isEvenTimestep);
+    getLastCudaError("LB_Kernel_BGK_Comp_SP_27 execution failed");
 }
 
 BGKCompSP27::BGKCompSP27(std::shared_ptr<Parameter> para, int level)
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/BGK/BGKCompSP27_Device.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/BGK/BGKCompSP27_Device.cu
index 09196d13e94a2404ba280e8a8e9394f0a79e8211..3bdb65c455bd67d66e8b35961f2fa7e1de45f763 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/BGK/BGKCompSP27_Device.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/BGK/BGKCompSP27_Device.cu
@@ -38,63 +38,63 @@ __global__ void LB_Kernel_BGK_Comp_SP_27(	real omega,
 			Distributions27 D;
 			if (EvenOrOdd == true)
 			{
-				D.f[DIR_P00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_M00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_MPM * size_Mat];
 			}
 			else
 			{
-				D.f[DIR_M00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_P00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_MPM * size_Mat];
 			}
 
 			////////////////////////////////////////////////////////////////////////////////
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/BGKPlus/BGKPlusCompSP27.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/BGKPlus/BGKPlusCompSP27.cu
index beebda2437ca4e7385ab812b9106edabe213227e..a4b136d1c21b1e4c68432eef5e21ff8c968bdfec 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/BGKPlus/BGKPlusCompSP27.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/BGKPlus/BGKPlusCompSP27.cu
@@ -2,6 +2,7 @@
 
 #include "BGKPlusCompSP27_Device.cuh"
 #include "Parameter/Parameter.h"
+#include "cuda/CudaGrid.h"
 
 std::shared_ptr<BGKPlusCompSP27> BGKPlusCompSP27::getNewInstance(std::shared_ptr<Parameter> para, int level)
 {
@@ -10,33 +11,18 @@ std::shared_ptr<BGKPlusCompSP27> BGKPlusCompSP27::getNewInstance(std::shared_ptr
 
 void BGKPlusCompSP27::run()
 {
-	int numberOfThreads = para->getParD(level)->numberofthreads;
-	int size_Mat = para->getParD(level)->numberOfNodes;
-
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	LB_Kernel_BGK_Plus_Comp_SP_27 << < grid, threads >> >(	para->getParD(level)->omega,
-															para->getParD(level)->typeOfGridNode,
-															para->getParD(level)->neighborX,
-															para->getParD(level)->neighborY,
-															para->getParD(level)->neighborZ,
-															para->getParD(level)->distributions.f[0],
-															size_Mat,
-															para->getParD(level)->isEvenTimestep);
-	getLastCudaError("LB_Kernel_BGK_Plus_Comp_SP_27 execution failed");
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(para->getParD(level)->numberofthreads, para->getParD(level)->numberOfNodes);
+
+    LB_Kernel_BGK_Plus_Comp_SP_27 <<< grid.grid, grid.threads >>>(
+        para->getParD(level)->omega,
+        para->getParD(level)->typeOfGridNode,
+        para->getParD(level)->neighborX,
+        para->getParD(level)->neighborY,
+        para->getParD(level)->neighborZ,
+        para->getParD(level)->distributions.f[0],
+        para->getParD(level)->numberOfNodes,
+        para->getParD(level)->isEvenTimestep);
+    getLastCudaError("LB_Kernel_BGK_Plus_Comp_SP_27 execution failed");
 }
 
 BGKPlusCompSP27::BGKPlusCompSP27(std::shared_ptr<Parameter> para, int level)
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/BGKPlus/BGKPlusCompSP27_Device.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/BGKPlus/BGKPlusCompSP27_Device.cu
index 325f65ece9baddf88adc91baa753bdfc4bd0eced..1f44fee9ea8b20241f87bea6310c96db2b82d1c4 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/BGKPlus/BGKPlusCompSP27_Device.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/BGKPlus/BGKPlusCompSP27_Device.cu
@@ -38,63 +38,63 @@ __global__ void LB_Kernel_BGK_Plus_Comp_SP_27(
 			Distributions27 D;
 			if (EvenOrOdd == true)
 			{
-				D.f[DIR_P00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_M00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_MPM * size_Mat];
 			}
 			else
 			{
-				D.f[DIR_M00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_P00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_MPM * size_Mat];
 			}
 
 			////////////////////////////////////////////////////////////////////////////////
@@ -127,33 +127,33 @@ __global__ void LB_Kernel_BGK_Plus_Comp_SP_27(
 			//unsigned int ktne = k;
 			unsigned int kbsw = neighborZ[ksw];
 			//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-			real mfcbb = (D.f[DIR_P00])[k];//[ke   ];// +  c2over27 ;(D.f[DIR_P00   ])[k  ];//ke
-			real mfabb = (D.f[DIR_M00])[kw];//[kw   ];// +  c2over27 ;(D.f[DIR_M00   ])[kw ];
-			real mfbcb = (D.f[DIR_0P0])[k];//[kn   ];// +  c2over27 ;(D.f[DIR_0P0   ])[k  ];//kn
-			real mfbab = (D.f[DIR_0M0])[ks];//[ks   ];// +  c2over27 ;(D.f[DIR_0M0   ])[ks ];
-			real mfbbc = (D.f[DIR_00P])[k];//[kt   ];// +  c2over27 ;(D.f[DIR_00P   ])[k  ];//kt
-			real mfbba = (D.f[DIR_00M])[kb];//[kb   ];// +  c2over27 ;(D.f[DIR_00M   ])[kb ];
-			real mfccb = (D.f[DIR_PP0])[k];//[kne  ];// +  c1over54 ;(D.f[DIR_PP0  ])[k  ];//kne
-			real mfaab = (D.f[DIR_MM0])[ksw];//[ksw  ];// +  c1over54 ;(D.f[DIR_MM0  ])[ksw];
-			real mfcab = (D.f[DIR_PM0])[ks];//[kse  ];// +  c1over54 ;(D.f[DIR_PM0  ])[ks ];//kse
-			real mfacb = (D.f[DIR_MP0])[kw];//[knw  ];// +  c1over54 ;(D.f[DIR_MP0  ])[kw ];//knw
-			real mfcbc = (D.f[DIR_P0P])[k];//[kte  ];// +  c1over54 ;(D.f[DIR_P0P  ])[k  ];//kte
-			real mfaba = (D.f[DIR_M0M])[kbw];//[kbw  ];// +  c1over54 ;(D.f[DIR_M0M  ])[kbw];
-			real mfcba = (D.f[DIR_P0M])[kb];//[kbe  ];// +  c1over54 ;(D.f[DIR_P0M  ])[kb ];//kbe
-			real mfabc = (D.f[DIR_M0P])[kw];//[ktw  ];// +  c1over54 ;(D.f[DIR_M0P  ])[kw ];//ktw
-			real mfbcc = (D.f[DIR_0PP])[k];//[ktn  ];// +  c1over54 ;(D.f[DIR_0PP  ])[k  ];//ktn
-			real mfbaa = (D.f[DIR_0MM])[kbs];//[kbs  ];// +  c1over54 ;(D.f[DIR_0MM  ])[kbs];
-			real mfbca = (D.f[DIR_0PM])[kb];//[kbn  ];// +  c1over54 ;(D.f[DIR_0PM  ])[kb ];//kbn
-			real mfbac = (D.f[DIR_0MP])[ks];//[kts  ];// +  c1over54 ;(D.f[DIR_0MP  ])[ks ];//kts
+			real mfcbb = (D.f[DIR_P00])[k];//[ke   ];// +  c2over27 ;(D.f[DIR_P00])[k  ];//ke
+			real mfabb = (D.f[DIR_M00])[kw];//[kw   ];// +  c2over27 ;(D.f[DIR_M00])[kw ];
+			real mfbcb = (D.f[DIR_0P0])[k];//[kn   ];// +  c2over27 ;(D.f[DIR_0P0])[k  ];//kn
+			real mfbab = (D.f[DIR_0M0])[ks];//[ks   ];// +  c2over27 ;(D.f[DIR_0M0])[ks ];
+			real mfbbc = (D.f[DIR_00P])[k];//[kt   ];// +  c2over27 ;(D.f[DIR_00P])[k  ];//kt
+			real mfbba = (D.f[DIR_00M])[kb];//[kb   ];// +  c2over27 ;(D.f[DIR_00M])[kb ];
+			real mfccb = (D.f[DIR_PP0])[k];//[kne  ];// +  c1over54 ;(D.f[DIR_PP0])[k  ];//kne
+			real mfaab = (D.f[DIR_MM0])[ksw];//[ksw  ];// +  c1over54 ;(D.f[DIR_MM0])[ksw];
+			real mfcab = (D.f[DIR_PM0])[ks];//[kse  ];// +  c1over54 ;(D.f[DIR_PM0])[ks ];//kse
+			real mfacb = (D.f[DIR_MP0])[kw];//[knw  ];// +  c1over54 ;(D.f[DIR_MP0])[kw ];//knw
+			real mfcbc = (D.f[DIR_P0P])[k];//[kte  ];// +  c1over54 ;(D.f[DIR_P0P])[k  ];//kte
+			real mfaba = (D.f[DIR_M0M])[kbw];//[kbw  ];// +  c1over54 ;(D.f[DIR_M0M])[kbw];
+			real mfcba = (D.f[DIR_P0M])[kb];//[kbe  ];// +  c1over54 ;(D.f[DIR_P0M])[kb ];//kbe
+			real mfabc = (D.f[DIR_M0P])[kw];//[ktw  ];// +  c1over54 ;(D.f[DIR_M0P])[kw ];//ktw
+			real mfbcc = (D.f[DIR_0PP])[k];//[ktn  ];// +  c1over54 ;(D.f[DIR_0PP])[k  ];//ktn
+			real mfbaa = (D.f[DIR_0MM])[kbs];//[kbs  ];// +  c1over54 ;(D.f[DIR_0MM])[kbs];
+			real mfbca = (D.f[DIR_0PM])[kb];//[kbn  ];// +  c1over54 ;(D.f[DIR_0PM])[kb ];//kbn
+			real mfbac = (D.f[DIR_0MP])[ks];//[kts  ];// +  c1over54 ;(D.f[DIR_0MP])[ks ];//kts
 			real mfbbb = (D.f[DIR_000])[k];//[kzero];// +  c8over27 ;(D.f[DIR_000])[k  ];//kzero
-			real mfccc = (D.f[DIR_PPP])[k];//[ktne ];// +  c1over216;(D.f[DIR_PPP ])[k  ];//ktne
-			real mfaac = (D.f[DIR_MMP])[ksw];//[ktsw ];// +  c1over216;(D.f[DIR_MMP ])[ksw];//ktsw
-			real mfcac = (D.f[DIR_PMP])[ks];//[ktse ];// +  c1over216;(D.f[DIR_PMP ])[ks ];//ktse
-			real mfacc = (D.f[DIR_MPP])[kw];//[ktnw ];// +  c1over216;(D.f[DIR_MPP ])[kw ];//ktnw
-			real mfcca = (D.f[DIR_PPM])[kb];//[kbne ];// +  c1over216;(D.f[DIR_PPM ])[kb ];//kbne
-			real mfaaa = (D.f[DIR_MMM])[kbsw];//[kbsw ];// +  c1over216;(D.f[DIR_MMM ])[kbsw];
-			real mfcaa = (D.f[DIR_PMM])[kbs];//[kbse ];// +  c1over216;(D.f[DIR_PMM ])[kbs];//kbse
-			real mfaca = (D.f[DIR_MPM])[kbw];//[kbnw ];// +  c1over216;(D.f[DIR_MPM ])[kbw];//kbnw
+			real mfccc = (D.f[DIR_PPP])[k];//[ktne ];// +  c1over216;(D.f[DIR_PPP])[k  ];//ktne
+			real mfaac = (D.f[DIR_MMP])[ksw];//[ktsw ];// +  c1over216;(D.f[DIR_MMP])[ksw];//ktsw
+			real mfcac = (D.f[DIR_PMP])[ks];//[ktse ];// +  c1over216;(D.f[DIR_PMP])[ks ];//ktse
+			real mfacc = (D.f[DIR_MPP])[kw];//[ktnw ];// +  c1over216;(D.f[DIR_MPP])[kw ];//ktnw
+			real mfcca = (D.f[DIR_PPM])[kb];//[kbne ];// +  c1over216;(D.f[DIR_PPM])[kb ];//kbne
+			real mfaaa = (D.f[DIR_MMM])[kbsw];//[kbsw ];// +  c1over216;(D.f[DIR_MMM])[kbsw];
+			real mfcaa = (D.f[DIR_PMM])[kbs];//[kbse ];// +  c1over216;(D.f[DIR_PMM])[kbs];//kbse
+			real mfaca = (D.f[DIR_MPM])[kbw];//[kbnw ];// +  c1over216;(D.f[DIR_MPM])[kbw];//kbnw
 											////////////////////////////////////////////////////////////////////////////////////
 											//slow
 											//real oMdrho = one - ((((mfccc+mfaaa) + (mfaca+mfcac)) + ((mfacc+mfcaa) + (mfaac+mfcca))) + 
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/BGKUnified/BGKUnified.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/BGKUnified/BGKUnified.cu
index 3d7f6fb9a8980454ebc83c51c7dd8865688fa166..1107d343801f8ac3626b03a93ca92415217732ac 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/BGKUnified/BGKUnified.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/BGKUnified/BGKUnified.cu
@@ -6,6 +6,7 @@
 #include "../RunLBMKernel.cuh"
 
 #include <lbm/BGK.h>
+#include <lbm/KernelParameter.h>
 
 
 namespace vf
@@ -31,15 +32,16 @@ BGKUnified::BGKUnified(std::shared_ptr<Parameter> para, int level)
 
 void BGKUnified::run()
 {
-    GPUKernelParameter kernelParameter{ para->getParD(level)->omega,
-                                                 para->getParD(level)->typeOfGridNode,
-                                                 para->getParD(level)->neighborX,
-                                                 para->getParD(level)->neighborY,
-                                                 para->getParD(level)->neighborZ,
-                                                 para->getParD(level)->distributions.f[0],
-                                                 (int)para->getParD(level)->numberOfNodes,
-                                                 nullptr, /* forces not used in bgk kernel */
-                                                 para->getParD(level)->isEvenTimestep };
+    GPUKernelParameter kernelParameter{
+        para->getParD(level)->omega,
+        para->getParD(level)->typeOfGridNode,
+        para->getParD(level)->neighborX,
+        para->getParD(level)->neighborY,
+        para->getParD(level)->neighborZ,
+        para->getParD(level)->distributions.f[0],
+        (int)para->getParD(level)->numberOfNodes,
+        nullptr, /* forces not used in bgk kernel */
+        para->getParD(level)->isEvenTimestep };
 
     auto lambda = [] __device__(lbm::KernelParameter parameter) {
         return lbm::bgk(parameter);
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/Cascade/CascadeCompSP27.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/Cascade/CascadeCompSP27.cu
index eca3a9953024e44fd91e7f9f98956e4329574d09..dcfda06db462fd83120751a32a40365445d659ba 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/Cascade/CascadeCompSP27.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/Cascade/CascadeCompSP27.cu
@@ -2,6 +2,7 @@
 
 #include "CascadeCompSP27_Device.cuh"
 #include "Parameter/Parameter.h"
+#include "cuda/CudaGrid.h"
 
 std::shared_ptr<CascadeCompSP27> CascadeCompSP27::getNewInstance(std::shared_ptr<Parameter> para, int level)
 {
@@ -10,33 +11,18 @@ std::shared_ptr<CascadeCompSP27> CascadeCompSP27::getNewInstance(std::shared_ptr
 
 void CascadeCompSP27::run()
 {
-	int size_Mat = para->getParD(level)->numberOfNodes;
-	int numberOfThreads = para->getParD(level)->numberofthreads;
-
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	LB_Kernel_Cascade_Comp_SP_27 << < grid, threads >> >(	para->getParD(level)->omega,
-															para->getParD(level)->typeOfGridNode,
-															para->getParD(level)->neighborX,
-															para->getParD(level)->neighborY,
-															para->getParD(level)->neighborZ,
-															para->getParD(level)->distributions.f[0],
-															para->getParD(level)->numberOfNodes,
-															para->getParD(level)->isEvenTimestep);
-	getLastCudaError("LB_Kernel_Cascade_Comp_SP_27 execution failed");
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(para->getParD(level)->numberofthreads, para->getParD(level)->numberOfNodes);
+
+    LB_Kernel_Cascade_Comp_SP_27 <<< grid.grid, grid.threads >>>(
+        para->getParD(level)->omega,
+        para->getParD(level)->typeOfGridNode,
+        para->getParD(level)->neighborX,
+        para->getParD(level)->neighborY,
+        para->getParD(level)->neighborZ,
+        para->getParD(level)->distributions.f[0],
+        para->getParD(level)->numberOfNodes,
+        para->getParD(level)->isEvenTimestep);
+    getLastCudaError("LB_Kernel_Cascade_Comp_SP_27 execution failed");
 }
 
 CascadeCompSP27::CascadeCompSP27(std::shared_ptr<Parameter> para, int level)
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/Cascade/CascadeCompSP27_Device.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/Cascade/CascadeCompSP27_Device.cu
index 3f69fa47288343fbdd91e77dbb7f154501349098..af0a7c118191243c80c420856a70711a1fc17d2b 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/Cascade/CascadeCompSP27_Device.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/Cascade/CascadeCompSP27_Device.cu
@@ -37,63 +37,63 @@ __global__ void LB_Kernel_Cascade_Comp_SP_27(real omega,
 			Distributions27 D;
 			if (EvenOrOdd==true)
 			{
-				D.f[DIR_P00   ] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_M00   ] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0P0   ] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0M0   ] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00P   ] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00M   ] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_PP0  ] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_MM0  ] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_PM0  ] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_MP0  ] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_P0P  ] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_M0M  ] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_P0M  ] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_M0P  ] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0PP  ] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0MM  ] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0PM  ] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0MP  ] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_PPP ] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_MMP ] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_PMP ] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_MPP ] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_PPM ] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_MMM ] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_PMM ] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_MPM ] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_MPM * size_Mat];
 			}
 			else
 			{
-				D.f[DIR_M00   ] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_P00   ] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0M0   ] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0P0   ] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00M   ] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00P   ] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_MM0  ] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_PP0  ] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_MP0  ] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_PM0  ] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_M0M  ] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_P0P  ] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_M0P  ] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_P0M  ] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0MM  ] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0PP  ] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0MP  ] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0PM  ] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_MMM ] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_PPM ] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_MPM ] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_PMM ] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_MMP ] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_PPP ] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_MPP ] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_PMP ] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_MPM * size_Mat];
 			}
 
 			////////////////////////////////////////////////////////////////////////////////
@@ -126,33 +126,33 @@ __global__ void LB_Kernel_Cascade_Comp_SP_27(real omega,
 			//unsigned int ktne = k;
 			unsigned int kbsw = neighborZ[ksw];
 			//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-			real mfcbb = (D.f[DIR_P00   ])[k  ];//[ke   ];
-			real mfabb = (D.f[DIR_M00   ])[kw ];//[kw   ];
-			real mfbcb = (D.f[DIR_0P0   ])[k  ];//[kn   ];
-			real mfbab = (D.f[DIR_0M0   ])[ks ];//[ks   ];
-			real mfbbc = (D.f[DIR_00P   ])[k  ];//[kt   ];
-			real mfbba = (D.f[DIR_00M   ])[kb ];//[kb   ];
-			real mfccb = (D.f[DIR_PP0  ])[k  ];//[kne  ];
-			real mfaab = (D.f[DIR_MM0  ])[ksw];//[ksw  ];
-			real mfcab = (D.f[DIR_PM0  ])[ks ];//[kse  ];
-			real mfacb = (D.f[DIR_MP0  ])[kw ];//[knw  ];
-			real mfcbc = (D.f[DIR_P0P  ])[k  ];//[kte  ];
-			real mfaba = (D.f[DIR_M0M  ])[kbw];//[kbw  ];
-			real mfcba = (D.f[DIR_P0M  ])[kb ];//[kbe  ];
-			real mfabc = (D.f[DIR_M0P  ])[kw ];//[ktw  ];
-			real mfbcc = (D.f[DIR_0PP  ])[k  ];//[ktn  ];
-			real mfbaa = (D.f[DIR_0MM  ])[kbs];//[kbs  ];
-			real mfbca = (D.f[DIR_0PM  ])[kb ];//[kbn  ];
-			real mfbac = (D.f[DIR_0MP  ])[ks ];//[kts  ];
+			real mfcbb = (D.f[DIR_P00])[k  ];//[ke   ];
+			real mfabb = (D.f[DIR_M00])[kw ];//[kw   ];
+			real mfbcb = (D.f[DIR_0P0])[k  ];//[kn   ];
+			real mfbab = (D.f[DIR_0M0])[ks ];//[ks   ];
+			real mfbbc = (D.f[DIR_00P])[k  ];//[kt   ];
+			real mfbba = (D.f[DIR_00M])[kb ];//[kb   ];
+			real mfccb = (D.f[DIR_PP0])[k  ];//[kne  ];
+			real mfaab = (D.f[DIR_MM0])[ksw];//[ksw  ];
+			real mfcab = (D.f[DIR_PM0])[ks ];//[kse  ];
+			real mfacb = (D.f[DIR_MP0])[kw ];//[knw  ];
+			real mfcbc = (D.f[DIR_P0P])[k  ];//[kte  ];
+			real mfaba = (D.f[DIR_M0M])[kbw];//[kbw  ];
+			real mfcba = (D.f[DIR_P0M])[kb ];//[kbe  ];
+			real mfabc = (D.f[DIR_M0P])[kw ];//[ktw  ];
+			real mfbcc = (D.f[DIR_0PP])[k  ];//[ktn  ];
+			real mfbaa = (D.f[DIR_0MM])[kbs];//[kbs  ];
+			real mfbca = (D.f[DIR_0PM])[kb ];//[kbn  ];
+			real mfbac = (D.f[DIR_0MP])[ks ];//[kts  ];
 			real mfbbb = (D.f[DIR_000])[k  ];//[kzero];
-			real mfccc = (D.f[DIR_PPP ])[k  ];//[ktne ];
-			real mfaac = (D.f[DIR_MMP ])[ksw];//[ktsw ];
-			real mfcac = (D.f[DIR_PMP ])[ks ];//[ktse ];
-			real mfacc = (D.f[DIR_MPP ])[kw ];//[ktnw ];
-			real mfcca = (D.f[DIR_PPM ])[kb ];//[kbne ];
-			real mfaaa = (D.f[DIR_MMM ])[kbsw];//[kbsw ]
-			real mfcaa = (D.f[DIR_PMM ])[kbs];//[kbse ];
-			real mfaca = (D.f[DIR_MPM ])[kbw];//[kbnw ];
+			real mfccc = (D.f[DIR_PPP])[k  ];//[ktne ];
+			real mfaac = (D.f[DIR_MMP])[ksw];//[ktsw ];
+			real mfcac = (D.f[DIR_PMP])[ks ];//[ktse ];
+			real mfacc = (D.f[DIR_MPP])[kw ];//[ktnw ];
+			real mfcca = (D.f[DIR_PPM])[kb ];//[kbne ];
+			real mfaaa = (D.f[DIR_MMM])[kbsw];//[kbsw ]
+			real mfcaa = (D.f[DIR_PMM])[kbs];//[kbse ];
+			real mfaca = (D.f[DIR_MPM])[kbw];//[kbnw ];
 			////////////////////////////////////////////////////////////////////////////////////
 			real rho = (mfccc+mfaaa + mfaca+mfcac + mfacc+mfcaa + mfaac+mfcca + 
 						   mfbac+mfbca + mfbaa+mfbcc + mfabc+mfcba + mfaba+mfcbc + mfacb+mfcab + mfaab+mfccb +
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/Cumulant/CumulantCompSP27.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/Cumulant/CumulantCompSP27.cu
index 3f45c7ea71c385f948eac2e052a8d970010c413d..7817c398285dda131401bd14c3ccdd8c119c5680 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/Cumulant/CumulantCompSP27.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/Cumulant/CumulantCompSP27.cu
@@ -2,6 +2,7 @@
 
 #include "CumulantCompSP27_Device.cuh"
 #include "Parameter/Parameter.h"
+#include "cuda/CudaGrid.h"
 
 std::shared_ptr<CumulantCompSP27> CumulantCompSP27::getNewInstance(std::shared_ptr<Parameter> para, int level)
 {
@@ -10,33 +11,18 @@ std::shared_ptr<CumulantCompSP27> CumulantCompSP27::getNewInstance(std::shared_p
 
 void CumulantCompSP27::run()
 {
-	int size_Mat = para->getParD(level)->numberOfNodes;
-	int numberOfThreads = para->getParD(level)->numberofthreads;
-
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	LB_Kernel_Cum_Comp_SP_27 << < grid, threads >> >(	para->getParD(level)->omega,
-														para->getParD(level)->typeOfGridNode,
-														para->getParD(level)->neighborX,
-														para->getParD(level)->neighborY,
-														para->getParD(level)->neighborZ,
-														para->getParD(level)->distributions.f[0],
-														para->getParD(level)->numberOfNodes,
-														para->getParD(level)->isEvenTimestep);
-	getLastCudaError("LB_Kernel_Kum_Comp_SP_27 execution failed");
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(para->getParD(level)->numberofthreads, para->getParD(level)->numberOfNodes);
+
+    LB_Kernel_Cum_Comp_SP_27 <<< grid.grid, grid.threads >>>(
+        para->getParD(level)->omega,
+        para->getParD(level)->typeOfGridNode,
+        para->getParD(level)->neighborX,
+        para->getParD(level)->neighborY,
+        para->getParD(level)->neighborZ,
+        para->getParD(level)->distributions.f[0],
+        para->getParD(level)->numberOfNodes,
+        para->getParD(level)->isEvenTimestep);
+    getLastCudaError("LB_Kernel_Cum_Comp_SP_27 execution failed");
 }
 
 
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/Cumulant/CumulantCompSP27_Device.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/Cumulant/CumulantCompSP27_Device.cu
index ad2ffdf4170d98125e6758c0e2f548122093cea6..1dfab5846795e61509cdba28478fe6ce623983b5 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/Cumulant/CumulantCompSP27_Device.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/Cumulant/CumulantCompSP27_Device.cu
@@ -37,63 +37,63 @@ __global__ void LB_Kernel_Cum_Comp_SP_27(real omega,
 			Distributions27 D;
 			if (EvenOrOdd == true)
 			{
-				D.f[DIR_P00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_M00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_MPM * size_Mat];
 			}
 			else
 			{
-				D.f[DIR_M00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_P00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_MPM * size_Mat];
 			}
 
 			////////////////////////////////////////////////////////////////////////////////
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantAll4/CumulantAll4CompSP27.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantAll4/CumulantAll4CompSP27.cu
index 9a84df86e41b3fdff75c2ebf580813afc5ee3feb..1518dcc209de1edf8a88dae72c1f10c3d4666610 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantAll4/CumulantAll4CompSP27.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantAll4/CumulantAll4CompSP27.cu
@@ -1,8 +1,8 @@
 #include "CumulantAll4CompSP27.h"
 
 #include "CumulantAll4CompSP27_Device.cuh"
-
 #include "Parameter/Parameter.h"
+#include "cuda/CudaGrid.h"
 
 std::shared_ptr<CumulantAll4CompSP27> CumulantAll4CompSP27::getNewInstance(std::shared_ptr<Parameter> para, int level)
 {
@@ -11,36 +11,21 @@ std::shared_ptr<CumulantAll4CompSP27> CumulantAll4CompSP27::getNewInstance(std::
 
 void CumulantAll4CompSP27::run()
 {
-	int numberOfThreads = para->getParD(level)->numberofthreads;
-	int size_Mat = para->getParD(level)->numberOfNodes;
-
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	LB_Kernel_Cumulant_D3Q27All4 << < grid, threads >> >(	para->getParD(level)->omega,
-															para->getParD(level)->typeOfGridNode,
-															para->getParD(level)->neighborX,
-															para->getParD(level)->neighborY,
-															para->getParD(level)->neighborZ,
-															para->getParD(level)->distributions.f[0],
-															size_Mat,
-															level,
-															para->getForcesDev(),
-                                                            para->getQuadricLimitersDev(),
-															para->getParD(level)->isEvenTimestep);
-	getLastCudaError("LB_Kernel_Cumulant_D3Q27All4 execution failed");
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(para->getParD(level)->numberofthreads, para->getParD(level)->numberOfNodes);
+
+    LB_Kernel_Cumulant_D3Q27All4 <<< grid.grid, grid.threads >>>(
+        para->getParD(level)->omega,
+        para->getParD(level)->typeOfGridNode,
+        para->getParD(level)->neighborX,
+        para->getParD(level)->neighborY,
+        para->getParD(level)->neighborZ,
+        para->getParD(level)->distributions.f[0],
+        para->getParD(level)->numberOfNodes,
+        level,
+        para->getForcesDev(),
+        para->getQuadricLimitersDev(),
+        para->getParD(level)->isEvenTimestep);
+    getLastCudaError("LB_Kernel_Cumulant_D3Q27All4 execution failed");
 }
 
 CumulantAll4CompSP27::CumulantAll4CompSP27(std::shared_ptr<Parameter> para, int level)
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantAll4/CumulantAll4CompSP27_Device.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantAll4/CumulantAll4CompSP27_Device.cu
index 681dbff2ba37a1e0de56341b39cc2dec791f656b..3593b41c4c62c8a8b19719e22e9d65d6b5fd987d 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantAll4/CumulantAll4CompSP27_Device.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantAll4/CumulantAll4CompSP27_Device.cu
@@ -42,63 +42,63 @@ __global__ void LB_Kernel_Cumulant_D3Q27All4(	real omega,
 			Distributions27 D;
 			if (EvenOrOdd == true)
 			{
-				D.f[DIR_P00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_M00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_MPM * size_Mat];
 			}
 			else
 			{
-				D.f[DIR_M00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_P00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_MPM * size_Mat];
 			}
 
 			////////////////////////////////////////////////////////////////////////////////
@@ -160,33 +160,33 @@ __global__ void LB_Kernel_Cumulant_D3Q27All4(	real omega,
 			//unsigned int kbsw = neighborZ[ksw];
 			//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 			//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-			real mfcbb = (D.f[DIR_P00])[k];//[ke   ];// +  c2over27 ;(D.f[DIR_P00   ])[k  ];//ke
-			real mfabb = (D.f[DIR_M00])[kw];//[kw   ];// +  c2over27 ;(D.f[DIR_M00   ])[kw ];
-			real mfbcb = (D.f[DIR_0P0])[k];//[kn   ];// +  c2over27 ;(D.f[DIR_0P0   ])[k  ];//kn
-			real mfbab = (D.f[DIR_0M0])[ks];//[ks   ];// +  c2over27 ;(D.f[DIR_0M0   ])[ks ];
-			real mfbbc = (D.f[DIR_00P])[k];//[kt   ];// +  c2over27 ;(D.f[DIR_00P   ])[k  ];//kt
-			real mfbba = (D.f[DIR_00M])[kb];//[kb   ];// +  c2over27 ;(D.f[DIR_00M   ])[kb ];
-			real mfccb = (D.f[DIR_PP0])[k];//[kne  ];// +  c1over54 ;(D.f[DIR_PP0  ])[k  ];//kne
-			real mfaab = (D.f[DIR_MM0])[ksw];//[ksw  ];// +  c1over54 ;(D.f[DIR_MM0  ])[ksw];
-			real mfcab = (D.f[DIR_PM0])[ks];//[kse  ];// +  c1over54 ;(D.f[DIR_PM0  ])[ks ];//kse
-			real mfacb = (D.f[DIR_MP0])[kw];//[knw  ];// +  c1over54 ;(D.f[DIR_MP0  ])[kw ];//knw
-			real mfcbc = (D.f[DIR_P0P])[k];//[kte  ];// +  c1over54 ;(D.f[DIR_P0P  ])[k  ];//kte
-			real mfaba = (D.f[DIR_M0M])[kbw];//[kbw  ];// +  c1over54 ;(D.f[DIR_M0M  ])[kbw];
-			real mfcba = (D.f[DIR_P0M])[kb];//[kbe  ];// +  c1over54 ;(D.f[DIR_P0M  ])[kb ];//kbe
-			real mfabc = (D.f[DIR_M0P])[kw];//[ktw  ];// +  c1over54 ;(D.f[DIR_M0P  ])[kw ];//ktw
-			real mfbcc = (D.f[DIR_0PP])[k];//[ktn  ];// +  c1over54 ;(D.f[DIR_0PP  ])[k  ];//ktn
-			real mfbaa = (D.f[DIR_0MM])[kbs];//[kbs  ];// +  c1over54 ;(D.f[DIR_0MM  ])[kbs];
-			real mfbca = (D.f[DIR_0PM])[kb];//[kbn  ];// +  c1over54 ;(D.f[DIR_0PM  ])[kb ];//kbn
-			real mfbac = (D.f[DIR_0MP])[ks];//[kts  ];// +  c1over54 ;(D.f[DIR_0MP  ])[ks ];//kts
+			real mfcbb = (D.f[DIR_P00])[k];//[ke   ];// +  c2over27 ;(D.f[DIR_P00])[k  ];//ke
+			real mfabb = (D.f[DIR_M00])[kw];//[kw   ];// +  c2over27 ;(D.f[DIR_M00])[kw ];
+			real mfbcb = (D.f[DIR_0P0])[k];//[kn   ];// +  c2over27 ;(D.f[DIR_0P0])[k  ];//kn
+			real mfbab = (D.f[DIR_0M0])[ks];//[ks   ];// +  c2over27 ;(D.f[DIR_0M0])[ks ];
+			real mfbbc = (D.f[DIR_00P])[k];//[kt   ];// +  c2over27 ;(D.f[DIR_00P])[k  ];//kt
+			real mfbba = (D.f[DIR_00M])[kb];//[kb   ];// +  c2over27 ;(D.f[DIR_00M])[kb ];
+			real mfccb = (D.f[DIR_PP0])[k];//[kne  ];// +  c1over54 ;(D.f[DIR_PP0])[k  ];//kne
+			real mfaab = (D.f[DIR_MM0])[ksw];//[ksw  ];// +  c1over54 ;(D.f[DIR_MM0])[ksw];
+			real mfcab = (D.f[DIR_PM0])[ks];//[kse  ];// +  c1over54 ;(D.f[DIR_PM0])[ks ];//kse
+			real mfacb = (D.f[DIR_MP0])[kw];//[knw  ];// +  c1over54 ;(D.f[DIR_MP0])[kw ];//knw
+			real mfcbc = (D.f[DIR_P0P])[k];//[kte  ];// +  c1over54 ;(D.f[DIR_P0P])[k  ];//kte
+			real mfaba = (D.f[DIR_M0M])[kbw];//[kbw  ];// +  c1over54 ;(D.f[DIR_M0M])[kbw];
+			real mfcba = (D.f[DIR_P0M])[kb];//[kbe  ];// +  c1over54 ;(D.f[DIR_P0M])[kb ];//kbe
+			real mfabc = (D.f[DIR_M0P])[kw];//[ktw  ];// +  c1over54 ;(D.f[DIR_M0P])[kw ];//ktw
+			real mfbcc = (D.f[DIR_0PP])[k];//[ktn  ];// +  c1over54 ;(D.f[DIR_0PP])[k  ];//ktn
+			real mfbaa = (D.f[DIR_0MM])[kbs];//[kbs  ];// +  c1over54 ;(D.f[DIR_0MM])[kbs];
+			real mfbca = (D.f[DIR_0PM])[kb];//[kbn  ];// +  c1over54 ;(D.f[DIR_0PM])[kb ];//kbn
+			real mfbac = (D.f[DIR_0MP])[ks];//[kts  ];// +  c1over54 ;(D.f[DIR_0MP])[ks ];//kts
 			real mfbbb = (D.f[DIR_000])[k];//[kzero];// +  c8over27 ;(D.f[DIR_000])[k  ];//kzero
-			real mfccc = (D.f[DIR_PPP])[k];//[ktne ];// +  c1over216;(D.f[DIR_PPP ])[k  ];//ktne
-			real mfaac = (D.f[DIR_MMP])[ksw];//[ktsw ];// +  c1over216;(D.f[DIR_MMP ])[ksw];//ktsw
-			real mfcac = (D.f[DIR_PMP])[ks];//[ktse ];// +  c1over216;(D.f[DIR_PMP ])[ks ];//ktse
-			real mfacc = (D.f[DIR_MPP])[kw];//[ktnw ];// +  c1over216;(D.f[DIR_MPP ])[kw ];//ktnw
-			real mfcca = (D.f[DIR_PPM])[kb];//[kbne ];// +  c1over216;(D.f[DIR_PPM ])[kb ];//kbne
-			real mfaaa = (D.f[DIR_MMM])[kbsw];//[kbsw ];// +  c1over216;(D.f[DIR_MMM ])[kbsw];
-			real mfcaa = (D.f[DIR_PMM])[kbs];//[kbse ];// +  c1over216;(D.f[DIR_PMM ])[kbs];//kbse
-			real mfaca = (D.f[DIR_MPM])[kbw];//[kbnw ];// +  c1over216;(D.f[DIR_MPM ])[kbw];//kbnw
+			real mfccc = (D.f[DIR_PPP])[k];//[ktne ];// +  c1over216;(D.f[DIR_PPP])[k  ];//ktne
+			real mfaac = (D.f[DIR_MMP])[ksw];//[ktsw ];// +  c1over216;(D.f[DIR_MMP])[ksw];//ktsw
+			real mfcac = (D.f[DIR_PMP])[ks];//[ktse ];// +  c1over216;(D.f[DIR_PMP])[ks ];//ktse
+			real mfacc = (D.f[DIR_MPP])[kw];//[ktnw ];// +  c1over216;(D.f[DIR_MPP])[kw ];//ktnw
+			real mfcca = (D.f[DIR_PPM])[kb];//[kbne ];// +  c1over216;(D.f[DIR_PPM])[kb ];//kbne
+			real mfaaa = (D.f[DIR_MMM])[kbsw];//[kbsw ];// +  c1over216;(D.f[DIR_MMM])[kbsw];
+			real mfcaa = (D.f[DIR_PMM])[kbs];//[kbse ];// +  c1over216;(D.f[DIR_PMM])[kbs];//kbse
+			real mfaca = (D.f[DIR_MPM])[kbw];//[kbnw ];// +  c1over216;(D.f[DIR_MPM])[kbw];//kbnw
 											////////////////////////////////////////////////////////////////////////////////////
 			real drho = ((((mfccc + mfaaa) + (mfaca + mfcac)) + ((mfacc + mfcaa) + (mfaac + mfcca))) +
 				(((mfbac + mfbca) + (mfbaa + mfbcc)) + ((mfabc + mfcba) + (mfaba + mfcbc)) + ((mfacb + mfcab) + (mfaab + mfccb))) +
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK15/CumulantK15Comp.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK15/CumulantK15Comp.cu
index 1b6ba1a2278b68f085a4b7df699b7ca230811f39..5a480e5d9c97126e491655b4bbe2aeefef3e7161 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK15/CumulantK15Comp.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK15/CumulantK15Comp.cu
@@ -12,7 +12,7 @@ std::shared_ptr<CumulantK15Comp> CumulantK15Comp::getNewInstance(std::shared_ptr
 void CumulantK15Comp::run()
 {
 	int numberOfThreads = para->getParD(level)->numberofthreads;
-	int size_Mat = para->getParD(level)->numberOfNodes;
+	int size_Mat = (int)para->getParD(level)->numberOfNodes;
 
 	int Grid = (size_Mat / numberOfThreads) + 1;
 	int Grid1, Grid2;
@@ -29,16 +29,17 @@ void CumulantK15Comp::run()
 	dim3 grid(Grid1, Grid2, 1);
 	dim3 threads(numberOfThreads, 1, 1);
 
-	LB_Kernel_CumulantK15Comp <<< grid, threads >>>(para->getParD(level)->omega,
-													para->getParD(level)->typeOfGridNode,
-													para->getParD(level)->neighborX,
-													para->getParD(level)->neighborY,
-													para->getParD(level)->neighborZ,
-													para->getParD(level)->distributions.f[0],
-													size_Mat,
-													level,
-													para->getForcesDev(),
-													para->getParD(level)->isEvenTimestep);
+	LB_Kernel_CumulantK15Comp <<< grid, threads >>>(
+		para->getParD(level)->omega,
+		para->getParD(level)->typeOfGridNode,
+		para->getParD(level)->neighborX,
+		para->getParD(level)->neighborY,
+		para->getParD(level)->neighborZ,
+		para->getParD(level)->distributions.f[0],
+		para->getParD(level)->numberOfNodes,
+		level,
+		para->getForcesDev(),
+		para->getParD(level)->isEvenTimestep);
 	getLastCudaError("LB_Kernel_CumulantK15Comp execution failed");
 }
 
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK15/CumulantK15Comp_Device.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK15/CumulantK15Comp_Device.cu
index 93d57d6c9871d66537f25b9188467d46e3b3d05c..f7fb1f0a6441cfc6f38ad9684fd5bc8dd1be7135 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK15/CumulantK15Comp_Device.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK15/CumulantK15Comp_Device.cu
@@ -39,63 +39,63 @@ __global__ void LB_Kernel_CumulantK15Comp(real omega,
 			Distributions27 D;
 			if (EvenOrOdd == true)
 			{
-				D.f[DIR_P00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_M00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_MPM * size_Mat];
 			}
 			else
 			{
-				D.f[DIR_M00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_P00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_MPM * size_Mat];
 			}
 
 			////////////////////////////////////////////////////////////////////////////////
@@ -156,33 +156,33 @@ __global__ void LB_Kernel_CumulantK15Comp(real omega,
 			//unsigned int ktne = k;
 			//unsigned int kbsw = neighborZ[ksw];
 			//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-			real mfcbb = (D.f[DIR_P00])[k];//[ke   ];// +  c2over27 ;(D.f[DIR_P00   ])[k  ];//ke
-			real mfabb = (D.f[DIR_M00])[kw];//[kw   ];// +  c2over27 ;(D.f[DIR_M00   ])[kw ];
-			real mfbcb = (D.f[DIR_0P0])[k];//[kn   ];// +  c2over27 ;(D.f[DIR_0P0   ])[k  ];//kn
-			real mfbab = (D.f[DIR_0M0])[ks];//[ks   ];// +  c2over27 ;(D.f[DIR_0M0   ])[ks ];
-			real mfbbc = (D.f[DIR_00P])[k];//[kt   ];// +  c2over27 ;(D.f[DIR_00P   ])[k  ];//kt
-			real mfbba = (D.f[DIR_00M])[kb];//[kb   ];// +  c2over27 ;(D.f[DIR_00M   ])[kb ];
-			real mfccb = (D.f[DIR_PP0])[k];//[kne  ];// +  c1over54 ;(D.f[DIR_PP0  ])[k  ];//kne
-			real mfaab = (D.f[DIR_MM0])[ksw];//[ksw  ];// +  c1over54 ;(D.f[DIR_MM0  ])[ksw];
-			real mfcab = (D.f[DIR_PM0])[ks];//[kse  ];// +  c1over54 ;(D.f[DIR_PM0  ])[ks ];//kse
-			real mfacb = (D.f[DIR_MP0])[kw];//[knw  ];// +  c1over54 ;(D.f[DIR_MP0  ])[kw ];//knw
-			real mfcbc = (D.f[DIR_P0P])[k];//[kte  ];// +  c1over54 ;(D.f[DIR_P0P  ])[k  ];//kte
-			real mfaba = (D.f[DIR_M0M])[kbw];//[kbw  ];// +  c1over54 ;(D.f[DIR_M0M  ])[kbw];
-			real mfcba = (D.f[DIR_P0M])[kb];//[kbe  ];// +  c1over54 ;(D.f[DIR_P0M  ])[kb ];//kbe
-			real mfabc = (D.f[DIR_M0P])[kw];//[ktw  ];// +  c1over54 ;(D.f[DIR_M0P  ])[kw ];//ktw
-			real mfbcc = (D.f[DIR_0PP])[k];//[ktn  ];// +  c1over54 ;(D.f[DIR_0PP  ])[k  ];//ktn
-			real mfbaa = (D.f[DIR_0MM])[kbs];//[kbs  ];// +  c1over54 ;(D.f[DIR_0MM  ])[kbs];
-			real mfbca = (D.f[DIR_0PM])[kb];//[kbn  ];// +  c1over54 ;(D.f[DIR_0PM  ])[kb ];//kbn
-			real mfbac = (D.f[DIR_0MP])[ks];//[kts  ];// +  c1over54 ;(D.f[DIR_0MP  ])[ks ];//kts
+			real mfcbb = (D.f[DIR_P00])[k];//[ke   ];// +  c2over27 ;(D.f[DIR_P00])[k  ];//ke
+			real mfabb = (D.f[DIR_M00])[kw];//[kw   ];// +  c2over27 ;(D.f[DIR_M00])[kw ];
+			real mfbcb = (D.f[DIR_0P0])[k];//[kn   ];// +  c2over27 ;(D.f[DIR_0P0])[k  ];//kn
+			real mfbab = (D.f[DIR_0M0])[ks];//[ks   ];// +  c2over27 ;(D.f[DIR_0M0])[ks ];
+			real mfbbc = (D.f[DIR_00P])[k];//[kt   ];// +  c2over27 ;(D.f[DIR_00P])[k  ];//kt
+			real mfbba = (D.f[DIR_00M])[kb];//[kb   ];// +  c2over27 ;(D.f[DIR_00M])[kb ];
+			real mfccb = (D.f[DIR_PP0])[k];//[kne  ];// +  c1over54 ;(D.f[DIR_PP0])[k  ];//kne
+			real mfaab = (D.f[DIR_MM0])[ksw];//[ksw  ];// +  c1over54 ;(D.f[DIR_MM0])[ksw];
+			real mfcab = (D.f[DIR_PM0])[ks];//[kse  ];// +  c1over54 ;(D.f[DIR_PM0])[ks ];//kse
+			real mfacb = (D.f[DIR_MP0])[kw];//[knw  ];// +  c1over54 ;(D.f[DIR_MP0])[kw ];//knw
+			real mfcbc = (D.f[DIR_P0P])[k];//[kte  ];// +  c1over54 ;(D.f[DIR_P0P])[k  ];//kte
+			real mfaba = (D.f[DIR_M0M])[kbw];//[kbw  ];// +  c1over54 ;(D.f[DIR_M0M])[kbw];
+			real mfcba = (D.f[DIR_P0M])[kb];//[kbe  ];// +  c1over54 ;(D.f[DIR_P0M])[kb ];//kbe
+			real mfabc = (D.f[DIR_M0P])[kw];//[ktw  ];// +  c1over54 ;(D.f[DIR_M0P])[kw ];//ktw
+			real mfbcc = (D.f[DIR_0PP])[k];//[ktn  ];// +  c1over54 ;(D.f[DIR_0PP])[k  ];//ktn
+			real mfbaa = (D.f[DIR_0MM])[kbs];//[kbs  ];// +  c1over54 ;(D.f[DIR_0MM])[kbs];
+			real mfbca = (D.f[DIR_0PM])[kb];//[kbn  ];// +  c1over54 ;(D.f[DIR_0PM])[kb ];//kbn
+			real mfbac = (D.f[DIR_0MP])[ks];//[kts  ];// +  c1over54 ;(D.f[DIR_0MP])[ks ];//kts
 			real mfbbb = (D.f[DIR_000])[k];//[kzero];// +  c8over27 ;(D.f[DIR_000])[k  ];//kzero
-			real mfccc = (D.f[DIR_PPP])[k];//[ktne ];// +  c1over216;(D.f[DIR_PPP ])[k  ];//ktne
-			real mfaac = (D.f[DIR_MMP])[ksw];//[ktsw ];// +  c1over216;(D.f[DIR_MMP ])[ksw];//ktsw
-			real mfcac = (D.f[DIR_PMP])[ks];//[ktse ];// +  c1over216;(D.f[DIR_PMP ])[ks ];//ktse
-			real mfacc = (D.f[DIR_MPP])[kw];//[ktnw ];// +  c1over216;(D.f[DIR_MPP ])[kw ];//ktnw
-			real mfcca = (D.f[DIR_PPM])[kb];//[kbne ];// +  c1over216;(D.f[DIR_PPM ])[kb ];//kbne
-			real mfaaa = (D.f[DIR_MMM])[kbsw];//[kbsw ];// +  c1over216;(D.f[DIR_MMM ])[kbsw];
-			real mfcaa = (D.f[DIR_PMM])[kbs];//[kbse ];// +  c1over216;(D.f[DIR_PMM ])[kbs];//kbse
-			real mfaca = (D.f[DIR_MPM])[kbw];//[kbnw ];// +  c1over216;(D.f[DIR_MPM ])[kbw];//kbnw
+			real mfccc = (D.f[DIR_PPP])[k];//[ktne ];// +  c1over216;(D.f[DIR_PPP])[k  ];//ktne
+			real mfaac = (D.f[DIR_MMP])[ksw];//[ktsw ];// +  c1over216;(D.f[DIR_MMP])[ksw];//ktsw
+			real mfcac = (D.f[DIR_PMP])[ks];//[ktse ];// +  c1over216;(D.f[DIR_PMP])[ks ];//ktse
+			real mfacc = (D.f[DIR_MPP])[kw];//[ktnw ];// +  c1over216;(D.f[DIR_MPP])[kw ];//ktnw
+			real mfcca = (D.f[DIR_PPM])[kb];//[kbne ];// +  c1over216;(D.f[DIR_PPM])[kb ];//kbne
+			real mfaaa = (D.f[DIR_MMM])[kbsw];//[kbsw ];// +  c1over216;(D.f[DIR_MMM])[kbsw];
+			real mfcaa = (D.f[DIR_PMM])[kbs];//[kbse ];// +  c1over216;(D.f[DIR_PMM])[kbs];//kbse
+			real mfaca = (D.f[DIR_MPM])[kbw];//[kbnw ];// +  c1over216;(D.f[DIR_MPM])[kbw];//kbnw
 											////////////////////////////////////////////////////////////////////////////////////
 			real drho = ((((mfccc + mfaaa) + (mfaca + mfcac)) + ((mfacc + mfcaa) + (mfaac + mfcca))) +
 				(((mfbac + mfbca) + (mfbaa + mfbcc)) + ((mfabc + mfcba) + (mfaba + mfcbc)) + ((mfacb + mfcab) + (mfaab + mfccb))) +
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK15Bulk/CumulantK15BulkComp.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK15Bulk/CumulantK15BulkComp.cu
index 188984d001f89d72c967dd6390ca10ae5d2eab32..51876f30b8c8e37d8cb3355edde5dcf2b04675d0 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK15Bulk/CumulantK15BulkComp.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK15Bulk/CumulantK15BulkComp.cu
@@ -2,6 +2,7 @@
 
 #include "CumulantK15BulkComp_Device.cuh"
 #include "Parameter/Parameter.h"
+#include "cuda/CudaGrid.h"
 
 std::shared_ptr<CumulantK15BulkComp> CumulantK15BulkComp::getNewInstance(std::shared_ptr<Parameter> para, int level)
 {
@@ -10,35 +11,20 @@ std::shared_ptr<CumulantK15BulkComp> CumulantK15BulkComp::getNewInstance(std::sh
 
 void CumulantK15BulkComp::run()
 {
-	int size_Mat = para->getParD(level)->numberOfNodes;
-	int numberOfThreads = para->getParD(level)->numberofthreads;
-
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	LB_Kernel_CumulantK15BulkComp <<< grid, threads >>>(para->getParD(level)->omega,
-														para->getParD(level)->typeOfGridNode,
-														para->getParD(level)->neighborX,
-														para->getParD(level)->neighborY,
-														para->getParD(level)->neighborZ,
-														para->getParD(level)->distributions.f[0],
-														para->getParD(level)->numberOfNodes,
-														level,
-														para->getForcesDev(),
-														para->getParD(level)->isEvenTimestep);
-	getLastCudaError("LB_Kernel_CumulantK15BulkComp execution failed");
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(para->getParD(level)->numberofthreads, para->getParD(level)->numberOfNodes); // launch configuration built from this level's thread count and node count
+
+    LB_Kernel_CumulantK15BulkComp <<< grid.grid, grid.threads >>>( // launch with the grid/threads members of the CudaGrid built above
+        para->getParD(level)->omega,
+        para->getParD(level)->typeOfGridNode,
+        para->getParD(level)->neighborX,
+        para->getParD(level)->neighborY,
+        para->getParD(level)->neighborZ,
+        para->getParD(level)->distributions.f[0], // presumably the base pointer the kernel indexes as DDStart[DIR_* * size_Mat] - confirm against the kernel signature
+        para->getParD(level)->numberOfNodes, // same node count that sized the launch grid
+        level,
+        para->getForcesDev(),
+        para->getParD(level)->isEvenTimestep); // presumably the kernel's EvenOrOdd flag that selects the direct or swapped direction mapping - confirm
+    getLastCudaError("LB_Kernel_CumulantK15BulkComp execution failed"); // post-launch error check; helper is defined outside this file
 }
 
 CumulantK15BulkComp::CumulantK15BulkComp(std::shared_ptr<Parameter> para, int level)
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK15Bulk/CumulantK15BulkComp_Device.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK15Bulk/CumulantK15BulkComp_Device.cu
index d2a2f61df902cfd7c5ef52b09f8e7738a108615e..085775d324bf65d783afdd745c06429d697c3788 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK15Bulk/CumulantK15BulkComp_Device.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK15Bulk/CumulantK15BulkComp_Device.cu
@@ -39,63 +39,63 @@ __global__ void LB_Kernel_CumulantK15BulkComp(real omega,
 			Distributions27 D;
 			if (EvenOrOdd == true)
 			{
-				D.f[DIR_P00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_M00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_MPM * size_Mat];
 			}
 			else
 			{
-				D.f[DIR_M00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_P00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_MPM * size_Mat];
 			}
 
 			////////////////////////////////////////////////////////////////////////////////
@@ -156,33 +156,33 @@ __global__ void LB_Kernel_CumulantK15BulkComp(real omega,
 			//unsigned int ktne = k;
 			//unsigned int kbsw = neighborZ[ksw];
 			//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-			real mfcbb = (D.f[DIR_P00])[k];//[ke   ];// +  c2over27 ;(D.f[DIR_P00   ])[k  ];//ke
-			real mfabb = (D.f[DIR_M00])[kw];//[kw   ];// +  c2over27 ;(D.f[DIR_M00   ])[kw ];
-			real mfbcb = (D.f[DIR_0P0])[k];//[kn   ];// +  c2over27 ;(D.f[DIR_0P0   ])[k  ];//kn
-			real mfbab = (D.f[DIR_0M0])[ks];//[ks   ];// +  c2over27 ;(D.f[DIR_0M0   ])[ks ];
-			real mfbbc = (D.f[DIR_00P])[k];//[kt   ];// +  c2over27 ;(D.f[DIR_00P   ])[k  ];//kt
-			real mfbba = (D.f[DIR_00M])[kb];//[kb   ];// +  c2over27 ;(D.f[DIR_00M   ])[kb ];
-			real mfccb = (D.f[DIR_PP0])[k];//[kne  ];// +  c1over54 ;(D.f[DIR_PP0  ])[k  ];//kne
-			real mfaab = (D.f[DIR_MM0])[ksw];//[ksw  ];// +  c1over54 ;(D.f[DIR_MM0  ])[ksw];
-			real mfcab = (D.f[DIR_PM0])[ks];//[kse  ];// +  c1over54 ;(D.f[DIR_PM0  ])[ks ];//kse
-			real mfacb = (D.f[DIR_MP0])[kw];//[knw  ];// +  c1over54 ;(D.f[DIR_MP0  ])[kw ];//knw
-			real mfcbc = (D.f[DIR_P0P])[k];//[kte  ];// +  c1over54 ;(D.f[DIR_P0P  ])[k  ];//kte
-			real mfaba = (D.f[DIR_M0M])[kbw];//[kbw  ];// +  c1over54 ;(D.f[DIR_M0M  ])[kbw];
-			real mfcba = (D.f[DIR_P0M])[kb];//[kbe  ];// +  c1over54 ;(D.f[DIR_P0M  ])[kb ];//kbe
-			real mfabc = (D.f[DIR_M0P])[kw];//[ktw  ];// +  c1over54 ;(D.f[DIR_M0P  ])[kw ];//ktw
-			real mfbcc = (D.f[DIR_0PP])[k];//[ktn  ];// +  c1over54 ;(D.f[DIR_0PP  ])[k  ];//ktn
-			real mfbaa = (D.f[DIR_0MM])[kbs];//[kbs  ];// +  c1over54 ;(D.f[DIR_0MM  ])[kbs];
-			real mfbca = (D.f[DIR_0PM])[kb];//[kbn  ];// +  c1over54 ;(D.f[DIR_0PM  ])[kb ];//kbn
-			real mfbac = (D.f[DIR_0MP])[ks];//[kts  ];// +  c1over54 ;(D.f[DIR_0MP  ])[ks ];//kts
+			real mfcbb = (D.f[DIR_P00])[k];//[ke   ];// +  c2over27 ;(D.f[DIR_P00])[k  ];//ke
+			real mfabb = (D.f[DIR_M00])[kw];//[kw   ];// +  c2over27 ;(D.f[DIR_M00])[kw ];
+			real mfbcb = (D.f[DIR_0P0])[k];//[kn   ];// +  c2over27 ;(D.f[DIR_0P0])[k  ];//kn
+			real mfbab = (D.f[DIR_0M0])[ks];//[ks   ];// +  c2over27 ;(D.f[DIR_0M0])[ks ];
+			real mfbbc = (D.f[DIR_00P])[k];//[kt   ];// +  c2over27 ;(D.f[DIR_00P])[k  ];//kt
+			real mfbba = (D.f[DIR_00M])[kb];//[kb   ];// +  c2over27 ;(D.f[DIR_00M])[kb ];
+			real mfccb = (D.f[DIR_PP0])[k];//[kne  ];// +  c1over54 ;(D.f[DIR_PP0])[k  ];//kne
+			real mfaab = (D.f[DIR_MM0])[ksw];//[ksw  ];// +  c1over54 ;(D.f[DIR_MM0])[ksw];
+			real mfcab = (D.f[DIR_PM0])[ks];//[kse  ];// +  c1over54 ;(D.f[DIR_PM0])[ks ];//kse
+			real mfacb = (D.f[DIR_MP0])[kw];//[knw  ];// +  c1over54 ;(D.f[DIR_MP0])[kw ];//knw
+			real mfcbc = (D.f[DIR_P0P])[k];//[kte  ];// +  c1over54 ;(D.f[DIR_P0P])[k  ];//kte
+			real mfaba = (D.f[DIR_M0M])[kbw];//[kbw  ];// +  c1over54 ;(D.f[DIR_M0M])[kbw];
+			real mfcba = (D.f[DIR_P0M])[kb];//[kbe  ];// +  c1over54 ;(D.f[DIR_P0M])[kb ];//kbe
+			real mfabc = (D.f[DIR_M0P])[kw];//[ktw  ];// +  c1over54 ;(D.f[DIR_M0P])[kw ];//ktw
+			real mfbcc = (D.f[DIR_0PP])[k];//[ktn  ];// +  c1over54 ;(D.f[DIR_0PP])[k  ];//ktn
+			real mfbaa = (D.f[DIR_0MM])[kbs];//[kbs  ];// +  c1over54 ;(D.f[DIR_0MM])[kbs];
+			real mfbca = (D.f[DIR_0PM])[kb];//[kbn  ];// +  c1over54 ;(D.f[DIR_0PM])[kb ];//kbn
+			real mfbac = (D.f[DIR_0MP])[ks];//[kts  ];// +  c1over54 ;(D.f[DIR_0MP])[ks ];//kts
 			real mfbbb = (D.f[DIR_000])[k];//[kzero];// +  c8over27 ;(D.f[DIR_000])[k  ];//kzero
-			real mfccc = (D.f[DIR_PPP])[k];//[ktne ];// +  c1over216;(D.f[DIR_PPP ])[k  ];//ktne
-			real mfaac = (D.f[DIR_MMP])[ksw];//[ktsw ];// +  c1over216;(D.f[DIR_MMP ])[ksw];//ktsw
-			real mfcac = (D.f[DIR_PMP])[ks];//[ktse ];// +  c1over216;(D.f[DIR_PMP ])[ks ];//ktse
-			real mfacc = (D.f[DIR_MPP])[kw];//[ktnw ];// +  c1over216;(D.f[DIR_MPP ])[kw ];//ktnw
-			real mfcca = (D.f[DIR_PPM])[kb];//[kbne ];// +  c1over216;(D.f[DIR_PPM ])[kb ];//kbne
-			real mfaaa = (D.f[DIR_MMM])[kbsw];//[kbsw ];// +  c1over216;(D.f[DIR_MMM ])[kbsw];
-			real mfcaa = (D.f[DIR_PMM])[kbs];//[kbse ];// +  c1over216;(D.f[DIR_PMM ])[kbs];//kbse
-			real mfaca = (D.f[DIR_MPM])[kbw];//[kbnw ];// +  c1over216;(D.f[DIR_MPM ])[kbw];//kbnw
+			real mfccc = (D.f[DIR_PPP])[k];//[ktne ];// +  c1over216;(D.f[DIR_PPP])[k  ];//ktne
+			real mfaac = (D.f[DIR_MMP])[ksw];//[ktsw ];// +  c1over216;(D.f[DIR_MMP])[ksw];//ktsw
+			real mfcac = (D.f[DIR_PMP])[ks];//[ktse ];// +  c1over216;(D.f[DIR_PMP])[ks ];//ktse
+			real mfacc = (D.f[DIR_MPP])[kw];//[ktnw ];// +  c1over216;(D.f[DIR_MPP])[kw ];//ktnw
+			real mfcca = (D.f[DIR_PPM])[kb];//[kbne ];// +  c1over216;(D.f[DIR_PPM])[kb ];//kbne
+			real mfaaa = (D.f[DIR_MMM])[kbsw];//[kbsw ];// +  c1over216;(D.f[DIR_MMM])[kbsw];
+			real mfcaa = (D.f[DIR_PMM])[kbs];//[kbse ];// +  c1over216;(D.f[DIR_PMM])[kbs];//kbse
+			real mfaca = (D.f[DIR_MPM])[kbw];//[kbnw ];// +  c1over216;(D.f[DIR_MPM])[kbw];//kbnw
 											////////////////////////////////////////////////////////////////////////////////////
 			real drho = ((((mfccc + mfaaa) + (mfaca + mfcac)) + ((mfacc + mfcaa) + (mfaac + mfcca))) +
 				(((mfbac + mfbca) + (mfbaa + mfbcc)) + ((mfabc + mfcba) + (mfaba + mfcbc)) + ((mfacb + mfcab) + (mfaab + mfccb))) +
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK15Sponge/CumulantK15SpongeComp.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK15Sponge/CumulantK15SpongeComp.cu
index d28c077031ff9125d1cbc1187def1d1d8fe4d6e8..613464125bafc572fe7951b8c372e3455ea5b21d 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK15Sponge/CumulantK15SpongeComp.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK15Sponge/CumulantK15SpongeComp.cu
@@ -2,6 +2,7 @@
 
 #include "CumulantK15SpongeComp_Device.cuh"
 #include "Parameter/Parameter.h"
+#include "cuda/CudaGrid.h"
 
 std::shared_ptr<CumulantK15SpongeComp> CumulantK15SpongeComp::getNewInstance(std::shared_ptr<Parameter> para, int level)
 {
@@ -10,36 +11,21 @@ std::shared_ptr<CumulantK15SpongeComp> CumulantK15SpongeComp::getNewInstance(std
 
 void CumulantK15SpongeComp::run()
 {
-	int size_Mat = para->getParD(level)->numberOfNodes;
-	int numberOfThreads = para->getParD(level)->numberofthreads;
-
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	LB_Kernel_CumulantK15SpongeComp <<< grid, threads >>>(	para->getParD(level)->omega,
-															para->getParD(level)->typeOfGridNode,
-															para->getParD(level)->neighborX,
-															para->getParD(level)->neighborY,
-															para->getParD(level)->neighborZ,
-															para->getParD(level)->coordinateX,
-															para->getParD(level)->coordinateY,
-															para->getParD(level)->coordinateZ,
-															para->getParD(level)->distributions.f[0],
-															para->getParD(level)->numberOfNodes,
-															para->getParD(level)->isEvenTimestep);
-	getLastCudaError("LB_Kernel_CumulantK15SpongeComp execution failed");
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(para->getParD(level)->numberofthreads, para->getParD(level)->numberOfNodes); // launch configuration built from this level's thread count and node count
+
+    LB_Kernel_CumulantK15SpongeComp <<< grid.grid, grid.threads >>>( // launch with the grid/threads members of the CudaGrid built above
+        para->getParD(level)->omega, // first kernel parameter, declared as omegaIn in the device kernel
+        para->getParD(level)->typeOfGridNode,
+        para->getParD(level)->neighborX,
+        para->getParD(level)->neighborY,
+        para->getParD(level)->neighborZ,
+        para->getParD(level)->coordinateX, // coordinateX/Y/Z: presumably per-node positions used by the sponge treatment - confirm in the kernel
+        para->getParD(level)->coordinateY,
+        para->getParD(level)->coordinateZ,
+        para->getParD(level)->distributions.f[0], // presumably the base pointer the kernel indexes as DDStart[DIR_* * size_Mat] - confirm against the kernel signature
+        para->getParD(level)->numberOfNodes, // same node count that sized the launch grid
+        para->getParD(level)->isEvenTimestep); // presumably the kernel's EvenOrOdd flag that selects the direct or swapped direction mapping - confirm
+    getLastCudaError("LB_Kernel_CumulantK15SpongeComp execution failed"); // post-launch error check; helper is defined outside this file
 }
 
 CumulantK15SpongeComp::CumulantK15SpongeComp(std::shared_ptr<Parameter> para, int level)
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK15Sponge/CumulantK15SpongeComp_Device.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK15Sponge/CumulantK15SpongeComp_Device.cu
index c2144d324aa3378e8fc9fc5b511bbed385b48a84..13788e65e70eb30803111a39a70d39682648a006 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK15Sponge/CumulantK15SpongeComp_Device.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK15Sponge/CumulantK15SpongeComp_Device.cu
@@ -40,63 +40,63 @@ __global__ void LB_Kernel_CumulantK15SpongeComp(real omegaIn,
 			Distributions27 D;
 			if (EvenOrOdd == true)
 			{
-				D.f[DIR_P00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_M00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_MPM * size_Mat];
 			}
 			else
 			{
-				D.f[DIR_M00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_P00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_MPM * size_Mat];
 			}
 
 			////////////////////////////////////////////////////////////////////////////////
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK15Unified/CumulantK15Unified.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK15Unified/CumulantK15Unified.cu
index 0b72b46cf25f331172be4abb8dded6d8e5e2b9c5..24b0bbc6f43a63093da6b6dcb3ce401b8a614f75 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK15Unified/CumulantK15Unified.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK15Unified/CumulantK15Unified.cu
@@ -30,15 +30,16 @@ CumulantK15Unified::CumulantK15Unified(std::shared_ptr<Parameter> para, int leve
 
 void CumulantK15Unified::run()
 {
-    GPUKernelParameter kernelParameter{ para->getParD(level)->omega,
-                                                 para->getParD(level)->typeOfGridNode,
-                                                 para->getParD(level)->neighborX,
-                                                 para->getParD(level)->neighborY,
-                                                 para->getParD(level)->neighborZ,
-                                                 para->getParD(level)->distributions.f[0],
-                                                 (int)para->getParD(level)->numberOfNodes,
-                                                 para->getParD(level)->forcing,
-                                                 para->getParD(level)->isEvenTimestep };
+    GPUKernelParameter kernelParameter{
+        para->getParD(level)->omega,
+        para->getParD(level)->typeOfGridNode,
+        para->getParD(level)->neighborX,
+        para->getParD(level)->neighborY,
+        para->getParD(level)->neighborZ,
+        para->getParD(level)->distributions.f[0],
+        (int)para->getParD(level)->numberOfNodes,
+        para->getParD(level)->forcing,
+        para->getParD(level)->isEvenTimestep };
 
     auto lambda = [] __device__(lbm::KernelParameter parameter) {
         return lbm::cumulantChimera(parameter, lbm::setRelaxationRatesK15);
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17/CumulantK17.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17/CumulantK17.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ea3442fecca63fdcb45878d742a547ce492ab5c8
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17/CumulantK17.cu
@@ -0,0 +1,140 @@
+#include "CumulantK17.h"
+#include <logger/Logger.h>
+#include "Parameter/Parameter.h"
+#include "Parameter/CudaStreamManager.h"
+#include "CumulantK17_Device.cuh"
+
+#include <cuda.h>
+
+template<TurbulenceModel turbulenceModel>
+std::shared_ptr< CumulantK17<turbulenceModel> > CumulantK17<turbulenceModel>::getNewInstance(std::shared_ptr<Parameter> para, int level)
+{
+    return std::shared_ptr<CumulantK17>(new CumulantK17(para, level)); // both constructors are private, so build with new and wrap the pointer
+}
+
+template<TurbulenceModel turbulenceModel>
+void CumulantK17<turbulenceModel>::run()
+{
+    auto&& parD = para->getParD(level); // parameter set of this level, looked up once for the whole argument list
+    LB_Kernel_CumulantK17 < turbulenceModel, false, false > <<< cudaGrid.grid, cudaGrid.threads >>>(
+        parD->omega,
+        parD->neighborX, parD->neighborY, parD->neighborZ,
+        parD->distributions.f[0],
+        parD->rho,
+        parD->velocityX, parD->velocityY, parD->velocityZ,
+        parD->turbViscosity, para->getSGSConstant(),
+        parD->numberOfNodes, level,
+        para->getForcesDev(),
+        parD->forceX_SP, parD->forceY_SP, parD->forceZ_SP,
+        para->getQuadricLimitersDev(),
+        parD->isEvenTimestep,
+        parD->taggedFluidNodeIndices[CollisionTemplate::Default],
+        parD->numberOfTaggedFluidNodes[CollisionTemplate::Default]);
+
+    getLastCudaError("LB_Kernel_CumulantK17 execution failed");
+}
+
+// Launches the collision kernel on the stream selected by streamIndex for the node list indices (size_indices entries).
+// The collision template only picks the two boolean template flags of LB_Kernel_CumulantK17; the runtime arguments are the same in every case.
+// NOTE(review): the flags presumably mean <writeMacroscopicVariables, applyBodyForce> (inferred from the case names) - confirm in CumulantK17_Device.cuh.
+// Throws std::runtime_error for a collision template without a kernel variant.
+template<TurbulenceModel turbulenceModel>
+void CumulantK17<turbulenceModel>::runOnIndices( const unsigned int *indices, unsigned int size_indices, CollisionTemplate collisionTemplate, CudaStreamIndex streamIndex )
+{
+    cudaStream_t stream = para->getStreamManager()->getStream(streamIndex);
+    auto&& parD = para->getParD(level);
+
+    switch (collisionTemplate)
+    {
+        case CollisionTemplate::Default:
+            LB_Kernel_CumulantK17 < turbulenceModel, false, false > <<< cudaGrid.grid, cudaGrid.threads, 0, stream >>>(
+                parD->omega,
+                parD->neighborX, parD->neighborY, parD->neighborZ,
+                parD->distributions.f[0],
+                parD->rho,
+                parD->velocityX, parD->velocityY, parD->velocityZ,
+                parD->turbViscosity, para->getSGSConstant(),
+                parD->numberOfNodes, level,
+                para->getForcesDev(),
+                parD->forceX_SP, parD->forceY_SP, parD->forceZ_SP,
+                para->getQuadricLimitersDev(),
+                parD->isEvenTimestep,
+                indices, size_indices);
+            break;
+
+        case CollisionTemplate::WriteMacroVars:
+            LB_Kernel_CumulantK17 < turbulenceModel, true, false > <<< cudaGrid.grid, cudaGrid.threads, 0, stream >>>(
+                parD->omega,
+                parD->neighborX, parD->neighborY, parD->neighborZ,
+                parD->distributions.f[0],
+                parD->rho,
+                parD->velocityX, parD->velocityY, parD->velocityZ,
+                parD->turbViscosity, para->getSGSConstant(),
+                parD->numberOfNodes, level,
+                para->getForcesDev(),
+                parD->forceX_SP, parD->forceY_SP, parD->forceZ_SP,
+                para->getQuadricLimitersDev(),
+                parD->isEvenTimestep,
+                indices, size_indices);
+            break;
+
+        case CollisionTemplate::SubDomainBorder:
+        case CollisionTemplate::AllFeatures:
+            LB_Kernel_CumulantK17 < turbulenceModel, true, true > <<< cudaGrid.grid, cudaGrid.threads, 0, stream >>>(
+                parD->omega,
+                parD->neighborX, parD->neighborY, parD->neighborZ,
+                parD->distributions.f[0],
+                parD->rho,
+                parD->velocityX, parD->velocityY, parD->velocityZ,
+                parD->turbViscosity, para->getSGSConstant(),
+                parD->numberOfNodes, level,
+                para->getForcesDev(),
+                parD->forceX_SP, parD->forceY_SP, parD->forceZ_SP,
+                para->getQuadricLimitersDev(),
+                parD->isEvenTimestep,
+                indices, size_indices);
+            break;
+
+        case CollisionTemplate::ApplyBodyForce:
+            LB_Kernel_CumulantK17 < turbulenceModel, false, true > <<< cudaGrid.grid, cudaGrid.threads, 0, stream >>>(
+                parD->omega,
+                parD->neighborX, parD->neighborY, parD->neighborZ,
+                parD->distributions.f[0],
+                parD->rho,
+                parD->velocityX, parD->velocityY, parD->velocityZ,
+                parD->turbViscosity, para->getSGSConstant(),
+                parD->numberOfNodes, level,
+                para->getForcesDev(),
+                parD->forceX_SP, parD->forceY_SP, parD->forceZ_SP,
+                para->getQuadricLimitersDev(),
+                parD->isEvenTimestep,
+                indices, size_indices);
+            break;
+
+        default:
+            throw std::runtime_error("Invalid CollisionTemplate in CumulantK17::runOnIndices()");
+    }
+
+    getLastCudaError("LB_Kernel_CumulantK17 execution failed");
+}
+
+template<TurbulenceModel turbulenceModel>
+CumulantK17<turbulenceModel>::CumulantK17(std::shared_ptr<Parameter> para, int level)
+{
+    // Bind the instance to its parameter object and grid level, then register its preprocessor type and kernel group.
+    this->para = para;
+    this->level = level;
+    myPreProcessorTypes.push_back(InitCompSP27);
+    myKernelGroup = BasicKernel;
+
+    // The launch configuration is built once from the level's thread count and node count.
+    auto&& parD = para->getParD(level);
+    this->cudaGrid = vf::cuda::CudaGrid(parD->numberofthreads, parD->numberOfNodes);
+    this->kernelUsesFluidNodeIndices = true;
+    VF_LOG_INFO("Using turbulence model: {}", turbulenceModel);
+}
+
+template class CumulantK17<TurbulenceModel::AMD>; // explicit instantiations: the member definitions live in this .cu file,
+template class CumulantK17<TurbulenceModel::Smagorinsky>; // so each turbulence model used elsewhere needs an entry here
+template class CumulantK17<TurbulenceModel::QR>;
+template class CumulantK17<TurbulenceModel::None>;
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17/CumulantK17.h b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17/CumulantK17.h
new file mode 100644
index 0000000000000000000000000000000000000000..00c79a30c9ccf9a89901165d020fc85d5a479c1d
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17/CumulantK17.h
@@ -0,0 +1,20 @@
+#ifndef CUMULANT_K17_H
+#define CUMULANT_K17_H
+
+#include "Kernel/KernelImp.h"
+#include "Parameter/Parameter.h"
+
+template<TurbulenceModel turbulenceModel>
+class CumulantK17 : public KernelImp // host-side launcher for LB_Kernel_CumulantK17, one specialization per turbulence model
+{
+public:
+    static std::shared_ptr<CumulantK17> getNewInstance(std::shared_ptr<Parameter> para, int level); // factory; the constructors are private
+    void run() override; // launches the kernel on the node list tagged CollisionTemplate::Default
+    void runOnIndices(const unsigned int *indices, unsigned int size_indices, CollisionTemplate collisionTemplate, CudaStreamIndex streamIndex) override; // launches on a given node list and stream
+
+private:
+    CumulantK17(); // declared only; CumulantK17.cu provides no definition
+    CumulantK17(std::shared_ptr<Parameter> para, int level);
+};
+
+#endif 
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17/CumulantK17Comp.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17/CumulantK17Comp.cu
deleted file mode 100644
index b176b94d07e7f280d738a797d5bd853095e3caed..0000000000000000000000000000000000000000
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17/CumulantK17Comp.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-#include "CumulantK17Comp.h"
-
-#include "Parameter/Parameter.h"
-#include "CumulantK17Comp_Device.cuh"
-#include "cuda/CudaGrid.h"
-
-std::shared_ptr<CumulantK17Comp> CumulantK17Comp::getNewInstance(std::shared_ptr<Parameter> para, int level)
-{
-	return std::shared_ptr<CumulantK17Comp>(new CumulantK17Comp(para,level));
-}
-
-void CumulantK17Comp::run()
-{
-	LB_Kernel_CumulantK17Comp <<< cudaGrid.grid, cudaGrid.threads >>>(para->getParD(level)->omega,
-													para->getParD(level)->typeOfGridNode,
-													para->getParD(level)->neighborX,
-													para->getParD(level)->neighborY,
-													para->getParD(level)->neighborZ,
-													para->getParD(level)->distributions.f[0],
-													para->getParD(level)->numberOfNodes,
-													level,
-													para->getForcesDev(),
-                                                    para->getQuadricLimitersDev(),
-													para->getParD(level)->isEvenTimestep);
-	getLastCudaError("LB_Kernel_CumulantK17Comp execution failed");
-}
-
-CumulantK17Comp::CumulantK17Comp(std::shared_ptr<Parameter> para, int level): KernelImp(para, level)
-{
-	myPreProcessorTypes.push_back(InitCompSP27);
-	myKernelGroup = BasicKernel;
-	this->cudaGrid = vf::cuda::CudaGrid(para->getParD(level)->numberofthreads, para->getParD(level)->numberOfNodes);
-}
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17/CumulantK17Comp.h b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17/CumulantK17Comp.h
deleted file mode 100644
index 22a95a688e5d078d7b710f494bfea360c9af0d6b..0000000000000000000000000000000000000000
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17/CumulantK17Comp.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef CUMULANT_K17_COMP_H
-#define CUMULANT_K17_COMP_H
-
-#include "Kernel/KernelImp.h"
-
-class CumulantK17Comp : public KernelImp
-{
-public:
-	static std::shared_ptr<CumulantK17Comp> getNewInstance(std::shared_ptr< Parameter> para, int level);
-	void run();
-
-private:
-	CumulantK17Comp();
-	CumulantK17Comp(std::shared_ptr< Parameter> para, int level);
-};
-
-#endif 
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17/CumulantK17Comp_Device.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17/CumulantK17Comp_Device.cu
deleted file mode 100644
index 7cf27aa883cbfd3a0e4a0a36fa61649a62d06eeb..0000000000000000000000000000000000000000
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17/CumulantK17Comp_Device.cu
+++ /dev/null
@@ -1,1040 +0,0 @@
-#include "LBM/LB.h" 
-#include "lbm/constants/D3Q27.h"
-#include <lbm/constants/NumericConstants.h>
-
-using namespace vf::lbm::constant;
-using namespace vf::lbm::dir;
-#include "math.h"
-
-
-__global__ void LB_Kernel_CumulantK17Comp(real omega,
-	unsigned int* bcMatD,
-	unsigned int* neighborX,
-	unsigned int* neighborY,
-	unsigned int* neighborZ,
-	real* DDStart,
-	int size_Mat,
-	int level,
-	real* forces,
-	real* quadricLimiters,
-	bool EvenOrOdd)
-{
-	////////////////////////////////////////////////////////////////////////////////
-	const unsigned  x = threadIdx.x;  // Globaler x-Index 
-	const unsigned  y = blockIdx.x;   // Globaler y-Index 
-	const unsigned  z = blockIdx.y;   // Globaler z-Index 
-
-	const unsigned nx = blockDim.x;
-	const unsigned ny = gridDim.x;
-
-	const unsigned k = nx*(ny*z + y) + x;
-	//////////////////////////////////////////////////////////////////////////
-
-	if (k<size_Mat)
-	{
-		////////////////////////////////////////////////////////////////////////////////
-		unsigned int BC;
-		BC = bcMatD[k];
-
-		if ((BC != GEO_SOLID) && (BC != GEO_VOID))
-		{
-			Distributions27 D;
-			if (EvenOrOdd == true)
-			{
-				D.f[DIR_P00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_M00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_MPM *size_Mat];
-			}
-			else
-			{
-				D.f[DIR_M00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_P00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_MPM *size_Mat];
-			}
-
-			////////////////////////////////////////////////////////////////////////////////
-			//index
-			//unsigned int kzero= k;
-			//unsigned int ke   = k;
-			unsigned int kw = neighborX[k];
-			//unsigned int kn   = k;
-			unsigned int ks = neighborY[k];
-			//unsigned int kt   = k;
-			unsigned int kb = neighborZ[k];
-			unsigned int ksw = neighborY[kw];
-			//unsigned int kne  = k;
-			//unsigned int kse  = ks;
-			//unsigned int knw  = kw;
-			unsigned int kbw = neighborZ[kw];
-			//unsigned int kte  = k;
-			//unsigned int kbe  = kb;
-			//unsigned int ktw  = kw;
-			unsigned int kbs = neighborZ[ks];
-			//unsigned int ktn  = k;
-			//unsigned int kbn  = kb;
-			//unsigned int kts  = ks;
-			//unsigned int ktse = ks;
-			//unsigned int kbnw = kbw;
-			//unsigned int ktnw = kw;
-			//unsigned int kbse = kbs;
-			//unsigned int ktsw = ksw;
-			//unsigned int kbne = kb;
-			//unsigned int ktne = k;
-			unsigned int kbsw = neighborZ[ksw];
-
-			//unsigned int kzero= k;
-			//unsigned int ke   = k;
-			//unsigned int kw   = neighborX[k];
-			//unsigned int kn   = k;
-			//unsigned int ks   = neighborY[k];
-			//unsigned int kt   = k;
-			//unsigned int kb   = neighborZ[k];
-			//unsigned int ksw  = neighborY[kw];
-			//unsigned int kne  = k;
-			//unsigned int kse  = ks;
-			//unsigned int knw  = kw;
-			//unsigned int kbw  = neighborZ[kw];
-			//unsigned int kte  = k;
-			//unsigned int kbe  = kb;
-			//unsigned int ktw  = kw;
-			//unsigned int kbs  = neighborZ[ks];
-			//unsigned int ktn  = k;
-			//unsigned int kbn  = kb;
-			//unsigned int kts  = ks;
-			//unsigned int ktse = ks;
-			//unsigned int kbnw = kbw;
-			//unsigned int ktnw = kw;
-			//unsigned int kbse = kbs;
-			//unsigned int ktsw = ksw;
-			//unsigned int kbne = kb;
-			//unsigned int ktne = k;
-			//unsigned int kbsw = neighborZ[ksw];
-			//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-			real mfcbb = (D.f[DIR_P00])[k];//[ke   ];// +  c2over27 ;(D.f[DIR_P00   ])[k  ];//ke
-			real mfabb = (D.f[DIR_M00])[kw];//[kw   ];// +  c2over27 ;(D.f[DIR_M00   ])[kw ];
-			real mfbcb = (D.f[DIR_0P0])[k];//[kn   ];// +  c2over27 ;(D.f[DIR_0P0   ])[k  ];//kn
-			real mfbab = (D.f[DIR_0M0])[ks];//[ks   ];// +  c2over27 ;(D.f[DIR_0M0   ])[ks ];
-			real mfbbc = (D.f[DIR_00P])[k];//[kt   ];// +  c2over27 ;(D.f[DIR_00P   ])[k  ];//kt
-			real mfbba = (D.f[DIR_00M])[kb];//[kb   ];// +  c2over27 ;(D.f[DIR_00M   ])[kb ];
-			real mfccb = (D.f[DIR_PP0])[k];//[kne  ];// +  c1over54 ;(D.f[DIR_PP0  ])[k  ];//kne
-			real mfaab = (D.f[DIR_MM0])[ksw];//[ksw  ];// +  c1over54 ;(D.f[DIR_MM0  ])[ksw];
-			real mfcab = (D.f[DIR_PM0])[ks];//[kse  ];// +  c1over54 ;(D.f[DIR_PM0  ])[ks ];//kse
-			real mfacb = (D.f[DIR_MP0])[kw];//[knw  ];// +  c1over54 ;(D.f[DIR_MP0  ])[kw ];//knw
-			real mfcbc = (D.f[DIR_P0P])[k];//[kte  ];// +  c1over54 ;(D.f[DIR_P0P  ])[k  ];//kte
-			real mfaba = (D.f[DIR_M0M])[kbw];//[kbw  ];// +  c1over54 ;(D.f[DIR_M0M  ])[kbw];
-			real mfcba = (D.f[DIR_P0M])[kb];//[kbe  ];// +  c1over54 ;(D.f[DIR_P0M  ])[kb ];//kbe
-			real mfabc = (D.f[DIR_M0P])[kw];//[ktw  ];// +  c1over54 ;(D.f[DIR_M0P  ])[kw ];//ktw
-			real mfbcc = (D.f[DIR_0PP])[k];//[ktn  ];// +  c1over54 ;(D.f[DIR_0PP  ])[k  ];//ktn
-			real mfbaa = (D.f[DIR_0MM])[kbs];//[kbs  ];// +  c1over54 ;(D.f[DIR_0MM  ])[kbs];
-			real mfbca = (D.f[DIR_0PM])[kb];//[kbn  ];// +  c1over54 ;(D.f[DIR_0PM  ])[kb ];//kbn
-			real mfbac = (D.f[DIR_0MP])[ks];//[kts  ];// +  c1over54 ;(D.f[DIR_0MP  ])[ks ];//kts
-			real mfbbb = (D.f[DIR_000])[k];//[kzero];// +  c8over27 ;(D.f[DIR_000])[k  ];//kzero
-			real mfccc = (D.f[DIR_PPP])[k];//[ktne ];// +  c1over216;(D.f[DIR_PPP ])[k  ];//ktne
-			real mfaac = (D.f[DIR_MMP])[ksw];//[ktsw ];// +  c1over216;(D.f[DIR_MMP ])[ksw];//ktsw
-			real mfcac = (D.f[DIR_PMP])[ks];//[ktse ];// +  c1over216;(D.f[DIR_PMP ])[ks ];//ktse
-			real mfacc = (D.f[DIR_MPP])[kw];//[ktnw ];// +  c1over216;(D.f[DIR_MPP ])[kw ];//ktnw
-			real mfcca = (D.f[DIR_PPM])[kb];//[kbne ];// +  c1over216;(D.f[DIR_PPM ])[kb ];//kbne
-			real mfaaa = (D.f[DIR_MMM])[kbsw];//[kbsw ];// +  c1over216;(D.f[DIR_MMM ])[kbsw];
-			real mfcaa = (D.f[DIR_PMM])[kbs];//[kbse ];// +  c1over216;(D.f[DIR_PMM ])[kbs];//kbse
-			real mfaca = (D.f[DIR_MPM])[kbw];//[kbnw ];// +  c1over216;(D.f[DIR_MPM ])[kbw];//kbnw
-											////////////////////////////////////////////////////////////////////////////////////
-			real drho = ((((mfccc + mfaaa) + (mfaca + mfcac)) + ((mfacc + mfcaa) + (mfaac + mfcca))) +
-				(((mfbac + mfbca) + (mfbaa + mfbcc)) + ((mfabc + mfcba) + (mfaba + mfcbc)) + ((mfacb + mfcab) + (mfaab + mfccb))) +
-				((mfabb + mfcbb) + (mfbab + mfbcb) + (mfbba + mfbbc))) + mfbbb;
-
-			real rho = c1o1 + drho;
-			////////////////////////////////////////////////////////////////////////////////////
-			//slow
-			//real oMdrho = one - ((((mfccc+mfaaa) + (mfaca+mfcac)) + ((mfacc+mfcaa) + (mfaac+mfcca))) + 
-			//					   (((mfbac+mfbca) + (mfbaa+mfbcc)) + ((mfabc+mfcba) + (mfaba+mfcbc)) + ((mfacb+mfcab) + (mfaab+mfccb))) +
-			//						((mfabb+mfcbb) + (mfbab+mfbcb)  +  (mfbba+mfbbc)));//fehlt mfbbb
-			real vvx = ((((mfccc - mfaaa) + (mfcac - mfaca)) + ((mfcaa - mfacc) + (mfcca - mfaac))) +
-				(((mfcba - mfabc) + (mfcbc - mfaba)) + ((mfcab - mfacb) + (mfccb - mfaab))) +
-				(mfcbb - mfabb)) / rho;
-			real vvy = ((((mfccc - mfaaa) + (mfaca - mfcac)) + ((mfacc - mfcaa) + (mfcca - mfaac))) +
-				(((mfbca - mfbac) + (mfbcc - mfbaa)) + ((mfacb - mfcab) + (mfccb - mfaab))) +
-				(mfbcb - mfbab)) / rho;
-			real vvz = ((((mfccc - mfaaa) + (mfcac - mfaca)) + ((mfacc - mfcaa) + (mfaac - mfcca))) +
-				(((mfbac - mfbca) + (mfbcc - mfbaa)) + ((mfabc - mfcba) + (mfcbc - mfaba))) +
-				(mfbbc - mfbba)) / rho;
-			////////////////////////////////////////////////////////////////////////////////////
-			//the force be with you
-			real fx = forces[0] / (pow((double)c2o1, (double)level)); //zero;//0.0032653/(pow(two,level)); //0.000000005;//(two/1600000.0) / 120.0; //
-			real fy = forces[1] / (pow((double)c2o1, (double)level)); //zero;
-			real fz = forces[2] / (pow((double)c2o1, (double)level)); //zero;
-			vvx += fx*c1o2;
-			vvy += fy*c1o2;
-			vvz += fz*c1o2;
-			////////////////////////////////////////////////////////////////////////////////////
-			//real omega = omega_in;
-			////////////////////////////////////////////////////////////////////////////////////
-			//fast
-			real oMdrho = c1o1; // comp special
-							   //real oMdrho = one - (mfccc+mfaaa + mfaca+mfcac + mfacc+mfcaa + mfaac+mfcca + 
-							   //					   mfbac+mfbca + mfbaa+mfbcc + mfabc+mfcba + mfaba+mfcbc + mfacb+mfcab + mfaab+mfccb +
-							   //					   mfabb+mfcbb + mfbab+mfbcb + mfbba+mfbbc + mfbbb);//fehlt mfbbb nicht mehr
-							   //real vvx    =mfccc-mfaaa + mfcac-mfaca + mfcaa-mfacc + mfcca-mfaac + 
-							   //				mfcba-mfabc + mfcbc-mfaba + mfcab-mfacb + mfccb-mfaab +
-							   //				mfcbb-mfabb;
-							   //real vvy    =mfccc-mfaaa + mfaca-mfcac + mfacc-mfcaa + mfcca-mfaac + 
-							   //				mfbca-mfbac + mfbcc-mfbaa + mfacb-mfcab + mfccb-mfaab +
-							   //				mfbcb-mfbab;
-							   //real vvz    =mfccc-mfaaa + mfcac-mfaca + mfacc-mfcaa + mfaac-mfcca + 
-							   //				mfbac-mfbca + mfbcc-mfbaa + mfabc-mfcba + mfcbc-mfaba +
-							   //				mfbbc-mfbba;
-							   ////////////////////////////////////////////////////////////////////////////////////
-							   // oMdrho assembler style -------> faaaaaastaaaa
-							   // or much sloooowaaaa ... it dep�ndssssss on sadaku
-			real m0, m1, m2;
-			//real oMdrho;
-			//{
-			//	oMdrho=mfccc+mfaaa;
-			//	m0=mfaca+mfcac;
-			//	m1=mfacc+mfcaa;
-			//	m2=mfaac+mfcca;
-			//	oMdrho+=m0;
-			//	m1+=m2;
-			//	oMdrho+=m1;
-			//	m0=mfbac+mfbca;
-			//	m1=mfbaa+mfbcc;
-			//	m0+=m1;
-			//	m1=mfabc+mfcba;
-			//	m2=mfaba+mfcbc;
-			//	m1+=m2;
-			//	m0+=m1;
-			//	m1=mfacb+mfcab;
-			//	m2=mfaab+mfccb;
-			//	m1+=m2;
-			//	m0+=m1;
-			//	oMdrho+=m0;
-			//	m0=mfabb+mfcbb;
-			//	m1=mfbab+mfbcb;
-			//	m2=mfbba+mfbbc;
-			//	m0+=m1+m2;
-			//	m0+=mfbbb; //hat gefehlt
-			//	oMdrho = one - (oMdrho + m0);
-			//}
-			//real vvx;
-			real vx2;
-			//{
-			//	vvx = mfccc-mfaaa;
-			//	m0  = mfcac-mfaca;
-			//	m1  = mfcaa-mfacc;
-			//	m2  = mfcca-mfaac;
-			//	vvx+= m0;
-			//	m1 += m2;
-			//	vvx+= m1;
-			//	vx2 = mfcba-mfabc;
-			//	m0  = mfcbc-mfaba;
-			//	m1  = mfcab-mfacb;
-			//	m2  = mfccb-mfaab;
-			//	vx2+= m0;
-			//	m1 += m2;
-			//	vx2+= m1;
-			//	vvx+= vx2;
-			//	vx2 = mfcbb-mfabb;
-			//	vvx+= vx2;
-			//}
-			//real vvy;
-			real vy2;
-			//{
-			//	vvy = mfccc-mfaaa;
-			//	m0  = mfaca-mfcac;
-			//	m1  = mfacc-mfcaa;
-			//	m2  = mfcca-mfaac;
-			//	vvy+= m0;
-			//	m1 += m2;
-			//	vvy+= m1;
-			//	vy2 = mfbca-mfbac;
-			//	m0  = mfbcc-mfbaa;
-			//	m1  = mfacb-mfcab;
-			//	m2  = mfccb-mfaab;
-			//	vy2+= m0;
-			//	m1 += m2;
-			//	vy2+= m1;
-			//	vvy+= vy2;
-			//	vy2 = mfbcb-mfbab;
-			//	vvy+= vy2;
-			//}
-			//real vvz;
-			real vz2;
-			//{
-			//	vvz = mfccc-mfaaa;
-			//	m0  = mfcac-mfaca;
-			//	m1  = mfacc-mfcaa;
-			//	m2  = mfaac-mfcca;
-			//	vvz+= m0;
-			//	m1 += m2;
-			//	vvz+= m1;
-			//	vz2 = mfbac-mfbca;
-			//	m0  = mfbcc-mfbaa;
-			//	m1  = mfabc-mfcba;
-			//	m2  = mfcbc-mfaba;
-			//	vz2+= m0;
-			//	m1 += m2;
-			//	vz2+= m1;
-			//	vvz+= vz2;
-			//	vz2 = mfbbc-mfbba;
-			//	vvz+= vz2;
-			//}
-			vx2 = vvx*vvx;
-			vy2 = vvy*vvy;
-			vz2 = vvz*vvz;
-			////////////////////////////////////////////////////////////////////////////////////
-			real wadjust;
-			real qudricLimitP = quadricLimiters[0];  //0.01f; //  * 0.0001f; // 1000000.0f; // 1000000.0f; //
-			real qudricLimitM = quadricLimiters[1];  //0.01f; //  * 0.0001f; // 1000000.0f; // 1000000.0f; //
-			real qudricLimitD = quadricLimiters[2];  //0.01f; //  * 0.001f;  // 1000000.0f; // 1000000.0f; //
-									  ////////////////////////////////////////////////////////////////////////////////////
-									  //Hin
-									  ////////////////////////////////////////////////////////////////////////////////////
-									  // mit 1/36, 1/9, 1/36, 1/9, 4/9, 1/9, 1/36, 1/9, 1/36  Konditionieren
-									  ////////////////////////////////////////////////////////////////////////////////////
-									  // Z - Dir
-			m2 = mfaaa + mfaac;
-			m1 = mfaac - mfaaa;
-			m0 = m2 + mfaab;
-			mfaaa = m0;
-			m0 += c1o36 * oMdrho;
-			mfaab = m1 - m0 * vvz;
-			mfaac = m2 - c2o1*	m1 * vvz + vz2 * m0;
-			////////////////////////////////////////////////////////////////////////////////////
-			m2 = mfaba + mfabc;
-			m1 = mfabc - mfaba;
-			m0 = m2 + mfabb;
-			mfaba = m0;
-			m0 += c1o9 * oMdrho;
-			mfabb = m1 - m0 * vvz;
-			mfabc = m2 - c2o1*	m1 * vvz + vz2 * m0;
-			////////////////////////////////////////////////////////////////////////////////////
-			m2 = mfaca + mfacc;
-			m1 = mfacc - mfaca;
-			m0 = m2 + mfacb;
-			mfaca = m0;
-			m0 += c1o36 * oMdrho;
-			mfacb = m1 - m0 * vvz;
-			mfacc = m2 - c2o1*	m1 * vvz + vz2 * m0;
-			////////////////////////////////////////////////////////////////////////////////////
-			////////////////////////////////////////////////////////////////////////////////////
-			m2 = mfbaa + mfbac;
-			m1 = mfbac - mfbaa;
-			m0 = m2 + mfbab;
-			mfbaa = m0;
-			m0 += c1o9 * oMdrho;
-			mfbab = m1 - m0 * vvz;
-			mfbac = m2 - c2o1*	m1 * vvz + vz2 * m0;
-			////////////////////////////////////////////////////////////////////////////////////
-			m2 = mfbba + mfbbc;
-			m1 = mfbbc - mfbba;
-			m0 = m2 + mfbbb;
-			mfbba = m0;
-			m0 += c4o9 * oMdrho;
-			mfbbb = m1 - m0 * vvz;
-			mfbbc = m2 - c2o1*	m1 * vvz + vz2 * m0;
-			////////////////////////////////////////////////////////////////////////////////////
-			m2 = mfbca + mfbcc;
-			m1 = mfbcc - mfbca;
-			m0 = m2 + mfbcb;
-			mfbca = m0;
-			m0 += c1o9 * oMdrho;
-			mfbcb = m1 - m0 * vvz;
-			mfbcc = m2 - c2o1*	m1 * vvz + vz2 * m0;
-			////////////////////////////////////////////////////////////////////////////////////
-			////////////////////////////////////////////////////////////////////////////////////
-			m2 = mfcaa + mfcac;
-			m1 = mfcac - mfcaa;
-			m0 = m2 + mfcab;
-			mfcaa = m0;
-			m0 += c1o36 * oMdrho;
-			mfcab = m1 - m0 * vvz;
-			mfcac = m2 - c2o1*	m1 * vvz + vz2 * m0;
-			////////////////////////////////////////////////////////////////////////////////////
-			m2 = mfcba + mfcbc;
-			m1 = mfcbc - mfcba;
-			m0 = m2 + mfcbb;
-			mfcba = m0;
-			m0 += c1o9 * oMdrho;
-			mfcbb = m1 - m0 * vvz;
-			mfcbc = m2 - c2o1*	m1 * vvz + vz2 * m0;
-			////////////////////////////////////////////////////////////////////////////////////
-			m2 = mfcca + mfccc;
-			m1 = mfccc - mfcca;
-			m0 = m2 + mfccb;
-			mfcca = m0;
-			m0 += c1o36 * oMdrho;
-			mfccb = m1 - m0 * vvz;
-			mfccc = m2 - c2o1*	m1 * vvz + vz2 * m0;
-			////////////////////////////////////////////////////////////////////////////////////
-			////////////////////////////////////////////////////////////////////////////////////
-			// mit  1/6, 0, 1/18, 2/3, 0, 2/9, 1/6, 0, 1/18 Konditionieren
-			////////////////////////////////////////////////////////////////////////////////////
-			// Y - Dir
-			m2 = mfaaa + mfaca;
-			m1 = mfaca - mfaaa;
-			m0 = m2 + mfaba;
-			mfaaa = m0;
-			m0 += c1o6 * oMdrho;
-			mfaba = m1 - m0 * vvy;
-			mfaca = m2 - c2o1*	m1 * vvy + vy2 * m0;
-			////////////////////////////////////////////////////////////////////////////////////
-			m2 = mfaab + mfacb;
-			m1 = mfacb - mfaab;
-			m0 = m2 + mfabb;
-			mfaab = m0;
-			mfabb = m1 - m0 * vvy;
-			mfacb = m2 - c2o1*	m1 * vvy + vy2 * m0;
-			////////////////////////////////////////////////////////////////////////////////////
-			m2 = mfaac + mfacc;
-			m1 = mfacc - mfaac;
-			m0 = m2 + mfabc;
-			mfaac = m0;
-			m0 += c1o18 * oMdrho;
-			mfabc = m1 - m0 * vvy;
-			mfacc = m2 - c2o1*	m1 * vvy + vy2 * m0;
-			////////////////////////////////////////////////////////////////////////////////////
-			////////////////////////////////////////////////////////////////////////////////////
-			m2 = mfbaa + mfbca;
-			m1 = mfbca - mfbaa;
-			m0 = m2 + mfbba;
-			mfbaa = m0;
-			m0 += c2o3 * oMdrho;
-			mfbba = m1 - m0 * vvy;
-			mfbca = m2 - c2o1*	m1 * vvy + vy2 * m0;
-			////////////////////////////////////////////////////////////////////////////////////
-			m2 = mfbab + mfbcb;
-			m1 = mfbcb - mfbab;
-			m0 = m2 + mfbbb;
-			mfbab = m0;
-			mfbbb = m1 - m0 * vvy;
-			mfbcb = m2 - c2o1*	m1 * vvy + vy2 * m0;
-			////////////////////////////////////////////////////////////////////////////////////
-			m2 = mfbac + mfbcc;
-			m1 = mfbcc - mfbac;
-			m0 = m2 + mfbbc;
-			mfbac = m0;
-			m0 += c2o9 * oMdrho;
-			mfbbc = m1 - m0 * vvy;
-			mfbcc = m2 - c2o1*	m1 * vvy + vy2 * m0;
-			////////////////////////////////////////////////////////////////////////////////////
-			////////////////////////////////////////////////////////////////////////////////////
-			m2 = mfcaa + mfcca;
-			m1 = mfcca - mfcaa;
-			m0 = m2 + mfcba;
-			mfcaa = m0;
-			m0 += c1o6 * oMdrho;
-			mfcba = m1 - m0 * vvy;
-			mfcca = m2 - c2o1*	m1 * vvy + vy2 * m0;
-			////////////////////////////////////////////////////////////////////////////////////
-			m2 = mfcab + mfccb;
-			m1 = mfccb - mfcab;
-			m0 = m2 + mfcbb;
-			mfcab = m0;
-			mfcbb = m1 - m0 * vvy;
-			mfccb = m2 - c2o1*	m1 * vvy + vy2 * m0;
-			////////////////////////////////////////////////////////////////////////////////////
-			m2 = mfcac + mfccc;
-			m1 = mfccc - mfcac;
-			m0 = m2 + mfcbc;
-			mfcac = m0;
-			m0 += c1o18 * oMdrho;
-			mfcbc = m1 - m0 * vvy;
-			mfccc = m2 - c2o1*	m1 * vvy + vy2 * m0;
-			////////////////////////////////////////////////////////////////////////////////////
-			////////////////////////////////////////////////////////////////////////////////////
-			// mit     1, 0, 1/3, 0, 0, 0, 1/3, 0, 1/9		Konditionieren
-			////////////////////////////////////////////////////////////////////////////////////
-			// X - Dir
-			m2 = mfaaa + mfcaa;
-			m1 = mfcaa - mfaaa;
-			m0 = m2 + mfbaa;
-			mfaaa = m0;
-			m0 += c1o1* oMdrho;
-			mfbaa = m1 - m0 * vvx;
-			mfcaa = m2 - c2o1*	m1 * vvx + vx2 * m0;
-			////////////////////////////////////////////////////////////////////////////////////
-			m2 = mfaba + mfcba;
-			m1 = mfcba - mfaba;
-			m0 = m2 + mfbba;
-			mfaba = m0;
-			mfbba = m1 - m0 * vvx;
-			mfcba = m2 - c2o1*	m1 * vvx + vx2 * m0;
-			////////////////////////////////////////////////////////////////////////////////////
-			m2 = mfaca + mfcca;
-			m1 = mfcca - mfaca;
-			m0 = m2 + mfbca;
-			mfaca = m0;
-			m0 += c1o3 * oMdrho;
-			mfbca = m1 - m0 * vvx;
-			mfcca = m2 - c2o1*	m1 * vvx + vx2 * m0;
-			////////////////////////////////////////////////////////////////////////////////////
-			////////////////////////////////////////////////////////////////////////////////////
-			m2 = mfaab + mfcab;
-			m1 = mfcab - mfaab;
-			m0 = m2 + mfbab;
-			mfaab = m0;
-			mfbab = m1 - m0 * vvx;
-			mfcab = m2 - c2o1*	m1 * vvx + vx2 * m0;
-			////////////////////////////////////////////////////////////////////////////////////
-			m2 = mfabb + mfcbb;
-			m1 = mfcbb - mfabb;
-			m0 = m2 + mfbbb;
-			mfabb = m0;
-			mfbbb = m1 - m0 * vvx;
-			mfcbb = m2 - c2o1*	m1 * vvx + vx2 * m0;
-			////////////////////////////////////////////////////////////////////////////////////
-			m2 = mfacb + mfccb;
-			m1 = mfccb - mfacb;
-			m0 = m2 + mfbcb;
-			mfacb = m0;
-			mfbcb = m1 - m0 * vvx;
-			mfccb = m2 - c2o1*	m1 * vvx + vx2 * m0;
-			////////////////////////////////////////////////////////////////////////////////////
-			////////////////////////////////////////////////////////////////////////////////////
-			m2 = mfaac + mfcac;
-			m1 = mfcac - mfaac;
-			m0 = m2 + mfbac;
-			mfaac = m0;
-			m0 += c1o3 * oMdrho;
-			mfbac = m1 - m0 * vvx;
-			mfcac = m2 - c2o1*	m1 * vvx + vx2 * m0;
-			////////////////////////////////////////////////////////////////////////////////////
-			m2 = mfabc + mfcbc;
-			m1 = mfcbc - mfabc;
-			m0 = m2 + mfbbc;
-			mfabc = m0;
-			mfbbc = m1 - m0 * vvx;
-			mfcbc = m2 - c2o1*	m1 * vvx + vx2 * m0;
-			////////////////////////////////////////////////////////////////////////////////////
-			m2 = mfacc + mfccc;
-			m1 = mfccc - mfacc;
-			m0 = m2 + mfbcc;
-			mfacc = m0;
-			m0 += c1o9 * oMdrho;
-			mfbcc = m1 - m0 * vvx;
-			mfccc = m2 - c2o1*	m1 * vvx + vx2 * m0;
-			////////////////////////////////////////////////////////////////////////////////////
-			////////////////////////////////////////////////////////////////////////////////////
-
-			////////////////////////////////////////////////////////////////////////////////////
-			// Cumulants
-			////////////////////////////////////////////////////////////////////////////////////
-			real OxxPyyPzz = c1o1;	//set the bulk viscosity one is high / two is very low and zero is (too) high ... (also called omega 2)
-
-									////////////////////////////////////////////////////////////
-									//3.
-									//////////////////////////////
-			real OxyyPxzz = c8o1*(-c2o1 + omega)*(c1o1 + c2o1*omega) / (-c8o1 - c14o1*omega + c7o1*omega*omega);//one;
-			real OxyyMxzz = c8o1*(-c2o1 + omega)*(-c7o1 + c4o1*omega) / (c56o1 - c50o1*omega + c9o1*omega*omega);//one;
-			real Oxyz = c24o1*(-c2o1 + omega)*(-c2o1 - c7o1*omega + c3o1*omega*omega) / (c48o1 + c152o1*omega - c130o1*omega*omega + c29o1*omega*omega*omega);//one;
-																																										  ////////////////////////////////////////////////////////////
-																																										  //4.
-																																										  //////////////////////////////
-			real O4 = c1o1;
-			//////////////////////////////
-			//real O4        = omega;//TRT
-			////////////////////////////////////////////////////////////
-			//5.
-			//////////////////////////////
-			real O5 = c1o1;
-			////////////////////////////////////////////////////////////
-			//6.
-			//////////////////////////////
-			real O6 = c1o1;
-			////////////////////////////////////////////////////////////
-
-
-			//central moments to cumulants
-			//4.
-			real CUMcbb = mfcbb - ((mfcaa + c1o3) * mfabb + c2o1 * mfbba * mfbab) / rho;
-			real CUMbcb = mfbcb - ((mfaca + c1o3) * mfbab + c2o1 * mfbba * mfabb) / rho;
-			real CUMbbc = mfbbc - ((mfaac + c1o3) * mfbba + c2o1 * mfbab * mfabb) / rho;
-
-			real CUMcca = mfcca - (((mfcaa * mfaca + c2o1 * mfbba * mfbba) + c1o3 * (mfcaa + mfaca)) / rho - c1o9*(drho / rho));
-			real CUMcac = mfcac - (((mfcaa * mfaac + c2o1 * mfbab * mfbab) + c1o3 * (mfcaa + mfaac)) / rho - c1o9*(drho / rho));
-			real CUMacc = mfacc - (((mfaac * mfaca + c2o1 * mfabb * mfabb) + c1o3 * (mfaac + mfaca)) / rho - c1o9*(drho / rho));
-
-			//5.
-			real CUMbcc = mfbcc - ((mfaac * mfbca + mfaca * mfbac + c4o1 * mfabb * mfbbb + c2o1 * (mfbab * mfacb + mfbba * mfabc)) + c1o3 * (mfbca + mfbac)) / rho;
-			real CUMcbc = mfcbc - ((mfaac * mfcba + mfcaa * mfabc + c4o1 * mfbab * mfbbb + c2o1 * (mfabb * mfcab + mfbba * mfbac)) + c1o3 * (mfcba + mfabc)) / rho;
-			real CUMccb = mfccb - ((mfcaa * mfacb + mfaca * mfcab + c4o1 * mfbba * mfbbb + c2o1 * (mfbab * mfbca + mfabb * mfcba)) + c1o3 * (mfacb + mfcab)) / rho;
-
-			//6.
-
-			real CUMccc = mfccc + ((-c4o1 *  mfbbb * mfbbb
-				- (mfcaa * mfacc + mfaca * mfcac + mfaac * mfcca)
-				- c4o1 * (mfabb * mfcbb + mfbab * mfbcb + mfbba * mfbbc)
-				- c2o1 * (mfbca * mfbac + mfcba * mfabc + mfcab * mfacb)) / rho
-				+ (c4o1 * (mfbab * mfbab * mfaca + mfabb * mfabb * mfcaa + mfbba * mfbba * mfaac)
-					+ c2o1 * (mfcaa * mfaca * mfaac)
-					+ c16o1 *  mfbba * mfbab * mfabb) / (rho * rho)
-				- c1o3 * (mfacc + mfcac + mfcca) / rho
-				- c1o9 * (mfcaa + mfaca + mfaac) / rho
-				+ (c2o1 * (mfbab * mfbab + mfabb * mfabb + mfbba * mfbba)
-					+ (mfaac * mfaca + mfaac * mfcaa + mfaca * mfcaa) + c1o3 *(mfaac + mfaca + mfcaa)) / (rho * rho) * c2o3
-				+ c1o27*((drho * drho - drho) / (rho*rho)));
-
-			//2.
-			// linear combinations
-			real mxxPyyPzz = mfcaa + mfaca + mfaac;
-			real mxxMyy = mfcaa - mfaca;
-			real mxxMzz = mfcaa - mfaac;
-
-			////////////////////////////////////////////////////////////////////////////
-			real Dxy = -c3o1*omega*mfbba;
-			real Dxz = -c3o1*omega*mfbab;
-			real Dyz = -c3o1*omega*mfabb;
-
-			//3.
-			// linear combinations
-
-			real mxxyPyzz = mfcba + mfabc;
-			real mxxyMyzz = mfcba - mfabc;
-
-			real mxxzPyyz = mfcab + mfacb;
-			real mxxzMyyz = mfcab - mfacb;
-
-			real mxyyPxzz = mfbca + mfbac;
-			real mxyyMxzz = mfbca - mfbac;
-
-			///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-			//incl. correction		(hat noch nicht so gut funktioniert...Optimierungsbedarf??)
-
-			real dxux = c1o2 * (-omega) *(mxxMyy + mxxMzz) + c1o2 *  OxxPyyPzz * (mfaaa - mxxPyyPzz);
-			real dyuy = dxux + omega * c3o2 * mxxMyy;
-			real dzuz = dxux + omega * c3o2 * mxxMzz;
-
-			//relax
-			mxxPyyPzz += OxxPyyPzz*(mfaaa - mxxPyyPzz) - c3o1 * (c1o1 - c1o2 * OxxPyyPzz) * (vx2 * dxux + vy2 * dyuy + vz2 * dzuz);//-magicBulk*OxxPyyPzz;
-			mxxMyy += omega * (-mxxMyy) - c3o1 * (c1o1 + c1o2 * (-omega)) * (vx2 * dxux - vy2 * dyuy);
-			mxxMzz += omega * (-mxxMzz) - c3o1 * (c1o1 + c1o2 * (-omega)) * (vx2 * dxux - vz2 * dzuz);
-
-			///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-			/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-			////no correction
-			//mxxPyyPzz += OxxPyyPzz*(mfaaa-mxxPyyPzz);//-magicBulk*OxxPyyPzz;
-			//mxxMyy    += -(-omega) * (-mxxMyy);
-			//mxxMzz    += -(-omega) * (-mxxMzz);
-			/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-			mfabb += omega * (-mfabb);
-			mfbab += omega * (-mfbab);
-			mfbba += omega * (-mfbba);
-			//////////////////////////////////////////////////////////////////////////
-
-			// linear combinations back
-			mfcaa = c1o3 * (mxxMyy + mxxMzz + mxxPyyPzz);
-			mfaca = c1o3 * (-c2o1*  mxxMyy + mxxMzz + mxxPyyPzz);
-			mfaac = c1o3 * (mxxMyy - c2o1* mxxMzz + mxxPyyPzz);
-
-
-			//relax
-			//////////////////////////////////////////////////////////////////////////
-			//das ist der limiter
-			wadjust = Oxyz + (c1o1 - Oxyz)*abs(mfbbb) / (abs(mfbbb) + qudricLimitD);
-			mfbbb += wadjust * (-mfbbb);
-			wadjust = OxyyPxzz + (c1o1 - OxyyPxzz)*abs(mxxyPyzz) / (abs(mxxyPyzz) + qudricLimitP);
-			mxxyPyzz += wadjust * (-mxxyPyzz);
-			wadjust = OxyyMxzz + (c1o1 - OxyyMxzz)*abs(mxxyMyzz) / (abs(mxxyMyzz) + qudricLimitM);
-			mxxyMyzz += wadjust * (-mxxyMyzz);
-			wadjust = OxyyPxzz + (c1o1 - OxyyPxzz)*abs(mxxzPyyz) / (abs(mxxzPyyz) + qudricLimitP);
-			mxxzPyyz += wadjust * (-mxxzPyyz);
-			wadjust = OxyyMxzz + (c1o1 - OxyyMxzz)*abs(mxxzMyyz) / (abs(mxxzMyyz) + qudricLimitM);
-			mxxzMyyz += wadjust * (-mxxzMyyz);
-			wadjust = OxyyPxzz + (c1o1 - OxyyPxzz)*abs(mxyyPxzz) / (abs(mxyyPxzz) + qudricLimitP);
-			mxyyPxzz += wadjust * (-mxyyPxzz);
-			wadjust = OxyyMxzz + (c1o1 - OxyyMxzz)*abs(mxyyMxzz) / (abs(mxyyMxzz) + qudricLimitM);
-			mxyyMxzz += wadjust * (-mxyyMxzz);
-			//////////////////////////////////////////////////////////////////////////
-			//ohne limiter
-			//mfbbb     += OxyyMxzz * (-mfbbb);
-			//mxxyPyzz  += OxyyPxzz * (-mxxyPyzz);
-			//mxxyMyzz  += OxyyMxzz * (-mxxyMyzz);
-			//mxxzPyyz  += OxyyPxzz * (-mxxzPyyz);
-			//mxxzMyyz  += OxyyMxzz * (-mxxzMyyz);
-			//mxyyPxzz  += OxyyPxzz * (-mxyyPxzz);
-			//mxyyMxzz  += OxyyMxzz * (-mxyyMxzz);
-			//////////////////////////////////////////////////////////////////////////
-
-			// linear combinations back
-			mfcba = (mxxyMyzz + mxxyPyzz) * c1o2;
-			mfabc = (-mxxyMyzz + mxxyPyzz) * c1o2;
-			mfcab = (mxxzMyyz + mxxzPyyz) * c1o2;
-			mfacb = (-mxxzMyyz + mxxzPyyz) * c1o2;
-			mfbca = (mxyyMxzz + mxyyPxzz) * c1o2;
-			mfbac = (-mxyyMxzz + mxyyPxzz) * c1o2;
-
-			//4.
-			//////////////////////////////////////////////////////////////////////////
-			//mit limiter
-			//	wadjust    = O4+(one-O4)*abs(CUMacc)/(abs(CUMacc)+qudricLimit);
-			//CUMacc    += wadjust * (-CUMacc);
-			//	wadjust    = O4+(one-O4)*abs(CUMcac)/(abs(CUMcac)+qudricLimit);
-			//CUMcac    += wadjust * (-CUMcac); 
-			//	wadjust    = O4+(one-O4)*abs(CUMcca)/(abs(CUMcca)+qudricLimit);
-			//CUMcca    += wadjust * (-CUMcca); 
-
-			//	wadjust    = O4+(one-O4)*abs(CUMbbc)/(abs(CUMbbc)+qudricLimit);
-			//CUMbbc    += wadjust * (-CUMbbc); 
-			//	wadjust    = O4+(one-O4)*abs(CUMbcb)/(abs(CUMbcb)+qudricLimit);
-			//CUMbcb    += wadjust * (-CUMbcb); 
-			//	wadjust    = O4+(one-O4)*abs(CUMcbb)/(abs(CUMcbb)+qudricLimit);
-			//CUMcbb    += wadjust * (-CUMcbb); 
-			//////////////////////////////////////////////////////////////////////////
-			real factorA = (c4o1 + c2o1*omega - c3o1*omega*omega) / (c2o1 - c7o1*omega + c5o1*omega*omega);
-			real factorB = (c4o1 + c28o1*omega - c14o1*omega*omega) / (c6o1 - c21o1*omega + c15o1*omega*omega);
-			//////////////////////////////////////////////////////////////////////////
-			//ohne limiter
-			//CUMacc += O4 * (-CUMacc); 
-			//CUMcac += O4 * (-CUMcac); 
-			//CUMcca += O4 * (-CUMcca); 
-			//CUMbbc += O4 * (-CUMbbc); 
-			//CUMbcb += O4 * (-CUMbcb); 
-			//CUMcbb += O4 * (-CUMcbb); 
-			CUMacc = -O4*(c1o1 / omega - c1o2) * (dyuy + dzuz) * c2o3 * factorA + (c1o1 - O4) * (CUMacc);
-			CUMcac = -O4*(c1o1 / omega - c1o2) * (dxux + dzuz) * c2o3 * factorA + (c1o1 - O4) * (CUMcac);
-			CUMcca = -O4*(c1o1 / omega - c1o2) * (dyuy + dxux) * c2o3 * factorA + (c1o1 - O4) * (CUMcca);
-			CUMbbc = -O4*(c1o1 / omega - c1o2) * Dxy           * c1o3 * factorB + (c1o1 - O4) * (CUMbbc);
-			CUMbcb = -O4*(c1o1 / omega - c1o2) * Dxz           * c1o3 * factorB + (c1o1 - O4) * (CUMbcb);
-			CUMcbb = -O4*(c1o1 / omega - c1o2) * Dyz           * c1o3 * factorB + (c1o1 - O4) * (CUMcbb);
-			//////////////////////////////////////////////////////////////////////////
-
-
-			//5.
-			CUMbcc += O5 * (-CUMbcc);
-			CUMcbc += O5 * (-CUMcbc);
-			CUMccb += O5 * (-CUMccb);
-
-			//6.
-			CUMccc += O6 * (-CUMccc);
-
-
-
-			//back cumulants to central moments
-			//4.
-			mfcbb = CUMcbb + ((mfcaa + c1o3) * mfabb + c2o1 * mfbba * mfbab) / rho;
-			mfbcb = CUMbcb + ((mfaca + c1o3) * mfbab + c2o1 * mfbba * mfabb) / rho;
-			mfbbc = CUMbbc + ((mfaac + c1o3) * mfbba + c2o1 * mfbab * mfabb) / rho;
-
-			mfcca = CUMcca + (((mfcaa * mfaca + c2o1 * mfbba * mfbba) + c1o3 * (mfcaa + mfaca)) / rho - c1o9*(drho / rho));
-			mfcac = CUMcac + (((mfcaa * mfaac + c2o1 * mfbab * mfbab) + c1o3 * (mfcaa + mfaac)) / rho - c1o9*(drho / rho));
-			mfacc = CUMacc + (((mfaac * mfaca + c2o1 * mfabb * mfabb) + c1o3 * (mfaac + mfaca)) / rho - c1o9*(drho / rho));
-
-			//5.
-			mfbcc = CUMbcc + ((mfaac * mfbca + mfaca * mfbac + c4o1 * mfabb * mfbbb + c2o1 * (mfbab * mfacb + mfbba * mfabc)) + c1o3 * (mfbca + mfbac)) / rho;
-			mfcbc = CUMcbc + ((mfaac * mfcba + mfcaa * mfabc + c4o1 * mfbab * mfbbb + c2o1 * (mfabb * mfcab + mfbba * mfbac)) + c1o3 * (mfcba + mfabc)) / rho;
-			mfccb = CUMccb + ((mfcaa * mfacb + mfaca * mfcab + c4o1 * mfbba * mfbbb + c2o1 * (mfbab * mfbca + mfabb * mfcba)) + c1o3 * (mfacb + mfcab)) / rho;
-
-			//6.
-
-			mfccc = CUMccc - ((-c4o1 *  mfbbb * mfbbb
-				- (mfcaa * mfacc + mfaca * mfcac + mfaac * mfcca)
-				- c4o1 * (mfabb * mfcbb + mfbab * mfbcb + mfbba * mfbbc)
-				- c2o1 * (mfbca * mfbac + mfcba * mfabc + mfcab * mfacb)) / rho
-				+ (c4o1 * (mfbab * mfbab * mfaca + mfabb * mfabb * mfcaa + mfbba * mfbba * mfaac)
-					+ c2o1 * (mfcaa * mfaca * mfaac)
-					+ c16o1 *  mfbba * mfbab * mfabb) / (rho * rho)
-				- c1o3 * (mfacc + mfcac + mfcca) / rho
-				- c1o9 * (mfcaa + mfaca + mfaac) / rho
-				+ (c2o1 * (mfbab * mfbab + mfabb * mfabb + mfbba * mfbba)
-					+ (mfaac * mfaca + mfaac * mfcaa + mfaca * mfcaa) + c1o3 *(mfaac + mfaca + mfcaa)) / (rho * rho) * c2o3
-				+ c1o27*((drho * drho - drho) / (rho*rho)));
-			////////////////////////////////////////////////////////////////////////////////////
-
-			////////////////////////////////////////////////////////////////////////////////////
-			//the force be with you
-			mfbaa = -mfbaa;
-			mfaba = -mfaba;
-			mfaab = -mfaab;
-			////////////////////////////////////////////////////////////////////////////////////
-
-
-			////////////////////////////////////////////////////////////////////////////////////
-			//back
-			////////////////////////////////////////////////////////////////////////////////////
-			//mit 1, 0, 1/3, 0, 0, 0, 1/3, 0, 1/9   Konditionieren
-			////////////////////////////////////////////////////////////////////////////////////
-			// Z - Dir
-			m0 = mfaac * c1o2 + mfaab * (vvz - c1o2) + (mfaaa + c1o1* oMdrho) * (vz2 - vvz) * c1o2;
-			m1 = -mfaac - c2o1* mfaab *  vvz + mfaaa                * (c1o1 - vz2) - c1o1* oMdrho * vz2;
-			m2 = mfaac * c1o2 + mfaab * (vvz + c1o2) + (mfaaa + c1o1* oMdrho) * (vz2 + vvz) * c1o2;
-			mfaaa = m0;
-			mfaab = m1;
-			mfaac = m2;
-			////////////////////////////////////////////////////////////////////////////////////
-			m0 = mfabc * c1o2 + mfabb * (vvz - c1o2) + mfaba * (vz2 - vvz) * c1o2;
-			m1 = -mfabc - c2o1* mfabb *  vvz + mfaba * (c1o1 - vz2);
-			m2 = mfabc * c1o2 + mfabb * (vvz + c1o2) + mfaba * (vz2 + vvz) * c1o2;
-			mfaba = m0;
-			mfabb = m1;
-			mfabc = m2;
-			////////////////////////////////////////////////////////////////////////////////////
-			m0 = mfacc * c1o2 + mfacb * (vvz - c1o2) + (mfaca + c1o3 * oMdrho) * (vz2 - vvz) * c1o2;
-			m1 = -mfacc - c2o1* mfacb *  vvz + mfaca                  * (c1o1 - vz2) - c1o3 * oMdrho * vz2;
-			m2 = mfacc * c1o2 + mfacb * (vvz + c1o2) + (mfaca + c1o3 * oMdrho) * (vz2 + vvz) * c1o2;
-			mfaca = m0;
-			mfacb = m1;
-			mfacc = m2;
-			////////////////////////////////////////////////////////////////////////////////////
-			////////////////////////////////////////////////////////////////////////////////////
-			m0 = mfbac * c1o2 + mfbab * (vvz - c1o2) + mfbaa * (vz2 - vvz) * c1o2;
-			m1 = -mfbac - c2o1* mfbab *  vvz + mfbaa * (c1o1 - vz2);
-			m2 = mfbac * c1o2 + mfbab * (vvz + c1o2) + mfbaa * (vz2 + vvz) * c1o2;
-			mfbaa = m0;
-			mfbab = m1;
-			mfbac = m2;
-			/////////b//////////////////////////////////////////////////////////////////////////
-			m0 = mfbbc * c1o2 + mfbbb * (vvz - c1o2) + mfbba * (vz2 - vvz) * c1o2;
-			m1 = -mfbbc - c2o1* mfbbb *  vvz + mfbba * (c1o1 - vz2);
-			m2 = mfbbc * c1o2 + mfbbb * (vvz + c1o2) + mfbba * (vz2 + vvz) * c1o2;
-			mfbba = m0;
-			mfbbb = m1;
-			mfbbc = m2;
-			/////////b//////////////////////////////////////////////////////////////////////////
-			m0 = mfbcc * c1o2 + mfbcb * (vvz - c1o2) + mfbca * (vz2 - vvz) * c1o2;
-			m1 = -mfbcc - c2o1* mfbcb *  vvz + mfbca * (c1o1 - vz2);
-			m2 = mfbcc * c1o2 + mfbcb * (vvz + c1o2) + mfbca * (vz2 + vvz) * c1o2;
-			mfbca = m0;
-			mfbcb = m1;
-			mfbcc = m2;
-			////////////////////////////////////////////////////////////////////////////////////
-			////////////////////////////////////////////////////////////////////////////////////
-			m0 = mfcac * c1o2 + mfcab * (vvz - c1o2) + (mfcaa + c1o3 * oMdrho) * (vz2 - vvz) * c1o2;
-			m1 = -mfcac - c2o1* mfcab *  vvz + mfcaa                  * (c1o1 - vz2) - c1o3 * oMdrho * vz2;
-			m2 = mfcac * c1o2 + mfcab * (vvz + c1o2) + (mfcaa + c1o3 * oMdrho) * (vz2 + vvz) * c1o2;
-			mfcaa = m0;
-			mfcab = m1;
-			mfcac = m2;
-			/////////c//////////////////////////////////////////////////////////////////////////
-			m0 = mfcbc * c1o2 + mfcbb * (vvz - c1o2) + mfcba * (vz2 - vvz) * c1o2;
-			m1 = -mfcbc - c2o1* mfcbb *  vvz + mfcba * (c1o1 - vz2);
-			m2 = mfcbc * c1o2 + mfcbb * (vvz + c1o2) + mfcba * (vz2 + vvz) * c1o2;
-			mfcba = m0;
-			mfcbb = m1;
-			mfcbc = m2;
-			/////////c//////////////////////////////////////////////////////////////////////////
-			m0 = mfccc * c1o2 + mfccb * (vvz - c1o2) + (mfcca + c1o9 * oMdrho) * (vz2 - vvz) * c1o2;
-			m1 = -mfccc - c2o1* mfccb *  vvz + mfcca                  * (c1o1 - vz2) - c1o9 * oMdrho * vz2;
-			m2 = mfccc * c1o2 + mfccb * (vvz + c1o2) + (mfcca + c1o9 * oMdrho) * (vz2 + vvz) * c1o2;
-			mfcca = m0;
-			mfccb = m1;
-			mfccc = m2;
-			////////////////////////////////////////////////////////////////////////////////////
-			////////////////////////////////////////////////////////////////////////////////////
-			//mit 1/6, 2/3, 1/6, 0, 0, 0, 1/18, 2/9, 1/18   Konditionieren
-			////////////////////////////////////////////////////////////////////////////////////
-			// Y - Dir
-			m0 = mfaca * c1o2 + mfaba * (vvy - c1o2) + (mfaaa + c1o6 * oMdrho) * (vy2 - vvy) * c1o2;
-			m1 = -mfaca - c2o1* mfaba *  vvy + mfaaa                  * (c1o1 - vy2) - c1o6 * oMdrho * vy2;
-			m2 = mfaca * c1o2 + mfaba * (vvy + c1o2) + (mfaaa + c1o6 * oMdrho) * (vy2 + vvy) * c1o2;
-			mfaaa = m0;
-			mfaba = m1;
-			mfaca = m2;
-			////////////////////////////////////////////////////////////////////////////////////
-			m0 = mfacb * c1o2 + mfabb * (vvy - c1o2) + (mfaab + c2o3 * oMdrho) * (vy2 - vvy) * c1o2;
-			m1 = -mfacb - c2o1* mfabb *  vvy + mfaab                  * (c1o1 - vy2) - c2o3 * oMdrho * vy2;
-			m2 = mfacb * c1o2 + mfabb * (vvy + c1o2) + (mfaab + c2o3 * oMdrho) * (vy2 + vvy) * c1o2;
-			mfaab = m0;
-			mfabb = m1;
-			mfacb = m2;
-			////////////////////////////////////////////////////////////////////////////////////
-			m0 = mfacc * c1o2 + mfabc * (vvy - c1o2) + (mfaac + c1o6 * oMdrho) * (vy2 - vvy) * c1o2;
-			m1 = -mfacc - c2o1* mfabc *  vvy + mfaac                  * (c1o1 - vy2) - c1o6 * oMdrho * vy2;
-			m2 = mfacc * c1o2 + mfabc * (vvy + c1o2) + (mfaac + c1o6 * oMdrho) * (vy2 + vvy) * c1o2;
-			mfaac = m0;
-			mfabc = m1;
-			mfacc = m2;
-			////////////////////////////////////////////////////////////////////////////////////
-			////////////////////////////////////////////////////////////////////////////////////
-			m0 = mfbca * c1o2 + mfbba * (vvy - c1o2) + mfbaa * (vy2 - vvy) * c1o2;
-			m1 = -mfbca - c2o1* mfbba *  vvy + mfbaa * (c1o1 - vy2);
-			m2 = mfbca * c1o2 + mfbba * (vvy + c1o2) + mfbaa * (vy2 + vvy) * c1o2;
-			mfbaa = m0;
-			mfbba = m1;
-			mfbca = m2;
-			/////////b//////////////////////////////////////////////////////////////////////////
-			m0 = mfbcb * c1o2 + mfbbb * (vvy - c1o2) + mfbab * (vy2 - vvy) * c1o2;
-			m1 = -mfbcb - c2o1* mfbbb *  vvy + mfbab * (c1o1 - vy2);
-			m2 = mfbcb * c1o2 + mfbbb * (vvy + c1o2) + mfbab * (vy2 + vvy) * c1o2;
-			mfbab = m0;
-			mfbbb = m1;
-			mfbcb = m2;
-			/////////b//////////////////////////////////////////////////////////////////////////
-			m0 = mfbcc * c1o2 + mfbbc * (vvy - c1o2) + mfbac * (vy2 - vvy) * c1o2;
-			m1 = -mfbcc - c2o1* mfbbc *  vvy + mfbac * (c1o1 - vy2);
-			m2 = mfbcc * c1o2 + mfbbc * (vvy + c1o2) + mfbac * (vy2 + vvy) * c1o2;
-			mfbac = m0;
-			mfbbc = m1;
-			mfbcc = m2;
-			////////////////////////////////////////////////////////////////////////////////////
-			////////////////////////////////////////////////////////////////////////////////////
-			m0 = mfcca * c1o2 + mfcba * (vvy - c1o2) + (mfcaa + c1o18 * oMdrho) * (vy2 - vvy) * c1o2;
-			m1 = -mfcca - c2o1* mfcba *  vvy + mfcaa                   * (c1o1 - vy2) - c1o18 * oMdrho * vy2;
-			m2 = mfcca * c1o2 + mfcba * (vvy + c1o2) + (mfcaa + c1o18 * oMdrho) * (vy2 + vvy) * c1o2;
-			mfcaa = m0;
-			mfcba = m1;
-			mfcca = m2;
-			/////////c//////////////////////////////////////////////////////////////////////////
-			m0 = mfccb * c1o2 + mfcbb * (vvy - c1o2) + (mfcab + c2o9 * oMdrho) * (vy2 - vvy) * c1o2;
-			m1 = -mfccb - c2o1* mfcbb *  vvy + mfcab                  * (c1o1 - vy2) - c2o9 * oMdrho * vy2;
-			m2 = mfccb * c1o2 + mfcbb * (vvy + c1o2) + (mfcab + c2o9 * oMdrho) * (vy2 + vvy) * c1o2;
-			mfcab = m0;
-			mfcbb = m1;
-			mfccb = m2;
-			/////////c//////////////////////////////////////////////////////////////////////////
-			m0 = mfccc * c1o2 + mfcbc * (vvy - c1o2) + (mfcac + c1o18 * oMdrho) * (vy2 - vvy) * c1o2;
-			m1 = -mfccc - c2o1* mfcbc *  vvy + mfcac                   * (c1o1 - vy2) - c1o18 * oMdrho * vy2;
-			m2 = mfccc * c1o2 + mfcbc * (vvy + c1o2) + (mfcac + c1o18 * oMdrho) * (vy2 + vvy) * c1o2;
-			mfcac = m0;
-			mfcbc = m1;
-			mfccc = m2;
-			////////////////////////////////////////////////////////////////////////////////////
-			////////////////////////////////////////////////////////////////////////////////////
-			//mit 1/36, 1/9, 1/36, 1/9, 4/9, 1/9, 1/36, 1/9, 1/36 Konditionieren
-			////////////////////////////////////////////////////////////////////////////////////
-			// X - Dir
-			m0 = mfcaa * c1o2 + mfbaa * (vvx - c1o2) + (mfaaa + c1o36 * oMdrho) * (vx2 - vvx) * c1o2;
-			m1 = -mfcaa - c2o1* mfbaa *  vvx + mfaaa                   * (c1o1 - vx2) - c1o36 * oMdrho * vx2;
-			m2 = mfcaa * c1o2 + mfbaa * (vvx + c1o2) + (mfaaa + c1o36 * oMdrho) * (vx2 + vvx) * c1o2;
-			mfaaa = m0;
-			mfbaa = m1;
-			mfcaa = m2;
-			////////////////////////////////////////////////////////////////////////////////////
-			m0 = mfcba * c1o2 + mfbba * (vvx - c1o2) + (mfaba + c1o9 * oMdrho) * (vx2 - vvx) * c1o2;
-			m1 = -mfcba - c2o1* mfbba *  vvx + mfaba                  * (c1o1 - vx2) - c1o9 * oMdrho * vx2;
-			m2 = mfcba * c1o2 + mfbba * (vvx + c1o2) + (mfaba + c1o9 * oMdrho) * (vx2 + vvx) * c1o2;
-			mfaba = m0;
-			mfbba = m1;
-			mfcba = m2;
-			////////////////////////////////////////////////////////////////////////////////////
-			m0 = mfcca * c1o2 + mfbca * (vvx - c1o2) + (mfaca + c1o36 * oMdrho) * (vx2 - vvx) * c1o2;
-			m1 = -mfcca - c2o1* mfbca *  vvx + mfaca                   * (c1o1 - vx2) - c1o36 * oMdrho * vx2;
-			m2 = mfcca * c1o2 + mfbca * (vvx + c1o2) + (mfaca + c1o36 * oMdrho) * (vx2 + vvx) * c1o2;
-			mfaca = m0;
-			mfbca = m1;
-			mfcca = m2;
-			////////////////////////////////////////////////////////////////////////////////////
-			////////////////////////////////////////////////////////////////////////////////////
-			m0 = mfcab * c1o2 + mfbab * (vvx - c1o2) + (mfaab + c1o9 * oMdrho) * (vx2 - vvx) * c1o2;
-			m1 = -mfcab - c2o1* mfbab *  vvx + mfaab                  * (c1o1 - vx2) - c1o9 * oMdrho * vx2;
-			m2 = mfcab * c1o2 + mfbab * (vvx + c1o2) + (mfaab + c1o9 * oMdrho) * (vx2 + vvx) * c1o2;
-			mfaab = m0;
-			mfbab = m1;
-			mfcab = m2;
-			///////////b////////////////////////////////////////////////////////////////////////
-			m0 = mfcbb * c1o2 + mfbbb * (vvx - c1o2) + (mfabb + c4o9 * oMdrho) * (vx2 - vvx) * c1o2;
-			m1 = -mfcbb - c2o1* mfbbb *  vvx + mfabb                  * (c1o1 - vx2) - c4o9 * oMdrho * vx2;
-			m2 = mfcbb * c1o2 + mfbbb * (vvx + c1o2) + (mfabb + c4o9 * oMdrho) * (vx2 + vvx) * c1o2;
-			mfabb = m0;
-			mfbbb = m1;
-			mfcbb = m2;
-			///////////b////////////////////////////////////////////////////////////////////////
-			m0 = mfccb * c1o2 + mfbcb * (vvx - c1o2) + (mfacb + c1o9 * oMdrho) * (vx2 - vvx) * c1o2;
-			m1 = -mfccb - c2o1* mfbcb *  vvx + mfacb                  * (c1o1 - vx2) - c1o9 * oMdrho * vx2;
-			m2 = mfccb * c1o2 + mfbcb * (vvx + c1o2) + (mfacb + c1o9 * oMdrho) * (vx2 + vvx) * c1o2;
-			mfacb = m0;
-			mfbcb = m1;
-			mfccb = m2;
-			////////////////////////////////////////////////////////////////////////////////////
-			////////////////////////////////////////////////////////////////////////////////////
-			m0 = mfcac * c1o2 + mfbac * (vvx - c1o2) + (mfaac + c1o36 * oMdrho) * (vx2 - vvx) * c1o2;
-			m1 = -mfcac - c2o1* mfbac *  vvx + mfaac                   * (c1o1 - vx2) - c1o36 * oMdrho * vx2;
-			m2 = mfcac * c1o2 + mfbac * (vvx + c1o2) + (mfaac + c1o36 * oMdrho) * (vx2 + vvx) * c1o2;
-			mfaac = m0;
-			mfbac = m1;
-			mfcac = m2;
-			///////////c////////////////////////////////////////////////////////////////////////
-			m0 = mfcbc * c1o2 + mfbbc * (vvx - c1o2) + (mfabc + c1o9 * oMdrho) * (vx2 - vvx) * c1o2;
-			m1 = -mfcbc - c2o1* mfbbc *  vvx + mfabc                  * (c1o1 - vx2) - c1o9 * oMdrho * vx2;
-			m2 = mfcbc * c1o2 + mfbbc * (vvx + c1o2) + (mfabc + c1o9 * oMdrho) * (vx2 + vvx) * c1o2;
-			mfabc = m0;
-			mfbbc = m1;
-			mfcbc = m2;
-			///////////c////////////////////////////////////////////////////////////////////////
-			m0 = mfccc * c1o2 + mfbcc * (vvx - c1o2) + (mfacc + c1o36 * oMdrho) * (vx2 - vvx) * c1o2;
-			m1 = -mfccc - c2o1* mfbcc *  vvx + mfacc                   * (c1o1 - vx2) - c1o36 * oMdrho * vx2;
-			m2 = mfccc * c1o2 + mfbcc * (vvx + c1o2) + (mfacc + c1o36 * oMdrho) * (vx2 + vvx) * c1o2;
-			mfacc = m0;
-			mfbcc = m1;
-			mfccc = m2;
-			////////////////////////////////////////////////////////////////////////////////////
-
-			////////////////////////////////////////////////////////////////////////////////////
-			(D.f[DIR_P00])[k] = mfabb;//(D.f[ DIR_P00   ])[ke   ] = mfabb;// -  c2over27 ;  (D.f[ DIR_P00   ])[k   ]                                                                     
-			(D.f[DIR_M00])[kw] = mfcbb;//(D.f[ DIR_M00   ])[kw   ] = mfcbb;// -  c2over27 ;  (D.f[ DIR_M00   ])[kw  ]                                                                   
-			(D.f[DIR_0P0])[k] = mfbab;//(D.f[ DIR_0P0   ])[kn   ] = mfbab;// -  c2over27 ;	 (D.f[ DIR_0P0   ])[k   ]
-			(D.f[DIR_0M0])[ks] = mfbcb;//(D.f[ DIR_0M0   ])[ks   ] = mfbcb;// -  c2over27 ;	 (D.f[ DIR_0M0   ])[ks  ]
-			(D.f[DIR_00P])[k] = mfbba;//(D.f[ DIR_00P   ])[kt   ] = mfbba;// -  c2over27 ;	 (D.f[ DIR_00P   ])[k   ]
-			(D.f[DIR_00M])[kb] = mfbbc;//(D.f[ DIR_00M   ])[kb   ] = mfbbc;// -  c2over27 ;	 (D.f[ DIR_00M   ])[kb  ]
-			(D.f[DIR_PP0])[k] = mfaab;//(D.f[ DIR_PP0  ])[kne  ] = mfaab;// -  c1over54 ;	 (D.f[ DIR_PP0  ])[k   ]
-			(D.f[DIR_MM0])[ksw] = mfccb;//(D.f[ DIR_MM0  ])[ksw  ] = mfccb;// -  c1over54 ;	 (D.f[ DIR_MM0  ])[ksw ]
-			(D.f[DIR_PM0])[ks] = mfacb;//(D.f[ DIR_PM0  ])[kse  ] = mfacb;// -  c1over54 ;	 (D.f[ DIR_PM0  ])[ks  ]
-			(D.f[DIR_MP0])[kw] = mfcab;//(D.f[ DIR_MP0  ])[knw  ] = mfcab;// -  c1over54 ;	 (D.f[ DIR_MP0  ])[kw  ]
-			(D.f[DIR_P0P])[k] = mfaba;//(D.f[ DIR_P0P  ])[kte  ] = mfaba;// -  c1over54 ;	 (D.f[ DIR_P0P  ])[k   ]
-			(D.f[DIR_M0M])[kbw] = mfcbc;//(D.f[ DIR_M0M  ])[kbw  ] = mfcbc;// -  c1over54 ;	 (D.f[ DIR_M0M  ])[kbw ]
-			(D.f[DIR_P0M])[kb] = mfabc;//(D.f[ DIR_P0M  ])[kbe  ] = mfabc;// -  c1over54 ;	 (D.f[ DIR_P0M  ])[kb  ]
-			(D.f[DIR_M0P])[kw] = mfcba;//(D.f[ DIR_M0P  ])[ktw  ] = mfcba;// -  c1over54 ;	 (D.f[ DIR_M0P  ])[kw  ]
-			(D.f[DIR_0PP])[k] = mfbaa;//(D.f[ DIR_0PP  ])[ktn  ] = mfbaa;// -  c1over54 ;	 (D.f[ DIR_0PP  ])[k   ]
-			(D.f[DIR_0MM])[kbs] = mfbcc;//(D.f[ DIR_0MM  ])[kbs  ] = mfbcc;// -  c1over54 ;	 (D.f[ DIR_0MM  ])[kbs ]
-			(D.f[DIR_0PM])[kb] = mfbac;//(D.f[ DIR_0PM  ])[kbn  ] = mfbac;// -  c1over54 ;	 (D.f[ DIR_0PM  ])[kb  ]
-			(D.f[DIR_0MP])[ks] = mfbca;//(D.f[ DIR_0MP  ])[kts  ] = mfbca;// -  c1over54 ;	 (D.f[ DIR_0MP  ])[ks  ]
-			(D.f[DIR_000])[k] = mfbbb;//(D.f[ DIR_000])[kzero] = mfbbb;// -  c8over27 ;	 (D.f[ DIR_000])[k   ]
-			(D.f[DIR_PPP])[k] = mfaaa;//(D.f[ DIR_PPP ])[ktne ] = mfaaa;// -  c1over216;	 (D.f[ DIR_PPP ])[k   ]
-			(D.f[DIR_PMP])[ks] = mfaca;//(D.f[ DIR_PMP ])[ktse ] = mfaca;// -  c1over216;	 (D.f[ DIR_PMP ])[ks  ]
-			(D.f[DIR_PPM])[kb] = mfaac;//(D.f[ DIR_PPM ])[kbne ] = mfaac;// -  c1over216;	 (D.f[ DIR_PPM ])[kb  ]
-			(D.f[DIR_PMM])[kbs] = mfacc;//(D.f[ DIR_PMM ])[kbse ] = mfacc;// -  c1over216;	 (D.f[ DIR_PMM ])[kbs ]
-			(D.f[DIR_MPP])[kw] = mfcaa;//(D.f[ DIR_MPP ])[ktnw ] = mfcaa;// -  c1over216;	 (D.f[ DIR_MPP ])[kw  ]
-			(D.f[DIR_MMP])[ksw] = mfcca;//(D.f[ DIR_MMP ])[ktsw ] = mfcca;// -  c1over216;	 (D.f[ DIR_MMP ])[ksw ]
-			(D.f[DIR_MPM])[kbw] = mfcac;//(D.f[ DIR_MPM ])[kbnw ] = mfcac;// -  c1over216;	 (D.f[ DIR_MPM ])[kbw ]
-			(D.f[DIR_MMM])[kbsw] = mfccc;//(D.f[ DIR_MMM ])[kbsw ] = mfccc;// -  c1over216;	 (D.f[ DIR_MMM ])[kbsw]
-										////////////////////////////////////////////////////////////////////////////////////
-		}
-	}
-}
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17/CumulantK17Comp_Device.cuh b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17/CumulantK17Comp_Device.cuh
deleted file mode 100644
index f44842057d554498b0b5d4c733e2425e524a3b75..0000000000000000000000000000000000000000
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17/CumulantK17Comp_Device.cuh
+++ /dev/null
@@ -1,18 +0,0 @@
-#ifndef LB_Kernel_CUMULANT_K17_COMP_H
-#define LB_Kernel_CUMULANT_K17_COMP_H
-
-#include <DataTypes.h>
-#include <curand.h>
-
-__global__ void LB_Kernel_CumulantK17Comp(	real omega,
-														unsigned int* bcMatD,
-														unsigned int* neighborX,
-														unsigned int* neighborY,
-														unsigned int* neighborZ,
-														real* DDStart,
-														int size_Mat,
-														int level,
-														real* forces,
-                                                        real* quadricLimiters,
-														bool EvenOrOdd);
-#endif
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimRedesigned/CumulantK17CompChimRedesignedDevice.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17/CumulantK17_Device.cu
similarity index 63%
rename from src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimRedesigned/CumulantK17CompChimRedesignedDevice.cu
rename to src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17/CumulantK17_Device.cu
index db8caf1b23c2087a4c5c76886fb4530bc6272a1d..1ffec96c255b7923f3ee39c01f756abd8cad8862 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimRedesigned/CumulantK17CompChimRedesignedDevice.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17/CumulantK17_Device.cu
@@ -1,55 +1,78 @@
-//=======================================================================================
-// ____          ____    __    ______     __________   __      __       __        __         
-// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |        
-//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |        
-//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |        
-//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____    
-//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|   
-//      \    \  |    |   ________________________________________________________________    
-//       \    \ |    |  |  ______________________________________________________________|   
-//        \    \|    |  |  |         __          __     __     __     ______      _______    
-//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)   
-//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______    
+
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
 //           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
-//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/   
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
 //
-//  This file is part of VirtualFluids. VirtualFluids is free software: you can 
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
 //  redistribute it and/or modify it under the terms of the GNU General Public
-//  License as published by the Free Software Foundation, either version 3 of 
+//  License as published by the Free Software Foundation, either version 3 of
 //  the License, or (at your option) any later version.
-//  
-//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT 
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
-//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 //  for more details.
-//  
+//
 //  You should have received a copy of the GNU General Public License along
 //  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
-//! \file Cumulant27chimStream.cu
-//! \ingroup GPU
-//! \author Martin Schoenherr, Anna Wellmann
+//! \file CumulantK17_Device.cu
+//! \author Anna Wellmann, Martin Schönherr, Henry Korb, Henrik Asmuth
+//! \date 05/12/2022
+//! \brief Kernel for CumulantK17 including different turbulence models and options for local body forces and writing macroscopic variables
+//!
+//! CumulantK17 kernel using chimera transformations and quartic limiters as present in Geier et al. (2017). Additional options are three different
+//! eddy-viscosity turbulence models (Smagorinsky, AMD, QR) that can be set via the template parameter turbulenceModel (with default
+//! TurbulenceModel::None).
+//! The kernel is executed separately for each subset of fluid node indices with a different tag CollisionTemplate. For each subset, only the locally
+//! required options are switched on ( \param writeMacroscopicVariables and/or \param applyBodyForce) in order to minimize memory accesses. The default
+//! refers to the plain cumulant kernel (CollisionTemplate::Default).
+//! Nodes are added to subsets (taggedFluidNodes) in Simulation::init using a corresponding tag with different values of CollisionTemplate. These subsets
+//! are provided by the utilized PostCollisionInteractors depending on their specific requirements (e.g. writeMacroscopicVariables for probes).
+
 //=======================================================================================
-/* Device code */
-#include "LBM/LB.h" 
+#include "LBM/LB.h"
 #include "lbm/constants/D3Q27.h"
-#include <lbm/constants/NumericConstants.h>
-#include "Kernel/Utilities/DistributionHelper.cuh"
+#include "lbm/constants/NumericConstants.h"
+#include "LBM/GPUHelperFunctions/KernelUtilities.h"
+#include "LBM/GPUHelperFunctions/ChimeraTransformation.h"
+
+#include "GPU/TurbulentViscosityInlines.cuh"
 
 using namespace vf::lbm::constant;
 using namespace vf::lbm::dir;
-#include "Kernel/Utilities/ChimeraTransformation.h"
+using namespace vf::gpu;
 
 ////////////////////////////////////////////////////////////////////////////////
-__global__ void LB_Kernel_CumulantK17CompChimRedesigned(
-    real omega,
+template<TurbulenceModel turbulenceModel, bool writeMacroscopicVariables, bool applyBodyForce>
+__global__ void LB_Kernel_CumulantK17(
+    real omega_in,
     uint* neighborX,
     uint* neighborY,
     uint* neighborZ,
     real* distributions,
-    unsigned long numberOfLBnodes,
+    real* rho,
+    real* vx,
+    real* vy,
+    real* vz,
+    real* turbulentViscosity,
+    real SGSconstant,
+    unsigned long long numberOfLBnodes,
     int level,
     real* forces,
+    real* bodyForceX,
+    real* bodyForceY,
+    real* bodyForceZ,
     real* quadricLimiters,
     bool isEvenTimestep,
     const uint *fluidNodeIndices,
@@ -64,19 +87,18 @@ __global__ void LB_Kernel_CumulantK17CompChimRedesigned(
     //! The cumulant kernel is executed in the following steps
     //!
     ////////////////////////////////////////////////////////////////////////////////
-    //! - Get the thread index coordinates from threadIdx, blockIdx, blockDim and gridDim.
+    //! - Get node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
     //!
-    const unsigned kThread = vf::gpu::getNodeIndex();
+    const unsigned nodeIndex = getNodeIndex();
 
     //////////////////////////////////////////////////////////////////////////
-    //! - Return for non-fluid nodes
-    if (kThread >= numberOfFluidNodes) 
+    //! - Return early for threads whose index is not below numberOfFluidNodes
+    if (nodeIndex >= numberOfFluidNodes)
         return;
-
     ////////////////////////////////////////////////////////////////////////////////
     //! - Get the node index from the array containing all indices of fluid nodes
     //!
-    const unsigned k_000 = fluidNodeIndices[kThread];
+    const unsigned k_000 = fluidNodeIndices[nodeIndex];
 
     //////////////////////////////////////////////////////////////////////////
     //! - Read distributions: style of reading and writing the distributions from/to stored arrays dependent on
@@ -84,11 +106,11 @@ __global__ void LB_Kernel_CumulantK17CompChimRedesigned(
     //! href="https://doi.org/10.3390/computation5020019"><b>[ M. Geier et al. (2017),
     //! DOI:10.3390/computation5020019 ]</b></a>
     //!
-    Distributions27 dist = vf::gpu::getDistributionReferences27(distributions, numberOfLBnodes, isEvenTimestep);
-    
+    Distributions27 dist;
+    getPointersToDistributions(dist, distributions, numberOfLBnodes, isEvenTimestep);
+
     ////////////////////////////////////////////////////////////////////////////////
     //! - Set neighbor indices (necessary for indirect addressing)
-    //!
     uint k_M00 = neighborX[k_000];
     uint k_0M0 = neighborY[k_000];
     uint k_00M = neighborZ[k_000];
@@ -96,9 +118,8 @@ __global__ void LB_Kernel_CumulantK17CompChimRedesigned(
     uint k_M0M = neighborZ[k_M00];
     uint k_0MM = neighborZ[k_0M0];
     uint k_MMM = neighborZ[k_MM0];
-
     ////////////////////////////////////////////////////////////////////////////////////
-    //! - Set local distributions (f's):
+    //! - Set local distributions
     //!
     real f_000 = (dist.f[DIR_000])[k_000];
     real f_P00 = (dist.f[DIR_P00])[k_000];
@@ -159,28 +180,28 @@ __global__ void LB_Kernel_CumulantK17CompChimRedesigned(
     real& m_200 = f_PMM;
     real& m_000 = f_MMM;
 
-    ////////////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////////////
     //! - Calculate density and velocity using pyramid summation for low round-off errors as in Eq. (J1)-(J3) \ref
     //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015),
     //! DOI:10.1016/j.camwa.2015.05.001 ]</b></a>
     //!
     real drho = ((((f_PPP + f_MMM) + (f_MPM + f_PMP)) + ((f_MPP + f_PMM) + (f_MMP + f_PPM))) +
-                 (((f_0MP + f_0PM) + (f_0MM + f_0PP)) + ((f_M0P + f_P0M) + (f_M0M + f_P0P)) +
-                  ((f_MP0 + f_PM0) + (f_MM0 + f_PP0))) +
-                  ((f_M00 + f_P00) + (f_0M0 + f_0P0) + (f_00M + f_00P))) +
+                (((f_0MP + f_0PM) + (f_0MM + f_0PP)) + ((f_M0P + f_P0M) + (f_M0M + f_P0P)) +
+                ((f_MP0 + f_PM0) + (f_MM0 + f_PP0))) +
+                ((f_M00 + f_P00) + (f_0M0 + f_0P0) + (f_00M + f_00P))) +
                     f_000;
 
     real oneOverRho = c1o1 / (c1o1 + drho);
 
     real vvx = ((((f_PPP - f_MMM) + (f_PMP - f_MPM)) + ((f_PMM - f_MPP) + (f_PPM - f_MMP))) +
                 (((f_P0M - f_M0P) + (f_P0P - f_M0M)) + ((f_PM0 - f_MP0) + (f_PP0 - f_MM0))) + (f_P00 - f_M00)) *
-               oneOverRho;
+            oneOverRho;
     real vvy = ((((f_PPP - f_MMM) + (f_MPM - f_PMP)) + ((f_MPP - f_PMM) + (f_PPM - f_MMP))) +
                 (((f_0PM - f_0MP) + (f_0PP - f_0MM)) + ((f_MP0 - f_PM0) + (f_PP0 - f_MM0))) + (f_0P0 - f_0M0)) *
-               oneOverRho;
+            oneOverRho;
     real vvz = ((((f_PPP - f_MMM) + (f_PMP - f_MPM)) + ((f_MPP - f_PMM) + (f_MMP - f_PPM))) +
                 (((f_0MP - f_0PM) + (f_0PP - f_0MM)) + ((f_M0P - f_P0M) + (f_P0P - f_M0M))) + (f_00P - f_00M)) *
-               oneOverRho;
+            oneOverRho;
 
     ////////////////////////////////////////////////////////////////////////////////////
     //! - Add half of the acceleration (body force) to the velocity as in Eq. (42) \ref
@@ -188,16 +209,55 @@ __global__ void LB_Kernel_CumulantK17CompChimRedesigned(
     //! DOI:10.1016/j.camwa.2015.05.001 ]</b></a>
     //!
     real factor = c1o1;
-    // The factor has to be scaled for each level to get the correct acceleration.
     for (size_t i = 1; i <= level; i++) {
         factor *= c2o1;
     }
-    real fx = forces[0] / factor;
-    real fy = forces[1] / factor;
-    real fz = forces[2] / factor;
-    vvx += fx * c1o2;
-    vvy += fy * c1o2;
-    vvz += fz * c1o2;
+
+    real fx = forces[0];
+    real fy = forces[1];
+    real fz = forces[2];
+
+    if( applyBodyForce ){
+        fx += bodyForceX[k_000];
+        fy += bodyForceY[k_000];
+        fz += bodyForceZ[k_000];
+
+        // real vx = vvx;
+        // real vy = vvy;
+        // real vz = vvz;
+        real acc_x = fx * c1o2 / factor;
+        real acc_y = fy * c1o2 / factor;
+        real acc_z = fz * c1o2 / factor;
+
+        vvx += acc_x;
+        vvy += acc_y;
+        vvz += acc_z;
+
+        // Reset body force. To be used when not using round-off correction.
+        bodyForceX[k_000] = 0.0f;
+        bodyForceY[k_000] = 0.0f;
+        bodyForceZ[k_000] = 0.0f;
+
+        ////////////////////////////////////////////////////////////////////////////////////
+        //!> Round-off correction
+        //!
+        //!> Similar to Kahan summation algorithm (https://en.wikipedia.org/wiki/Kahan_summation_algorithm)
+        //!> Essentially computes the round-off error of the applied force and adds it in the next time step as a compensation.
+        //!> Seems to be necessary at very high Re boundary layers, where the forcing and velocity can
+        //!> differ by several orders of magnitude.
+        //!> \note 16/05/2022: Testing, still ongoing!
+        //!
+        // bodyForceX[k_000] = (acc_x-(vvx-vx))*factor*c2o1;
+        // bodyForceY[k_000] = (acc_y-(vvy-vy))*factor*c2o1;
+        // bodyForceZ[k_000] = (acc_z-(vvz-vz))*factor*c2o1;
+    }
+    else{
+        vvx += fx * c1o2 / factor;
+        vvy += fy * c1o2 / factor;
+        vvz += fz * c1o2 / factor;
+    }
+
+
     ////////////////////////////////////////////////////////////////////////////////////
     // calculate the square of velocities for this lattice node
     real vx2 = vvx * vvx;
@@ -272,15 +332,21 @@ __global__ void LB_Kernel_CumulantK17CompChimRedesigned(
     //!  - Fifth order cumulants \f$ C_{221}, C_{212}, C_{122}\f$: \f$\omega_9=O5=1.0\f$.
     //!  - Sixth order cumulant \f$ C_{222}\f$: \f$\omega_{10}=O6=1.0\f$.
     //!
+    ////////////////////////////////////////////////////////////////////////////////////
+    //! - Calculate modified omega with turbulent viscosity
+    //!
+    real omega = omega_in;
+    if(turbulenceModel != TurbulenceModel::None){ omega /= (c1o1 + c3o1*omega_in*turbulentViscosity[k_000]); }
     ////////////////////////////////////////////////////////////
     // 2.
     real OxxPyyPzz = c1o1;
     ////////////////////////////////////////////////////////////
     // 3.
-    real OxyyPxzz = c8o1 * (-c2o1 + omega) * (c1o1 + c2o1 * omega)  / (-c8o1 - c14o1 * omega + c7o1 * omega * omega);
-    real OxyyMxzz = c8o1 * (-c2o1 + omega) * (-c7o1 + c4o1 * omega) / (c56o1 - c50o1 * omega + c9o1 * omega * omega);
-    real Oxyz     = c24o1 * (-c2o1 + omega) * (-c2o1 - c7o1 * omega + c3o1 * omega * omega) /
-                    (c48o1 + c152o1 * omega - c130o1 * omega * omega + c29o1 * omega * omega * omega);
+    real OxyyPxzz = c8o1 * (-c2o1 + omega) * (c1o1 + c2o1 * omega) / (-c8o1 - c14o1 * omega + c7o1 * omega * omega);
+    real OxyyMxzz =
+        c8o1 * (-c2o1 + omega) * (-c7o1 + c4o1 * omega) / (c56o1 - c50o1 * omega + c9o1 * omega * omega);
+    real Oxyz = c24o1 * (-c2o1 + omega) * (-c2o1 - c7o1 * omega + c3o1 * omega * omega) /
+                (c48o1 + c152o1 * omega - c130o1 * omega * omega + c29o1 * omega * omega * omega);
     ////////////////////////////////////////////////////////////
     // 4.
     real O4 = c1o1;
@@ -292,16 +358,16 @@ __global__ void LB_Kernel_CumulantK17CompChimRedesigned(
     real O6 = c1o1;
 
     ////////////////////////////////////////////////////////////////////////////////////
-    //! - A and B: parameters for fourth order convergence of the diffusion term according to Eq. (115) and (116)
+    //! - A and B: parameters for fourth order convergence of the diffusion term according to Eq. (115) and (116)
     //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017),
     //! DOI:10.1016/j.jcp.2017.05.040 ]</b></a> with simplifications assuming \f$ \omega_2 = 1.0 \f$ (modify for
     //! different bulk viscosity).
     //!
-    real factorA = (c4o1 + c2o1  * omega - c3o1  * omega * omega) / (c2o1 - c7o1  * omega + c5o1  * omega * omega);
+    real factorA = (c4o1 + c2o1 * omega - c3o1 * omega * omega) / (c2o1 - c7o1 * omega + c5o1 * omega * omega);
     real factorB = (c4o1 + c28o1 * omega - c14o1 * omega * omega) / (c6o1 - c21o1 * omega + c15o1 * omega * omega);
 
     ////////////////////////////////////////////////////////////////////////////////////
-    //! - Compute cumulants (c's) from central moments according to Eq. (20)-(23) in
+    //! - Compute cumulants from central moments according to Eq. (20)-(23) in
     //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017),
     //! DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
     //!
@@ -318,27 +384,27 @@ __global__ void LB_Kernel_CumulantK17CompChimRedesigned(
     // 5.
     real c_122 =
         m_122 - ((m_002 * m_120 + m_020 * m_102 + c4o1 * m_011 * m_111 + c2o1 * (m_101 * m_021 + m_110 * m_012)) +
-                 c1o3 * (m_120 + m_102)) *
-                 oneOverRho;
+                c1o3 * (m_120 + m_102)) *
+                oneOverRho;
     real c_212 =
         m_212 - ((m_002 * m_210 + m_200 * m_012 + c4o1 * m_101 * m_111 + c2o1 * (m_011 * m_201 + m_110 * m_102)) +
-                 c1o3 * (m_210 + m_012)) *
-                 oneOverRho;
+                c1o3 * (m_210 + m_012)) *
+                oneOverRho;
     real c_221 =
         m_221 - ((m_200 * m_021 + m_020 * m_201 + c4o1 * m_110 * m_111 + c2o1 * (m_101 * m_120 + m_011 * m_210)) +
-                 c1o3 * (m_021 + m_201)) *
-                 oneOverRho;
+                c1o3 * (m_021 + m_201)) *
+                oneOverRho;
     ////////////////////////////////////////////////////////////
     // 6.
     real c_222 = m_222 + ((-c4o1 * m_111 * m_111 - (m_200 * m_022 + m_020 * m_202 + m_002 * m_220) -
                             c4o1 * (m_011 * m_211 + m_101 * m_121 + m_110 * m_112) -
                             c2o1 * (m_120 * m_102 + m_210 * m_012 + m_201 * m_021)) *
                             oneOverRho +
-                           (c4o1 * (m_101 * m_101 * m_020 + m_011 * m_011 * m_200 + m_110 * m_110 * m_002) +
+                        (c4o1 * (m_101 * m_101 * m_020 + m_011 * m_011 * m_200 + m_110 * m_110 * m_002) +
                             c2o1 * (m_200 * m_020 * m_002) + c16o1 * m_110 * m_101 * m_011) *
                             oneOverRho * oneOverRho -
                             c1o3 * (m_022 + m_202 + m_220) * oneOverRho - c1o9 * (m_200 + m_020 + m_002) * oneOverRho +
-                           (c2o1 * (m_101 * m_101 + m_011 * m_011 + m_110 * m_110) +
+                        (c2o1 * (m_101 * m_101 + m_011 * m_011 + m_110 * m_110) +
                             (m_002 * m_020 + m_002 * m_200 + m_020 * m_200) + c1o3 * (m_002 + m_020 + m_200)) *
                             oneOverRho * oneOverRho * c2o3 +
                             c1o27 * ((drho * drho - drho) * oneOverRho * oneOverRho));
@@ -378,6 +444,22 @@ __global__ void LB_Kernel_CumulantK17CompChimRedesigned(
     real dxux = c1o2 * (-omega) * (mxxMyy + mxxMzz) + c1o2 * OxxPyyPzz * (m_000 - mxxPyyPzz);
     real dyuy = dxux + omega * c3o2 * mxxMyy;
     real dzuz = dxux + omega * c3o2 * mxxMzz;
+
+    ////////////////////////////////////////////////////////////////////////////////////
+    switch (turbulenceModel)
+    {
+    case TurbulenceModel::None:
+    case TurbulenceModel::AMD:  //AMD is computed in separate kernel
+        break;
+    case TurbulenceModel::Smagorinsky:
+        turbulentViscosity[k_000] = calcTurbulentViscositySmagorinsky(SGSconstant, dxux, dyuy, dzuz, Dxy, Dxz , Dyz);
+        break;
+    case TurbulenceModel::QR:
+        turbulentViscosity[k_000] = calcTurbulentViscosityQR(SGSconstant, dxux, dyuy, dzuz, Dxy, Dxz , Dyz);
+        break;
+    default:
+        break;
+    }
     ////////////////////////////////////////////////////////////
     //! - Relaxation of second order cumulants with correction terms according to Eq. (33)-(35) in
     //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017),
@@ -386,7 +468,6 @@ __global__ void LB_Kernel_CumulantK17CompChimRedesigned(
     mxxPyyPzz += OxxPyyPzz * (m_000 - mxxPyyPzz) - c3o1 * (c1o1 - c1o2 * OxxPyyPzz) * (vx2 * dxux + vy2 * dyuy + vz2 * dzuz);
     mxxMyy += omega * (-mxxMyy) - c3o1 * (c1o1 + c1o2 * (-omega)) * (vx2 * dxux - vy2 * dyuy);
     mxxMzz += omega * (-mxxMzz) - c3o1 * (c1o1 + c1o2 * (-omega)) * (vx2 * dxux - vz2 * dzuz);
-    //////////////////////////////////////////////////////////////////////////
 
     ////////////////////////////////////////////////////////////////////////////////////
     ////no correction
@@ -394,18 +475,18 @@ __global__ void LB_Kernel_CumulantK17CompChimRedesigned(
     // mxxMyy += -(-omega) * (-mxxMyy);
     // mxxMzz += -(-omega) * (-mxxMzz);
     //////////////////////////////////////////////////////////////////////////
-    
     m_011 += omega * (-m_011);
     m_101 += omega * (-m_101);
     m_110 += omega * (-m_110);
 
+    ////////////////////////////////////////////////////////////////////////////////////
+    // relax
     //////////////////////////////////////////////////////////////////////////
+    // incl. limiter
     //! - Relaxation of third order cumulants including limiter according to Eq. (116)-(123)
     //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017),
     //! DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
     //!
-    //////////////////////////////////////////////////////////////////////////
-    // incl. limiter
     real wadjust = Oxyz + (c1o1 - Oxyz) * abs(m_111) / (abs(m_111) + quadricLimitD);
     m_111 += wadjust * (-m_111);
     wadjust = OxyyPxzz + (c1o1 - OxyyPxzz) * abs(mxxyPyzz) / (abs(mxxyPyzz) + quadricLimitP);
@@ -459,6 +540,7 @@ __global__ void LB_Kernel_CumulantK17CompChimRedesigned(
     c_121 = -O4 * (c1o1 / omega - c1o2) * Dxz           * c1o3 * factorB + (c1o1 - O4) * (c_121);
     c_211 = -O4 * (c1o1 / omega - c1o2) * Dyz           * c1o3 * factorB + (c1o1 - O4) * (c_211);
 
+
     //////////////////////////////////////////////////////////////////////////
     // 5.
     c_122 += O5 * (-c_122);
@@ -503,17 +585,17 @@ __global__ void LB_Kernel_CumulantK17CompChimRedesigned(
     //////////////////////////////////////////////////////////////////////////
     // 6.
     m_222 = c_222 - ((-c4o1 * m_111 * m_111 - (m_200 * m_022 + m_020 * m_202 + m_002 * m_220) -
-                       c4o1 * (m_011 * m_211 + m_101 * m_121 + m_110 * m_112) -
-                       c2o1 * (m_120 * m_102 + m_210 * m_012 + m_201 * m_021)) *
-                       oneOverRho +
-                      (c4o1 * (m_101 * m_101 * m_020 + m_011 * m_011 * m_200 + m_110 * m_110 * m_002) +
-                       c2o1 * (m_200 * m_020 * m_002) + c16o1 * m_110 * m_101 * m_011) *
-                       oneOverRho * oneOverRho -
-                       c1o3 * (m_022 + m_202 + m_220) * oneOverRho - c1o9 * (m_200 + m_020 + m_002) * oneOverRho +
-                      (c2o1 * (m_101 * m_101 + m_011 * m_011 + m_110 * m_110) +
-                       (m_002 * m_020 + m_002 * m_200 + m_020 * m_200) + c1o3 * (m_002 + m_020 + m_200)) *
-                       oneOverRho * oneOverRho * c2o3 +
-                       c1o27 * ((drho * drho - drho) * oneOverRho * oneOverRho));
+                    c4o1 * (m_011 * m_211 + m_101 * m_121 + m_110 * m_112) -
+                    c2o1 * (m_120 * m_102 + m_210 * m_012 + m_201 * m_021)) *
+                    oneOverRho +
+                    (c4o1 * (m_101 * m_101 * m_020 + m_011 * m_011 * m_200 + m_110 * m_110 * m_002) +
+                    c2o1 * (m_200 * m_020 * m_002) + c16o1 * m_110 * m_101 * m_011) *
+                    oneOverRho * oneOverRho -
+                    c1o3 * (m_022 + m_202 + m_220) * oneOverRho - c1o9 * (m_200 + m_020 + m_002) * oneOverRho +
+                    (c2o1 * (m_101 * m_101 + m_011 * m_011 + m_110 * m_110) +
+                    (m_002 * m_020 + m_002 * m_200 + m_020 * m_200) + c1o3 * (m_002 + m_020 + m_200)) *
+                    oneOverRho * oneOverRho * c2o3 +
+                    c1o27 * ((drho * drho - drho) * oneOverRho * oneOverRho));
 
     ////////////////////////////////////////////////////////////////////////////////////
     //! -  Add acceleration (body force) to first order cumulants according to Eq. (85)-(87) in
@@ -524,6 +606,15 @@ __global__ void LB_Kernel_CumulantK17CompChimRedesigned(
     m_010 = -m_010;
     m_001 = -m_001;
 
+    //Write to array here to distribute read/write
+    if(writeMacroscopicVariables)
+    {
+        rho[k_000] = drho;
+        vx[k_000] = vvx;
+        vy[k_000] = vvy;
+        vz[k_000] = vvz;
+    }
+
     ////////////////////////////////////////////////////////////////////////////////////
     //! - Chimera transform from central moments to well conditioned distributions as defined in Appendix J in
     //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015),
@@ -573,31 +664,63 @@ __global__ void LB_Kernel_CumulantK17CompChimRedesigned(
     //! <a href="https://doi.org/10.3390/computation5020019"><b>[ M. Geier et al. (2017),
     //! DOI:10.3390/computation5020019 ]</b></a>
     //!
-    (dist.f[DIR_P00])[k_000] = f_M00;
-    (dist.f[DIR_M00])[k_M00] = f_P00;
-    (dist.f[DIR_0P0])[k_000] = f_0M0;
-    (dist.f[DIR_0M0])[k_0M0] = f_0P0;
-    (dist.f[DIR_00P])[k_000] = f_00M;
-    (dist.f[DIR_00M])[k_00M] = f_00P;
-    (dist.f[DIR_PP0])[k_000] = f_MM0;
-    (dist.f[DIR_MM0])[k_MM0] = f_PP0;
-    (dist.f[DIR_PM0])[k_0M0] = f_MP0;
-    (dist.f[DIR_MP0])[k_M00] = f_PM0;
-    (dist.f[DIR_P0P])[k_000] = f_M0M;
-    (dist.f[DIR_M0M])[k_M0M] = f_P0P;
-    (dist.f[DIR_P0M])[k_00M] = f_M0P;
-    (dist.f[DIR_M0P])[k_M00] = f_P0M;
-    (dist.f[DIR_0PP])[k_000] = f_0MM;
-    (dist.f[DIR_0MM])[k_0MM] = f_0PP;
-    (dist.f[DIR_0PM])[k_00M] = f_0MP;
-    (dist.f[DIR_0MP])[k_0M0] = f_0PM;
+    (dist.f[DIR_P00])[k_000]    = f_M00;
+    (dist.f[DIR_M00])[k_M00]    = f_P00;
+    (dist.f[DIR_0P0])[k_000]    = f_0M0;
+    (dist.f[DIR_0M0])[k_0M0]    = f_0P0;
+    (dist.f[DIR_00P])[k_000]    = f_00M;
+    (dist.f[DIR_00M])[k_00M]    = f_00P;
+    (dist.f[DIR_PP0])[k_000]   = f_MM0;
+    (dist.f[DIR_MM0])[k_MM0]   = f_PP0;
+    (dist.f[DIR_PM0])[k_0M0]   = f_MP0;
+    (dist.f[DIR_MP0])[k_M00]   = f_PM0;
+    (dist.f[DIR_P0P])[k_000]   = f_M0M;
+    (dist.f[DIR_M0M])[k_M0M]   = f_P0P;
+    (dist.f[DIR_P0M])[k_00M]   = f_M0P;
+    (dist.f[DIR_M0P])[k_M00]   = f_P0M;
+    (dist.f[DIR_0PP])[k_000]   = f_0MM;
+    (dist.f[DIR_0MM])[k_0MM]   = f_0PP;
+    (dist.f[DIR_0PM])[k_00M]   = f_0MP;
+    (dist.f[DIR_0MP])[k_0M0]   = f_0PM;
     (dist.f[DIR_000])[k_000] = f_000;
-    (dist.f[DIR_PPP])[k_000] = f_MMM;
-    (dist.f[DIR_PMP])[k_0M0] = f_MPM;
-    (dist.f[DIR_PPM])[k_00M] = f_MMP;
-    (dist.f[DIR_PMM])[k_0MM] = f_MPP;
-    (dist.f[DIR_MPP])[k_M00] = f_PMM;
-    (dist.f[DIR_MMP])[k_MM0] = f_PPM;
-    (dist.f[DIR_MPM])[k_M0M] = f_PMP;
-    (dist.f[DIR_MMM])[k_MMM] = f_PPP;
-}
\ No newline at end of file
+    (dist.f[DIR_PPP])[k_000]  = f_MMM;
+    (dist.f[DIR_PMP])[k_0M0]  = f_MPM;
+    (dist.f[DIR_PPM])[k_00M]  = f_MMP;
+    (dist.f[DIR_PMM])[k_0MM]  = f_MPP;
+    (dist.f[DIR_MPP])[k_M00]  = f_PMM;
+    (dist.f[DIR_MMP])[k_MM0]  = f_PPM;
+    (dist.f[DIR_MPM])[k_M0M]  = f_PMP;
+    (dist.f[DIR_MMM])[k_MMM]  = f_PPP;
+}
+// Explicit instantiations, grouped by <turbulenceModel, writeMacroscopicVariables, applyBodyForce>: group <*, true, true>
+template __global__ void LB_Kernel_CumulantK17<TurbulenceModel::AMD, true, true>(real omega_in, uint* neighborX, uint* neighborY, uint* neighborZ, real* distributions, real* rho, real* vx, real* vy, real* vz, real* turbulentViscosity, real SGSconstant, unsigned long long numberOfLBnodes, int level, real* forces, real* bodyForceX, real* bodyForceY, real* bodyForceZ, real* quadricLimiters, bool isEvenTimestep, const uint* fluidNodeIndices, uint numberOfFluidNodes);
+
+template __global__ void LB_Kernel_CumulantK17<TurbulenceModel::Smagorinsky, true, true>(real omega_in, uint* neighborX, uint* neighborY, uint* neighborZ, real* distributions, real* rho, real* vx, real* vy, real* vz, real* turbulentViscosity, real SGSconstant, unsigned long long numberOfLBnodes, int level, real* forces, real* bodyForceX, real* bodyForceY, real* bodyForceZ, real* quadricLimiters, bool isEvenTimestep, const uint* fluidNodeIndices, uint numberOfFluidNodes);
+
+template __global__ void LB_Kernel_CumulantK17<TurbulenceModel::QR, true, true>(real omega_in, uint* neighborX, uint* neighborY, uint* neighborZ, real* distributions, real* rho, real* vx, real* vy, real* vz, real* turbulentViscosity, real SGSconstant, unsigned long long numberOfLBnodes, int level, real* forces, real* bodyForceX, real* bodyForceY, real* bodyForceZ, real* quadricLimiters, bool isEvenTimestep, const uint* fluidNodeIndices, uint numberOfFluidNodes);
+
+template __global__ void LB_Kernel_CumulantK17<TurbulenceModel::None, true, true>(real omega_in, uint* neighborX, uint* neighborY, uint* neighborZ, real* distributions, real* rho, real* vx, real* vy, real* vz, real* turbulentViscosity, real SGSconstant, unsigned long long numberOfLBnodes, int level, real* forces, real* bodyForceX, real* bodyForceY, real* bodyForceZ, real* quadricLimiters, bool isEvenTimestep, const uint* fluidNodeIndices, uint numberOfFluidNodes);
+// Group <*, true, false>
+template __global__ void LB_Kernel_CumulantK17<TurbulenceModel::AMD, true, false>(real omega_in, uint* neighborX, uint* neighborY, uint* neighborZ, real* distributions, real* rho, real* vx, real* vy, real* vz, real* turbulentViscosity, real SGSconstant, unsigned long long numberOfLBnodes, int level, real* forces, real* bodyForceX, real* bodyForceY, real* bodyForceZ, real* quadricLimiters, bool isEvenTimestep, const uint* fluidNodeIndices, uint numberOfFluidNodes);
+
+template __global__ void LB_Kernel_CumulantK17<TurbulenceModel::Smagorinsky, true, false>(real omega_in, uint* neighborX, uint* neighborY, uint* neighborZ, real* distributions, real* rho, real* vx, real* vy, real* vz, real* turbulentViscosity, real SGSconstant, unsigned long long numberOfLBnodes, int level, real* forces, real* bodyForceX, real* bodyForceY, real* bodyForceZ, real* quadricLimiters, bool isEvenTimestep, const uint* fluidNodeIndices, uint numberOfFluidNodes);
+
+template __global__ void LB_Kernel_CumulantK17<TurbulenceModel::QR, true, false>(real omega_in, uint* neighborX, uint* neighborY, uint* neighborZ, real* distributions, real* rho, real* vx, real* vy, real* vz, real* turbulentViscosity, real SGSconstant, unsigned long long numberOfLBnodes, int level, real* forces, real* bodyForceX, real* bodyForceY, real* bodyForceZ, real* quadricLimiters, bool isEvenTimestep, const uint* fluidNodeIndices, uint numberOfFluidNodes);
+
+template __global__ void LB_Kernel_CumulantK17<TurbulenceModel::None, true, false>(real omega_in, uint* neighborX, uint* neighborY, uint* neighborZ, real* distributions, real* rho, real* vx, real* vy, real* vz, real* turbulentViscosity, real SGSconstant, unsigned long long numberOfLBnodes, int level, real* forces, real* bodyForceX, real* bodyForceY, real* bodyForceZ, real* quadricLimiters, bool isEvenTimestep, const uint* fluidNodeIndices, uint numberOfFluidNodes);
+// Group <*, false, true>
+template __global__ void LB_Kernel_CumulantK17<TurbulenceModel::AMD, false, true>(real omega_in, uint* neighborX, uint* neighborY, uint* neighborZ, real* distributions, real* rho, real* vx, real* vy, real* vz, real* turbulentViscosity, real SGSconstant, unsigned long long numberOfLBnodes, int level, real* forces, real* bodyForceX, real* bodyForceY, real* bodyForceZ, real* quadricLimiters, bool isEvenTimestep, const uint* fluidNodeIndices, uint numberOfFluidNodes);
+
+template __global__ void LB_Kernel_CumulantK17<TurbulenceModel::Smagorinsky, false, true>(real omega_in, uint* neighborX, uint* neighborY, uint* neighborZ, real* distributions, real* rho, real* vx, real* vy, real* vz, real* turbulentViscosity, real SGSconstant, unsigned long long numberOfLBnodes, int level, real* forces, real* bodyForceX, real* bodyForceY, real* bodyForceZ, real* quadricLimiters, bool isEvenTimestep, const uint* fluidNodeIndices, uint numberOfFluidNodes);
+
+template __global__ void LB_Kernel_CumulantK17<TurbulenceModel::QR, false, true>(real omega_in, uint* neighborX, uint* neighborY, uint* neighborZ, real* distributions, real* rho, real* vx, real* vy, real* vz, real* turbulentViscosity, real SGSconstant, unsigned long long numberOfLBnodes, int level, real* forces, real* bodyForceX, real* bodyForceY, real* bodyForceZ, real* quadricLimiters, bool isEvenTimestep, const uint* fluidNodeIndices, uint numberOfFluidNodes);
+
+template __global__ void LB_Kernel_CumulantK17<TurbulenceModel::None, false, true>(real omega_in, uint* neighborX, uint* neighborY, uint* neighborZ, real* distributions, real* rho, real* vx, real* vy, real* vz, real* turbulentViscosity, real SGSconstant, unsigned long long numberOfLBnodes, int level, real* forces, real* bodyForceX, real* bodyForceY, real* bodyForceZ, real* quadricLimiters, bool isEvenTimestep, const uint* fluidNodeIndices, uint numberOfFluidNodes);
+// Group <*, false, false>
+template __global__ void LB_Kernel_CumulantK17<TurbulenceModel::AMD, false, false>(real omega_in, uint* neighborX, uint* neighborY, uint* neighborZ, real* distributions, real* rho, real* vx, real* vy, real* vz, real* turbulentViscosity, real SGSconstant, unsigned long long numberOfLBnodes, int level, real* forces, real* bodyForceX, real* bodyForceY, real* bodyForceZ, real* quadricLimiters, bool isEvenTimestep, const uint* fluidNodeIndices, uint numberOfFluidNodes);
+
+template __global__ void LB_Kernel_CumulantK17<TurbulenceModel::Smagorinsky, false, false>(real omega_in, uint* neighborX, uint* neighborY, uint* neighborZ, real* distributions, real* rho, real* vx, real* vy, real* vz, real* turbulentViscosity, real SGSconstant, unsigned long long numberOfLBnodes, int level, real* forces, real* bodyForceX, real* bodyForceY, real* bodyForceZ, real* quadricLimiters, bool isEvenTimestep, const uint* fluidNodeIndices, uint numberOfFluidNodes);
+
+template __global__ void LB_Kernel_CumulantK17<TurbulenceModel::QR, false, false>(real omega_in, uint* neighborX, uint* neighborY, uint* neighborZ, real* distributions, real* rho, real* vx, real* vy, real* vz, real* turbulentViscosity, real SGSconstant, unsigned long long numberOfLBnodes, int level, real* forces, real* bodyForceX, real* bodyForceY, real* bodyForceZ, real* quadricLimiters, bool isEvenTimestep, const uint* fluidNodeIndices, uint numberOfFluidNodes);
+
+template __global__ void LB_Kernel_CumulantK17<TurbulenceModel::None, false, false>(real omega_in, uint* neighborX, uint* neighborY, uint* neighborZ, real* distributions, real* rho, real* vx, real* vy, real* vz, real* turbulentViscosity, real SGSconstant, unsigned long long numberOfLBnodes, int level, real* forces, real* bodyForceX, real* bodyForceY, real* bodyForceZ, real* quadricLimiters, bool isEvenTimestep, const uint* fluidNodeIndices, uint numberOfFluidNodes);
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17/CumulantK17_Device.cuh b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17/CumulantK17_Device.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..da576618d1b08b55629c3c65fc115ceb822c8f7e
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17/CumulantK17_Device.cuh
@@ -0,0 +1,29 @@
+#ifndef LB_Kernel_CUMULANT_K17_H
+#define LB_Kernel_CUMULANT_K17_H
+
+#include <DataTypes.h> // presumably supplies `real` and `uint` -- TODO confirm; TurbulenceModel must also be declared before the template below
+#include <curand.h> // NOTE(review): no curand symbol is used in this header -- confirm whether this include is still required
+//! Declaration only of the templated CumulantK17 kernel; this header holds no definition, so the needed specializations must be instantiated in a .cu file.
+template< TurbulenceModel turbulenceModel, bool writeMacroscopicVariables, bool applyBodyForce > __global__ void LB_Kernel_CumulantK17(
+    real omega_in,
+    uint* neighborX,
+    uint* neighborY,
+    uint* neighborZ,
+    real* distributions,
+    real* rho, // rho, vx, vy, vz: arrays the kernel writes per node when writeMacroscopicVariables is true
+    real* vx,
+    real* vy,
+    real* vz,
+    real* turbulentViscosity,
+    real SGSconstant,
+    unsigned long long numberOfLBnodes,
+    int level,
+    real* forces,
+    real* bodyForceX, // bodyForceX/Y/Z: presumably only read when applyBodyForce is true -- verify against the kernel definition
+    real* bodyForceY,
+    real* bodyForceZ,
+    real* quadricLimiters,
+    bool isEvenTimestep,
+    const uint *fluidNodeIndices, // presumably the indices of the nodes this launch processes -- TODO confirm
+    uint numberOfFluidNodes); // presumably the length of fluidNodeIndices -- TODO confirm
+#endif // LB_Kernel_CUMULANT_K17_H
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17Bulk/CumulantK17BulkComp.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17Bulk/CumulantK17BulkComp.cu
index 72d13282fc604dddcfa84682425a7a1829855ea0..b9e25494490507bde5a6aa7d6dd588ac1a1f6c87 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17Bulk/CumulantK17BulkComp.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17Bulk/CumulantK17BulkComp.cu
@@ -17,17 +17,18 @@ void CumulantK17BulkComp::run()
 	dim3 grid(Grid, 1, 1);
 	dim3 threads(numberOfThreads, 1, 1);
 
-	LB_Kernel_CumulantK17BulkComp << < grid, threads >> >(	para->getParD(level)->omega,
-																	para->getParD(level)->typeOfGridNode,
-																	para->getParD(level)->neighborX,
-																	para->getParD(level)->neighborY,
-																	para->getParD(level)->neighborZ,
-																	para->getParD(level)->distributions.f[0],
-																	para->getParD(level)->numberOfNodes,
-																	level,
-																	para->getForcesDev(),
-                                                                    para->getQuadricLimitersDev(),
-																	para->getParD(level)->isEvenTimestep);
+	LB_Kernel_CumulantK17BulkComp << < grid, threads >> >(
+		para->getParD(level)->omega,
+		para->getParD(level)->typeOfGridNode,
+		para->getParD(level)->neighborX,
+		para->getParD(level)->neighborY,
+		para->getParD(level)->neighborZ,
+		para->getParD(level)->distributions.f[0],
+		para->getParD(level)->numberOfNodes,
+		level,
+		para->getForcesDev(),
+		para->getQuadricLimitersDev(),
+		para->getParD(level)->isEvenTimestep);
 	getLastCudaError("LB_Kernel_CumulantK17BulkComp execution failed");
 }
 
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17Bulk/CumulantK17BulkComp_Device.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17Bulk/CumulantK17BulkComp_Device.cu
index cec04116ae4b411b1b3816ff4a8cab606c92491e..b33a3c251b5fb0cde8b1da0fcce097f955353d69 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17Bulk/CumulantK17BulkComp_Device.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17Bulk/CumulantK17BulkComp_Device.cu
@@ -40,63 +40,63 @@ __global__ void LB_Kernel_CumulantK17BulkComp(real omega,
 			Distributions27 D;
 			if (EvenOrOdd == true)
 			{
-				D.f[DIR_P00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_M00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_MPM * size_Mat];
 			}
 			else
 			{
-				D.f[DIR_M00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_P00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_MPM * size_Mat];
 			}
 
 			////////////////////////////////////////////////////////////////////////////////
@@ -129,33 +129,33 @@ __global__ void LB_Kernel_CumulantK17BulkComp(real omega,
 			//unsigned int ktne = k;
 			unsigned int kbsw = neighborZ[ksw];
 			//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-			real mfcbb = (D.f[DIR_P00])[k];//[ke   ];// +  c2over27 ;(D.f[DIR_P00   ])[k  ];//ke
-			real mfabb = (D.f[DIR_M00])[kw];//[kw   ];// +  c2over27 ;(D.f[DIR_M00   ])[kw ];
-			real mfbcb = (D.f[DIR_0P0])[k];//[kn   ];// +  c2over27 ;(D.f[DIR_0P0   ])[k  ];//kn
-			real mfbab = (D.f[DIR_0M0])[ks];//[ks   ];// +  c2over27 ;(D.f[DIR_0M0   ])[ks ];
-			real mfbbc = (D.f[DIR_00P])[k];//[kt   ];// +  c2over27 ;(D.f[DIR_00P   ])[k  ];//kt
-			real mfbba = (D.f[DIR_00M])[kb];//[kb   ];// +  c2over27 ;(D.f[DIR_00M   ])[kb ];
-			real mfccb = (D.f[DIR_PP0])[k];//[kne  ];// +  c1over54 ;(D.f[DIR_PP0  ])[k  ];//kne
-			real mfaab = (D.f[DIR_MM0])[ksw];//[ksw  ];// +  c1over54 ;(D.f[DIR_MM0  ])[ksw];
-			real mfcab = (D.f[DIR_PM0])[ks];//[kse  ];// +  c1over54 ;(D.f[DIR_PM0  ])[ks ];//kse
-			real mfacb = (D.f[DIR_MP0])[kw];//[knw  ];// +  c1over54 ;(D.f[DIR_MP0  ])[kw ];//knw
-			real mfcbc = (D.f[DIR_P0P])[k];//[kte  ];// +  c1over54 ;(D.f[DIR_P0P  ])[k  ];//kte
-			real mfaba = (D.f[DIR_M0M])[kbw];//[kbw  ];// +  c1over54 ;(D.f[DIR_M0M  ])[kbw];
-			real mfcba = (D.f[DIR_P0M])[kb];//[kbe  ];// +  c1over54 ;(D.f[DIR_P0M  ])[kb ];//kbe
-			real mfabc = (D.f[DIR_M0P])[kw];//[ktw  ];// +  c1over54 ;(D.f[DIR_M0P  ])[kw ];//ktw
-			real mfbcc = (D.f[DIR_0PP])[k];//[ktn  ];// +  c1over54 ;(D.f[DIR_0PP  ])[k  ];//ktn
-			real mfbaa = (D.f[DIR_0MM])[kbs];//[kbs  ];// +  c1over54 ;(D.f[DIR_0MM  ])[kbs];
-			real mfbca = (D.f[DIR_0PM])[kb];//[kbn  ];// +  c1over54 ;(D.f[DIR_0PM  ])[kb ];//kbn
-			real mfbac = (D.f[DIR_0MP])[ks];//[kts  ];// +  c1over54 ;(D.f[DIR_0MP  ])[ks ];//kts
+			real mfcbb = (D.f[DIR_P00])[k];//[ke   ];// +  c2over27 ;(D.f[DIR_P00])[k  ];//ke
+			real mfabb = (D.f[DIR_M00])[kw];//[kw   ];// +  c2over27 ;(D.f[DIR_M00])[kw ];
+			real mfbcb = (D.f[DIR_0P0])[k];//[kn   ];// +  c2over27 ;(D.f[DIR_0P0])[k  ];//kn
+			real mfbab = (D.f[DIR_0M0])[ks];//[ks   ];// +  c2over27 ;(D.f[DIR_0M0])[ks ];
+			real mfbbc = (D.f[DIR_00P])[k];//[kt   ];// +  c2over27 ;(D.f[DIR_00P])[k  ];//kt
+			real mfbba = (D.f[DIR_00M])[kb];//[kb   ];// +  c2over27 ;(D.f[DIR_00M])[kb ];
+			real mfccb = (D.f[DIR_PP0])[k];//[kne  ];// +  c1over54 ;(D.f[DIR_PP0])[k  ];//kne
+			real mfaab = (D.f[DIR_MM0])[ksw];//[ksw  ];// +  c1over54 ;(D.f[DIR_MM0])[ksw];
+			real mfcab = (D.f[DIR_PM0])[ks];//[kse  ];// +  c1over54 ;(D.f[DIR_PM0])[ks ];//kse
+			real mfacb = (D.f[DIR_MP0])[kw];//[knw  ];// +  c1over54 ;(D.f[DIR_MP0])[kw ];//knw
+			real mfcbc = (D.f[DIR_P0P])[k];//[kte  ];// +  c1over54 ;(D.f[DIR_P0P])[k  ];//kte
+			real mfaba = (D.f[DIR_M0M])[kbw];//[kbw  ];// +  c1over54 ;(D.f[DIR_M0M])[kbw];
+			real mfcba = (D.f[DIR_P0M])[kb];//[kbe  ];// +  c1over54 ;(D.f[DIR_P0M])[kb ];//kbe
+			real mfabc = (D.f[DIR_M0P])[kw];//[ktw  ];// +  c1over54 ;(D.f[DIR_M0P])[kw ];//ktw
+			real mfbcc = (D.f[DIR_0PP])[k];//[ktn  ];// +  c1over54 ;(D.f[DIR_0PP])[k  ];//ktn
+			real mfbaa = (D.f[DIR_0MM])[kbs];//[kbs  ];// +  c1over54 ;(D.f[DIR_0MM])[kbs];
+			real mfbca = (D.f[DIR_0PM])[kb];//[kbn  ];// +  c1over54 ;(D.f[DIR_0PM])[kb ];//kbn
+			real mfbac = (D.f[DIR_0MP])[ks];//[kts  ];// +  c1over54 ;(D.f[DIR_0MP])[ks ];//kts
 			real mfbbb = (D.f[DIR_000])[k];//[kzero];// +  c8over27 ;(D.f[DIR_000])[k  ];//kzero
-			real mfccc = (D.f[DIR_PPP])[k];//[ktne ];// +  c1over216;(D.f[DIR_PPP ])[k  ];//ktne
-			real mfaac = (D.f[DIR_MMP])[ksw];//[ktsw ];// +  c1over216;(D.f[DIR_MMP ])[ksw];//ktsw
-			real mfcac = (D.f[DIR_PMP])[ks];//[ktse ];// +  c1over216;(D.f[DIR_PMP ])[ks ];//ktse
-			real mfacc = (D.f[DIR_MPP])[kw];//[ktnw ];// +  c1over216;(D.f[DIR_MPP ])[kw ];//ktnw
-			real mfcca = (D.f[DIR_PPM])[kb];//[kbne ];// +  c1over216;(D.f[DIR_PPM ])[kb ];//kbne
-			real mfaaa = (D.f[DIR_MMM])[kbsw];//[kbsw ];// +  c1over216;(D.f[DIR_MMM ])[kbsw];
-			real mfcaa = (D.f[DIR_PMM])[kbs];//[kbse ];// +  c1over216;(D.f[DIR_PMM ])[kbs];//kbse
-			real mfaca = (D.f[DIR_MPM])[kbw];//[kbnw ];// +  c1over216;(D.f[DIR_MPM ])[kbw];//kbnw
+			real mfccc = (D.f[DIR_PPP])[k];//[ktne ];// +  c1over216;(D.f[DIR_PPP])[k  ];//ktne
+			real mfaac = (D.f[DIR_MMP])[ksw];//[ktsw ];// +  c1over216;(D.f[DIR_MMP])[ksw];//ktsw
+			real mfcac = (D.f[DIR_PMP])[ks];//[ktse ];// +  c1over216;(D.f[DIR_PMP])[ks ];//ktse
+			real mfacc = (D.f[DIR_MPP])[kw];//[ktnw ];// +  c1over216;(D.f[DIR_MPP])[kw ];//ktnw
+			real mfcca = (D.f[DIR_PPM])[kb];//[kbne ];// +  c1over216;(D.f[DIR_PPM])[kb ];//kbne
+			real mfaaa = (D.f[DIR_MMM])[kbsw];//[kbsw ];// +  c1over216;(D.f[DIR_MMM])[kbsw];
+			real mfcaa = (D.f[DIR_PMM])[kbs];//[kbse ];// +  c1over216;(D.f[DIR_PMM])[kbs];//kbse
+			real mfaca = (D.f[DIR_MPM])[kbw];//[kbnw ];// +  c1over216;(D.f[DIR_MPM])[kbw];//kbnw
 											////////////////////////////////////////////////////////////////////////////////////
 			real drho = ((((mfccc + mfaaa) + (mfaca + mfcac)) + ((mfacc + mfcaa) + (mfaac + mfcca))) +
 				(((mfbac + mfbca) + (mfbaa + mfbcc)) + ((mfabc + mfcba) + (mfaba + mfcbc)) + ((mfacb + mfcab) + (mfaab + mfccb))) +
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17Unified/CumulantK17Unified.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17Unified/CumulantK17Unified.cu
index 6ef6b40d3b7079579f54ca68734deb274d0c1c3a..295804887f9c451120d463c7fcdd968bd2f24d12 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17Unified/CumulantK17Unified.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17Unified/CumulantK17Unified.cu
@@ -31,15 +31,16 @@ CumulantK17Unified::CumulantK17Unified(std::shared_ptr<Parameter> para, int leve
 
 void CumulantK17Unified::run()
 {
-    GPUKernelParameter kernelParameter{ para->getParD(level)->omega,
-                                                 para->getParD(level)->typeOfGridNode,
-                                                 para->getParD(level)->neighborX,
-                                                 para->getParD(level)->neighborY,
-                                                 para->getParD(level)->neighborZ,
-                                                 para->getParD(level)->distributions.f[0],
-                                                 (int)para->getParD(level)->numberOfNodes,
-                                                 para->getParD(level)->forcing,
-                                                 para->getParD(level)->isEvenTimestep };
+    GPUKernelParameter kernelParameter{
+        para->getParD(level)->omega,
+        para->getParD(level)->typeOfGridNode,
+        para->getParD(level)->neighborX,
+        para->getParD(level)->neighborY,
+        para->getParD(level)->neighborZ,
+        para->getParD(level)->distributions.f[0],
+        (int)para->getParD(level)->numberOfNodes,
+        para->getParD(level)->forcing,
+        para->getParD(level)->isEvenTimestep };
 
     auto lambda = [] __device__(lbm::KernelParameter parameter) {
         return lbm::cumulantChimera(parameter, lbm::setRelaxationRatesK17);
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chim/CumulantK17CompChim_Device.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chim/CumulantK17CompChim_Device.cu
index 3eea267e55fee45111fb11cf1258559e2c3c63f2..a0db78d27b00372feab8490111183481abbec8b9 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chim/CumulantK17CompChim_Device.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chim/CumulantK17CompChim_Device.cu
@@ -33,11 +33,12 @@
 /* Device code */
 #include "LBM/LB.h" 
 #include "lbm/constants/D3Q27.h"
-#include <lbm/constants/NumericConstants.h>
+#include "lbm/constants/NumericConstants.h"
+#include "LBM/GPUHelperFunctions/ChimeraTransformation.h"
 
 using namespace vf::lbm::constant;
 using namespace vf::lbm::dir;
-#include "Kernel/Utilities/ChimeraTransformation.h"
+using namespace vf::gpu;
 
 ////////////////////////////////////////////////////////////////////////////////
 __global__ void LB_Kernel_CumulantK17CompChim(
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimRedesigned/CumulantK17CompChimRedesigned.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimRedesigned/CumulantK17CompChimRedesigned.cu
deleted file mode 100644
index 8c06b7117c8b1ef62b932a76bf5de0be2ae99b1c..0000000000000000000000000000000000000000
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimRedesigned/CumulantK17CompChimRedesigned.cu
+++ /dev/null
@@ -1,61 +0,0 @@
-#include "CumulantK17CompChimRedesigned.h"
-
-#include "Parameter/Parameter.h"
-#include "Parameter/CudaStreamManager.h"
-#include "CumulantK17CompChimRedesigned_Device.cuh"
-
-#include <cuda.h>
-
-std::shared_ptr<CumulantK17CompChimRedesigned> CumulantK17CompChimRedesigned::getNewInstance(std::shared_ptr<Parameter> para,
-                                                                               int level)
-{
-    return std::shared_ptr<CumulantK17CompChimRedesigned>(new CumulantK17CompChimRedesigned(para, level));
-}
-
-void CumulantK17CompChimRedesigned::run()
-{
-    LB_Kernel_CumulantK17CompChimRedesigned <<< cudaGrid.grid, cudaGrid.threads >>>(
-        para->getParD(level)->omega,
-        para->getParD(level)->neighborX,
-        para->getParD(level)->neighborY,
-        para->getParD(level)->neighborZ,
-        para->getParD(level)->distributions.f[0],
-        para->getParD(level)->numberOfNodes,
-        level,
-        para->getForcesDev(),
-        para->getQuadricLimitersDev(),
-        para->getParD(level)->isEvenTimestep,
-        para->getParD(level)->fluidNodeIndices,
-        para->getParD(level)->numberOfFluidNodes);
-    getLastCudaError("LB_Kernel_CumulantK17CompChim execution failed");
-}
-
-void CumulantK17CompChimRedesigned::runOnIndices(const unsigned int *indices, unsigned int size_indices, int streamIndex)
-{
-    cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex);
-
-    LB_Kernel_CumulantK17CompChimRedesigned<<< cudaGrid.grid, cudaGrid.threads, 0, stream>>>(
-        para->getParD(level)->omega, 
-        para->getParD(level)->neighborX, 
-        para->getParD(level)->neighborY,
-        para->getParD(level)->neighborZ, 
-        para->getParD(level)->distributions.f[0], 
-        para->getParD(level)->numberOfNodes, 
-        level,
-        para->getForcesDev(), 
-        para->getQuadricLimitersDev(),
-        para->getParD(level)->isEvenTimestep,
-        indices,
-        size_indices);
-    getLastCudaError("LB_Kernel_CumulantK17CompChim execution failed");
-    
-}
-
-CumulantK17CompChimRedesigned::CumulantK17CompChimRedesigned(std::shared_ptr<Parameter> para, int level): KernelImp(para, level)
-{
-    myPreProcessorTypes.push_back(InitCompSP27);
-    myKernelGroup = BasicKernel;
-    this->cudaGrid = vf::cuda::CudaGrid(para->getParD(level)->numberofthreads, para->getParD(level)->numberOfNodes);
-    this->kernelUsesFluidNodeIndices = true;
-}
-
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimRedesigned/CumulantK17CompChimRedesigned.h b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimRedesigned/CumulantK17CompChimRedesigned.h
deleted file mode 100644
index 4658075de330665fdba88a5ec8149a9b476d5ac7..0000000000000000000000000000000000000000
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimRedesigned/CumulantK17CompChimRedesigned.h
+++ /dev/null
@@ -1,18 +0,0 @@
-#ifndef CUMULANT_K17_COMP_CHIM_REDESIGN_H
-#define CUMULANT_K17_COMP_CHIM_REDESIGN_H
-
-#include "Kernel/KernelImp.h"
-
-class CumulantK17CompChimRedesigned : public KernelImp
-{
-public:
-    static std::shared_ptr<CumulantK17CompChimRedesigned> getNewInstance(std::shared_ptr<Parameter> para, int level);
-	void run() override;
-    void runOnIndices(const unsigned int *indices, unsigned int size_indices, int stream = -1) override;
-
-private:
-    CumulantK17CompChimRedesigned();
-    CumulantK17CompChimRedesigned(std::shared_ptr<Parameter> para, int level);
-};
-
-#endif 
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimRedesigned/CumulantK17CompChimRedesigned_Device.cuh b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimRedesigned/CumulantK17CompChimRedesigned_Device.cuh
deleted file mode 100644
index 00628efc76447a09504d2fd32a26a63a4d611c66..0000000000000000000000000000000000000000
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimRedesigned/CumulantK17CompChimRedesigned_Device.cuh
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef LB_Kernel_CUMULANT_K17_COMP_CHIM_REDESIGN_H
-#define LB_Kernel_CUMULANT_K17_COMP_CHIM_REDESIGN_H
-
-#include <DataTypes.h>
-#include <curand.h>
-
-__global__ void LB_Kernel_CumulantK17CompChimRedesigned(
-    real omega,
-    uint* neighborX,
-    uint* neighborY,
-    uint* neighborZ,
-    real* distributions,
-    unsigned long numberOfLBnodes,
-    int level,
-    real* forces,
-    real* quadricLimiters,
-    bool isEvenTimestep,
-    const uint* fluidNodeIndices,
-    uint numberOfFluidNodes);
-#endif
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimStream/CumulantK17CompChimStream.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimStream/CumulantK17CompChimStream.cu
deleted file mode 100644
index 6fae9f6d4845019afd363790eea0ee17c69a060f..0000000000000000000000000000000000000000
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimStream/CumulantK17CompChimStream.cu
+++ /dev/null
@@ -1,61 +0,0 @@
-#include "CumulantK17CompChimStream.h"
-
-#include "Parameter/Parameter.h"
-#include "Parameter/CudaStreamManager.h"
-#include "CumulantK17CompChimStream_Device.cuh"
-
-#include <cuda.h>
-
-std::shared_ptr<CumulantK17CompChimStream> CumulantK17CompChimStream::getNewInstance(std::shared_ptr<Parameter> para,
-                                                                               int level)
-{
-    return std::shared_ptr<CumulantK17CompChimStream>(new CumulantK17CompChimStream(para, level));
-}
-
-void CumulantK17CompChimStream::run()
-{
-    LB_Kernel_CumulantK17CompChimStream <<< cudaGrid.grid, cudaGrid.threads >>>(
-        para->getParD(level)->omega,
-        para->getParD(level)->neighborX,
-        para->getParD(level)->neighborY,
-        para->getParD(level)->neighborZ,
-        para->getParD(level)->distributions.f[0],
-        para->getParD(level)->numberOfNodes,
-        level,
-        para->getForcesDev(),
-        para->getQuadricLimitersDev(),
-        para->getParD(level)->isEvenTimestep,
-        para->getParD(level)->fluidNodeIndices,
-        para->getParD(level)->numberOfFluidNodes);
-    getLastCudaError("LB_Kernel_CumulantK17CompChim execution failed");
-}
-
-void CumulantK17CompChimStream::runOnIndices(const unsigned int *indices, unsigned int size_indices, int streamIndex)
-{
-    cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex);
-
-    LB_Kernel_CumulantK17CompChimStream<<< cudaGrid.grid, cudaGrid.threads, 0, stream>>>(
-        para->getParD(level)->omega, 
-        para->getParD(level)->neighborX, 
-        para->getParD(level)->neighborY,
-        para->getParD(level)->neighborZ, 
-        para->getParD(level)->distributions.f[0], 
-        para->getParD(level)->numberOfNodes, 
-        level,
-        para->getForcesDev(), 
-        para->getQuadricLimitersDev(), 
-        para->getParD(level)->isEvenTimestep,
-        indices,
-        size_indices);
-    getLastCudaError("LB_Kernel_CumulantK17CompChim execution failed");
-    
-}
-
-CumulantK17CompChimStream::CumulantK17CompChimStream(std::shared_ptr<Parameter> para, int level): KernelImp(para, level)
-{
-    myPreProcessorTypes.push_back(InitCompSP27);
-    myKernelGroup = BasicKernel;
-    this->cudaGrid = vf::cuda::CudaGrid(para->getParD(level)->numberofthreads, para->getParD(level)->numberOfNodes);
-    this->kernelUsesFluidNodeIndices = true;
-}
-
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimStream/CumulantK17CompChimStream.h b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimStream/CumulantK17CompChimStream.h
deleted file mode 100644
index 325826e04c893b7c56b7f00bb2503a4eb1fda441..0000000000000000000000000000000000000000
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimStream/CumulantK17CompChimStream.h
+++ /dev/null
@@ -1,18 +0,0 @@
-#ifndef CUMULANT_K17_COMP_CHIM_SPARSE_H
-#define CUMULANT_K17_COMP_CHIM_SPARSE_H
-
-#include "Kernel/KernelImp.h"
-
-class CumulantK17CompChimStream : public KernelImp
-{
-public:
-    static std::shared_ptr<CumulantK17CompChimStream> getNewInstance(std::shared_ptr<Parameter> para, int level);
-	void run() override;
-    void runOnIndices(const unsigned int *indices, unsigned int size_indices, int stream = -1) override;
-
-private:
-    CumulantK17CompChimStream();
-    CumulantK17CompChimStream(std::shared_ptr<Parameter> para, int level);
-};
-
-#endif 
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimStream/CumulantK17CompChimStreamDevice.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimStream/CumulantK17CompChimStreamDevice.cu
deleted file mode 100644
index 830fcc6c328f2ecd0f626539040868696065065f..0000000000000000000000000000000000000000
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimStream/CumulantK17CompChimStreamDevice.cu
+++ /dev/null
@@ -1,640 +0,0 @@
-//=======================================================================================
-// ____          ____    __    ______     __________   __      __       __        __         
-// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |        
-//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |        
-//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |        
-//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____    
-//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|   
-//      \    \  |    |   ________________________________________________________________    
-//       \    \ |    |  |  ______________________________________________________________|   
-//        \    \|    |  |  |         __          __     __     __     ______      _______    
-//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)   
-//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______    
-//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
-//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/   
-//
-//  This file is part of VirtualFluids. VirtualFluids is free software: you can 
-//  redistribute it and/or modify it under the terms of the GNU General Public
-//  License as published by the Free Software Foundation, either version 3 of 
-//  the License, or (at your option) any later version.
-//  
-//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT 
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
-//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
-//  for more details.
-//  
-//  You should have received a copy of the GNU General Public License along
-//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
-//
-//! \file Cumulant27chimStream.cu
-//! \ingroup GPU
-//! \author Martin Schoenherr, Anna Wellmann
-//=======================================================================================
-/* Device code */
-#include "LBM/LB.h" 
-#include "lbm/constants/D3Q27.h"
-#include <lbm/constants/NumericConstants.h>
-
-using namespace vf::lbm::constant;
-using namespace vf::lbm::dir;
-#include "Kernel/Utilities/ChimeraTransformation.h"
-
-////////////////////////////////////////////////////////////////////////////////
-__global__ void LB_Kernel_CumulantK17CompChimStream(
-	real omega,
-	uint* neighborX,
-	uint* neighborY,
-	uint* neighborZ,
-	real* distributions,
-	unsigned long size_Mat,
-	int level,
-	real* forces,
-	real* quadricLimiters,
-	bool isEvenTimestep,
-    const uint *fluidNodeIndices, 
-    uint numberOfFluidNodes)
-{
-    //////////////////////////////////////////////////////////////////////////
-    //! Cumulant K17 Kernel is based on \ref
-    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040
-    //! ]</b></a> and \ref <a href="https://doi.org/10.1016/j.jcp.2017.07.004"><b>[ M. Geier et al. (2017),
-    //! DOI:10.1016/j.jcp.2017.07.004 ]</b></a>
-    //!
-    //! The cumulant kernel is executed in the following steps
-    //!
-    ////////////////////////////////////////////////////////////////////////////////
-    //! - Get node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
-    //!
-    const unsigned x = threadIdx.x;
-    const unsigned y = blockIdx.x;
-    const unsigned z = blockIdx.y;
-
-    const unsigned nx = blockDim.x;
-    const unsigned ny = gridDim.x;
-
-    const unsigned k_thread = nx * (ny * z + y) + x;
-
-    //////////////////////////////////////////////////////////////////////////
-    // run for all indices in fluidNodeIndices
-    if (k_thread < numberOfFluidNodes) {
-        //////////////////////////////////////////////////////////////////////////
-        //! - Read distributions: style of reading and writing the distributions from/to stored arrays dependent on
-        //! timestep is based on the esoteric twist algorithm \ref <a
-        //! href="https://doi.org/10.3390/computation5020019"><b>[ M. Geier et al. (2017),
-        //! DOI:10.3390/computation5020019 ]</b></a>
-
-        const unsigned k = fluidNodeIndices[k_thread];
-
-        Distributions27 dist;
-        if (isEvenTimestep) {
-            dist.f[DIR_P00]    = &distributions[DIR_P00 * size_Mat];
-            dist.f[DIR_M00]    = &distributions[DIR_M00 * size_Mat];
-            dist.f[DIR_0P0]    = &distributions[DIR_0P0 * size_Mat];
-            dist.f[DIR_0M0]    = &distributions[DIR_0M0 * size_Mat];
-            dist.f[DIR_00P]    = &distributions[DIR_00P * size_Mat];
-            dist.f[DIR_00M]    = &distributions[DIR_00M * size_Mat];
-            dist.f[DIR_PP0]   = &distributions[DIR_PP0 * size_Mat];
-            dist.f[DIR_MM0]   = &distributions[DIR_MM0 * size_Mat];
-            dist.f[DIR_PM0]   = &distributions[DIR_PM0 * size_Mat];
-            dist.f[DIR_MP0]   = &distributions[DIR_MP0 * size_Mat];
-            dist.f[DIR_P0P]   = &distributions[DIR_P0P * size_Mat];
-            dist.f[DIR_M0M]   = &distributions[DIR_M0M * size_Mat];
-            dist.f[DIR_P0M]   = &distributions[DIR_P0M * size_Mat];
-            dist.f[DIR_M0P]   = &distributions[DIR_M0P * size_Mat];
-            dist.f[DIR_0PP]   = &distributions[DIR_0PP * size_Mat];
-            dist.f[DIR_0MM]   = &distributions[DIR_0MM * size_Mat];
-            dist.f[DIR_0PM]   = &distributions[DIR_0PM * size_Mat];
-            dist.f[DIR_0MP]   = &distributions[DIR_0MP * size_Mat];
-            dist.f[DIR_000] = &distributions[DIR_000 * size_Mat];
-            dist.f[DIR_PPP]  = &distributions[DIR_PPP * size_Mat];
-            dist.f[DIR_MMP]  = &distributions[DIR_MMP * size_Mat];
-            dist.f[DIR_PMP]  = &distributions[DIR_PMP * size_Mat];
-            dist.f[DIR_MPP]  = &distributions[DIR_MPP * size_Mat];
-            dist.f[DIR_PPM]  = &distributions[DIR_PPM * size_Mat];
-            dist.f[DIR_MMM]  = &distributions[DIR_MMM * size_Mat];
-            dist.f[DIR_PMM]  = &distributions[DIR_PMM * size_Mat];
-            dist.f[DIR_MPM]  = &distributions[DIR_MPM * size_Mat];
-        } else {
-            dist.f[DIR_M00]    = &distributions[DIR_P00 * size_Mat];
-            dist.f[DIR_P00]    = &distributions[DIR_M00 * size_Mat];
-            dist.f[DIR_0M0]    = &distributions[DIR_0P0 * size_Mat];
-            dist.f[DIR_0P0]    = &distributions[DIR_0M0 * size_Mat];
-            dist.f[DIR_00M]    = &distributions[DIR_00P * size_Mat];
-            dist.f[DIR_00P]    = &distributions[DIR_00M * size_Mat];
-            dist.f[DIR_MM0]   = &distributions[DIR_PP0 * size_Mat];
-            dist.f[DIR_PP0]   = &distributions[DIR_MM0 * size_Mat];
-            dist.f[DIR_MP0]   = &distributions[DIR_PM0 * size_Mat];
-            dist.f[DIR_PM0]   = &distributions[DIR_MP0 * size_Mat];
-            dist.f[DIR_M0M]   = &distributions[DIR_P0P * size_Mat];
-            dist.f[DIR_P0P]   = &distributions[DIR_M0M * size_Mat];
-            dist.f[DIR_M0P]   = &distributions[DIR_P0M * size_Mat];
-            dist.f[DIR_P0M]   = &distributions[DIR_M0P * size_Mat];
-            dist.f[DIR_0MM]   = &distributions[DIR_0PP * size_Mat];
-            dist.f[DIR_0PP]   = &distributions[DIR_0MM * size_Mat];
-            dist.f[DIR_0MP]   = &distributions[DIR_0PM * size_Mat];
-            dist.f[DIR_0PM]   = &distributions[DIR_0MP * size_Mat];
-            dist.f[DIR_000] = &distributions[DIR_000 * size_Mat];
-            dist.f[DIR_MMM]  = &distributions[DIR_PPP * size_Mat];
-            dist.f[DIR_PPM]  = &distributions[DIR_MMP * size_Mat];
-            dist.f[DIR_MPM]  = &distributions[DIR_PMP * size_Mat];
-            dist.f[DIR_PMM]  = &distributions[DIR_MPP * size_Mat];
-            dist.f[DIR_MMP]  = &distributions[DIR_PPM * size_Mat];
-            dist.f[DIR_PPP]  = &distributions[DIR_MMM * size_Mat];
-            dist.f[DIR_MPP]  = &distributions[DIR_PMM * size_Mat];
-            dist.f[DIR_PMP]  = &distributions[DIR_MPM * size_Mat];
-        }
-        ////////////////////////////////////////////////////////////////////////////////
-        //! - Set neighbor indices (necessary for indirect addressing)
-        uint kw   = neighborX[k];
-        uint ks   = neighborY[k];
-        uint kb   = neighborZ[k];
-        uint ksw  = neighborY[kw];
-        uint kbw  = neighborZ[kw];
-        uint kbs  = neighborZ[ks];
-        uint kbsw = neighborZ[ksw];
-        ////////////////////////////////////////////////////////////////////////////////////
-        //! - Set local distributions
-        //!
-        real mfcbb = (dist.f[DIR_P00])[k];
-        real mfabb = (dist.f[DIR_M00])[kw];
-        real mfbcb = (dist.f[DIR_0P0])[k];
-        real mfbab = (dist.f[DIR_0M0])[ks];
-        real mfbbc = (dist.f[DIR_00P])[k];
-        real mfbba = (dist.f[DIR_00M])[kb];
-        real mfccb = (dist.f[DIR_PP0])[k];
-        real mfaab = (dist.f[DIR_MM0])[ksw];
-        real mfcab = (dist.f[DIR_PM0])[ks];
-        real mfacb = (dist.f[DIR_MP0])[kw];
-        real mfcbc = (dist.f[DIR_P0P])[k];
-        real mfaba = (dist.f[DIR_M0M])[kbw];
-        real mfcba = (dist.f[DIR_P0M])[kb];
-        real mfabc = (dist.f[DIR_M0P])[kw];
-        real mfbcc = (dist.f[DIR_0PP])[k];
-        real mfbaa = (dist.f[DIR_0MM])[kbs];
-        real mfbca = (dist.f[DIR_0PM])[kb];
-        real mfbac = (dist.f[DIR_0MP])[ks];
-        real mfbbb = (dist.f[DIR_000])[k];
-        real mfccc = (dist.f[DIR_PPP])[k];
-        real mfaac = (dist.f[DIR_MMP])[ksw];
-        real mfcac = (dist.f[DIR_PMP])[ks];
-        real mfacc = (dist.f[DIR_MPP])[kw];
-        real mfcca = (dist.f[DIR_PPM])[kb];
-        real mfaaa = (dist.f[DIR_MMM])[kbsw];
-        real mfcaa = (dist.f[DIR_PMM])[kbs];
-        real mfaca = (dist.f[DIR_MPM])[kbw];
-        ////////////////////////////////////////////////////////////////////////////////////
-        //! - Calculate density and velocity using pyramid summation for low round-off errors as in Eq. (J1)-(J3) \ref
-        //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015),
-        //! DOI:10.1016/j.camwa.2015.05.001 ]</b></a>
-        //!
-        real drho = ((((mfccc + mfaaa) + (mfaca + mfcac)) + ((mfacc + mfcaa) + (mfaac + mfcca))) +
-                     (((mfbac + mfbca) + (mfbaa + mfbcc)) + ((mfabc + mfcba) + (mfaba + mfcbc)) +
-                      ((mfacb + mfcab) + (mfaab + mfccb))) +
-                     ((mfabb + mfcbb) + (mfbab + mfbcb) + (mfbba + mfbbc))) +
-                    mfbbb;
-
-        real rho   = c1o1 + drho;
-        real OOrho = c1o1 / rho;
-
-        real vvx = ((((mfccc - mfaaa) + (mfcac - mfaca)) + ((mfcaa - mfacc) + (mfcca - mfaac))) +
-                    (((mfcba - mfabc) + (mfcbc - mfaba)) + ((mfcab - mfacb) + (mfccb - mfaab))) + (mfcbb - mfabb)) *
-                   OOrho;
-        real vvy = ((((mfccc - mfaaa) + (mfaca - mfcac)) + ((mfacc - mfcaa) + (mfcca - mfaac))) +
-                    (((mfbca - mfbac) + (mfbcc - mfbaa)) + ((mfacb - mfcab) + (mfccb - mfaab))) + (mfbcb - mfbab)) *
-                   OOrho;
-        real vvz = ((((mfccc - mfaaa) + (mfcac - mfaca)) + ((mfacc - mfcaa) + (mfaac - mfcca))) +
-                    (((mfbac - mfbca) + (mfbcc - mfbaa)) + ((mfabc - mfcba) + (mfcbc - mfaba))) + (mfbbc - mfbba)) *
-                   OOrho;
-        ////////////////////////////////////////////////////////////////////////////////////
-        //! - Add half of the acceleration (body force) to the velocity as in Eq. (42) \ref
-        //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015),
-        //! DOI:10.1016/j.camwa.2015.05.001 ]</b></a>
-        //!
-        real factor = c1o1;
-        for (size_t i = 1; i <= level; i++) {
-            factor *= c2o1;
-        }
-        real fx = forces[0] / factor;
-        real fy = forces[1] / factor;
-        real fz = forces[2] / factor;
-        vvx += fx * c1o2;
-        vvy += fy * c1o2;
-        vvz += fz * c1o2;
-        ////////////////////////////////////////////////////////////////////////////////////
-        // calculate the square of velocities for this lattice node
-        real vx2 = vvx * vvx;
-        real vy2 = vvy * vvy;
-        real vz2 = vvz * vvz;
-        ////////////////////////////////////////////////////////////////////////////////////
-        //! - Set relaxation limiters for third order cumulants to default value \f$ \lambda=0.001 \f$ according to
-        //! section 6 in \ref <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017),
-        //! DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
-        //!
-        real wadjust;
-        real qudricLimitP = quadricLimiters[0];
-        real qudricLimitM = quadricLimiters[1];
-        real qudricLimitD = quadricLimiters[2];
-        ////////////////////////////////////////////////////////////////////////////////////
-        //! - Chimera transform from well conditioned distributions to central moments as defined in Appendix J in \ref
-        //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015),
-        //! DOI:10.1016/j.camwa.2015.05.001 ]</b></a> see also Eq. (6)-(14) in \ref <a
-        //! href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040
-        //! ]</b></a>
-        //!
-        ////////////////////////////////////////////////////////////////////////////////////
-        // Z - Dir
-        forwardInverseChimeraWithK(mfaaa, mfaab, mfaac, vvz, vz2, c36o1, c1o36);
-        forwardInverseChimeraWithK(mfaba, mfabb, mfabc, vvz, vz2, c9o1, c1o9);
-        forwardInverseChimeraWithK(mfaca, mfacb, mfacc, vvz, vz2, c36o1, c1o36);
-        forwardInverseChimeraWithK(mfbaa, mfbab, mfbac, vvz, vz2, c9o1, c1o9);
-        forwardInverseChimeraWithK(mfbba, mfbbb, mfbbc, vvz, vz2, c9o4, c4o9);
-        forwardInverseChimeraWithK(mfbca, mfbcb, mfbcc, vvz, vz2, c9o1, c1o9);
-        forwardInverseChimeraWithK(mfcaa, mfcab, mfcac, vvz, vz2, c36o1, c1o36);
-        forwardInverseChimeraWithK(mfcba, mfcbb, mfcbc, vvz, vz2, c9o1, c1o9);
-        forwardInverseChimeraWithK(mfcca, mfccb, mfccc, vvz, vz2, c36o1, c1o36);
-
-        ////////////////////////////////////////////////////////////////////////////////////
-        // Y - Dir
-        forwardInverseChimeraWithK(mfaaa, mfaba, mfaca, vvy, vy2, c6o1, c1o6);
-        forwardChimera(mfaab, mfabb, mfacb, vvy, vy2);
-        forwardInverseChimeraWithK(mfaac, mfabc, mfacc, vvy, vy2, c18o1, c1o18);
-        forwardInverseChimeraWithK(mfbaa, mfbba, mfbca, vvy, vy2, c3o2, c2o3);
-        forwardChimera(mfbab, mfbbb, mfbcb, vvy, vy2);
-        forwardInverseChimeraWithK(mfbac, mfbbc, mfbcc, vvy, vy2, c9o2, c2o9);
-        forwardInverseChimeraWithK(mfcaa, mfcba, mfcca, vvy, vy2, c6o1, c1o6);
-        forwardChimera(mfcab, mfcbb, mfccb, vvy, vy2);
-        forwardInverseChimeraWithK(mfcac, mfcbc, mfccc, vvy, vy2, c18o1, c1o18);
-
-        ////////////////////////////////////////////////////////////////////////////////////
-        // X - Dir
-        forwardInverseChimeraWithK(mfaaa, mfbaa, mfcaa, vvx, vx2, c1o1, c1o1);
-        forwardChimera(mfaba, mfbba, mfcba, vvx, vx2);
-        forwardInverseChimeraWithK(mfaca, mfbca, mfcca, vvx, vx2, c3o1, c1o3);
-        forwardChimera(mfaab, mfbab, mfcab, vvx, vx2);
-        forwardChimera(mfabb, mfbbb, mfcbb, vvx, vx2);
-        forwardChimera(mfacb, mfbcb, mfccb, vvx, vx2);
-        forwardInverseChimeraWithK(mfaac, mfbac, mfcac, vvx, vx2, c3o1, c1o3);
-        forwardChimera(mfabc, mfbbc, mfcbc, vvx, vx2);
-        forwardInverseChimeraWithK(mfacc, mfbcc, mfccc, vvx, vx2, c3o1, c1o9);
-
-        ////////////////////////////////////////////////////////////////////////////////////
-        //! - Setting relaxation rates for non-hydrodynamic cumulants (default values). Variable names and equations
-        //! according to <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017),
-        //! DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
-        //!  => [NAME IN PAPER]=[NAME IN CODE]=[DEFAULT VALUE].
-        //!  - Trace of second order cumulants \f$ C_{200}+C_{020}+C_{002} \f$ used to adjust bulk
-        //!  viscosity:\f$\omega_2=OxxPyyPzz=1.0 \f$.
-        //!  - Third order cumulants \f$ C_{120}+C_{102}, C_{210}+C_{012}, C_{201}+C_{021} \f$: \f$ \omega_3=OxyyPxzz
-        //!  \f$ set according to Eq. (111) with simplifications assuming \f$ \omega_2=1.0\f$.
-        //!  - Third order cumulants \f$ C_{120}-C_{102}, C_{210}-C_{012}, C_{201}-C_{021} \f$: \f$ \omega_4 = OxyyMxzz
-        //!  \f$ set according to Eq. (112) with simplifications assuming \f$ \omega_2 = 1.0\f$.
-        //!  - Third order cumulants \f$ C_{111} \f$: \f$ \omega_5 = Oxyz \f$ set according to Eq. (113) with
-        //!  simplifications assuming \f$ \omega_2 = 1.0\f$  (modify for different bulk viscosity).
-        //!  - Fourth order cumulants \f$ C_{220}, C_{202}, C_{022}, C_{211}, C_{121}, C_{112} \f$: for simplification
-        //!  all set to the same default value \f$ \omega_6=\omega_7=\omega_8=O4=1.0 \f$.
-        //!  - Fifth order cumulants \f$ C_{221}, C_{212}, C_{122}\f$: \f$\omega_9=O5=1.0\f$.
-        //!  - Sixth order cumulant \f$ C_{222}\f$: \f$\omega_{10}=O6=1.0\f$.
-        //!
-        ////////////////////////////////////////////////////////////
-        // 2.
-        real OxxPyyPzz = c1o1;
-        ////////////////////////////////////////////////////////////
-        // 3.
-        real OxyyPxzz = c8o1 * (-c2o1 + omega) * (c1o1 + c2o1 * omega) / (-c8o1 - c14o1 * omega + c7o1 * omega * omega);
-        real OxyyMxzz =
-            c8o1 * (-c2o1 + omega) * (-c7o1 + c4o1 * omega) / (c56o1 - c50o1 * omega + c9o1 * omega * omega);
-        real Oxyz = c24o1 * (-c2o1 + omega) * (-c2o1 - c7o1 * omega + c3o1 * omega * omega) /
-                    (c48o1 + c152o1 * omega - c130o1 * omega * omega + c29o1 * omega * omega * omega);
-        ////////////////////////////////////////////////////////////
-        // 4.
-        real O4 = c1o1;
-        ////////////////////////////////////////////////////////////
-        // 5.
-        real O5 = c1o1;
-        ////////////////////////////////////////////////////////////
-        // 6.
-        real O6 = c1o1;
-
-        ////////////////////////////////////////////////////////////////////////////////////
-        //! - A and DIR_00M: parameters for fourth order convergence of the diffusion term according to Eq. (115) and (116)
-        //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017),
-        //! DOI:10.1016/j.jcp.2017.05.040 ]</b></a> with simplifications assuming \f$ \omega_2 = 1.0 \f$ (modify for
-        //! different bulk viscosity).
-        //!
-        real factorA = (c4o1 + c2o1 * omega - c3o1 * omega * omega) / (c2o1 - c7o1 * omega + c5o1 * omega * omega);
-        real factorB = (c4o1 + c28o1 * omega - c14o1 * omega * omega) / (c6o1 - c21o1 * omega + c15o1 * omega * omega);
-
-        ////////////////////////////////////////////////////////////////////////////////////
-        //! - Compute cumulants from central moments according to Eq. (20)-(23) in
-        //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017),
-        //! DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
-        //!
-        ////////////////////////////////////////////////////////////
-        // 4.
-        real CUMcbb = mfcbb - ((mfcaa + c1o3) * mfabb + c2o1 * mfbba * mfbab) * OOrho;
-        real CUMbcb = mfbcb - ((mfaca + c1o3) * mfbab + c2o1 * mfbba * mfabb) * OOrho;
-        real CUMbbc = mfbbc - ((mfaac + c1o3) * mfbba + c2o1 * mfbab * mfabb) * OOrho;
-
-        real CUMcca =
-            mfcca - (((mfcaa * mfaca + c2o1 * mfbba * mfbba) + c1o3 * (mfcaa + mfaca)) * OOrho - c1o9 * (drho * OOrho));
-        real CUMcac =
-            mfcac - (((mfcaa * mfaac + c2o1 * mfbab * mfbab) + c1o3 * (mfcaa + mfaac)) * OOrho - c1o9 * (drho * OOrho));
-        real CUMacc =
-            mfacc - (((mfaac * mfaca + c2o1 * mfabb * mfabb) + c1o3 * (mfaac + mfaca)) * OOrho - c1o9 * (drho * OOrho));
-        ////////////////////////////////////////////////////////////
-        // 5.
-        real CUMbcc =
-            mfbcc - ((mfaac * mfbca + mfaca * mfbac + c4o1 * mfabb * mfbbb + c2o1 * (mfbab * mfacb + mfbba * mfabc)) +
-                     c1o3 * (mfbca + mfbac)) *
-                        OOrho;
-        real CUMcbc =
-            mfcbc - ((mfaac * mfcba + mfcaa * mfabc + c4o1 * mfbab * mfbbb + c2o1 * (mfabb * mfcab + mfbba * mfbac)) +
-                     c1o3 * (mfcba + mfabc)) *
-                        OOrho;
-        real CUMccb =
-            mfccb - ((mfcaa * mfacb + mfaca * mfcab + c4o1 * mfbba * mfbbb + c2o1 * (mfbab * mfbca + mfabb * mfcba)) +
-                     c1o3 * (mfacb + mfcab)) *
-                        OOrho;
-        ////////////////////////////////////////////////////////////
-        // 6.
-        real CUMccc = mfccc + ((-c4o1 * mfbbb * mfbbb - (mfcaa * mfacc + mfaca * mfcac + mfaac * mfcca) -
-                                c4o1 * (mfabb * mfcbb + mfbab * mfbcb + mfbba * mfbbc) -
-                                c2o1 * (mfbca * mfbac + mfcba * mfabc + mfcab * mfacb)) *
-                                   OOrho +
-                               (c4o1 * (mfbab * mfbab * mfaca + mfabb * mfabb * mfcaa + mfbba * mfbba * mfaac) +
-                                c2o1 * (mfcaa * mfaca * mfaac) + c16o1 * mfbba * mfbab * mfabb) *
-                                   OOrho * OOrho -
-                               c1o3 * (mfacc + mfcac + mfcca) * OOrho - c1o9 * (mfcaa + mfaca + mfaac) * OOrho +
-                               (c2o1 * (mfbab * mfbab + mfabb * mfabb + mfbba * mfbba) +
-                                (mfaac * mfaca + mfaac * mfcaa + mfaca * mfcaa) + c1o3 * (mfaac + mfaca + mfcaa)) *
-                                   OOrho * OOrho * c2o3 +
-                               c1o27 * ((drho * drho - drho) * OOrho * OOrho));
-
-        ////////////////////////////////////////////////////////////////////////////////////
-        //! - Compute linear combinations of second and third order cumulants
-        //!
-        ////////////////////////////////////////////////////////////
-        // 2.
-        real mxxPyyPzz = mfcaa + mfaca + mfaac;
-        real mxxMyy    = mfcaa - mfaca;
-        real mxxMzz    = mfcaa - mfaac;
-        ////////////////////////////////////////////////////////////
-        // 3.
-        real mxxyPyzz = mfcba + mfabc;
-        real mxxyMyzz = mfcba - mfabc;
-
-        real mxxzPyyz = mfcab + mfacb;
-        real mxxzMyyz = mfcab - mfacb;
-
-        real mxyyPxzz = mfbca + mfbac;
-        real mxyyMxzz = mfbca - mfbac;
-
-        ////////////////////////////////////////////////////////////////////////////////////
-        // incl. correction
-        ////////////////////////////////////////////////////////////
-        //! - Compute velocity  gradients from second order cumulants according to Eq. (27)-(32)
-        //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017),
-        //! DOI:10.1016/j.jcp.2017.05.040 ]</b></a> Further explanations of the correction in viscosity in Appendix H of
-        //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015),
-        //! DOI:10.1016/j.camwa.2015.05.001 ]</b></a> Note that the division by rho is omitted here as we need rho times
-        //! the gradients later.
-        //!
-        real Dxy  = -c3o1 * omega * mfbba;
-        real Dxz  = -c3o1 * omega * mfbab;
-        real Dyz  = -c3o1 * omega * mfabb;
-        real dxux = c1o2 * (-omega) * (mxxMyy + mxxMzz) + c1o2 * OxxPyyPzz * (mfaaa - mxxPyyPzz);
-        real dyuy = dxux + omega * c3o2 * mxxMyy;
-        real dzuz = dxux + omega * c3o2 * mxxMzz;
-        ////////////////////////////////////////////////////////////
-        //! - Relaxation of second order cumulants with correction terms according to Eq. (33)-(35) in
-        //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017),
-        //! DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
-        //!
-        mxxPyyPzz +=
-            OxxPyyPzz * (mfaaa - mxxPyyPzz) - c3o1 * (c1o1 - c1o2 * OxxPyyPzz) * (vx2 * dxux + vy2 * dyuy + vz2 * dzuz);
-        mxxMyy += omega * (-mxxMyy) - c3o1 * (c1o1 + c1o2 * (-omega)) * (vx2 * dxux - vy2 * dyuy);
-        mxxMzz += omega * (-mxxMzz) - c3o1 * (c1o1 + c1o2 * (-omega)) * (vx2 * dxux - vz2 * dzuz);
-
-        ////////////////////////////////////////////////////////////////////////////////////
-        ////no correction
-        // mxxPyyPzz += OxxPyyPzz*(mfaaa - mxxPyyPzz);
-        // mxxMyy += -(-omega) * (-mxxMyy);
-        // mxxMzz += -(-omega) * (-mxxMzz);
-        //////////////////////////////////////////////////////////////////////////
-        mfabb += omega * (-mfabb);
-        mfbab += omega * (-mfbab);
-        mfbba += omega * (-mfbba);
-
-        ////////////////////////////////////////////////////////////////////////////////////
-        // relax
-        //////////////////////////////////////////////////////////////////////////
-        // incl. limiter
-        //! - Relaxation of third order cumulants including limiter according to Eq. (116)-(123)
-        //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017),
-        //! DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
-        //!
-        wadjust = Oxyz + (c1o1 - Oxyz) * abs(mfbbb) / (abs(mfbbb) + qudricLimitD);
-        mfbbb += wadjust * (-mfbbb);
-        wadjust = OxyyPxzz + (c1o1 - OxyyPxzz) * abs(mxxyPyzz) / (abs(mxxyPyzz) + qudricLimitP);
-        mxxyPyzz += wadjust * (-mxxyPyzz);
-        wadjust = OxyyMxzz + (c1o1 - OxyyMxzz) * abs(mxxyMyzz) / (abs(mxxyMyzz) + qudricLimitM);
-        mxxyMyzz += wadjust * (-mxxyMyzz);
-        wadjust = OxyyPxzz + (c1o1 - OxyyPxzz) * abs(mxxzPyyz) / (abs(mxxzPyyz) + qudricLimitP);
-        mxxzPyyz += wadjust * (-mxxzPyyz);
-        wadjust = OxyyMxzz + (c1o1 - OxyyMxzz) * abs(mxxzMyyz) / (abs(mxxzMyyz) + qudricLimitM);
-        mxxzMyyz += wadjust * (-mxxzMyyz);
-        wadjust = OxyyPxzz + (c1o1 - OxyyPxzz) * abs(mxyyPxzz) / (abs(mxyyPxzz) + qudricLimitP);
-        mxyyPxzz += wadjust * (-mxyyPxzz);
-        wadjust = OxyyMxzz + (c1o1 - OxyyMxzz) * abs(mxyyMxzz) / (abs(mxyyMxzz) + qudricLimitM);
-        mxyyMxzz += wadjust * (-mxyyMxzz);
-        //////////////////////////////////////////////////////////////////////////
-        // no limiter
-        // mfbbb += OxyyMxzz * (-mfbbb);
-        // mxxyPyzz += OxyyPxzz * (-mxxyPyzz);
-        // mxxyMyzz += OxyyMxzz * (-mxxyMyzz);
-        // mxxzPyyz += OxyyPxzz * (-mxxzPyyz);
-        // mxxzMyyz += OxyyMxzz * (-mxxzMyyz);
-        // mxyyPxzz += OxyyPxzz * (-mxyyPxzz);
-        // mxyyMxzz += OxyyMxzz * (-mxyyMxzz);
-
-        ////////////////////////////////////////////////////////////////////////////////////
-        //! - Compute inverse linear combinations of second and third order cumulants
-        //!
-        mfcaa = c1o3 * (mxxMyy + mxxMzz + mxxPyyPzz);
-        mfaca = c1o3 * (-c2o1 * mxxMyy + mxxMzz + mxxPyyPzz);
-        mfaac = c1o3 * (mxxMyy - c2o1 * mxxMzz + mxxPyyPzz);
-
-        mfcba = (mxxyMyzz + mxxyPyzz) * c1o2;
-        mfabc = (-mxxyMyzz + mxxyPyzz) * c1o2;
-        mfcab = (mxxzMyyz + mxxzPyyz) * c1o2;
-        mfacb = (-mxxzMyyz + mxxzPyyz) * c1o2;
-        mfbca = (mxyyMxzz + mxyyPxzz) * c1o2;
-        mfbac = (-mxyyMxzz + mxyyPxzz) * c1o2;
-        //////////////////////////////////////////////////////////////////////////
-
-        //////////////////////////////////////////////////////////////////////////
-        // 4.
-        // no limiter
-        //! - Relax fourth order cumulants to modified equilibrium for fourth order convergence of diffusion according
-        //! to Eq. (43)-(48) <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017),
-        //! DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
-        //!
-        CUMacc = -O4 * (c1o1 / omega - c1o2) * (dyuy + dzuz) * c2o3 * factorA + (c1o1 - O4) * (CUMacc);
-        CUMcac = -O4 * (c1o1 / omega - c1o2) * (dxux + dzuz) * c2o3 * factorA + (c1o1 - O4) * (CUMcac);
-        CUMcca = -O4 * (c1o1 / omega - c1o2) * (dyuy + dxux) * c2o3 * factorA + (c1o1 - O4) * (CUMcca);
-        CUMbbc = -O4 * (c1o1 / omega - c1o2) * Dxy * c1o3 * factorB + (c1o1 - O4) * (CUMbbc);
-        CUMbcb = -O4 * (c1o1 / omega - c1o2) * Dxz * c1o3 * factorB + (c1o1 - O4) * (CUMbcb);
-        CUMcbb = -O4 * (c1o1 / omega - c1o2) * Dyz * c1o3 * factorB + (c1o1 - O4) * (CUMcbb);
-
-        //////////////////////////////////////////////////////////////////////////
-        // 5.
-        CUMbcc += O5 * (-CUMbcc);
-        CUMcbc += O5 * (-CUMcbc);
-        CUMccb += O5 * (-CUMccb);
-
-        //////////////////////////////////////////////////////////////////////////
-        // 6.
-        CUMccc += O6 * (-CUMccc);
-
-        ////////////////////////////////////////////////////////////////////////////////////
-        //! - Compute central moments from post collision cumulants according to Eq. (53)-(56) in
-        //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017),
-        //! DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
-        //!
-
-        //////////////////////////////////////////////////////////////////////////
-        // 4.
-        mfcbb = CUMcbb + c1o3 * ((c3o1 * mfcaa + c1o1) * mfabb + c6o1 * mfbba * mfbab) * OOrho;
-        mfbcb = CUMbcb + c1o3 * ((c3o1 * mfaca + c1o1) * mfbab + c6o1 * mfbba * mfabb) * OOrho;
-        mfbbc = CUMbbc + c1o3 * ((c3o1 * mfaac + c1o1) * mfbba + c6o1 * mfbab * mfabb) * OOrho;
-
-        mfcca =
-            CUMcca +
-            (((mfcaa * mfaca + c2o1 * mfbba * mfbba) * c9o1 + c3o1 * (mfcaa + mfaca)) * OOrho - (drho * OOrho)) * c1o9;
-        mfcac =
-            CUMcac +
-            (((mfcaa * mfaac + c2o1 * mfbab * mfbab) * c9o1 + c3o1 * (mfcaa + mfaac)) * OOrho - (drho * OOrho)) * c1o9;
-        mfacc =
-            CUMacc +
-            (((mfaac * mfaca + c2o1 * mfabb * mfabb) * c9o1 + c3o1 * (mfaac + mfaca)) * OOrho - (drho * OOrho)) * c1o9;
-
-        //////////////////////////////////////////////////////////////////////////
-        // 5.
-        mfbcc = CUMbcc + c1o3 *
-                             (c3o1 * (mfaac * mfbca + mfaca * mfbac + c4o1 * mfabb * mfbbb +
-                                      c2o1 * (mfbab * mfacb + mfbba * mfabc)) +
-                              (mfbca + mfbac)) *
-                             OOrho;
-        mfcbc = CUMcbc + c1o3 *
-                             (c3o1 * (mfaac * mfcba + mfcaa * mfabc + c4o1 * mfbab * mfbbb +
-                                      c2o1 * (mfabb * mfcab + mfbba * mfbac)) +
-                              (mfcba + mfabc)) *
-                             OOrho;
-        mfccb = CUMccb + c1o3 *
-                             (c3o1 * (mfcaa * mfacb + mfaca * mfcab + c4o1 * mfbba * mfbbb +
-                                      c2o1 * (mfbab * mfbca + mfabb * mfcba)) +
-                              (mfacb + mfcab)) *
-                             OOrho;
-
-        //////////////////////////////////////////////////////////////////////////
-        // 6.
-        mfccc = CUMccc - ((-c4o1 * mfbbb * mfbbb - (mfcaa * mfacc + mfaca * mfcac + mfaac * mfcca) -
-                           c4o1 * (mfabb * mfcbb + mfbab * mfbcb + mfbba * mfbbc) -
-                           c2o1 * (mfbca * mfbac + mfcba * mfabc + mfcab * mfacb)) *
-                              OOrho +
-                          (c4o1 * (mfbab * mfbab * mfaca + mfabb * mfabb * mfcaa + mfbba * mfbba * mfaac) +
-                           c2o1 * (mfcaa * mfaca * mfaac) + c16o1 * mfbba * mfbab * mfabb) *
-                              OOrho * OOrho -
-                          c1o3 * (mfacc + mfcac + mfcca) * OOrho - c1o9 * (mfcaa + mfaca + mfaac) * OOrho +
-                          (c2o1 * (mfbab * mfbab + mfabb * mfabb + mfbba * mfbba) +
-                           (mfaac * mfaca + mfaac * mfcaa + mfaca * mfcaa) + c1o3 * (mfaac + mfaca + mfcaa)) *
-                              OOrho * OOrho * c2o3 +
-                          c1o27 * ((drho * drho - drho) * OOrho * OOrho));
-
-        ////////////////////////////////////////////////////////////////////////////////////
-        //! -  Add acceleration (body force) to first order cumulants according to Eq. (85)-(87) in
-        //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015),
-        //! DOI:10.1016/j.camwa.2015.05.001 ]</b></a>
-        //!
-        mfbaa = -mfbaa;
-        mfaba = -mfaba;
-        mfaab = -mfaab;
-
-        ////////////////////////////////////////////////////////////////////////////////////
-        //! - Chimera transform from central moments to well conditioned distributions as defined in Appendix J in
-        //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015),
-        //! DOI:10.1016/j.camwa.2015.05.001 ]</b></a> see also Eq. (88)-(96) in <a
-        //! href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040
-        //! ]</b></a>
-        //!
-        ////////////////////////////////////////////////////////////////////////////////////
-        // X - Dir
-        backwardInverseChimeraWithK(mfaaa, mfbaa, mfcaa, vvx, vx2, c1o1, c1o1);
-        backwardChimera(mfaba, mfbba, mfcba, vvx, vx2);
-        backwardInverseChimeraWithK(mfaca, mfbca, mfcca, vvx, vx2, c3o1, c1o3);
-        backwardChimera(mfaab, mfbab, mfcab, vvx, vx2);
-        backwardChimera(mfabb, mfbbb, mfcbb, vvx, vx2);
-        backwardChimera(mfacb, mfbcb, mfccb, vvx, vx2);
-        backwardInverseChimeraWithK(mfaac, mfbac, mfcac, vvx, vx2, c3o1, c1o3);
-        backwardChimera(mfabc, mfbbc, mfcbc, vvx, vx2);
-        backwardInverseChimeraWithK(mfacc, mfbcc, mfccc, vvx, vx2, c9o1, c1o9);
-
-        ////////////////////////////////////////////////////////////////////////////////////
-        // Y - Dir
-        backwardInverseChimeraWithK(mfaaa, mfaba, mfaca, vvy, vy2, c6o1, c1o6);
-        backwardChimera(mfaab, mfabb, mfacb, vvy, vy2);
-        backwardInverseChimeraWithK(mfaac, mfabc, mfacc, vvy, vy2, c18o1, c1o18);
-        backwardInverseChimeraWithK(mfbaa, mfbba, mfbca, vvy, vy2, c3o2, c2o3);
-        backwardChimera(mfbab, mfbbb, mfbcb, vvy, vy2);
-        backwardInverseChimeraWithK(mfbac, mfbbc, mfbcc, vvy, vy2, c9o2, c2o9);
-        backwardInverseChimeraWithK(mfcaa, mfcba, mfcca, vvy, vy2, c6o1, c1o6);
-        backwardChimera(mfcab, mfcbb, mfccb, vvy, vy2);
-        backwardInverseChimeraWithK(mfcac, mfcbc, mfccc, vvy, vy2, c18o1, c1o18);
-
-        ////////////////////////////////////////////////////////////////////////////////////
-        // Z - Dir
-        backwardInverseChimeraWithK(mfaaa, mfaab, mfaac, vvz, vz2, c36o1, c1o36);
-        backwardInverseChimeraWithK(mfaba, mfabb, mfabc, vvz, vz2, c9o1, c1o9);
-        backwardInverseChimeraWithK(mfaca, mfacb, mfacc, vvz, vz2, c36o1, c1o36);
-        backwardInverseChimeraWithK(mfbaa, mfbab, mfbac, vvz, vz2, c9o1, c1o9);
-        backwardInverseChimeraWithK(mfbba, mfbbb, mfbbc, vvz, vz2, c9o4, c4o9);
-        backwardInverseChimeraWithK(mfbca, mfbcb, mfbcc, vvz, vz2, c9o1, c1o9);
-        backwardInverseChimeraWithK(mfcaa, mfcab, mfcac, vvz, vz2, c36o1, c1o36);
-        backwardInverseChimeraWithK(mfcba, mfcbb, mfcbc, vvz, vz2, c9o1, c1o9);
-        backwardInverseChimeraWithK(mfcca, mfccb, mfccc, vvz, vz2, c36o1, c1o36);
-
-        ////////////////////////////////////////////////////////////////////////////////////
-        //! - Write distributions: style of reading and writing the distributions from/to
-        //! stored arrays dependent on timestep is based on the esoteric twist algorithm
-        //! <a href="https://doi.org/10.3390/computation5020019"><b>[ M. Geier et al. (2017),
-        //! DOI:10.3390/computation5020019 ]</b></a>
-        //!
-        (dist.f[DIR_P00])[k]      = mfabb;
-        (dist.f[DIR_M00])[kw]     = mfcbb;
-        (dist.f[DIR_0P0])[k]      = mfbab;
-        (dist.f[DIR_0M0])[ks]     = mfbcb;
-        (dist.f[DIR_00P])[k]      = mfbba;
-        (dist.f[DIR_00M])[kb]     = mfbbc;
-        (dist.f[DIR_PP0])[k]     = mfaab;
-        (dist.f[DIR_MM0])[ksw]   = mfccb;
-        (dist.f[DIR_PM0])[ks]    = mfacb;
-        (dist.f[DIR_MP0])[kw]    = mfcab;
-        (dist.f[DIR_P0P])[k]     = mfaba;
-        (dist.f[DIR_M0M])[kbw]   = mfcbc;
-        (dist.f[DIR_P0M])[kb]    = mfabc;
-        (dist.f[DIR_M0P])[kw]    = mfcba;
-        (dist.f[DIR_0PP])[k]     = mfbaa;
-        (dist.f[DIR_0MM])[kbs]   = mfbcc;
-        (dist.f[DIR_0PM])[kb]    = mfbac;
-        (dist.f[DIR_0MP])[ks]    = mfbca;
-        (dist.f[DIR_000])[k]   = mfbbb;
-        (dist.f[DIR_PPP])[k]    = mfaaa;
-        (dist.f[DIR_PMP])[ks]   = mfaca;
-        (dist.f[DIR_PPM])[kb]   = mfaac;
-        (dist.f[DIR_PMM])[kbs]  = mfacc;
-        (dist.f[DIR_MPP])[kw]   = mfcaa;
-        (dist.f[DIR_MMP])[ksw]  = mfcca;
-        (dist.f[DIR_MPM])[kbw]  = mfcac;
-        (dist.f[DIR_MMM])[kbsw] = mfccc;
-    }
-}
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimStream/CumulantK17CompChimStream_Device.cuh b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimStream/CumulantK17CompChimStream_Device.cuh
deleted file mode 100644
index f74192c0423ba9dc96820d7f46eecb9d49a39ed4..0000000000000000000000000000000000000000
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimStream/CumulantK17CompChimStream_Device.cuh
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef LB_Kernel_CUMULANT_K17_COMP_CHIM_SPARSE_H
-#define LB_Kernel_CUMULANT_K17_COMP_CHIM_SPARSE_H
-
-#include <DataTypes.h>
-#include <curand.h>
-
-__global__ void LB_Kernel_CumulantK17CompChimStream(
-	real omega,
-	uint* neighborX,
-	uint* neighborY,
-	uint* neighborZ,
-	real* distributions,
-	unsigned long size_Mat,
-	int level,
-	real* forces,
-	real* quadricLimiters,
-	bool isEvenTimestep,
-	const uint* fluidNodeIndices,
-	uint numberOfFluidNodes);
-#endif
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK18/CumulantK18Comp.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK18/CumulantK18Comp.cu
index 54af306039585f3beb39b05f2f2e0a96ae784e12..2e0af0bdb85d3f008768f9f430e8b4e5d9719b0f 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK18/CumulantK18Comp.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK18/CumulantK18Comp.cu
@@ -1,8 +1,8 @@
 #include "CumulantK18Comp.h"
 
 #include "CumulantK18Comp_Device.cuh"
-
 #include "Parameter/Parameter.h"
+#include "cuda/CudaGrid.h"
 
 std::shared_ptr<CumulantK18Comp> CumulantK18Comp::getNewInstance(std::shared_ptr<Parameter> para, int level)
 {
@@ -11,37 +11,22 @@ std::shared_ptr<CumulantK18Comp> CumulantK18Comp::getNewInstance(std::shared_ptr
 
 void CumulantK18Comp::run()
 {
-	int numberOfThreads = para->getParD(level)->numberofthreads;
-	int size_Mat = para->getParD(level)->numberOfNodes;
-
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	LB_Kernel_CumulantK18Comp << < grid, threads >> >(	para->getParD(level)->omega,
-														para->getParD(level)->typeOfGridNode,
-														para->getParD(level)->neighborX,
-														para->getParD(level)->neighborY,
-														para->getParD(level)->neighborZ,
-														para->getParD(level)->distributions.f[0],
-														para->getParD(level)->g6.g[0],
-														para->getParD(level)->numberOfNodes,
-														level,
-														para->getForcesDev(),
-                                                        para->getQuadricLimitersDev(),
-														para->getParD(level)->isEvenTimestep);
-	getLastCudaError("LB_Kernel_CumulantK18Comp execution failed");
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(para->getParD(level)->numberofthreads, para->getParD(level)->numberOfNodes);
+
+    LB_Kernel_CumulantK18Comp <<< grid.grid, grid.threads >>>(
+        para->getParD(level)->omega,
+        para->getParD(level)->typeOfGridNode,
+        para->getParD(level)->neighborX,
+        para->getParD(level)->neighborY,
+        para->getParD(level)->neighborZ,
+        para->getParD(level)->distributions.f[0],
+        para->getParD(level)->g6.g[0],
+        para->getParD(level)->numberOfNodes,
+        level,
+        para->getForcesDev(),
+        para->getQuadricLimitersDev(),
+        para->getParD(level)->isEvenTimestep);
+    getLastCudaError("LB_Kernel_CumulantK18Comp execution failed");
 }
 
 CumulantK18Comp::CumulantK18Comp(std::shared_ptr<Parameter> para, int level)
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK18/CumulantK18Comp_Device.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK18/CumulantK18Comp_Device.cu
index bb42d113e47ce28f153ac295f2d9a934dd1b213a..0e4ae5caebb9bd4b1c889a78bfadb62487742c98 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK18/CumulantK18Comp_Device.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK18/CumulantK18Comp_Device.cu
@@ -42,83 +42,83 @@ __global__ void LB_Kernel_CumulantK18Comp(
 			Distributions27 D;
 			if (EvenOrOdd == true)
 			{
-				D.f[DIR_P00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_M00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_MPM * size_Mat];
 			}
 			else
 			{
-				D.f[DIR_M00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_P00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_MPM * size_Mat];
 			}
 
 			Distributions6 G;
 			if (EvenOrOdd == true)
 			{
-				G.g[DIR_P00] = &G6[DIR_P00   *size_Mat];
-				G.g[DIR_M00] = &G6[DIR_M00   *size_Mat];
-				G.g[DIR_0P0] = &G6[DIR_0P0   *size_Mat];
-				G.g[DIR_0M0] = &G6[DIR_0M0   *size_Mat];
-				G.g[DIR_00P] = &G6[DIR_00P   *size_Mat];
-				G.g[DIR_00M] = &G6[DIR_00M   *size_Mat];
+				G.g[DIR_P00] = &G6[DIR_P00 * size_Mat];
+				G.g[DIR_M00] = &G6[DIR_M00 * size_Mat];
+				G.g[DIR_0P0] = &G6[DIR_0P0 * size_Mat];
+				G.g[DIR_0M0] = &G6[DIR_0M0 * size_Mat];
+				G.g[DIR_00P] = &G6[DIR_00P * size_Mat];
+				G.g[DIR_00M] = &G6[DIR_00M * size_Mat];
 			}
 			else
 			{
-				G.g[DIR_M00] = &G6[DIR_P00   *size_Mat];
-				G.g[DIR_P00] = &G6[DIR_M00   *size_Mat];
-				G.g[DIR_0M0] = &G6[DIR_0P0   *size_Mat];
-				G.g[DIR_0P0] = &G6[DIR_0M0   *size_Mat];
-				G.g[DIR_00M] = &G6[DIR_00P   *size_Mat];
-				G.g[DIR_00P] = &G6[DIR_00M   *size_Mat];
+				G.g[DIR_M00] = &G6[DIR_P00 * size_Mat];
+				G.g[DIR_P00] = &G6[DIR_M00 * size_Mat];
+				G.g[DIR_0M0] = &G6[DIR_0P0 * size_Mat];
+				G.g[DIR_0P0] = &G6[DIR_0M0 * size_Mat];
+				G.g[DIR_00M] = &G6[DIR_00P * size_Mat];
+				G.g[DIR_00P] = &G6[DIR_00M * size_Mat];
 			}
 
 			////////////////////////////////////////////////////////////////////////////////
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK20/CumulantK20Comp.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK20/CumulantK20Comp.cu
index 0c1778dc39496c6564dedcbe1f6e818bee147191..d0d81eaac711d4d80284b66a1040e0e8404f5d4d 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK20/CumulantK20Comp.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK20/CumulantK20Comp.cu
@@ -1,8 +1,8 @@
 #include "CumulantK20Comp.h"
 
 #include "CumulantK20Comp_Device.cuh"
-
 #include "Parameter/Parameter.h"
+#include "cuda/CudaGrid.h"
 
 std::shared_ptr<CumulantK20Comp> CumulantK20Comp::getNewInstance(std::shared_ptr<Parameter> para, int level)
 {
@@ -11,37 +11,22 @@ std::shared_ptr<CumulantK20Comp> CumulantK20Comp::getNewInstance(std::shared_ptr
 
 void CumulantK20Comp::run()
 {
-	int numberOfThreads = para->getParD(level)->numberofthreads;
-	int size_Mat = para->getParD(level)->numberOfNodes;
-
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	LB_Kernel_CumulantK20Comp << < grid, threads >> >(	para->getParD(level)->omega,
-																para->getParD(level)->typeOfGridNode,
-																para->getParD(level)->neighborX,
-																para->getParD(level)->neighborY,
-																para->getParD(level)->neighborZ,
-																para->getParD(level)->distributions.f[0],
-																para->getParD(level)->g6.g[0],
-																para->getParD(level)->numberOfNodes,
-																level,
-																para->getForcesDev(),
-                                                                para->getQuadricLimitersDev(),
-																para->getParD(level)->isEvenTimestep);
-	getLastCudaError("LB_Kernel_CumulantK20Comp execution failed");
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(para->getParD(level)->numberofthreads, para->getParD(level)->numberOfNodes);
+
+    LB_Kernel_CumulantK20Comp <<< grid.grid, grid.threads >>>(
+        para->getParD(level)->omega,
+        para->getParD(level)->typeOfGridNode,
+        para->getParD(level)->neighborX,
+        para->getParD(level)->neighborY,
+        para->getParD(level)->neighborZ,
+        para->getParD(level)->distributions.f[0],
+        para->getParD(level)->g6.g[0],
+        para->getParD(level)->numberOfNodes,
+        level,
+        para->getForcesDev(),
+        para->getQuadricLimitersDev(),
+        para->getParD(level)->isEvenTimestep);
+    getLastCudaError("LB_Kernel_CumulantK20Comp execution failed");
 }
 
 CumulantK20Comp::CumulantK20Comp(std::shared_ptr<Parameter> para, int level)
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK20/CumulantK20Comp_Device.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK20/CumulantK20Comp_Device.cu
index c805fc293aeb8b182bb0e01df82b584da69d0175..2dbe0bb62412f9363fdd0e714f5da296f81ae5b3 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK20/CumulantK20Comp_Device.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK20/CumulantK20Comp_Device.cu
@@ -42,83 +42,83 @@ __global__ void LB_Kernel_CumulantK20Comp(
 			Distributions27 D;
 			if (EvenOrOdd == true)
 			{
-				D.f[DIR_P00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_M00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_MPM * size_Mat];
 			}
 			else
 			{
-				D.f[DIR_M00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_P00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_MPM * size_Mat];
 			}
 
 			Distributions6 G;
 			if (EvenOrOdd == true)
 			{
-				G.g[DIR_P00] = &G6[DIR_P00   *size_Mat];
-				G.g[DIR_M00] = &G6[DIR_M00   *size_Mat];
-				G.g[DIR_0P0] = &G6[DIR_0P0   *size_Mat];
-				G.g[DIR_0M0] = &G6[DIR_0M0   *size_Mat];
-				G.g[DIR_00P] = &G6[DIR_00P   *size_Mat];
-				G.g[DIR_00M] = &G6[DIR_00M   *size_Mat];
+				G.g[DIR_P00] = &G6[DIR_P00 * size_Mat];
+				G.g[DIR_M00] = &G6[DIR_M00 * size_Mat];
+				G.g[DIR_0P0] = &G6[DIR_0P0 * size_Mat];
+				G.g[DIR_0M0] = &G6[DIR_0M0 * size_Mat];
+				G.g[DIR_00P] = &G6[DIR_00P * size_Mat];
+				G.g[DIR_00M] = &G6[DIR_00M * size_Mat];
 			}
 			else
 			{
-				G.g[DIR_M00] = &G6[DIR_P00   *size_Mat];
-				G.g[DIR_P00] = &G6[DIR_M00   *size_Mat];
-				G.g[DIR_0M0] = &G6[DIR_0P0   *size_Mat];
-				G.g[DIR_0P0] = &G6[DIR_0M0   *size_Mat];
-				G.g[DIR_00M] = &G6[DIR_00P   *size_Mat];
-				G.g[DIR_00P] = &G6[DIR_00M   *size_Mat];
+				G.g[DIR_M00] = &G6[DIR_P00 * size_Mat];
+				G.g[DIR_P00] = &G6[DIR_M00 * size_Mat];
+				G.g[DIR_0M0] = &G6[DIR_0P0 * size_Mat];
+				G.g[DIR_0P0] = &G6[DIR_0M0 * size_Mat];
+				G.g[DIR_00M] = &G6[DIR_00P * size_Mat];
+				G.g[DIR_00P] = &G6[DIR_00M * size_Mat];
 			}
 
 			////////////////////////////////////////////////////////////////////////////////
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/MRT/MRTCompSP27.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/MRT/MRTCompSP27.cu
index be94791572f739fb2eef7c049702caeedb6641fc..b576333f50304f5628e073d2eee16cf5b82c9d34 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/MRT/MRTCompSP27.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/MRT/MRTCompSP27.cu
@@ -2,6 +2,7 @@
 
 #include "MRTCompSP27_Device.cuh"
 #include "Parameter/Parameter.h"
+#include "cuda/CudaGrid.h"
 
 std::shared_ptr<MRTCompSP27> MRTCompSP27::getNewInstance(std::shared_ptr<Parameter> para, int level)
 {
@@ -10,33 +11,18 @@ std::shared_ptr<MRTCompSP27> MRTCompSP27::getNewInstance(std::shared_ptr<Paramet
 
 void MRTCompSP27::run()
 {
-	int size_Mat = para->getParD(level)->numberOfNodes;
-	int numberOfThreads = para->getParD(level)->numberofthreads;
-
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	LB_Kernel_MRT_Comp_SP_27 << < grid, threads >> >(	para->getParD(level)->omega,
-														para->getParD(level)->typeOfGridNode,
-														para->getParD(level)->neighborX,
-														para->getParD(level)->neighborY,
-														para->getParD(level)->neighborZ,
-														para->getParD(level)->distributions.f[0],
-														para->getParD(level)->numberOfNodes,
-														para->getParD(level)->isEvenTimestep);
-	getLastCudaError("LB_Kernel_MRT_Comp_SP_27 execution failed");
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(para->getParD(level)->numberofthreads, para->getParD(level)->numberOfNodes); // launch configuration for this level, built from its numberofthreads and numberOfNodes
+
+    LB_Kernel_MRT_Comp_SP_27 <<< grid.grid, grid.threads >>>( // launch with the grid/threads members of the CudaGrid above; all arguments come from para->getParD(level)
+        para->getParD(level)->omega,
+        para->getParD(level)->typeOfGridNode,
+        para->getParD(level)->neighborX,
+        para->getParD(level)->neighborY,
+        para->getParD(level)->neighborZ,
+        para->getParD(level)->distributions.f[0], // entry 0 of the distribution pointer array
+        para->getParD(level)->numberOfNodes, // same node count that sized the launch above
+        para->getParD(level)->isEvenTimestep);
+    getLastCudaError("LB_Kernel_MRT_Comp_SP_27 execution failed"); // NOTE(review): presumably reports a CUDA error left by the launch above; helper is defined outside this view
 }
 
 MRTCompSP27::MRTCompSP27(std::shared_ptr<Parameter> para, int level)
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/MRT/MRTCompSP27_Device.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/MRT/MRTCompSP27_Device.cu
index a9aefa2d62a962766470c93a62adeefa4f19570e..c3eb51a114e5c4a3be7605765d0889a7bae25cf0 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/MRT/MRTCompSP27_Device.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/MRT/MRTCompSP27_Device.cu
@@ -37,63 +37,63 @@ __global__ void LB_Kernel_MRT_Comp_SP_27(real omega,
 			Distributions27 D;
 			if (EvenOrOdd == true)
 			{
-				D.f[DIR_P00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_M00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_MPM * size_Mat];
 			}
 			else
 			{
-				D.f[DIR_M00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_P00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_MPM * size_Mat];
 			}
 
 			////////////////////////////////////////////////////////////////////////////////
@@ -126,33 +126,33 @@ __global__ void LB_Kernel_MRT_Comp_SP_27(real omega,
 			//unsigned int ktne = k;
 			unsigned int kbsw = neighborZ[ksw];
 			//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-			real mfcbb = (D.f[DIR_P00])[k];//[ke   ];// +  c2over27 ;(D.f[DIR_P00   ])[k  ];//ke
-			real mfabb = (D.f[DIR_M00])[kw];//[kw   ];// +  c2over27 ;(D.f[DIR_M00   ])[kw ];
-			real mfbcb = (D.f[DIR_0P0])[k];//[kn   ];// +  c2over27 ;(D.f[DIR_0P0   ])[k  ];//kn
-			real mfbab = (D.f[DIR_0M0])[ks];//[ks   ];// +  c2over27 ;(D.f[DIR_0M0   ])[ks ];
-			real mfbbc = (D.f[DIR_00P])[k];//[kt   ];// +  c2over27 ;(D.f[DIR_00P   ])[k  ];//kt
-			real mfbba = (D.f[DIR_00M])[kb];//[kb   ];// +  c2over27 ;(D.f[DIR_00M   ])[kb ];
-			real mfccb = (D.f[DIR_PP0])[k];//[kne  ];// +  c1over54 ;(D.f[DIR_PP0  ])[k  ];//kne
-			real mfaab = (D.f[DIR_MM0])[ksw];//[ksw  ];// +  c1over54 ;(D.f[DIR_MM0  ])[ksw];
-			real mfcab = (D.f[DIR_PM0])[ks];//[kse  ];// +  c1over54 ;(D.f[DIR_PM0  ])[ks ];//kse
-			real mfacb = (D.f[DIR_MP0])[kw];//[knw  ];// +  c1over54 ;(D.f[DIR_MP0  ])[kw ];//knw
-			real mfcbc = (D.f[DIR_P0P])[k];//[kte  ];// +  c1over54 ;(D.f[DIR_P0P  ])[k  ];//kte
-			real mfaba = (D.f[DIR_M0M])[kbw];//[kbw  ];// +  c1over54 ;(D.f[DIR_M0M  ])[kbw];
-			real mfcba = (D.f[DIR_P0M])[kb];//[kbe  ];// +  c1over54 ;(D.f[DIR_P0M  ])[kb ];//kbe
-			real mfabc = (D.f[DIR_M0P])[kw];//[ktw  ];// +  c1over54 ;(D.f[DIR_M0P  ])[kw ];//ktw
-			real mfbcc = (D.f[DIR_0PP])[k];//[ktn  ];// +  c1over54 ;(D.f[DIR_0PP  ])[k  ];//ktn
-			real mfbaa = (D.f[DIR_0MM])[kbs];//[kbs  ];// +  c1over54 ;(D.f[DIR_0MM  ])[kbs];
-			real mfbca = (D.f[DIR_0PM])[kb];//[kbn  ];// +  c1over54 ;(D.f[DIR_0PM  ])[kb ];//kbn
-			real mfbac = (D.f[DIR_0MP])[ks];//[kts  ];// +  c1over54 ;(D.f[DIR_0MP  ])[ks ];//kts
+			real mfcbb = (D.f[DIR_P00])[k];//[ke   ];// +  c2over27 ;(D.f[DIR_P00])[k  ];//ke
+			real mfabb = (D.f[DIR_M00])[kw];//[kw   ];// +  c2over27 ;(D.f[DIR_M00])[kw ];
+			real mfbcb = (D.f[DIR_0P0])[k];//[kn   ];// +  c2over27 ;(D.f[DIR_0P0])[k  ];//kn
+			real mfbab = (D.f[DIR_0M0])[ks];//[ks   ];// +  c2over27 ;(D.f[DIR_0M0])[ks ];
+			real mfbbc = (D.f[DIR_00P])[k];//[kt   ];// +  c2over27 ;(D.f[DIR_00P])[k  ];//kt
+			real mfbba = (D.f[DIR_00M])[kb];//[kb   ];// +  c2over27 ;(D.f[DIR_00M])[kb ];
+			real mfccb = (D.f[DIR_PP0])[k];//[kne  ];// +  c1over54 ;(D.f[DIR_PP0])[k  ];//kne
+			real mfaab = (D.f[DIR_MM0])[ksw];//[ksw  ];// +  c1over54 ;(D.f[DIR_MM0])[ksw];
+			real mfcab = (D.f[DIR_PM0])[ks];//[kse  ];// +  c1over54 ;(D.f[DIR_PM0])[ks ];//kse
+			real mfacb = (D.f[DIR_MP0])[kw];//[knw  ];// +  c1over54 ;(D.f[DIR_MP0])[kw ];//knw
+			real mfcbc = (D.f[DIR_P0P])[k];//[kte  ];// +  c1over54 ;(D.f[DIR_P0P])[k  ];//kte
+			real mfaba = (D.f[DIR_M0M])[kbw];//[kbw  ];// +  c1over54 ;(D.f[DIR_M0M])[kbw];
+			real mfcba = (D.f[DIR_P0M])[kb];//[kbe  ];// +  c1over54 ;(D.f[DIR_P0M])[kb ];//kbe
+			real mfabc = (D.f[DIR_M0P])[kw];//[ktw  ];// +  c1over54 ;(D.f[DIR_M0P])[kw ];//ktw
+			real mfbcc = (D.f[DIR_0PP])[k];//[ktn  ];// +  c1over54 ;(D.f[DIR_0PP])[k  ];//ktn
+			real mfbaa = (D.f[DIR_0MM])[kbs];//[kbs  ];// +  c1over54 ;(D.f[DIR_0MM])[kbs];
+			real mfbca = (D.f[DIR_0PM])[kb];//[kbn  ];// +  c1over54 ;(D.f[DIR_0PM])[kb ];//kbn
+			real mfbac = (D.f[DIR_0MP])[ks];//[kts  ];// +  c1over54 ;(D.f[DIR_0MP])[ks ];//kts
 			real mfbbb = (D.f[DIR_000])[k];//[kzero];// +  c8over27 ;(D.f[DIR_000])[k  ];//kzero
-			real mfccc = (D.f[DIR_PPP])[k];//[ktne ];// +  c1over216;(D.f[DIR_PPP ])[k  ];//ktne
-			real mfaac = (D.f[DIR_MMP])[ksw];//[ktsw ];// +  c1over216;(D.f[DIR_MMP ])[ksw];//ktsw
-			real mfcac = (D.f[DIR_PMP])[ks];//[ktse ];// +  c1over216;(D.f[DIR_PMP ])[ks ];//ktse
-			real mfacc = (D.f[DIR_MPP])[kw];//[ktnw ];// +  c1over216;(D.f[DIR_MPP ])[kw ];//ktnw
-			real mfcca = (D.f[DIR_PPM])[kb];//[kbne ];// +  c1over216;(D.f[DIR_PPM ])[kb ];//kbne
-			real mfaaa = (D.f[DIR_MMM])[kbsw];//[kbsw ];// +  c1over216;(D.f[DIR_MMM ])[kbsw];
-			real mfcaa = (D.f[DIR_PMM])[kbs];//[kbse ];// +  c1over216;(D.f[DIR_PMM ])[kbs];//kbse
-			real mfaca = (D.f[DIR_MPM])[kbw];//[kbnw ];// +  c1over216;(D.f[DIR_MPM ])[kbw];//kbnw
+			real mfccc = (D.f[DIR_PPP])[k];//[ktne ];// +  c1over216;(D.f[DIR_PPP])[k  ];//ktne
+			real mfaac = (D.f[DIR_MMP])[ksw];//[ktsw ];// +  c1over216;(D.f[DIR_MMP])[ksw];//ktsw
+			real mfcac = (D.f[DIR_PMP])[ks];//[ktse ];// +  c1over216;(D.f[DIR_PMP])[ks ];//ktse
+			real mfacc = (D.f[DIR_MPP])[kw];//[ktnw ];// +  c1over216;(D.f[DIR_MPP])[kw ];//ktnw
+			real mfcca = (D.f[DIR_PPM])[kb];//[kbne ];// +  c1over216;(D.f[DIR_PPM])[kb ];//kbne
+			real mfaaa = (D.f[DIR_MMM])[kbsw];//[kbsw ];// +  c1over216;(D.f[DIR_MMM])[kbsw];
+			real mfcaa = (D.f[DIR_PMM])[kbs];//[kbse ];// +  c1over216;(D.f[DIR_PMM])[kbs];//kbse
+			real mfaca = (D.f[DIR_MPM])[kbw];//[kbnw ];// +  c1over216;(D.f[DIR_MPM])[kbw];//kbnw
 											////////////////////////////////////////////////////////////////////////////////////
 			real rho = (mfccc + mfaaa + mfaca + mfcac + mfacc + mfcaa + mfaac + mfcca +
 				mfbac + mfbca + mfbaa + mfbcc + mfabc + mfcba + mfaba + mfcbc + mfacb + mfcab + mfaab + mfccb +
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/RunLBMKernel.cuh b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/RunLBMKernel.cuh
index 558b4f333e7c92b372a5097aa4917dd6d1230a34..3be594e3e39a57cd71741cd060e9dddda15d6035 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/RunLBMKernel.cuh
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/RunLBMKernel.cuh
@@ -5,7 +5,7 @@
 #include <DataTypes.h>
 #include <cuda_runtime.h>
 
-#include <lbm/KernelParameter.h>
+#include "lbm/KernelParameter.h"
 
 #include "Kernel/Utilities/DistributionHelper.cuh"
 
@@ -23,7 +23,7 @@ struct GPUKernelParameter
     unsigned int* neighborY;
     unsigned int* neighborZ;
     real* distributions;
-    int size_Mat;
+    int numberOfLBnodes;
     real* forces;
     bool isEvenTimestep;
 };
@@ -31,19 +31,22 @@ struct GPUKernelParameter
 template<typename KernelFunctor>
 __global__ void runKernel(KernelFunctor kernel, GPUKernelParameter kernelParameter)
 {
-    const uint k = getNodeIndex();
+    ////////////////////////////////////////////////////////////////////////////////
+    //! - Get node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
+    //!
+    const unsigned nodeIndex = getNodeIndex();
 
-    if(k >= kernelParameter.size_Mat)
+    if(nodeIndex >= kernelParameter.numberOfLBnodes)
         return;
 
-    if (!isValidFluidNode(kernelParameter.typeOfGridNode[k]))
+    if (!isValidFluidNode(kernelParameter.typeOfGridNode[nodeIndex]))
         return;
 
     DistributionWrapper distributionWrapper {
         kernelParameter.distributions,
-        (unsigned int)kernelParameter.size_Mat,
+        (unsigned int)kernelParameter.numberOfLBnodes,
         kernelParameter.isEvenTimestep,
-        k,
+        nodeIndex,
         kernelParameter.neighborX,
         kernelParameter.neighborY,
         kernelParameter.neighborZ
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/BGK/BGKIncompSP27.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/BGK/BGKIncompSP27.cu
index 81655fac9cfd0b562ba60a5ee289fb64da5c1fba..3fb9be28654f83a7a98bb7d6b3a8a46e9170e7a8 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/BGK/BGKIncompSP27.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/BGK/BGKIncompSP27.cu
@@ -2,6 +2,7 @@
 
 #include "BGKIncompSP27_Device.cuh"
 #include "Parameter/Parameter.h"
+#include "cuda/CudaGrid.h"
 
 std::shared_ptr<BGKIncompSP27> BGKIncompSP27::getNewInstance(std::shared_ptr<Parameter> para, int level)
 {
@@ -10,33 +11,18 @@ std::shared_ptr<BGKIncompSP27> BGKIncompSP27::getNewInstance(std::shared_ptr<Par
 
 void BGKIncompSP27::run()
 {
-	int size_Mat = para->getParD(level)->numberOfNodes;
-	int numberOfThreads = para->getParD(level)->numberofthreads;
-
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	LB_Kernel_BGK_Incomp_SP_27 << < grid, threads >> >(	para->getParD(level)->omega,
-													para->getParD(level)->typeOfGridNode,
-													para->getParD(level)->neighborX,
-													para->getParD(level)->neighborY,
-													para->getParD(level)->neighborZ,
-													para->getParD(level)->distributions.f[0],
-													para->getParD(level)->numberOfNodes,
-													para->getParD(level)->isEvenTimestep);
-	getLastCudaError("LB_Kernel_BGK_SP_27 execution failed");
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(para->getParD(level)->numberofthreads, para->getParD(level)->numberOfNodes); // launch configuration for this level, built from its numberofthreads and numberOfNodes
+
+    LB_Kernel_BGK_Incomp_SP_27 <<< grid.grid, grid.threads >>>( // launch with the grid/threads members of the CudaGrid above; all arguments come from para->getParD(level)
+        para->getParD(level)->omega,
+        para->getParD(level)->typeOfGridNode,
+        para->getParD(level)->neighborX,
+        para->getParD(level)->neighborY,
+        para->getParD(level)->neighborZ,
+        para->getParD(level)->distributions.f[0], // entry 0 of the distribution pointer array
+        para->getParD(level)->numberOfNodes, // same node count that sized the launch above
+        para->getParD(level)->isEvenTimestep);
+    getLastCudaError("LB_Kernel_BGK_Incomp_SP_27 execution failed"); // message now names the kernel actually launched (the removed line said LB_Kernel_BGK_SP_27)
 }
 
 BGKIncompSP27::BGKIncompSP27(std::shared_ptr<Parameter> para, int level)
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/BGK/BGKIncompSP27_Device.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/BGK/BGKIncompSP27_Device.cu
index 9a94006b8a1be745fc2bcfdd80e454152347139d..233595656720f5c84cf5be9e555565af0e9c95d0 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/BGK/BGKIncompSP27_Device.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/BGK/BGKIncompSP27_Device.cu
@@ -37,63 +37,63 @@ __global__ void LB_Kernel_BGK_Incomp_SP_27(real omega,
 			Distributions27 D;
 			if (EvenOrOdd == true)
 			{
-				D.f[DIR_P00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_M00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_MPM * size_Mat];
 			}
 			else
 			{
-				D.f[DIR_M00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_P00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_MPM * size_Mat];
 			}
 
 			////////////////////////////////////////////////////////////////////////////////
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/BGKPlus/BGKPlusIncompSP27.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/BGKPlus/BGKPlusIncompSP27.cu
index 86b513f1252f2787abee637819e64606d111c4fa..f274f576a14fc193bcabd44d2c9078a2c98055bc 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/BGKPlus/BGKPlusIncompSP27.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/BGKPlus/BGKPlusIncompSP27.cu
@@ -2,6 +2,7 @@
 
 #include "BGKPlusIncompSP27_Device.cuh"
 #include "Parameter/Parameter.h"
+#include "cuda/CudaGrid.h"
 
 std::shared_ptr<BGKPlusIncompSP27> BGKPlusIncompSP27::getNewInstance(std::shared_ptr<Parameter> para, int level)
 {
@@ -10,33 +11,18 @@ std::shared_ptr<BGKPlusIncompSP27> BGKPlusIncompSP27::getNewInstance(std::shared
 
 void BGKPlusIncompSP27::run()
 {
-	int size_Mat = para->getParD(level)->numberOfNodes;
-	int numberOfThreads = para->getParD(level)->numberofthreads;
-
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	LB_Kernel_BGK_Plus_Incomp_SP_27 << < grid, threads >> >(	para->getParD(level)->omega,
-														para->getParD(level)->typeOfGridNode,
-														para->getParD(level)->neighborX,
-														para->getParD(level)->neighborY,
-														para->getParD(level)->neighborZ,
-														para->getParD(level)->distributions.f[0],
-														para->getParD(level)->numberOfNodes,
-														para->getParD(level)->isEvenTimestep);
-	getLastCudaError("LB_Kernel_BGK_Plus_SP_27 execution failed");
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(para->getParD(level)->numberofthreads, para->getParD(level)->numberOfNodes); // launch configuration for this level, built from its numberofthreads and numberOfNodes
+
+    LB_Kernel_BGK_Plus_Incomp_SP_27 <<< grid.grid, grid.threads >>>( // launch with the grid/threads members of the CudaGrid above; all arguments come from para->getParD(level)
+        para->getParD(level)->omega,
+        para->getParD(level)->typeOfGridNode,
+        para->getParD(level)->neighborX,
+        para->getParD(level)->neighborY,
+        para->getParD(level)->neighborZ,
+        para->getParD(level)->distributions.f[0], // entry 0 of the distribution pointer array
+        para->getParD(level)->numberOfNodes, // same node count that sized the launch above
+        para->getParD(level)->isEvenTimestep);
+    getLastCudaError("LB_Kernel_BGK_Plus_Incomp_SP_27 execution failed"); // message now names the kernel actually launched (the removed line said LB_Kernel_BGK_Plus_SP_27)
 }
 
 BGKPlusIncompSP27::BGKPlusIncompSP27(std::shared_ptr<Parameter> para, int level)
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/BGKPlus/BGKPlusIncompSP27_Device.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/BGKPlus/BGKPlusIncompSP27_Device.cu
index 9355e42aa5b05190f063f5247d8d6c0dea787a02..b49b76c6224be4b3543c01647a6553e6fc64b74e 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/BGKPlus/BGKPlusIncompSP27_Device.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/BGKPlus/BGKPlusIncompSP27_Device.cu
@@ -37,63 +37,63 @@ __global__ void LB_Kernel_BGK_Plus_Incomp_SP_27(real omega,
 			Distributions27 D;
 			if (EvenOrOdd == true)
 			{
-				D.f[DIR_P00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_M00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_MPM * size_Mat];
 			}
 			else
 			{
-				D.f[DIR_M00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_P00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_MPM * size_Mat];
 			}
 
 			////////////////////////////////////////////////////////////////////////////////
@@ -126,33 +126,33 @@ __global__ void LB_Kernel_BGK_Plus_Incomp_SP_27(real omega,
 			//unsigned int ktne = k;
 			unsigned int kbsw = neighborZ[ksw];
 			//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-			real mfcbb = (D.f[DIR_P00])[k];//[ke   ];// +  c2over27 ;(D.f[DIR_P00   ])[k  ];//ke
-			real mfabb = (D.f[DIR_M00])[kw];//[kw   ];// +  c2over27 ;(D.f[DIR_M00   ])[kw ];
-			real mfbcb = (D.f[DIR_0P0])[k];//[kn   ];// +  c2over27 ;(D.f[DIR_0P0   ])[k  ];//kn
-			real mfbab = (D.f[DIR_0M0])[ks];//[ks   ];// +  c2over27 ;(D.f[DIR_0M0   ])[ks ];
-			real mfbbc = (D.f[DIR_00P])[k];//[kt   ];// +  c2over27 ;(D.f[DIR_00P   ])[k  ];//kt
-			real mfbba = (D.f[DIR_00M])[kb];//[kb   ];// +  c2over27 ;(D.f[DIR_00M   ])[kb ];
-			real mfccb = (D.f[DIR_PP0])[k];//[kne  ];// +  c1over54 ;(D.f[DIR_PP0  ])[k  ];//kne
-			real mfaab = (D.f[DIR_MM0])[ksw];//[ksw  ];// +  c1over54 ;(D.f[DIR_MM0  ])[ksw];
-			real mfcab = (D.f[DIR_PM0])[ks];//[kse  ];// +  c1over54 ;(D.f[DIR_PM0  ])[ks ];//kse
-			real mfacb = (D.f[DIR_MP0])[kw];//[knw  ];// +  c1over54 ;(D.f[DIR_MP0  ])[kw ];//knw
-			real mfcbc = (D.f[DIR_P0P])[k];//[kte  ];// +  c1over54 ;(D.f[DIR_P0P  ])[k  ];//kte
-			real mfaba = (D.f[DIR_M0M])[kbw];//[kbw  ];// +  c1over54 ;(D.f[DIR_M0M  ])[kbw];
-			real mfcba = (D.f[DIR_P0M])[kb];//[kbe  ];// +  c1over54 ;(D.f[DIR_P0M  ])[kb ];//kbe
-			real mfabc = (D.f[DIR_M0P])[kw];//[ktw  ];// +  c1over54 ;(D.f[DIR_M0P  ])[kw ];//ktw
-			real mfbcc = (D.f[DIR_0PP])[k];//[ktn  ];// +  c1over54 ;(D.f[DIR_0PP  ])[k  ];//ktn
-			real mfbaa = (D.f[DIR_0MM])[kbs];//[kbs  ];// +  c1over54 ;(D.f[DIR_0MM  ])[kbs];
-			real mfbca = (D.f[DIR_0PM])[kb];//[kbn  ];// +  c1over54 ;(D.f[DIR_0PM  ])[kb ];//kbn
-			real mfbac = (D.f[DIR_0MP])[ks];//[kts  ];// +  c1over54 ;(D.f[DIR_0MP  ])[ks ];//kts
+			real mfcbb = (D.f[DIR_P00])[k];//[ke   ];// +  c2over27 ;(D.f[DIR_P00])[k  ];//ke
+			real mfabb = (D.f[DIR_M00])[kw];//[kw   ];// +  c2over27 ;(D.f[DIR_M00])[kw ];
+			real mfbcb = (D.f[DIR_0P0])[k];//[kn   ];// +  c2over27 ;(D.f[DIR_0P0])[k  ];//kn
+			real mfbab = (D.f[DIR_0M0])[ks];//[ks   ];// +  c2over27 ;(D.f[DIR_0M0])[ks ];
+			real mfbbc = (D.f[DIR_00P])[k];//[kt   ];// +  c2over27 ;(D.f[DIR_00P])[k  ];//kt
+			real mfbba = (D.f[DIR_00M])[kb];//[kb   ];// +  c2over27 ;(D.f[DIR_00M])[kb ];
+			real mfccb = (D.f[DIR_PP0])[k];//[kne  ];// +  c1over54 ;(D.f[DIR_PP0])[k  ];//kne
+			real mfaab = (D.f[DIR_MM0])[ksw];//[ksw  ];// +  c1over54 ;(D.f[DIR_MM0])[ksw];
+			real mfcab = (D.f[DIR_PM0])[ks];//[kse  ];// +  c1over54 ;(D.f[DIR_PM0])[ks ];//kse
+			real mfacb = (D.f[DIR_MP0])[kw];//[knw  ];// +  c1over54 ;(D.f[DIR_MP0])[kw ];//knw
+			real mfcbc = (D.f[DIR_P0P])[k];//[kte  ];// +  c1over54 ;(D.f[DIR_P0P])[k  ];//kte
+			real mfaba = (D.f[DIR_M0M])[kbw];//[kbw  ];// +  c1over54 ;(D.f[DIR_M0M])[kbw];
+			real mfcba = (D.f[DIR_P0M])[kb];//[kbe  ];// +  c1over54 ;(D.f[DIR_P0M])[kb ];//kbe
+			real mfabc = (D.f[DIR_M0P])[kw];//[ktw  ];// +  c1over54 ;(D.f[DIR_M0P])[kw ];//ktw
+			real mfbcc = (D.f[DIR_0PP])[k];//[ktn  ];// +  c1over54 ;(D.f[DIR_0PP])[k  ];//ktn
+			real mfbaa = (D.f[DIR_0MM])[kbs];//[kbs  ];// +  c1over54 ;(D.f[DIR_0MM])[kbs];
+			real mfbca = (D.f[DIR_0PM])[kb];//[kbn  ];// +  c1over54 ;(D.f[DIR_0PM])[kb ];//kbn
+			real mfbac = (D.f[DIR_0MP])[ks];//[kts  ];// +  c1over54 ;(D.f[DIR_0MP])[ks ];//kts
 			real mfbbb = (D.f[DIR_000])[k];//[kzero];// +  c8over27 ;(D.f[DIR_000])[k  ];//kzero
-			real mfccc = (D.f[DIR_PPP])[k];//[ktne ];// +  c1over216;(D.f[DIR_PPP ])[k  ];//ktne
-			real mfaac = (D.f[DIR_MMP])[ksw];//[ktsw ];// +  c1over216;(D.f[DIR_MMP ])[ksw];//ktsw
-			real mfcac = (D.f[DIR_PMP])[ks];//[ktse ];// +  c1over216;(D.f[DIR_PMP ])[ks ];//ktse
-			real mfacc = (D.f[DIR_MPP])[kw];//[ktnw ];// +  c1over216;(D.f[DIR_MPP ])[kw ];//ktnw
-			real mfcca = (D.f[DIR_PPM])[kb];//[kbne ];// +  c1over216;(D.f[DIR_PPM ])[kb ];//kbne
-			real mfaaa = (D.f[DIR_MMM])[kbsw];//[kbsw ];// +  c1over216;(D.f[DIR_MMM ])[kbsw];
-			real mfcaa = (D.f[DIR_PMM])[kbs];//[kbse ];// +  c1over216;(D.f[DIR_PMM ])[kbs];//kbse
-			real mfaca = (D.f[DIR_MPM])[kbw];//[kbnw ];// +  c1over216;(D.f[DIR_MPM ])[kbw];//kbnw
+			real mfccc = (D.f[DIR_PPP])[k];//[ktne ];// +  c1over216;(D.f[DIR_PPP])[k  ];//ktne
+			real mfaac = (D.f[DIR_MMP])[ksw];//[ktsw ];// +  c1over216;(D.f[DIR_MMP])[ksw];//ktsw
+			real mfcac = (D.f[DIR_PMP])[ks];//[ktse ];// +  c1over216;(D.f[DIR_PMP])[ks ];//ktse
+			real mfacc = (D.f[DIR_MPP])[kw];//[ktnw ];// +  c1over216;(D.f[DIR_MPP])[kw ];//ktnw
+			real mfcca = (D.f[DIR_PPM])[kb];//[kbne ];// +  c1over216;(D.f[DIR_PPM])[kb ];//kbne
+			real mfaaa = (D.f[DIR_MMM])[kbsw];//[kbsw ];// +  c1over216;(D.f[DIR_MMM])[kbsw];
+			real mfcaa = (D.f[DIR_PMM])[kbs];//[kbse ];// +  c1over216;(D.f[DIR_PMM])[kbs];//kbse
+			real mfaca = (D.f[DIR_MPM])[kbw];//[kbnw ];// +  c1over216;(D.f[DIR_MPM])[kbw];//kbnw
 											////////////////////////////////////////////////////////////////////////////////////
 											//slow
 											//real oMdrho = one - ((((mfccc+mfaaa) + (mfaca+mfcac)) + ((mfacc+mfcaa) + (mfaac+mfcca))) + 
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/Cascade/CascadeIncompSP27.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/Cascade/CascadeIncompSP27.cu
index 05f374096c9c5da2460b32cf5ae8cb59cfa78382..3a6760b619d2ca1a7eb19771478eb9e5989ead0c 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/Cascade/CascadeIncompSP27.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/Cascade/CascadeIncompSP27.cu
@@ -2,6 +2,7 @@
 
 #include "CascadeIncompSP27_Device.cuh"
 #include "Parameter/Parameter.h"
+#include "cuda/CudaGrid.h"
 
 std::shared_ptr<CascadeIncompSP27> CascadeIncompSP27::getNewInstance(std::shared_ptr<Parameter> para, int level)
 {
@@ -10,33 +11,18 @@ std::shared_ptr<CascadeIncompSP27> CascadeIncompSP27::getNewInstance(std::shared
 
 void CascadeIncompSP27::run()
 {
-	int size_Mat = para->getParD(level)->numberOfNodes;
-	int numberOfThreads = para->getParD(level)->numberofthreads;
-
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	LB_Kernel_Cascade_Incomp_SP_27 << < grid, threads >> >(	para->getParD(level)->omega,
-														para->getParD(level)->typeOfGridNode,
-														para->getParD(level)->neighborX,
-														para->getParD(level)->neighborY,
-														para->getParD(level)->neighborZ,
-														para->getParD(level)->distributions.f[0],
-														para->getParD(level)->numberOfNodes,
-														para->getParD(level)->isEvenTimestep);
-	getLastCudaError("LB_Kernel_Cascade_SP_27 execution failed");
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(para->getParD(level)->numberofthreads, para->getParD(level)->numberOfNodes); // launch configuration derived from this level's thread count and node count
+    // Launch LB_Kernel_Cascade_Incomp_SP_27 on this level's device data, using grid.grid / grid.threads as the execution configuration.
+    LB_Kernel_Cascade_Incomp_SP_27 <<< grid.grid, grid.threads >>>(
+        para->getParD(level)->omega,
+        para->getParD(level)->typeOfGridNode,
+        para->getParD(level)->neighborX,
+        para->getParD(level)->neighborY,
+        para->getParD(level)->neighborZ,
+        para->getParD(level)->distributions.f[0], // NOTE(review): presumably the base pointer the kernel indexes as DDStart - confirm against the kernel signature
+        para->getParD(level)->numberOfNodes, // same node count that sized the launch configuration above
+        para->getParD(level)->isEvenTimestep); // NOTE(review): presumably selects the kernel's EvenOrOdd pointer layout - confirm against the kernel signature
+    getLastCudaError("LB_Kernel_Cascade_Incomp_SP_27 execution failed"); // message names the kernel launched above
 }
 
 CascadeIncompSP27::CascadeIncompSP27(std::shared_ptr<Parameter> para, int level)
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/Cascade/CascadeIncompSP27_Device.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/Cascade/CascadeIncompSP27_Device.cu
index 92cc749b135739d5f38c9916c4ee0da7497e5f2d..8e607cabb4cc40bbb22c5ad3ec6db2c63154add6 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/Cascade/CascadeIncompSP27_Device.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/Cascade/CascadeIncompSP27_Device.cu
@@ -37,63 +37,63 @@ __global__ void LB_Kernel_Cascade_Incomp_SP_27(real omega,
 			Distributions27 D;
 			if (EvenOrOdd == true)
 			{
-				D.f[DIR_P00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_M00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_MPM * size_Mat];
 			}
 			else
 			{
-				D.f[DIR_M00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_P00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_MPM * size_Mat];
 			}
 
 			////////////////////////////////////////////////////////////////////////////////
@@ -126,33 +126,33 @@ __global__ void LB_Kernel_Cascade_Incomp_SP_27(real omega,
 			//unsigned int ktne = k;
 			unsigned int kbsw = neighborZ[ksw];
 			//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-			real mfcbb = (D.f[DIR_P00])[k];//[ke   ];// +  c2over27 ;(D.f[DIR_P00   ])[k  ];//ke
-			real mfabb = (D.f[DIR_M00])[kw];//[kw   ];// +  c2over27 ;(D.f[DIR_M00   ])[kw ];
-			real mfbcb = (D.f[DIR_0P0])[k];//[kn   ];// +  c2over27 ;(D.f[DIR_0P0   ])[k  ];//kn
-			real mfbab = (D.f[DIR_0M0])[ks];//[ks   ];// +  c2over27 ;(D.f[DIR_0M0   ])[ks ];
-			real mfbbc = (D.f[DIR_00P])[k];//[kt   ];// +  c2over27 ;(D.f[DIR_00P   ])[k  ];//kt
-			real mfbba = (D.f[DIR_00M])[kb];//[kb   ];// +  c2over27 ;(D.f[DIR_00M   ])[kb ];
-			real mfccb = (D.f[DIR_PP0])[k];//[kne  ];// +  c1over54 ;(D.f[DIR_PP0  ])[k  ];//kne
-			real mfaab = (D.f[DIR_MM0])[ksw];//[ksw  ];// +  c1over54 ;(D.f[DIR_MM0  ])[ksw];
-			real mfcab = (D.f[DIR_PM0])[ks];//[kse  ];// +  c1over54 ;(D.f[DIR_PM0  ])[ks ];//kse
-			real mfacb = (D.f[DIR_MP0])[kw];//[knw  ];// +  c1over54 ;(D.f[DIR_MP0  ])[kw ];//knw
-			real mfcbc = (D.f[DIR_P0P])[k];//[kte  ];// +  c1over54 ;(D.f[DIR_P0P  ])[k  ];//kte
-			real mfaba = (D.f[DIR_M0M])[kbw];//[kbw  ];// +  c1over54 ;(D.f[DIR_M0M  ])[kbw];
-			real mfcba = (D.f[DIR_P0M])[kb];//[kbe  ];// +  c1over54 ;(D.f[DIR_P0M  ])[kb ];//kbe
-			real mfabc = (D.f[DIR_M0P])[kw];//[ktw  ];// +  c1over54 ;(D.f[DIR_M0P  ])[kw ];//ktw
-			real mfbcc = (D.f[DIR_0PP])[k];//[ktn  ];// +  c1over54 ;(D.f[DIR_0PP  ])[k  ];//ktn
-			real mfbaa = (D.f[DIR_0MM])[kbs];//[kbs  ];// +  c1over54 ;(D.f[DIR_0MM  ])[kbs];
-			real mfbca = (D.f[DIR_0PM])[kb];//[kbn  ];// +  c1over54 ;(D.f[DIR_0PM  ])[kb ];//kbn
-			real mfbac = (D.f[DIR_0MP])[ks];//[kts  ];// +  c1over54 ;(D.f[DIR_0MP  ])[ks ];//kts
+			real mfcbb = (D.f[DIR_P00])[k];//[ke   ];// +  c2over27 ;(D.f[DIR_P00])[k  ];//ke
+			real mfabb = (D.f[DIR_M00])[kw];//[kw   ];// +  c2over27 ;(D.f[DIR_M00])[kw ];
+			real mfbcb = (D.f[DIR_0P0])[k];//[kn   ];// +  c2over27 ;(D.f[DIR_0P0])[k  ];//kn
+			real mfbab = (D.f[DIR_0M0])[ks];//[ks   ];// +  c2over27 ;(D.f[DIR_0M0])[ks ];
+			real mfbbc = (D.f[DIR_00P])[k];//[kt   ];// +  c2over27 ;(D.f[DIR_00P])[k  ];//kt
+			real mfbba = (D.f[DIR_00M])[kb];//[kb   ];// +  c2over27 ;(D.f[DIR_00M])[kb ];
+			real mfccb = (D.f[DIR_PP0])[k];//[kne  ];// +  c1over54 ;(D.f[DIR_PP0])[k  ];//kne
+			real mfaab = (D.f[DIR_MM0])[ksw];//[ksw  ];// +  c1over54 ;(D.f[DIR_MM0])[ksw];
+			real mfcab = (D.f[DIR_PM0])[ks];//[kse  ];// +  c1over54 ;(D.f[DIR_PM0])[ks ];//kse
+			real mfacb = (D.f[DIR_MP0])[kw];//[knw  ];// +  c1over54 ;(D.f[DIR_MP0])[kw ];//knw
+			real mfcbc = (D.f[DIR_P0P])[k];//[kte  ];// +  c1over54 ;(D.f[DIR_P0P])[k  ];//kte
+			real mfaba = (D.f[DIR_M0M])[kbw];//[kbw  ];// +  c1over54 ;(D.f[DIR_M0M])[kbw];
+			real mfcba = (D.f[DIR_P0M])[kb];//[kbe  ];// +  c1over54 ;(D.f[DIR_P0M])[kb ];//kbe
+			real mfabc = (D.f[DIR_M0P])[kw];//[ktw  ];// +  c1over54 ;(D.f[DIR_M0P])[kw ];//ktw
+			real mfbcc = (D.f[DIR_0PP])[k];//[ktn  ];// +  c1over54 ;(D.f[DIR_0PP])[k  ];//ktn
+			real mfbaa = (D.f[DIR_0MM])[kbs];//[kbs  ];// +  c1over54 ;(D.f[DIR_0MM])[kbs];
+			real mfbca = (D.f[DIR_0PM])[kb];//[kbn  ];// +  c1over54 ;(D.f[DIR_0PM])[kb ];//kbn
+			real mfbac = (D.f[DIR_0MP])[ks];//[kts  ];// +  c1over54 ;(D.f[DIR_0MP])[ks ];//kts
 			real mfbbb = (D.f[DIR_000])[k];//[kzero];// +  c8over27 ;(D.f[DIR_000])[k  ];//kzero
-			real mfccc = (D.f[DIR_PPP])[k];//[ktne ];// +  c1over216;(D.f[DIR_PPP ])[k  ];//ktne
-			real mfaac = (D.f[DIR_MMP])[ksw];//[ktsw ];// +  c1over216;(D.f[DIR_MMP ])[ksw];//ktsw
-			real mfcac = (D.f[DIR_PMP])[ks];//[ktse ];// +  c1over216;(D.f[DIR_PMP ])[ks ];//ktse
-			real mfacc = (D.f[DIR_MPP])[kw];//[ktnw ];// +  c1over216;(D.f[DIR_MPP ])[kw ];//ktnw
-			real mfcca = (D.f[DIR_PPM])[kb];//[kbne ];// +  c1over216;(D.f[DIR_PPM ])[kb ];//kbne
-			real mfaaa = (D.f[DIR_MMM])[kbsw];//[kbsw ];// +  c1over216;(D.f[DIR_MMM ])[kbsw];
-			real mfcaa = (D.f[DIR_PMM])[kbs];//[kbse ];// +  c1over216;(D.f[DIR_PMM ])[kbs];//kbse
-			real mfaca = (D.f[DIR_MPM])[kbw];//[kbnw ];// +  c1over216;(D.f[DIR_MPM ])[kbw];//kbnw
+			real mfccc = (D.f[DIR_PPP])[k];//[ktne ];// +  c1over216;(D.f[DIR_PPP])[k  ];//ktne
+			real mfaac = (D.f[DIR_MMP])[ksw];//[ktsw ];// +  c1over216;(D.f[DIR_MMP])[ksw];//ktsw
+			real mfcac = (D.f[DIR_PMP])[ks];//[ktse ];// +  c1over216;(D.f[DIR_PMP])[ks ];//ktse
+			real mfacc = (D.f[DIR_MPP])[kw];//[ktnw ];// +  c1over216;(D.f[DIR_MPP])[kw ];//ktnw
+			real mfcca = (D.f[DIR_PPM])[kb];//[kbne ];// +  c1over216;(D.f[DIR_PPM])[kb ];//kbne
+			real mfaaa = (D.f[DIR_MMM])[kbsw];//[kbsw ];// +  c1over216;(D.f[DIR_MMM])[kbsw];
+			real mfcaa = (D.f[DIR_PMM])[kbs];//[kbse ];// +  c1over216;(D.f[DIR_PMM])[kbs];//kbse
+			real mfaca = (D.f[DIR_MPM])[kbw];//[kbnw ];// +  c1over216;(D.f[DIR_MPM])[kbw];//kbnw
 											////////////////////////////////////////////////////////////////////////////////////
 											//slow
 											//real oMdrho = one - ((((mfccc+mfaaa) + (mfaca+mfcac)) + ((mfacc+mfcaa) + (mfaac+mfcca))) + 
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/Cumulant1hSP27/Cumulant1hIncompSP27.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/Cumulant1hSP27/Cumulant1hIncompSP27.cu
index 62768ef9948b6c259c5ad4005237081f4d255e73..44beb8507d5664f01283130dd3087a788e4491ed 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/Cumulant1hSP27/Cumulant1hIncompSP27.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/Cumulant1hSP27/Cumulant1hIncompSP27.cu
@@ -2,6 +2,7 @@
 
 #include "Cumulant1hIncompSP27_Device.cuh"
 #include "Parameter/Parameter.h"
+#include "cuda/CudaGrid.h"
 
 std::shared_ptr<Cumulant1hIncompSP27> Cumulant1hIncompSP27::getNewInstance(std::shared_ptr<Parameter> para, int level)
 {
@@ -10,38 +11,23 @@ std::shared_ptr<Cumulant1hIncompSP27> Cumulant1hIncompSP27::getNewInstance(std::
 
 void Cumulant1hIncompSP27::run()
 {
-	int size_Mat = para->getParD(level)->numberOfNodes;
-	int numberOfThreads = para->getParD(level)->numberofthreads;
-
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	LB_Kernel_Cum_1h_Incomp_SP_27 << < grid, threads >> >(	para->getParD(level)->omega,
-													para->getParD(level)->deltaPhi,
-													para->getAngularVelocity(),
-													para->getParD(level)->typeOfGridNode,
-													para->getParD(level)->neighborX,
-													para->getParD(level)->neighborY,
-													para->getParD(level)->neighborZ,
-													para->getParD(level)->coordinateX,
-													para->getParD(level)->coordinateY,
-													para->getParD(level)->coordinateZ,
-													para->getParD(level)->distributions.f[0],
-													para->getParD(level)->numberOfNodes,
-													para->getParD(level)->isEvenTimestep);
-	getLastCudaError("LB_Kernel_Cum_1h_SP_27 execution failed");
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(para->getParD(level)->numberofthreads, para->getParD(level)->numberOfNodes); // launch configuration derived from this level's thread count and node count
+    // Launch LB_Kernel_Cum_1h_Incomp_SP_27 on this level's device data, using grid.grid / grid.threads as the execution configuration.
+    LB_Kernel_Cum_1h_Incomp_SP_27 <<< grid.grid, grid.threads >>>(
+        para->getParD(level)->omega,
+        para->getParD(level)->deltaPhi,
+        para->getAngularVelocity(), // read from para directly rather than from the per-level device parameters
+        para->getParD(level)->typeOfGridNode,
+        para->getParD(level)->neighborX,
+        para->getParD(level)->neighborY,
+        para->getParD(level)->neighborZ,
+        para->getParD(level)->coordinateX,
+        para->getParD(level)->coordinateY,
+        para->getParD(level)->coordinateZ,
+        para->getParD(level)->distributions.f[0], // NOTE(review): presumably the base pointer the kernel indexes as DDStart - confirm against the kernel signature
+        para->getParD(level)->numberOfNodes, // same node count that sized the launch configuration above
+        para->getParD(level)->isEvenTimestep); // NOTE(review): presumably selects the kernel's EvenOrOdd pointer layout - confirm against the kernel signature
+    getLastCudaError("LB_Kernel_Cum_1h_Incomp_SP_27 execution failed"); // message names the kernel launched above
 }
 
 Cumulant1hIncompSP27::Cumulant1hIncompSP27(std::shared_ptr<Parameter> para, int level)
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/Cumulant1hSP27/Cumulant1hIncompSP27_Device.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/Cumulant1hSP27/Cumulant1hIncompSP27_Device.cu
index 0243046082ce1853011c6632d5a2f80364ebe0db..5130017acc642c92b064a500e79ff685ec2f6d97 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/Cumulant1hSP27/Cumulant1hIncompSP27_Device.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/Cumulant1hSP27/Cumulant1hIncompSP27_Device.cu
@@ -42,63 +42,63 @@ __global__ void LB_Kernel_Cum_1h_Incomp_SP_27(real omega,
 			Distributions27 D;
 			if (EvenOrOdd == true)
 			{
-				D.f[DIR_P00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_M00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_MPM * size_Mat];
 			}
 			else
 			{
-				D.f[DIR_M00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_P00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_MPM * size_Mat];
 			}
 
 			////////////////////////////////////////////////////////////////////////////////
@@ -159,33 +159,33 @@ __global__ void LB_Kernel_Cum_1h_Incomp_SP_27(real omega,
 			//unsigned int ktne = k;
 			//unsigned int kbsw = neighborZ[ksw];
 			//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-			real mfcbb = (D.f[DIR_P00])[k];//[ke   ];// +  c2over27 ;(D.f[DIR_P00   ])[k  ];//ke
-			real mfabb = (D.f[DIR_M00])[kw];//[kw   ];// +  c2over27 ;(D.f[DIR_M00   ])[kw ];
-			real mfbcb = (D.f[DIR_0P0])[k];//[kn   ];// +  c2over27 ;(D.f[DIR_0P0   ])[k  ];//kn
-			real mfbab = (D.f[DIR_0M0])[ks];//[ks   ];// +  c2over27 ;(D.f[DIR_0M0   ])[ks ];
-			real mfbbc = (D.f[DIR_00P])[k];//[kt   ];// +  c2over27 ;(D.f[DIR_00P   ])[k  ];//kt
-			real mfbba = (D.f[DIR_00M])[kb];//[kb   ];// +  c2over27 ;(D.f[DIR_00M   ])[kb ];
-			real mfccb = (D.f[DIR_PP0])[k];//[kne  ];// +  c1over54 ;(D.f[DIR_PP0  ])[k  ];//kne
-			real mfaab = (D.f[DIR_MM0])[ksw];//[ksw  ];// +  c1over54 ;(D.f[DIR_MM0  ])[ksw];
-			real mfcab = (D.f[DIR_PM0])[ks];//[kse  ];// +  c1over54 ;(D.f[DIR_PM0  ])[ks ];//kse
-			real mfacb = (D.f[DIR_MP0])[kw];//[knw  ];// +  c1over54 ;(D.f[DIR_MP0  ])[kw ];//knw
-			real mfcbc = (D.f[DIR_P0P])[k];//[kte  ];// +  c1over54 ;(D.f[DIR_P0P  ])[k  ];//kte
-			real mfaba = (D.f[DIR_M0M])[kbw];//[kbw  ];// +  c1over54 ;(D.f[DIR_M0M  ])[kbw];
-			real mfcba = (D.f[DIR_P0M])[kb];//[kbe  ];// +  c1over54 ;(D.f[DIR_P0M  ])[kb ];//kbe
-			real mfabc = (D.f[DIR_M0P])[kw];//[ktw  ];// +  c1over54 ;(D.f[DIR_M0P  ])[kw ];//ktw
-			real mfbcc = (D.f[DIR_0PP])[k];//[ktn  ];// +  c1over54 ;(D.f[DIR_0PP  ])[k  ];//ktn
-			real mfbaa = (D.f[DIR_0MM])[kbs];//[kbs  ];// +  c1over54 ;(D.f[DIR_0MM  ])[kbs];
-			real mfbca = (D.f[DIR_0PM])[kb];//[kbn  ];// +  c1over54 ;(D.f[DIR_0PM  ])[kb ];//kbn
-			real mfbac = (D.f[DIR_0MP])[ks];//[kts  ];// +  c1over54 ;(D.f[DIR_0MP  ])[ks ];//kts
+			real mfcbb = (D.f[DIR_P00])[k];//[ke   ];// +  c2over27 ;(D.f[DIR_P00])[k  ];//ke
+			real mfabb = (D.f[DIR_M00])[kw];//[kw   ];// +  c2over27 ;(D.f[DIR_M00])[kw ];
+			real mfbcb = (D.f[DIR_0P0])[k];//[kn   ];// +  c2over27 ;(D.f[DIR_0P0])[k  ];//kn
+			real mfbab = (D.f[DIR_0M0])[ks];//[ks   ];// +  c2over27 ;(D.f[DIR_0M0])[ks ];
+			real mfbbc = (D.f[DIR_00P])[k];//[kt   ];// +  c2over27 ;(D.f[DIR_00P])[k  ];//kt
+			real mfbba = (D.f[DIR_00M])[kb];//[kb   ];// +  c2over27 ;(D.f[DIR_00M])[kb ];
+			real mfccb = (D.f[DIR_PP0])[k];//[kne  ];// +  c1over54 ;(D.f[DIR_PP0])[k  ];//kne
+			real mfaab = (D.f[DIR_MM0])[ksw];//[ksw  ];// +  c1over54 ;(D.f[DIR_MM0])[ksw];
+			real mfcab = (D.f[DIR_PM0])[ks];//[kse  ];// +  c1over54 ;(D.f[DIR_PM0])[ks ];//kse
+			real mfacb = (D.f[DIR_MP0])[kw];//[knw  ];// +  c1over54 ;(D.f[DIR_MP0])[kw ];//knw
+			real mfcbc = (D.f[DIR_P0P])[k];//[kte  ];// +  c1over54 ;(D.f[DIR_P0P])[k  ];//kte
+			real mfaba = (D.f[DIR_M0M])[kbw];//[kbw  ];// +  c1over54 ;(D.f[DIR_M0M])[kbw];
+			real mfcba = (D.f[DIR_P0M])[kb];//[kbe  ];// +  c1over54 ;(D.f[DIR_P0M])[kb ];//kbe
+			real mfabc = (D.f[DIR_M0P])[kw];//[ktw  ];// +  c1over54 ;(D.f[DIR_M0P])[kw ];//ktw
+			real mfbcc = (D.f[DIR_0PP])[k];//[ktn  ];// +  c1over54 ;(D.f[DIR_0PP])[k  ];//ktn
+			real mfbaa = (D.f[DIR_0MM])[kbs];//[kbs  ];// +  c1over54 ;(D.f[DIR_0MM])[kbs];
+			real mfbca = (D.f[DIR_0PM])[kb];//[kbn  ];// +  c1over54 ;(D.f[DIR_0PM])[kb ];//kbn
+			real mfbac = (D.f[DIR_0MP])[ks];//[kts  ];// +  c1over54 ;(D.f[DIR_0MP])[ks ];//kts
 			real mfbbb = (D.f[DIR_000])[k];//[kzero];// +  c8over27 ;(D.f[DIR_000])[k  ];//kzero
-			real mfccc = (D.f[DIR_PPP])[k];//[ktne ];// +  c1over216;(D.f[DIR_PPP ])[k  ];//ktne
-			real mfaac = (D.f[DIR_MMP])[ksw];//[ktsw ];// +  c1over216;(D.f[DIR_MMP ])[ksw];//ktsw
-			real mfcac = (D.f[DIR_PMP])[ks];//[ktse ];// +  c1over216;(D.f[DIR_PMP ])[ks ];//ktse
-			real mfacc = (D.f[DIR_MPP])[kw];//[ktnw ];// +  c1over216;(D.f[DIR_MPP ])[kw ];//ktnw
-			real mfcca = (D.f[DIR_PPM])[kb];//[kbne ];// +  c1over216;(D.f[DIR_PPM ])[kb ];//kbne
-			real mfaaa = (D.f[DIR_MMM])[kbsw];//[kbsw ];// +  c1over216;(D.f[DIR_MMM ])[kbsw];
-			real mfcaa = (D.f[DIR_PMM])[kbs];//[kbse ];// +  c1over216;(D.f[DIR_PMM ])[kbs];//kbse
-			real mfaca = (D.f[DIR_MPM])[kbw];//[kbnw ];// +  c1over216;(D.f[DIR_MPM ])[kbw];//kbnw
+			real mfccc = (D.f[DIR_PPP])[k];//[ktne ];// +  c1over216;(D.f[DIR_PPP])[k  ];//ktne
+			real mfaac = (D.f[DIR_MMP])[ksw];//[ktsw ];// +  c1over216;(D.f[DIR_MMP])[ksw];//ktsw
+			real mfcac = (D.f[DIR_PMP])[ks];//[ktse ];// +  c1over216;(D.f[DIR_PMP])[ks ];//ktse
+			real mfacc = (D.f[DIR_MPP])[kw];//[ktnw ];// +  c1over216;(D.f[DIR_MPP])[kw ];//ktnw
+			real mfcca = (D.f[DIR_PPM])[kb];//[kbne ];// +  c1over216;(D.f[DIR_PPM])[kb ];//kbne
+			real mfaaa = (D.f[DIR_MMM])[kbsw];//[kbsw ];// +  c1over216;(D.f[DIR_MMM])[kbsw];
+			real mfcaa = (D.f[DIR_PMM])[kbs];//[kbse ];// +  c1over216;(D.f[DIR_PMM])[kbs];//kbse
+			real mfaca = (D.f[DIR_MPM])[kbw];//[kbnw ];// +  c1over216;(D.f[DIR_MPM])[kbw];//kbnw
 											////////////////////////////////////////////////////////////////////////////////////
 											//Ship
 			real coord0X = 281.125f;//7.5f;
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/CumulantIsoSP27/CumulantIsoIncompSP27.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/CumulantIsoSP27/CumulantIsoIncompSP27.cu
index 6551e1bde300e3a4d2a4f50cefdfff258edfacee..3a740bef6d7fbaa2883b3d36930d49bf9bf0bb3e 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/CumulantIsoSP27/CumulantIsoIncompSP27.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/CumulantIsoSP27/CumulantIsoIncompSP27.cu
@@ -2,6 +2,7 @@
 
 #include "CumulantIsoIncompSP27_Device.cuh"
 #include "Parameter/Parameter.h"
+#include "cuda/CudaGrid.h"
 
 std::shared_ptr<CumulantIsoIncompSP27> CumulantIsoIncompSP27::getNewInstance(std::shared_ptr<Parameter> para, int level)
 {
@@ -10,36 +11,21 @@ std::shared_ptr<CumulantIsoIncompSP27> CumulantIsoIncompSP27::getNewInstance(std
 
 void CumulantIsoIncompSP27::run()
 {
-	int size_Mat = para->getParD(level)->numberOfNodes;
-	int numberOfThreads = para->getParD(level)->numberofthreads;
-
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	LB_Kernel_Cum_IsoTest_Incomp_SP_27 << < grid, threads >> >(para->getParD(level)->omega,
-		para->getParD(level)->typeOfGridNode,
-		para->getParD(level)->neighborX,
-		para->getParD(level)->neighborY,
-		para->getParD(level)->neighborZ,
-		para->getParD(level)->distributions.f[0],
-		para->getParD(level)->dxxUx,
-		para->getParD(level)->dyyUy,
-		para->getParD(level)->dzzUz,
-		para->getParD(level)->numberOfNodes,
-		para->getParD(level)->isEvenTimestep);
-	getLastCudaError("LB_Kernel_Kum_IsoTest_SP_27 execution failed");
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(para->getParD(level)->numberofthreads, para->getParD(level)->numberOfNodes);
+
+    LB_Kernel_Cum_IsoTest_Incomp_SP_27 <<< grid.grid, grid.threads >>>(
+        para->getParD(level)->omega,
+        para->getParD(level)->typeOfGridNode,
+        para->getParD(level)->neighborX,
+        para->getParD(level)->neighborY,
+        para->getParD(level)->neighborZ,
+        para->getParD(level)->distributions.f[0],
+        para->getParD(level)->dxxUx,
+        para->getParD(level)->dyyUy,
+        para->getParD(level)->dzzUz,
+        para->getParD(level)->numberOfNodes,
+        para->getParD(level)->isEvenTimestep);
+    getLastCudaError("LB_Kernel_Cum_IsoTest_Incomp_SP_27 execution failed");
 }
 
 CumulantIsoIncompSP27::CumulantIsoIncompSP27(std::shared_ptr<Parameter> para, int level)
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/CumulantIsoSP27/CumulantIsoIncompSP27_Device.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/CumulantIsoSP27/CumulantIsoIncompSP27_Device.cu
index 64d697f2b0953cee75f4397e399a0e6128e486a2..1f0ef2ec84c8d4b9b4be57548bde396c3316a80d 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/CumulantIsoSP27/CumulantIsoIncompSP27_Device.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/CumulantIsoSP27/CumulantIsoIncompSP27_Device.cu
@@ -40,63 +40,63 @@ __global__ void LB_Kernel_Cum_IsoTest_Incomp_SP_27(real omega,
 			Distributions27 D;
 			if (EvenOrOdd == true)
 			{
-				D.f[DIR_P00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_M00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_MPM * size_Mat];
 			}
 			else
 			{
-				D.f[DIR_M00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_P00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_MPM * size_Mat];
 			}
 
 			////////////////////////////////////////////////////////////////////////////////
@@ -129,33 +129,33 @@ __global__ void LB_Kernel_Cum_IsoTest_Incomp_SP_27(real omega,
 			//unsigned int ktne = k;
 			unsigned int kbsw = neighborZ[ksw];
 			//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-			real mfcbb = (D.f[DIR_P00])[k];//[ke   ];// +  c2over27 ;(D.f[DIR_P00   ])[k  ];//ke
-			real mfabb = (D.f[DIR_M00])[kw];//[kw   ];// +  c2over27 ;(D.f[DIR_M00   ])[kw ];
-			real mfbcb = (D.f[DIR_0P0])[k];//[kn   ];// +  c2over27 ;(D.f[DIR_0P0   ])[k  ];//kn
-			real mfbab = (D.f[DIR_0M0])[ks];//[ks   ];// +  c2over27 ;(D.f[DIR_0M0   ])[ks ];
-			real mfbbc = (D.f[DIR_00P])[k];//[kt   ];// +  c2over27 ;(D.f[DIR_00P   ])[k  ];//kt
-			real mfbba = (D.f[DIR_00M])[kb];//[kb   ];// +  c2over27 ;(D.f[DIR_00M   ])[kb ];
-			real mfccb = (D.f[DIR_PP0])[k];//[kne  ];// +  c1over54 ;(D.f[DIR_PP0  ])[k  ];//kne
-			real mfaab = (D.f[DIR_MM0])[ksw];//[ksw  ];// +  c1over54 ;(D.f[DIR_MM0  ])[ksw];
-			real mfcab = (D.f[DIR_PM0])[ks];//[kse  ];// +  c1over54 ;(D.f[DIR_PM0  ])[ks ];//kse
-			real mfacb = (D.f[DIR_MP0])[kw];//[knw  ];// +  c1over54 ;(D.f[DIR_MP0  ])[kw ];//knw
-			real mfcbc = (D.f[DIR_P0P])[k];//[kte  ];// +  c1over54 ;(D.f[DIR_P0P  ])[k  ];//kte
-			real mfaba = (D.f[DIR_M0M])[kbw];//[kbw  ];// +  c1over54 ;(D.f[DIR_M0M  ])[kbw];
-			real mfcba = (D.f[DIR_P0M])[kb];//[kbe  ];// +  c1over54 ;(D.f[DIR_P0M  ])[kb ];//kbe
-			real mfabc = (D.f[DIR_M0P])[kw];//[ktw  ];// +  c1over54 ;(D.f[DIR_M0P  ])[kw ];//ktw
-			real mfbcc = (D.f[DIR_0PP])[k];//[ktn  ];// +  c1over54 ;(D.f[DIR_0PP  ])[k  ];//ktn
-			real mfbaa = (D.f[DIR_0MM])[kbs];//[kbs  ];// +  c1over54 ;(D.f[DIR_0MM  ])[kbs];
-			real mfbca = (D.f[DIR_0PM])[kb];//[kbn  ];// +  c1over54 ;(D.f[DIR_0PM  ])[kb ];//kbn
-			real mfbac = (D.f[DIR_0MP])[ks];//[kts  ];// +  c1over54 ;(D.f[DIR_0MP  ])[ks ];//kts
+			real mfcbb = (D.f[DIR_P00])[k];//[ke   ];// +  c2over27 ;(D.f[DIR_P00])[k  ];//ke
+			real mfabb = (D.f[DIR_M00])[kw];//[kw   ];// +  c2over27 ;(D.f[DIR_M00])[kw ];
+			real mfbcb = (D.f[DIR_0P0])[k];//[kn   ];// +  c2over27 ;(D.f[DIR_0P0])[k  ];//kn
+			real mfbab = (D.f[DIR_0M0])[ks];//[ks   ];// +  c2over27 ;(D.f[DIR_0M0])[ks ];
+			real mfbbc = (D.f[DIR_00P])[k];//[kt   ];// +  c2over27 ;(D.f[DIR_00P])[k  ];//kt
+			real mfbba = (D.f[DIR_00M])[kb];//[kb   ];// +  c2over27 ;(D.f[DIR_00M])[kb ];
+			real mfccb = (D.f[DIR_PP0])[k];//[kne  ];// +  c1over54 ;(D.f[DIR_PP0])[k  ];//kne
+			real mfaab = (D.f[DIR_MM0])[ksw];//[ksw  ];// +  c1over54 ;(D.f[DIR_MM0])[ksw];
+			real mfcab = (D.f[DIR_PM0])[ks];//[kse  ];// +  c1over54 ;(D.f[DIR_PM0])[ks ];//kse
+			real mfacb = (D.f[DIR_MP0])[kw];//[knw  ];// +  c1over54 ;(D.f[DIR_MP0])[kw ];//knw
+			real mfcbc = (D.f[DIR_P0P])[k];//[kte  ];// +  c1over54 ;(D.f[DIR_P0P])[k  ];//kte
+			real mfaba = (D.f[DIR_M0M])[kbw];//[kbw  ];// +  c1over54 ;(D.f[DIR_M0M])[kbw];
+			real mfcba = (D.f[DIR_P0M])[kb];//[kbe  ];// +  c1over54 ;(D.f[DIR_P0M])[kb ];//kbe
+			real mfabc = (D.f[DIR_M0P])[kw];//[ktw  ];// +  c1over54 ;(D.f[DIR_M0P])[kw ];//ktw
+			real mfbcc = (D.f[DIR_0PP])[k];//[ktn  ];// +  c1over54 ;(D.f[DIR_0PP])[k  ];//ktn
+			real mfbaa = (D.f[DIR_0MM])[kbs];//[kbs  ];// +  c1over54 ;(D.f[DIR_0MM])[kbs];
+			real mfbca = (D.f[DIR_0PM])[kb];//[kbn  ];// +  c1over54 ;(D.f[DIR_0PM])[kb ];//kbn
+			real mfbac = (D.f[DIR_0MP])[ks];//[kts  ];// +  c1over54 ;(D.f[DIR_0MP])[ks ];//kts
 			real mfbbb = (D.f[DIR_000])[k];//[kzero];// +  c8over27 ;(D.f[DIR_000])[k  ];//kzero
-			real mfccc = (D.f[DIR_PPP])[k];//[ktne ];// +  c1over216;(D.f[DIR_PPP ])[k  ];//ktne
-			real mfaac = (D.f[DIR_MMP])[ksw];//[ktsw ];// +  c1over216;(D.f[DIR_MMP ])[ksw];//ktsw
-			real mfcac = (D.f[DIR_PMP])[ks];//[ktse ];// +  c1over216;(D.f[DIR_PMP ])[ks ];//ktse
-			real mfacc = (D.f[DIR_MPP])[kw];//[ktnw ];// +  c1over216;(D.f[DIR_MPP ])[kw ];//ktnw
-			real mfcca = (D.f[DIR_PPM])[kb];//[kbne ];// +  c1over216;(D.f[DIR_PPM ])[kb ];//kbne
-			real mfaaa = (D.f[DIR_MMM])[kbsw];//[kbsw ];// +  c1over216;(D.f[DIR_MMM ])[kbsw];
-			real mfcaa = (D.f[DIR_PMM])[kbs];//[kbse ];// +  c1over216;(D.f[DIR_PMM ])[kbs];//kbse
-			real mfaca = (D.f[DIR_MPM])[kbw];//[kbnw ];// +  c1over216;(D.f[DIR_MPM ])[kbw];//kbnw
+			real mfccc = (D.f[DIR_PPP])[k];//[ktne ];// +  c1over216;(D.f[DIR_PPP])[k  ];//ktne
+			real mfaac = (D.f[DIR_MMP])[ksw];//[ktsw ];// +  c1over216;(D.f[DIR_MMP])[ksw];//ktsw
+			real mfcac = (D.f[DIR_PMP])[ks];//[ktse ];// +  c1over216;(D.f[DIR_PMP])[ks ];//ktse
+			real mfacc = (D.f[DIR_MPP])[kw];//[ktnw ];// +  c1over216;(D.f[DIR_MPP])[kw ];//ktnw
+			real mfcca = (D.f[DIR_PPM])[kb];//[kbne ];// +  c1over216;(D.f[DIR_PPM])[kb ];//kbne
+			real mfaaa = (D.f[DIR_MMM])[kbsw];//[kbsw ];// +  c1over216;(D.f[DIR_MMM])[kbsw];
+			real mfcaa = (D.f[DIR_PMM])[kbs];//[kbse ];// +  c1over216;(D.f[DIR_PMM])[kbs];//kbse
+			real mfaca = (D.f[DIR_MPM])[kbw];//[kbnw ];// +  c1over216;(D.f[DIR_MPM])[kbw];//kbnw
 											////////////////////////////////////////////////////////////////////////////////////
 											//slow
 											//real oMdrho = one - ((((mfccc+mfaaa) + (mfaca+mfcac)) + ((mfacc+mfcaa) + (mfaac+mfcca))) + 
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/CumulantK15/CumulantK15Incomp.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/CumulantK15/CumulantK15Incomp.cu
index 40cde56b007f70f98db13d5962f3e746b97637ef..7ae17b97170b4d8474acd6777f7c27411a962681 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/CumulantK15/CumulantK15Incomp.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/CumulantK15/CumulantK15Incomp.cu
@@ -2,6 +2,7 @@
 
 #include "CumulantK15Incomp_Device.cuh"
 #include "Parameter/Parameter.h"
+#include "cuda/CudaGrid.h"
 
 std::shared_ptr<CumulantK15Incomp> CumulantK15Incomp::getNewInstance(std::shared_ptr<Parameter> para, int level)
 {
@@ -10,33 +11,18 @@ std::shared_ptr<CumulantK15Incomp> CumulantK15Incomp::getNewInstance(std::shared
 
 void CumulantK15Incomp::run()
 {
-	int size_Mat = para->getParD(level)->numberOfNodes;
-	int numberOfThreads = para->getParD(level)->numberofthreads;
-
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	LB_Kernel_CumulantK15Incomp <<< grid, threads >>>(	para->getParD(level)->omega,
-														para->getParD(level)->typeOfGridNode,
-														para->getParD(level)->neighborX,
-														para->getParD(level)->neighborY,
-														para->getParD(level)->neighborZ,
-														para->getParD(level)->distributions.f[0],
-														para->getParD(level)->numberOfNodes,
-														para->getParD(level)->isEvenTimestep);
-	getLastCudaError("LB_Kernel_CumulantK15Incomp execution failed");
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(para->getParD(level)->numberofthreads, para->getParD(level)->numberOfNodes);
+
+    LB_Kernel_CumulantK15Incomp <<< grid.grid, grid.threads >>>(
+        para->getParD(level)->omega,
+        para->getParD(level)->typeOfGridNode,
+        para->getParD(level)->neighborX,
+        para->getParD(level)->neighborY,
+        para->getParD(level)->neighborZ,
+        para->getParD(level)->distributions.f[0],
+        para->getParD(level)->numberOfNodes,
+        para->getParD(level)->isEvenTimestep);
+    getLastCudaError("LB_Kernel_CumulantK15Incomp execution failed");
 }
 
 CumulantK15Incomp::CumulantK15Incomp(std::shared_ptr<Parameter> para, int level)
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/CumulantK15/CumulantK15Incomp_Device.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/CumulantK15/CumulantK15Incomp_Device.cu
index fc108ef1ef109a40735e250bd9a0f21491e4f977..01b60b3bf8067a81f99b912c4c0c700963f5448c 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/CumulantK15/CumulantK15Incomp_Device.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/CumulantK15/CumulantK15Incomp_Device.cu
@@ -37,63 +37,63 @@ __global__ void LB_Kernel_CumulantK15Incomp(real omega,
 			Distributions27 D;
 			if (EvenOrOdd == true)
 			{
-				D.f[DIR_P00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_M00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_MPM * size_Mat];
 			}
 			else
 			{
-				D.f[DIR_M00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_P00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_MPM * size_Mat];
 			}
 
 			////////////////////////////////////////////////////////////////////////////////
@@ -154,33 +154,33 @@ __global__ void LB_Kernel_CumulantK15Incomp(real omega,
 			//unsigned int ktne = k;
 			//unsigned int kbsw = neighborZ[ksw];
 			//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-			real mfcbb = (D.f[DIR_P00])[k];//[ke   ];// +  c2over27 ;(D.f[DIR_P00   ])[k  ];//ke
-			real mfabb = (D.f[DIR_M00])[kw];//[kw   ];// +  c2over27 ;(D.f[DIR_M00   ])[kw ];
-			real mfbcb = (D.f[DIR_0P0])[k];//[kn   ];// +  c2over27 ;(D.f[DIR_0P0   ])[k  ];//kn
-			real mfbab = (D.f[DIR_0M0])[ks];//[ks   ];// +  c2over27 ;(D.f[DIR_0M0   ])[ks ];
-			real mfbbc = (D.f[DIR_00P])[k];//[kt   ];// +  c2over27 ;(D.f[DIR_00P   ])[k  ];//kt
-			real mfbba = (D.f[DIR_00M])[kb];//[kb   ];// +  c2over27 ;(D.f[DIR_00M   ])[kb ];
-			real mfccb = (D.f[DIR_PP0])[k];//[kne  ];// +  c1over54 ;(D.f[DIR_PP0  ])[k  ];//kne
-			real mfaab = (D.f[DIR_MM0])[ksw];//[ksw  ];// +  c1over54 ;(D.f[DIR_MM0  ])[ksw];
-			real mfcab = (D.f[DIR_PM0])[ks];//[kse  ];// +  c1over54 ;(D.f[DIR_PM0  ])[ks ];//kse
-			real mfacb = (D.f[DIR_MP0])[kw];//[knw  ];// +  c1over54 ;(D.f[DIR_MP0  ])[kw ];//knw
-			real mfcbc = (D.f[DIR_P0P])[k];//[kte  ];// +  c1over54 ;(D.f[DIR_P0P  ])[k  ];//kte
-			real mfaba = (D.f[DIR_M0M])[kbw];//[kbw  ];// +  c1over54 ;(D.f[DIR_M0M  ])[kbw];
-			real mfcba = (D.f[DIR_P0M])[kb];//[kbe  ];// +  c1over54 ;(D.f[DIR_P0M  ])[kb ];//kbe
-			real mfabc = (D.f[DIR_M0P])[kw];//[ktw  ];// +  c1over54 ;(D.f[DIR_M0P  ])[kw ];//ktw
-			real mfbcc = (D.f[DIR_0PP])[k];//[ktn  ];// +  c1over54 ;(D.f[DIR_0PP  ])[k  ];//ktn
-			real mfbaa = (D.f[DIR_0MM])[kbs];//[kbs  ];// +  c1over54 ;(D.f[DIR_0MM  ])[kbs];
-			real mfbca = (D.f[DIR_0PM])[kb];//[kbn  ];// +  c1over54 ;(D.f[DIR_0PM  ])[kb ];//kbn
-			real mfbac = (D.f[DIR_0MP])[ks];//[kts  ];// +  c1over54 ;(D.f[DIR_0MP  ])[ks ];//kts
+			real mfcbb = (D.f[DIR_P00])[k];//[ke   ];// +  c2over27 ;(D.f[DIR_P00])[k  ];//ke
+			real mfabb = (D.f[DIR_M00])[kw];//[kw   ];// +  c2over27 ;(D.f[DIR_M00])[kw ];
+			real mfbcb = (D.f[DIR_0P0])[k];//[kn   ];// +  c2over27 ;(D.f[DIR_0P0])[k  ];//kn
+			real mfbab = (D.f[DIR_0M0])[ks];//[ks   ];// +  c2over27 ;(D.f[DIR_0M0])[ks ];
+			real mfbbc = (D.f[DIR_00P])[k];//[kt   ];// +  c2over27 ;(D.f[DIR_00P])[k  ];//kt
+			real mfbba = (D.f[DIR_00M])[kb];//[kb   ];// +  c2over27 ;(D.f[DIR_00M])[kb ];
+			real mfccb = (D.f[DIR_PP0])[k];//[kne  ];// +  c1over54 ;(D.f[DIR_PP0])[k  ];//kne
+			real mfaab = (D.f[DIR_MM0])[ksw];//[ksw  ];// +  c1over54 ;(D.f[DIR_MM0])[ksw];
+			real mfcab = (D.f[DIR_PM0])[ks];//[kse  ];// +  c1over54 ;(D.f[DIR_PM0])[ks ];//kse
+			real mfacb = (D.f[DIR_MP0])[kw];//[knw  ];// +  c1over54 ;(D.f[DIR_MP0])[kw ];//knw
+			real mfcbc = (D.f[DIR_P0P])[k];//[kte  ];// +  c1over54 ;(D.f[DIR_P0P])[k  ];//kte
+			real mfaba = (D.f[DIR_M0M])[kbw];//[kbw  ];// +  c1over54 ;(D.f[DIR_M0M])[kbw];
+			real mfcba = (D.f[DIR_P0M])[kb];//[kbe  ];// +  c1over54 ;(D.f[DIR_P0M])[kb ];//kbe
+			real mfabc = (D.f[DIR_M0P])[kw];//[ktw  ];// +  c1over54 ;(D.f[DIR_M0P])[kw ];//ktw
+			real mfbcc = (D.f[DIR_0PP])[k];//[ktn  ];// +  c1over54 ;(D.f[DIR_0PP])[k  ];//ktn
+			real mfbaa = (D.f[DIR_0MM])[kbs];//[kbs  ];// +  c1over54 ;(D.f[DIR_0MM])[kbs];
+			real mfbca = (D.f[DIR_0PM])[kb];//[kbn  ];// +  c1over54 ;(D.f[DIR_0PM])[kb ];//kbn
+			real mfbac = (D.f[DIR_0MP])[ks];//[kts  ];// +  c1over54 ;(D.f[DIR_0MP])[ks ];//kts
 			real mfbbb = (D.f[DIR_000])[k];//[kzero];// +  c8over27 ;(D.f[DIR_000])[k  ];//kzero
-			real mfccc = (D.f[DIR_PPP])[k];//[ktne ];// +  c1over216;(D.f[DIR_PPP ])[k  ];//ktne
-			real mfaac = (D.f[DIR_MMP])[ksw];//[ktsw ];// +  c1over216;(D.f[DIR_MMP ])[ksw];//ktsw
-			real mfcac = (D.f[DIR_PMP])[ks];//[ktse ];// +  c1over216;(D.f[DIR_PMP ])[ks ];//ktse
-			real mfacc = (D.f[DIR_MPP])[kw];//[ktnw ];// +  c1over216;(D.f[DIR_MPP ])[kw ];//ktnw
-			real mfcca = (D.f[DIR_PPM])[kb];//[kbne ];// +  c1over216;(D.f[DIR_PPM ])[kb ];//kbne
-			real mfaaa = (D.f[DIR_MMM])[kbsw];//[kbsw ];// +  c1over216;(D.f[DIR_MMM ])[kbsw];
-			real mfcaa = (D.f[DIR_PMM])[kbs];//[kbse ];// +  c1over216;(D.f[DIR_PMM ])[kbs];//kbse
-			real mfaca = (D.f[DIR_MPM])[kbw];//[kbnw ];// +  c1over216;(D.f[DIR_MPM ])[kbw];//kbnw
+			real mfccc = (D.f[DIR_PPP])[k];//[ktne ];// +  c1over216;(D.f[DIR_PPP])[k  ];//ktne
+			real mfaac = (D.f[DIR_MMP])[ksw];//[ktsw ];// +  c1over216;(D.f[DIR_MMP])[ksw];//ktsw
+			real mfcac = (D.f[DIR_PMP])[ks];//[ktse ];// +  c1over216;(D.f[DIR_PMP])[ks ];//ktse
+			real mfacc = (D.f[DIR_MPP])[kw];//[ktnw ];// +  c1over216;(D.f[DIR_MPP])[kw ];//ktnw
+			real mfcca = (D.f[DIR_PPM])[kb];//[kbne ];// +  c1over216;(D.f[DIR_PPM])[kb ];//kbne
+			real mfaaa = (D.f[DIR_MMM])[kbsw];//[kbsw ];// +  c1over216;(D.f[DIR_MMM])[kbsw];
+			real mfcaa = (D.f[DIR_PMM])[kbs];//[kbse ];// +  c1over216;(D.f[DIR_PMM])[kbs];//kbse
+			real mfaca = (D.f[DIR_MPM])[kbw];//[kbnw ];// +  c1over216;(D.f[DIR_MPM])[kbw];//kbnw
 											////////////////////////////////////////////////////////////////////////////////////
 											//slow
 											//real oMdrho = one - ((((mfccc+mfaaa) + (mfaca+mfcac)) + ((mfacc+mfcaa) + (mfaac+mfcca))) + 
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/MRT/MRTIncompSP27.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/MRT/MRTIncompSP27.cu
index c4311309e4653f2862e303dacb3e2d07646a5061..7645703e0d40176b136762d6b48633f4a9c0d950 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/MRT/MRTIncompSP27.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/MRT/MRTIncompSP27.cu
@@ -2,6 +2,7 @@
 
 #include "MRTIncompSP27_Device.cuh"
 #include "Parameter/Parameter.h"
+#include "cuda/CudaGrid.h"
 
 std::shared_ptr<MRTIncompSP27> MRTIncompSP27::getNewInstance(std::shared_ptr<Parameter> para, int level)
 {
@@ -10,33 +11,18 @@ std::shared_ptr<MRTIncompSP27> MRTIncompSP27::getNewInstance(std::shared_ptr<Par
 
 void MRTIncompSP27::run()
 {
-	int size_Mat = para->getParD(level)->numberOfNodes;
-	int numberOfThreads = para->getParD(level)->numberofthreads;
-
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	LB_Kernel_MRT_Incomp_SP_27 << < grid, threads >> >(	para->getParD(level)->omega,
-													para->getParD(level)->typeOfGridNode,
-													para->getParD(level)->neighborX,
-													para->getParD(level)->neighborY,
-													para->getParD(level)->neighborZ,
-													para->getParD(level)->distributions.f[0],
-													para->getParD(level)->numberOfNodes,
-													para->getParD(level)->isEvenTimestep);
-	getLastCudaError("LB_Kernel_MRT_SP_27 execution failed");
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(para->getParD(level)->numberofthreads, para->getParD(level)->numberOfNodes);
+
+    LB_Kernel_MRT_Incomp_SP_27 <<< grid.grid, grid.threads >>>(
+        para->getParD(level)->omega,
+        para->getParD(level)->typeOfGridNode,
+        para->getParD(level)->neighborX,
+        para->getParD(level)->neighborY,
+        para->getParD(level)->neighborZ,
+        para->getParD(level)->distributions.f[0],
+        para->getParD(level)->numberOfNodes,
+        para->getParD(level)->isEvenTimestep);
+    getLastCudaError("LB_Kernel_MRT_Incomp_SP_27 execution failed");
 }
 
 MRTIncompSP27::MRTIncompSP27(std::shared_ptr<Parameter> para, int level)
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/MRT/MRTIncompSP27_Device.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/MRT/MRTIncompSP27_Device.cu
index f6a283c2f9ba3c15729061ebeabcf34edd0abe97..a6663cc3c72696fda2ce9819203cd19195088730 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/MRT/MRTIncompSP27_Device.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/MRT/MRTIncompSP27_Device.cu
@@ -37,63 +37,63 @@ __global__ void LB_Kernel_MRT_Incomp_SP_27(real omega,
 			Distributions27 D;
 			if (EvenOrOdd == true)
 			{
-				D.f[DIR_P00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_M00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_MPM * size_Mat];
 			}
 			else
 			{
-				D.f[DIR_M00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_P00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_MPM] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_PMM] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_MPM * size_Mat];
 			}
 
 			////////////////////////////////////////////////////////////////////////////////
@@ -126,33 +126,33 @@ __global__ void LB_Kernel_MRT_Incomp_SP_27(real omega,
 			//unsigned int ktne = k;
 			unsigned int kbsw = neighborZ[ksw];
 			//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-			real mfcbb = (D.f[DIR_P00])[k];//[ke   ];// +  c2over27 ;(D.f[DIR_P00   ])[k  ];//ke
-			real mfabb = (D.f[DIR_M00])[kw];//[kw   ];// +  c2over27 ;(D.f[DIR_M00   ])[kw ];
-			real mfbcb = (D.f[DIR_0P0])[k];//[kn   ];// +  c2over27 ;(D.f[DIR_0P0   ])[k  ];//kn
-			real mfbab = (D.f[DIR_0M0])[ks];//[ks   ];// +  c2over27 ;(D.f[DIR_0M0   ])[ks ];
-			real mfbbc = (D.f[DIR_00P])[k];//[kt   ];// +  c2over27 ;(D.f[DIR_00P   ])[k  ];//kt
-			real mfbba = (D.f[DIR_00M])[kb];//[kb   ];// +  c2over27 ;(D.f[DIR_00M   ])[kb ];
-			real mfccb = (D.f[DIR_PP0])[k];//[kne  ];// +  c1over54 ;(D.f[DIR_PP0  ])[k  ];//kne
-			real mfaab = (D.f[DIR_MM0])[ksw];//[ksw  ];// +  c1over54 ;(D.f[DIR_MM0  ])[ksw];
-			real mfcab = (D.f[DIR_PM0])[ks];//[kse  ];// +  c1over54 ;(D.f[DIR_PM0  ])[ks ];//kse
-			real mfacb = (D.f[DIR_MP0])[kw];//[knw  ];// +  c1over54 ;(D.f[DIR_MP0  ])[kw ];//knw
-			real mfcbc = (D.f[DIR_P0P])[k];//[kte  ];// +  c1over54 ;(D.f[DIR_P0P  ])[k  ];//kte
-			real mfaba = (D.f[DIR_M0M])[kbw];//[kbw  ];// +  c1over54 ;(D.f[DIR_M0M  ])[kbw];
-			real mfcba = (D.f[DIR_P0M])[kb];//[kbe  ];// +  c1over54 ;(D.f[DIR_P0M  ])[kb ];//kbe
-			real mfabc = (D.f[DIR_M0P])[kw];//[ktw  ];// +  c1over54 ;(D.f[DIR_M0P  ])[kw ];//ktw
-			real mfbcc = (D.f[DIR_0PP])[k];//[ktn  ];// +  c1over54 ;(D.f[DIR_0PP  ])[k  ];//ktn
-			real mfbaa = (D.f[DIR_0MM])[kbs];//[kbs  ];// +  c1over54 ;(D.f[DIR_0MM  ])[kbs];
-			real mfbca = (D.f[DIR_0PM])[kb];//[kbn  ];// +  c1over54 ;(D.f[DIR_0PM  ])[kb ];//kbn
-			real mfbac = (D.f[DIR_0MP])[ks];//[kts  ];// +  c1over54 ;(D.f[DIR_0MP  ])[ks ];//kts
+			real mfcbb = (D.f[DIR_P00])[k];//[ke   ];// +  c2over27 ;(D.f[DIR_P00])[k  ];//ke
+			real mfabb = (D.f[DIR_M00])[kw];//[kw   ];// +  c2over27 ;(D.f[DIR_M00])[kw ];
+			real mfbcb = (D.f[DIR_0P0])[k];//[kn   ];// +  c2over27 ;(D.f[DIR_0P0])[k  ];//kn
+			real mfbab = (D.f[DIR_0M0])[ks];//[ks   ];// +  c2over27 ;(D.f[DIR_0M0])[ks ];
+			real mfbbc = (D.f[DIR_00P])[k];//[kt   ];// +  c2over27 ;(D.f[DIR_00P])[k  ];//kt
+			real mfbba = (D.f[DIR_00M])[kb];//[kb   ];// +  c2over27 ;(D.f[DIR_00M])[kb ];
+			real mfccb = (D.f[DIR_PP0])[k];//[kne  ];// +  c1over54 ;(D.f[DIR_PP0])[k  ];//kne
+			real mfaab = (D.f[DIR_MM0])[ksw];//[ksw  ];// +  c1over54 ;(D.f[DIR_MM0])[ksw];
+			real mfcab = (D.f[DIR_PM0])[ks];//[kse  ];// +  c1over54 ;(D.f[DIR_PM0])[ks ];//kse
+			real mfacb = (D.f[DIR_MP0])[kw];//[knw  ];// +  c1over54 ;(D.f[DIR_MP0])[kw ];//knw
+			real mfcbc = (D.f[DIR_P0P])[k];//[kte  ];// +  c1over54 ;(D.f[DIR_P0P])[k  ];//kte
+			real mfaba = (D.f[DIR_M0M])[kbw];//[kbw  ];// +  c1over54 ;(D.f[DIR_M0M])[kbw];
+			real mfcba = (D.f[DIR_P0M])[kb];//[kbe  ];// +  c1over54 ;(D.f[DIR_P0M])[kb ];//kbe
+			real mfabc = (D.f[DIR_M0P])[kw];//[ktw  ];// +  c1over54 ;(D.f[DIR_M0P])[kw ];//ktw
+			real mfbcc = (D.f[DIR_0PP])[k];//[ktn  ];// +  c1over54 ;(D.f[DIR_0PP])[k  ];//ktn
+			real mfbaa = (D.f[DIR_0MM])[kbs];//[kbs  ];// +  c1over54 ;(D.f[DIR_0MM])[kbs];
+			real mfbca = (D.f[DIR_0PM])[kb];//[kbn  ];// +  c1over54 ;(D.f[DIR_0PM])[kb ];//kbn
+			real mfbac = (D.f[DIR_0MP])[ks];//[kts  ];// +  c1over54 ;(D.f[DIR_0MP])[ks ];//kts
 			real mfbbb = (D.f[DIR_000])[k];//[kzero];// +  c8over27 ;(D.f[DIR_000])[k  ];//kzero
-			real mfccc = (D.f[DIR_PPP])[k];//[ktne ];// +  c1over216;(D.f[DIR_PPP ])[k  ];//ktne
-			real mfaac = (D.f[DIR_MMP])[ksw];//[ktsw ];// +  c1over216;(D.f[DIR_MMP ])[ksw];//ktsw
-			real mfcac = (D.f[DIR_PMP])[ks];//[ktse ];// +  c1over216;(D.f[DIR_PMP ])[ks ];//ktse
-			real mfacc = (D.f[DIR_MPP])[kw];//[ktnw ];// +  c1over216;(D.f[DIR_MPP ])[kw ];//ktnw
-			real mfcca = (D.f[DIR_PPM])[kb];//[kbne ];// +  c1over216;(D.f[DIR_PPM ])[kb ];//kbne
-			real mfaaa = (D.f[DIR_MMM])[kbsw];//[kbsw ];// +  c1over216;(D.f[DIR_MMM ])[kbsw];
-			real mfcaa = (D.f[DIR_PMM])[kbs];//[kbse ];// +  c1over216;(D.f[DIR_PMM ])[kbs];//kbse
-			real mfaca = (D.f[DIR_MPM])[kbw];//[kbnw ];// +  c1over216;(D.f[DIR_MPM ])[kbw];//kbnw
+			real mfccc = (D.f[DIR_PPP])[k];//[ktne ];// +  c1over216;(D.f[DIR_PPP])[k  ];//ktne
+			real mfaac = (D.f[DIR_MMP])[ksw];//[ktsw ];// +  c1over216;(D.f[DIR_MMP])[ksw];//ktsw
+			real mfcac = (D.f[DIR_PMP])[ks];//[ktse ];// +  c1over216;(D.f[DIR_PMP])[ks ];//ktse
+			real mfacc = (D.f[DIR_MPP])[kw];//[ktnw ];// +  c1over216;(D.f[DIR_MPP])[kw ];//ktnw
+			real mfcca = (D.f[DIR_PPM])[kb];//[kbne ];// +  c1over216;(D.f[DIR_PPM])[kb ];//kbne
+			real mfaaa = (D.f[DIR_MMM])[kbsw];//[kbsw ];// +  c1over216;(D.f[DIR_MMM])[kbsw];
+			real mfcaa = (D.f[DIR_PMM])[kbs];//[kbse ];// +  c1over216;(D.f[DIR_PMM])[kbs];//kbse
+			real mfaca = (D.f[DIR_MPM])[kbw];//[kbnw ];// +  c1over216;(D.f[DIR_MPM])[kbw];//kbnw
 											////////////////////////////////////////////////////////////////////////////////////
 											//slow
 											//real oMdrho = one - ((((mfccc+mfaaa) + (mfaca+mfcac)) + ((mfacc+mfcaa) + (mfaac+mfcca))) + 
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/PorousMediaKernels/FluidFlow/Compressible/CumulantOne/PMCumulantOneCompSP27.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/PorousMediaKernels/FluidFlow/Compressible/CumulantOne/PMCumulantOneCompSP27.cu
index 77527d5bedab08fdcacb3a103727ae25274b2aa4..43724f9165e2bb8dca1705ae0053612df92413ec 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/PorousMediaKernels/FluidFlow/Compressible/CumulantOne/PMCumulantOneCompSP27.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/PorousMediaKernels/FluidFlow/Compressible/CumulantOne/PMCumulantOneCompSP27.cu
@@ -11,7 +11,7 @@ std::shared_ptr<PMCumulantOneCompSP27> PMCumulantOneCompSP27::getNewInstance(std
 
 void PMCumulantOneCompSP27::run()
 {
-	int size_Mat = para->getParD(level)->numberOfNodes;
+	int size_Mat = (int)para->getParD(level)->numberOfNodes;
 	int numberOfThreads = para->getParD(level)->numberofthreads;
 
 	int Grid = (size_Mat / numberOfThreads) + 1;
@@ -30,7 +30,8 @@ void PMCumulantOneCompSP27::run()
 	dim3 threads(numberOfThreads, 1, 1);
 
 	for (int i = 0; i < pm.size(); i++) {
-		LB_Kernel_PM_Cum_One_Comp_SP_27 << < grid, threads >> >(para->getParD(level)->omega,
+		LB_Kernel_PM_Cum_One_Comp_SP_27 <<< grid, threads >>>(
+			para->getParD(level)->omega,
 			para->getParD(level)->neighborX,
 			para->getParD(level)->neighborY,
 			para->getParD(level)->neighborZ,
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/PorousMediaKernels/FluidFlow/Compressible/CumulantOne/PMCumulantOneCompSP27_Device.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/PorousMediaKernels/FluidFlow/Compressible/CumulantOne/PMCumulantOneCompSP27_Device.cu
index 89975d1663fb236295c22b81af4b0544ffc489bb..4f5f61f9d7a61fee8fd3438de5c588c861d8604c 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/PorousMediaKernels/FluidFlow/Compressible/CumulantOne/PMCumulantOneCompSP27_Device.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/PorousMediaKernels/FluidFlow/Compressible/CumulantOne/PMCumulantOneCompSP27_Device.cu
@@ -11,7 +11,7 @@ __global__ void LB_Kernel_PM_Cum_One_Comp_SP_27(real omega,
 	unsigned int* neighborY,
 	unsigned int* neighborZ,
 	real* DDStart,
-	int size_Mat,
+	unsigned long long numberOfLBnodes,
 	int level,
 	real* forces,
 	real porosity,
@@ -24,63 +24,63 @@ __global__ void LB_Kernel_PM_Cum_One_Comp_SP_27(real omega,
 	Distributions27 D;
 	if (EvenOrOdd == true)
 	{
-		D.f[DIR_P00] = &DDStart[DIR_P00   *size_Mat];
-		D.f[DIR_M00] = &DDStart[DIR_M00   *size_Mat];
-		D.f[DIR_0P0] = &DDStart[DIR_0P0   *size_Mat];
-		D.f[DIR_0M0] = &DDStart[DIR_0M0   *size_Mat];
-		D.f[DIR_00P] = &DDStart[DIR_00P   *size_Mat];
-		D.f[DIR_00M] = &DDStart[DIR_00M   *size_Mat];
-		D.f[DIR_PP0] = &DDStart[DIR_PP0  *size_Mat];
-		D.f[DIR_MM0] = &DDStart[DIR_MM0  *size_Mat];
-		D.f[DIR_PM0] = &DDStart[DIR_PM0  *size_Mat];
-		D.f[DIR_MP0] = &DDStart[DIR_MP0  *size_Mat];
-		D.f[DIR_P0P] = &DDStart[DIR_P0P  *size_Mat];
-		D.f[DIR_M0M] = &DDStart[DIR_M0M  *size_Mat];
-		D.f[DIR_P0M] = &DDStart[DIR_P0M  *size_Mat];
-		D.f[DIR_M0P] = &DDStart[DIR_M0P  *size_Mat];
-		D.f[DIR_0PP] = &DDStart[DIR_0PP  *size_Mat];
-		D.f[DIR_0MM] = &DDStart[DIR_0MM  *size_Mat];
-		D.f[DIR_0PM] = &DDStart[DIR_0PM  *size_Mat];
-		D.f[DIR_0MP] = &DDStart[DIR_0MP  *size_Mat];
-		D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-		D.f[DIR_PPP] = &DDStart[DIR_PPP *size_Mat];
-		D.f[DIR_MMP] = &DDStart[DIR_MMP *size_Mat];
-		D.f[DIR_PMP] = &DDStart[DIR_PMP *size_Mat];
-		D.f[DIR_MPP] = &DDStart[DIR_MPP *size_Mat];
-		D.f[DIR_PPM] = &DDStart[DIR_PPM *size_Mat];
-		D.f[DIR_MMM] = &DDStart[DIR_MMM *size_Mat];
-		D.f[DIR_PMM] = &DDStart[DIR_PMM *size_Mat];
-		D.f[DIR_MPM] = &DDStart[DIR_MPM *size_Mat];
+		D.f[DIR_P00] = &DDStart[DIR_P00 * numberOfLBnodes];
+		D.f[DIR_M00] = &DDStart[DIR_M00 * numberOfLBnodes];
+		D.f[DIR_0P0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+		D.f[DIR_0M0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+		D.f[DIR_00P] = &DDStart[DIR_00P * numberOfLBnodes];
+		D.f[DIR_00M] = &DDStart[DIR_00M * numberOfLBnodes];
+		D.f[DIR_PP0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+		D.f[DIR_MM0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+		D.f[DIR_PM0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+		D.f[DIR_MP0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+		D.f[DIR_P0P] = &DDStart[DIR_P0P * numberOfLBnodes];
+		D.f[DIR_M0M] = &DDStart[DIR_M0M * numberOfLBnodes];
+		D.f[DIR_P0M] = &DDStart[DIR_P0M * numberOfLBnodes];
+		D.f[DIR_M0P] = &DDStart[DIR_M0P * numberOfLBnodes];
+		D.f[DIR_0PP] = &DDStart[DIR_0PP * numberOfLBnodes];
+		D.f[DIR_0MM] = &DDStart[DIR_0MM * numberOfLBnodes];
+		D.f[DIR_0PM] = &DDStart[DIR_0PM * numberOfLBnodes];
+		D.f[DIR_0MP] = &DDStart[DIR_0MP * numberOfLBnodes];
+		D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+		D.f[DIR_PPP] = &DDStart[DIR_PPP * numberOfLBnodes];
+		D.f[DIR_MMP] = &DDStart[DIR_MMP * numberOfLBnodes];
+		D.f[DIR_PMP] = &DDStart[DIR_PMP * numberOfLBnodes];
+		D.f[DIR_MPP] = &DDStart[DIR_MPP * numberOfLBnodes];
+		D.f[DIR_PPM] = &DDStart[DIR_PPM * numberOfLBnodes];
+		D.f[DIR_MMM] = &DDStart[DIR_MMM * numberOfLBnodes];
+		D.f[DIR_PMM] = &DDStart[DIR_PMM * numberOfLBnodes];
+		D.f[DIR_MPM] = &DDStart[DIR_MPM * numberOfLBnodes];
 	}
 	else
 	{
-		D.f[DIR_M00] = &DDStart[DIR_P00   *size_Mat];
-		D.f[DIR_P00] = &DDStart[DIR_M00   *size_Mat];
-		D.f[DIR_0M0] = &DDStart[DIR_0P0   *size_Mat];
-		D.f[DIR_0P0] = &DDStart[DIR_0M0   *size_Mat];
-		D.f[DIR_00M] = &DDStart[DIR_00P   *size_Mat];
-		D.f[DIR_00P] = &DDStart[DIR_00M   *size_Mat];
-		D.f[DIR_MM0] = &DDStart[DIR_PP0  *size_Mat];
-		D.f[DIR_PP0] = &DDStart[DIR_MM0  *size_Mat];
-		D.f[DIR_MP0] = &DDStart[DIR_PM0  *size_Mat];
-		D.f[DIR_PM0] = &DDStart[DIR_MP0  *size_Mat];
-		D.f[DIR_M0M] = &DDStart[DIR_P0P  *size_Mat];
-		D.f[DIR_P0P] = &DDStart[DIR_M0M  *size_Mat];
-		D.f[DIR_M0P] = &DDStart[DIR_P0M  *size_Mat];
-		D.f[DIR_P0M] = &DDStart[DIR_M0P  *size_Mat];
-		D.f[DIR_0MM] = &DDStart[DIR_0PP  *size_Mat];
-		D.f[DIR_0PP] = &DDStart[DIR_0MM  *size_Mat];
-		D.f[DIR_0MP] = &DDStart[DIR_0PM  *size_Mat];
-		D.f[DIR_0PM] = &DDStart[DIR_0MP  *size_Mat];
-		D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-		D.f[DIR_MMM] = &DDStart[DIR_PPP *size_Mat];
-		D.f[DIR_PPM] = &DDStart[DIR_MMP *size_Mat];
-		D.f[DIR_MPM] = &DDStart[DIR_PMP *size_Mat];
-		D.f[DIR_PMM] = &DDStart[DIR_MPP *size_Mat];
-		D.f[DIR_MMP] = &DDStart[DIR_PPM *size_Mat];
-		D.f[DIR_PPP] = &DDStart[DIR_MMM *size_Mat];
-		D.f[DIR_MPP] = &DDStart[DIR_PMM *size_Mat];
-		D.f[DIR_PMP] = &DDStart[DIR_MPM *size_Mat];
+		D.f[DIR_M00] = &DDStart[DIR_P00 * numberOfLBnodes];
+		D.f[DIR_P00] = &DDStart[DIR_M00 * numberOfLBnodes];
+		D.f[DIR_0M0] = &DDStart[DIR_0P0 * numberOfLBnodes];
+		D.f[DIR_0P0] = &DDStart[DIR_0M0 * numberOfLBnodes];
+		D.f[DIR_00M] = &DDStart[DIR_00P * numberOfLBnodes];
+		D.f[DIR_00P] = &DDStart[DIR_00M * numberOfLBnodes];
+		D.f[DIR_MM0] = &DDStart[DIR_PP0 * numberOfLBnodes];
+		D.f[DIR_PP0] = &DDStart[DIR_MM0 * numberOfLBnodes];
+		D.f[DIR_MP0] = &DDStart[DIR_PM0 * numberOfLBnodes];
+		D.f[DIR_PM0] = &DDStart[DIR_MP0 * numberOfLBnodes];
+		D.f[DIR_M0M] = &DDStart[DIR_P0P * numberOfLBnodes];
+		D.f[DIR_P0P] = &DDStart[DIR_M0M * numberOfLBnodes];
+		D.f[DIR_M0P] = &DDStart[DIR_P0M * numberOfLBnodes];
+		D.f[DIR_P0M] = &DDStart[DIR_M0P * numberOfLBnodes];
+		D.f[DIR_0MM] = &DDStart[DIR_0PP * numberOfLBnodes];
+		D.f[DIR_0PP] = &DDStart[DIR_0MM * numberOfLBnodes];
+		D.f[DIR_0MP] = &DDStart[DIR_0PM * numberOfLBnodes];
+		D.f[DIR_0PM] = &DDStart[DIR_0MP * numberOfLBnodes];
+		D.f[DIR_000] = &DDStart[DIR_000 * numberOfLBnodes];
+		D.f[DIR_MMM] = &DDStart[DIR_PPP * numberOfLBnodes];
+		D.f[DIR_PPM] = &DDStart[DIR_MMP * numberOfLBnodes];
+		D.f[DIR_MPM] = &DDStart[DIR_PMP * numberOfLBnodes];
+		D.f[DIR_PMM] = &DDStart[DIR_MPP * numberOfLBnodes];
+		D.f[DIR_MMP] = &DDStart[DIR_PPM * numberOfLBnodes];
+		D.f[DIR_PPP] = &DDStart[DIR_MMM * numberOfLBnodes];
+		D.f[DIR_MPP] = &DDStart[DIR_PMM * numberOfLBnodes];
+		D.f[DIR_PMP] = &DDStart[DIR_MPM * numberOfLBnodes];
 	}
 
 	////////////////////////////////////////////////////////////////////////////////
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/PorousMediaKernels/FluidFlow/Compressible/CumulantOne/PMCumulantOneCompSP27_Device.cuh b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/PorousMediaKernels/FluidFlow/Compressible/CumulantOne/PMCumulantOneCompSP27_Device.cuh
index 6533c604f32a478cdc6a097e4dd7d0b56e48150d..f2cf530b5d331c71d4a13bd5882a3657a3bbddea 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/PorousMediaKernels/FluidFlow/Compressible/CumulantOne/PMCumulantOneCompSP27_Device.cuh
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/PorousMediaKernels/FluidFlow/Compressible/CumulantOne/PMCumulantOneCompSP27_Device.cuh
@@ -9,7 +9,7 @@ __global__ void LB_Kernel_PM_Cum_One_Comp_SP_27(real omega,
 	unsigned int* neighborY,
 	unsigned int* neighborZ,
 	real* DDStart,
-	int size_Mat,
+	unsigned long long numberOfLBnodes,
 	int level,
 	real* forces,
 	real porosity,
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/TurbulentViscosityKernels/FluidFlow/Compressible/CumulantK17chim/TurbulentViscosityCumulantK17CompChim.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/TurbulentViscosityKernels/FluidFlow/Compressible/CumulantK17chim/TurbulentViscosityCumulantK17CompChim.cu
deleted file mode 100644
index a9d518d14a286ae3f6b565176969162994afa269..0000000000000000000000000000000000000000
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/TurbulentViscosityKernels/FluidFlow/Compressible/CumulantK17chim/TurbulentViscosityCumulantK17CompChim.cu
+++ /dev/null
@@ -1,57 +0,0 @@
-#include "TurbulentViscosityCumulantK17CompChim.h"
-#include "cuda/CudaGrid.h"
-#include <logger/Logger.h>
-#include "Parameter/Parameter.h"
-#include "TurbulentViscosityCumulantK17CompChim_Device.cuh"
-
-template<TurbulenceModel turbulenceModel> 
-std::shared_ptr< TurbulentViscosityCumulantK17CompChim<turbulenceModel> > TurbulentViscosityCumulantK17CompChim<turbulenceModel>::getNewInstance(std::shared_ptr<Parameter> para, int level)
-{
-	return std::shared_ptr<TurbulentViscosityCumulantK17CompChim<turbulenceModel> >(new TurbulentViscosityCumulantK17CompChim<turbulenceModel>(para,level));
-}
-
-template<TurbulenceModel turbulenceModel>
-void TurbulentViscosityCumulantK17CompChim<turbulenceModel>::run()
-{
-	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(para->getParH(level)->numberofthreads, para->getParH(level)->numberOfNodes);
-
-	LB_Kernel_TurbulentViscosityCumulantK17CompChim < turbulenceModel  > <<< grid.grid, grid.threads >>>(   para->getParD(level)->omega, 	
-																											para->getParD(level)->typeOfGridNode, 										para->getParD(level)->neighborX,	
-																											para->getParD(level)->neighborY,	
-																											para->getParD(level)->neighborZ,	
-																											para->getParD(level)->distributions.f[0],	
-																											para->getParD(level)->rho,		
-																											para->getParD(level)->velocityX,		
-																											para->getParD(level)->velocityY,	
-																											para->getParD(level)->velocityZ,	
-																											para->getParD(level)->turbViscosity,
-																											para->getSGSConstant(),
-																											(unsigned long)para->getParD(level)->numberOfNodes,	
-																											level,				
-																											para->getIsBodyForce(),				
-																											para->getForcesDev(),				
-																											para->getParD(level)->forceX_SP,	
-																											para->getParD(level)->forceY_SP,
-																											para->getParD(level)->forceZ_SP,
-																											para->getQuadricLimitersDev(),			
-																											para->getParD(level)->isEvenTimestep);
-
-	getLastCudaError("LB_Kernel_TurbulentViscosityCumulantK17CompChim execution failed");
-}
-
-template<TurbulenceModel turbulenceModel>
-TurbulentViscosityCumulantK17CompChim<turbulenceModel>::TurbulentViscosityCumulantK17CompChim(std::shared_ptr<Parameter> para, int level)
-{
-	this->para = para;
-	this->level = level;
-
-	myPreProcessorTypes.push_back(InitCompSP27);
-
-	myKernelGroup = BasicKernel;
-
-	VF_LOG_INFO("Using turbulence model: {}", turbulenceModel);
-}
-
-template class TurbulentViscosityCumulantK17CompChim<TurbulenceModel::AMD>;
-template class TurbulentViscosityCumulantK17CompChim<TurbulenceModel::Smagorinsky>;
-template class TurbulentViscosityCumulantK17CompChim<TurbulenceModel::QR>;
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/TurbulentViscosityKernels/FluidFlow/Compressible/CumulantK17chim/TurbulentViscosityCumulantK17CompChim.h b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/TurbulentViscosityKernels/FluidFlow/Compressible/CumulantK17chim/TurbulentViscosityCumulantK17CompChim.h
deleted file mode 100644
index 0d35b68c916e54c6ec6eeeacd7189fe4d9a33c10..0000000000000000000000000000000000000000
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/TurbulentViscosityKernels/FluidFlow/Compressible/CumulantK17chim/TurbulentViscosityCumulantK17CompChim.h
+++ /dev/null
@@ -1,19 +0,0 @@
-#ifndef TurbulentViscosityCUMULANT_K17_COMP_CHIM_H
-#define TurbulentViscosityCUMULANT_K17_COMP_CHIM_H
-
-#include "Kernel/KernelImp.h"
-#include "Parameter/Parameter.h"
-
-template<TurbulenceModel turbulenceModel> 
-class TurbulentViscosityCumulantK17CompChim : public KernelImp
-{
-public:
-	static std::shared_ptr< TurbulentViscosityCumulantK17CompChim<turbulenceModel> > getNewInstance(std::shared_ptr< Parameter> para, int level);
-	void run();
-
-private:
-    TurbulentViscosityCumulantK17CompChim();
-    TurbulentViscosityCumulantK17CompChim(std::shared_ptr<Parameter> para, int level);
-};
-
-#endif 
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/TurbulentViscosityKernels/FluidFlow/Compressible/CumulantK17chim/TurbulentViscosityCumulantK17CompChim_Device.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/TurbulentViscosityKernels/FluidFlow/Compressible/CumulantK17chim/TurbulentViscosityCumulantK17CompChim_Device.cu
deleted file mode 100644
index 32350b95107b68103af0f238fefe095882919092..0000000000000000000000000000000000000000
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/TurbulentViscosityKernels/FluidFlow/Compressible/CumulantK17chim/TurbulentViscosityCumulantK17CompChim_Device.cu
+++ /dev/null
@@ -1,687 +0,0 @@
-//=======================================================================================
-// ____          ____    __    ______     __________   __      __       __        __         
-// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |        
-//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |        
-//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |        
-//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____    
-//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|   
-//      \    \  |    |   ________________________________________________________________    
-//       \    \ |    |  |  ______________________________________________________________|   
-//        \    \|    |  |  |         __          __     __     __     ______      _______    
-//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)   
-//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______    
-//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
-//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/   
-//
-//  This file is part of VirtualFluids. VirtualFluids is free software: you can 
-//  redistribute it and/or modify it under the terms of the GNU General Public
-//  License as published by the Free Software Foundation, either version 3 of 
-//  the License, or (at your option) any later version.
-//  
-//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT 
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
-//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
-//  for more details.
-//  
-//  You should have received a copy of the GNU General Public License along
-//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
-//
-//! \file TurbulentViscosityCumulantK17CompChim_Device.cu
-//! \author Henry Korb, Henrik Asmuth
-//! \date 16/05/2022
-//! \brief CumulantK17CompChim kernel by Martin Schönherr that inlcudes turbulent viscosity and other small mods.
-//!
-//! Additions to CumulantK17CompChim:
-//!     - can incorporate local body force 
-//!     - when applying a local body force, the total round of error of forcing+bodyforce is saved and added in next time step
-//!     - uses turbulent viscosity that is computed in separate kernel (as of now AMD)
-//!     - saves macroscopic values (needed for instance for probes, AMD, and actuator models)
-//!
-//=======================================================================================
-/* Device code */
-#include "LBM/LB.h" 
-#include "lbm/constants/D3Q27.h"
-#include <lbm/constants/NumericConstants.h>
-#include "Kernel/Utilities/DistributionHelper.cuh"
-
-#include "GPU/TurbulentViscosityInlines.cuh"
-
-using namespace vf::lbm::constant;
-using namespace vf::lbm::dir;
-#include "Kernel/Utilities/ChimeraTransformation.h"
-
-
-////////////////////////////////////////////////////////////////////////////////
-template<TurbulenceModel turbulenceModel>
-__global__ void LB_Kernel_TurbulentViscosityCumulantK17CompChim(
-	real omega_in,
-	uint* typeOfGridNode,
-	uint* neighborX,
-	uint* neighborY,
-	uint* neighborZ,
-	real* distributions,
-    real* rho,
-    real* vx,
-    real* vy,
-    real* vz,
-    real* turbulentViscosity,
-    real SGSconstant,
-	unsigned long size_Mat,
-	int level,
-    bool bodyForce,
-	real* forces,
-    real* bodyForceX,
-    real* bodyForceY,
-    real* bodyForceZ,
-	real* quadricLimiters,
-	bool isEvenTimestep)
-{
-    //////////////////////////////////////////////////////////////////////////
-    //! Cumulant K17 Kernel is based on \ref
-    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040
-    //! ]</b></a> and \ref <a href="https://doi.org/10.1016/j.jcp.2017.07.004"><b>[ M. Geier et al. (2017),
-    //! DOI:10.1016/j.jcp.2017.07.004 ]</b></a>
-    //!
-    //! The cumulant kernel is executed in the following steps
-    //!
-    ////////////////////////////////////////////////////////////////////////////////
-    //! - Get node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
-    //!
-    const unsigned k_000 = vf::gpu::getNodeIndex();
-
-    //////////////////////////////////////////////////////////////////////////
-    // run for all indices in size_Mat and fluid nodes
-    if ((k_000 < size_Mat) && (typeOfGridNode[k_000] == GEO_FLUID)) {
-        //////////////////////////////////////////////////////////////////////////
-        //! - Read distributions: style of reading and writing the distributions from/to stored arrays dependent on
-        //! timestep is based on the esoteric twist algorithm \ref <a
-        //! href="https://doi.org/10.3390/computation5020019"><b>[ M. Geier et al. (2017),
-        //! DOI:10.3390/computation5020019 ]</b></a>
-        //!
-        Distributions27 dist = vf::gpu::getDistributionReferences27(distributions, size_Mat, isEvenTimestep);
-
-        ////////////////////////////////////////////////////////////////////////////////
-        //! - Set neighbor indices (necessary for indirect addressing)
-        uint k_M00 = neighborX[k_000];
-        uint k_0M0 = neighborY[k_000];
-        uint k_00M = neighborZ[k_000];
-        uint k_MM0 = neighborY[k_M00];
-        uint k_M0M = neighborZ[k_M00];
-        uint k_0MM = neighborZ[k_0M0];
-        uint k_MMM = neighborZ[k_MM0];
-        ////////////////////////////////////////////////////////////////////////////////////
-        //! - Set local distributions
-        //!
-        real f_000 = (dist.f[DIR_000])[k_000];
-        real f_P00 = (dist.f[DIR_P00])[k_000];
-        real f_M00 = (dist.f[DIR_M00])[k_M00];
-        real f_0P0 = (dist.f[DIR_0P0])[k_000];
-        real f_0M0 = (dist.f[DIR_0M0])[k_0M0];
-        real f_00P = (dist.f[DIR_00P])[k_000];
-        real f_00M = (dist.f[DIR_00M])[k_00M];
-        real f_PP0 = (dist.f[DIR_PP0])[k_000];
-        real f_MM0 = (dist.f[DIR_MM0])[k_MM0];
-        real f_PM0 = (dist.f[DIR_PM0])[k_0M0];
-        real f_MP0 = (dist.f[DIR_MP0])[k_M00];
-        real f_P0P = (dist.f[DIR_P0P])[k_000];
-        real f_M0M = (dist.f[DIR_M0M])[k_M0M];
-        real f_P0M = (dist.f[DIR_P0M])[k_00M];
-        real f_M0P = (dist.f[DIR_M0P])[k_M00];
-        real f_0PP = (dist.f[DIR_0PP])[k_000];
-        real f_0MM = (dist.f[DIR_0MM])[k_0MM];
-        real f_0PM = (dist.f[DIR_0PM])[k_00M];
-        real f_0MP = (dist.f[DIR_0MP])[k_0M0];
-        real f_PPP = (dist.f[DIR_PPP])[k_000];
-        real f_MPP = (dist.f[DIR_MPP])[k_M00];
-        real f_PMP = (dist.f[DIR_PMP])[k_0M0];
-        real f_MMP = (dist.f[DIR_MMP])[k_MM0];
-        real f_PPM = (dist.f[DIR_PPM])[k_00M];
-        real f_MPM = (dist.f[DIR_MPM])[k_M0M];
-        real f_PMM = (dist.f[DIR_PMM])[k_0MM];
-        real f_MMM = (dist.f[DIR_MMM])[k_MMM];
-
-        ////////////////////////////////////////////////////////////////////////////////////
-        //! - Define aliases to use the same variable for the moments (m's):
-        //!
-        real& m_111 = f_000;
-        real& m_211 = f_P00;
-        real& m_011 = f_M00;
-        real& m_121 = f_0P0;
-        real& m_101 = f_0M0;
-        real& m_112 = f_00P;
-        real& m_110 = f_00M;
-        real& m_221 = f_PP0;
-        real& m_001 = f_MM0;
-        real& m_201 = f_PM0;
-        real& m_021 = f_MP0;
-        real& m_212 = f_P0P;
-        real& m_010 = f_M0M;
-        real& m_210 = f_P0M;
-        real& m_012 = f_M0P;
-        real& m_122 = f_0PP;
-        real& m_100 = f_0MM;
-        real& m_120 = f_0PM;
-        real& m_102 = f_0MP;
-        real& m_222 = f_PPP;
-        real& m_022 = f_MPP;
-        real& m_202 = f_PMP;
-        real& m_002 = f_MMP;
-        real& m_220 = f_PPM;
-        real& m_020 = f_MPM;
-        real& m_200 = f_PMM;
-        real& m_000 = f_MMM;
-
-        //////////////////////////////////////////////////////(unsigned long)//////////////////////////////
-        //! - Calculate density and velocity using pyramid summation for low round-off errors as in Eq. (J1)-(J3) \ref
-        //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015),
-        //! DOI:10.1016/j.camwa.2015.05.001 ]</b></a>
-        //!
-        real drho = ((((f_PPP + f_MMM) + (f_MPM + f_PMP)) + ((f_MPP + f_PMM) + (f_MMP + f_PPM))) +
-                    (((f_0MP + f_0PM) + (f_0MM + f_0PP)) + ((f_M0P + f_P0M) + (f_M0M + f_P0P)) +
-                    ((f_MP0 + f_PM0) + (f_MM0 + f_PP0))) +
-                    ((f_M00 + f_P00) + (f_0M0 + f_0P0) + (f_00M + f_00P))) +
-                        f_000;
-
-        real oneOverRho = c1o1 / (c1o1 + drho);
-
-        real vvx = ((((f_PPP - f_MMM) + (f_PMP - f_MPM)) + ((f_PMM - f_MPP) + (f_PPM - f_MMP))) +
-                    (((f_P0M - f_M0P) + (f_P0P - f_M0M)) + ((f_PM0 - f_MP0) + (f_PP0 - f_MM0))) + (f_P00 - f_M00)) *
-                oneOverRho;
-        real vvy = ((((f_PPP - f_MMM) + (f_MPM - f_PMP)) + ((f_MPP - f_PMM) + (f_PPM - f_MMP))) +
-                    (((f_0PM - f_0MP) + (f_0PP - f_0MM)) + ((f_MP0 - f_PM0) + (f_PP0 - f_MM0))) + (f_0P0 - f_0M0)) *
-                oneOverRho;
-        real vvz = ((((f_PPP - f_MMM) + (f_PMP - f_MPM)) + ((f_MPP - f_PMM) + (f_MMP - f_PPM))) +
-                    (((f_0MP - f_0PM) + (f_0PP - f_0MM)) + ((f_M0P - f_P0M) + (f_P0P - f_M0M))) + (f_00P - f_00M)) *
-                oneOverRho;
-        
-        ////////////////////////////////////////////////////////////////////////////////////
-        //! - Add half of the acceleration (body force) to the velocity as in Eq. (42) \ref
-        //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015),
-        //! DOI:10.1016/j.camwa.2015.05.001 ]</b></a>
-        //!
-        real factor = c1o1;
-        for (size_t i = 1; i <= level; i++) {
-            factor *= c2o1;
-        }
-        
-        real fx = forces[0];
-        real fy = forces[1];
-        real fz = forces[2];
-
-        if( bodyForce ){
-            fx += bodyForceX[k_000]; 
-            fy += bodyForceY[k_000];
-            fz += bodyForceZ[k_000];
-
-            real vx = vvx;
-            real vy = vvy;
-            real vz = vvz;
-            real acc_x = fx * c1o2 / factor;
-            real acc_y = fy * c1o2 / factor;
-            real acc_z = fz * c1o2 / factor;
-
-            vvx += acc_x;
-            vvy += acc_y;
-            vvz += acc_z;
-            
-        //    // Reset body force. To be used when not using round-off correction.
-        // bodyForceX[k] = 0.0f;
-        // bodyForceY[k] = 0.0f;
-        // bodyForceZ[k] = 0.0f;
-
-            ////////////////////////////////////////////////////////////////////////////////////
-            //!> Round-off correction
-            //!
-            //!> Similar to Kahan summation algorithm (https://en.wikipedia.org/wiki/Kahan_summation_algorithm)
-            //!> Essentially computes the round-off error of the applied force and adds it in the next time step as a compensation.
-            //!> Seems to be necesseary at very high Re boundary layers, where the forcing and velocity can  
-            //!> differ by several orders of magnitude.
-            //!> \note 16/05/2022: Testing, still ongoing! 
-            //!
-            bodyForceX[k_000] = (acc_x-(vvx-vx))*factor*c2o1;
-            bodyForceY[k_000] = (acc_y-(vvy-vy))*factor*c2o1;
-            bodyForceZ[k_000] = (acc_z-(vvz-vz))*factor*c2o1;
-        }
-        else{
-            vvx += fx * c1o2 / factor;
-            vvy += fy * c1o2 / factor;
-            vvz += fz * c1o2 / factor;
-        }
-        
-
-        ////////////////////////////////////////////////////////////////////////////////////
-        // calculate the square of velocities for this lattice node
-        real vx2 = vvx * vvx;
-        real vy2 = vvy * vvy;
-        real vz2 = vvz * vvz;
-        ////////////////////////////////////////////////////////////////////////////////////
-        //! - Set relaxation limiters for third order cumulants to default value \f$ \lambda=0.001 \f$ according to
-        //! section 6 in \ref <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017),
-        //! DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
-        //!
-        real quadricLimitP = quadricLimiters[0];
-        real quadricLimitM = quadricLimiters[1];
-        real quadricLimitD = quadricLimiters[2];
-        ////////////////////////////////////////////////////////////////////////////////////
-        //! - Chimera transform from well conditioned distributions to central moments as defined in Appendix J in \ref
-        //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015),
-        //! DOI:10.1016/j.camwa.2015.05.001 ]</b></a> see also Eq. (6)-(14) in \ref <a
-        //! href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040
-        //! ]</b></a>
-        //!
-        ////////////////////////////////////////////////////////////////////////////////////
-        // Z - Dir
-        forwardInverseChimeraWithK(f_MMM, f_MM0, f_MMP, vvz, vz2, c36o1, c1o36);
-        forwardInverseChimeraWithK(f_M0M, f_M00, f_M0P, vvz, vz2, c9o1,  c1o9);
-        forwardInverseChimeraWithK(f_MPM, f_MP0, f_MPP, vvz, vz2, c36o1, c1o36);
-        forwardInverseChimeraWithK(f_0MM, f_0M0, f_0MP, vvz, vz2, c9o1,  c1o9);
-        forwardInverseChimeraWithK(f_00M, f_000, f_00P, vvz, vz2, c9o4,  c4o9);
-        forwardInverseChimeraWithK(f_0PM, f_0P0, f_0PP, vvz, vz2, c9o1,  c1o9);
-        forwardInverseChimeraWithK(f_PMM, f_PM0, f_PMP, vvz, vz2, c36o1, c1o36);
-        forwardInverseChimeraWithK(f_P0M, f_P00, f_P0P, vvz, vz2, c9o1,  c1o9);
-        forwardInverseChimeraWithK(f_PPM, f_PP0, f_PPP, vvz, vz2, c36o1, c1o36);
-
-        ////////////////////////////////////////////////////////////////////////////////////
-        // Y - Dir
-        forwardInverseChimeraWithK(f_MMM, f_M0M, f_MPM, vvy, vy2, c6o1,  c1o6);
-        forwardChimera(            f_MM0, f_M00, f_MP0, vvy, vy2);
-        forwardInverseChimeraWithK(f_MMP, f_M0P, f_MPP, vvy, vy2, c18o1, c1o18);
-        forwardInverseChimeraWithK(f_0MM, f_00M, f_0PM, vvy, vy2, c3o2,  c2o3);
-        forwardChimera(            f_0M0, f_000, f_0P0, vvy, vy2);
-        forwardInverseChimeraWithK(f_0MP, f_00P, f_0PP, vvy, vy2, c9o2,  c2o9);
-        forwardInverseChimeraWithK(f_PMM, f_P0M, f_PPM, vvy, vy2, c6o1,  c1o6);
-        forwardChimera(            f_PM0, f_P00, f_PP0, vvy, vy2);
-        forwardInverseChimeraWithK(f_PMP, f_P0P, f_PPP, vvy, vy2, c18o1, c1o18);
-
-        ////////////////////////////////////////////////////////////////////////////////////
-        // X - Dir
-        forwardInverseChimeraWithK(f_MMM, f_0MM, f_PMM, vvx, vx2, c1o1, c1o1);
-        forwardChimera(            f_M0M, f_00M, f_P0M, vvx, vx2);
-        forwardInverseChimeraWithK(f_MPM, f_0PM, f_PPM, vvx, vx2, c3o1, c1o3);
-        forwardChimera(            f_MM0, f_0M0, f_PM0, vvx, vx2);
-        forwardChimera(            f_M00, f_000, f_P00, vvx, vx2);
-        forwardChimera(            f_MP0, f_0P0, f_PP0, vvx, vx2);
-        forwardInverseChimeraWithK(f_MMP, f_0MP, f_PMP, vvx, vx2, c3o1, c1o3);
-        forwardChimera(            f_M0P, f_00P, f_P0P, vvx, vx2);
-        forwardInverseChimeraWithK(f_MPP, f_0PP, f_PPP, vvx, vx2, c3o1, c1o9);
-
-        ////////////////////////////////////////////////////////////////////////////////////
-        //! - Setting relaxation rates for non-hydrodynamic cumulants (default values). Variable names and equations
-        //! according to <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017),
-        //! DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
-        //!  => [NAME IN PAPER]=[NAME IN CODE]=[DEFAULT VALUE].
-        //!  - Trace of second order cumulants \f$ C_{200}+C_{020}+C_{002} \f$ used to adjust bulk
-        //!  viscosity:\f$\omega_2=OxxPyyPzz=1.0 \f$.
-        //!  - Third order cumulants \f$ C_{120}+C_{102}, C_{210}+C_{012}, C_{201}+C_{021} \f$: \f$ \omega_3=OxyyPxzz
-        //!  \f$ set according to Eq. (111) with simplifications assuming \f$ \omega_2=1.0\f$.
-        //!  - Third order cumulants \f$ C_{120}-C_{102}, C_{210}-C_{012}, C_{201}-C_{021} \f$: \f$ \omega_4 = OxyyMxzz
-        //!  \f$ set according to Eq. (112) with simplifications assuming \f$ \omega_2 = 1.0\f$.
-        //!  - Third order cumulants \f$ C_{111} \f$: \f$ \omega_5 = Oxyz \f$ set according to Eq. (113) with
-        //!  simplifications assuming \f$ \omega_2 = 1.0\f$  (modify for different bulk viscosity).
-        //!  - Fourth order cumulants \f$ C_{220}, C_{202}, C_{022}, C_{211}, C_{121}, C_{112} \f$: for simplification
-        //!  all set to the same default value \f$ \omega_6=\omega_7=\omega_8=O4=1.0 \f$.
-        //!  - Fifth order cumulants \f$ C_{221}, C_{212}, C_{122}\f$: \f$\omega_9=O5=1.0\f$.
-        //!  - Sixth order cumulant \f$ C_{222}\f$: \f$\omega_{10}=O6=1.0\f$.
-        //!
-        ////////////////////////////////////////////////////////////////////////////////////
-        //! - Calculate modified omega with turbulent viscosity
-        //!
-        real omega = omega_in / (c1o1 + c3o1*omega_in*turbulentViscosity[k_000]);
-        ////////////////////////////////////////////////////////////
-        // 2.
-        real OxxPyyPzz = c1o1;
-        ////////////////////////////////////////////////////////////
-        // 3.
-        real OxyyPxzz = c8o1 * (-c2o1 + omega) * (c1o1 + c2o1 * omega) / (-c8o1 - c14o1 * omega + c7o1 * omega * omega);
-        real OxyyMxzz =
-            c8o1 * (-c2o1 + omega) * (-c7o1 + c4o1 * omega) / (c56o1 - c50o1 * omega + c9o1 * omega * omega);
-        real Oxyz = c24o1 * (-c2o1 + omega) * (-c2o1 - c7o1 * omega + c3o1 * omega * omega) /
-                    (c48o1 + c152o1 * omega - c130o1 * omega * omega + c29o1 * omega * omega * omega);
-        ////////////////////////////////////////////////////////////
-        // 4.
-        real O4 = c1o1;
-        ////////////////////////////////////////////////////////////
-        // 5.
-        real O5 = c1o1;
-        ////////////////////////////////////////////////////////////
-        // 6.
-        real O6 = c1o1;
-
-        ////////////////////////////////////////////////////////////////////////////////////
-        //! - A and DIR_00M: parameters for fourth order convergence of the diffusion term according to Eq. (115) and (116)
-        //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017),
-        //! DOI:10.1016/j.jcp.2017.05.040 ]</b></a> with simplifications assuming \f$ \omega_2 = 1.0 \f$ (modify for
-        //! different bulk viscosity).
-        //!
-        real factorA = (c4o1 + c2o1 * omega - c3o1 * omega * omega) / (c2o1 - c7o1 * omega + c5o1 * omega * omega);
-        real factorB = (c4o1 + c28o1 * omega - c14o1 * omega * omega) / (c6o1 - c21o1 * omega + c15o1 * omega * omega);
-
-        ////////////////////////////////////////////////////////////////////////////////////
-        //! - Compute cumulants from central moments according to Eq. (20)-(23) in
-        //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017),
-        //! DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
-        //!
-        ////////////////////////////////////////////////////////////
-        // 4.
-        real c_211 = m_211 - ((m_200 + c1o3) * m_011 + c2o1 * m_110 * m_101) * oneOverRho;
-        real c_121 = m_121 - ((m_020 + c1o3) * m_101 + c2o1 * m_110 * m_011) * oneOverRho;
-        real c_112 = m_112 - ((m_002 + c1o3) * m_110 + c2o1 * m_101 * m_011) * oneOverRho;
-
-        real c_220 = m_220 - (((m_200 * m_020 + c2o1 * m_110 * m_110) + c1o3 * (m_200 + m_020)) * oneOverRho - c1o9 * (drho * oneOverRho));
-        real c_202 = m_202 - (((m_200 * m_002 + c2o1 * m_101 * m_101) + c1o3 * (m_200 + m_002)) * oneOverRho - c1o9 * (drho * oneOverRho));
-        real c_022 = m_022 - (((m_002 * m_020 + c2o1 * m_011 * m_011) + c1o3 * (m_002 + m_020)) * oneOverRho - c1o9 * (drho * oneOverRho));
-        ////////////////////////////////////////////////////////////
-        // 5.
-        real c_122 =
-            m_122 - ((m_002 * m_120 + m_020 * m_102 + c4o1 * m_011 * m_111 + c2o1 * (m_101 * m_021 + m_110 * m_012)) +
-                    c1o3 * (m_120 + m_102)) *
-                    oneOverRho;
-        real c_212 =
-            m_212 - ((m_002 * m_210 + m_200 * m_012 + c4o1 * m_101 * m_111 + c2o1 * (m_011 * m_201 + m_110 * m_102)) +
-                    c1o3 * (m_210 + m_012)) *
-                    oneOverRho;
-        real c_221 =
-            m_221 - ((m_200 * m_021 + m_020 * m_201 + c4o1 * m_110 * m_111 + c2o1 * (m_101 * m_120 + m_011 * m_210)) +
-                    c1o3 * (m_021 + m_201)) *
-                    oneOverRho;
-        ////////////////////////////////////////////////////////////
-        // 6.
-        real c_222 = m_222 + ((-c4o1 * m_111 * m_111 - (m_200 * m_022 + m_020 * m_202 + m_002 * m_220) -
-                                c4o1 * (m_011 * m_211 + m_101 * m_121 + m_110 * m_112) -
-                                c2o1 * (m_120 * m_102 + m_210 * m_012 + m_201 * m_021)) *
-                                oneOverRho +
-                            (c4o1 * (m_101 * m_101 * m_020 + m_011 * m_011 * m_200 + m_110 * m_110 * m_002) +
-                                c2o1 * (m_200 * m_020 * m_002) + c16o1 * m_110 * m_101 * m_011) *
-                                oneOverRho * oneOverRho -
-                                c1o3 * (m_022 + m_202 + m_220) * oneOverRho - c1o9 * (m_200 + m_020 + m_002) * oneOverRho +
-                            (c2o1 * (m_101 * m_101 + m_011 * m_011 + m_110 * m_110) +
-                                (m_002 * m_020 + m_002 * m_200 + m_020 * m_200) + c1o3 * (m_002 + m_020 + m_200)) *
-                                oneOverRho * oneOverRho * c2o3 +
-                                c1o27 * ((drho * drho - drho) * oneOverRho * oneOverRho));
-
-        ////////////////////////////////////////////////////////////////////////////////////
-        //! - Compute linear combinations of second and third order cumulants
-        //!
-        ////////////////////////////////////////////////////////////
-        // 2.
-        real mxxPyyPzz = m_200 + m_020 + m_002;
-        real mxxMyy    = m_200 - m_020;
-        real mxxMzz    = m_200 - m_002;
-        ////////////////////////////////////////////////////////////
-        // 3.
-        real mxxyPyzz = m_210 + m_012;
-        real mxxyMyzz = m_210 - m_012;
-
-        real mxxzPyyz = m_201 + m_021;
-        real mxxzMyyz = m_201 - m_021;
-
-        real mxyyPxzz = m_120 + m_102;
-        real mxyyMxzz = m_120 - m_102;
-
-        ////////////////////////////////////////////////////////////////////////////////////
-        // incl. correction
-        ////////////////////////////////////////////////////////////
-        //! - Compute velocity  gradients from second order cumulants according to Eq. (27)-(32)
-        //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017),
-        //! DOI:10.1016/j.jcp.2017.05.040 ]</b></a> Further explanations of the correction in viscosity in Appendix H of
-        //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015),
-        //! DOI:10.1016/j.camwa.2015.05.001 ]</b></a> Note that the division by rho is omitted here as we need rho times
-        //! the gradients later.
-        //!
-        real Dxy  = -c3o1 * omega * m_110;
-        real Dxz  = -c3o1 * omega * m_101;
-        real Dyz  = -c3o1 * omega * m_011;
-        real dxux = c1o2 * (-omega) * (mxxMyy + mxxMzz) + c1o2 * OxxPyyPzz * (m_000 - mxxPyyPzz);
-        real dyuy = dxux + omega * c3o2 * mxxMyy;
-        real dzuz = dxux + omega * c3o2 * mxxMzz;
-
-        ////////////////////////////////////////////////////////////////////////////////////
-        switch (turbulenceModel)
-        {
-        case TurbulenceModel::AMD:  //AMD is computed in separate kernel
-            break;
-        case TurbulenceModel::Smagorinsky:
-            turbulentViscosity[k_000] = calcTurbulentViscositySmagorinsky(SGSconstant, dxux, dyuy, dzuz, Dxy, Dxz , Dyz);
-            break;
-        case TurbulenceModel::QR:
-            turbulentViscosity[k_000] = calcTurbulentViscosityQR(SGSconstant, dxux, dyuy, dzuz, Dxy, Dxz , Dyz);
-            break;
-        default:
-            break;
-        }
-        ////////////////////////////////////////////////////////////
-        //! - Relaxation of second order cumulants with correction terms according to Eq. (33)-(35) in
-        //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017),
-        //! DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
-        //!
-        mxxPyyPzz += OxxPyyPzz * (m_000 - mxxPyyPzz) - c3o1 * (c1o1 - c1o2 * OxxPyyPzz) * (vx2 * dxux + vy2 * dyuy + vz2 * dzuz);
-        mxxMyy += omega * (-mxxMyy) - c3o1 * (c1o1 + c1o2 * (-omega)) * (vx2 * dxux - vy2 * dyuy);
-        mxxMzz += omega * (-mxxMzz) - c3o1 * (c1o1 + c1o2 * (-omega)) * (vx2 * dxux - vz2 * dzuz);
-
-        ////////////////////////////////////////////////////////////////////////////////////
-        ////no correction
-        // mxxPyyPzz += OxxPyyPzz*(mfaaa - mxxPyyPzz);
-        // mxxMyy += -(-omega) * (-mxxMyy);
-        // mxxMzz += -(-omega) * (-mxxMzz);
-        //////////////////////////////////////////////////////////////////////////
-        m_011 += omega * (-m_011);
-        m_101 += omega * (-m_101);
-        m_110 += omega * (-m_110);
-
-        ////////////////////////////////////////////////////////////////////////////////////
-        // relax
-        //////////////////////////////////////////////////////////////////////////
-        // incl. limiter
-        //! - Relaxation of third order cumulants including limiter according to Eq. (116)-(123)
-        //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017),
-        //! DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
-        //!
-        real wadjust = Oxyz + (c1o1 - Oxyz) * abs(m_111) / (abs(m_111) + quadricLimitD);
-        m_111 += wadjust * (-m_111);
-        wadjust = OxyyPxzz + (c1o1 - OxyyPxzz) * abs(mxxyPyzz) / (abs(mxxyPyzz) + quadricLimitP);
-        mxxyPyzz += wadjust * (-mxxyPyzz);
-        wadjust = OxyyMxzz + (c1o1 - OxyyMxzz) * abs(mxxyMyzz) / (abs(mxxyMyzz) + quadricLimitM);
-        mxxyMyzz += wadjust * (-mxxyMyzz);
-        wadjust = OxyyPxzz + (c1o1 - OxyyPxzz) * abs(mxxzPyyz) / (abs(mxxzPyyz) + quadricLimitP);
-        mxxzPyyz += wadjust * (-mxxzPyyz);
-        wadjust = OxyyMxzz + (c1o1 - OxyyMxzz) * abs(mxxzMyyz) / (abs(mxxzMyyz) + quadricLimitM);
-        mxxzMyyz += wadjust * (-mxxzMyyz);
-        wadjust = OxyyPxzz + (c1o1 - OxyyPxzz) * abs(mxyyPxzz) / (abs(mxyyPxzz) + quadricLimitP);
-        mxyyPxzz += wadjust * (-mxyyPxzz);
-        wadjust = OxyyMxzz + (c1o1 - OxyyMxzz) * abs(mxyyMxzz) / (abs(mxyyMxzz) + quadricLimitM);
-        mxyyMxzz += wadjust * (-mxyyMxzz);
-        //////////////////////////////////////////////////////////////////////////
-        // no limiter
-        // mfbbb += OxyyMxzz * (-mfbbb);
-        // mxxyPyzz += OxyyPxzz * (-mxxyPyzz);
-        // mxxyMyzz += OxyyMxzz * (-mxxyMyzz);
-        // mxxzPyyz += OxyyPxzz * (-mxxzPyyz);
-        // mxxzMyyz += OxyyMxzz * (-mxxzMyyz);
-        // mxyyPxzz += OxyyPxzz * (-mxyyPxzz);
-        // mxyyMxzz += OxyyMxzz * (-mxyyMxzz);
-
-        ////////////////////////////////////////////////////////////////////////////////////
-        //! - Compute inverse linear combinations of second and third order cumulants
-        //!
-        m_200 = c1o3 * (mxxMyy + mxxMzz + mxxPyyPzz);
-        m_020 = c1o3 * (-c2o1 * mxxMyy + mxxMzz + mxxPyyPzz);
-        m_002 = c1o3 * (mxxMyy - c2o1 * mxxMzz + mxxPyyPzz);
-
-        m_210 = ( mxxyMyzz + mxxyPyzz) * c1o2;
-        m_012 = (-mxxyMyzz + mxxyPyzz) * c1o2;
-        m_201 = ( mxxzMyyz + mxxzPyyz) * c1o2;
-        m_021 = (-mxxzMyyz + mxxzPyyz) * c1o2;
-        m_120 = ( mxyyMxzz + mxyyPxzz) * c1o2;
-        m_102 = (-mxyyMxzz + mxyyPxzz) * c1o2;
-        //////////////////////////////////////////////////////////////////////////
-
-        //////////////////////////////////////////////////////////////////////////
-        // 4.
-        // no limiter
-        //! - Relax fourth order cumulants to modified equilibrium for fourth order convergence of diffusion according
-        //! to Eq. (43)-(48) <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017),
-        //! DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
-        //!
-        c_022 = -O4 * (c1o1 / omega - c1o2) * (dyuy + dzuz) * c2o3 * factorA + (c1o1 - O4) * (c_022);
-        c_202 = -O4 * (c1o1 / omega - c1o2) * (dxux + dzuz) * c2o3 * factorA + (c1o1 - O4) * (c_202);
-        c_220 = -O4 * (c1o1 / omega - c1o2) * (dyuy + dxux) * c2o3 * factorA + (c1o1 - O4) * (c_220);
-        c_112 = -O4 * (c1o1 / omega - c1o2) * Dxy           * c1o3 * factorB + (c1o1 - O4) * (c_112);
-        c_121 = -O4 * (c1o1 / omega - c1o2) * Dxz           * c1o3 * factorB + (c1o1 - O4) * (c_121);
-        c_211 = -O4 * (c1o1 / omega - c1o2) * Dyz           * c1o3 * factorB + (c1o1 - O4) * (c_211);
-
-
-        //////////////////////////////////////////////////////////////////////////
-        // 5.
-        c_122 += O5 * (-c_122);
-        c_212 += O5 * (-c_212);
-        c_221 += O5 * (-c_221);
-
-        //////////////////////////////////////////////////////////////////////////
-        // 6.
-        c_222 += O6 * (-c_222);
-
-        ////////////////////////////////////////////////////////////////////////////////////
-        //! - Compute central moments from post collision cumulants according to Eq. (53)-(56) in
-        //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017),
-        //! DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
-        //!
-
-        //////////////////////////////////////////////////////////////////////////
-        // 4.
-        m_211 = c_211 + c1o3 * ((c3o1 * m_200 + c1o1) * m_011 + c6o1 * m_110 * m_101) * oneOverRho;
-        m_121 = c_121 + c1o3 * ((c3o1 * m_020 + c1o1) * m_101 + c6o1 * m_110 * m_011) * oneOverRho;
-        m_112 = c_112 + c1o3 * ((c3o1 * m_002 + c1o1) * m_110 + c6o1 * m_101 * m_011) * oneOverRho;
-
-        m_220 =
-            c_220 + (((m_200 * m_020 + c2o1 * m_110 * m_110) * c9o1 + c3o1 * (m_200 + m_020)) * oneOverRho - (drho * oneOverRho)) * c1o9;
-        m_202 =
-            c_202 + (((m_200 * m_002 + c2o1 * m_101 * m_101) * c9o1 + c3o1 * (m_200 + m_002)) * oneOverRho - (drho * oneOverRho)) * c1o9;
-        m_022 =
-            c_022 + (((m_002 * m_020 + c2o1 * m_011 * m_011) * c9o1 + c3o1 * (m_002 + m_020)) * oneOverRho - (drho * oneOverRho)) * c1o9;
-
-        //////////////////////////////////////////////////////////////////////////
-        // 5.
-        m_122 = c_122 + c1o3 *
-                (c3o1 * (m_002 * m_120 + m_020 * m_102 + c4o1 * m_011 * m_111 + c2o1 * (m_101 * m_021 + m_110 * m_012)) +
-                (m_120 + m_102)) * oneOverRho;
-        m_212 = c_212 + c1o3 *
-                (c3o1 * (m_002 * m_210 + m_200 * m_012 + c4o1 * m_101 * m_111 + c2o1 * (m_011 * m_201 + m_110 * m_102)) +
-                (m_210 + m_012)) * oneOverRho;
-        m_221 = c_221 + c1o3 *
-                (c3o1 * (m_200 * m_021 + m_020 * m_201 + c4o1 * m_110 * m_111 + c2o1 * (m_101 * m_120 + m_011 * m_210)) +
-                (m_021 + m_201)) * oneOverRho;
-
-        //////////////////////////////////////////////////////////////////////////
-        // 6.
-        m_222 = c_222 - ((-c4o1 * m_111 * m_111 - (m_200 * m_022 + m_020 * m_202 + m_002 * m_220) -
-                        c4o1 * (m_011 * m_211 + m_101 * m_121 + m_110 * m_112) -
-                        c2o1 * (m_120 * m_102 + m_210 * m_012 + m_201 * m_021)) *
-                        oneOverRho +
-                        (c4o1 * (m_101 * m_101 * m_020 + m_011 * m_011 * m_200 + m_110 * m_110 * m_002) +
-                        c2o1 * (m_200 * m_020 * m_002) + c16o1 * m_110 * m_101 * m_011) *
-                        oneOverRho * oneOverRho -
-                        c1o3 * (m_022 + m_202 + m_220) * oneOverRho - c1o9 * (m_200 + m_020 + m_002) * oneOverRho +
-                        (c2o1 * (m_101 * m_101 + m_011 * m_011 + m_110 * m_110) +
-                        (m_002 * m_020 + m_002 * m_200 + m_020 * m_200) + c1o3 * (m_002 + m_020 + m_200)) *
-                        oneOverRho * oneOverRho * c2o3 +
-                        c1o27 * ((drho * drho - drho) * oneOverRho * oneOverRho));
-
-        ////////////////////////////////////////////////////////////////////////////////////
-        //! -  Add acceleration (body force) to first order cumulants according to Eq. (85)-(87) in
-        //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015),
-        //! DOI:10.1016/j.camwa.2015.05.001 ]</b></a>
-        //!
-        m_100 = -m_100;
-        m_010 = -m_010;
-        m_001 = -m_001;
-
-        //Write to array here to distribute read/write
-        rho[k_000] = drho;
-        vx[k_000] = vvx;
-        vy[k_000] = vvy;
-        vz[k_000] = vvz;
-
-        ////////////////////////////////////////////////////////////////////////////////////
-        //! - Chimera transform from central moments to well conditioned distributions as defined in Appendix J in
-        //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015),
-        //! DOI:10.1016/j.camwa.2015.05.001 ]</b></a> see also Eq. (88)-(96) in <a
-        //! href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040
-        //! ]</b></a>
-        //!
-        ////////////////////////////////////////////////////////////////////////////////////
-        // X - Dir
-        backwardInverseChimeraWithK(m_000, m_100, m_200, vvx, vx2, c1o1, c1o1);
-        backwardChimera(            m_010, m_110, m_210, vvx, vx2);
-        backwardInverseChimeraWithK(m_020, m_120, m_220, vvx, vx2, c3o1, c1o3);
-        backwardChimera(            m_001, m_101, m_201, vvx, vx2);
-        backwardChimera(            m_011, m_111, m_211, vvx, vx2);
-        backwardChimera(            m_021, m_121, m_221, vvx, vx2);
-        backwardInverseChimeraWithK(m_002, m_102, m_202, vvx, vx2, c3o1, c1o3);
-        backwardChimera(            m_012, m_112, m_212, vvx, vx2);
-        backwardInverseChimeraWithK(m_022, m_122, m_222, vvx, vx2, c9o1, c1o9);
-
-        ////////////////////////////////////////////////////////////////////////////////////
-        // Y - Dir
-        backwardInverseChimeraWithK(m_000, m_010, m_020, vvy, vy2, c6o1, c1o6);
-        backwardChimera(            m_001, m_011, m_021, vvy, vy2);
-        backwardInverseChimeraWithK(m_002, m_012, m_022, vvy, vy2, c18o1, c1o18);
-        backwardInverseChimeraWithK(m_100, m_110, m_120, vvy, vy2, c3o2, c2o3);
-        backwardChimera(            m_101, m_111, m_121, vvy, vy2);
-        backwardInverseChimeraWithK(m_102, m_112, m_122, vvy, vy2, c9o2, c2o9);
-        backwardInverseChimeraWithK(m_200, m_210, m_220, vvy, vy2, c6o1, c1o6);
-        backwardChimera(            m_201, m_211, m_221, vvy, vy2);
-        backwardInverseChimeraWithK(m_202, m_212, m_222, vvy, vy2, c18o1, c1o18);
-
-        ////////////////////////////////////////////////////////////////////////////////////
-        // Z - Dir
-        backwardInverseChimeraWithK(m_000, m_001, m_002, vvz, vz2, c36o1, c1o36);
-        backwardInverseChimeraWithK(m_010, m_011, m_012, vvz, vz2, c9o1, c1o9);
-        backwardInverseChimeraWithK(m_020, m_021, m_022, vvz, vz2, c36o1, c1o36);
-        backwardInverseChimeraWithK(m_100, m_101, m_102, vvz, vz2, c9o1, c1o9);
-        backwardInverseChimeraWithK(m_110, m_111, m_112, vvz, vz2, c9o4, c4o9);
-        backwardInverseChimeraWithK(m_120, m_121, m_122, vvz, vz2, c9o1, c1o9);
-        backwardInverseChimeraWithK(m_200, m_201, m_202, vvz, vz2, c36o1, c1o36);
-        backwardInverseChimeraWithK(m_210, m_211, m_212, vvz, vz2, c9o1, c1o9);
-        backwardInverseChimeraWithK(m_220, m_221, m_222, vvz, vz2, c36o1, c1o36);
-
-        ////////////////////////////////////////////////////////////////////////////////////
-        //! - Write distributions: style of reading and writing the distributions from/to
-        //! stored arrays dependent on timestep is based on the esoteric twist algorithm
-        //! <a href="https://doi.org/10.3390/computation5020019"><b>[ M. Geier et al. (2017),
-        //! DOI:10.3390/computation5020019 ]</b></a>
-        //!
-        (dist.f[DIR_P00])[k_000]    = f_M00;
-        (dist.f[DIR_M00])[k_M00]    = f_P00;
-        (dist.f[DIR_0P0])[k_000]    = f_0M0;
-        (dist.f[DIR_0M0])[k_0M0]    = f_0P0;
-        (dist.f[DIR_00P])[k_000]    = f_00M;
-        (dist.f[DIR_00M])[k_00M]    = f_00P;
-        (dist.f[DIR_PP0])[k_000]   = f_MM0;
-        (dist.f[DIR_MM0])[k_MM0]   = f_PP0;
-        (dist.f[DIR_PM0])[k_0M0]   = f_MP0;
-        (dist.f[DIR_MP0])[k_M00]   = f_PM0;
-        (dist.f[DIR_P0P])[k_000]   = f_M0M;
-        (dist.f[DIR_M0M])[k_M0M]   = f_P0P;
-        (dist.f[DIR_P0M])[k_00M]   = f_M0P;
-        (dist.f[DIR_M0P])[k_M00]   = f_P0M;
-        (dist.f[DIR_0PP])[k_000]   = f_0MM;
-        (dist.f[DIR_0MM])[k_0MM]   = f_0PP;
-        (dist.f[DIR_0PM])[k_00M]   = f_0MP;
-        (dist.f[DIR_0MP])[k_0M0]   = f_0PM;
-        (dist.f[DIR_000])[k_000] = f_000;
-        (dist.f[DIR_PPP])[k_000]  = f_MMM;
-        (dist.f[DIR_PMP])[k_0M0]  = f_MPM;
-        (dist.f[DIR_PPM])[k_00M]  = f_MMP;
-        (dist.f[DIR_PMM])[k_0MM]  = f_MPP;
-        (dist.f[DIR_MPP])[k_M00]  = f_PMM;
-        (dist.f[DIR_MMP])[k_MM0]  = f_PPM;
-        (dist.f[DIR_MPM])[k_M0M]  = f_PMP;
-        (dist.f[DIR_MMM])[k_MMM]  = f_PPP;
-    }
-}
-
-template __global__ void LB_Kernel_TurbulentViscosityCumulantK17CompChim < TurbulenceModel::AMD > ( real omega_in, uint* typeOfGridNode, uint* neighborX, uint* neighborY, uint* neighborZ, real* distributions, real* rho, real* vx, real* vy, real* vz, real* turbulentViscosity, real SGSconstant, unsigned long size_Mat, int level, bool bodyForce, real* forces, real* bodyForceX, real* bodyForceY, real* bodyForceZ, real* quadricLimiters, bool isEvenTimestep);
-
-template __global__ void LB_Kernel_TurbulentViscosityCumulantK17CompChim < TurbulenceModel::Smagorinsky > ( real omega_in, uint* typeOfGridNode, uint* neighborX, uint* neighborY, uint* neighborZ, real* distributions, real* rho, real* vx, real* vy, real* vz, real* turbulentViscosity, real SGSconstant, unsigned long size_Mat, int level, bool bodyForce, real* forces, real* bodyForceX, real* bodyForceY, real* bodyForceZ, real* quadricLimiters, bool isEvenTimestep);
-
-template __global__ void LB_Kernel_TurbulentViscosityCumulantK17CompChim < TurbulenceModel::QR > ( real omega_in, uint* typeOfGridNode, uint* neighborX, uint* neighborY, uint* neighborZ, real* distributions, real* rho, real* vx, real* vy, real* vz, real* turbulentViscosity, real SGSconstant, unsigned long size_Mat, int level, bool bodyForce, real* forces, real* bodyForceX, real* bodyForceY, real* bodyForceZ, real* quadricLimiters, bool isEvenTimestep);
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/TurbulentViscosityKernels/FluidFlow/Compressible/CumulantK17chim/TurbulentViscosityCumulantK17CompChim_Device.cuh b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/TurbulentViscosityKernels/FluidFlow/Compressible/CumulantK17chim/TurbulentViscosityCumulantK17CompChim_Device.cuh
deleted file mode 100644
index 5ef37557399f263d25edf03b02b00f6a03c6e1cb..0000000000000000000000000000000000000000
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/TurbulentViscosityKernels/FluidFlow/Compressible/CumulantK17chim/TurbulentViscosityCumulantK17CompChim_Device.cuh
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef LB_Kernel_TURBULENT_VISCOSITY_CUMULANT_K17_COMP_CHIM_H
-#define LB_Kernel_TURBULENT_VISCOSITY_CUMULANT_K17_COMP_CHIM_H
-
-#include <DataTypes.h>
-#include <curand.h>
-
-template< TurbulenceModel turbulenceModel > __global__ void LB_Kernel_TurbulentViscosityCumulantK17CompChim(
-	real omega_in,
-	uint* typeOfGridNode,
-	uint* neighborX,
-	uint* neighborY,
-	uint* neighborZ,
-	real* distributions,
-	real* rho,
-	real* vx,
-    real* vy,
-    real* vz,
-	real* turbulentViscosity,
-	real SGSconstant,
-	unsigned long size_Mat,
-	int level,
-	bool bodyForce,
-	real* forces,
-	real* bodyForceX,
-	real* bodyForceY,
-	real* bodyForceZ,
-	real* quadricLimiters,
-	bool isEvenTimestep);
-#endif
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/TurbulentViscosityKernels/FluidFlow/Compressible/TurbulentViscosityFluidFlowCompStrategy.cpp b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/TurbulentViscosityKernels/FluidFlow/Compressible/TurbulentViscosityFluidFlowCompStrategy.cpp
deleted file mode 100644
index f3615a89994f0ca1fafdc1eda905d3c3b615d478..0000000000000000000000000000000000000000
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/TurbulentViscosityKernels/FluidFlow/Compressible/TurbulentViscosityFluidFlowCompStrategy.cpp
+++ /dev/null
@@ -1,23 +0,0 @@
-#include "TurbulentViscosityFluidFlowCompStrategy.h"
-
-#include "Parameter/Parameter.h"
-
-std::shared_ptr<TurbulentViscosityFluidFlowCompStrategy> TurbulentViscosityFluidFlowCompStrategy::getInstance()
-{
-    static std::shared_ptr<TurbulentViscosityFluidFlowCompStrategy> uniqueInstance;
-	if (!uniqueInstance)
-        uniqueInstance = std::shared_ptr<TurbulentViscosityFluidFlowCompStrategy>(new TurbulentViscosityFluidFlowCompStrategy());
-	return uniqueInstance;
-}
-
-bool TurbulentViscosityFluidFlowCompStrategy::checkParameter(std::shared_ptr<Parameter> para)
-{
-	if (!para->getUseTurbulentViscosity())
-		return false;
-	else if (!para->getCompOn())
-		return false;
-	else
-		return true;
-}
-
-TurbulentViscosityFluidFlowCompStrategy::TurbulentViscosityFluidFlowCompStrategy() {}
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/TurbulentViscosityKernels/FluidFlow/Compressible/TurbulentViscosityFluidFlowCompStrategy.h b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/TurbulentViscosityKernels/FluidFlow/Compressible/TurbulentViscosityFluidFlowCompStrategy.h
deleted file mode 100644
index 95eff17777f7f0d1c3e05fe1b0d93892a88646a4..0000000000000000000000000000000000000000
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/TurbulentViscosityKernels/FluidFlow/Compressible/TurbulentViscosityFluidFlowCompStrategy.h
+++ /dev/null
@@ -1,18 +0,0 @@
-#ifndef AMD_FLUID_FLOW_COMP_STRATEGY_H
-#define AMD_FLUID_FLOW_COMP_STRATEGY_H
-
-#include "Kernel/Utilities/CheckParameterStrategy/CheckParameterStrategy.h"
-
-
-class TurbulentViscosityFluidFlowCompStrategy : public CheckParameterStrategy
-{
-public:
-    static std::shared_ptr<TurbulentViscosityFluidFlowCompStrategy> getInstance();
-
-	bool checkParameter(std::shared_ptr<Parameter> para);
-
-private:
-    TurbulentViscosityFluidFlowCompStrategy();
-
-};
-#endif 
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/WaleKernels/FluidFlow/Compressible/CumulantK15/WaleCumulantK15Comp.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/WaleKernels/FluidFlow/Compressible/CumulantK15/WaleCumulantK15Comp.cu
index cfcb70cd2bd6f3cc8ec4349650c44b7d3b0619fc..2b8a7d61e8966e2ed00022986311ae68ac0ca6d6 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/WaleKernels/FluidFlow/Compressible/CumulantK15/WaleCumulantK15Comp.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/WaleKernels/FluidFlow/Compressible/CumulantK15/WaleCumulantK15Comp.cu
@@ -10,7 +10,7 @@ std::shared_ptr<WaleCumulantK15Comp> WaleCumulantK15Comp::getNewInstance(std::sh
 
 void WaleCumulantK15Comp::run()
 {
-	int size_Mat = para->getParD(level)->numberOfNodes;
+	int size_Mat = (int)para->getParD(level)->numberOfNodes;
 	int numberOfThreads = para->getParD(level)->numberofthreads;
 
 	int Grid = (size_Mat / numberOfThreads) + 1;
@@ -28,22 +28,23 @@ void WaleCumulantK15Comp::run()
 	dim3 grid(Grid1, Grid2, 1);
 	dim3 threads(numberOfThreads, 1, 1);
 
-	LB_Kernel_WaleCumulantK15Comp << < grid, threads >> >(	para->getParD(level)->omega,
-																para->getParD(level)->typeOfGridNode,
-																para->getParD(level)->neighborX,
-																para->getParD(level)->neighborY,
-																para->getParD(level)->neighborZ,
-																para->getParD(level)->neighborInverse,
-																para->getParD(level)->velocityX,
-																para->getParD(level)->velocityY,
-																para->getParD(level)->velocityZ,
-																para->getParD(level)->distributions.f[0],
-																para->getParD(level)->turbViscosity,
-																para->getParD(level)->numberOfNodes,
-																level,
-																para->getTimestepOfCoarseLevel(),
-																para->getForcesDev(),
-																para->getParD(level)->isEvenTimestep);
+	LB_Kernel_WaleCumulantK15Comp <<< grid, threads >>>(
+		para->getParD(level)->omega,
+		para->getParD(level)->typeOfGridNode,
+		para->getParD(level)->neighborX,
+		para->getParD(level)->neighborY,
+		para->getParD(level)->neighborZ,
+		para->getParD(level)->neighborInverse,
+		para->getParD(level)->velocityX,
+		para->getParD(level)->velocityY,
+		para->getParD(level)->velocityZ,
+		para->getParD(level)->distributions.f[0],
+		para->getParD(level)->turbViscosity,
+		para->getParD(level)->numberOfNodes,
+		level,
+		para->getTimestepOfCoarseLevel(),
+		para->getForcesDev(),
+		para->getParD(level)->isEvenTimestep);
 	getLastCudaError("LB_Kernel_WaleCumulantK15Comp execution failed");
 }
 
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/WaleKernels/FluidFlow/Compressible/CumulantK15/WaleCumulantK15Comp_Device.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/WaleKernels/FluidFlow/Compressible/CumulantK15/WaleCumulantK15Comp_Device.cu
index 3da25060e6c82ea685a1659fecc8cf66eeaf44c4..a7018d1246c0832753df144ffbf2625b55f5508e 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/WaleKernels/FluidFlow/Compressible/CumulantK15/WaleCumulantK15Comp_Device.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/WaleKernels/FluidFlow/Compressible/CumulantK15/WaleCumulantK15Comp_Device.cu
@@ -46,63 +46,63 @@ __global__ void LB_Kernel_WaleCumulantK15Comp(
 			Distributions27 D;
 			if (EvenOrOdd==true)
 			{
-				D.f[DIR_P00   ] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_M00   ] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0P0   ] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0M0   ] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00P   ] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00M   ] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_PP0  ] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_MM0  ] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_PM0  ] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_MP0  ] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_P0P  ] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_M0M  ] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_P0M  ] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_M0P  ] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0PP  ] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0MM  ] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0PM  ] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0MP  ] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_PPP ] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_MMP ] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_PMP ] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_MPP ] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_PPM ] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_MMM ] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_PMM ] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_MPM ] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_MPM * size_Mat];
 			}
 			else
 			{
-				D.f[DIR_M00   ] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_P00   ] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0M0   ] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0P0   ] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00M   ] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00P   ] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_MM0  ] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_PP0  ] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_MP0  ] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_PM0  ] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_M0M  ] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_P0P  ] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_M0P  ] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_P0M  ] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0MM  ] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0PP  ] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0MP  ] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0PM  ] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_MMM ] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_PPM ] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_MPM ] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_PMM ] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_MMP ] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_PPP ] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_MPP ] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_PMP ] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_MPM * size_Mat];
 			}
 
 			////////////////////////////////////////////////////////////////////////////////
@@ -136,33 +136,33 @@ __global__ void LB_Kernel_WaleCumulantK15Comp(
 			unsigned int kbsw = neighborZ[ksw];
 
 			//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-			real mfcbb = (D.f[DIR_P00   ])[k  ];
-			real mfabb = (D.f[DIR_M00   ])[kw ];
-			real mfbcb = (D.f[DIR_0P0   ])[k  ];
-			real mfbab = (D.f[DIR_0M0   ])[ks ];
-			real mfbbc = (D.f[DIR_00P   ])[k  ];
-			real mfbba = (D.f[DIR_00M   ])[kb ];
-			real mfccb = (D.f[DIR_PP0  ])[k  ];
-			real mfaab = (D.f[DIR_MM0  ])[ksw];
-			real mfcab = (D.f[DIR_PM0  ])[ks ];
-			real mfacb = (D.f[DIR_MP0  ])[kw ];
-			real mfcbc = (D.f[DIR_P0P  ])[k  ];
-			real mfaba = (D.f[DIR_M0M  ])[kbw];
-			real mfcba = (D.f[DIR_P0M  ])[kb ];
-			real mfabc = (D.f[DIR_M0P  ])[kw ];
-			real mfbcc = (D.f[DIR_0PP  ])[k  ];
-			real mfbaa = (D.f[DIR_0MM  ])[kbs];
-			real mfbca = (D.f[DIR_0PM  ])[kb ];
-			real mfbac = (D.f[DIR_0MP  ])[ks ];
+			real mfcbb = (D.f[DIR_P00])[k  ];
+			real mfabb = (D.f[DIR_M00])[kw ];
+			real mfbcb = (D.f[DIR_0P0])[k  ];
+			real mfbab = (D.f[DIR_0M0])[ks ];
+			real mfbbc = (D.f[DIR_00P])[k  ];
+			real mfbba = (D.f[DIR_00M])[kb ];
+			real mfccb = (D.f[DIR_PP0])[k  ];
+			real mfaab = (D.f[DIR_MM0])[ksw];
+			real mfcab = (D.f[DIR_PM0])[ks ];
+			real mfacb = (D.f[DIR_MP0])[kw ];
+			real mfcbc = (D.f[DIR_P0P])[k  ];
+			real mfaba = (D.f[DIR_M0M])[kbw];
+			real mfcba = (D.f[DIR_P0M])[kb ];
+			real mfabc = (D.f[DIR_M0P])[kw ];
+			real mfbcc = (D.f[DIR_0PP])[k  ];
+			real mfbaa = (D.f[DIR_0MM])[kbs];
+			real mfbca = (D.f[DIR_0PM])[kb ];
+			real mfbac = (D.f[DIR_0MP])[ks ];
 			real mfbbb = (D.f[DIR_000])[k  ];
-			real mfccc = (D.f[DIR_PPP ])[k  ];
-			real mfaac = (D.f[DIR_MMP ])[ksw];
-			real mfcac = (D.f[DIR_PMP ])[ks ];
-			real mfacc = (D.f[DIR_MPP ])[kw ];
-			real mfcca = (D.f[DIR_PPM ])[kb ];
+			real mfccc = (D.f[DIR_PPP])[k  ];
+			real mfaac = (D.f[DIR_MMP])[ksw];
+			real mfcac = (D.f[DIR_PMP])[ks ];
+			real mfacc = (D.f[DIR_MPP])[kw ];
+			real mfcca = (D.f[DIR_PPM])[kb ];
 			real mfaaa = (D.f[DIR_MMM])[kbsw];
-			real mfcaa = (D.f[DIR_PMM ])[kbs];
-			real mfaca = (D.f[DIR_MPM ])[kbw];
+			real mfcaa = (D.f[DIR_PMM])[kbs];
+			real mfaca = (D.f[DIR_MPM])[kbw];
 			////////////////////////////////////////////////////////////////////////////////////
 			real drho = ((((mfccc+mfaaa) + (mfaca+mfcac)) + ((mfacc+mfcaa) + (mfaac+mfcca))) + 
 							(((mfbac+mfbca) + (mfbaa+mfbcc)) + ((mfabc+mfcba) + (mfaba+mfcbc)) + ((mfacb+mfcab) + (mfaab+mfccb))) +
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/WaleKernels/FluidFlow/Compressible/CumulantK15BySoniMalav/WaleBySoniMalavCumulantK15Comp.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/WaleKernels/FluidFlow/Compressible/CumulantK15BySoniMalav/WaleBySoniMalavCumulantK15Comp.cu
index 15b808279a4c9dc771531f118cb369b7c5380a84..49ee20b44f37b01cd9bc837024a47c1428c00a18 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/WaleKernels/FluidFlow/Compressible/CumulantK15BySoniMalav/WaleBySoniMalavCumulantK15Comp.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/WaleKernels/FluidFlow/Compressible/CumulantK15BySoniMalav/WaleBySoniMalavCumulantK15Comp.cu
@@ -10,7 +10,7 @@ std::shared_ptr<WaleBySoniMalavCumulantK15Comp> WaleBySoniMalavCumulantK15Comp::
 
 void WaleBySoniMalavCumulantK15Comp::run()
 {
-	int size_Mat = para->getParD(level)->numberOfNodes;
+	int size_Mat = (int)para->getParD(level)->numberOfNodes;
 	int numberOfThreads = para->getParD(level)->numberofthreads;
 
 	//int Grid = size_Array / numberOfThreads;
@@ -32,21 +32,22 @@ void WaleBySoniMalavCumulantK15Comp::run()
 	dim3 grid(Grid1, Grid2, 1);
 	dim3 threads(numberOfThreads, 1, 1);
 
-	LB_Kernel_WaleBySoniMalavCumulantK15Comp << < grid, threads >> >(	para->getParD(level)->omega,
-																			para->getParD(level)->typeOfGridNode,
-																			para->getParD(level)->neighborX,
-																			para->getParD(level)->neighborY,
-																			para->getParD(level)->neighborZ,
-																			para->getParD(level)->neighborInverse,
-																			para->getParD(level)->velocityX,
-																			para->getParD(level)->velocityY,
-																			para->getParD(level)->velocityZ,
-																			para->getParD(level)->distributions.f[0],
-																			para->getParD(level)->turbViscosity,
-																			para->getParD(level)->numberOfNodes,
-																			level,
-																			para->getForcesDev(),
-																			para->getParD(level)->isEvenTimestep);
+	LB_Kernel_WaleBySoniMalavCumulantK15Comp <<< grid, threads >>>(
+		para->getParD(level)->omega,
+		para->getParD(level)->typeOfGridNode,
+		para->getParD(level)->neighborX,
+		para->getParD(level)->neighborY,
+		para->getParD(level)->neighborZ,
+		para->getParD(level)->neighborInverse,
+		para->getParD(level)->velocityX,
+		para->getParD(level)->velocityY,
+		para->getParD(level)->velocityZ,
+		para->getParD(level)->distributions.f[0],
+		para->getParD(level)->turbViscosity,
+		para->getParD(level)->numberOfNodes,
+		level,
+		para->getForcesDev(),
+		para->getParD(level)->isEvenTimestep);
 	getLastCudaError("LB_Kernel_WaleBySoniMalavCumulantK15Comp execution failed");
 }
 
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/WaleKernels/FluidFlow/Compressible/CumulantK15BySoniMalav/WaleBySoniMalavCumulantK15Comp_Device.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/WaleKernels/FluidFlow/Compressible/CumulantK15BySoniMalav/WaleBySoniMalavCumulantK15Comp_Device.cu
index 511219c352c4d156428565f718191a70b9cc6c32..6258c72c36cafa27b06b2934db42a5813ed74f99 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/WaleKernels/FluidFlow/Compressible/CumulantK15BySoniMalav/WaleBySoniMalavCumulantK15Comp_Device.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/WaleKernels/FluidFlow/Compressible/CumulantK15BySoniMalav/WaleBySoniMalavCumulantK15Comp_Device.cu
@@ -45,63 +45,63 @@ __global__ void LB_Kernel_WaleBySoniMalavCumulantK15Comp(
 			Distributions27 D;
 			if (EvenOrOdd==true)
 			{
-				D.f[DIR_P00   ] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_M00   ] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0P0   ] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0M0   ] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00P   ] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00M   ] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_PP0  ] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_MM0  ] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_PM0  ] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_MP0  ] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_P0P  ] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_M0M  ] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_P0M  ] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_M0P  ] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0PP  ] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0MM  ] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0PM  ] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0MP  ] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_PPP ] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_MMP ] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_PMP ] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_MPP ] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_PPM ] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_MMM ] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_PMM ] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_MPM ] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_MPM * size_Mat];
 			}
 			else
 			{
-				D.f[DIR_M00   ] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_P00   ] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0M0   ] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0P0   ] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00M   ] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00P   ] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_MM0  ] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_PP0  ] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_MP0  ] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_PM0  ] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_M0M  ] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_P0P  ] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_M0P  ] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_P0M  ] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0MM  ] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0PP  ] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0MP  ] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0PM  ] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_MMM ] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_PPM ] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_MPM ] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_PMM ] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_MMP ] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_PPP ] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_MPP ] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_PMP ] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_P00 * size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_MPM * size_Mat];
 			}
 
 			////////////////////////////////////////////////////////////////////////////////
@@ -115,33 +115,33 @@ __global__ void LB_Kernel_WaleBySoniMalavCumulantK15Comp(
 			unsigned int kbsw = neighborZ[ksw];
 
 			//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-			real mfcbb = (D.f[DIR_P00   ])[k  ];
-			real mfabb = (D.f[DIR_M00   ])[kw ];
-			real mfbcb = (D.f[DIR_0P0   ])[k  ];
-			real mfbab = (D.f[DIR_0M0   ])[ks ];
-			real mfbbc = (D.f[DIR_00P   ])[k  ];
-			real mfbba = (D.f[DIR_00M   ])[kb ];
-			real mfccb = (D.f[DIR_PP0  ])[k  ];
-			real mfaab = (D.f[DIR_MM0  ])[ksw];
-			real mfcab = (D.f[DIR_PM0  ])[ks ];
-			real mfacb = (D.f[DIR_MP0  ])[kw ];
-			real mfcbc = (D.f[DIR_P0P  ])[k  ];
-			real mfaba = (D.f[DIR_M0M  ])[kbw];
-			real mfcba = (D.f[DIR_P0M  ])[kb ];
-			real mfabc = (D.f[DIR_M0P  ])[kw ];
-			real mfbcc = (D.f[DIR_0PP  ])[k  ];
-			real mfbaa = (D.f[DIR_0MM  ])[kbs];
-			real mfbca = (D.f[DIR_0PM  ])[kb ];
-			real mfbac = (D.f[DIR_0MP  ])[ks ];
+			real mfcbb = (D.f[DIR_P00])[k  ];
+			real mfabb = (D.f[DIR_M00])[kw ];
+			real mfbcb = (D.f[DIR_0P0])[k  ];
+			real mfbab = (D.f[DIR_0M0])[ks ];
+			real mfbbc = (D.f[DIR_00P])[k  ];
+			real mfbba = (D.f[DIR_00M])[kb ];
+			real mfccb = (D.f[DIR_PP0])[k  ];
+			real mfaab = (D.f[DIR_MM0])[ksw];
+			real mfcab = (D.f[DIR_PM0])[ks ];
+			real mfacb = (D.f[DIR_MP0])[kw ];
+			real mfcbc = (D.f[DIR_P0P])[k  ];
+			real mfaba = (D.f[DIR_M0M])[kbw];
+			real mfcba = (D.f[DIR_P0M])[kb ];
+			real mfabc = (D.f[DIR_M0P])[kw ];
+			real mfbcc = (D.f[DIR_0PP])[k  ];
+			real mfbaa = (D.f[DIR_0MM])[kbs];
+			real mfbca = (D.f[DIR_0PM])[kb ];
+			real mfbac = (D.f[DIR_0MP])[ks ];
 			real mfbbb = (D.f[DIR_000])[k  ];
-			real mfccc = (D.f[DIR_PPP ])[k  ];
-			real mfaac = (D.f[DIR_MMP ])[ksw];
-			real mfcac = (D.f[DIR_PMP ])[ks ];
-			real mfacc = (D.f[DIR_MPP ])[kw ];
-			real mfcca = (D.f[DIR_PPM ])[kb ];
+			real mfccc = (D.f[DIR_PPP])[k  ];
+			real mfaac = (D.f[DIR_MMP])[ksw];
+			real mfcac = (D.f[DIR_PMP])[ks ];
+			real mfacc = (D.f[DIR_MPP])[kw ];
+			real mfcca = (D.f[DIR_PPM])[kb ];
 			real mfaaa = (D.f[DIR_MMM])[kbsw];
-			real mfcaa = (D.f[DIR_PMM ])[kbs];
-			real mfaca = (D.f[DIR_MPM ])[kbw];
+			real mfcaa = (D.f[DIR_PMM])[kbs];
+			real mfaca = (D.f[DIR_MPM])[kbw];
 			////////////////////////////////////////////////////////////////////////////////////
 			real drho = ((((mfccc+mfaaa) + (mfaca+mfcac)) + ((mfacc+mfcaa) + (mfaac+mfcca))) + 
 							(((mfbac+mfbca) + (mfbaa+mfbcc)) + ((mfabc+mfcba) + (mfaba+mfcbc)) + ((mfacb+mfcab) + (mfaab+mfccb))) +
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/WaleKernels/FluidFlow/Compressible/CumulantK17/WaleCumulantK17Comp.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/WaleKernels/FluidFlow/Compressible/CumulantK17/WaleCumulantK17Comp.cu
index 5eeea51301c666cf17546c85a444413111bebf2c..c9c16e2d2d2259656248948f3f10977c8f18fd24 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/WaleKernels/FluidFlow/Compressible/CumulantK17/WaleCumulantK17Comp.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/WaleKernels/FluidFlow/Compressible/CumulantK17/WaleCumulantK17Comp.cu
@@ -10,7 +10,7 @@ std::shared_ptr<WaleCumulantK17Comp> WaleCumulantK17Comp::getNewInstance(std::sh
 
 void WaleCumulantK17Comp::run()
 {
-	int size_Mat = para->getParD(level)->numberOfNodes;
+	int size_Mat = (int)para->getParD(level)->numberOfNodes;
 	int numberOfThreads = para->getParD(level)->numberofthreads;
 
 	//int Grid = size_Array / numberOfThreads;
@@ -32,23 +32,24 @@ void WaleCumulantK17Comp::run()
 	dim3 grid(Grid1, Grid2, 1);
 	dim3 threads(numberOfThreads, 1, 1);
 
-	LB_Kernel_WaleCumulantK17Comp <<< grid, threads >>>(para->getParD(level)->omega,
-														para->getParD(level)->typeOfGridNode,
-														para->getParD(level)->neighborX,
-														para->getParD(level)->neighborY,
-														para->getParD(level)->neighborZ,
-														para->getParD(level)->neighborInverse,
-														para->getParD(level)->velocityX,
-														para->getParD(level)->velocityY,
-														para->getParD(level)->velocityZ,
-														para->getParD(level)->distributions.f[0],
-														para->getParD(level)->turbViscosity,
-														para->getParD(level)->numberOfNodes,
-														level,
-														para->getTimestepOfCoarseLevel(),
-														para->getForcesDev(),
-                                                        para->getQuadricLimitersDev(),
-														para->getParD(level)->isEvenTimestep);
+	LB_Kernel_WaleCumulantK17Comp <<< grid, threads >>>(
+		para->getParD(level)->omega,
+		para->getParD(level)->typeOfGridNode,
+		para->getParD(level)->neighborX,
+		para->getParD(level)->neighborY,
+		para->getParD(level)->neighborZ,
+		para->getParD(level)->neighborInverse,
+		para->getParD(level)->velocityX,
+		para->getParD(level)->velocityY,
+		para->getParD(level)->velocityZ,
+		para->getParD(level)->distributions.f[0],
+		para->getParD(level)->turbViscosity,
+		para->getParD(level)->numberOfNodes,
+		level,
+		para->getTimestepOfCoarseLevel(),
+		para->getForcesDev(),
+        para->getQuadricLimitersDev(),
+		para->getParD(level)->isEvenTimestep);
 	getLastCudaError("LB_Kernel_WaleCumulantK17Comp execution failed");
 }
 
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/WaleKernels/FluidFlow/Compressible/CumulantK17/WaleCumulantK17Comp_Device.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/WaleKernels/FluidFlow/Compressible/CumulantK17/WaleCumulantK17Comp_Device.cu
index 8aaa13ab1d868e15ea5707d1566ba653b44c645d..e3161e0d26efe8993bb4b6c34bda32bf15af5d3d 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/WaleKernels/FluidFlow/Compressible/CumulantK17/WaleCumulantK17Comp_Device.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/WaleKernels/FluidFlow/Compressible/CumulantK17/WaleCumulantK17Comp_Device.cu
@@ -47,63 +47,63 @@ __global__ void LB_Kernel_WaleCumulantK17Comp(
 			Distributions27 D;
 			if (EvenOrOdd==true)
 			{
-				D.f[DIR_P00   ] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_M00   ] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0P0   ] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0M0   ] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00P   ] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00M   ] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_PP0  ] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_MM0  ] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_PM0  ] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_MP0  ] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_P0P  ] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_M0M  ] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_P0M  ] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_M0P  ] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0PP  ] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0MM  ] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0PM  ] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0MP  ] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_PPP ] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_MMP ] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_PMP ] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_MPP ] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_PPM ] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_MMM ] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_PMM ] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_MPM ] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_P00 * size_Mat]; // EvenOrOdd == true: every direction DIR_X points at offset DIR_X * size_Mat in DDStart
+				D.f[DIR_M00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_MPM * size_Mat];
 			}
 			else
 			{
-				D.f[DIR_M00   ] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_P00   ] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0M0   ] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0P0   ] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00M   ] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00P   ] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_MM0  ] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_PP0  ] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_MP0  ] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_PM0  ] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_M0M  ] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_P0P  ] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_M0P  ] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_P0M  ] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0MM  ] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0PP  ] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0MP  ] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0PM  ] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_MMM ] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_PPM ] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_MPM ] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_PMM ] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_MMP ] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_PPP ] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_MPP ] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_PMP ] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_P00 * size_Mat]; // else branch: each direction points at the offset of the direction with P and M swapped
+				D.f[DIR_P00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat]; // DIR_000 keeps the same offset in both branches
+				D.f[DIR_MMM] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_MPM] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_PMM] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_MPM * size_Mat];
 			}
 
 			////////////////////////////////////////////////////////////////////////////////
@@ -137,33 +137,33 @@ __global__ void LB_Kernel_WaleCumulantK17Comp(
 			unsigned int kbsw = neighborZ[ksw];
 
 			//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-			real mfcbb = (D.f[DIR_P00   ])[k  ];
-			real mfabb = (D.f[DIR_M00   ])[kw ];
-			real mfbcb = (D.f[DIR_0P0   ])[k  ];
-			real mfbab = (D.f[DIR_0M0   ])[ks ];
-			real mfbbc = (D.f[DIR_00P   ])[k  ];
-			real mfbba = (D.f[DIR_00M   ])[kb ];
-			real mfccb = (D.f[DIR_PP0  ])[k  ];
-			real mfaab = (D.f[DIR_MM0  ])[ksw];
-			real mfcab = (D.f[DIR_PM0  ])[ks ];
-			real mfacb = (D.f[DIR_MP0  ])[kw ];
-			real mfcbc = (D.f[DIR_P0P  ])[k  ];
-			real mfaba = (D.f[DIR_M0M  ])[kbw];
-			real mfcba = (D.f[DIR_P0M  ])[kb ];
-			real mfabc = (D.f[DIR_M0P  ])[kw ];
-			real mfbcc = (D.f[DIR_0PP  ])[k  ];
-			real mfbaa = (D.f[DIR_0MM  ])[kbs];
-			real mfbca = (D.f[DIR_0PM  ])[kb ];
-			real mfbac = (D.f[DIR_0MP  ])[ks ];
+			real mfcbb = (D.f[DIR_P00])[k  ]; // directions without an M component are read at k; those with M components at kw/ks/kb or their combinations
+			real mfabb = (D.f[DIR_M00])[kw ];
+			real mfbcb = (D.f[DIR_0P0])[k  ];
+			real mfbab = (D.f[DIR_0M0])[ks ];
+			real mfbbc = (D.f[DIR_00P])[k  ];
+			real mfbba = (D.f[DIR_00M])[kb ];
+			real mfccb = (D.f[DIR_PP0])[k  ];
+			real mfaab = (D.f[DIR_MM0])[ksw];
+			real mfcab = (D.f[DIR_PM0])[ks ];
+			real mfacb = (D.f[DIR_MP0])[kw ];
+			real mfcbc = (D.f[DIR_P0P])[k  ];
+			real mfaba = (D.f[DIR_M0M])[kbw];
+			real mfcba = (D.f[DIR_P0M])[kb ];
+			real mfabc = (D.f[DIR_M0P])[kw ];
+			real mfbcc = (D.f[DIR_0PP])[k  ];
+			real mfbaa = (D.f[DIR_0MM])[kbs];
+			real mfbca = (D.f[DIR_0PM])[kb ];
+			real mfbac = (D.f[DIR_0MP])[ks ];
 			real mfbbb = (D.f[DIR_000])[k  ];
-			real mfccc = (D.f[DIR_PPP ])[k  ];
-			real mfaac = (D.f[DIR_MMP ])[ksw];
-			real mfcac = (D.f[DIR_PMP ])[ks ];
-			real mfacc = (D.f[DIR_MPP ])[kw ];
-			real mfcca = (D.f[DIR_PPM ])[kb ];
+			real mfccc = (D.f[DIR_PPP])[k  ];
+			real mfaac = (D.f[DIR_MMP])[ksw];
+			real mfcac = (D.f[DIR_PMP])[ks ];
+			real mfacc = (D.f[DIR_MPP])[kw ];
+			real mfcca = (D.f[DIR_PPM])[kb ];
 			real mfaaa = (D.f[DIR_MMM])[kbsw];
-			real mfcaa = (D.f[DIR_PMM ])[kbs];
-			real mfaca = (D.f[DIR_MPM ])[kbw];
+			real mfcaa = (D.f[DIR_PMM])[kbs];
+			real mfaca = (D.f[DIR_MPM])[kbw];
 			////////////////////////////////////////////////////////////////////////////////////
 			real drho = ((((mfccc+mfaaa) + (mfaca+mfcac)) + ((mfacc+mfcaa) + (mfaac+mfcca))) + 
 							(((mfbac+mfbca) + (mfbaa+mfbcc)) + ((mfabc+mfcba) + (mfaba+mfcbc)) + ((mfacb+mfcab) + (mfaab+mfccb))) +
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/WaleKernels/FluidFlow/Compressible/CumulantK17Debug/WaleCumulantK17DebugComp.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/WaleKernels/FluidFlow/Compressible/CumulantK17Debug/WaleCumulantK17DebugComp.cu
index 98dca58f522bf02ce66328819e42c717f0ceef28..b3cdd494c02c6649d60818b6b264b8db8b79d426 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/WaleKernels/FluidFlow/Compressible/CumulantK17Debug/WaleCumulantK17DebugComp.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/WaleKernels/FluidFlow/Compressible/CumulantK17Debug/WaleCumulantK17DebugComp.cu
@@ -10,7 +10,7 @@ std::shared_ptr<WaleCumulantK17DebugComp> WaleCumulantK17DebugComp::getNewInstan
 
 void WaleCumulantK17DebugComp::run()
 {
-	int size_Mat = para->getParD(level)->numberOfNodes;
+	int size_Mat = (int)para->getParD(level)->numberOfNodes; // NOTE(review): explicit cast to int; presumably numberOfNodes is a wider type - confirm very large grids cannot be truncated here
 	int numberOfThreads = para->getParD(level)->numberofthreads;
 
 	//int Grid = size_Array / numberOfThreads;
@@ -32,34 +32,34 @@ void WaleCumulantK17DebugComp::run()
 	dim3 grid(Grid1, Grid2, 1);
 	dim3 threads(numberOfThreads, 1, 1);
 
-	LB_Kernel_WaleCumulantK17DebugComp << < grid, threads >> >(
-																		para->getParD(level)->omega,
-																		para->getParD(level)->typeOfGridNode,
-																		para->getParD(level)->neighborX,
-																		para->getParD(level)->neighborY,
-																		para->getParD(level)->neighborZ,
-																		para->getParD(level)->neighborInverse,
-																		para->getParD(level)->velocityX,
-																		para->getParD(level)->velocityY,
-																		para->getParD(level)->velocityZ,
-																		para->getParD(level)->distributions.f[0],
-																		para->getParD(level)->turbViscosity,
-																		para->getParD(level)->gSij,
-																		para->getParD(level)->gSDij,
-																		para->getParD(level)->gDxvx,
-																		para->getParD(level)->gDyvx,
-																		para->getParD(level)->gDzvx,
-																		para->getParD(level)->gDxvy,
-																		para->getParD(level)->gDyvy,
-																		para->getParD(level)->gDzvy,
-																		para->getParD(level)->gDxvz,
-																		para->getParD(level)->gDyvz,
-																		para->getParD(level)->gDzvz,
-																		para->getParD(level)->numberOfNodes,
-																		level,
-																		para->getForcesDev(),
-                                                                        para->getQuadricLimitersDev(),
-																		para->getParD(level)->isEvenTimestep);
+	LB_Kernel_WaleCumulantK17DebugComp <<< grid, threads >>>( // CUDA launch: grid is (Grid1, Grid2, 1) blocks, threads is (numberOfThreads, 1, 1) per block
+		para->getParD(level)->omega,
+		para->getParD(level)->typeOfGridNode,
+		para->getParD(level)->neighborX,
+		para->getParD(level)->neighborY,
+		para->getParD(level)->neighborZ,
+		para->getParD(level)->neighborInverse,
+		para->getParD(level)->velocityX,
+		para->getParD(level)->velocityY,
+		para->getParD(level)->velocityZ,
+		para->getParD(level)->distributions.f[0], // only the f[0] pointer is passed; presumably the base the kernel indexes as DDStart - confirm against the kernel signature
+		para->getParD(level)->turbViscosity,
+		para->getParD(level)->gSij,
+		para->getParD(level)->gSDij,
+		para->getParD(level)->gDxvx,
+		para->getParD(level)->gDyvx,
+		para->getParD(level)->gDzvx,
+		para->getParD(level)->gDxvy,
+		para->getParD(level)->gDyvy,
+		para->getParD(level)->gDzvy,
+		para->getParD(level)->gDxvz,
+		para->getParD(level)->gDyvz,
+		para->getParD(level)->gDzvz,
+		para->getParD(level)->numberOfNodes, // passed uncast here, unlike the int size_Mat computed at the top of run()
+		level,
+		para->getForcesDev(),
+        para->getQuadricLimitersDev(),
+		para->getParD(level)->isEvenTimestep); // presumably received as the kernel's EvenOrOdd flag - confirm against the kernel signature
 	getLastCudaError("LB_Kernel_WaleCumulantK17DebugComp execution failed");
 }
 
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/WaleKernels/FluidFlow/Compressible/CumulantK17Debug/WaleCumulantK17DebugComp_Device.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/WaleKernels/FluidFlow/Compressible/CumulantK17Debug/WaleCumulantK17DebugComp_Device.cu
index a1feba477a6555ea728311a6e99d5302652813ff..63f4ecc8716fcd606fb6a75709408b0885d781e9 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/WaleKernels/FluidFlow/Compressible/CumulantK17Debug/WaleCumulantK17DebugComp_Device.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/WaleKernels/FluidFlow/Compressible/CumulantK17Debug/WaleCumulantK17DebugComp_Device.cu
@@ -57,63 +57,63 @@ __global__ void LB_Kernel_WaleCumulantK17DebugComp(
 			Distributions27 D;
 			if (EvenOrOdd == true)
 			{
-				D.f[DIR_P00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_M00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_PMM]= &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_MPM]= &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_P00] = &DDStart[DIR_P00 * size_Mat]; // EvenOrOdd == true: every direction DIR_X points at offset DIR_X * size_Mat in DDStart
+				D.f[DIR_M00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_MMM] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_PMM]= &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_MPM]= &DDStart[DIR_MPM * size_Mat];
 			}
 			else
 			{
-				D.f[DIR_M00] = &DDStart[DIR_P00   *size_Mat];
-				D.f[DIR_P00] = &DDStart[DIR_M00   *size_Mat];
-				D.f[DIR_0M0] = &DDStart[DIR_0P0   *size_Mat];
-				D.f[DIR_0P0] = &DDStart[DIR_0M0   *size_Mat];
-				D.f[DIR_00M] = &DDStart[DIR_00P   *size_Mat];
-				D.f[DIR_00P] = &DDStart[DIR_00M   *size_Mat];
-				D.f[DIR_MM0] = &DDStart[DIR_PP0  *size_Mat];
-				D.f[DIR_PP0] = &DDStart[DIR_MM0  *size_Mat];
-				D.f[DIR_MP0] = &DDStart[DIR_PM0  *size_Mat];
-				D.f[DIR_PM0] = &DDStart[DIR_MP0  *size_Mat];
-				D.f[DIR_M0M] = &DDStart[DIR_P0P  *size_Mat];
-				D.f[DIR_P0P] = &DDStart[DIR_M0M  *size_Mat];
-				D.f[DIR_M0P] = &DDStart[DIR_P0M  *size_Mat];
-				D.f[DIR_P0M] = &DDStart[DIR_M0P  *size_Mat];
-				D.f[DIR_0MM] = &DDStart[DIR_0PP  *size_Mat];
-				D.f[DIR_0PP] = &DDStart[DIR_0MM  *size_Mat];
-				D.f[DIR_0MP] = &DDStart[DIR_0PM  *size_Mat];
-				D.f[DIR_0PM] = &DDStart[DIR_0MP  *size_Mat];
-				D.f[DIR_000] = &DDStart[DIR_000*size_Mat];
-				D.f[DIR_MMM] = &DDStart[DIR_PPP *size_Mat];
-				D.f[DIR_PPM] = &DDStart[DIR_MMP *size_Mat];
-				D.f[DIR_MPM]= &DDStart[DIR_PMP *size_Mat];
-				D.f[DIR_PMM]= &DDStart[DIR_MPP *size_Mat];
-				D.f[DIR_MMP] = &DDStart[DIR_PPM *size_Mat];
-				D.f[DIR_PPP] = &DDStart[DIR_MMM *size_Mat];
-				D.f[DIR_MPP] = &DDStart[DIR_PMM *size_Mat];
-				D.f[DIR_PMP] = &DDStart[DIR_MPM *size_Mat];
+				D.f[DIR_M00] = &DDStart[DIR_P00 * size_Mat]; // else branch: each direction points at the offset of the direction with P and M swapped
+				D.f[DIR_P00] = &DDStart[DIR_M00 * size_Mat];
+				D.f[DIR_0M0] = &DDStart[DIR_0P0 * size_Mat];
+				D.f[DIR_0P0] = &DDStart[DIR_0M0 * size_Mat];
+				D.f[DIR_00M] = &DDStart[DIR_00P * size_Mat];
+				D.f[DIR_00P] = &DDStart[DIR_00M * size_Mat];
+				D.f[DIR_MM0] = &DDStart[DIR_PP0 * size_Mat];
+				D.f[DIR_PP0] = &DDStart[DIR_MM0 * size_Mat];
+				D.f[DIR_MP0] = &DDStart[DIR_PM0 * size_Mat];
+				D.f[DIR_PM0] = &DDStart[DIR_MP0 * size_Mat];
+				D.f[DIR_M0M] = &DDStart[DIR_P0P * size_Mat];
+				D.f[DIR_P0P] = &DDStart[DIR_M0M * size_Mat];
+				D.f[DIR_M0P] = &DDStart[DIR_P0M * size_Mat];
+				D.f[DIR_P0M] = &DDStart[DIR_M0P * size_Mat];
+				D.f[DIR_0MM] = &DDStart[DIR_0PP * size_Mat];
+				D.f[DIR_0PP] = &DDStart[DIR_0MM * size_Mat];
+				D.f[DIR_0MP] = &DDStart[DIR_0PM * size_Mat];
+				D.f[DIR_0PM] = &DDStart[DIR_0MP * size_Mat];
+				D.f[DIR_000] = &DDStart[DIR_000 * size_Mat]; // DIR_000 keeps the same offset in both branches
+				D.f[DIR_MMM] = &DDStart[DIR_PPP * size_Mat];
+				D.f[DIR_PPM] = &DDStart[DIR_MMP * size_Mat];
+				D.f[DIR_MPM]= &DDStart[DIR_PMP * size_Mat];
+				D.f[DIR_PMM]= &DDStart[DIR_MPP * size_Mat];
+				D.f[DIR_MMP] = &DDStart[DIR_PPM * size_Mat];
+				D.f[DIR_PPP] = &DDStart[DIR_MMM * size_Mat];
+				D.f[DIR_MPP] = &DDStart[DIR_PMM * size_Mat];
+				D.f[DIR_PMP] = &DDStart[DIR_MPM * size_Mat];
 			}
 
 			////////////////////////////////////////////////////////////////////////////////
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Utilities/ChimeraTransformation.h b/src/gpu/VirtualFluids_GPU/Kernel/Utilities/ChimeraTransformation.h
deleted file mode 100644
index f7822d63fa0efd34b27773dffdeebddf521a8792..0000000000000000000000000000000000000000
--- a/src/gpu/VirtualFluids_GPU/Kernel/Utilities/ChimeraTransformation.h
+++ /dev/null
@@ -1,68 +0,0 @@
-#ifndef CHIMERA_TRANSFORMATION_H
-#define CHIMERA_TRANSFORMATION_H
-
-#include <lbm/constants/NumericConstants.h>
-
-using namespace vf::lbm::constant;
-
-////////////////////////////////////////////////////////////////////////////////
-//! \brief forward chimera transformation \ref forwardInverseChimeraWithK
-//! Transformation from distributions to central moments according to Eq. (6)-(14) in \ref
-//! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040
-//! ]</b></a> Modified for lower round-off errors.
-inline __device__ void forwardInverseChimeraWithK(real &mfa, real &mfb, real &mfc, real vv, real v2, real Kinverse, real K)
-{
-    real m2 = mfa + mfc;
-    real m1 = mfc - mfa;
-    real m0 = m2 + mfb;
-    mfa     = m0;
-    m0 *= Kinverse;
-    m0 += c1o1;
-    mfb = (m1 * Kinverse - m0 * vv) * K;
-    mfc = ((m2 - c2o1 * m1 * vv) * Kinverse + v2 * m0) * K;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-//! \brief backward chimera transformation \ref backwardInverseChimeraWithK
-//! Transformation from central moments to distributions according to Eq. (57)-(65) in \ref
-//! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040
-//! ]</b></a> Modified for lower round-off errors.
-inline __device__ void backwardInverseChimeraWithK(real &mfa, real &mfb, real &mfc, real vv, real v2, real Kinverse, real K)
-{
-    real m0 = (((mfc - mfb) * c1o2 + mfb * vv) * Kinverse + (mfa * Kinverse + c1o1) * (v2 - vv) * c1o2) * K;
-    real m1 = (((mfa - mfc) - c2o1 * mfb * vv) * Kinverse + (mfa * Kinverse + c1o1) * (-v2)) * K;
-    mfc     = (((mfc + mfb) * c1o2 + mfb * vv) * Kinverse + (mfa * Kinverse + c1o1) * (v2 + vv) * c1o2) * K;
-    mfa     = m0;
-    mfb     = m1;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-//! \brief forward chimera transformation \ref forwardChimera
-//! Transformation from distributions to central moments according to Eq. (6)-(14) in \ref
-//! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040
-//! ]</b></a> for \f$ K_{abc}=0 \f$. This is to avoid unnessary floating point operations. Modified for lower round-off
-//! errors.
-inline __device__ void forwardChimera(real &mfa, real &mfb, real &mfc, real vv, real v2)
-{
-    real m1 = (mfa + mfc) + mfb;
-    real m2 = mfc - mfa;
-    mfc     = (mfc + mfa) + (v2 * m1 - c2o1 * vv * m2);
-    mfb     = m2 - vv * m1;
-    mfa     = m1;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-//! \brief backward chimera transformation \ref backwardChimera
-//! Transformation from central moments to distributions according to Eq. (57)-(65) in \ref
-//! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040
-//! ]</b></a> for \f$ K_{abc}=0 \f$. This is to avoid unnessary floating point operations. Modified for lower round-off
-//! errors.
-inline __device__ void backwardChimera(real &mfa, real &mfb, real &mfc, real vv, real v2)
-{
-    real ma = (mfc + mfa * (v2 - vv)) * c1o2 + mfb * (vv - c1o2);
-    real mb = ((mfa - mfc) - mfa * v2) - c2o1 * mfb * vv;
-    mfc     = (mfc + mfa * (v2 + vv)) * c1o2 + mfb * (vv + c1o2);
-    mfb     = mb;
-    mfa     = ma;
-}
-#endif
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Utilities/DistributionHelper.cu b/src/gpu/VirtualFluids_GPU/Kernel/Utilities/DistributionHelper.cu
index 7c477c539dc3526389dc22563b50501e778a63f3..240a6ffbace64147aa67224fe72c946761fdc452 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Utilities/DistributionHelper.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Utilities/DistributionHelper.cu
@@ -2,8 +2,7 @@
 
 #include <cuda_runtime.h>
 
-
-#include <lbm/constants/NumericConstants.h>
+#include "lbm/constants/NumericConstants.h"
 #include "lbm/constants/D3Q27.h"
 using namespace vf::lbm::dir;
 
@@ -80,10 +79,4 @@ __device__ void DistributionWrapper::write()
     (distribution_references.f[DIR_000])[k]   = distribution.f[vf::lbm::dir::ZZZ];
 }
 
-__device__ bool isValidFluidNode(uint nodeType)
-{
-    return (nodeType == GEO_FLUID || nodeType == GEO_PM_0 || nodeType == GEO_PM_1 || nodeType == GEO_PM_2);
-}
-
-
 }
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Utilities/DistributionHelper.cuh b/src/gpu/VirtualFluids_GPU/Kernel/Utilities/DistributionHelper.cuh
index 1009ecfa92f31e821d825ad72ba681bc3ae96d1b..599f3f46668c07da49725770177d77239f8ef9df 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Utilities/DistributionHelper.cuh
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Utilities/DistributionHelper.cuh
@@ -37,76 +37,13 @@
 
 #include "lbm/KernelParameter.h"
 #include "lbm/constants/D3Q27.h"
+#include "LBM/GPUHelperFunctions/KernelUtilities.h"
 
 using namespace vf::lbm::dir;
 
 namespace vf::gpu
 {
 
-__inline__ __device__ __host__ void getPointersToDistributions(Distributions27 &dist, real *distributionArray, const uint numberOfLBnodes, const bool isEvenTimestep)
-{
-    if (isEvenTimestep)
-    {
-        dist.f[DIR_000] = &distributionArray[DIR_000 * numberOfLBnodes];
-        dist.f[DIR_P00] = &distributionArray[DIR_P00 * numberOfLBnodes];
-        dist.f[DIR_M00] = &distributionArray[DIR_M00 * numberOfLBnodes];
-        dist.f[DIR_0P0] = &distributionArray[DIR_0P0 * numberOfLBnodes];
-        dist.f[DIR_0M0] = &distributionArray[DIR_0M0 * numberOfLBnodes];
-        dist.f[DIR_00P] = &distributionArray[DIR_00P * numberOfLBnodes];
-        dist.f[DIR_00M] = &distributionArray[DIR_00M * numberOfLBnodes];
-        dist.f[DIR_PP0] = &distributionArray[DIR_PP0 * numberOfLBnodes];
-        dist.f[DIR_MM0] = &distributionArray[DIR_MM0 * numberOfLBnodes];
-        dist.f[DIR_PM0] = &distributionArray[DIR_PM0 * numberOfLBnodes];
-        dist.f[DIR_MP0] = &distributionArray[DIR_MP0 * numberOfLBnodes];
-        dist.f[DIR_P0P] = &distributionArray[DIR_P0P * numberOfLBnodes];
-        dist.f[DIR_M0M] = &distributionArray[DIR_M0M * numberOfLBnodes];
-        dist.f[DIR_P0M] = &distributionArray[DIR_P0M * numberOfLBnodes];
-        dist.f[DIR_M0P] = &distributionArray[DIR_M0P * numberOfLBnodes];
-        dist.f[DIR_0PP] = &distributionArray[DIR_0PP * numberOfLBnodes];
-        dist.f[DIR_0MM] = &distributionArray[DIR_0MM * numberOfLBnodes];
-        dist.f[DIR_0PM] = &distributionArray[DIR_0PM * numberOfLBnodes];
-        dist.f[DIR_0MP] = &distributionArray[DIR_0MP * numberOfLBnodes];
-        dist.f[DIR_PPP] = &distributionArray[DIR_PPP * numberOfLBnodes];
-        dist.f[DIR_MMP] = &distributionArray[DIR_MMP * numberOfLBnodes];
-        dist.f[DIR_PMP] = &distributionArray[DIR_PMP * numberOfLBnodes];
-        dist.f[DIR_MPP] = &distributionArray[DIR_MPP * numberOfLBnodes];
-        dist.f[DIR_PPM] = &distributionArray[DIR_PPM * numberOfLBnodes];
-        dist.f[DIR_MMM] = &distributionArray[DIR_MMM * numberOfLBnodes];
-        dist.f[DIR_PMM] = &distributionArray[DIR_PMM * numberOfLBnodes];
-        dist.f[DIR_MPM] = &distributionArray[DIR_MPM * numberOfLBnodes];
-    }
-    else
-    {
-         dist.f[DIR_M00] = &distributionArray[DIR_P00 * numberOfLBnodes];
-         dist.f[DIR_P00] = &distributionArray[DIR_M00 * numberOfLBnodes];
-         dist.f[DIR_0M0] = &distributionArray[DIR_0P0 * numberOfLBnodes];
-         dist.f[DIR_0P0] = &distributionArray[DIR_0M0 * numberOfLBnodes];
-         dist.f[DIR_00M] = &distributionArray[DIR_00P * numberOfLBnodes];
-         dist.f[DIR_00P] = &distributionArray[DIR_00M * numberOfLBnodes];
-         dist.f[DIR_MM0] = &distributionArray[DIR_PP0 * numberOfLBnodes];
-         dist.f[DIR_PP0] = &distributionArray[DIR_MM0 * numberOfLBnodes];
-         dist.f[DIR_MP0] = &distributionArray[DIR_PM0 * numberOfLBnodes];
-         dist.f[DIR_PM0] = &distributionArray[DIR_MP0 * numberOfLBnodes];
-         dist.f[DIR_M0M] = &distributionArray[DIR_P0P * numberOfLBnodes];
-         dist.f[DIR_P0P] = &distributionArray[DIR_M0M * numberOfLBnodes];
-         dist.f[DIR_M0P] = &distributionArray[DIR_P0M * numberOfLBnodes];
-         dist.f[DIR_P0M] = &distributionArray[DIR_M0P * numberOfLBnodes];
-         dist.f[DIR_0MM] = &distributionArray[DIR_0PP * numberOfLBnodes];
-         dist.f[DIR_0PP] = &distributionArray[DIR_0MM * numberOfLBnodes];
-         dist.f[DIR_0MP] = &distributionArray[DIR_0PM * numberOfLBnodes];
-         dist.f[DIR_0PM] = &distributionArray[DIR_0MP * numberOfLBnodes];
-         dist.f[DIR_000] = &distributionArray[DIR_000 * numberOfLBnodes];
-         dist.f[DIR_PPP] = &distributionArray[DIR_MMM * numberOfLBnodes];
-         dist.f[DIR_MMP] = &distributionArray[DIR_PPM * numberOfLBnodes];
-         dist.f[DIR_PMP] = &distributionArray[DIR_MPM * numberOfLBnodes];
-         dist.f[DIR_MPP] = &distributionArray[DIR_PMM * numberOfLBnodes];
-         dist.f[DIR_PPM] = &distributionArray[DIR_MMP * numberOfLBnodes];
-         dist.f[DIR_MMM] = &distributionArray[DIR_PPP * numberOfLBnodes];
-         dist.f[DIR_PMM] = &distributionArray[DIR_MPP * numberOfLBnodes];
-         dist.f[DIR_MPM] = &distributionArray[DIR_PMP * numberOfLBnodes];
-    }
-}
-
 /**
 *  Getting references to the 27 directions.
 *  @params distributions 1D real* array containing all data (number of elements = 27 * matrix_size)
@@ -114,7 +51,7 @@ __inline__ __device__ __host__ void getPointersToDistributions(Distributions27 &
 *  @params isEvenTimestep: stored data dependent on timestep is based on the esoteric twist algorithm
 *  @return a data struct containing the addresses to the 27 directions within the 1D distribution array
 */
-__inline__ __device__ __host__ DistributionReferences27 getDistributionReferences27(real* distributions, unsigned int numberOfLBnodes, bool isEvenTimestep){
+__inline__ __device__ __host__ DistributionReferences27 getDistributionReferences27(real* distributions, const unsigned long long numberOfLBnodes, const bool isEvenTimestep){
     DistributionReferences27 distribution_references;
     getPointersToDistributions(distribution_references, distributions, numberOfLBnodes, isEvenTimestep);
     return distribution_references;
@@ -157,20 +94,6 @@ struct DistributionWrapper
     const uint kbsw;
 };
 
-__inline__ __device__ unsigned int getNodeIndex()
-{
-    const unsigned x = threadIdx.x;
-    const unsigned y = blockIdx.x;
-    const unsigned z = blockIdx.y;
-
-    const unsigned nx = blockDim.x;
-    const unsigned ny = gridDim.x;
-
-    return nx * (ny * z + y) + x;
-}
-
-__device__ bool isValidFluidNode(uint nodeType);
-
 }
 
 #endif
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Utilities/KernelFactory/KernelFactoryImp.cpp b/src/gpu/VirtualFluids_GPU/Kernel/Utilities/KernelFactory/KernelFactoryImp.cpp
index 53ec240f096080097416e640fdd095c3812fb34c..5a2d8c9a426e5cb23ca75f91aaf6fbff75cba72b 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Utilities/KernelFactory/KernelFactoryImp.cpp
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Utilities/KernelFactory/KernelFactoryImp.cpp
@@ -8,11 +8,9 @@
 #include "Kernel/Kernels/BasicKernels/FluidFlow/Compressible/BGKPlus/BGKPlusCompSP27.h"
 #include "Kernel/Kernels/BasicKernels/FluidFlow/Compressible/Cascade/CascadeCompSP27.h"
 #include "Kernel/Kernels/BasicKernels/FluidFlow/Compressible/Cumulant/CumulantCompSP27.h"
-#include "Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17/CumulantK17Comp.h"
 #include "Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17Unified/CumulantK17Unified.h"
 #include "Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chim/CumulantK17CompChim.h"
-#include "Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimStream/CumulantK17CompChimStream.h"
-#include "Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimRedesigned/CumulantK17CompChimRedesigned.h"
+#include "Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17/CumulantK17.h"
 #include "Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17Bulk/CumulantK17BulkComp.h"
 #include "Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantAll4/CumulantAll4CompSP27.h"
 #include "Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK18/CumulantK18Comp.h"
@@ -49,9 +47,6 @@
 #include "Kernel/Kernels/WaleKernels/FluidFlow/Compressible/CumulantK15/WaleCumulantK15Comp.h"
 #include "Kernel/Kernels/WaleKernels/FluidFlow/Compressible/CumulantK15BySoniMalav/WaleBySoniMalavCumulantK15Comp.h"
 
-//turbulent viscosity kernel
-#include "Kernel/Kernels/TurbulentViscosityKernels/FluidFlow/Compressible/CumulantK17chim/TurbulentViscosityCumulantK17CompChim.h"
-
 //strategies
 #include "Kernel/Kernels/BasicKernels/FluidFlow/Compressible/FluidFlowCompStrategy.h"
 #include "Kernel/Kernels/BasicKernels/FluidFlow/Incompressible/FluidFlowIncompStrategy.h"
@@ -61,7 +56,6 @@
 #include "Kernel/Kernels/BasicKernels/AdvectionDiffusion/Incompressible/Mod7/ADMod7IncompStrategy.h"
 #include "Kernel/Kernels/PorousMediaKernels/FluidFlow/Compressible/PMFluidFlowCompStrategy.h"
 #include "Kernel/Kernels/WaleKernels/FluidFlow/Compressible/WaleFluidFlowCompStrategy.h"
-#include "Kernel/Kernels/TurbulentViscosityKernels/FluidFlow/Compressible/TurbulentViscosityFluidFlowCompStrategy.h"
 
 std::vector<std::shared_ptr<Kernel>> KernelFactoryImp::makeKernels(std::shared_ptr<Parameter> para)
 {
@@ -118,9 +112,6 @@ std::shared_ptr<Kernel> KernelFactoryImp::makeKernel(std::shared_ptr<Parameter>
     } else if (kernel == "CumulantCompSP27") {
         newKernel     = CumulantCompSP27::getNewInstance(para, level);
         checkStrategy = FluidFlowCompStrategy::getInstance();
-    } else if (kernel == "CumulantK17Comp") {
-        newKernel     = CumulantK17Comp::getNewInstance(para, level);
-        checkStrategy = FluidFlowCompStrategy::getInstance();
     } else if (kernel == "CumulantK15Unified") {
         newKernel     = std::make_shared<vf::gpu::CumulantK15Unified>(para, level);
         checkStrategy = FluidFlowCompStrategy::getInstance();
@@ -133,12 +124,26 @@ std::shared_ptr<Kernel> KernelFactoryImp::makeKernel(std::shared_ptr<Parameter>
     } else if (kernel == "CumulantK17CompChim") {
         newKernel     = CumulantK17CompChim::getNewInstance(para, level);
         checkStrategy = FluidFlowCompStrategy::getInstance();
-    } else if (kernel == "CumulantK17CompChimStream") {
-        newKernel     = CumulantK17CompChimStream::getNewInstance(para, level);
-        checkStrategy = FluidFlowCompStrategy::getInstance();
-    } else if (kernel == "CumulantK17CompChimRedesigned") {
-        newKernel     = CumulantK17CompChimRedesigned::getNewInstance(para, level);
-        checkStrategy = FluidFlowCompStrategy::getInstance();
+    } else if (kernel == "CumulantK17"){               
+        switch(para->getTurbulenceModel())                                          
+        {   
+            case TurbulenceModel::AMD:
+                newKernel = CumulantK17<TurbulenceModel::AMD>::getNewInstance(para, level);   
+                break;
+            case TurbulenceModel::Smagorinsky:
+                newKernel = CumulantK17<TurbulenceModel::Smagorinsky>::getNewInstance(para, level);  
+                break;
+            case TurbulenceModel::QR:
+                newKernel = CumulantK17<TurbulenceModel::QR>::getNewInstance(para, level);  
+                break;
+            case TurbulenceModel::None:
+                newKernel = CumulantK17<TurbulenceModel::None>::getNewInstance(para, level); 
+                break;
+            default:
+                throw std::runtime_error("Unknown turbulence model!");
+            break;                                                              
+        }                                                                       
+        checkStrategy = FluidFlowCompStrategy::getInstance();       
     } else if (kernel == "CumulantAll4CompSP27") {
         newKernel     = CumulantAll4CompSP27::getNewInstance(para, level);
         checkStrategy = FluidFlowCompStrategy::getInstance();
@@ -197,35 +202,9 @@ std::shared_ptr<Kernel> KernelFactoryImp::makeKernel(std::shared_ptr<Parameter>
         newKernel     = WaleBySoniMalavCumulantK15Comp::getNewInstance(para, level);// ||
         checkStrategy = WaleFluidFlowCompStrategy::getInstance();               // wale model
     }                                                                          //===============
-    else if (kernel == "TurbulentViscosityCumulantK17CompChim"){               // compressible with turbulent viscosity
-        switch(para->getTurbulenceModel())                                     //       ||          
-        {                                                                      //       \/      //
-            case TurbulenceModel::AMD:
-                newKernel = TurbulentViscosityCumulantK17CompChim<TurbulenceModel::AMD>::getNewInstance(para, level);   
-                break;
-            case TurbulenceModel::Smagorinsky:
-                newKernel = TurbulentViscosityCumulantK17CompChim<TurbulenceModel::Smagorinsky>::getNewInstance(para, level);  
-                break;
-            case TurbulenceModel::QR:
-                newKernel = TurbulentViscosityCumulantK17CompChim<TurbulenceModel::QR>::getNewInstance(para, level);  
-                break;
-            case TurbulenceModel::None:
-                throw std::runtime_error("TurbulentViscosityCumulantK17CompChim currently not implemented for TurbulenceModel::None!");
-                break;
-            default:
-                throw std::runtime_error("Unknown turbulence model!");
-            break;                                                              
-        }                                                                       
-        checkStrategy = TurbulentViscosityFluidFlowCompStrategy::getInstance(); 
-                                                                                //     /\      //
-                                                                                //     ||    
-                                                                                // compressible with turbulent viscosity  
-                                                                                //===============         
-    }
     else {
         throw std::runtime_error("KernelFactory does not know the KernelType.");
     }
-
     newKernel->setCheckParameterStrategy(checkStrategy);
     para->setKernelNeedsFluidNodeIndicesToRun(newKernel->getKernelUsesFluidNodeIndices());
     return newKernel;
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Utilities/ScalingHelperFunctions.h b/src/gpu/VirtualFluids_GPU/Kernel/Utilities/ScalingHelperFunctions.h
deleted file mode 100644
index 13ce5d88aaa7cb49225fa914c1f59c2de05802f5..0000000000000000000000000000000000000000
--- a/src/gpu/VirtualFluids_GPU/Kernel/Utilities/ScalingHelperFunctions.h
+++ /dev/null
@@ -1,148 +0,0 @@
-//=======================================================================================
-// ____          ____    __    ______     __________   __      __       __        __         
-// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |        
-//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |        
-//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |        
-//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____    
-//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|   
-//      \    \  |    |   ________________________________________________________________    
-//       \    \ |    |  |  ______________________________________________________________|   
-//        \    \|    |  |  |         __          __     __     __     ______      _______    
-//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)   
-//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______    
-//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
-//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/   
-//
-//  This file is part of VirtualFluids. VirtualFluids is free software: you can 
-//  redistribute it and/or modify it under the terms of the GNU General Public
-//  License as published by the Free Software Foundation, either version 3 of 
-//  the License, or (at your option) any later version.
-//  
-//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT 
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
-//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
-//  for more details.
-//  
-//  You should have received a copy of the GNU General Public License along
-//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
-//
-//! \file scalingHelperFunctions.h
-//! \ingroup GPU/Kernel/Utilities
-//! \author Martin Schoenherr, Anna Wellmann
-//=======================================================================================
-
-#ifndef SCALING_HELPER_FUNCTIONS_H
-#define SCALING_HELPER_FUNCTIONS_H
-
-#include "LBM/LB.h" 
-#include "lbm/constants/D3Q27.h"
-#include "lbm/constants/NumericConstants.h"
-
-using namespace vf::lbm::constant;
-using namespace vf::lbm::dir;
-
-__device__ __inline__ void calculateMomentsOnSourceNodes(
-    Distributions27& dist,
-    real& omega,
-    unsigned int& k_000,
-    unsigned int& k_M00,
-    unsigned int& k_0M0,
-    unsigned int& k_00M,
-    unsigned int& k_MM0,
-    unsigned int& k_M0M,
-    unsigned int& k_0MM,
-    unsigned int& k_MMM,
-    real& drho,
-    real& velocityX,
-    real& velocityY,
-    real& velocityZ,
-    real& kxyFromfcNEQ,
-    real& kyzFromfcNEQ,
-    real& kxzFromfcNEQ,
-    real& kxxMyyFromfcNEQ,
-    real& kxxMzzFromfcNEQ
-    ){
-        ////////////////////////////////////////////////////////////////////////////////////
-        //! - Set local distributions (f's) on source nodes:
-        //!
-        real f_000 = (dist.f[DIR_000])[k_000]; 
-        real f_P00 = (dist.f[DIR_P00])[k_000];
-        real f_M00 = (dist.f[DIR_M00])[k_M00];
-        real f_0P0 = (dist.f[DIR_0P0])[k_000];
-        real f_0M0 = (dist.f[DIR_0M0])[k_0M0];
-        real f_00P = (dist.f[DIR_00P])[k_000];
-        real f_00M = (dist.f[DIR_00M])[k_00M];
-        real f_PP0 = (dist.f[DIR_PP0])[k_000];
-        real f_MM0 = (dist.f[DIR_MM0])[k_MM0];
-        real f_PM0 = (dist.f[DIR_PM0])[k_0M0];
-        real f_MP0 = (dist.f[DIR_MP0])[k_M00];
-        real f_P0P = (dist.f[DIR_P0P])[k_000];
-        real f_M0M = (dist.f[DIR_M0M])[k_M0M];
-        real f_P0M = (dist.f[DIR_P0M])[k_00M];
-        real f_M0P = (dist.f[DIR_M0P])[k_M00];
-        real f_0PP = (dist.f[DIR_0PP])[k_000];
-        real f_0MM = (dist.f[DIR_0MM])[k_0MM];
-        real f_0PM = (dist.f[DIR_0PM])[k_00M];
-        real f_0MP = (dist.f[DIR_0MP])[k_0M0];
-        real f_PPP = (dist.f[DIR_PPP])[k_000];
-        real f_MPP = (dist.f[DIR_MPP])[k_M00];
-        real f_PMP = (dist.f[DIR_PMP])[k_0M0];
-        real f_MMP = (dist.f[DIR_MMP])[k_MM0];
-        real f_PPM = (dist.f[DIR_PPM])[k_00M];
-        real f_MPM = (dist.f[DIR_MPM])[k_M0M];
-        real f_PMM = (dist.f[DIR_PMM])[k_0MM];
-        real f_MMM = (dist.f[DIR_MMM])[k_MMM];
-
-        ////////////////////////////////////////////////////////////////////////////////////
-        //! - Calculate density and velocity using pyramid summation for low round-off errors as in Eq. (J1)-(J3) \ref
-        //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015),
-        //! DOI:10.1016/j.camwa.2015.05.001 ]</b></a>
-        //!
-        drho = ((((f_PPP + f_MMM) + (f_MPM + f_PMP)) + ((f_MPP + f_PMM) + (f_MMP + f_PPM))) +
-                (((f_0MP + f_0PM) + (f_0MM + f_0PP)) + ((f_M0P + f_P0M) + (f_M0M + f_P0P)) +
-                 ((f_MP0 + f_PM0) + (f_MM0 + f_PP0))) +
-                 ((f_M00 + f_P00) + (f_0M0 + f_0P0) + (f_00M + f_00P))) +
-                   f_000;
-
-        real oneOverRho = c1o1 / (c1o1 + drho);
-
-        velocityX = ((((f_PPP - f_MMM) + (f_PMP - f_MPM)) + ((f_PMM - f_MPP) + (f_PPM - f_MMP))) +
-                     (((f_P0M - f_M0P) + (f_P0P - f_M0M)) + ((f_PM0 - f_MP0) + (f_PP0 - f_MM0))) + (f_P00 - f_M00)) *
-                    oneOverRho;
-        velocityY = ((((f_PPP - f_MMM) + (f_MPM - f_PMP)) + ((f_MPP - f_PMM) + (f_PPM - f_MMP))) +
-                     (((f_0PM - f_0MP) + (f_0PP - f_0MM)) + ((f_MP0 - f_PM0) + (f_PP0 - f_MM0))) + (f_0P0 - f_0M0)) *
-                    oneOverRho;
-        velocityZ = ((((f_PPP - f_MMM) + (f_PMP - f_MPM)) + ((f_MPP - f_PMM) + (f_MMP - f_PPM))) +
-                     (((f_0MP - f_0PM) + (f_0PP - f_0MM)) + ((f_M0P - f_P0M) + (f_P0P - f_M0M))) + (f_00P - f_00M)) *
-                    oneOverRho;
-
-        ////////////////////////////////////////////////////////////////////////////////////
-        //! - Calculate second order moments for interpolation
-        //!
-        // example: kxxMzz: moment, second derivative in x direction minus the second derivative in z direction
-        kxyFromfcNEQ =
-            -c3o1 * omega *
-            ((f_MM0 + f_MMM + f_MMP - f_MP0 - f_MPM - f_MPP - f_PM0 - f_PMM - f_PMP + f_PP0 + f_PPM + f_PPP) /
-                 (c1o1 + drho) -
-             ((velocityX * velocityY)));
-        kyzFromfcNEQ =
-            -c3o1 * omega *
-            ((f_0MM + f_PMM + f_MMM - f_0MP - f_PMP - f_MMP - f_0PM - f_PPM - f_MPM + f_0PP + f_PPP + f_MPP) /
-                 (c1o1 + drho) -
-             ((velocityY * velocityZ)));
-        kxzFromfcNEQ =
-            -c3o1 * omega *
-            ((f_M0M + f_MMM + f_MPM - f_M0P - f_MMP - f_MPP - f_P0M - f_PMM - f_PPM + f_P0P + f_PMP + f_PPP) /
-                 (c1o1 + drho) -
-             ((velocityX * velocityZ)));
-        kxxMyyFromfcNEQ =
-            -c3o2 * omega *
-            ((f_M0M + f_M00 + f_M0P - f_0MM - f_0M0 - f_0MP - f_0PM - f_0P0 - f_0PP + f_P0M + f_P00 + f_P0P) / (c1o1 + drho) -
-             ((velocityX * velocityX - velocityY * velocityY)));
-        kxxMzzFromfcNEQ =
-            -c3o2 * omega *
-            ((f_MM0 + f_M00 + f_MP0 - f_0MM - f_0MP - f_00M - f_00P - f_0PM - f_0PP + f_PM0 + f_P00 + f_PP0) / (c1o1 + drho) -
-             ((velocityX * velocityX - velocityZ * velocityZ)));
-}
-
-#endif
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/KernelManager/ADKernelManager.cpp b/src/gpu/VirtualFluids_GPU/KernelManager/ADKernelManager.cpp
index 9ca813ac4987af618491422acb60207b7fee543c..5a36daecd5a82fc8a052bf51fedc1cb35b94a960 100644
--- a/src/gpu/VirtualFluids_GPU/KernelManager/ADKernelManager.cpp
+++ b/src/gpu/VirtualFluids_GPU/KernelManager/ADKernelManager.cpp
@@ -90,10 +90,10 @@ void ADKernelManager::initAD(const int level) const
 ////////////////////////////////////////////////////////////////////////////////
 void ADKernelManager::setInitialNodeValuesAD(const int level, SPtr<CudaMemoryManager> cudaMemoryManager) const
 {
-    for (uint j = 1; j <= para->getParH(level)->numberOfNodes; j++) {
-        const real coordX = para->getParH(level)->coordinateX[j];
-        const real coordY = para->getParH(level)->coordinateY[j];
-        const real coordZ = para->getParH(level)->coordinateZ[j];
+    for (size_t index = 1; index <= para->getParH(level)->numberOfNodes; index++) {
+        const real coordX = para->getParH(level)->coordinateX[index];
+        const real coordY = para->getParH(level)->coordinateY[index];
+        const real coordZ = para->getParH(level)->coordinateZ[index];
 
         real concentration;
 
@@ -104,7 +104,7 @@ void ADKernelManager::setInitialNodeValuesAD(const int level, SPtr<CudaMemoryMan
             concentration = real(0.0);
         }
 
-        para->getParH(level)->concentration[j] = concentration;
+        para->getParH(level)->concentration[index] = concentration;
     }
 
     cudaMemoryManager->cudaCopyConcentrationHostToDevice(level);
diff --git a/src/gpu/VirtualFluids_GPU/KernelManager/BCKernelManager.cpp b/src/gpu/VirtualFluids_GPU/KernelManager/BCKernelManager.cpp
index cc945ea225a28c58dca4ceefdb80fffb76228b21..e8fc3f318c920be36be7861a28659124a7b1e977 100644
--- a/src/gpu/VirtualFluids_GPU/KernelManager/BCKernelManager.cpp
+++ b/src/gpu/VirtualFluids_GPU/KernelManager/BCKernelManager.cpp
@@ -38,6 +38,7 @@
 
 #include "BCKernelManager.h"
 #include "Factories/BoundaryConditionFactory.h"
+#include "GridGenerator/TransientBCSetter/TransientBCSetter.h"
 #include "Calculation/Cp.h"
 #include "Calculation/DragLift.h"
 #include "GPU/GPU_Interface.h"
@@ -51,6 +52,7 @@ BCKernelManager::BCKernelManager(SPtr<Parameter> parameter, BoundaryConditionFac
     this->pressureBoundaryConditionPre  = bcFactory->getPressureBoundaryConditionPre();
     this->geometryBoundaryConditionPost = bcFactory->getGeometryBoundaryConditionPost();
     this->stressBoundaryConditionPost   = bcFactory->getStressBoundaryConditionPost();
+    this->precursorBoundaryConditionPost = bcFactory->getPrecursorBoundaryConditionPost();
 
     checkBoundaryCondition(this->velocityBoundaryConditionPost, this->para->getParD(0)->velocityBC,
                            "velocityBoundaryConditionPost");
@@ -64,6 +66,8 @@ BCKernelManager::BCKernelManager(SPtr<Parameter> parameter, BoundaryConditionFac
                            "geometryBoundaryConditionPost");
     checkBoundaryCondition(this->stressBoundaryConditionPost, this->para->getParD(0)->stressBC,
                            "stressBoundaryConditionPost");
+    checkBoundaryCondition(this->precursorBoundaryConditionPost, this->para->getParD(0)->precursorBC,
+                           "precursorBoundaryConditionPost");
 }
 
 void BCKernelManager::runVelocityBCKernelPre(const int level) const
@@ -387,3 +391,41 @@ void BCKernelManager::runNoSlipBCKernelPost(const int level) const{
         noSlipBoundaryConditionPost(para->getParD(level).get(), &(para->getParD(level)->noSlipBC));
     }
 }
+
+void BCKernelManager::runPrecursorBCKernelPost(int level, uint t, CudaMemoryManager* cudaMemoryManager)
+{
+    if(para->getParH(level)->precursorBC.numberOfBCnodes == 0) return;
+
+    uint t_level = para->getTimeStep(level, t, true);
+
+    uint lastTime =    (para->getParD(level)->precursorBC.nPrecursorReads-2)*para->getParD(level)->precursorBC.timeStepsBetweenReads; // timestep currently loaded into last arrays
+    uint currentTime = (para->getParD(level)->precursorBC.nPrecursorReads-1)*para->getParD(level)->precursorBC.timeStepsBetweenReads; // timestep currently loaded into current arrays
+    uint nextTime =     para->getParD(level)->precursorBC.nPrecursorReads   *para->getParD(level)->precursorBC.timeStepsBetweenReads; // timestep currently loaded into next arrays
+    
+    if(t_level>=currentTime)
+    {
+        //cycle time
+        lastTime = currentTime;
+        currentTime = nextTime;
+        nextTime += para->getParD(level)->precursorBC.timeStepsBetweenReads;
+
+        //cycle pointers
+        real* tmp = para->getParD(level)->precursorBC.last;
+        para->getParD(level)->precursorBC.last = para->getParD(level)->precursorBC.current;
+        para->getParD(level)->precursorBC.current = para->getParD(level)->precursorBC.next;
+        para->getParD(level)->precursorBC.next = tmp;
+
+        real loadTime = nextTime*pow(2,-level)*para->getTimeRatio();
+
+        for(auto reader : para->getParH(level)->transientBCInputFileReader)
+        {   
+            reader->getNextData(para->getParH(level)->precursorBC.next, para->getParH(level)->precursorBC.numberOfPrecursorNodes, loadTime);
+        }
+        cudaMemoryManager->cudaCopyPrecursorData(level);
+        para->getParD(level)->precursorBC.nPrecursorReads++;
+        para->getParH(level)->precursorBC.nPrecursorReads++;  
+    }
+    
+    real tRatio = real(t_level-lastTime)/para->getParD(level)->precursorBC.timeStepsBetweenReads;
+    precursorBoundaryConditionPost(para->getParD(level).get(), &para->getParD(level)->precursorBC, tRatio, para->getVelocityRatio());
+}
diff --git a/src/gpu/VirtualFluids_GPU/KernelManager/BCKernelManager.h b/src/gpu/VirtualFluids_GPU/KernelManager/BCKernelManager.h
index 423a9cc9056281a3a2a135ae32fa26cc47f93967..339100e6b5307e8e60f8d0846560bf89c6eea1a1 100644
--- a/src/gpu/VirtualFluids_GPU/KernelManager/BCKernelManager.h
+++ b/src/gpu/VirtualFluids_GPU/KernelManager/BCKernelManager.h
@@ -41,6 +41,7 @@
 #include "PointerDefinitions.h"
 #include "VirtualFluids_GPU_export.h"
 
+
 class CudaMemoryManager;
 class BoundaryConditionFactory;
 class Parameter;
@@ -48,6 +49,7 @@ struct LBMSimulationParameter;
 
 using boundaryCondition = std::function<void(LBMSimulationParameter *, QforBoundaryConditions *)>;
 using boundaryConditionWithParameter = std::function<void(Parameter *, QforBoundaryConditions *, const int level)>;
+using precursorBoundaryCondition = std::function<void(LBMSimulationParameter *, QforPrecursorBoundaryConditions *, real tRatio, real velocityRatio)>;
 
 //! \class BCKernelManager
 //! \brief manage the cuda kernel calls to boundary conditions
@@ -84,7 +86,10 @@ public:
     //! \brief calls the device function of the pressure boundary condition (post-collision)
     void runPressureBCKernelPost(const int level) const;
 
-    //! \brief calls the device function of the outflow boundary condition (pre-collision)
+	//! \brief calls the device function of the precursor boundary condition
+	void runPrecursorBCKernelPost(int level, uint t, CudaMemoryManager* cudaMemoryManager);
+
+    //! \brief calls the device function of the outflow boundary condition
     void runOutflowBCKernelPre(const int level) const;
 
     //! \brief calls the device function of the stress wall model (post-collision)
@@ -96,13 +101,16 @@ private:
     //! \param boundaryCondition: a kernel function for the boundary condition
     //! \param bcStruct: a struct containing the grid nodes which are part of the boundary condition
     //! \param bcName: the name of the checked boundary condition
-    template <typename bcFunction>
-    void checkBoundaryCondition(const bcFunction &boundaryCondition, const QforBoundaryConditions &bcStruct, const std::string &bcName)
+    template <typename bcFunction, typename QforBC>
+    void checkBoundaryCondition(const bcFunction &boundaryCondition, const QforBC &bcStruct, const std::string &bcName)
     {
         if (!boundaryCondition && bcStruct.numberOfBCnodes > 0)
             throw std::runtime_error("The boundary condition " + bcName + " was not set!");
     }
 
+    void runDistributionPrecursorBCKernelPost(int level, uint t, CudaMemoryManager* cudaMemoryManager);
+    void runVelocityPrecursorBCKernelPost(int level, uint t, CudaMemoryManager* cudaMemoryManager);
+
     SPtr<Parameter> para;
 
     boundaryCondition velocityBoundaryConditionPost = nullptr;
@@ -111,5 +119,6 @@ private:
     boundaryCondition pressureBoundaryConditionPre = nullptr;
     boundaryCondition geometryBoundaryConditionPost = nullptr;
     boundaryConditionWithParameter stressBoundaryConditionPost = nullptr;
+    precursorBoundaryCondition precursorBoundaryConditionPost = nullptr;
 };
 #endif
diff --git a/src/gpu/VirtualFluids_GPU/KernelManager/BCKernelManagerTest.cpp b/src/gpu/VirtualFluids_GPU/KernelManager/BCKernelManagerTest.cpp
index d55fa51bd8a225dd4e89e684bc81cd56f3f450c0..a0e02112e821eedcfeb013d3465529f668309529 100644
--- a/src/gpu/VirtualFluids_GPU/KernelManager/BCKernelManagerTest.cpp
+++ b/src/gpu/VirtualFluids_GPU/KernelManager/BCKernelManagerTest.cpp
@@ -53,3 +53,9 @@ TEST_F(BCKernelManagerTest_BCsNotSpecified, stressBoundaryConditionPost_NotSpeci
     para->getParD(0)->stressBC.numberOfBCnodes = 1;
     EXPECT_THROW(BCKernelManager(para, &bcFactory), std::runtime_error);
 }
+
+TEST_F(BCKernelManagerTest_BCsNotSpecified, precursorBoundaryConditionPost_NotSpecified)
+{
+    para->getParD(0)->precursorBC.numberOfBCnodes = 1;
+    EXPECT_THROW(BCKernelManager(para, &bcFactory), std::runtime_error);
+}
diff --git a/src/gpu/VirtualFluids_GPU/KernelManager/GridScalingKernelManager.cpp b/src/gpu/VirtualFluids_GPU/KernelManager/GridScalingKernelManager.cpp
index c3129e31a9c750a012a26d58961062eaf3f40add..2b6a266c0d4e5f523091fa4982eee5d83b2ec675 100644
--- a/src/gpu/VirtualFluids_GPU/KernelManager/GridScalingKernelManager.cpp
+++ b/src/gpu/VirtualFluids_GPU/KernelManager/GridScalingKernelManager.cpp
@@ -59,8 +59,9 @@ GridScalingKernelManager::GridScalingKernelManager(SPtr<Parameter> parameter, Gr
         VF_LOG_TRACE("Function for scalingCoarseToFine is nullptr");
 }
 
-void GridScalingKernelManager::runFineToCoarseKernelLB(const int level, InterpolationCellFC *icellFC, OffFC &offFC, int streamIndex) const{
-    cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex);
+void GridScalingKernelManager::runFineToCoarseKernelLB(const int level, InterpolationCellFC *icellFC, OffFC &offFC, CudaStreamIndex streamIndex) const
+{
+    cudaStream_t stream = para->getStreamManager()->getStream(streamIndex);
 
     this->scalingFineToCoarse(para->getParD(level).get(), para->getParD(level+1).get(), icellFC, offFC, stream);
 
@@ -327,9 +328,9 @@ void GridScalingKernelManager::runFineToCoarseKernelAD(const int level) const
     }
 }
 
-void GridScalingKernelManager::runCoarseToFineKernelLB(const int level, InterpolationCellCF* icellCF, OffCF &offCF, int streamIndex) const
+void GridScalingKernelManager::runCoarseToFineKernelLB(const int level, InterpolationCellCF* icellCF, OffCF &offCF, CudaStreamIndex streamIndex) const
 {
-    cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex);
+    cudaStream_t stream = para->getStreamManager()->getStream(streamIndex);
     this->scalingCoarseToFine(para->getParD(level).get(), para->getParD(level+1).get(), icellCF, offCF, stream);
 
     // ScaleCF_comp_D3Q27F3(
diff --git a/src/gpu/VirtualFluids_GPU/KernelManager/GridScalingKernelManager.h b/src/gpu/VirtualFluids_GPU/KernelManager/GridScalingKernelManager.h
index 85cdd88ec2e3a6622108026ce8f53c5c770f8afe..3c78ee7f9db254556e8ec6dbbafaf51cd995f10b 100644
--- a/src/gpu/VirtualFluids_GPU/KernelManager/GridScalingKernelManager.h
+++ b/src/gpu/VirtualFluids_GPU/KernelManager/GridScalingKernelManager.h
@@ -44,6 +44,7 @@
 class Parameter;
 class CudaMemoryManager;
 class GridScalingFactory;
+enum class CudaStreamIndex;
 struct LBMSimulationParameter;
 struct CUstream_st;
 
@@ -62,14 +63,14 @@ public:
     //! \throws std::runtime_error when the user forgets to specify a scaling function
     GridScalingKernelManager(SPtr<Parameter> parameter, GridScalingFactory *gridScalingFactory);
 
-    //! \brief calls the device function of the fine to coarse grid interpolation kernel
-    void runFineToCoarseKernelLB(const int level, InterpolationCellFC *icellFC, OffFC &offFC, int streamIndex) const;
+    //! \brief calls the device function of the fine to coarse grid interpolation kernel
+    void runFineToCoarseKernelLB(const int level, InterpolationCellFC *icellFC, OffFC &offFC, CudaStreamIndex streamIndex) const;
 
     //! \brief calls the device function of the fine to coarse grid interpolation kernel (advection diffusion)
     void runFineToCoarseKernelAD(const int level) const;
 
     //! \brief calls the device function of the coarse to fine grid interpolation kernel
-    void runCoarseToFineKernelLB(const int level, InterpolationCellCF *icellCF, OffCF &offCF, int streamIndex) const;
+    void runCoarseToFineKernelLB(const int level, InterpolationCellCF *icellCF, OffCF &offCF, CudaStreamIndex streamIndex) const;
 
     //! \brief calls the device function of the coarse to fine grid interpolation kernel (advection diffusion)
     void runCoarseToFineKernelAD(const int level) const;
diff --git a/src/gpu/VirtualFluids_GPU/LBM/GPUHelperFunctions/ChimeraTransformation.h b/src/gpu/VirtualFluids_GPU/LBM/GPUHelperFunctions/ChimeraTransformation.h
new file mode 100644
index 0000000000000000000000000000000000000000..225f615ec3ad2d8ef11ec295f8d9e8a4166d99fe
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/LBM/GPUHelperFunctions/ChimeraTransformation.h
@@ -0,0 +1,108 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file ChimeraTransformation.h
+//! \ingroup LBM/GPUHelperFunctions
+//! \author Martin Schoenherr, Anna Wellmann, Soeren Peters
+//=======================================================================================
+#ifndef CHIMERA_TRANSFORMATION_H
+#define CHIMERA_TRANSFORMATION_H
+
+#include "LBM/LB.h"
+
+#include <lbm/constants/NumericConstants.h>
+
+using namespace vf::lbm::constant;
+
+namespace vf::gpu
+{
+
+////////////////////////////////////////////////////////////////////////////////
+//! \brief forward chimera transformation \ref forwardInverseChimeraWithK
+//! Transformation from distributions to central moments according to Eq. (6)-(14) in \ref
+//! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040
+//! ]</b></a> Modified for lower round-off errors.
+__inline__ __device__ void forwardInverseChimeraWithK(real &mfa, real &mfb, real &mfc, real vv, real v2, real Kinverse, real K)
+{
+    real m2 = mfa + mfc;
+    real m1 = mfc - mfa;
+    real m0 = m2 + mfb;
+    mfa = m0;
+    m0 *= Kinverse;
+    m0 += c1o1;
+    mfb = (m1 * Kinverse - m0 * vv) * K;
+    mfc = ((m2 - c2o1 * m1 * vv) * Kinverse + v2 * m0) * K;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! \brief backward chimera transformation \ref backwardInverseChimeraWithK
+//! Transformation from central moments to distributions according to Eq. (57)-(65) in \ref
+//! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040
+//! ]</b></a> Modified for lower round-off errors.
+__inline__ __device__ void backwardInverseChimeraWithK(real &mfa, real &mfb, real &mfc, real vv, real v2, real Kinverse, real K)
+{
+    real m0 = (((mfc - mfb) * c1o2 + mfb * vv) * Kinverse + (mfa * Kinverse + c1o1) * (v2 - vv) * c1o2) * K;
+    real m1 = (((mfa - mfc) - c2o1 * mfb * vv) * Kinverse + (mfa * Kinverse + c1o1) * (-v2)) * K;
+    mfc = (((mfc + mfb) * c1o2 + mfb * vv) * Kinverse + (mfa * Kinverse + c1o1) * (v2 + vv) * c1o2) * K;
+    mfa = m0;
+    mfb = m1;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! \brief forward chimera transformation \ref forwardChimera
+//! Transformation from distributions to central moments according to Eq. (6)-(14) in \ref
+//! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040
+//! ]</b></a> for \f$ K_{abc}=0 \f$. This is to avoid unnecessary floating point operations. Modified for lower round-off
+//! errors.
+__inline__ __device__ void forwardChimera(real &mfa, real &mfb, real &mfc, real vv, real v2)
+{
+    real m1 = (mfa + mfc) + mfb;
+    real m2 = mfc - mfa;
+    mfc = (mfc + mfa) + (v2 * m1 - c2o1 * vv * m2);
+    mfb = m2 - vv * m1;
+    mfa = m1;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! \brief backward chimera transformation \ref backwardChimera
+//! Transformation from central moments to distributions according to Eq. (57)-(65) in \ref
+//! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040
+//! ]</b></a> for \f$ K_{abc}=0 \f$. This is to avoid unnecessary floating point operations. Modified for lower round-off
+//! errors.
+__inline__ __device__ void backwardChimera(real &mfa, real &mfb, real &mfc, real vv, real v2)
+{
+    real ma = (mfc + mfa * (v2 - vv)) * c1o2 + mfb * (vv - c1o2);
+    real mb = ((mfa - mfc) - mfa * v2) - c2o1 * mfb * vv;
+    mfc = (mfc + mfa * (v2 + vv)) * c1o2 + mfb * (vv + c1o2);
+    mfb = mb;
+    mfa = ma;
+}
+
+} // namespace vf::gpu
+
+#endif
diff --git a/src/gpu/VirtualFluids_GPU/LBM/GPUHelperFunctions/KernelUtilities.h b/src/gpu/VirtualFluids_GPU/LBM/GPUHelperFunctions/KernelUtilities.h
new file mode 100644
index 0000000000000000000000000000000000000000..37208ee59586533fa7f8ffbc269246826ed27fb8
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/LBM/GPUHelperFunctions/KernelUtilities.h
@@ -0,0 +1,198 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __         
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |        
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |        
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |        
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____    
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|   
+//      \    \  |    |   ________________________________________________________________    
+//       \    \ |    |  |  ______________________________________________________________|   
+//        \    \|    |  |  |         __          __     __     __     ______      _______    
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)   
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______    
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/   
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can 
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of 
+//  the License, or (at your option) any later version.
+//  
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT 
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
+//  for more details.
+//  
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file KernelUtilities.h
+//! \ingroup LBM/GPUHelperFunctions
+//! \author Martin Schoenherr, Anna Wellmann, Soeren Peters
+//=======================================================================================
+#ifndef KERNEL_UTILITIES_H
+#define KERNEL_UTILITIES_H
+
+#include "LBM/LB.h"
+#include "lbm/constants/D3Q27.h"
+#include "lbm/constants/NumericConstants.h"
+
+using namespace vf::lbm::constant;
+using namespace vf::lbm::dir;
+
+namespace vf::gpu
+{
+
+//! \brief Fill dist.f[] with pointers into distributionArray, each at offset DIR_* * numberOfLBnodes.
+//! On even time steps every direction points at its own offset; on odd time steps each direction points at the
+//! offset of the direction whose name has P and M exchanged (DIR_000 maps to itself in both cases).
+__inline__ __device__ __host__ void getPointersToDistributions(Distributions27 &dist, real *distributionArray, const unsigned long long numberOfLBnodes, const bool isEvenTimestep)
+{
+    if (isEvenTimestep) {
+        dist.f[DIR_000] = distributionArray + DIR_000 * numberOfLBnodes;
+        dist.f[DIR_P00] = distributionArray + DIR_P00 * numberOfLBnodes;
+        dist.f[DIR_M00] = distributionArray + DIR_M00 * numberOfLBnodes;
+        dist.f[DIR_0P0] = distributionArray + DIR_0P0 * numberOfLBnodes;
+        dist.f[DIR_0M0] = distributionArray + DIR_0M0 * numberOfLBnodes;
+        dist.f[DIR_00P] = distributionArray + DIR_00P * numberOfLBnodes;
+        dist.f[DIR_00M] = distributionArray + DIR_00M * numberOfLBnodes;
+        dist.f[DIR_PP0] = distributionArray + DIR_PP0 * numberOfLBnodes;
+        dist.f[DIR_MM0] = distributionArray + DIR_MM0 * numberOfLBnodes;
+        dist.f[DIR_PM0] = distributionArray + DIR_PM0 * numberOfLBnodes;
+        dist.f[DIR_MP0] = distributionArray + DIR_MP0 * numberOfLBnodes;
+        dist.f[DIR_P0P] = distributionArray + DIR_P0P * numberOfLBnodes;
+        dist.f[DIR_M0M] = distributionArray + DIR_M0M * numberOfLBnodes;
+        dist.f[DIR_P0M] = distributionArray + DIR_P0M * numberOfLBnodes;
+        dist.f[DIR_M0P] = distributionArray + DIR_M0P * numberOfLBnodes;
+        dist.f[DIR_0PP] = distributionArray + DIR_0PP * numberOfLBnodes;
+        dist.f[DIR_0MM] = distributionArray + DIR_0MM * numberOfLBnodes;
+        dist.f[DIR_0PM] = distributionArray + DIR_0PM * numberOfLBnodes;
+        dist.f[DIR_0MP] = distributionArray + DIR_0MP * numberOfLBnodes;
+        dist.f[DIR_PPP] = distributionArray + DIR_PPP * numberOfLBnodes;
+        dist.f[DIR_MMP] = distributionArray + DIR_MMP * numberOfLBnodes;
+        dist.f[DIR_PMP] = distributionArray + DIR_PMP * numberOfLBnodes;
+        dist.f[DIR_MPP] = distributionArray + DIR_MPP * numberOfLBnodes;
+        dist.f[DIR_PPM] = distributionArray + DIR_PPM * numberOfLBnodes;
+        dist.f[DIR_MMM] = distributionArray + DIR_MMM * numberOfLBnodes;
+        dist.f[DIR_PMM] = distributionArray + DIR_PMM * numberOfLBnodes;
+        dist.f[DIR_MPM] = distributionArray + DIR_MPM * numberOfLBnodes;
+    } else {
+        dist.f[DIR_000] = distributionArray + DIR_000 * numberOfLBnodes;
+        dist.f[DIR_P00] = distributionArray + DIR_M00 * numberOfLBnodes;
+        dist.f[DIR_M00] = distributionArray + DIR_P00 * numberOfLBnodes;
+        dist.f[DIR_0P0] = distributionArray + DIR_0M0 * numberOfLBnodes;
+        dist.f[DIR_0M0] = distributionArray + DIR_0P0 * numberOfLBnodes;
+        dist.f[DIR_00P] = distributionArray + DIR_00M * numberOfLBnodes;
+        dist.f[DIR_00M] = distributionArray + DIR_00P * numberOfLBnodes;
+        dist.f[DIR_PP0] = distributionArray + DIR_MM0 * numberOfLBnodes;
+        dist.f[DIR_MM0] = distributionArray + DIR_PP0 * numberOfLBnodes;
+        dist.f[DIR_PM0] = distributionArray + DIR_MP0 * numberOfLBnodes;
+        dist.f[DIR_MP0] = distributionArray + DIR_PM0 * numberOfLBnodes;
+        dist.f[DIR_P0P] = distributionArray + DIR_M0M * numberOfLBnodes;
+        dist.f[DIR_M0M] = distributionArray + DIR_P0P * numberOfLBnodes;
+        dist.f[DIR_P0M] = distributionArray + DIR_M0P * numberOfLBnodes;
+        dist.f[DIR_M0P] = distributionArray + DIR_P0M * numberOfLBnodes;
+        dist.f[DIR_0PP] = distributionArray + DIR_0MM * numberOfLBnodes;
+        dist.f[DIR_0MM] = distributionArray + DIR_0PP * numberOfLBnodes;
+        dist.f[DIR_0PM] = distributionArray + DIR_0MP * numberOfLBnodes;
+        dist.f[DIR_0MP] = distributionArray + DIR_0PM * numberOfLBnodes;
+        dist.f[DIR_PPP] = distributionArray + DIR_MMM * numberOfLBnodes;
+        dist.f[DIR_MMP] = distributionArray + DIR_PPM * numberOfLBnodes;
+        dist.f[DIR_PMP] = distributionArray + DIR_MPM * numberOfLBnodes;
+        dist.f[DIR_MPP] = distributionArray + DIR_PMM * numberOfLBnodes;
+        dist.f[DIR_PPM] = distributionArray + DIR_MMP * numberOfLBnodes;
+        dist.f[DIR_MMM] = distributionArray + DIR_PPP * numberOfLBnodes;
+        dist.f[DIR_PMM] = distributionArray + DIR_MPP * numberOfLBnodes;
+        dist.f[DIR_MPM] = distributionArray + DIR_PMP * numberOfLBnodes;
+    }
+}
+
+__inline__ __device__ void getPointersToSubgridDistances(SubgridDistances27& subgridD, real* subgridDistances, const unsigned int numberOfSubgridIndices)
+{   // Point every subgridD.q[dir] at offset dir * numberOfSubgridIndices inside subgridDistances.
+    subgridD.q[DIR_000] = subgridDistances + DIR_000 * numberOfSubgridIndices;
+    subgridD.q[DIR_P00] = subgridDistances + DIR_P00 * numberOfSubgridIndices;
+    subgridD.q[DIR_M00] = subgridDistances + DIR_M00 * numberOfSubgridIndices;
+    subgridD.q[DIR_0P0] = subgridDistances + DIR_0P0 * numberOfSubgridIndices;
+    subgridD.q[DIR_0M0] = subgridDistances + DIR_0M0 * numberOfSubgridIndices;
+    subgridD.q[DIR_00P] = subgridDistances + DIR_00P * numberOfSubgridIndices;
+    subgridD.q[DIR_00M] = subgridDistances + DIR_00M * numberOfSubgridIndices;
+    subgridD.q[DIR_PP0] = subgridDistances + DIR_PP0 * numberOfSubgridIndices;
+    subgridD.q[DIR_MM0] = subgridDistances + DIR_MM0 * numberOfSubgridIndices;
+    subgridD.q[DIR_PM0] = subgridDistances + DIR_PM0 * numberOfSubgridIndices;
+    subgridD.q[DIR_MP0] = subgridDistances + DIR_MP0 * numberOfSubgridIndices;
+    subgridD.q[DIR_P0P] = subgridDistances + DIR_P0P * numberOfSubgridIndices;
+    subgridD.q[DIR_M0M] = subgridDistances + DIR_M0M * numberOfSubgridIndices;
+    subgridD.q[DIR_P0M] = subgridDistances + DIR_P0M * numberOfSubgridIndices;
+    subgridD.q[DIR_M0P] = subgridDistances + DIR_M0P * numberOfSubgridIndices;
+    subgridD.q[DIR_0PP] = subgridDistances + DIR_0PP * numberOfSubgridIndices;
+    subgridD.q[DIR_0MM] = subgridDistances + DIR_0MM * numberOfSubgridIndices;
+    subgridD.q[DIR_0PM] = subgridDistances + DIR_0PM * numberOfSubgridIndices;
+    subgridD.q[DIR_0MP] = subgridDistances + DIR_0MP * numberOfSubgridIndices;
+    subgridD.q[DIR_PPP] = subgridDistances + DIR_PPP * numberOfSubgridIndices;
+    subgridD.q[DIR_MMP] = subgridDistances + DIR_MMP * numberOfSubgridIndices;
+    subgridD.q[DIR_PMP] = subgridDistances + DIR_PMP * numberOfSubgridIndices;
+    subgridD.q[DIR_MPP] = subgridDistances + DIR_MPP * numberOfSubgridIndices;
+    subgridD.q[DIR_PPM] = subgridDistances + DIR_PPM * numberOfSubgridIndices;
+    subgridD.q[DIR_MMM] = subgridDistances + DIR_MMM * numberOfSubgridIndices;
+    subgridD.q[DIR_PMM] = subgridDistances + DIR_PMM * numberOfSubgridIndices;
+    subgridD.q[DIR_MPM] = subgridDistances + DIR_MPM * numberOfSubgridIndices;
+}
+
+__inline__ __device__ real getEquilibriumForBC(const real& drho, const real& velocity, const real& cu_sq, const real weight)
+{   // Evaluates weight * (drho + c9o2 * velocity^2 * (c1o1 + drho) - cu_sq); cu_sq is passed in by the caller, not computed here.
+    return weight * (drho + c9o2 * velocity * velocity * (c1o1 + drho) - cu_sq);
+}
+
+__inline__ __device__ real getInterpolatedDistributionForVeloBC(const real& q, const real& f, const real& fInverse, const real& feq,
+                                                                const real& omega, const real& velocity, const real weight)
+{
+    // Sum of two q-weighted terms; divides by (c1o1 - omega) and (c1o1 + q), so omega == c1o1 or q == -c1o1 is not guarded against here.
+    return (c1o1-q) / (c1o1+q) * (f - fInverse + (f + fInverse - c2o1 * feq * omega) / (c1o1 - omega)) * c1o2
+           + (q * (f + fInverse) - c6o1 * weight * velocity) / (c1o1 + q);
+}
+
+__inline__ __device__ real getBounceBackDistributionForVeloBC(  const real& f,
+                                                                const real& velocity, const real weight)
+{
+    // Returns f reduced by the velocity term c6o1 * weight * velocity; no q or omega enters this variant.
+    return f - (c6o1 * weight * velocity);
+}
+
+__inline__ __device__ real getInterpolatedDistributionForNoSlipBC(const real& q, const real& f, const real& fInverse, const real& feq,
+                                                                  const real& omega)
+{
+    // Same form as getInterpolatedDistributionForVeloBC but without a velocity term; divides by (c1o1 - omega) and (c1o1 + q) unguarded.
+    return (c1o1-q) / (c1o1+q) * (f - fInverse + (f + fInverse - c2o1 * feq * omega) / (c1o1 - omega)) * c1o2
+           + (q * (f + fInverse)) / (c1o1 + q);
+}
+
+
+__inline__ __device__ real getInterpolatedDistributionForVeloWithPressureBC(const real& q, const real& f, const real& fInverse, const real& feq,
+                                                                            const real& omega, const real& drho, const real& velocity, const real weight)
+{
+    // Same expression as getInterpolatedDistributionForVeloBC with an extra "- weight * drho"; divides by (c1o1 - omega) and (c1o1 + q) unguarded.
+    return (c1o1-q) / (c1o1+q) * (f - fInverse + (f + fInverse - c2o1 * feq * omega) / (c1o1 - omega)) * c1o2
+           + (q * (f + fInverse) - c6o1 * weight * velocity) / (c1o1 + q) - weight * drho;
+}
+
+__inline__ __device__ unsigned int getNodeIndex()
+{
+    // Flatten (threadIdx.x, blockIdx.x, blockIdx.y) into one linear index:
+    // threadIdx.x varies fastest, then blockIdx.x, then blockIdx.y.
+    const unsigned threadsPerBlock = blockDim.x;
+    const unsigned blocksPerRow = gridDim.x;
+
+    const unsigned linearBlock = blocksPerRow * blockIdx.y + blockIdx.x;
+
+    return threadsPerBlock * linearBlock + threadIdx.x;
+}
+
+__inline__ __device__ bool isValidFluidNode(uint nodeType)
+{
+    // A node is accepted when its type is GEO_FLUID or one of GEO_PM_0, GEO_PM_1, GEO_PM_2.
+    const bool isPorousMediaType = nodeType == GEO_PM_0 || nodeType == GEO_PM_1 || nodeType == GEO_PM_2;
+    return nodeType == GEO_FLUID || isPorousMediaType;
+}
+}
+
+#endif
diff --git a/src/gpu/VirtualFluids_GPU/LBM/GPUHelperFunctions/ScalingUtilities.h b/src/gpu/VirtualFluids_GPU/LBM/GPUHelperFunctions/ScalingUtilities.h
new file mode 100644
index 0000000000000000000000000000000000000000..53990e452be06dc6840c801816e8231d26861e2e
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/LBM/GPUHelperFunctions/ScalingUtilities.h
@@ -0,0 +1,136 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __         
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |        
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |        
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |        
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____    
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|   
+//      \    \  |    |   ________________________________________________________________    
+//       \    \ |    |  |  ______________________________________________________________|   
+//        \    \|    |  |  |         __          __     __     __     ______      _______    
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)   
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______    
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/   
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can 
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of 
+//  the License, or (at your option) any later version.
+//  
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT 
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
+//  for more details.
+//  
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file ScalingUtilities.h
+//! \ingroup LBM/GPUHelperFunctions
+//! \author Martin Schoenherr, Anna Wellmann
+//=======================================================================================
+#ifndef SCALING_HELPER_FUNCTIONS_H
+#define SCALING_HELPER_FUNCTIONS_H
+
+#include "LBM/LB.h" 
+#include "lbm/constants/D3Q27.h"
+#include "lbm/constants/NumericConstants.h"
+
+using namespace vf::lbm::constant;
+using namespace vf::lbm::dir;
+
+namespace vf::gpu
+{
+
+__device__ __inline__ void calculateMomentsOnSourceNodes(Distributions27 &dist, real &omega, unsigned int &k_000,
+                                                         unsigned int &k_M00, unsigned int &k_0M0, unsigned int &k_00M,
+                                                         unsigned int &k_MM0, unsigned int &k_M0M, unsigned int &k_0MM,
+                                                         unsigned int &k_MMM, real &drho, real &velocityX,
+                                                         real &velocityY, real &velocityZ, real &kxyFromfcNEQ,
+                                                         real &kyzFromfcNEQ, real &kxzFromfcNEQ, real &kxxMyyFromfcNEQ,
+                                                         real &kxxMzzFromfcNEQ)
+{
+    ////////////////////////////////////////////////////////////////////////////////////
+    //! - Read the 27 local distributions (f's) on the source node; each direction is read at one of the eight indices k_*.
+    //!   Results are written through the reference parameters drho, velocityX/Y/Z and the five k*FromfcNEQ moments.
+    real f_000 = (dist.f[DIR_000])[k_000];
+    real f_P00 = (dist.f[DIR_P00])[k_000];
+    real f_M00 = (dist.f[DIR_M00])[k_M00];
+    real f_0P0 = (dist.f[DIR_0P0])[k_000];
+    real f_0M0 = (dist.f[DIR_0M0])[k_0M0];
+    real f_00P = (dist.f[DIR_00P])[k_000];
+    real f_00M = (dist.f[DIR_00M])[k_00M];
+    real f_PP0 = (dist.f[DIR_PP0])[k_000];
+    real f_MM0 = (dist.f[DIR_MM0])[k_MM0];
+    real f_PM0 = (dist.f[DIR_PM0])[k_0M0];
+    real f_MP0 = (dist.f[DIR_MP0])[k_M00];
+    real f_P0P = (dist.f[DIR_P0P])[k_000];
+    real f_M0M = (dist.f[DIR_M0M])[k_M0M];
+    real f_P0M = (dist.f[DIR_P0M])[k_00M];
+    real f_M0P = (dist.f[DIR_M0P])[k_M00];
+    real f_0PP = (dist.f[DIR_0PP])[k_000];
+    real f_0MM = (dist.f[DIR_0MM])[k_0MM];
+    real f_0PM = (dist.f[DIR_0PM])[k_00M];
+    real f_0MP = (dist.f[DIR_0MP])[k_0M0];
+    real f_PPP = (dist.f[DIR_PPP])[k_000];
+    real f_MPP = (dist.f[DIR_MPP])[k_M00];
+    real f_PMP = (dist.f[DIR_PMP])[k_0M0];
+    real f_MMP = (dist.f[DIR_MMP])[k_MM0];
+    real f_PPM = (dist.f[DIR_PPM])[k_00M];
+    real f_MPM = (dist.f[DIR_MPM])[k_M0M];
+    real f_PMM = (dist.f[DIR_PMM])[k_0MM];
+    real f_MMM = (dist.f[DIR_MMM])[k_MMM];
+
+    ////////////////////////////////////////////////////////////////////////////////////
+    //! - Calculate density and velocity using pyramid summation for low round-off errors as in Eq. (J1)-(J3) \ref
+    //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015),
+    //! DOI:10.1016/j.camwa.2015.05.001 ]</b></a>
+    //!   The parenthesisation below fixes the summation order; do not regroup the terms.
+    drho = ((((f_PPP + f_MMM) + (f_MPM + f_PMP)) + ((f_MPP + f_PMM) + (f_MMP + f_PPM))) +
+            (((f_0MP + f_0PM) + (f_0MM + f_0PP)) + ((f_M0P + f_P0M) + (f_M0M + f_P0P)) +
+             ((f_MP0 + f_PM0) + (f_MM0 + f_PP0))) +
+            ((f_M00 + f_P00) + (f_0M0 + f_0P0) + (f_00M + f_00P))) +
+           f_000;
+
+    real oneOverRho = c1o1 / (c1o1 + drho); // reciprocal shared by the three velocity components below
+
+    velocityX = ((((f_PPP - f_MMM) + (f_PMP - f_MPM)) + ((f_PMM - f_MPP) + (f_PPM - f_MMP))) +
+                 (((f_P0M - f_M0P) + (f_P0P - f_M0M)) + ((f_PM0 - f_MP0) + (f_PP0 - f_MM0))) + (f_P00 - f_M00)) *
+                oneOverRho;
+    velocityY = ((((f_PPP - f_MMM) + (f_MPM - f_PMP)) + ((f_MPP - f_PMM) + (f_PPM - f_MMP))) +
+                 (((f_0PM - f_0MP) + (f_0PP - f_0MM)) + ((f_MP0 - f_PM0) + (f_PP0 - f_MM0))) + (f_0P0 - f_0M0)) *
+                oneOverRho;
+    velocityZ = ((((f_PPP - f_MMM) + (f_PMP - f_MPM)) + ((f_MPP - f_PMM) + (f_MMP - f_PPM))) +
+                 (((f_0MP - f_0PM) + (f_0PP - f_0MM)) + ((f_M0P - f_P0M) + (f_P0P - f_M0M))) + (f_00P - f_00M)) *
+                oneOverRho;
+
+    ////////////////////////////////////////////////////////////////////////////////////
+    //! - Calculate second order moments for interpolation
+    //!   Each is -c3o1 (off-diagonal) or -c3o2 (differences) * omega * (signed sum of f's / (c1o1 + drho) - velocity product).
+    // example: kxxMzz: moment, second derivative in x direction minus the second derivative in z direction
+    kxyFromfcNEQ = -c3o1 * omega *
+                   ((f_MM0 + f_MMM + f_MMP - f_MP0 - f_MPM - f_MPP - f_PM0 - f_PMM - f_PMP + f_PP0 + f_PPM + f_PPP) /
+                    (c1o1 + drho) -
+                    ((velocityX * velocityY)));
+    kyzFromfcNEQ = -c3o1 * omega *
+                   ((f_0MM + f_PMM + f_MMM - f_0MP - f_PMP - f_MMP - f_0PM - f_PPM - f_MPM + f_0PP + f_PPP + f_MPP) /
+                    (c1o1 + drho) -
+                    ((velocityY * velocityZ)));
+    kxzFromfcNEQ = -c3o1 * omega *
+                   ((f_M0M + f_MMM + f_MPM - f_M0P - f_MMP - f_MPP - f_P0M - f_PMM - f_PPM + f_P0P + f_PMP + f_PPP) /
+                    (c1o1 + drho) -
+                    ((velocityX * velocityZ)));
+    kxxMyyFromfcNEQ = -c3o2 * omega *
+                      ((f_M0M + f_M00 + f_M0P - f_0MM - f_0M0 - f_0MP - f_0PM - f_0P0 - f_0PP + f_P0M + f_P00 + f_P0P) /
+                       (c1o1 + drho) -
+                       ((velocityX * velocityX - velocityY * velocityY)));
+    kxxMzzFromfcNEQ = -c3o2 * omega *
+                      ((f_MM0 + f_M00 + f_MP0 - f_0MM - f_0MP - f_00M - f_00P - f_0PM - f_0PP + f_PM0 + f_P00 + f_PP0) /
+                       (c1o1 + drho) -
+                       ((velocityX * velocityX - velocityZ * velocityZ)));
+}
+
+} // namespace vf::gpu
+
+#endif
diff --git a/src/gpu/VirtualFluids_GPU/LBM/LB.h b/src/gpu/VirtualFluids_GPU/LBM/LB.h
index eea4adfda3c1ef0862f39ef58fc6e065af7bab1b..cfdbbbae040a13f94e97d40d702b93d5a1e19c86 100644
--- a/src/gpu/VirtualFluids_GPU/LBM/LB.h
+++ b/src/gpu/VirtualFluids_GPU/LBM/LB.h
@@ -15,9 +15,9 @@
 
 //////////////////////////
 //porous media
-#define GEO_PM_0		 5
-#define GEO_PM_1		 6
-#define GEO_PM_2		 7
+#define GEO_PM_0         5
+#define GEO_PM_1         6
+#define GEO_PM_2         7
 //////////////////////////
 
 #define GEO_SOLID       15
@@ -53,17 +53,33 @@
 //! \brief An enumeration for selecting a turbulence model
 enum class TurbulenceModel {
    //! - Smagorinsky
-    Smagorinsky,
+   Smagorinsky,
     //! - AMD (Anisotropic Minimum Dissipation) model, see e.g. Rozema et al., Phys. Fluids 27, 085107 (2015), https://doi.org/10.1063/1.4928700
-    AMD,
+   AMD,
     //! - QR model by Verstappen 
-    QR,
+   QR,
     //! - TODO: move the WALE model here from the old kernels
     //WALE
     //! - No turbulence model
-    None
+   None
 };
 
+//! \brief An enumeration for selecting a template of the collision kernel (CumulantK17)
+enum class CollisionTemplate {
+   //! - Default: plain collision without additional read/write
+   Default,
+   //! - WriteMacroVars: collision with write out of macroscopic variables
+   WriteMacroVars,
+   //! - ApplyBodyForce: collision with read and apply body force in the collision kernel
+   ApplyBodyForce,
+   //! - AllFeatures: collision with write out of macroscopic variables AND read and apply body force
+   AllFeatures,
+   //! - SubDomainBorder: collision on border nodes
+   SubDomainBorder
+};
+constexpr std::initializer_list<CollisionTemplate> all_CollisionTemplate  = { CollisionTemplate::Default, CollisionTemplate::WriteMacroVars, CollisionTemplate::ApplyBodyForce, CollisionTemplate::AllFeatures, CollisionTemplate::SubDomainBorder}; // every enumerator
+constexpr std::initializer_list<CollisionTemplate> bulk_CollisionTemplate = { CollisionTemplate::Default, CollisionTemplate::WriteMacroVars, CollisionTemplate::ApplyBodyForce, CollisionTemplate::AllFeatures}; // every enumerator except SubDomainBorder
+
 struct InitCondition
 {
    real Re;
@@ -144,6 +160,7 @@ struct InitCondition
    bool hasWallModelMonitor {false};
    bool simulatePorousMedia {false};
    bool streetVelocityFile {false};
+   real outflowPressureCorrectionFactor {0.0};
 };
 
 //Interface Cells
@@ -174,7 +191,7 @@ typedef struct OffFC{
 
 // Distribution functions g 6
 typedef struct  Distri6 {
-	real* g[6];
+   real* g[6];
 } Distributions6;
 
 // Distribution functions f 7
@@ -214,6 +231,21 @@ typedef struct QforBC{
    real *normalX, *normalY, *normalZ;
 }QforBoundaryConditions;
 
+typedef struct QforPrecursorBC{ // every member has a default initializer, so a default-constructed object holds no indeterminate values
+   int* k = nullptr;
+   int numberOfBCnodes=0;
+   int sizeQ=0;
+   int numberOfPrecursorNodes=0;
+   uint nPrecursorReads=0;
+   uint timeStepsBetweenReads=0;
+   size_t numberOfQuantities=0;
+   real* q27[27] = {}; // all 27 entries start as null pointers
+   uint* planeNeighbor0PP = nullptr, *planeNeighbor0PM = nullptr, *planeNeighbor0MP = nullptr, *planeNeighbor0MM = nullptr;
+   real* weights0PP = nullptr, *weights0PM = nullptr, *weights0MP = nullptr,  *weights0MM = nullptr;
+   real* last = nullptr, *current = nullptr, *next = nullptr; // NOTE(review): presumably three time levels of precursor data - confirm against the reader
+   real velocityX {0.0}, velocityY {0.0}, velocityZ {0.0};
+}QforPrecursorBoundaryConditions;
+
 //BCTemp
 typedef struct TempforBC{
    int* k;
@@ -249,57 +281,56 @@ typedef struct WMparas{
    real* Fz;
 }WallModelParameters;
 
+
 //measurePoints
 typedef struct MeasP{
-	std::string name;
-	uint k;
-	std::vector<real> Vx;
-	std::vector<real> Vy;
-	std::vector<real> Vz;
-	std::vector<real> Rho;
-	//real* Vx;
-	//real* Vy;
-	//real* Vz;
-	//real* Rho;
+   std::string name;
+   uint k;
+   std::vector<real> Vx;
+   std::vector<real> Vy;
+   std::vector<real> Vz;
+   std::vector<real> Rho;
+   //real* Vx;
+   //real* Vy;
+   //real* Vz;
+   //real* Rho;
 }MeasurePoints;
 
 //Process Neighbors
 typedef struct PN27{
-	real* f[27];
-	uint memsizeFs;
-	int* index;
-	uint memsizeIndex;
-	uint rankNeighbor;
-	int numberOfNodes;
-	int numberOfFs;
+   real* f[27];
+   uint memsizeFs;
+   int* index;
+   uint memsizeIndex;
+   uint rankNeighbor;
+   int numberOfNodes;
+   int numberOfFs;
 }ProcessNeighbor27;
 
 typedef struct PN_F3 {
-	real* g[6];
-	uint memsizeGs;
-	int* index;
-	uint memsizeIndex;
-	uint rankNeighbor;
-	int numberOfNodes;
-	int numberOfGs;
+   real* g[6];
+   uint memsizeGs;
+   int* index;
+   uint memsizeIndex;
+   uint rankNeighbor;
+   int numberOfNodes;
+   int numberOfGs;
 }ProcessNeighborF3;
 
 //path line particles
 typedef struct PLP{
-	bool *stuck, *hot;
-	real *coordXabsolut, *coordYabsolut, *coordZabsolut;
-	real *coordXlocal,   *coordYlocal,   *coordZlocal;
-	real *veloX,         *veloY,         *veloZ;
-	real *randomLocationInit;
-	uint *timestep;
-	uint *ID;
-	uint *cellBaseID;
-	uint numberOfParticles, numberOfTimestepsParticles;
-	uint memSizeID, memSizeTimestep, memSizerealAll, memSizereal, memSizeBool, memSizeBoolBC;
+   bool *stuck, *hot;
+   real *coordXabsolut, *coordYabsolut, *coordZabsolut;
+   real *coordXlocal,   *coordYlocal,   *coordZlocal;
+   real *veloX,         *veloY,         *veloZ;
+   real *randomLocationInit;
+   uint *timestep;
+   uint *ID;
+   uint *cellBaseID;
+   uint numberOfParticles, numberOfTimestepsParticles;
+   uint memSizeID, memSizeTimestep, memSizerealAll, memSizereal, memSizeBool, memSizeBoolBC;
 }PathLineParticles;
 
-
-
 //////////////////////////////////////////////////////////////////////////
 inline int vectorPosition(int i, int j, int k, int Lx, int Ly )
 {
@@ -308,7 +339,4 @@ inline int vectorPosition(int i, int j, int k, int Lx, int Ly )
 }
 //////////////////////////////////////////////////////////////////////////
 
-
 #endif
-
-
diff --git a/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp b/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp
index 703e935e6edb5676c7d6e045a38e3ec20d7a4b41..84ab84ff93fa7706bcc27d7e61a18f580f3c8dbe 100644
--- a/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp
+++ b/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp
@@ -97,11 +97,7 @@ void Simulation::init(GridProvider &gridProvider, BoundaryConditionFactory *bcFa
 
     gridProvider.allocAndCopyForcing();
     gridProvider.allocAndCopyQuadricLimiters();
-    if (para->getKernelNeedsFluidNodeIndicesToRun()) {
-        gridProvider.allocArrays_fluidNodeIndices();
-        gridProvider.allocArrays_fluidNodeIndicesBorder();
-    }
-
+        
     gridProvider.setDimensions();
     gridProvider.setBoundingBox();
 
@@ -113,12 +109,7 @@ void Simulation::init(GridProvider &gridProvider, BoundaryConditionFactory *bcFa
         para->setStartTurn((unsigned int)0); // 100000
 
     restart_object = std::make_shared<ASCIIRestartObject>();
-    //////////////////////////////////////////////////////////////////////////
-    // CUDA streams
-    if (para->getUseStreams()) {
-        para->getStreamManager()->launchStreams(2u);
-        para->getStreamManager()->createCudaEvents();
-    }
+
     //////////////////////////////////////////////////////////////////////////
     VF_LOG_INFO("LB_Modell:       D3Q{}", para->getD3Qxx());
     VF_LOG_INFO("Re:              {}", para->getRe());
@@ -134,14 +125,32 @@ void Simulation::init(GridProvider &gridProvider, BoundaryConditionFactory *bcFa
     //////////////////////////////////////////////////////////////////////////
     allocNeighborsOffsetsScalesAndBoundaries(gridProvider);
 
+    //! Get tagged fluid nodes with corresponding value for CollisionTemplate from interactors
     for (SPtr<PreCollisionInteractor> actuator : para->getActuators()) {
         actuator->init(para.get(), &gridProvider, cudaMemoryManager.get());
+        actuator->getTaggedFluidNodes( para.get(), &gridProvider );
     }
 
     for (SPtr<PreCollisionInteractor> probe : para->getProbes()) {
         probe->init(para.get(), &gridProvider, cudaMemoryManager.get());
+        probe->getTaggedFluidNodes( para.get(), &gridProvider );
     }
 
+    //////////////////////////////////////////////////////////////////////////
+    // CUDA streams
+    if (para->getUseStreams()) {
+        para->getStreamManager()->registerStream(CudaStreamIndex::SubDomainBorder);
+        para->getStreamManager()->registerStream(CudaStreamIndex::Bulk);
+        para->getStreamManager()->launchStreams();
+        para->getStreamManager()->createCudaEvents();
+    }
+    //////////////////////////////////////////////////////////////////////////
+    
+    if (para->getKernelNeedsFluidNodeIndicesToRun())
+    {
+        gridProvider.sortFluidNodeTags();
+        gridProvider.allocArrays_taggedFluidNodes();
+    }
     //////////////////////////////////////////////////////////////////////////
     // Kernel init
     //////////////////////////////////////////////////////////////////////////
diff --git a/src/gpu/VirtualFluids_GPU/Output/DistributionDebugInspector.cu b/src/gpu/VirtualFluids_GPU/Output/DistributionDebugInspector.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f7bb2e680c0fb3ea597239ee0cbc1772f2efe81b
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/Output/DistributionDebugInspector.cu
@@ -0,0 +1,179 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file DistributionDebugInspector.cu
+//! \ingroup Output
+//! \author Henrik Asmuth, Henry Korb
+//======================================================================================
+#include "DistributionDebugInspector.h"
+
+#include "Parameter/Parameter.h"
+#include "lbm/constants/D3Q27.h"
+#include "lbm/constants/NumericConstants.h"
+#include "LBM/GPUHelperFunctions/KernelUtilities.h"
+
+#include <cuda/CudaGrid.h>
+#include <cuda.h>
+
+#include <iostream>
+
+using namespace vf::lbm::constant;
+using namespace vf::lbm::dir;
+using namespace vf::gpu;
+
+__global__ void printFs(
+    real* distributions,                    //!< base pointer handed to getPointersToDistributions
+    bool isEvenTimestep,                    //!< forwarded to getPointersToDistributions
+    unsigned long long numberOfFluidNodes,  //!< upper bound for the node index; also forwarded to getPointersToDistributions
+    uint* neighborX,                        //!< neighbor index array used for indirect addressing (k_M00)
+    uint* neighborY,                        //!< neighbor index array used for indirect addressing (k_0M0)
+    uint* neighborZ,                        //!< neighbor index array used for indirect addressing (k_00M)
+    uint* typeOfGridNode,                   //!< node type per node; only GEO_FLUID nodes are printed
+    real* coordX,                           //!< x coordinate per node
+    real* coordY,                           //!< y coordinate per node
+    real* coordZ,                           //!< z coordinate per node
+    real minX,                              //!< inclusive lower x bound of the inspected box
+    real maxX,                              //!< inclusive upper x bound of the inspected box
+    real minY,                              //!< inclusive lower y bound of the inspected box
+    real maxY,                              //!< inclusive upper y bound of the inspected box
+    real minZ,                              //!< inclusive lower z bound of the inspected box
+    real maxZ)                              //!< inclusive upper z bound of the inspected box
+{
+    ////////////////////////////////////////////////////////////////////////////////
+    //! - Get node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
+    //!   Debug kernel: prints density, velocity and all 27 distributions of each fluid node inside the box.
+    const unsigned k_000 = getNodeIndex();
+    // Leave early for indices beyond the node count and for nodes that are not GEO_FLUID.
+    if (k_000 >= numberOfFluidNodes || typeOfGridNode[k_000]!=GEO_FLUID ) 
+        return;
+
+    real coordNodeX = coordX[k_000];
+    real coordNodeY = coordY[k_000];
+    real coordNodeZ = coordZ[k_000];
+
+    if( coordNodeX>=minX && coordNodeX<=maxX &&
+        coordNodeY>=minY && coordNodeY<=maxY &&
+        coordNodeZ>=minZ && coordNodeZ<=maxZ    )
+        {
+            Distributions27 dist;
+            getPointersToDistributions(dist, distributions, numberOfFluidNodes, isEvenTimestep);
+            ////////////////////////////////////////////////////////////////////////////////
+            //! - Set neighbor indices (necessary for indirect addressing)
+            uint k_M00 = neighborX[k_000];
+            uint k_0M0 = neighborY[k_000];
+            uint k_00M = neighborZ[k_000];
+            uint k_MM0 = neighborY[k_M00];
+            uint k_M0M = neighborZ[k_M00];
+            uint k_0MM = neighborZ[k_0M0];
+            uint k_MMM = neighborZ[k_MM0];
+            ////////////////////////////////////////////////////////////////////////////////////
+            //! - Set local distributions
+            //!   (directions with a negative component are read at the corresponding neighbor index)
+            real f_000 = (dist.f[DIR_000])[k_000];
+            real f_P00 = (dist.f[DIR_P00])[k_000];
+            real f_M00 = (dist.f[DIR_M00])[k_M00];
+            real f_0P0 = (dist.f[DIR_0P0])[k_000];
+            real f_0M0 = (dist.f[DIR_0M0])[k_0M0];
+            real f_00P = (dist.f[DIR_00P])[k_000];
+            real f_00M = (dist.f[DIR_00M])[k_00M];
+            real f_PP0 = (dist.f[DIR_PP0])[k_000];
+            real f_MM0 = (dist.f[DIR_MM0])[k_MM0];
+            real f_PM0 = (dist.f[DIR_PM0])[k_0M0];
+            real f_MP0 = (dist.f[DIR_MP0])[k_M00];
+            real f_P0P = (dist.f[DIR_P0P])[k_000];
+            real f_M0M = (dist.f[DIR_M0M])[k_M0M];
+            real f_P0M = (dist.f[DIR_P0M])[k_00M];
+            real f_M0P = (dist.f[DIR_M0P])[k_M00];
+            real f_0PP = (dist.f[DIR_0PP])[k_000];
+            real f_0MM = (dist.f[DIR_0MM])[k_0MM];
+            real f_0PM = (dist.f[DIR_0PM])[k_00M];
+            real f_0MP = (dist.f[DIR_0MP])[k_0M0];
+            real f_PPP = (dist.f[DIR_PPP])[k_000];
+            real f_MPP = (dist.f[DIR_MPP])[k_M00];
+            real f_PMP = (dist.f[DIR_PMP])[k_0M0];
+            real f_MMP = (dist.f[DIR_MMP])[k_MM0];
+            real f_PPM = (dist.f[DIR_PPM])[k_00M];
+            real f_MPM = (dist.f[DIR_MPM])[k_M0M];
+            real f_PMM = (dist.f[DIR_PMM])[k_0MM];
+            real f_MMM = (dist.f[DIR_MMM])[k_MMM];
+            // drho: sum over all 27 distributions of this node.
+            real drho = ((((f_PPP + f_MMM) + (f_MPM + f_PMP)) + ((f_MPP + f_PMM) + (f_MMP + f_PPM))) +
+                        (((f_0MP + f_0PM) + (f_0MM + f_0PP)) + ((f_M0P + f_P0M) + (f_M0M + f_P0P)) +
+                        ((f_MP0 + f_PM0) + (f_MM0 + f_PP0))) +
+                        ((f_M00 + f_P00) + (f_0M0 + f_0P0) + (f_00M + f_00P))) +
+                            f_000;
+
+            real oneOverRho = c1o1 / (c1o1 + drho);
+            // Velocity components: signed sums of the distributions per axis, scaled by 1 / (1 + drho).
+            real vvx = ((((f_PPP - f_MMM) + (f_PMP - f_MPM)) + ((f_PMM - f_MPP) + (f_PPM - f_MMP))) +
+                        (((f_P0M - f_M0P) + (f_P0P - f_M0M)) + ((f_PM0 - f_MP0) + (f_PP0 - f_MM0))) + (f_P00 - f_M00)) *
+                    oneOverRho;
+            real vvy = ((((f_PPP - f_MMM) + (f_MPM - f_PMP)) + ((f_MPP - f_PMM) + (f_PPM - f_MMP))) +
+                        (((f_0PM - f_0MP) + (f_0PP - f_0MM)) + ((f_MP0 - f_PM0) + (f_PP0 - f_MM0))) + (f_0P0 - f_0M0)) *
+                    oneOverRho;
+            real vvz = ((((f_PPP - f_MMM) + (f_PMP - f_MPM)) + ((f_MPP - f_PMM) + (f_MMP - f_PPM))) +
+                        (((f_0MP - f_0PM) + (f_0PP - f_0MM)) + ((f_M0P - f_P0M) + (f_P0P - f_M0M))) + (f_00P - f_00M)) *
+                    oneOverRho;
+            // Two printf calls per node: first index, coordinates, drho and velocity, then all 27 distributions.
+            printf("Node %u \t (%f\t%f\t%f)\n rho: %f\t velo: %f\t %f \t %f\n\n" , k_000, coordNodeX, coordNodeY, coordNodeZ, drho, vvx, vvy, vvz);
+            printf("Node %u \t (%f\t%f\t%f)\n f_M00\t%f\t f_000\t%f\t f_P00\t%f\n f_MP0\t%f\t f_0P0\t%f\t f_PP0\t%f\n f_MM0\t%f\t f_0M0\t%f\t f_PM0\t%f\n f_M0P\t%f\t f_00P\t%f\t f_P0P\t%f\n f_M0M\t%f\t f_00M\t%f\t f_P0M\t%f\n f_MPP\t%f\t f_0PP\t%f\t f_PPP\t%f\n f_MPM\t%f\t f_0PM\t%f\t f_PPM\t%f\n f_MMP\t%f\t f_0MP\t%f\t f_PMP\t%f\n f_MMM\t%f\t f_0MM\t%f\t f_PMM\t%f\n\n\n" , k_000, coordNodeX, coordNodeY, coordNodeZ, f_M00, f_000, f_P00,f_MP0, f_0P0, f_PP0, f_MM0, f_0M0, f_PM0, f_M0P, f_00P, f_P0P, f_M0M, f_00M, f_P0M, f_MPP, f_0PP, f_PPP, f_MPM, f_0PM, f_PPM, f_MMP, f_0MP, f_PMP, f_MMM, f_0MM, f_PMM);
+
+        }
+
+}
+
+
+
+
+void DistributionDebugInspector::inspect(std::shared_ptr<Parameter> para, uint level, uint t)
+{
+    // Only the configured level is inspected; a call for any other level returns without output.
+    if (level != inspectionLevel)
+        return;
+
+    std::cout << tag << ": distributions on level " << level << " at t " << t << std::endl;
+
+    // Look the level's parameter set up once; the launch grid is sized from its thread and node counts.
+    const auto &parD = para->getParD(level);
+    vf::cuda::CudaGrid launchGrid(parD->numberofthreads, parD->numberOfNodes);
+
+    printFs<<<launchGrid.grid, launchGrid.threads>>>(
+        parD->distributions.f[0],
+        parD->isEvenTimestep,
+        parD->numberOfNodes,
+        parD->neighborX,
+        parD->neighborY,
+        parD->neighborZ,
+        parD->typeOfGridNode,
+        parD->coordinateX,
+        parD->coordinateY,
+        parD->coordinateZ,
+        minX, maxX,
+        minY, maxY,
+        minZ, maxZ);
+}
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/Output/DistributionDebugInspector.h b/src/gpu/VirtualFluids_GPU/Output/DistributionDebugInspector.h
new file mode 100644
index 0000000000000000000000000000000000000000..95fea46d4eba0c2f2ff0846d22ee5da4f6c357ea
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/Output/DistributionDebugInspector.h
@@ -0,0 +1,76 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file DistributionDebugInspector.h
+//! \author Henrik Asmuth
+//! \date 13/012/2022
+//! \brief Basic debugging class to print out f's in a certain area of the domain
+//!
+//! Basic debugging class. Needs to be directly added in UpdateGrid (could potentially also be added as a proper Probe in the future)
+//! How to use: Define a part of the domain via min/max x, y, and z. The DistributionDebugInspector will print out all f's in that area.
+//!
+//=======================================================================================
+
+#ifndef DISTRIBUTION_INSPECTOR_H
+#define DISTRIBUTION_INSPECTOR_H
+
+#include "Parameter/Parameter.h"
+
+
+class DistributionDebugInspector
+{
+public:
+    //! \brief Stores the level to inspect, the min/max bounds of the inspected box and the tag printed before the output.
+    DistributionDebugInspector(uint _inspectionLevel, real _minX, real _maxX, real _minY, real _maxY, real _minZ, real _maxZ, std::string _tag)
+        : inspectionLevel(_inspectionLevel),
+          minX(_minX), maxX(_maxX),
+          minY(_minY), maxY(_maxY),
+          minZ(_minZ), maxZ(_maxZ),
+          tag(_tag)
+    {
+    }
+
+    ~DistributionDebugInspector() = default;
+
+    //! \brief Prints the distributions of all fluid nodes inside the box; does nothing unless level equals inspectionLevel.
+    //! \param para parameter object providing the per-level arrays, \param level grid level of the call, \param t value printed as time step
+    void inspect(std::shared_ptr<Parameter> para, uint level, uint t);
+
+private:
+    uint inspectionLevel; //!< the only level for which inspect() produces output
+    real minX;            //!< lower x bound of the inspected box
+    real maxX;            //!< upper x bound of the inspected box
+    real minY;            //!< lower y bound of the inspected box
+    real maxY;            //!< upper y bound of the inspected box
+    real minZ;            //!< lower z bound of the inspected box
+    real maxZ;            //!< upper z bound of the inspected box
+    std::string tag;      //!< prefix of the header line written by inspect()
+
+};
+
+#endif
diff --git a/src/gpu/VirtualFluids_GPU/Output/FileWriter.cpp b/src/gpu/VirtualFluids_GPU/Output/FileWriter.cpp
index c6e53ee3cbfb98f11e373ca014c7faf4e70a86f0..edf705421530bdbc9c2c9fd8c44eca6d3c5ab923 100644
--- a/src/gpu/VirtualFluids_GPU/Output/FileWriter.cpp
+++ b/src/gpu/VirtualFluids_GPU/Output/FileWriter.cpp
@@ -50,7 +50,7 @@ void FileWriter::writeTimestep(std::shared_ptr<Parameter> para, unsigned int tim
 
 void FileWriter::writeTimestep(std::shared_ptr<Parameter> para, unsigned int timestep, int level)
 {
-    const unsigned int numberOfParts = para->getParH(level)->numberOfNodes / para->getlimitOfNodesForVTK() + 1;
+    const unsigned int numberOfParts = (uint)para->getParH(level)->numberOfNodes / para->getlimitOfNodesForVTK() + 1;
     std::vector<std::string> fname;
     std::vector<std::string> fnameMed;
 
@@ -217,8 +217,8 @@ void FileWriter::writeUnstrucuredGridLT(std::shared_ptr<Parameter> para, int lev
 
     for (unsigned int part = 0; part < fname.size(); part++)
     {
-        if (((part + 1)*para->getlimitOfNodesForVTK()) > para->getParH(level)->numberOfNodes)
-            sizeOfNodes = para->getParH(level)->numberOfNodes - (part * para->getlimitOfNodesForVTK());
+        if (((part + 1)*para->getlimitOfNodesForVTK()) > (uint)para->getParH(level)->numberOfNodes)
+            sizeOfNodes = (uint)para->getParH(level)->numberOfNodes - (part * para->getlimitOfNodesForVTK());
         else
             sizeOfNodes = para->getlimitOfNodesForVTK();
 
@@ -340,8 +340,8 @@ void FileWriter::writeUnstrucuredGridLTConc(std::shared_ptr<Parameter> para, int
 
     for (unsigned int part = 0; part < fname.size(); part++)
     {
-        if (((part + 1)*para->getlimitOfNodesForVTK()) > para->getParH(level)->numberOfNodes)
-            sizeOfNodes = para->getParH(level)->numberOfNodes - (part * para->getlimitOfNodesForVTK());
+        if (((part + 1) * para->getlimitOfNodesForVTK()) > (uint)para->getParH(level)->numberOfNodes)
+            sizeOfNodes = (uint)para->getParH(level)->numberOfNodes - (part * para->getlimitOfNodesForVTK());
         else
             sizeOfNodes = para->getlimitOfNodesForVTK();
 
@@ -449,9 +449,9 @@ void FileWriter::writeUnstrucuredGridMedianLT(std::shared_ptr<Parameter> para, i
     {
         //printf("\n test in if I... \n");
         //////////////////////////////////////////////////////////////////////////
-        if (((part + 1)*para->getlimitOfNodesForVTK()) > para->getParH(level)->numberOfNodes)
+        if (((part + 1) * para->getlimitOfNodesForVTK()) > (uint)para->getParH(level)->numberOfNodes)
         {
-            sizeOfNodes = para->getParH(level)->numberOfNodes - (part * para->getlimitOfNodesForVTK());
+            sizeOfNodes = (uint)para->getParH(level)->numberOfNodes - (part * para->getlimitOfNodesForVTK());
         }
         else
         {
@@ -558,8 +558,8 @@ void FileWriter::writeUnstrucuredGridMedianLTConc(std::shared_ptr<Parameter> par
 
     for (unsigned int part = 0; part < fname.size(); part++)
     {
-        if (((part + 1)*para->getlimitOfNodesForVTK()) > para->getParH(level)->numberOfNodes)
-            sizeOfNodes = para->getParH(level)->numberOfNodes - (part * para->getlimitOfNodesForVTK());
+        if (((part + 1) * para->getlimitOfNodesForVTK()) > (uint)para->getParH(level)->numberOfNodes)
+            sizeOfNodes = (uint)para->getParH(level)->numberOfNodes - (part * para->getlimitOfNodesForVTK());
         else
             sizeOfNodes = para->getlimitOfNodesForVTK();
         //////////////////////////////////////////////////////////////////////////
diff --git a/src/gpu/VirtualFluids_GPU/Output/InterfaceDebugWriter.hpp b/src/gpu/VirtualFluids_GPU/Output/InterfaceDebugWriter.hpp
index 0b1e9dc1c25457457eabe3013a288c4c93577dc3..4d5895b323efa1b94a5780a59c882fd5ce1be7eb 100644
--- a/src/gpu/VirtualFluids_GPU/Output/InterfaceDebugWriter.hpp
+++ b/src/gpu/VirtualFluids_GPU/Output/InterfaceDebugWriter.hpp
@@ -290,10 +290,10 @@ void writeNeighborXPointsDebug(Parameter *para)
     nodesVec.resize(nodeNumberVec);
     int nodeCount2 = 0;
     for (int level = 0; level <= para->getMaxLevel(); level++) {
-        for (unsigned int u = 0; u < para->getParH(level)->numberOfNodes; u++) {
-            real x1 = para->getParH(level)->coordinateX[para->getParH(level)->neighborX[u]];
-            real x2 = para->getParH(level)->coordinateY[para->getParH(level)->neighborX[u]];
-            real x3 = para->getParH(level)->coordinateZ[para->getParH(level)->neighborX[u]];
+        for (size_t index = 0; index < para->getParH(level)->numberOfNodes; index++) {
+            real x1 = para->getParH(level)->coordinateX[para->getParH(level)->neighborX[index]];
+            real x2 = para->getParH(level)->coordinateY[para->getParH(level)->neighborX[index]];
+            real x3 = para->getParH(level)->coordinateZ[para->getParH(level)->neighborX[index]];
 
             nodesVec[nodeCount2++] = (makeUbTuple((float)(x1), (float)(x2), (float)(x3)));
         }
@@ -317,18 +317,18 @@ void writeNeighborXLinesDebug(Parameter *para)
     nodesVec.resize(nodeNumberVec * 2);
     int nodeCount = 0;
     for (int level = 0; level < para->getMaxLevel(); level++) {
-        for (unsigned int u = 0; u < para->getParH(level)->numberOfNodes; u++) {
-            real x1  = para->getParH(level)->coordinateX[u];
-            real x2  = para->getParH(level)->coordinateY[u];
-            real x3  = para->getParH(level)->coordinateZ[u];
-            real x1N = para->getParH(level)->coordinateX[para->getParH(level)->neighborX[u]];
-            real x2N = para->getParH(level)->coordinateY[para->getParH(level)->neighborX[u]];
-            real x3N = para->getParH(level)->coordinateZ[para->getParH(level)->neighborX[u]];
+        for (size_t index = 0; index < para->getParH(level)->numberOfNodes; index++) {
+            real x1  = para->getParH(level)->coordinateX[index];
+            real x2  = para->getParH(level)->coordinateY[index];
+            real x3  = para->getParH(level)->coordinateZ[index];
+            real x1N = para->getParH(level)->coordinateX[para->getParH(level)->neighborX[index]];
+            real x2N = para->getParH(level)->coordinateY[para->getParH(level)->neighborX[index]];
+            real x3N = para->getParH(level)->coordinateZ[para->getParH(level)->neighborX[index]];
 
             nodesVec[nodeCount++] = (makeUbTuple((float)(x1), (float)(x2), (float)(x3)));
             nodesVec[nodeCount++] = (makeUbTuple((float)(x1N), (float)(x2N), (float)(x3N)));
 
-            if (para->getParH(level)->typeOfGridNode[u] == GEO_FLUID) {
+            if (para->getParH(level)->typeOfGridNode[index] == GEO_FLUID) {
                 cellsVec.push_back(makeUbTuple(nodeCount - 2, nodeCount - 1));
             }
         }
@@ -350,10 +350,10 @@ void writeNeighborYPointsDebug(Parameter *para)
     nodesVec.resize(nodeNumberVec);
     int nodeCount2 = 0;
     for (int level = 0; level <= para->getMaxLevel(); level++) {
-        for (unsigned int u = 0; u < para->getParH(level)->numberOfNodes; u++) {
-            real x1 = para->getParH(level)->coordinateX[para->getParH(level)->neighborY[u]];
-            real x2 = para->getParH(level)->coordinateY[para->getParH(level)->neighborY[u]];
-            real x3 = para->getParH(level)->coordinateZ[para->getParH(level)->neighborY[u]];
+        for (size_t index = 0; index < para->getParH(level)->numberOfNodes; index++) {
+            real x1 = para->getParH(level)->coordinateX[para->getParH(level)->neighborY[index]];
+            real x2 = para->getParH(level)->coordinateY[para->getParH(level)->neighborY[index]];
+            real x3 = para->getParH(level)->coordinateZ[para->getParH(level)->neighborY[index]];
 
             nodesVec[nodeCount2++] = (makeUbTuple((float)(x1), (float)(x2), (float)(x3)));
         }
@@ -377,18 +377,18 @@ void writeNeighborYLinesDebug(Parameter *para)
     nodesVec.resize(nodeNumberVec * 2);
     int nodeCount = 0;
     for (int level = 0; level < para->getMaxLevel(); level++) {
-        for (unsigned int u = 0; u < para->getParH(level)->numberOfNodes; u++) {
-            real x1  = para->getParH(level)->coordinateX[u];
-            real x2  = para->getParH(level)->coordinateY[u];
-            real x3  = para->getParH(level)->coordinateZ[u];
-            real x1N = para->getParH(level)->coordinateX[para->getParH(level)->neighborY[u]];
-            real x2N = para->getParH(level)->coordinateY[para->getParH(level)->neighborY[u]];
-            real x3N = para->getParH(level)->coordinateZ[para->getParH(level)->neighborY[u]];
+        for (size_t index = 0; index < para->getParH(level)->numberOfNodes; index++) {
+            real x1  = para->getParH(level)->coordinateX[index];
+            real x2  = para->getParH(level)->coordinateY[index];
+            real x3  = para->getParH(level)->coordinateZ[index];
+            real x1N = para->getParH(level)->coordinateX[para->getParH(level)->neighborY[index]];
+            real x2N = para->getParH(level)->coordinateY[para->getParH(level)->neighborY[index]];
+            real x3N = para->getParH(level)->coordinateZ[para->getParH(level)->neighborY[index]];
 
             nodesVec[nodeCount++] = (makeUbTuple((float)(x1), (float)(x2), (float)(x3)));
             nodesVec[nodeCount++] = (makeUbTuple((float)(x1N), (float)(x2N), (float)(x3N)));
 
-            if (para->getParH(level)->typeOfGridNode[u] == GEO_FLUID) {
+            if (para->getParH(level)->typeOfGridNode[index] == GEO_FLUID) {
                 cellsVec.push_back(makeUbTuple(nodeCount - 2, nodeCount - 1));
             }
         }
@@ -410,10 +410,10 @@ void writeNeighborZPointsDebug(Parameter *para)
     nodesVec.resize(nodeNumberVec);
     int nodeCount2 = 0;
     for (int level = 0; level <= para->getMaxLevel(); level++) {
-        for (unsigned int u = 0; u < para->getParH(level)->numberOfNodes; u++) {
-            real x1 = para->getParH(level)->coordinateX[para->getParH(level)->neighborZ[u]];
-            real x2 = para->getParH(level)->coordinateY[para->getParH(level)->neighborZ[u]];
-            real x3 = para->getParH(level)->coordinateZ[para->getParH(level)->neighborZ[u]];
+        for (size_t index = 0; index < para->getParH(level)->numberOfNodes; index++) {
+            real x1 = para->getParH(level)->coordinateX[para->getParH(level)->neighborZ[index]];
+            real x2 = para->getParH(level)->coordinateY[para->getParH(level)->neighborZ[index]];
+            real x3 = para->getParH(level)->coordinateZ[para->getParH(level)->neighborZ[index]];
 
             nodesVec[nodeCount2++] = (makeUbTuple((float)(x1), (float)(x2), (float)(x3)));
         }
@@ -437,18 +437,18 @@ void writeNeighborZLinesDebug(Parameter *para)
     nodesVec.resize(nodeNumberVec * 2);
     int nodeCount = 0;
     for (int level = 0; level < para->getMaxLevel(); level++) {
-        for (unsigned int u = 0; u < para->getParH(level)->numberOfNodes; u++) {
-            real x1  = para->getParH(level)->coordinateX[u];
-            real x2  = para->getParH(level)->coordinateY[u];
-            real x3  = para->getParH(level)->coordinateZ[u];
-            real x1N = para->getParH(level)->coordinateX[para->getParH(level)->neighborZ[u]];
-            real x2N = para->getParH(level)->coordinateY[para->getParH(level)->neighborZ[u]];
-            real x3N = para->getParH(level)->coordinateZ[para->getParH(level)->neighborZ[u]];
+        for (size_t index = 0; index < para->getParH(level)->numberOfNodes; index++) {
+            real x1  = para->getParH(level)->coordinateX[index];
+            real x2  = para->getParH(level)->coordinateY[index];
+            real x3  = para->getParH(level)->coordinateZ[index];
+            real x1N = para->getParH(level)->coordinateX[para->getParH(level)->neighborZ[index]];
+            real x2N = para->getParH(level)->coordinateY[para->getParH(level)->neighborZ[index]];
+            real x3N = para->getParH(level)->coordinateZ[para->getParH(level)->neighborZ[index]];
 
             nodesVec[nodeCount++] = (makeUbTuple((float)(x1), (float)(x2), (float)(x3)));
             nodesVec[nodeCount++] = (makeUbTuple((float)(x1N), (float)(x2N), (float)(x3N)));
 
-            if (para->getParH(level)->typeOfGridNode[u] == GEO_FLUID) {
+            if (para->getParH(level)->typeOfGridNode[index] == GEO_FLUID) {
                 cellsVec.push_back(makeUbTuple(nodeCount - 2, nodeCount - 1));
             }
         }
diff --git a/src/gpu/VirtualFluids_GPU/Output/NeighborDebugWriter.hpp b/src/gpu/VirtualFluids_GPU/Output/NeighborDebugWriter.hpp
index 83f0a677b0012153cf079b466a333acc58bda6be..57139d25ae4d046e1dd1be1f3ef5e179daf0872e 100644
--- a/src/gpu/VirtualFluids_GPU/Output/NeighborDebugWriter.hpp
+++ b/src/gpu/VirtualFluids_GPU/Output/NeighborDebugWriter.hpp
@@ -5,53 +5,57 @@
 #include "Logger.h"
 #include "Parameter/Parameter.h"
 #include "basics/utilities/UbSystem.h"
-#include "grid/NodeValues.h"
+#include "gpu/GridGenerator/grid/NodeValues.h"
 #include "lbm/constants/D3Q27.h"
 #include <basics/writer/WbWriterVtkXmlBinary.h>
 
-#include "Utilities/FindNeighbors.h"
-#include "VirtualFluids_GPU/Communication/Communicator.h"
 #include "Core/StringUtilities/StringUtil.h"
+#include "Utilities/FindNeighbors.h"
+#include "gpu/VirtualFluids_GPU/Communication/Communicator.h"
 
 namespace NeighborDebugWriter
 {
 
-inline void writeNeighborLinkLines(Parameter *para, const int level, const uint numberOfNodes, const int direction,
-                                   const std::string &name)
+inline void writeNeighborLinkLines(LBMSimulationParameter *parH, int direction, const std::string &name,
+                                   WbWriter *writer) // writer receives the collected nodes and lines via writeLines(name, ...)
 {
     VF_LOG_INFO("Write node links in direction {}.", direction);
-    std::vector<UbTupleFloat3> nodes(numberOfNodes * 2);
-    std::vector<UbTupleInt2> cells(numberOfNodes);
 
-    for (uint position = 0; position < numberOfNodes; position++) {
-        if (para->getParH(level)->typeOfGridNode[position] != GEO_FLUID)
+    const unsigned long long numberOfNodes = parH->numberOfNodes;
+    std::vector<UbTupleFloat3> nodes;
+    nodes.reserve(numberOfNodes); // NOTE(review): up to 2 * numberOfNodes entries are appended if every node is fluid
+    std::vector<UbTupleInt2> cells;
+    cells.reserve(numberOfNodes/2); // NOTE(review): up to numberOfNodes entries are appended if every node is fluid
+    // Each GEO_FLUID node appends its own position and its neighbor's position, joined by one line cell.
+    for (size_t position = 0; position < numberOfNodes; position++) {
+        if (parH->typeOfGridNode[position] != GEO_FLUID)
             continue;
 
-        const double x1 = para->getParH(level)->coordinateX[position];
-        const double x2 = para->getParH(level)->coordinateY[position];
-        const double x3 = para->getParH(level)->coordinateZ[position];
+        const double x1 = parH->coordinateX[position];
+        const double x2 = parH->coordinateY[position];
+        const double x3 = parH->coordinateZ[position];
 
-        const uint positionNeighbor = getNeighborIndex(para->getParH(level).get(), position, direction);
+        const uint positionNeighbor = getNeighborIndex(parH, (uint)position, direction); // position is narrowed to uint for the lookup
 
-        const double x1Neighbor = para->getParH(level)->coordinateX[positionNeighbor];
-        const double x2Neighbor = para->getParH(level)->coordinateY[positionNeighbor];
-        const double x3Neighbor = para->getParH(level)->coordinateZ[positionNeighbor];
+        const double x1Neighbor = parH->coordinateX[positionNeighbor];
+        const double x2Neighbor = parH->coordinateY[positionNeighbor];
+        const double x3Neighbor = parH->coordinateZ[positionNeighbor];
 
         nodes.emplace_back(float(x1), float(x2), float(x3));
         nodes.emplace_back(float(x1Neighbor), float(x2Neighbor), float(x3Neighbor));
 
         cells.emplace_back((int)nodes.size() - 2, (int)nodes.size() - 1);
     }
-    WbWriterVtkXmlBinary::getInstance()->writeLines(name, nodes, cells);
+    writer->writeLines(name, nodes, cells); // the writer is passed in by the caller instead of being fixed to WbWriterVtkXmlBinary
 }
 
 inline void writeNeighborLinkLinesDebug(Parameter *para)
 {
     for (int level = 0; level <= para->getMaxLevel(); level++) {
-        for (int direction = vf::lbm::dir::STARTDIR; direction <= vf::lbm::dir::ENDDIR; direction++) {
+        for (size_t direction = vf::lbm::dir::STARTDIR; direction <= vf::lbm::dir::ENDDIR; direction++) { // one output file per level and direction
             const std::string fileName = para->getFName() + "_" + StringUtil::toString<int>(level) + "_Link_" +
                                          std::to_string(direction) + "_Debug.vtk";
-            writeNeighborLinkLines(para, level, para->getParH(level)->numberOfNodes, direction, fileName);
+            writeNeighborLinkLines(para->getParH(level).get(), (int)direction, fileName, WbWriterVtkXmlBinary::getInstance()); // debug output always uses the binary VTK XML writer
         }
     }
 }
diff --git a/src/gpu/VirtualFluids_GPU/Output/NeighborDebugWriterTest.cpp b/src/gpu/VirtualFluids_GPU/Output/NeighborDebugWriterTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a19ed3d723f28998f5d27cd15ebf4bab8ba061c4
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/Output/NeighborDebugWriterTest.cpp
@@ -0,0 +1,79 @@
+#include <gmock/gmock.h>
+#include "NeighborDebugWriter.hpp"
+#include "gpu/VirtualFluids_GPU/Utilities/testUtilitiesGPU.h"
+
+//! Test double for WbWriter: writeLines() stores copies of its arguments instead of writing a file.
+class WbWriterSpy : public WbWriter
+{
+public:
+    std::vector<UbTupleFloat3> nodes; //!< copy of the nodes passed to the most recent writeLines() call
+    std::vector<UbTupleInt2> lines;   //!< copy of the lines passed to the most recent writeLines() call
+    std::string getFileExtension() override { return std::string(); }
+    std::string writeLines(const std::string & /*filename*/, std::vector<UbTupleFloat3> &nodes,
+                           std::vector<UbTupleInt2> &lines) override
+    {
+        this->lines = lines;
+        this->nodes = nodes;
+        return std::string();
+    }
+};
+
+class NeighborDebugWriterTest : public testing::Test
+{
+protected:
+    //! Builds three fluid nodes whose neighbor is always node 2; nodes 0 and 1 sit at coordinate 1, node 2 at 3.
+    void SetUp() override
+    {
+        coordinates.assign(numberOfNodes, 1.0);
+        coordinates[2] = 3.0;
+        neighbors.assign(numberOfNodes, 2);
+        typeOfGridNode.assign(numberOfNodes, GEO_FLUID);
+        parH->typeOfGridNode = typeOfGridNode.data();
+        parH->neighborX = neighbors.data();
+        parH->coordinateZ = coordinates.data(); // all three axes share the same coordinate array
+        parH->coordinateY = coordinates.data();
+        parH->coordinateX = coordinates.data();
+        parH->numberOfNodes = numberOfNodes;
+    }
+
+    const int level = 0;
+    const unsigned long long numberOfNodes = 3;
+    const uint direction = vf::lbm::dir::DIR_P00; // x
+    std::unique_ptr<LBMSimulationParameter> parH = std::make_unique<LBMSimulationParameter>();
+    WbWriterSpy writerSpy;
+    std::vector<uint> typeOfGridNode; // backing storage for parH->typeOfGridNode
+    std::vector<uint> neighbors;      // backing storage for parH->neighborX
+    std::vector<real> coordinates;    // backing storage for parH->coordinateX/Y/Z
+};
+
+TEST_F(NeighborDebugWriterTest, writeNeighborLinkLines_onlyFLuidNodes_writesAllNodes)
+{
+    // Every node contributes its own position followed by the position of its neighbor (node 2, at 3.0).
+    UbTupleFloat3 atOne(1.0, 1.0, 1.0);
+    UbTupleFloat3 atThree(3.0, 3.0, 3.0);
+    const std::vector<UbTupleFloat3> wantedNodes{ atOne, atThree, atOne, atThree, atThree, atThree };
+    const std::vector<UbTupleInt2> wantedLines{ UbTupleInt2(0, 1), UbTupleInt2(2, 3), UbTupleInt2(4, 5) };
+
+    NeighborDebugWriter::writeNeighborLinkLines(parH.get(), direction, "name", &writerSpy);
+    EXPECT_THAT(writerSpy.nodes.size(), testing::Eq(numberOfNodes * 2));
+    EXPECT_THAT(writerSpy.lines.size(), testing::Eq(numberOfNodes));
+    EXPECT_THAT(writerSpy.nodes, testing::Eq(wantedNodes));
+    EXPECT_THAT(writerSpy.lines, testing::Eq(wantedLines));
+}
+
+TEST_F(NeighborDebugWriterTest, writeNeighborLinkLines_fluidAndSolidNodes_writesOnlyFluidNodes)
+{
+    // Node 2 is marked solid, so only nodes 0 and 1 may produce a link line.
+    typeOfGridNode[2] = GEO_SOLID;
+
+    UbTupleFloat3 atOne(1.0, 1.0, 1.0);
+    UbTupleFloat3 atThree(3.0, 3.0, 3.0);
+    const std::vector<UbTupleFloat3> wantedNodes{ atOne, atThree, atOne, atThree };
+    const std::vector<UbTupleInt2> wantedLines{ UbTupleInt2(0, 1), UbTupleInt2(2, 3) };
+
+    NeighborDebugWriter::writeNeighborLinkLines(parH.get(), direction, "name", &writerSpy);
+    EXPECT_THAT(writerSpy.nodes.size(), testing::Eq((numberOfNodes-1) * 2));
+    EXPECT_THAT(writerSpy.lines.size(), testing::Eq(numberOfNodes-1));
+    EXPECT_THAT(writerSpy.nodes, testing::Eq(wantedNodes));
+    EXPECT_THAT(writerSpy.lines, testing::Eq(wantedLines));
+}
diff --git a/src/gpu/VirtualFluids_GPU/Output/PosWriter.hpp b/src/gpu/VirtualFluids_GPU/Output/PosWriter.hpp
index 456f9c148c75c27fb899f976ba4f99b109fc3d4b..ce611d25d1aa3f9e98840a0f04d9b2045d0a224f 100644
--- a/src/gpu/VirtualFluids_GPU/Output/PosWriter.hpp
+++ b/src/gpu/VirtualFluids_GPU/Output/PosWriter.hpp
@@ -33,9 +33,9 @@ public:
 			{
 				out.writeInteger(para->getParH(level)->numberOfNodes);
 				out.writeLine();
-				for(unsigned int u=0; u<para->getParH(level)->numberOfNodes; u++)
+				for(size_t index = 0; index < para->getParH(level)->numberOfNodes; index++)
 				{
-					out.writeInteger(para->getParH(level)->typeOfGridNode[u]);
+					out.writeInteger(para->getParH(level)->typeOfGridNode[index]);
 				}
 				out.writeLine();
 			} //end levelloop
@@ -46,9 +46,9 @@ public:
 			{
 				out.writeInteger(para->getParH(level)->numberOfNodes);
 				out.writeLine();
-				for(unsigned int u=0; u<para->getParH(level)->numberOfNodes; u++)
+                for (size_t index = 0; index < para->getParH(level)->numberOfNodes; index++)
 				{
-					out.writeInteger(para->getParH(level)->neighborX[u]);
+					out.writeInteger(para->getParH(level)->neighborX[index]);
 				}
 				out.writeLine();
 			} //end levelloop
@@ -59,9 +59,9 @@ public:
 			{
 				out.writeInteger(para->getParH(level)->numberOfNodes);
 				out.writeLine();
-				for(unsigned int u=0; u<para->getParH(level)->numberOfNodes; u++)
+                for (size_t index = 0; index < para->getParH(level)->numberOfNodes; index++)
 				{
-					out.writeInteger(para->getParH(level)->neighborY[u]);
+					out.writeInteger(para->getParH(level)->neighborY[index]);
 				}
 				out.writeLine();
 			} //end levelloop
@@ -72,9 +72,9 @@ public:
 			{
 				out.writeInteger(para->getParH(level)->numberOfNodes);
 				out.writeLine();
-				for(unsigned int u=0; u<para->getParH(level)->numberOfNodes; u++)
+                for (size_t index = 0; index < para->getParH(level)->numberOfNodes; index++)
 				{
-					out.writeInteger(para->getParH(level)->neighborZ[u]);
+					out.writeInteger(para->getParH(level)->neighborZ[index]);
 				}
 				out.writeLine();
 			} //end levelloop
diff --git a/src/gpu/VirtualFluids_GPU/Output/QDebugVtkWriter.hpp b/src/gpu/VirtualFluids_GPU/Output/QDebugVtkWriter.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d567c695a0e33b7a88c2c8cf3bcb88093ce5b802
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/Output/QDebugVtkWriter.hpp
@@ -0,0 +1,96 @@
+#ifndef QVTKWRITER_HPP
+#define QVTKWRITER_HPP
+
+#include <array>
+#include <vector>
+
+#include "basics/Core/StringUtilities/StringUtil.h"
+#include "basics/utilities/UbSystem.h"
+#include "basics/writer/WbWriterVtkXmlBinary.h"
+#include "lbm/constants/D3Q27.h"
+#include "logger/Logger.h"
+
+#include "gpu/GridGenerator/grid/NodeValues.h"
+#include "gpu/VirtualFluids_GPU/Communication/Communicator.h"
+#include "gpu/VirtualFluids_GPU/LBM/LB.h"
+#include "gpu/VirtualFluids_GPU/Parameter/Parameter.h"
+#include "gpu/VirtualFluids_GPU/Utilities/FindNeighbors.h"
+
+namespace QDebugVtkWriter
+{
+
+using namespace vf::lbm::dir;
+
+namespace
+{
+inline void modifyLineLengthsForQs(const std::array<double, 3> &coords, std::array<double, 3> &neighborCoords, real q)
+{
+    // Rescales the link coords -> neighborCoords in place so that its length becomes q times the original length.
+    // coords stays the fixed start point; only neighborCoords is overwritten.
+    const bool keepFullLink = (q == 1.0) || (q <= 0.0);
+    if (keepFullLink)
+        return; // q == 1 would not change anything, non-positive q is left untouched
+
+    for (size_t axis = 0; axis < coords.size(); axis++) {
+        const auto offset = neighborCoords[axis] - coords[axis];
+        neighborCoords[axis] = coords[axis] + q * offset;
+    }
+}
+
+// Collects one line per boundary node and direction with q > 0 (from the node towards its neighbor, scaled by q).
+inline void writeQLines(LBMSimulationParameter *parH, QforBoundaryConditions &boundaryQ, const std::string &filepath,
+                        WbWriter *writer)
+{
+    VF_LOG_INFO("Write qs in for boundary condition to {}.", filepath);
+
+    const auto numberOfNodes = boundaryQ.numberOfBCnodes;
+    std::vector<UbTupleFloat3> nodes;
+    nodes.reserve(numberOfNodes * 8 * 2);
+    std::vector<UbTupleInt2> lines;
+    lines.reserve(numberOfNodes * 8);
+    // Per-line data handed to the writer: lineData[0] belongs to "nodeIndex", lineData[1] to "q".
+    std::vector<std::string> dataNames = { "nodeIndex", "q" };
+    std::vector<std::vector<float>> lineData(2);
+
+    for (size_t i = 0; i < numberOfNodes; i++) {
+        const auto nodeIndex = boundaryQ.k[i];
+        const std::array<double, 3> coords = { parH->coordinateX[nodeIndex], parH->coordinateY[nodeIndex],
+                                               parH->coordinateZ[nodeIndex] };
+
+        for (size_t direction = 1; direction < ENDDIR; direction++) {
+            // NOTE(review): stops before ENDDIR although q27 is indexed up to and including ENDDIR elsewhere - confirm.
+            const auto q = boundaryQ.q27[direction][i];
+            if (q <= (real)0.0)
+                continue; // no line is written for a non-positive q
+
+            const auto positionNeighbor = getNeighborIndex(parH, (uint)nodeIndex, (int)direction);
+
+            std::array<double, 3> neighborCoords = { parH->coordinateX[positionNeighbor],
+                                                     parH->coordinateY[positionNeighbor],
+                                                     parH->coordinateZ[positionNeighbor] };
+            // Let modifyLineLengthsForQs adjust the end point of the link according to q.
+            modifyLineLengthsForQs(coords, neighborCoords, q);
+            // All components are cast to float explicitly (coords[2] used to be narrowed implicitly from double).
+            nodes.emplace_back(float(coords[0]), float(coords[1]), float(coords[2]));
+            nodes.emplace_back(float(neighborCoords[0]), float(neighborCoords[1]), float(neighborCoords[2]));
+            // The two nodes appended last are the end points of the new line.
+            lines.emplace_back((int)nodes.size() - 2, (int)nodes.size() - 1);
+            lineData[0].push_back(nodeIndex);
+            lineData[1].push_back(q);
+        }
+    }
+
+    writer->writeLinesWithLineData(filepath, nodes, lines, dataNames, lineData);
+}
+} // namespace
+
+inline void writeQLinesDebug(Parameter *para, QforBoundaryConditions &boundaryQ, uint level, const std::string& fileName)
+{
+    // Target file is "<para->getFName()>_<fileName>.vtk"; the node data comes from para->getParH(level).
+    const auto outputPath = para->getFName() + "_" + fileName + ".vtk";
+    writeQLines(para->getParH(level).get(), boundaryQ, outputPath, WbWriterVtkXmlBinary::getInstance());
+}
+
+} // namespace QDebugVtkWriter
+
+#endif
diff --git a/src/gpu/VirtualFluids_GPU/Output/QDebugVtkWriterTest.cpp b/src/gpu/VirtualFluids_GPU/Output/QDebugVtkWriterTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9eecb25c663fcfc8fde353b76ccf20cbcb9cf272
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/Output/QDebugVtkWriterTest.cpp
@@ -0,0 +1,60 @@
+#include "gmock/gmock.h"
+#include <cmath>
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "QDebugVtkWriter.hpp"
+#include <tuple>
+
+MATCHER(DoubleNear5, "") {
+    return std::abs(std::get<0>(arg) - std::get<1>(arg)) < 0.00001; // std::abs: plain abs may bind to ::abs(int)
+}
+
+using namespace QDebugVtkWriter;
+
+double calcVectorLength(const std::array<double, 3> coords, const std::array<double, 3> neighborCoords)
+{
+    const auto squaredDiff = [&](size_t axis) { return std::pow(neighborCoords[axis] - coords[axis], 2); };
+    return std::sqrt(squaredDiff(0) + squaredDiff(1) + squaredDiff(2)); // Euclidean distance between both points
+}
+
+TEST(QDebugVtkWriterTest, modifyLineLengthsForQsSameCoords3)
+{
+    const std::array<double, 3> coords = { 0, 0, 0 };
+    std::array<double, 3> neighborCoords = { 1, 1, 1 };
+    const real q = 0.3;
+    const real initialLength = calcVectorLength(coords, neighborCoords);
+    // Act: neighborCoords is modified in place; coords is the origin, so the expected result is q * (1, 1, 1).
+    modifyLineLengthsForQs(coords, neighborCoords, q);
+    // Assert: every component matches within the DoubleNear5 tolerance and the length is q times the initial one.
+    std::array<double, 3> expectedNeighborCoords = { 0.3, 0.3, 0.3 };
+    EXPECT_THAT(neighborCoords,testing::Pointwise(DoubleNear5(), expectedNeighborCoords));
+    EXPECT_THAT(calcVectorLength(coords, neighborCoords), testing::DoubleNear(q*initialLength, 0.00001));
+}
+
+TEST(QDebugVtkWriterTest, modifyLineLengthDifferentCoords)
+{
+    const std::array<double, 3> coords = { 0, 0, 0 };
+    std::array<double, 3> neighborCoords = { 1, 2, 3 };
+    const real q = 0.3;
+    const real initialLength = calcVectorLength(coords, neighborCoords);
+    // Act: neighborCoords is modified in place; coords is the origin, so the expected result is q * (1, 2, 3).
+    modifyLineLengthsForQs(coords, neighborCoords, q);
+    // Assert: every component matches within the DoubleNear5 tolerance and the length is q times the initial one.
+    std::array<double, 3> expectedNeighborCoords = { 0.3, 0.6, 0.9 };
+    EXPECT_THAT(neighborCoords,testing::Pointwise(DoubleNear5(), expectedNeighborCoords));
+    EXPECT_THAT(calcVectorLength(coords, neighborCoords), testing::DoubleNear(q*initialLength, 0.00001));
+}
+
+TEST(QDebugVtkWriterTest, modifyLineLengthNegativeCoord)
+{
+    const std::array<double, 3> coords = { 0, 0, 0 };
+    std::array<double, 3> neighborCoords = { 1, 2, -3 };
+    const real q = 0.3;
+    const real initialLength = calcVectorLength(coords, neighborCoords);
+    // Act: neighborCoords is modified in place; the negative z component must keep its sign after scaling.
+    modifyLineLengthsForQs(coords, neighborCoords, q);
+    // Assert: every component matches within the DoubleNear5 tolerance and the length is q times the initial one.
+    std::array<double, 3> expectedNeighborCoords = { 0.3, 0.6, -0.9 };
+    EXPECT_THAT(neighborCoords,testing::Pointwise(DoubleNear5(), expectedNeighborCoords));
+    EXPECT_THAT(calcVectorLength(coords, neighborCoords), testing::DoubleNear(q*initialLength, 0.00001));
+}
diff --git a/src/gpu/VirtualFluids_GPU/Output/QDebugWriter.hpp b/src/gpu/VirtualFluids_GPU/Output/QDebugWriter.hpp
index d006636572377477aeb3599a8ae843ea2b1e31ff..c1a3658d318eb47e84530bf437afa0bb6ba91743 100644
--- a/src/gpu/VirtualFluids_GPU/Output/QDebugWriter.hpp
+++ b/src/gpu/VirtualFluids_GPU/Output/QDebugWriter.hpp
@@ -13,8 +13,6 @@
 #include <basics/writer/WbWriterVtkXmlBinary.h>
 #include "Core/StringUtilities/StringUtil.h"
 
-//using namespace std;
-
 namespace QDebugWriter
 {
     void writeQValues(QforBoundaryConditions &Q, int* k, int kq, const std::string &name)
diff --git a/src/gpu/VirtualFluids_GPU/Output/UnstructuredGridWriter.hpp b/src/gpu/VirtualFluids_GPU/Output/UnstructuredGridWriter.hpp
index 81f2c028a6bbc7cd9c077571349f4f0465a08a05..f26b4e5795466a72aa1894de37bdb066b9ab9d04 100644
--- a/src/gpu/VirtualFluids_GPU/Output/UnstructuredGridWriter.hpp
+++ b/src/gpu/VirtualFluids_GPU/Output/UnstructuredGridWriter.hpp
@@ -33,7 +33,7 @@ namespace UnstructuredGridWriter
 
 		bool neighborsFluid;
 
-		unsigned int allnodes = para->getParH(level)->numberOfNodes * 8;
+		unsigned long long allnodes = para->getParH(level)->numberOfNodes * 8;
 
 		nodes.resize(allnodes);
 		nodedata[0].resize(allnodes);
@@ -45,7 +45,7 @@ namespace UnstructuredGridWriter
 		unsigned int nodeCount = 0;
 		double nodeDeltaLevel = para->getParH(level)->dx;
 
-		for (unsigned int pos=0;pos<para->getParH(level)->numberOfNodes;pos++)
+		for (size_t pos = 0; pos < para->getParH(level)->numberOfNodes; pos++)
 		{
 			if (para->getParH(level)->typeOfGridNode[pos] == GEO_FLUID /*!= GEO_VOID*/)
 			{
@@ -197,9 +197,9 @@ namespace UnstructuredGridWriter
 			vxmax = 0;
 			//printf("\n test in if I... \n");
 			//////////////////////////////////////////////////////////////////////////
-			if ( ((part+1)*para->getlimitOfNodesForVTK()) > para->getParH(level)->numberOfNodes)
+			if ( ((part+1)*para->getlimitOfNodesForVTK()) > (uint)para->getParH(level)->numberOfNodes)
 			{
-				sizeOfNodes = para->getParH(level)->numberOfNodes - (part * para->getlimitOfNodesForVTK());
+                sizeOfNodes = (uint)para->getParH(level)->numberOfNodes - (part * para->getlimitOfNodesForVTK());
 			}
 			else
 			{
@@ -340,9 +340,9 @@ namespace UnstructuredGridWriter
 			vxmax = 0;
 			//printf("\n test in if I... \n");
 			//////////////////////////////////////////////////////////////////////////
-			if (((part + 1)*para->getlimitOfNodesForVTK()) > para->getParH(level)->numberOfNodes)
+            if (((part + 1) * para->getlimitOfNodesForVTK()) > (uint)para->getParH(level)->numberOfNodes)
 			{
-				sizeOfNodes = para->getParH(level)->numberOfNodes - (part * para->getlimitOfNodesForVTK());
+                sizeOfNodes = (uint)para->getParH(level)->numberOfNodes - (part * para->getlimitOfNodesForVTK());
 			}
 			else
 			{
@@ -479,9 +479,9 @@ namespace UnstructuredGridWriter
 			vxmax = 0;
 			//printf("\n test in if I... \n");
 			//////////////////////////////////////////////////////////////////////////
-			if (((part + 1)*para->getlimitOfNodesForVTK()) > para->getParH(level)->numberOfNodes)
+            if (((part + 1) * para->getlimitOfNodesForVTK()) > (uint)para->getParH(level)->numberOfNodes)
 			{
-				sizeOfNodes = para->getParH(level)->numberOfNodes - (part * para->getlimitOfNodesForVTK());
+                sizeOfNodes = (uint)para->getParH(level)->numberOfNodes - (part * para->getlimitOfNodesForVTK());
 			}
 			else
 			{
@@ -628,9 +628,9 @@ namespace UnstructuredGridWriter
 			vxmax = 0;
 			//printf("\n test in if I... \n");
 			//////////////////////////////////////////////////////////////////////////
-			if (((part + 1)*para->getlimitOfNodesForVTK()) > para->getParH(level)->numberOfNodes)
+            if (((part + 1) * para->getlimitOfNodesForVTK()) > (uint)para->getParH(level)->numberOfNodes)
 			{
-				sizeOfNodes = para->getParH(level)->numberOfNodes - (part * para->getlimitOfNodesForVTK());
+                sizeOfNodes = (uint)para->getParH(level)->numberOfNodes - (part * para->getlimitOfNodesForVTK());
 			}
 			else
 			{
@@ -771,9 +771,9 @@ namespace UnstructuredGridWriter
 			vxmax = 0;
 			//printf("\n test in if I... \n");
 			//////////////////////////////////////////////////////////////////////////
-			if ( ((part+1)*para->getlimitOfNodesForVTK()) > para->getParH(level)->numberOfNodes)
+            if (((part + 1) * para->getlimitOfNodesForVTK()) > (uint)para->getParH(level)->numberOfNodes)
 			{
-				sizeOfNodes = para->getParH(level)->numberOfNodes - (part * para->getlimitOfNodesForVTK());
+                sizeOfNodes = (uint)para->getParH(level)->numberOfNodes - (part * para->getlimitOfNodesForVTK());
 			}
 			else
 			{
@@ -896,10 +896,10 @@ namespace UnstructuredGridWriter
 		vector< vector< double > > nodedata(nodedatanames.size());
 
 		//printf("\n test for if... \n");
-		if (para->getParH(level)->numberOfNodes > limitOfNodes)
+        if ((uint)para->getParH(level)->numberOfNodes > limitOfNodes)
 		{
 			//printf("\n test in if I... \n");
-			unsigned int restOfNodes = para->getParH(level)->numberOfNodes - limitOfNodes;
+            unsigned int restOfNodes = (uint)para->getParH(level)->numberOfNodes - limitOfNodes;
 			//////////////////////////////////////////////////////////////////////////
 			//PART I
 			nodes.resize(limitOfNodes);
@@ -984,7 +984,7 @@ namespace UnstructuredGridWriter
 			nodedata[5].resize(restOfNodes);
 			//printf("\n test in if IV... \n");
 
-			for (unsigned int pos=limitOfNodes;pos<para->getParH(level)->numberOfNodes;pos++)
+			for (size_t pos = limitOfNodes; pos < para->getParH(level)->numberOfNodes; pos++)
 			{
 				if (para->getParH(level)->typeOfGridNode[pos] == GEO_FLUID)
 				{
@@ -1055,7 +1055,7 @@ namespace UnstructuredGridWriter
 			nodedata[5].resize(para->getParH(level)->numberOfNodes);
 
 			//printf("\n test in else II... \n");
-			for (unsigned int pos=0;pos<para->getParH(level)->numberOfNodes;pos++)
+			for (size_t pos = 0; pos < para->getParH(level)->numberOfNodes; pos++)
 			{
 				if (para->getParH(level)->typeOfGridNode[pos] == GEO_FLUID)
 				{
@@ -1148,7 +1148,7 @@ namespace UnstructuredGridWriter
 		unsigned int number1,number2,number3,number4,number5,number6,number7,number8;
 		bool neighborsFluid;
 		double vxmax = 0;
-		vector< vector< double > > nodedata(nodedatanames.size());
+		vector<vector<double>> nodedata(nodedatanames.size());
 
 		nodes.resize(para->getParH(level)->numberOfNodes);
 		nodedata[0].resize(para->getParH(level)->numberOfNodes);
@@ -1158,7 +1158,7 @@ namespace UnstructuredGridWriter
 		nodedata[4].resize(para->getParH(level)->numberOfNodes);
 		nodedata[5].resize(para->getParH(level)->numberOfNodes);
 
-		for (unsigned int pos=0;pos<para->getParH(level)->numberOfNodes;pos++)
+		for (size_t pos = 0; pos < para->getParH(level)->numberOfNodes; pos++)
 		{
 			if (para->getParH(level)->typeOfGridNode[pos] == GEO_FLUID)
 			{
@@ -1244,7 +1244,7 @@ namespace UnstructuredGridWriter
 		nodedata[4].resize(para->getParH(level)->numberOfNodes);
 		nodedata[5].resize(para->getParH(level)->numberOfNodes);
 
-		for (unsigned int pos=0;pos<para->getParH(level)->numberOfNodes;pos++)
+		for (size_t pos = 0; pos < para->getParH(level)->numberOfNodes; pos++)
 		{
 			if (para->getParH(level)->typeOfGridNode[pos] == GEO_FLUID)
 			{
@@ -1342,9 +1342,9 @@ namespace UnstructuredGridWriter
 			vxmax = 0;
 			//printf("\n test in if I... \n");
 			//////////////////////////////////////////////////////////////////////////
-			if ( ((part+1)*para->getlimitOfNodesForVTK()) > para->getParH(level)->numberOfNodes)
+			if ( ((part+1)*para->getlimitOfNodesForVTK()) > (uint)para->getParH(level)->numberOfNodes)
 			{
-				sizeOfNodes = para->getParH(level)->numberOfNodes - (part * para->getlimitOfNodesForVTK());
+                sizeOfNodes = (uint)para->getParH(level)->numberOfNodes - (part * para->getlimitOfNodesForVTK());
 			}
 			else
 			{
@@ -1364,7 +1364,7 @@ namespace UnstructuredGridWriter
 			nodedata[5].resize(sizeOfNodes);
 			//////////////////////////////////////////////////////////////////////////
 			//printf("\n test in if II... \n");
-			for (unsigned int pos=startpos;pos<endpos;pos++)
+			for (size_t pos = startpos; pos < endpos; pos++)
 			{
 				if (para->getParH(level)->typeOfGridNode[pos] == GEO_FLUID)
 				{
@@ -1465,9 +1465,9 @@ namespace UnstructuredGridWriter
 			vxmax = 0;
 			//printf("\n test in if I... \n");
 			//////////////////////////////////////////////////////////////////////////
-			if (((part + 1)*para->getlimitOfNodesForVTK()) > para->getParH(level)->numberOfNodes)
+            if (((part + 1) * para->getlimitOfNodesForVTK()) > (uint)para->getParH(level)->numberOfNodes)
 			{
-				sizeOfNodes = para->getParH(level)->numberOfNodes - (part * para->getlimitOfNodesForVTK());
+                sizeOfNodes = (uint)para->getParH(level)->numberOfNodes - (part * para->getlimitOfNodesForVTK());
 			}
 			else
 			{
@@ -1595,9 +1595,9 @@ namespace UnstructuredGridWriter
 			vxmax = 0;
 			//printf("\n test in if I... \n");
 			//////////////////////////////////////////////////////////////////////////
-			if (((part + 1)*para->getlimitOfNodesForVTK()) > para->getParH(level)->numberOfNodes)
+            if (((part + 1) * para->getlimitOfNodesForVTK()) > (uint)para->getParH(level)->numberOfNodes)
 			{
-				sizeOfNodes = para->getParH(level)->numberOfNodes - (part * para->getlimitOfNodesForVTK());
+                sizeOfNodes = (uint)para->getParH(level)->numberOfNodes - (part * para->getlimitOfNodesForVTK());
 			}
 			else
 			{
@@ -1728,7 +1728,7 @@ namespace UnstructuredGridWriter
 		nodedatanames.push_back("geo");
 		unsigned int number1,number2,number3,number4,number5,number6,number7,number8;
 		bool neighborsFluid;
-		vector< vector< double > > nodedata(nodedatanames.size());
+		vector< vector<double>> nodedata(nodedatanames.size());
 
 		nodes.resize(para->getParH(level)->numberOfNodes);
 		nodedata[0].resize(para->getParH(level)->numberOfNodes);
@@ -1738,7 +1738,7 @@ namespace UnstructuredGridWriter
 		nodedata[4].resize(para->getParH(level)->numberOfNodes);
 		nodedata[5].resize(para->getParH(level)->numberOfNodes);
 
-		for (unsigned int pos=0;pos<para->getParH(level)->numberOfNodes;pos++)
+		for (size_t pos = 0; pos < para->getParH(level)->numberOfNodes; pos++)
 		{
 			if (para->getParH(level)->typeOfGridNode[pos] == GEO_FLUID)
 			{
@@ -1825,7 +1825,7 @@ namespace UnstructuredGridWriter
 		nodedata[4].resize(para->getParH(level)->numberOfNodes);
 		nodedata[5].resize(para->getParH(level)->numberOfNodes);
 
-		for (unsigned int pos=0;pos<para->getParH(level)->numberOfNodes;pos++)
+		for (size_t pos = 0; pos < para->getParH(level)->numberOfNodes; pos++)
 		{
 			if (para->getParH(level)->typeOfGridNode[pos] == GEO_FLUID)
 			{
@@ -1975,9 +1975,9 @@ namespace UnstructuredGridWriter
 			vxmax = 0;
 			//printf("\n test in if I... \n");
 			//////////////////////////////////////////////////////////////////////////
-			if ( ((part+1)*para->getlimitOfNodesForVTK()) > para->getParH(level)->numberOfNodes)
+            if (((part + 1) * para->getlimitOfNodesForVTK()) > (uint)para->getParH(level)->numberOfNodes)
 			{
-				sizeOfNodes = para->getParH(level)->numberOfNodes - (part * para->getlimitOfNodesForVTK());
+                sizeOfNodes = (uint)para->getParH(level)->numberOfNodes - (part * para->getlimitOfNodesForVTK());
 			}
 			else
 			{
@@ -2080,9 +2080,9 @@ namespace UnstructuredGridWriter
 			vxmax = 0;
 			//printf("\n test in if I... \n");
 			//////////////////////////////////////////////////////////////////////////
-			if ( ((part+1)*para->getlimitOfNodesForVTK()) > para->getParH(level)->numberOfNodes)
+            if (((part + 1) * para->getlimitOfNodesForVTK()) > (uint)para->getParH(level)->numberOfNodes)
 			{
-				sizeOfNodes = para->getParH(level)->numberOfNodes - (part * para->getlimitOfNodesForVTK());
+                sizeOfNodes = (uint)para->getParH(level)->numberOfNodes - (part * para->getlimitOfNodesForVTK());
 			}
 			else
 			{
@@ -2192,9 +2192,9 @@ namespace UnstructuredGridWriter
 			vxmax = 0;
 			//printf("\n test in if I... \n");
 			//////////////////////////////////////////////////////////////////////////
-			if ( ((part+1)*para->getlimitOfNodesForVTK()) > para->getParH(level)->numberOfNodes)
+            if (((part + 1) * para->getlimitOfNodesForVTK()) > (uint)para->getParH(level)->numberOfNodes)
 			{
-				sizeOfNodes = para->getParH(level)->numberOfNodes - (part * para->getlimitOfNodesForVTK());
+                sizeOfNodes = (uint)para->getParH(level)->numberOfNodes - (part * para->getlimitOfNodesForVTK());
 			}
 			else
 			{
@@ -2319,7 +2319,7 @@ namespace UnstructuredGridWriter
 			wallX3 = 0.0;
 			q      = 0.0;
 			//////////////////////////////////////////////////////////////////////////
-			for (unsigned int typeOfQ = STARTDIR; typeOfQ <= ENDDIR; typeOfQ++)
+            for (size_t typeOfQ = vf::lbm::dir::STARTDIR; typeOfQ <= vf::lbm::dir::ENDDIR; typeOfQ++)
 			{
 				QQ = para->getParH(level)->geometryBC.q27[0];
 				Q.q27[typeOfQ] = &QQ[typeOfQ*sizeOfNodes];
@@ -2423,7 +2423,7 @@ namespace UnstructuredGridWriter
 			wallX3 = 0.0;
 			q      = 0.0;
 			//////////////////////////////////////////////////////////////////////////
-			for (unsigned int typeOfQ = STARTDIR; typeOfQ <= ENDDIR; typeOfQ++)
+            for (size_t typeOfQ = vf::lbm::dir::STARTDIR; typeOfQ <= vf::lbm::dir::ENDDIR; typeOfQ++)
 			{
 				QQ = para->getParH(level)->velocityBC.q27[0];
 				Q.q27[typeOfQ] = &QQ[typeOfQ*sizeOfNodes];
@@ -2528,7 +2528,7 @@ namespace UnstructuredGridWriter
 			wallX3 = 0.0;
 			q      = 0.0;
 			//////////////////////////////////////////////////////////////////////////
-			for (unsigned int typeOfQ = STARTDIR; typeOfQ <= ENDDIR; typeOfQ++)
+            for (size_t typeOfQ = vf::lbm::dir::STARTDIR; typeOfQ <= vf::lbm::dir::ENDDIR; typeOfQ++)
 			{
 				QQ = para->getParH(level)->pressureBC.q27[0];
 				Q.q27[typeOfQ] = &QQ[typeOfQ*sizeOfNodes];
diff --git a/src/gpu/VirtualFluids_GPU/Parameter/CudaStreamManager.cpp b/src/gpu/VirtualFluids_GPU/Parameter/CudaStreamManager.cpp
index 3731836f336d91c1bc4cc5f1a8f5ea0a10bee0a6..3cc771e413134e90b0d09d8eeb6dfee791f8a1e2 100644
--- a/src/gpu/VirtualFluids_GPU/Parameter/CudaStreamManager.cpp
+++ b/src/gpu/VirtualFluids_GPU/Parameter/CudaStreamManager.cpp
@@ -31,25 +31,39 @@
 #include <helper_cuda.h>
 #include <iostream>
 
-void CudaStreamManager::launchStreams(uint numberOfStreams)
+void CudaStreamManager::registerStream(CudaStreamIndex streamIndex)
+{
+    if (streamIndex == CudaStreamIndex::Legacy) return; // Legacy is never stored in cudaStreams
+    cudaStreams.emplace(streamIndex, nullptr);           // null placeholder until launchStreams() creates the stream
+}
+void CudaStreamManager::launchStreams()
 {
-    cudaStreams.resize(numberOfStreams);
-    for (cudaStream_t &stream : cudaStreams)
-        cudaStreamCreate(&stream);
+    for (auto &stream : cudaStreams)
+        cudaStreamCreate(&stream.second);
 }
 
 void CudaStreamManager::terminateStreams()
 {
-    for (cudaStream_t &stream : cudaStreams)
-        cudaStreamDestroy(stream);
+    for (auto &stream : cudaStreams)
+        cudaStreamDestroy(stream.second);
 }
 
-cudaStream_t &CudaStreamManager::getStream(uint streamIndex)
-{ return cudaStreams[streamIndex]; }
-
-int CudaStreamManager::getBorderStreamIndex() { return borderStreamIndex; }
+cudaStream_t &CudaStreamManager::getStream(CudaStreamIndex streamIndex, uint multiStreamIndex)
+{
+    // Returns the multiStreamIndex-th stream registered under streamIndex. Legacy, unregistered and out-of-range
+    // requests yield legacyStream, so the lookup never walks into another key's streams or dereferences end().
+    if (streamIndex == CudaStreamIndex::Legacy) return legacyStream;
+    const auto range = cudaStreams.equal_range(streamIndex);
+    auto it = range.first;
+    for (uint idx = 0; idx < multiStreamIndex && it != range.second; idx++) ++it;
+    if (it == range.second) return legacyStream;
+    return it->second;
+}
 
-int CudaStreamManager::getBulkStreamIndex() { return bulkStreamIndex; }
+bool CudaStreamManager::streamIsRegistered(CudaStreamIndex streamIndex)
+{
+    return cudaStreams.find(streamIndex) != cudaStreams.end(); // true if at least one entry exists for this key
+}
 
 void CudaStreamManager::createCudaEvents()
 {
@@ -61,12 +75,12 @@ void CudaStreamManager::destroyCudaEvents()
     checkCudaErrors(cudaEventDestroy(startBulkKernel)); 
 }
 
-void CudaStreamManager::triggerStartBulkKernel(int streamIndex)
+void CudaStreamManager::triggerStartBulkKernel(CudaStreamIndex streamIndex, uint multiStreamIndex)
 {
-    checkCudaErrors(cudaEventRecord(startBulkKernel, cudaStreams[streamIndex]));
+    checkCudaErrors(cudaEventRecord(startBulkKernel, getStream(streamIndex, multiStreamIndex)));
 }
 
-void CudaStreamManager::waitOnStartBulkKernelEvent(int streamIndex)
+void CudaStreamManager::waitOnStartBulkKernelEvent(CudaStreamIndex streamIndex, uint multiStreamIndex)
 {
-    checkCudaErrors(cudaStreamWaitEvent(cudaStreams[streamIndex], startBulkKernel));
+    checkCudaErrors(cudaStreamWaitEvent(getStream(streamIndex, multiStreamIndex), startBulkKernel));
 }
diff --git a/src/gpu/VirtualFluids_GPU/Parameter/CudaStreamManager.h b/src/gpu/VirtualFluids_GPU/Parameter/CudaStreamManager.h
index c2d515ab5fe9c24388632a7ca9e1e4c78b7f1467..5c59bcd3a5e6178d6e70a63f803caf8e29f32604 100644
--- a/src/gpu/VirtualFluids_GPU/Parameter/CudaStreamManager.h
+++ b/src/gpu/VirtualFluids_GPU/Parameter/CudaStreamManager.h
@@ -30,32 +30,40 @@
 #ifndef STREAM_MANAGER_H
 #define STREAM_MANAGER_H
 
-#include <vector>
-#include "Core/DataTypes.h"
-
+#include <map>
+#include <cuda.h>
 #include <cuda_runtime.h>
+#include "Core/DataTypes.h"
 
+enum class CudaStreamIndex // keys under which CudaStreamManager registers and looks up CUDA streams
+    {
+        Legacy, // never stored in the manager's stream map; requests resolve to its legacyStream (CU_STREAM_LEGACY)
+        Bulk,
+        SubDomainBorder,
+        Precursor,
+        ActuatorFarm
+    };
 class CudaStreamManager
-{
+{   
 private:
-    std::vector<cudaStream_t> cudaStreams;
+    std::multimap<CudaStreamIndex, cudaStream_t> cudaStreams;
     cudaEvent_t startBulkKernel = NULL;
-    const int borderStreamIndex       = 1;
-    const int bulkStreamIndex         = 0;
+    cudaStream_t legacyStream = CU_STREAM_LEGACY;
+
 
 public:
-    void launchStreams(uint numberOfStreams);
+    void registerStream(CudaStreamIndex streamIndex);
+    void launchStreams();
     void terminateStreams();
-    cudaStream_t &getStream(uint streamIndex);
-
-    int getBorderStreamIndex();
-    int getBulkStreamIndex();
+    cudaStream_t &getStream(CudaStreamIndex streamIndex, uint multiStreamIndex=0);
 
+    bool streamIsRegistered(CudaStreamIndex streamIndex);
     // Events
     void createCudaEvents();
     void destroyCudaEvents();
-    void triggerStartBulkKernel(int streamIndex);
-    void waitOnStartBulkKernelEvent(int strteamIndex);
+
+    void triggerStartBulkKernel(CudaStreamIndex streamIndex, uint multiStreamIndex=0);
+    void waitOnStartBulkKernelEvent(CudaStreamIndex streamIndex, uint multiStreamIndex=0);
 };
 
 #endif
diff --git a/src/gpu/VirtualFluids_GPU/Parameter/Parameter.cpp b/src/gpu/VirtualFluids_GPU/Parameter/Parameter.cpp
index 7687ec926270f23e57608ca5f3084bd26d4de20e..e593d16d6ed1f69ca65a22606a157e7ea9e6b111 100644
--- a/src/gpu/VirtualFluids_GPU/Parameter/Parameter.cpp
+++ b/src/gpu/VirtualFluids_GPU/Parameter/Parameter.cpp
@@ -43,6 +43,7 @@
 
 #include <basics/config/ConfigurationFile.h>
 
+#include "Logger.h"
 #include "Parameter/CudaStreamManager.h"
 
 Parameter::Parameter() : Parameter(1, 0, {}) {}
@@ -65,6 +66,8 @@ Parameter::Parameter(int numberOfProcesses, int myId, std::optional<const vf::ba
     initGridPaths();
     initGridBasePoints();
     initDefaultLBMkernelAllLevels();
+
+    this->cudaStreamManager = std::make_unique<CudaStreamManager>();
 }
 
 Parameter::~Parameter() = default;
@@ -500,10 +503,10 @@ void Parameter::initLBMSimulationParameter()
         parH[i]->sizePlaneXY      = parH[i]->nx * parH[i]->ny;
         parH[i]->sizePlaneYZ      = parH[i]->ny * parH[i]->nz;
         parH[i]->sizePlaneXZ      = parH[i]->nx * parH[i]->nz;
-        parH[i]->mem_size_real    = sizeof(real) * parH[i]->size_Mat;
-        parH[i]->mem_size_int     = sizeof(unsigned int) * parH[i]->size_Mat;
-        parH[i]->mem_size_bool    = sizeof(bool) * parH[i]->size_Mat;
-        parH[i]->mem_size_real_yz = sizeof(real) * parH[i]->ny * parH[i]->nz;
+//        parH[i]->mem_size_real    = sizeof(real) * parH[i]->size_Mat;         //DEPRECATED: related to full matrix
+//        parH[i]->mem_size_int     = sizeof(unsigned int) * parH[i]->size_Mat; //DEPRECATED: related to full matrix
+//        parH[i]->mem_size_bool    = sizeof(bool) * parH[i]->size_Mat;         //DEPRECATED: related to full matrix
+//        parH[i]->mem_size_real_yz = sizeof(real) * parH[i]->ny * parH[i]->nz; //DEPRECATED: related to full matrix
         parH[i]->isEvenTimestep        = true;
         parH[i]->startz           = parH[i]->gridNZ * ic.myProcessId;
         parH[i]->endz             = parH[i]->gridNZ * ic.myProcessId + parH[i]->gridNZ;
@@ -568,10 +571,10 @@ void Parameter::initLBMSimulationParameter()
         parD[i]->sizePlaneXY      = parH[i]->sizePlaneXY;
         parD[i]->sizePlaneYZ      = parH[i]->sizePlaneYZ;
         parD[i]->sizePlaneXZ      = parH[i]->sizePlaneXZ;
-        parD[i]->mem_size_real    = sizeof(real) * parD[i]->size_Mat;
-        parD[i]->mem_size_int     = sizeof(unsigned int) * parD[i]->size_Mat;
-        parD[i]->mem_size_bool    = sizeof(bool) * parD[i]->size_Mat;
-        parD[i]->mem_size_real_yz = sizeof(real) * parD[i]->ny * parD[i]->nz;
+        //parD[i]->mem_size_real    = sizeof(real) * parD[i]->size_Mat;          //DEPRECATED: related to full matrix
+        //parD[i]->mem_size_int     = sizeof(unsigned int) * parD[i]->size_Mat;  //DEPRECATED: related to full matrix
+        //parD[i]->mem_size_bool    = sizeof(bool) * parD[i]->size_Mat;          //DEPRECATED: related to full matrix
+        //parD[i]->mem_size_real_yz = sizeof(real) * parD[i]->ny * parD[i]->nz;  //DEPRECATED: related to full matrix
         parD[i]->isEvenTimestep        = parH[i]->isEvenTimestep;
         parD[i]->startz           = parH[i]->startz;
         parD[i]->endz             = parH[i]->endz;
@@ -586,6 +589,30 @@ void Parameter::initLBMSimulationParameter()
         parD[i]->distY            = parH[i]->distY;
         parD[i]->distZ            = parH[i]->distZ;
     }
+
+    checkParameterValidityCumulantK17();
+}
+
+void Parameter::checkParameterValidityCumulantK17() const
+{
+    if (this->mainKernel != "CumulantK17")
+        return; // the recommendations below are only checked for the CumulantK17 kernel
+    // Viscosity of level maxlevel (LB units) against the recommended upper bound of 1/42; only a warning is logged.
+    const real viscosityLimit = 1.0 / 42.0;
+    const real viscosity = this->parH[maxlevel]->vis;
+    if (viscosityLimit < viscosity) {
+        VF_LOG_WARNING("The viscosity (in LB units) at level {} is {:1.3g}. It is recommended to keep it smaller than {:1.3g} "
+                       "for the CumulantK17 collision kernel.",
+                       maxlevel, viscosity, viscosityLimit);
+    }
+    // Velocity ic.u0 (LB units) against the recommended upper bound of 0.1; again only a warning is logged.
+    const real velocityLimit = 0.1;
+    const real velocity = this->ic.u0;
+    if (velocityLimit < velocity) {
+        VF_LOG_WARNING("The velocity (in LB units) is {:1.4g}. It is recommended to keep it smaller than {:1.4g} for the "
+                       "CumulantK17 collision kernel.",
+                       velocity, velocityLimit);
+    }
+}
 
 void Parameter::copyMeasurePointsArrayToVector(int lev)
@@ -829,7 +856,7 @@ real Parameter::getLengthRatio()
 }
 real Parameter::getForceRatio()
 {
-    return this->getDensityRatio() * this->getVelocityRatio()/this->getTimeRatio();
+    return (this->getDensityRatio()+1.0) * this->getVelocityRatio()/this->getTimeRatio();
 }
 real Parameter::getScaledViscosityRatio(int level)
 {
@@ -859,6 +886,10 @@ real Parameter::getScaledForceRatio(int level)
 {
     return this->getForceRatio()*(level+1);
 }
+real Parameter::getScaledStressRatio(int level)
+{
+    return this->getVelocityRatio()*this->getVelocityRatio(); // squared velocity ratio; `level` is not used here
+}
 void Parameter::setRealX(real RealX)
 {
     ic.RealX = RealX;
@@ -883,6 +914,10 @@ void Parameter::setPressOutZ(unsigned int PressOutZ)
 {
     ic.PressOutZ = PressOutZ;
 }
+void Parameter::setOutflowPressureCorrectionFactor(real pressBCrhoCorrectionFactor)
+{
+    ic.outflowPressureCorrectionFactor = pressBCrhoCorrectionFactor; // stored as given, no validation is done here
+}
 void Parameter::setMaxDev(int maxdev)
 {
     ic.maxdev = maxdev;
@@ -1607,7 +1642,7 @@ void Parameter::setOutflowBoundaryNormalZ(std::string outflowNormalZ)
 void Parameter::setMainKernel(std::string kernel)
 {
     this->mainKernel = kernel;
-    if (kernel.find("Stream") != std::string::npos || kernel.find("Redesigned") != std::string::npos)
+    if ( kernel.find("CumulantK17") != std::string::npos ) // substring match: any name containing "CumulantK17" sets the flag; it is never cleared here
         this->kernelNeedsFluidNodeIndicesToRun = true;
 }
 void Parameter::setMultiKernelOn(bool isOn)
@@ -1720,22 +1755,22 @@ unsigned int Parameter::getSizeMat(int level)
 {
     return parH[level]->size_Mat;
 }
-unsigned int Parameter::getMemSizereal(int level)
-{
-    return parH[level]->mem_size_real;
-}
-unsigned int Parameter::getMemSizeInt(int level)
-{
-    return parH[level]->mem_size_int;
-}
-unsigned int Parameter::getMemSizeBool(int level)
-{
-    return parH[level]->mem_size_bool;
-}
-unsigned int Parameter::getMemSizerealYZ(int level)
-{
-    return parH[level]->mem_size_real_yz;
-}
+//unsigned int Parameter::getMemSizereal(int level)      //DEPRECATED: related to full matrix
+//{
+//    return parH[level]->mem_size_real;
+//}
+//unsigned int Parameter::getMemSizeInt(int level)     //DEPRECATED: related to full matrix
+//{
+//    return parH[level]->mem_size_int;
+//}
+//unsigned int Parameter::getMemSizeBool(int level)    //DEPRECATED: related to full matrix
+//{
+//    return parH[level]->mem_size_bool;
+//}
+//unsigned int Parameter::getMemSizerealYZ(int level)  //DEPRECATED: related to full matrix
+//{
+//    return parH[level]->mem_size_real_yz;
+//}
 int Parameter::getFine()
 {
     return fine;
@@ -1916,6 +1951,10 @@ unsigned int Parameter::getPressOutZ()
 {
     return ic.PressOutZ;
 }
+real Parameter::getOutflowPressureCorrectionFactor()
+{
+    return ic.outflowPressureCorrectionFactor; // value last stored by setOutflowPressureCorrectionFactor()
+}
 int Parameter::getMaxDev()
 {
     return ic.maxdev;
@@ -2657,8 +2696,7 @@ void Parameter::setUseStreams(bool useStreams)
     if (useStreams) {
         if (this->getNumprocs() != 1) {
             this->useStreams = useStreams;
-            this->cudaStreamManager = std::make_unique<CudaStreamManager>();
-            return;
+            return; 
         } else {
             std::cout << "Can't use streams with only one process!" << std::endl;
         }
diff --git a/src/gpu/VirtualFluids_GPU/Parameter/Parameter.h b/src/gpu/VirtualFluids_GPU/Parameter/Parameter.h
index cbb8bfd68702bc2285947eb76e6d0adc54a5b6c1..fa45b1742f20e32258195c78b630ce95175af938 100644
--- a/src/gpu/VirtualFluids_GPU/Parameter/Parameter.h
+++ b/src/gpu/VirtualFluids_GPU/Parameter/Parameter.h
@@ -54,6 +54,8 @@ class ConfigurationFile;
 }
 class CudaStreamManager;
 
+class TransientBCInputFileReader;
+
 //! \struct LBMSimulationParameter
 //! \brief struct holds and manages the LB-parameter of the simulation
 //! \brief For this purpose it holds structures and pointer for host and device data, respectively.
@@ -65,16 +67,78 @@ struct LBMSimulationParameter {
     //////////////////////////////////////////////////////////////////////////
     //! \brief stores the number of threads per GPU block
     uint numberofthreads;
+    //! \brief store all distribution functions for the D3Q27
+    Distributions27 distributions;
+    //////////////////////////////////////////////////////////////////////////
+    //! \brief stores the type for every lattice node (e.g. fluid node)
+    uint *typeOfGridNode;
+    //////////////////////////////////////////////////////////////////////////
+    //! \brief store the neighbors in +X, +Y, +Z, and in diagonal negative direction
+    //! \brief this information is important because we use an indirect addressing scheme
+    uint *neighborX, *neighborY, *neighborZ, *neighborInverse;
+    //////////////////////////////////////////////////////////////////////////
+    //! \brief store the coordinates for every lattice node
+    real *coordinateX, *coordinateY, *coordinateZ;
+    //////////////////////////////////////////////////////////////////////////
+    //! \brief store the macroscopic values (velocity, density, pressure)
+    //! \brief for every lattice node
+    real *velocityX, *velocityY, *velocityZ, *rho, *pressure;
+    //! \brief stores the value for omega
+    real omega;
+    //////////////////////////////////////////////////////////////////////////
+    //! \brief stores the number of nodes (based on indirect addressing scheme)
+    unsigned long long numberOfNodes;
+    //! \brief stores the size of the memory consumption for real/int values of the arrays (e.g. coordinates, velocity)
+    unsigned long long memSizeRealLBnodes, memSizeLonglongLBnodes;
+
+
+
+
+
+
+    //////////////////////////////////////////////////////////////////////////
+    // DEPRECATED
+    //////////////////////////////////////////////////////////////////////////
 
     // distributions///////////
     // Distributions19 d0;
     Distributions27 d0;  // DEPRECATED: distribution functions for full matrix (not sparse)
-    //! \brief store all distribution functions for the D3Q27
-    Distributions27 distributions;
+
+    // typeOfGridNode (formerly known as "geo") /////////////////////
+    int *geo; // DEPRECATED: typeOfGridNode for full matrix (not sparse)
+
+    // k///////////////////////
+    unsigned int *k; // DEPRECATED: index for full matrix
+
+    // memsize/////////////////
+    //unsigned int mem_size_real_yz;
+    //unsigned int mem_size_bool;
+    //unsigned int mem_size_int;
+    //unsigned int mem_size_real;
+
+    //////////////////////////////////////////////////////////////////////////
+
+
+
+
+
+    //////////////////////////////////////////////////////////////////////////
+    // additional logic 
+    //////////////////////////////////////////////////////////////////////////
 
     // distributions F3////////
     Distributions6 g6;
 
+    unsigned int size_Array_SP;
+
+
+    // memsizeSP/////////////////
+
+
+
+    //////////////////////////////////////////////////////////////////////////
+
+
     // advection diffusion //////////////////
     //! \brief store all distribution functions for the D3Q7 advection diffusion field
     Distributions7 distributionsAD7;
@@ -104,22 +168,6 @@ struct LBMSimulationParameter {
     real cStartx, cStarty, cStartz;
     real cFx, cFy, cFz;
 
-    // typeOfGridNode (formerly known as "geo") /////////////////////
-    int *geo; // DEPRECATED: typeOfGridNode for full matrix (not sparse)
-    //! \brief stores the type for every lattice node (f.e. fluid node)
-    unsigned int *typeOfGridNode;
-
-    // k///////////////////////
-    unsigned int *k; // DEPRECATED: index for full matrix
-
-    // neighbor///////////////////////////////////////////////////////////////
-    //! \brief store the neighbors in +X, +Y, +Z, and in diagonal negative direction
-    //! \brief this information is important because we use an indirect addressing scheme
-    uint *neighborX, *neighborY, *neighborZ, *neighborInverse;
-
-    // coordinates////////////////////////////////////////////////////////////
-    //! \brief store the coordinates for every lattice node
-    real *coordinateX, *coordinateY, *coordinateZ;
 
     // body forces////////////
     real *forceX_SP, *forceY_SP, *forceZ_SP;
@@ -138,11 +186,6 @@ struct LBMSimulationParameter {
 
     // macroscopic values//////
     // real *vx, *vy, *vz, *rho;  // DEPRECATED: macroscopic values for full matrix
-    //! \brief store the macroscopic values (velocity, density, pressure)
-    //! \brief for every lattice node
-    real *velocityX, *velocityY, *velocityZ, *rho, *pressure;
-    //! \brief stores the value for omega
-    real omega;
     //! \brief stores the value for viscosity (on level 0)
     real vis;
 
@@ -163,11 +206,6 @@ struct LBMSimulationParameter {
     unsigned int size_Mat;
     unsigned int sizePlaneXY, sizePlaneYZ, sizePlaneXZ;
 
-    // size of sparse matrix//////////
-    //! \brief stores the number of nodes (based on indirect addressing scheme)
-    unsigned int numberOfNodes;
-    unsigned int size_Array_SP;
-
     // size of Plane btw. 2 GPUs//////
     unsigned int sizePlaneSB, sizePlaneRB, startB, endB;
     unsigned int sizePlaneST, sizePlaneRT, startT, endT;
@@ -180,16 +218,6 @@ struct LBMSimulationParameter {
     unsigned int sizePlanePressOUT, startPOUT;
     bool isSetPress;
 
-    // memsizeSP/////////////////
-    //! \brief stores the size of the memory consumption for real/int values of the arrays (e.g. coordinates, velocity)
-    unsigned int mem_size_real_SP;
-    unsigned int mem_size_int_SP;
-
-    // memsize/////////////////
-    unsigned int mem_size_real;
-    unsigned int mem_size_int;
-    unsigned int mem_size_bool;
-    unsigned int mem_size_real_yz;
 
     // print///////////////////
     unsigned int startz, endz;
@@ -218,16 +246,16 @@ struct LBMSimulationParameter {
     OffsetFC offFCBulk;
     unsigned int mem_size_kCF_off;
     unsigned int mem_size_kFC_off;
-
-    // BC's////////////////////
+    
     //! \brief stores the boundary condition data
     QforBoundaryConditions noSlipBC, velocityBC, outflowBC, slipBC, stressBC, pressureBC;
     //! \brief number of lattice nodes for the boundary conditions
-    unsigned int numberOfNoSlipBCnodesRead, numberOfVeloBCnodesRead, numberOfOutflowBCnodesRead, numberOfSlipBCnodesRead, numberOfStressBCnodesRead, numberOfPressureBCnodesRead;
+    unsigned int numberOfNoSlipBCnodesRead, numberOfVeloBCnodesRead, numberOfOutflowBCnodesRead, numberOfSlipBCnodesRead, numberOfStressBCnodesRead, numberOfPressureBCnodesRead, numberOfPrecursorBCnodesRead;
 
     QforBoundaryConditions QpressX0, QpressX1, QpressY0, QpressY1, QpressZ0, QpressZ1; // DEPRECATED
     QforBoundaryConditions propellerBC;
     QforBoundaryConditions geometryBC;
+    QforPrecursorBoundaryConditions precursorBC;
     QforBoundaryConditions geometryBCnormalX, geometryBCnormalY, geometryBCnormalZ;
     QforBoundaryConditions inflowBCnormalX, inflowBCnormalY, inflowBCnormalZ;
     QforBoundaryConditions outflowBCnormalX, outflowBCnormalY, outflowBCnormalZ;
@@ -235,6 +263,8 @@ struct LBMSimulationParameter {
     unsigned int kInletQread, kOutletQread;  // DEPRECATED
 
     WallModelParameters wallModel;
+    std::vector<SPtr<TransientBCInputFileReader>> transientBCInputFileReader;
+    real outflowPressureCorrectionFactor;
 
     // testRoundoffError
     Distributions27 kDistTestRE;
@@ -367,10 +397,19 @@ struct LBMSimulationParameter {
     std::vector<EdgeNodePositions> edgeNodesYtoZ;
 
     ///////////////////////////////////////////////////////
-    uint *fluidNodeIndices;
-    uint numberOfFluidNodes;
-    uint *fluidNodeIndicesBorder;
-    uint numberOfFluidNodesBorder;
+    std::map<CollisionTemplate, uint*>    taggedFluidNodeIndices = {{CollisionTemplate::Default,        nullptr},
+                                                                    {CollisionTemplate::SubDomainBorder,nullptr},
+                                                                    {CollisionTemplate::WriteMacroVars, nullptr},
+                                                                    {CollisionTemplate::ApplyBodyForce, nullptr},
+                                                                    {CollisionTemplate::AllFeatures,    nullptr}};
+    std::map<CollisionTemplate, uint >  numberOfTaggedFluidNodes = {{CollisionTemplate::Default,        0},
+                                                                    {CollisionTemplate::SubDomainBorder,0},
+                                                                    {CollisionTemplate::WriteMacroVars, 0},
+                                                                    {CollisionTemplate::ApplyBodyForce, 0},
+                                                                    {CollisionTemplate::AllFeatures,    0}};
+
+    std::vector<CollisionTemplate> allocatedBulkFluidNodeTags = {};
+
 };
 
 //! \brief Class for LBM-parameter management
@@ -471,6 +510,7 @@ public:
     void setpressBcPos(std::string pressBcPos);
     void setpressBcQs(std::string pressBcQs);
     void setpressBcValue(std::string pressBcValue);
+    void setOutflowPressureCorrectionFactor(real correctionFactor);
     void setpressBcValues(std::string pressBcValues);
     void setvelBcQs(std::string velBcQs);
     void setvelBcValues(std::string velBcValues);
@@ -527,7 +567,6 @@ public:
     void setUseWale(bool useWale);
     void setTurbulenceModel(TurbulenceModel turbulenceModel);
     void setUseTurbulentViscosity(bool useTurbulentViscosity);
-    void setUseAMD(bool useAMD);
     void setSGSConstant(real SGSConstant);
     void setHasWallModelMonitor(bool hasWallModelMonitor);
     void setUseInitNeq(bool useInitNeq);
@@ -726,10 +765,10 @@ public:
     unsigned int getPressOutID();
     unsigned int getPressInZ();
     unsigned int getPressOutZ();
-    unsigned int getMemSizereal(int level);
-    unsigned int getMemSizeInt(int level);
-    unsigned int getMemSizeBool(int level);
-    unsigned int getMemSizerealYZ(int level);
+//    unsigned int getMemSizereal(int level);    //DEPRECATED: related to full matrix
+//    unsigned int getMemSizeInt(int level);     //DEPRECATED: related to full matrix
+//    unsigned int getMemSizeBool(int level);    //DEPRECATED: related to full matrix
+//    unsigned int getMemSizerealYZ(int level);  //DEPRECATED: related to full matrix
     unsigned int getSizeMat(int level);
     unsigned int getTimestepStart();
     unsigned int getTimestepInit();
@@ -765,6 +804,8 @@ public:
     real getScaledDensityRatio(int level);
     //! \returns the pressure ratio in SI/LB units scaled to the respective level
     real getScaledPressureRatio(int level);
+    //! \returns the stress ratio in SI/LB units scaled to the respective level
+    real getScaledStressRatio(int level);
     //! \returns the time ratio in SI/LB units scaled to the respective level
     real getScaledTimeRatio(int level);
     //! \returns the length ratio in SI/LB units scaled to the respective level
@@ -853,6 +894,7 @@ public:
     std::string getOutflowBoundaryNormalX();
     std::string getOutflowBoundaryNormalY();
     std::string getOutflowBoundaryNormalZ();
+    real getOutflowPressureCorrectionFactor();
     // CUDA random number
     curandState *getRandomState();
     // Kernel
@@ -896,6 +938,8 @@ private:
 
     void setPathAndFilename(std::string fname);
 
+    void checkParameterValidityCumulantK17() const;
+
 private:
     bool compOn{ false };
     bool diffOn{ false };
diff --git a/src/gpu/VirtualFluids_GPU/Parameter/ParameterTest.cpp b/src/gpu/VirtualFluids_GPU/Parameter/ParameterTest.cpp
index 4025acf7acad362e9f0f3702cb897b9c1b6dbf3b..72a12ae880556e6e257eb69dee4e806617252629 100644
--- a/src/gpu/VirtualFluids_GPU/Parameter/ParameterTest.cpp
+++ b/src/gpu/VirtualFluids_GPU/Parameter/ParameterTest.cpp
@@ -1,4 +1,3 @@
-#include <gmock/gmock.h>
 #include "basics/tests/testUtilities.h"
 
 #include <filesystem>
@@ -8,7 +7,6 @@
 #include "Parameter.h"
 #include "basics/config/ConfigurationFile.h"
 
-
 TEST(ParameterTest, passingEmptyFileWithoutPath_ShouldNotThrow)
 {
     // assuming that the config files is stored parallel to this file.
@@ -37,7 +35,9 @@ TEST(ParameterTest, check_all_Parameter_CanBePassedToConstructor)
 
     // test optional parameter
     EXPECT_THAT(para.getOutputPath(), testing::Eq("/output/path/"));
-    EXPECT_THAT(para.getGridPath(), testing::Eq("/path/to/grid/")); // ... all grid files (e.g. multi-gpu/ multi-level) could be tested as well
+    EXPECT_THAT(
+        para.getGridPath(),
+        testing::Eq("/path/to/grid/")); // ... all grid files (e.g. multi-gpu/ multi-level) could be tested as well
     EXPECT_THAT(para.getgeoVec(), testing::Eq("/path/to/grid/geoVec.dat"));
     EXPECT_THAT(para.getMaxDev(), testing::Eq(2));
     EXPECT_THAT(para.getDevices(), testing::ElementsAreArray({ 2, 3 }));
@@ -163,7 +163,7 @@ TEST(ParameterTest, setGridPathOverridesDefaultGridPath)
     Parameter para(2, 1);
     para.setGridPath("gridPathTest");
 
-    EXPECT_THAT( para.getGridPath(), testing::Eq("gridPathTest/1/"));
+    EXPECT_THAT(para.getGridPath(), testing::Eq("gridPathTest/1/"));
     EXPECT_THAT(para.getConcentration(), testing::Eq("gridPathTest/1/conc.dat"));
 }
 
@@ -177,9 +177,8 @@ TEST(ParameterTest, setGridPathOverridesConfigFile)
     auto para = Parameter(2, 0, &config);
     para.setGridPath("gridPathTest");
 
-    EXPECT_THAT( para.getGridPath(), testing::Eq("gridPathTest/0/"));
+    EXPECT_THAT(para.getGridPath(), testing::Eq("gridPathTest/0/"));
     EXPECT_THAT(para.getConcentration(), testing::Eq("gridPathTest/0/conc.dat"));
-
 }
 
 TEST(ParameterTest, userMissedSlash)
@@ -189,7 +188,6 @@ TEST(ParameterTest, userMissedSlash)
 
     EXPECT_THAT(para.getGridPath(), testing::Eq("gridPathTest/"));
     EXPECT_THAT(para.getConcentration(), testing::Eq("gridPathTest/conc.dat"));
-
 }
 
 TEST(ParameterTest, userMissedSlashMultiGPU)
@@ -199,4 +197,87 @@ TEST(ParameterTest, userMissedSlashMultiGPU)
 
     EXPECT_THAT(para.getGridPath(), testing::Eq("gridPathTest/0/"));
     EXPECT_THAT(para.getConcentration(), testing::Eq("gridPathTest/0/conc.dat"));
-}
\ No newline at end of file
+}
+
+class ParameterTestCumulantK17 : public testing::Test
+{
+protected:
+    void SetUp() override
+    {
+    }
+    // Fetches the captured stdout (call after testing::internal::CaptureStdout()) and searches it for "warning".
+    bool stdoutContainsWarning()
+    {
+        std::string output = testing::internal::GetCapturedStdout();
+        return output.find("warning") != std::string::npos;
+    }
+    // Default-constructed Parameter; each test configures it before calling initLBMSimulationParameter().
+    Parameter para;
+};
+
+TEST_F(ParameterTestCumulantK17, CumulantK17_VelocityIsTooHigh_expectWarning)
+{
+    // 0.11 lies above the velocity limit of 0.1 used by checkParameterValidityCumulantK17().
+    para.setVelocityLB(0.11);
+    para.setMainKernel("CumulantK17");
+    testing::internal::CaptureStdout();
+
+    para.initLBMSimulationParameter();
+
+    EXPECT_TRUE(stdoutContainsWarning());
+}
+
+TEST_F(ParameterTestCumulantK17, CumulantK17_VelocityIsOk_expectNoWarning)
+{
+    para.setVelocityLB(0.09); // below the velocity limit of 0.1
+    para.setMainKernel("CumulantK17");
+    testing::internal::CaptureStdout();
+
+    para.initLBMSimulationParameter();
+
+    EXPECT_FALSE(stdoutContainsWarning());
+}
+
+TEST_F(ParameterTestCumulantK17, NotCumulantK17_VelocityIsTooHigh_expectNoWarning)
+{
+    para.setVelocityLB(42); // far above the limit, but the check is skipped for other kernels
+    para.setMainKernel("K"); // any kernel name other than "CumulantK17"
+    testing::internal::CaptureStdout();
+
+    para.initLBMSimulationParameter();
+
+    EXPECT_FALSE(stdoutContainsWarning());
+}
+
+TEST_F(ParameterTestCumulantK17, CumulantK17_ViscosityIsTooHigh_expectWarning)
+{
+    para.setViscosityLB(0.024); // just above the viscosity limit of 1/42 (about 0.0238)
+    para.setMainKernel("CumulantK17");
+    testing::internal::CaptureStdout();
+
+    para.initLBMSimulationParameter();
+
+    EXPECT_TRUE(stdoutContainsWarning());
+}
+
+TEST_F(ParameterTestCumulantK17, CumulantK17_ViscosityIsOk_expectNoWarning)
+{
+    para.setViscosityLB(0.023); // just below the viscosity limit of 1/42 (about 0.0238)
+    para.setMainKernel("CumulantK17");
+    testing::internal::CaptureStdout();
+
+    para.initLBMSimulationParameter();
+
+    EXPECT_FALSE(stdoutContainsWarning());
+}
+
+TEST_F(ParameterTestCumulantK17, NotCumulantK17_ViscosityIsTooHigh_expectNoWarning)
+{
+    para.setViscosityLB(10); // far above the limit, but the check is skipped for other kernels
+    para.setMainKernel("K"); // any kernel name other than "CumulantK17"
+    testing::internal::CaptureStdout();
+
+    para.initLBMSimulationParameter();
+
+    EXPECT_FALSE(stdoutContainsWarning());
+}
diff --git a/src/gpu/VirtualFluids_GPU/Particles/Particles.cpp b/src/gpu/VirtualFluids_GPU/Particles/Particles.cpp
index 7c710f50afb0ae07edd53ef9d68e294c7af54ac1..e0156e3fbae46282baeb1359c719a077f021cf6b 100644
--- a/src/gpu/VirtualFluids_GPU/Particles/Particles.cpp
+++ b/src/gpu/VirtualFluids_GPU/Particles/Particles.cpp
@@ -141,12 +141,12 @@ void initParticles(Parameter* para)
 			para->getParH(lev)->plp.coordZabsolut[i] = (real)zCoordVec[i]; 
 
 			// find IDs
-			for (unsigned int ii = 0; ii < para->getParH(lev)->numberOfNodes; ii++)
+			for (size_t index = 0; index < para->getParH(lev)->numberOfNodes; index++)
 			{
-				if ((para->getParH(lev)->coordinateX[ii] <= para->getParH(lev)->plp.coordXabsolut[i]) &&
-					((para->getParH(lev)->plp.coordXabsolut[i] - para->getParH(lev)->coordinateX[ii]) <= dx))
+				if ((para->getParH(lev)->coordinateX[index] <= para->getParH(lev)->plp.coordXabsolut[i]) &&
+					((para->getParH(lev)->plp.coordXabsolut[i] - para->getParH(lev)->coordinateX[index]) <= dx))
 				{
-					tempID.push_back(ii);
+					tempID.push_back((int)index);
 				}
 			}
 
@@ -455,7 +455,7 @@ void rearrangeGeometry(Parameter* para, CudaMemoryManager* cudaMemoryManager)
 		int counter2 = 0;
 		//////////////////////////////////////////////////////////////////////////
 		//redefine fluid nodes
-		for (uint index = 0; index < para->getParH(lev)->numberOfNodes; index++)
+		for (size_t index = 0; index < para->getParH(lev)->numberOfNodes; index++)
 		{
 			if (para->getParH(lev)->typeOfGridNode[index] == GEO_FLUID_OLD)
 			{
diff --git a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/ActuatorFarm.cu b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/ActuatorFarm.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9447a8636e801c132df9cef2feced4b5ab4e68de
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/ActuatorFarm.cu
@@ -0,0 +1,629 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file ActuatorFarm.cu
+//! \ingroup PreCollisionInteractor
+//! \author Henrik Asmuth, Henry Korb
+//======================================================================================
+#include "ActuatorFarm.h"
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <helper_cuda.h>
+
+#include "cuda/CudaGrid.h"
+#include "VirtualFluids_GPU/GPU/GeometryUtils.h"
+#include "LBM/GPUHelperFunctions/KernelUtilities.h"
+
+#include "Parameter/Parameter.h"
+#include "Parameter/CudaStreamManager.h"
+#include "DataStructureInitializer/GridProvider.h"
+#include "GPU/CudaMemoryManager.h"
+#include "lbm/constants/NumericConstants.h"
+#include "logger/Logger.h"
+
+using namespace vf::lbm::constant;
+
+
+__host__ __device__ __inline__ uint calcNode(uint bladeNode, uint numberOfBladeNodes, uint blade, uint numberOfBlades, uint turbine, uint numberOfTurbines)
+{
+    // Flat index with bladeNode varying fastest, then blade, then turbine; numberOfTurbines is not used in the computation.
+    return bladeNode+numberOfBladeNodes*(blade+numberOfBlades*turbine);
+}
+
+__host__ __device__ __inline__ void calcTurbineBladeAndBladeNode(uint node, uint& bladeNode, uint numberOfBladeNodes, uint& blade, uint numberOfBlades, uint& turbine, uint numberOfTurbines)
+{
+    // Inverse of calcNode(): node = bladeNode + numberOfBladeNodes*(blade + numberOfBlades*turbine); outputs are turbine, blade, bladeNode.
+    const uint nodesPerTurbine = numberOfBladeNodes*numberOfBlades; // numberOfTurbines is not needed for the decomposition
+    turbine = node/nodesPerTurbine;
+    blade = (node - turbine*nodesPerTurbine)/numberOfBladeNodes; // stride of one blade is numberOfBladeNodes (not numberOfBlades)
+    bladeNode = node - turbine*nodesPerTurbine - blade*numberOfBladeNodes; // plain remainder, must not be divided again
+}
+
+__host__ __device__ __forceinline__ real distSqrd(real distX, real distY, real distZ)
+{
+    return distX*distX+distY*distY+distZ*distZ; // squared Euclidean length of (distX, distY, distZ); no square root is taken
+}
+
+void swapArrays(real* &arr1, real* &arr2)
+{
+    real* const previousSecond = arr2; // exchanges the two pointers only; the arrays they point to are untouched
+    arr2 = arr1;
+    arr1 = previousSecond;
+}
+
+__host__ __device__ __inline__ void rotateFromBladeToGlobal(
+                            real& bladeCoordX_BF, real& bladeCoordY_BF, real& bladeCoordZ_BF, 
+                            real& bladeCoordX_GF, real& bladeCoordY_GF, real& bladeCoordZ_GF,
+                            real& azimuth, real& yaw)
+{
+    real tmpX, tmpY, tmpZ;
+    // _BF arguments are passed as the source, _GF arguments receive the result: rotateAboutX3D by azimuth first, then rotateAboutZ3D by yaw.
+    rotateAboutX3D(azimuth, bladeCoordX_BF, bladeCoordY_BF, bladeCoordZ_BF, tmpX, tmpY, tmpZ);
+    rotateAboutZ3D(yaw, tmpX, tmpY, tmpZ, bladeCoordX_GF, bladeCoordY_GF, bladeCoordZ_GF);
+
+}
+
+__host__ __device__ __inline__ void rotateFromGlobalToBlade(
+                            real& bladeCoordX_BF, real& bladeCoordY_BF, real& bladeCoordZ_BF, 
+                            real& bladeCoordX_GF, real& bladeCoordY_GF, real& bladeCoordZ_GF,
+                            real& azimuth, real& yaw)
+{
+    real tmpX, tmpY, tmpZ;
+    // Note the argument order: the _BF results come first, the _GF source values second. invRotateAboutZ3D by yaw runs before invRotateAboutX3D by azimuth.
+    invRotateAboutZ3D(yaw, bladeCoordX_GF, bladeCoordY_GF, bladeCoordZ_GF, tmpX, tmpY, tmpZ);
+    invRotateAboutX3D(azimuth, tmpX, tmpY, tmpZ, bladeCoordX_BF, bladeCoordY_BF, bladeCoordZ_BF);
+}
+
+__global__ void interpolateVelocities(real* gridCoordsX, real* gridCoordsY, real* gridCoordsZ, 
+                                      uint* neighborsX, uint* neighborsY, uint* neighborsZ, uint* neighborsWSB, 
+                                      real* vx, real* vy, real* vz, 
+                                      real* bladeCoordsX, real* bladeCoordsY, real* bladeCoordsZ,
+                                      real* bladeVelocitiesX, real* bladeVelocitiesY, real* bladeVelocitiesZ, 
+                                      uint numberOfTurbines, uint numberOfBlades, uint numberOfBladeNodes, 
+                                      real* azimuths, real* yaws, real* omegas, 
+                                      real* turbPosX, real* turbPosY, real* turbPosZ,
+                                      uint* bladeIndices, real velocityRatio, real invDeltaX)
+{
+    // Interpolates the grid velocities (vx, vy, vz) to every blade node and writes them to bladeVelocitiesX/Y/Z.
+    ////////////////////////////////////////////////////////////////////////////////
+    //! - Get node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
+    //!
+    const unsigned nodeIndex = vf::gpu::getNodeIndex();
+    // Threads whose index is not below the total number of blade nodes do nothing.
+    if(nodeIndex>=numberOfBladeNodes*numberOfBlades*numberOfTurbines) return;
+
+    uint turbine, bladeNode, blade;
+    // Split the flat node index into turbine, blade and blade-node indices.
+    calcTurbineBladeAndBladeNode(nodeIndex, bladeNode, numberOfBladeNodes, blade, numberOfBlades, turbine, numberOfTurbines);
+
+    real bladeCoordX_BF = bladeCoordsX[nodeIndex];
+    real bladeCoordY_BF = bladeCoordsY[nodeIndex];
+    real bladeCoordZ_BF = bladeCoordsZ[nodeIndex];
+
+    real bladeCoordX_GF, bladeCoordY_GF, bladeCoordZ_GF;
+    // Azimuth of this blade: the turbine's azimuth plus blade*c2Pi/numberOfBlades.
+    real localAzimuth = azimuths[turbine]+blade*c2Pi/numberOfBlades;
+    real yaw = yaws[turbine];
+
+    // Rotate the stored (_BF) coordinates into the _GF frame, then shift them by the turbine position.
+    rotateFromBladeToGlobal(bladeCoordX_BF, bladeCoordY_BF, bladeCoordZ_BF, 
+                            bladeCoordX_GF, bladeCoordY_GF, bladeCoordZ_GF,
+                            localAzimuth, yaw);
+
+    bladeCoordX_GF += turbPosX[turbine];
+    bladeCoordY_GF += turbPosY[turbine];
+    bladeCoordZ_GF += turbPosZ[turbine];
+
+    uint k, ke, kn, kt;
+    uint kne, kte, ktn, ktne;
+    // The search is seeded with the index currently stored in bladeIndices for this node.
+    k = findNearestCellBSW(bladeIndices[nodeIndex], 
+                           gridCoordsX, gridCoordsY, gridCoordsZ, 
+                           bladeCoordX_GF, bladeCoordY_GF, bladeCoordZ_GF, 
+                           neighborsX, neighborsY, neighborsZ, neighborsWSB);
+    // The found index is stored back into bladeIndices.
+    bladeIndices[nodeIndex] = k;
+
+    getNeighborIndicesOfBSW(k, ke, kn, kt, kne, kte, ktn, ktne, neighborsX, neighborsY, neighborsZ);
+
+    real dW, dE, dN, dS, dT, dB;
+    // Offsets of the blade node from grid node k, multiplied by invDeltaX.
+    real distX = invDeltaX*(bladeCoordX_GF-gridCoordsX[k]);
+    real distY = invDeltaX*(bladeCoordY_GF-gridCoordsY[k]);
+    real distZ = invDeltaX*(bladeCoordZ_GF-gridCoordsZ[k]);
+
+    getInterpolationWeights(dW, dE, dN, dS, dT, dB, distX, distY, distZ);
+    // Interpolate each velocity component over k and its seven neighbor indices, then scale by velocityRatio.
+    real bladeVelX_GF = trilinearInterpolation(dW, dE, dN, dS, dT, dB, k, ke, kn, kt, kne, kte, ktn, ktne, vx)*velocityRatio;
+    real bladeVelY_GF = trilinearInterpolation(dW, dE, dN, dS, dT, dB, k, ke, kn, kt, kne, kte, ktn, ktne, vy)*velocityRatio;
+    real bladeVelZ_GF = trilinearInterpolation(dW, dE, dN, dS, dT, dB, k, ke, kn, kt, kne, kte, ktn, ktne, vz)*velocityRatio;
+
+    real bladeVelX_BF, bladeVelY_BF, bladeVelZ_BF;
+    // Rotate the interpolated velocity back from the _GF frame into the _BF frame.
+    rotateFromGlobalToBlade(bladeVelX_BF, bladeVelY_BF, bladeVelZ_BF, 
+                            bladeVelX_GF, bladeVelY_GF, bladeVelZ_GF, 
+                            localAzimuth, yaw);
+    // Only the Y component gets the extra term omegas[turbine]*bladeCoordZ_BF.
+    bladeVelocitiesX[nodeIndex] = bladeVelX_BF;
+    bladeVelocitiesY[nodeIndex] = bladeVelY_BF+omegas[turbine]*bladeCoordZ_BF;
+    bladeVelocitiesZ[nodeIndex] = bladeVelZ_BF;
+}
+
+
+__global__ void applyBodyForces(real* gridCoordsX, real* gridCoordsY, real* gridCoordsZ,
+                                real* gridForcesX, real* gridForcesY, real* gridForcesZ, 
+                                real* bladeCoordsX, real* bladeCoordsY, real* bladeCoordsZ, 
+                                real* bladeForcesX, real* bladeForcesY, real* bladeForcesZ,
+                                const uint numberOfTurbines, const uint numberOfBlades, const uint numberOfBladeNodes,
+                                real* azimuths, real* yaws, real* diameters,
+                                real* turbPosX, real* turbPosY, real* turbPosZ,
+                                uint* gridIndices, uint nIndices, 
+                                const real invEpsilonSqrd, const real factorGaussian)
+{
+
+    const uint index = vf::gpu::getNodeIndex();
+
+    if(index>=nIndices) return;
+
+
+    uint gridIndex = gridIndices[index];
+
+    real gridCoordX_GF = gridCoordsX[gridIndex];
+    real gridCoordY_GF = gridCoordsY[gridIndex];
+    real gridCoordZ_GF = gridCoordsZ[gridIndex];
+
+    real gridForceX_RF = c0o1;
+    real gridForceY_RF = c0o1;
+    real gridForceZ_RF = c0o1;
+
+    real dAzimuth = c2Pi/numberOfBlades;
+
+    for(uint turbine = 0; turbine<numberOfTurbines; turbine++)
+    {
+        real radius = c1o2*diameters[turbine];
+        real gridCoordX_RF = gridCoordX_GF - turbPosX[turbine];
+        real gridCoordY_RF = gridCoordY_GF - turbPosY[turbine];
+        real gridCoordZ_RF = gridCoordZ_GF - turbPosZ[turbine];
+
+        if(distSqrd(gridCoordX_RF, gridCoordY_RF, gridCoordZ_RF)*invEpsilonSqrd > radius*radius*invEpsilonSqrd+c7o1)
+            continue;
+
+        real azimuth = azimuths[turbine];
+        real yaw = yaws[turbine];
+
+        for( uint blade=0; blade<numberOfBlades; blade++)
+        { 
+            real localAzimuth = azimuth+blade*dAzimuth;
+
+
+            real gridCoordX_BF, gridCoordY_BF, gridCoordZ_BF;
+
+            rotateFromGlobalToBlade(gridCoordX_BF, gridCoordY_BF, gridCoordZ_BF,
+                                    gridCoordX_RF, gridCoordY_RF, gridCoordZ_RF,
+                                    localAzimuth, yaw);
+            
+            uint node;
+            uint nextNode = calcNode(0, numberOfBladeNodes, blade, numberOfBlades, turbine, numberOfTurbines);
+
+            real last_z = c0o1;
+            real current_z = c0o1;
+            real next_z = bladeCoordsZ[nextNode];
+
+            real x, y, dz, eta, forceX_RF, forceY_RF, forceZ_RF;
+
+            for( uint bladeNode=0; bladeNode<numberOfBladeNodes-1; bladeNode++)
+            {
+                node = nextNode;
+                nextNode = calcNode(bladeNode+1, numberOfBladeNodes, blade, numberOfBlades, turbine, numberOfTurbines);
+
+                x = bladeCoordsX[node];
+                y = bladeCoordsY[node];
+                last_z = current_z;
+                current_z = next_z;
+                next_z = bladeCoordsZ[nextNode];
+
+                dz = c1o2*(next_z-last_z);
+
+                eta = dz*factorGaussian*exp(-distSqrd(x-gridCoordX_BF, y-gridCoordY_BF, current_z-gridCoordZ_BF)*invEpsilonSqrd);
+                rotateFromBladeToGlobal(bladeForcesX[node], bladeForcesY[node], bladeForcesZ[node], 
+                                        forceX_RF, forceY_RF, forceZ_RF, 
+                                        localAzimuth, yaw);
+                                        
+                gridForceX_RF += forceX_RF*eta;
+                gridForceY_RF += forceY_RF*eta;
+                gridForceZ_RF += forceZ_RF*eta;
+            }
+
+            //Handle last node separately
+
+            node = nextNode;
+
+            x = bladeCoordsX[node];
+            y = bladeCoordsY[node];
+            last_z = current_z;
+            current_z = next_z;
+
+            dz = c1o2*(radius-last_z);
+
+            eta = dz*factorGaussian*exp(-distSqrd(x-gridCoordX_BF, y-gridCoordY_BF, current_z-gridCoordZ_BF)*invEpsilonSqrd);
+
+            rotateFromBladeToGlobal(bladeForcesX[node], bladeForcesY[node], bladeForcesZ[node], 
+                                    forceX_RF, forceY_RF, forceZ_RF, 
+                                    localAzimuth, yaw);
+                
+            gridForceX_RF += forceX_RF*eta;
+            gridForceY_RF += forceY_RF*eta;
+            gridForceZ_RF += forceZ_RF*eta;
+        }
+    }
+
+    gridForcesX[gridIndex] += gridForceX_RF;
+    gridForcesY[gridIndex] += gridForceY_RF;
+    gridForcesZ[gridIndex] += gridForceZ_RF;
+}
+
+// Registers one turbine that will be created by init().
+// The values are only buffered in the preInit* vectors here; initTurbineGeometries() later copies them into
+// the host arrays.
+//   posX, posY, posZ : turbine position
+//   diameter         : rotor diameter
+//   omega            : angular velocity used to advance the azimuth
+//   azimuth, yaw     : initial orientation angles
+//   bladeRadii       : radial positions of the blade nodes; the first numberOfBladeNodes entries are used
+//                      and shared by all blades of this turbine
+// Throws std::runtime_error if bladeRadii holds fewer than numberOfBladeNodes entries.
+void ActuatorFarm::addTurbine(real posX, real posY, real posZ, real diameter, real omega, real azimuth, real yaw, std::vector<real> bladeRadii)
+{
+    // initTurbineGeometries() reads numberOfBladeNodes radii per turbine without checking the vector size,
+    // so a shorter vector has to be rejected here to avoid reading past its end.
+    if(bladeRadii.size() < this->numberOfBladeNodes)
+        throw std::runtime_error("ActuatorFarm::addTurbine: bladeRadii has fewer entries than the number of blade nodes.");
+
+    preInitPosX.push_back(posX);
+    preInitPosY.push_back(posY);
+    preInitPosZ.push_back(posZ);
+    preInitOmegas.push_back(omega);
+    preInitAzimuths.push_back(azimuth);
+    preInitYaws.push_back(yaw);
+    preInitDiameters.push_back(diameter);
+    preInitBladeRadii.push_back(bladeRadii);
+}
+
+// Sets up all host and device data of the farm.
+// Throws std::runtime_error if body forces are not enabled in para.
+// The turbine geometries are initialized first: that step fixes numberOfTurbines/numberOfNodes and fills the
+// position and diameter arrays which the later steps (in particular initBoundingSpheres) read.
+void ActuatorFarm::init(Parameter* para, GridProvider* gridProvider, CudaMemoryManager* cudaMemoryManager)
+{
+    if(!para->getIsBodyForce())
+    {
+        throw std::runtime_error("try to allocate ActuatorFarm but BodyForce is not set in Parameter.");
+    }
+
+    // forceRatio has to be known before initTurbineGeometries() computes factorGaussian from it
+    forceRatio = para->getForceRatio();
+
+    initTurbineGeometries(cudaMemoryManager);
+    initBladeCoords(cudaMemoryManager);
+    initBladeIndices(para, cudaMemoryManager);
+    initBladeVelocities(cudaMemoryManager);
+    initBladeForces(cudaMemoryManager);
+    initBoundingSpheres(para, cudaMemoryManager);
+
+    streamIndex = 0;
+}
+
+// Performs one coupling step between the flow field and the turbines on this interactor's grid level:
+//   1. interpolateVelocities kernel: one thread per blade node (numberOfNodes threads)
+//   2. calcBladeForces() on the host, followed by a swap of the current/previous device buffers
+//   3. applyBodyForces kernel: one thread per node inside the bounding spheres (numberOfIndices threads)
+// Calls for any other level return immediately. The parameter t is not used in this function.
+void ActuatorFarm::interact(Parameter* para, CudaMemoryManager* cudaMemoryManager, int level, unsigned int t)
+{
+    if (level != this->level) return;
+
+    cudaStream_t stream = para->getStreamManager()->getStream(CudaStreamIndex::ActuatorFarm, this->streamIndex);
+
+    // with host arrays enabled the blade coordinates held on the host are transferred before sampling
+    if(useHostArrays) cudaMemoryManager->cudaCopyBladeCoordsHtoD(this);
+
+    vf::cuda::CudaGrid bladeGrid = vf::cuda::CudaGrid(para->getParH(level)->numberofthreads, this->numberOfNodes);
+
+    interpolateVelocities<<< bladeGrid.grid, bladeGrid.threads, 0, stream >>>(
+        para->getParD(this->level)->coordinateX, para->getParD(this->level)->coordinateY, para->getParD(this->level)->coordinateZ,        
+        para->getParD(this->level)->neighborX, para->getParD(this->level)->neighborY, para->getParD(this->level)->neighborZ, para->getParD(this->level)->neighborInverse,
+        para->getParD(this->level)->velocityX, para->getParD(this->level)->velocityY, para->getParD(this->level)->velocityZ,
+        this->bladeCoordsXDCurrentTimestep, this->bladeCoordsYDCurrentTimestep, this->bladeCoordsZDCurrentTimestep,  
+        this->bladeVelocitiesXDCurrentTimestep, this->bladeVelocitiesYDCurrentTimestep, this->bladeVelocitiesZDCurrentTimestep,  
+        this->numberOfTurbines, this->numberOfBlades, this->numberOfBladeNodes,
+        this->azimuthsD, this->yawsD, this->omegasD, 
+        this->turbinePosXD, this->turbinePosYD, this->turbinePosZD,
+        this->bladeIndicesD, para->getVelocityRatio(), this->invDeltaX);
+
+    // the interpolated velocities must be complete before the host reads them or computes forces
+    cudaStreamSynchronize(stream);
+    if(useHostArrays) cudaMemoryManager->cudaCopyBladeVelocitiesDtoH(this);
+    this->calcBladeForces();
+    // from here on the *CurrentTimestep pointers refer to the buffers that were "previous" above
+    this->swapDeviceArrays();
+
+    if(useHostArrays) cudaMemoryManager->cudaCopyBladeForcesHtoD(this);
+
+    vf::cuda::CudaGrid sphereGrid = vf::cuda::CudaGrid(para->getParH(level)->numberofthreads, this->numberOfIndices);
+
+    applyBodyForces<<<sphereGrid.grid, sphereGrid.threads, 0, stream>>>(
+        para->getParD(this->level)->coordinateX, para->getParD(this->level)->coordinateY, para->getParD(this->level)->coordinateZ,        
+        para->getParD(this->level)->forceX_SP, para->getParD(this->level)->forceY_SP, para->getParD(this->level)->forceZ_SP,        
+        this->bladeCoordsXDCurrentTimestep, this->bladeCoordsYDCurrentTimestep, this->bladeCoordsZDCurrentTimestep,  
+        this->bladeForcesXDCurrentTimestep, this->bladeForcesYDCurrentTimestep, this->bladeForcesZDCurrentTimestep,
+        this->numberOfTurbines, this->numberOfBlades, this->numberOfBladeNodes,
+        this->azimuthsD, this->yawsD, this->diametersD,
+        this->turbinePosXD, this->turbinePosYD, this->turbinePosZD,
+        this->boundingSphereIndicesD, this->numberOfIndices,
+        this->invEpsilonSqrd, this->factorGaussian);
+    // NOTE(review): this copy is issued before the stream is synchronized, while applyBodyForces receives
+    // azimuthsD/yawsD as arguments. Whether the copy is ordered after the kernel depends on how
+    // cudaCopyBladeOrientationsHtoD and the stream are implemented - confirm.
+    cudaMemoryManager->cudaCopyBladeOrientationsHtoD(this);
+    cudaStreamSynchronize(stream);
+}
+
+
+// Releases the farm's buffers through the memory manager, one call per buffer group
+// (geometries, orientations, coordinates, velocities, forces, blade indices, sphere indices).
+// The parameter para is not used in this function.
+void ActuatorFarm::free(Parameter* para, CudaMemoryManager* cudaMemoryManager)
+{
+    cudaMemoryManager->cudaFreeBladeGeometries(this);
+    cudaMemoryManager->cudaFreeBladeOrientations(this);
+    cudaMemoryManager->cudaFreeBladeCoords(this);
+    cudaMemoryManager->cudaFreeBladeVelocities(this);
+    cudaMemoryManager->cudaFreeBladeForces(this);
+    cudaMemoryManager->cudaFreeBladeIndices(this);
+    cudaMemoryManager->cudaFreeSphereIndices(this);
+}
+
+
+// Host-side force model for blades with an elliptic chord distribution.
+// For every blade node the force is computed from the x/y components of the sampled blade velocity using
+// constant coefficients (Cl = 1, Cd = 0) and the chord c = c0*sqrt(1-(4r/D-1)^2), where r is the node radius
+// and D the turbine diameter. The negated forces are written to bladeForcesXH/YH, bladeForcesZH is set to zero.
+// After all nodes of a turbine are processed, its azimuth is advanced by deltaT*omega.
+void ActuatorFarm::calcForcesEllipticWing()
+{
+    const real Cl = c1o1;
+    const real Cd = c0o1;
+    const real c0 = 20*c1o10;
+
+    for(uint turbine=0; turbine<this->numberOfTurbines; turbine++)
+    {
+        const real diameter = this->diametersH[turbine];
+        for( uint blade=0; blade<this->numberOfBlades; blade++)
+        {
+            for( uint bladeNode=0; bladeNode<this->numberOfBladeNodes; bladeNode++)
+            {
+                const uint node = calcNode(bladeNode, this->numberOfBladeNodes, blade, this->numberOfBlades, turbine, this->numberOfTurbines);
+                // Radii are stored once per turbine (blade 0 of 1), exactly as initTurbineGeometries() writes
+                // them. Indexing with bladeNode alone would use the radii of turbine 0 for every turbine.
+                const uint radiusNode = calcNode(bladeNode, this->numberOfBladeNodes, 0, 1, turbine, this->numberOfTurbines);
+
+                const real u_rel = this->bladeVelocitiesXH[node];
+                const real v_rel = this->bladeVelocitiesYH[node];
+                const real u_rel_sq = u_rel*u_rel+v_rel*v_rel;
+                const real phi = atan2(u_rel, v_rel);
+
+                const real tmp = c4o1*this->bladeRadiiH[radiusNode]/diameter-c1o1;
+                const real c = c0 * sqrt( c1o1- tmp*tmp );
+                const real Cn = Cl*cos(phi)+Cd*sin(phi);
+                const real Ct = Cl*sin(phi)-Cd*cos(phi);
+                const real fx = c1o2*u_rel_sq*c*this->density*Cn;
+                const real fy = c1o2*u_rel_sq*c*this->density*Ct;
+                this->bladeForcesXH[node] = -fx;
+                this->bladeForcesYH[node] = -fy;
+                this->bladeForcesZH[node] = c0o1;
+            }
+        }
+        // advance the rotor position by one time step of this level
+        azimuthsH[turbine] = azimuthsH[turbine]+deltaT*omegasH[turbine];
+    }
+}
+
+// Computes the blade forces on the host; this implementation delegates to the elliptic wing model.
+void ActuatorFarm::calcBladeForces()
+{
+    calcForcesEllipticWing();
+}
+
+// Hands the indices of all nodes inside the turbines' bounding spheres to the grid provider, tagged with
+// CollisionTemplate::AllFeatures on this interactor's level. The parameter para is not used here.
+void ActuatorFarm::getTaggedFluidNodes(Parameter *para, GridProvider* gridProvider)
+{
+    const uint* firstIndex = this->boundingSphereIndicesH;
+    const uint* lastIndex = firstIndex+this->numberOfIndices;
+
+    std::vector<uint> indicesInSphere(firstIndex, lastIndex);
+    gridProvider->tagFluidNodeIndices(indicesInSphere, CollisionTemplate::AllFeatures, this->level);
+}
+
+
+// Derives the farm size from the turbines registered with addTurbine(), allocates the geometry and
+// orientation buffers, fills the host arrays from the preInit* vectors and transfers them to the device.
+// Finally computes factorGaussian from epsilon and forceRatio.
+void ActuatorFarm::initTurbineGeometries(CudaMemoryManager* cudaMemoryManager)
+{
+    numberOfTurbines = uint(preInitDiameters.size());
+    numberOfNodes = numberOfTurbines*numberOfBladeNodes*numberOfBlades;
+
+    cudaMemoryManager->cudaAllocBladeGeometries(this);
+    cudaMemoryManager->cudaAllocBladeOrientations(this);
+
+    // blade radii are stored once per turbine, i.e. indexed as blade 0 of a single-bladed turbine
+    for(uint turbine=0; turbine<numberOfTurbines; turbine++)
+    {
+        const std::vector<real>& radii = preInitBladeRadii[turbine];
+        for(uint bladeNode=0; bladeNode<numberOfBladeNodes; bladeNode++)
+        {
+            const uint radiusNode = calcNode(bladeNode, numberOfBladeNodes, 0, 1, turbine, numberOfTurbines);
+            bladeRadiiH[radiusNode] = radii[bladeNode];
+        }
+    }
+
+    // geometry: positions and diameters
+    std::copy(preInitPosX.begin(), preInitPosX.end(), turbinePosXH);
+    std::copy(preInitPosY.begin(), preInitPosY.end(), turbinePosYH);
+    std::copy(preInitPosZ.begin(), preInitPosZ.end(), turbinePosZH);
+    std::copy(preInitDiameters.begin(), preInitDiameters.end(), diametersH);
+    cudaMemoryManager->cudaCopyBladeGeometriesHtoD(this);
+
+    // orientation: azimuths, angular velocities and yaws
+    std::copy(preInitAzimuths.begin(), preInitAzimuths.end(), azimuthsH);
+    std::copy(preInitOmegas.begin(), preInitOmegas.end(), omegasH);
+    std::copy(preInitYaws.begin(), preInitYaws.end(), yawsH);
+    cudaMemoryManager->cudaCopyBladeOrientationsHtoD(this);
+
+    factorGaussian = pow(this->epsilon*sqrt(cPi),-c3o1)/this->forceRatio;
+}
+
+// Allocates the blade coordinate buffers and places every blade node at (0, 0, radius of that node).
+// The host data is transferred, the current/previous device pointers are swapped and the data is transferred
+// again - presumably so that both device buffers start with the same values (the target buffer of
+// cudaCopyBladeCoordsHtoD is not visible here).
+void ActuatorFarm::initBladeCoords(CudaMemoryManager* cudaMemoryManager)
+{
+    cudaMemoryManager->cudaAllocBladeCoords(this);
+
+    for(uint turbine=0; turbine<numberOfTurbines; turbine++)
+    {
+        for(uint blade=0; blade<numberOfBlades; blade++)
+        {
+            for(uint bladeNode=0; bladeNode<numberOfBladeNodes; bladeNode++)
+            {
+                const uint node = calcNode(bladeNode, numberOfBladeNodes, blade, numberOfBlades, turbine, numberOfTurbines);
+                // radii are stored per turbine only, hence blade 0 of 1
+                const uint radiusNode = calcNode(bladeNode, numberOfBladeNodes, 0, 1, turbine, numberOfTurbines);
+
+                bladeCoordsXH[node] = c0o1;
+                bladeCoordsYH[node] = c0o1;
+                bladeCoordsZH[node] = bladeRadiiH[radiusNode];
+            }
+        }
+    }
+
+    cudaMemoryManager->cudaCopyBladeCoordsHtoD(this);
+    swapArrays(bladeCoordsXDCurrentTimestep, bladeCoordsXDPreviousTimestep);
+    swapArrays(bladeCoordsYDCurrentTimestep, bladeCoordsYDPreviousTimestep);
+    swapArrays(bladeCoordsZDCurrentTimestep, bladeCoordsZDPreviousTimestep);
+    cudaMemoryManager->cudaCopyBladeCoordsHtoD(this);
+}
+
+// Allocates the blade velocity buffers and zero-initializes them.
+// Transfer, pointer swap, transfer: presumably fills both the current and the previous device buffer
+// (the target buffer of cudaCopyBladeVelocitiesHtoD is not visible here).
+void ActuatorFarm::initBladeVelocities(CudaMemoryManager* cudaMemoryManager)
+{
+    cudaMemoryManager->cudaAllocBladeVelocities(this);
+
+    std::fill(bladeVelocitiesXH, bladeVelocitiesXH+numberOfNodes, c0o1);
+    std::fill(bladeVelocitiesYH, bladeVelocitiesYH+numberOfNodes, c0o1);
+    std::fill(bladeVelocitiesZH, bladeVelocitiesZH+numberOfNodes, c0o1);
+
+    cudaMemoryManager->cudaCopyBladeVelocitiesHtoD(this);
+    swapArrays(bladeVelocitiesXDCurrentTimestep, bladeVelocitiesXDPreviousTimestep);
+    swapArrays(bladeVelocitiesYDCurrentTimestep, bladeVelocitiesYDPreviousTimestep);
+    swapArrays(bladeVelocitiesZDCurrentTimestep, bladeVelocitiesZDPreviousTimestep);
+    cudaMemoryManager->cudaCopyBladeVelocitiesHtoD(this);
+}
+
+// Allocates the blade force buffers and zero-initializes them.
+// Transfer, pointer swap, transfer: presumably fills both the current and the previous device buffer
+// (the target buffer of cudaCopyBladeForcesHtoD is not visible here).
+void ActuatorFarm::initBladeForces(CudaMemoryManager* cudaMemoryManager)
+{
+    cudaMemoryManager->cudaAllocBladeForces(this);
+
+    std::fill(bladeForcesXH, bladeForcesXH+numberOfNodes, c0o1);
+    std::fill(bladeForcesYH, bladeForcesYH+numberOfNodes, c0o1);
+    std::fill(bladeForcesZH, bladeForcesZH+numberOfNodes, c0o1);
+
+    cudaMemoryManager->cudaCopyBladeForcesHtoD(this);
+    swapArrays(bladeForcesXDCurrentTimestep, bladeForcesXDPreviousTimestep);
+    swapArrays(bladeForcesYDCurrentTimestep, bladeForcesYDPreviousTimestep);
+    swapArrays(bladeForcesZDCurrentTimestep, bladeForcesZDPreviousTimestep);
+    cudaMemoryManager->cudaCopyBladeForcesHtoD(this);
+}
+
+// Allocates the per-blade-node grid index buffer, sets every entry to 1 and transfers it to the device.
+// The parameter para is not used in this function.
+void ActuatorFarm::initBladeIndices(Parameter* para, CudaMemoryManager* cudaMemoryManager)
+{
+    cudaMemoryManager->cudaAllocBladeIndices(this);
+
+    std::fill(bladeIndicesH, bladeIndicesH+numberOfNodes, 1);
+
+    cudaMemoryManager->cudaCopyBladeIndicesHtoD(this);
+}
+
+// Collects the indices of all grid nodes on this level that lie inside the bounding sphere of at least one
+// turbine (radius = diameter/2 + 4*epsilon) and transfers the list to the device.
+// Throws std::runtime_error if a sphere contains fewer nodes than expected for a sphere of radius
+// (sphereRadius - deltaX) on a grid with spacing deltaX, i.e. if the sphere is not fully inside the domain.
+void ActuatorFarm::initBoundingSpheres(Parameter* para, CudaMemoryManager* cudaMemoryManager)
+{
+    std::vector<int> nodesInSpheres;
+
+    for(uint turbine=0; turbine<this->numberOfTurbines; turbine++)
+    {
+        const real sphereRadius = c1o2*this->diametersH[turbine]+c4o1*this->epsilon;
+        const real sphereRadiusSqrd = sphereRadius*sphereRadius;
+
+        const real posX = this->turbinePosXH[turbine];
+        const real posY = this->turbinePosYH[turbine];
+        const real posZ = this->turbinePosZH[turbine];
+
+        // volume of a slightly smaller sphere divided by the volume of one grid cell
+        const uint minimumNumberOfNodesPerSphere = (uint)(c4o3*cPi*pow(sphereRadius-this->deltaX, c3o1)/pow(this->deltaX, c3o1));
+        uint nodesInThisSphere = 0;
+
+        // the loop visits the node indices 1..numberOfNodes (inclusive)
+        for (size_t pos = 1; pos <= para->getParH(this->level)->numberOfNodes; pos++)
+        {
+            const real distX = para->getParH(this->level)->coordinateX[pos]-posX;
+            const real distY = para->getParH(this->level)->coordinateY[pos]-posY;
+            const real distZ = para->getParH(this->level)->coordinateZ[pos]-posZ;
+            if(distSqrd(distX,distY,distZ) < sphereRadiusSqrd) 
+            {
+                nodesInSpheres.push_back((int)pos);
+                nodesInThisSphere++;
+            }
+        }
+
+        if(nodesInThisSphere<minimumNumberOfNodesPerSphere)
+        {
+            VF_LOG_CRITICAL("Found only {} nodes in bounding sphere of turbine no. {}, expected at least {}!", nodesInThisSphere, turbine, minimumNumberOfNodesPerSphere);
+            throw std::runtime_error("ActuatorFarm::initBoundingSpheres: Turbine bounding sphere partially out of domain.");
+        }
+    }
+
+    // A node inside the spheres of several turbines was collected once per sphere. applyBodyForces already
+    // sums the contributions of all turbines for every listed index and adds the result with a non-atomic +=,
+    // so each node must be listed exactly once; duplicates would double-count forces and race on the update.
+    std::sort(nodesInSpheres.begin(), nodesInSpheres.end());
+    nodesInSpheres.erase(std::unique(nodesInSpheres.begin(), nodesInSpheres.end()), nodesInSpheres.end());
+
+    this->numberOfIndices = uint(nodesInSpheres.size());
+
+    cudaMemoryManager->cudaAllocSphereIndices(this);
+    std::copy(nodesInSpheres.begin(), nodesInSpheres.end(), this->boundingSphereIndicesH);
+    cudaMemoryManager->cudaCopySphereIndicesHtoD(this);
+}
+
+// Overwrites the host azimuths of all turbines; _azimuths must provide numberOfTurbines values.
+void ActuatorFarm::setAllAzimuths(real* _azimuths)
+{
+    std::copy(_azimuths, _azimuths+numberOfTurbines, azimuthsH);
+}
+
+// Overwrites the host angular velocities of all turbines; _omegas must provide numberOfTurbines values.
+void ActuatorFarm::setAllOmegas(real* _omegas)
+{
+    std::copy(_omegas, _omegas+numberOfTurbines, omegasH);
+}
+
+// Overwrites the host yaw angles of all turbines; _yaws must provide numberOfTurbines values.
+void ActuatorFarm::setAllYaws(real* _yaws)
+{
+    std::copy(_yaws, _yaws+numberOfTurbines, yawsH);
+}
+
+// Overwrites the host blade coordinates of the whole farm; each input must provide numberOfNodes values.
+void ActuatorFarm::setAllBladeCoords(real* _bladeCoordsX, real* _bladeCoordsY, real* _bladeCoordsZ)
+{
+    std::copy(_bladeCoordsX, _bladeCoordsX+numberOfNodes, bladeCoordsXH);
+    std::copy(_bladeCoordsY, _bladeCoordsY+numberOfNodes, bladeCoordsYH);
+    std::copy(_bladeCoordsZ, _bladeCoordsZ+numberOfNodes, bladeCoordsZH);
+}
+
+// Overwrites the host blade velocities of the whole farm; each input must provide numberOfNodes values.
+void ActuatorFarm::setAllBladeVelocities(real* _bladeVelocitiesX, real* _bladeVelocitiesY, real* _bladeVelocitiesZ)
+{
+    std::copy(_bladeVelocitiesX, _bladeVelocitiesX+numberOfNodes, bladeVelocitiesXH);
+    std::copy(_bladeVelocitiesY, _bladeVelocitiesY+numberOfNodes, bladeVelocitiesYH);
+    std::copy(_bladeVelocitiesZ, _bladeVelocitiesZ+numberOfNodes, bladeVelocitiesZH);
+}
+
+// Overwrites the host blade forces of the whole farm; each input must provide numberOfNodes values.
+void ActuatorFarm::setAllBladeForces(real* _bladeForcesX, real* _bladeForcesY, real* _bladeForcesZ)
+{
+    std::copy(_bladeForcesX, _bladeForcesX+numberOfNodes, bladeForcesXH);
+    std::copy(_bladeForcesY, _bladeForcesY+numberOfNodes, bladeForcesYH);
+    std::copy(_bladeForcesZ, _bladeForcesZ+numberOfNodes, bladeForcesZH);
+}
+
+// Overwrites the host blade coordinates of one turbine; each input must provide
+// numberOfBladeNodes*numberOfBlades values.
+void ActuatorFarm::setTurbineBladeCoords(uint turbine, real* _bladeCoordsX, real* _bladeCoordsY, real* _bladeCoordsZ)
+{
+    const uint nodesPerTurbine = numberOfBladeNodes*numberOfBlades;
+    const uint firstNode = turbine*numberOfBladeNodes*numberOfBlades;
+
+    std::copy(_bladeCoordsX, _bladeCoordsX+nodesPerTurbine, &bladeCoordsXH[firstNode]);
+    std::copy(_bladeCoordsY, _bladeCoordsY+nodesPerTurbine, &bladeCoordsYH[firstNode]);
+    std::copy(_bladeCoordsZ, _bladeCoordsZ+nodesPerTurbine, &bladeCoordsZH[firstNode]);
+}
+
+// Overwrites the host blade velocities of one turbine; each input must provide
+// numberOfBladeNodes*numberOfBlades values.
+void ActuatorFarm::setTurbineBladeVelocities(uint turbine, real* _bladeVelocitiesX, real* _bladeVelocitiesY, real* _bladeVelocitiesZ)
+{
+    const uint nodesPerTurbine = numberOfBladeNodes*numberOfBlades;
+    const uint firstNode = turbine*numberOfBladeNodes*numberOfBlades;
+
+    std::copy(_bladeVelocitiesX, _bladeVelocitiesX+nodesPerTurbine, &bladeVelocitiesXH[firstNode]);
+    std::copy(_bladeVelocitiesY, _bladeVelocitiesY+nodesPerTurbine, &bladeVelocitiesYH[firstNode]);
+    std::copy(_bladeVelocitiesZ, _bladeVelocitiesZ+nodesPerTurbine, &bladeVelocitiesZH[firstNode]);
+}
+
+// Overwrites the host blade forces of one turbine; each input must provide
+// numberOfBladeNodes*numberOfBlades values.
+void ActuatorFarm::setTurbineBladeForces(uint turbine, real* _bladeForcesX, real* _bladeForcesY, real* _bladeForcesZ)
+{
+    const uint nodesPerTurbine = numberOfBladeNodes*numberOfBlades;
+    const uint firstNode = turbine*numberOfBladeNodes*numberOfBlades;
+
+    std::copy(_bladeForcesX, _bladeForcesX+nodesPerTurbine, &bladeForcesXH[firstNode]);
+    std::copy(_bladeForcesY, _bladeForcesY+nodesPerTurbine, &bladeForcesYH[firstNode]);
+    std::copy(_bladeForcesZ, _bladeForcesZ+nodesPerTurbine, &bladeForcesZH[firstNode]);
+}
+
+// Exchanges the "previous timestep" and "current timestep" device buffers of the blade coordinates,
+// velocities and forces.
+void ActuatorFarm::swapDeviceArrays()
+{
+    // coordinates
+    swapArrays(bladeCoordsXDPreviousTimestep, bladeCoordsXDCurrentTimestep);
+    swapArrays(bladeCoordsYDPreviousTimestep, bladeCoordsYDCurrentTimestep);
+    swapArrays(bladeCoordsZDPreviousTimestep, bladeCoordsZDCurrentTimestep);
+
+    // velocities
+    swapArrays(bladeVelocitiesXDPreviousTimestep, bladeVelocitiesXDCurrentTimestep);
+    swapArrays(bladeVelocitiesYDPreviousTimestep, bladeVelocitiesYDCurrentTimestep);
+    swapArrays(bladeVelocitiesZDPreviousTimestep, bladeVelocitiesZDCurrentTimestep);
+
+    // forces
+    swapArrays(bladeForcesXDPreviousTimestep, bladeForcesXDCurrentTimestep);
+    swapArrays(bladeForcesYDPreviousTimestep, bladeForcesYDCurrentTimestep);
+    swapArrays(bladeForcesZDPreviousTimestep, bladeForcesZDCurrentTimestep);
+}
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/ActuatorFarm.h b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/ActuatorFarm.h
new file mode 100644
index 0000000000000000000000000000000000000000..8e21cdb6b21efd323f6723e21d6b28614109f1ec
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/ActuatorFarm.h
@@ -0,0 +1,197 @@
+#ifndef ActuatorFarm_H
+#define ActuatorFarm_H
+
+#include "PreCollisionInteractor.h"
+#include "PointerDefinitions.h"
+#include "lbm/constants/NumericConstants.h"
+#include <stdexcept>
+
+using namespace vf::lbm::constant;
+
+class Parameter;
+class GridProvider;
+using namespace vf::lbm::constant;
+
+// Pre-collision interactor that couples a set of turbines to the flow field. Every turbine consists of
+// numberOfBlades blades, each discretized into numberOfBladeNodes blade nodes. In interact() the flow
+// velocities are sampled at the blade nodes, blade forces are computed (calcBladeForces) and applied to the
+// grid as body forces.
+//
+// Naming: members ending in H are host arrays, members ending in D (or D...Timestep) are device arrays.
+// Per-node arrays (coordinates, velocities, forces) hold numberOfBladeNodes*numberOfBlades entries per
+// turbine; bladeRadii holds numberOfBladeNodes entries per turbine (one set of radii shared by all blades).
+class ActuatorFarm : public PreCollisionInteractor
+{
+public:
+    // _nBlades / _nBladeNodes: blades per turbine and nodes per blade (identical for all turbines)
+    // _density:                density used in the force computation
+    // _epsilon:                width of the Gaussian force distribution, must not be smaller than the grid spacing
+    // _level:                  grid level the farm acts on
+    // _deltaT / _deltaX:       time step and grid spacing, scaled by 2^(-_level) before being stored
+    // _useHostArrays:          if true, blade data is exchanged between host and device in every interact()
+    // Throws std::runtime_error if epsilon is smaller than the grid spacing of the chosen level.
+    ActuatorFarm(
+        const uint _nBlades,
+        const real _density,
+        const uint _nBladeNodes,
+        const real _epsilon,
+        int _level,
+        const real _deltaT,
+        const real _deltaX,
+        const bool _useHostArrays
+    ) :
+        // base class first, then members in declaration order (the order in which they are initialized)
+        PreCollisionInteractor(),
+        useHostArrays(_useHostArrays),
+        density(_density),
+        numberOfBladeNodes(_nBladeNodes),
+        numberOfBlades(_nBlades),
+        numberOfTurbines(0),
+        epsilon(_epsilon),
+        level(_level),
+        numberOfIndices(0),
+        numberOfNodes(0),
+        streamIndex(0)
+    {
+        this->deltaT = _deltaT*exp2(-this->level);
+        this->deltaX = _deltaX*exp2(-this->level);
+        this->invEpsilonSqrd = 1/(epsilon*epsilon);
+        this->invDeltaX = c1o1/this->deltaX;
+     
+        if(this->epsilon<this->deltaX)
+            throw std::runtime_error("ActuatorFarm::ActuatorFarm: epsilon needs to be larger than dx!");
+    }
+
+    ~ActuatorFarm() override = default;
+    // Registers a turbine; bladeRadii holds the radial positions of the blade nodes.
+    void addTurbine(real turbinePosX, real turbinePosY, real turbinePosZ, real diameter, real omega, real azimuth, real yaw, std::vector<real> bladeRadii);
+    void init(Parameter* para, GridProvider* gridProvider, CudaMemoryManager* cudaManager) override;
+    void interact(Parameter* para, CudaMemoryManager* cudaManager, int level, uint t) override;
+    void free(Parameter* para, CudaMemoryManager* cudaManager) override;
+    void getTaggedFluidNodes(Parameter *para, GridProvider* gridProvider) override;
+
+    void write(uint t);
+
+    real getDensity(){ return this->density; };
+    real getDeltaT(){ return this->deltaT; };
+    real getDeltaX(){ return this->deltaX; };
+
+    uint getNumberOfTurbines(){ return this->numberOfTurbines; };
+    uint getNumberOfNodesPerBlade(){ return this->numberOfBladeNodes; };
+    uint getNumberOfBladesPerTurbine(){ return this->numberOfBlades; };
+
+    uint getNumberOfIndices(){ return this->numberOfIndices; };
+    uint getNumberOfNodes(){ return this->numberOfNodes; };
+
+    // Host arrays with one entry per turbine
+    real* getAllAzimuths(){ return azimuthsH; };
+    real* getAllOmegas(){ return omegasH; };
+    real* getAllYaws(){ return yawsH; };
+
+    real* getAllTurbinePosX(){ return turbinePosXH; };
+    real* getAllTurbinePosY(){ return turbinePosYH; };
+    real* getAllTurbinePosZ(){ return turbinePosZH; };
+
+    real getTurbineAzimuth(uint turbine){ return azimuthsH[turbine]; };
+    real getTurbineOmega  (uint turbine){ return omegasH[turbine];   };
+    real getTurbineYaw    (uint turbine){ return yawsH[turbine];     };
+
+    real getTurbinePosX(uint turbine){ return turbinePosXH[turbine]; };
+    real getTurbinePosY(uint turbine){ return turbinePosYH[turbine]; };
+    real getTurbinePosZ(uint turbine){ return turbinePosZH[turbine]; };
+
+    // Host arrays of the whole farm
+    real* getAllBladeRadii(){ return this->bladeRadiiH; };
+    real* getAllBladeCoordsX(){ return this->bladeCoordsXH; };
+    real* getAllBladeCoordsY(){ return this->bladeCoordsYH; };
+    real* getAllBladeCoordsZ(){ return this->bladeCoordsZH; };
+    real* getAllBladeVelocitiesX(){ return this->bladeVelocitiesXH; };
+    real* getAllBladeVelocitiesY(){ return this->bladeVelocitiesYH; };
+    real* getAllBladeVelocitiesZ(){ return this->bladeVelocitiesZH; };
+    real* getAllBladeForcesX(){ return this->bladeForcesXH; };
+    real* getAllBladeForcesY(){ return this->bladeForcesYH; };
+    real* getAllBladeForcesZ(){ return this->bladeForcesZH; };
+
+    // Pointers to the first host entry of one turbine.
+    // The radii are stored once per turbine (numberOfBladeNodes entries), so their offset must not include
+    // numberOfBlades - same layout as used by getTurbineBladeRadiiDevice().
+    real* getTurbineBladeRadii(uint turbine){ return &this->bladeRadiiH[turbine*numberOfBladeNodes]; };
+    real* getTurbineBladeCoordsX(uint turbine){ return &this->bladeCoordsXH[turbine*numberOfBladeNodes*numberOfBlades]; };
+    real* getTurbineBladeCoordsY(uint turbine){ return &this->bladeCoordsYH[turbine*numberOfBladeNodes*numberOfBlades]; };
+    real* getTurbineBladeCoordsZ(uint turbine){ return &this->bladeCoordsZH[turbine*numberOfBladeNodes*numberOfBlades]; };
+    real* getTurbineBladeVelocitiesX(uint turbine){ return &this->bladeVelocitiesXH[turbine*numberOfBladeNodes*numberOfBlades]; };
+    real* getTurbineBladeVelocitiesY(uint turbine){ return &this->bladeVelocitiesYH[turbine*numberOfBladeNodes*numberOfBlades]; };
+    real* getTurbineBladeVelocitiesZ(uint turbine){ return &this->bladeVelocitiesZH[turbine*numberOfBladeNodes*numberOfBlades]; };
+    real* getTurbineBladeForcesX(uint turbine){ return &this->bladeForcesXH[turbine*numberOfBladeNodes*numberOfBlades]; };
+    real* getTurbineBladeForcesY(uint turbine){ return &this->bladeForcesYH[turbine*numberOfBladeNodes*numberOfBlades]; };
+    real* getTurbineBladeForcesZ(uint turbine){ return &this->bladeForcesZH[turbine*numberOfBladeNodes*numberOfBlades]; };
+
+    // Device arrays of the whole farm (the buffers currently marked as "current timestep")
+    real* getAllBladeRadiiDevice(){ return this->bladeRadiiD; };
+    real* getAllBladeCoordsXDevice(){ return this->bladeCoordsXDCurrentTimestep; };
+    real* getAllBladeCoordsYDevice(){ return this->bladeCoordsYDCurrentTimestep; };
+    real* getAllBladeCoordsZDevice(){ return this->bladeCoordsZDCurrentTimestep; };
+    real* getAllBladeVelocitiesXDevice(){ return this->bladeVelocitiesXDCurrentTimestep; };
+    real* getAllBladeVelocitiesYDevice(){ return this->bladeVelocitiesYDCurrentTimestep; };
+    real* getAllBladeVelocitiesZDevice(){ return this->bladeVelocitiesZDCurrentTimestep; };
+    real* getAllBladeForcesXDevice(){ return this->bladeForcesXDCurrentTimestep; };
+    real* getAllBladeForcesYDevice(){ return this->bladeForcesYDCurrentTimestep; };
+    real* getAllBladeForcesZDevice(){ return this->bladeForcesZDCurrentTimestep; };
+
+    // Pointers to the first device entry of one turbine
+    real* getTurbineBladeRadiiDevice(uint turbine){ return &this->bladeRadiiD[turbine*numberOfBladeNodes]; };
+    real* getTurbineBladeCoordsXDevice(uint turbine){ return &this->bladeCoordsXDCurrentTimestep[turbine*numberOfBladeNodes*numberOfBlades]; };
+    real* getTurbineBladeCoordsYDevice(uint turbine){ return &this->bladeCoordsYDCurrentTimestep[turbine*numberOfBladeNodes*numberOfBlades]; };
+    real* getTurbineBladeCoordsZDevice(uint turbine){ return &this->bladeCoordsZDCurrentTimestep[turbine*numberOfBladeNodes*numberOfBlades]; };
+    real* getTurbineBladeVelocitiesXDevice(uint turbine){ return &this->bladeVelocitiesXDCurrentTimestep[turbine*numberOfBladeNodes*numberOfBlades]; };
+    real* getTurbineBladeVelocitiesYDevice(uint turbine){ return &this->bladeVelocitiesYDCurrentTimestep[turbine*numberOfBladeNodes*numberOfBlades]; };
+    real* getTurbineBladeVelocitiesZDevice(uint turbine){ return &this->bladeVelocitiesZDCurrentTimestep[turbine*numberOfBladeNodes*numberOfBlades]; };
+    real* getTurbineBladeForcesXDevice(uint turbine){ return &this->bladeForcesXDCurrentTimestep[turbine*numberOfBladeNodes*numberOfBlades]; };
+    real* getTurbineBladeForcesYDevice(uint turbine){ return &this->bladeForcesYDCurrentTimestep[turbine*numberOfBladeNodes*numberOfBlades]; };
+    real* getTurbineBladeForcesZDevice(uint turbine){ return &this->bladeForcesZDCurrentTimestep[turbine*numberOfBladeNodes*numberOfBlades]; };
+
+    // Setters for the host arrays; the setAll* variants read numberOfTurbines values
+    void setAllAzimuths(real* _azimuth);
+    void setAllOmegas(real* _omegas);
+    void setAllYaws(real* _yaws);
+    
+    void setTurbineAzimuth(uint turbine, real azimuth){ azimuthsH[turbine] = azimuth; };
+    void setTurbineYaw(uint turbine, real yaw){ yawsH[turbine] = yaw; };
+    void setTurbineOmega(uint turbine, real omega){ omegasH[turbine] = omega; };
+
+    // setAllBlade* read numberOfNodes values per array, setTurbineBlade* read
+    // numberOfBladeNodes*numberOfBlades values per array
+    void setAllBladeCoords(real* _bladeCoordsX, real* _bladeCoordsY, real* _bladeCoordsZ);
+    void setAllBladeVelocities(real* _bladeVelocitiesX, real* _bladeVelocitiesY, real* _bladeVelocitiesZ);
+    void setAllBladeForces(real* _bladeForcesX, real* _bladeForcesY, real* _bladeForcesZ);
+
+    void setTurbineBladeCoords(uint turbine, real* _bladeCoordsX, real* _bladeCoordsY, real* _bladeCoordsZ);
+    void setTurbineBladeVelocities(uint turbine, real* _bladeVelocitiesX, real* _bladeVelocitiesY, real* _bladeVelocitiesZ);
+    void setTurbineBladeForces(uint turbine, real* _bladeForcesX, real* _bladeForcesY, real* _bladeForcesZ);
+
+    // Force model called by interact(); virtual so that derived classes can supply their own model.
+    virtual void calcBladeForces();
+
+private:
+    void initTurbineGeometries(CudaMemoryManager* cudaManager);
+    void initBoundingSpheres(Parameter* para, CudaMemoryManager* cudaManager);
+    void initBladeCoords(CudaMemoryManager* cudaManager);
+    void initBladeVelocities(CudaMemoryManager* cudaManager);
+    void initBladeForces(CudaMemoryManager* cudaManager);
+    void initBladeIndices(Parameter* para, CudaMemoryManager* cudaManager);
+
+    void calcForcesEllipticWing();
+    void rotateBlades(real angle, uint turbineID);
+
+    void writeBladeCoords(uint t);
+    void writeBladeForces(uint t);
+    void writeBladeVelocities(uint t);
+
+    void swapDeviceArrays();
+
+public:
+    // Raw buffers; they are handed to the CudaMemoryManager via the this pointer for allocation, copies and release.
+    real* bladeRadiiH;
+    real* bladeRadiiD;
+    real* bladeCoordsXH, * bladeCoordsYH, * bladeCoordsZH;
+    real* bladeCoordsXDPreviousTimestep, * bladeCoordsYDPreviousTimestep, * bladeCoordsZDPreviousTimestep;
+    real* bladeCoordsXDCurrentTimestep, * bladeCoordsYDCurrentTimestep, * bladeCoordsZDCurrentTimestep;    
+    real* bladeVelocitiesXH, * bladeVelocitiesYH, * bladeVelocitiesZH;
+    real* bladeVelocitiesXDPreviousTimestep, * bladeVelocitiesYDPreviousTimestep, * bladeVelocitiesZDPreviousTimestep;
+    real* bladeVelocitiesXDCurrentTimestep, * bladeVelocitiesYDCurrentTimestep, * bladeVelocitiesZDCurrentTimestep;
+    real* bladeForcesXH, * bladeForcesYH, * bladeForcesZH;
+    real* bladeForcesXDPreviousTimestep, * bladeForcesYDPreviousTimestep, * bladeForcesZDPreviousTimestep;
+    real* bladeForcesXDCurrentTimestep, * bladeForcesYDCurrentTimestep, * bladeForcesZDCurrentTimestep;
+    uint* bladeIndicesH;
+    uint* bladeIndicesD; 
+    uint* boundingSphereIndicesH;
+    uint* boundingSphereIndicesD;
+    real* turbinePosXH, *turbinePosYH, *turbinePosZH, *omegasH, *azimuthsH, *yawsH, *diametersH;
+    real* turbinePosXD, *turbinePosYD, *turbinePosZD, *omegasD, *azimuthsD, *yawsD, *diametersD;
+    
+private:
+    // turbine data buffered by addTurbine() until init() is called
+    std::vector<real> preInitPosX, preInitPosY, preInitPosZ, preInitDiameters, preInitOmegas, preInitAzimuths, preInitYaws;
+    std::vector<std::vector<real>> preInitBladeRadii;
+    const bool useHostArrays;
+    const real density;
+    real deltaT, deltaX;
+    const uint numberOfBladeNodes, numberOfBlades;
+    uint numberOfTurbines;
+    const real epsilon; // in m
+    const int level;
+    uint numberOfIndices; // number of grid nodes inside the bounding spheres
+    uint numberOfNodes;   // total number of blade nodes of the farm
+    real forceRatio, factorGaussian, invEpsilonSqrd, invDeltaX;
+    int streamIndex;
+};
+
+#endif
diff --git a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/ActuatorLine.cu b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/ActuatorLine.cu
deleted file mode 100644
index 71897bd21ea4fb299d3cc0ffa385506d4503f360..0000000000000000000000000000000000000000
--- a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/ActuatorLine.cu
+++ /dev/null
@@ -1,423 +0,0 @@
-#include "ActuatorLine.h"
-
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <helper_cuda.h>
-
-#include <cuda/CudaGrid.h>
-#include "VirtualFluids_GPU/GPU/GeometryUtils.h"
-
-#include "Parameter/Parameter.h"
-#include "DataStructureInitializer/GridProvider.h"
-#include "GPU/CudaMemoryManager.h"
-
-__host__ __device__ __inline__ uint calcNode(uint bladeNode, uint nBladeNodes, uint blade, uint nBlades)
-{
-    return bladeNode+blade*nBladeNodes;
-}
-
-__host__ __device__ __inline__ void calcBladeAndBladeNode(uint node, uint& bladeNode, uint nBladeNodes, uint& blade, uint nBlades)
-{
-    blade = node/nBladeNodes;
-    bladeNode = node - blade*nBladeNodes;
-}
-
-__host__ __device__ __forceinline__ real distSqrd(real distX, real distY, real distZ)
-{
-    return distX*distX+distY*distY+distZ*distZ;
-}
-
-__host__ __device__ __inline__ void rotateFromBladeToGlobal(
-                            real& bladeCoordX_BF, real& bladeCoordY_BF, real& bladeCoordZ_BF, 
-                            real& bladeCoordX_GF, real& bladeCoordY_GF, real& bladeCoordZ_GF,
-                            real& azimuth, real& yaw)
-{
-    real tmpX, tmpY, tmpZ;
-
-    rotateAboutX3D(azimuth, bladeCoordX_BF, bladeCoordY_BF, bladeCoordZ_BF, tmpX, tmpY, tmpZ);
-    rotateAboutZ3D(yaw, tmpX, tmpY, tmpZ, bladeCoordX_GF, bladeCoordY_GF, bladeCoordZ_GF);
-
-}
-
-__host__ __device__ __inline__ void rotateFromGlobalToBlade(
-                            real& bladeCoordX_BF, real& bladeCoordY_BF, real& bladeCoordZ_BF, 
-                            real& bladeCoordX_GF, real& bladeCoordY_GF, real& bladeCoordZ_GF,
-                            real& azimuth, real& yaw)
-{
-    real tmpX, tmpY, tmpZ;
-
-    invRotateAboutZ3D(yaw, bladeCoordX_GF, bladeCoordY_GF, bladeCoordZ_GF, tmpX, tmpY, tmpZ);
-    invRotateAboutX3D(azimuth, tmpX, tmpY, tmpZ, bladeCoordX_BF, bladeCoordY_BF, bladeCoordZ_BF);
-}
-
-__global__ void interpolateVelocities(real* gridCoordsX, real* gridCoordsY, real* gridCoordsZ, 
-                                      uint* neighborsX, uint* neighborsY, uint* neighborsZ, uint* neighborsWSB, 
-                                      real* vx, real* vy, real* vz, 
-                                      real* bladeCoordsX, real* bladeCoordsY, real* bladeCoordsZ,
-                                      real* bladeVelocitiesX, real* bladeVelocitiesY, real* bladeVelocitiesZ, 
-                                      uint nBlades, uint nBladeNodes, 
-                                      real azimuth, real yaw, real omega, 
-                                      real turbPosX, real turbPosY, real turbPosZ,
-                                      uint* bladeIndices, real velocityRatio, real invDeltaX)
-{
-    const uint x = threadIdx.x; 
-    const uint y = blockIdx.x;
-    const uint z = blockIdx.y;
-
-    const uint nx = blockDim.x;
-    const uint ny = gridDim.x;
-
-    const uint node = nx*(ny*z + y) + x;
-
-    uint bladeNode, blade;
-
-    calcBladeAndBladeNode(node, bladeNode, nBladeNodes, blade, nBlades);
-
-    if(node>=nBladeNodes*nBlades) return;
-
-    real bladeCoordX_BF = bladeCoordsX[node];
-    real bladeCoordY_BF = bladeCoordsY[node];
-    real bladeCoordZ_BF = bladeCoordsZ[node];
-
-    real bladeCoordX_GF, bladeCoordY_GF, bladeCoordZ_GF;
-
-    real localAzimuth = azimuth+blade*c2Pi/nBlades;
-
-    rotateFromBladeToGlobal(bladeCoordX_BF, bladeCoordY_BF, bladeCoordZ_BF, 
-                            bladeCoordX_GF, bladeCoordY_GF, bladeCoordZ_GF,
-                            localAzimuth, yaw);
-
-    bladeCoordX_GF += turbPosX;
-    bladeCoordY_GF += turbPosY;
-    bladeCoordZ_GF += turbPosZ;
-
-    uint k, ke, kn, kt;
-    uint kne, kte, ktn, ktne;
-
-    k = findNearestCellBSW(bladeIndices[node], 
-                           gridCoordsX, gridCoordsY, gridCoordsZ, 
-                           bladeCoordX_GF, bladeCoordY_GF, bladeCoordZ_GF, 
-                           neighborsX, neighborsY, neighborsZ, neighborsWSB);
-        
-    bladeIndices[node] = k;
-
-    getNeighborIndicesOfBSW(k, ke, kn, kt, kne, kte, ktn, ktne, neighborsX, neighborsY, neighborsZ);
-
-    real dW, dE, dN, dS, dT, dB;
-
-    real distX = invDeltaX*(bladeCoordX_GF-gridCoordsX[k]);
-    real distY = invDeltaX*(bladeCoordY_GF-gridCoordsY[k]);
-    real distZ = invDeltaX*(bladeCoordZ_GF-gridCoordsZ[k]);
-
-    getInterpolationWeights(dW, dE, dN, dS, dT, dB, distX, distY, distZ);
-
-    real bladeVelX_GF = trilinearInterpolation(dW, dE, dN, dS, dT, dB, k, ke, kn, kt, kne, kte, ktn, ktne, vx)*velocityRatio;
-    real bladeVelY_GF = trilinearInterpolation(dW, dE, dN, dS, dT, dB, k, ke, kn, kt, kne, kte, ktn, ktne, vy)*velocityRatio;
-    real bladeVelZ_GF = trilinearInterpolation(dW, dE, dN, dS, dT, dB, k, ke, kn, kt, kne, kte, ktn, ktne, vz)*velocityRatio;
-
-    real bladeVelX_BF, bladeVelY_BF, bladeVelZ_BF;
-
-    rotateFromGlobalToBlade(bladeVelX_BF, bladeVelY_BF, bladeVelZ_BF, 
-                            bladeVelX_GF, bladeVelY_GF, bladeVelZ_GF, 
-                            localAzimuth, yaw);
-
-    bladeVelocitiesX[node] = bladeVelX_BF;
-    bladeVelocitiesY[node] = bladeVelY_BF+omega*bladeCoordZ_BF;
-    bladeVelocitiesZ[node] = bladeVelZ_BF;
-}
-
-
-__global__ void applyBodyForces(real* gridCoordsX, real* gridCoordsY, real* gridCoordsZ,
-                                real* gridForcesX, real* gridForcesY, real* gridForcesZ, 
-                                real* bladeCoordsX, real* bladeCoordsY, real* bladeCoordsZ, 
-                                real* bladeForcesX, real* bladeForcesY,real* bladeForcesZ,
-                                uint nBlades, uint nBladeNodes,
-                                real azimuth, real yaw, real omega, 
-                                real turbPosX, real turbPosY, real turbPosZ,
-                                uint* gridIndices, uint nIndices, 
-                                real invEpsilonSqrd, real factorGaussian)
-{
-    const uint x = threadIdx.x; 
-    const uint y = blockIdx.x;
-    const uint z = blockIdx.y;
-
-    const uint nx = blockDim.x;
-    const uint ny = gridDim.x;
-
-    const uint index = nx*(ny*z + y) + x;
-
-    if(index>=nIndices) return;
-
-    uint gridIndex = gridIndices[index];
-
-    real gridCoordX_RF = gridCoordsX[gridIndex] - turbPosX;
-    real gridCoordY_RF = gridCoordsY[gridIndex] - turbPosY;
-    real gridCoordZ_RF = gridCoordsZ[gridIndex] - turbPosZ;
-
-    real gridForceX_RF = c0o1;
-    real gridForceY_RF = c0o1;
-    real gridForceZ_RF = c0o1;
-
-    real dAzimuth = c2Pi/nBlades;
-
-    for( uint blade=0; blade<nBlades; blade++)
-    { 
-        real localAzimuth = azimuth+blade*dAzimuth;
-
-        real gridCoordX_BF, gridCoordY_BF, gridCoordZ_BF;
-
-        rotateFromGlobalToBlade(gridCoordX_BF, gridCoordY_BF, gridCoordZ_BF,
-                                gridCoordX_RF, gridCoordY_RF, gridCoordZ_RF,
-                                localAzimuth, yaw);
-        
-        for( uint bladeNode=0; bladeNode<nBladeNodes; bladeNode++)
-        {
-            uint node = calcNode(bladeNode, nBladeNodes, blade, nBlades);
-
-            real eta = factorGaussian*exp(-distSqrd(bladeCoordsX[node]-gridCoordX_BF, bladeCoordsY[node]-gridCoordY_BF, bladeCoordsZ[node]-gridCoordZ_BF)*invEpsilonSqrd);
-            
-            real forceX_RF, forceY_RF, forceZ_RF;
-
-            rotateFromBladeToGlobal(bladeForcesX[node], bladeForcesY[node], bladeForcesZ[node], 
-                                    forceX_RF, forceY_RF, forceZ_RF, 
-                                    localAzimuth, yaw);
-            
-            gridForceX_RF += forceX_RF*eta;
-            gridForceY_RF += forceY_RF*eta;
-            gridForceZ_RF += forceZ_RF*eta;
-        }
-    }
-
-    atomicAdd(&gridForcesX[gridIndex], gridForceX_RF);
-    atomicAdd(&gridForcesY[gridIndex], gridForceY_RF);
-    atomicAdd(&gridForcesZ[gridIndex], gridForceZ_RF);
-}
-
-
-void ActuatorLine::init(Parameter* para, GridProvider* gridProvider, CudaMemoryManager* cudaMemoryManager)
-{
-    if(!para->getIsBodyForce()) throw std::runtime_error("try to allocate ActuatorLine but BodyForce is not set in Parameter.");
-    this->initBladeRadii(cudaMemoryManager);
-    this->initBladeCoords(cudaMemoryManager);    
-    this->initBladeIndices(para, cudaMemoryManager);
-    this->initBladeVelocities(cudaMemoryManager);
-    this->initBladeForces(cudaMemoryManager);    
-    this->initBoundingSphere(para, cudaMemoryManager);
-}
-
-
-void ActuatorLine::interact(Parameter* para, CudaMemoryManager* cudaMemoryManager, int level, unsigned int t)
-{
-    if (level != this->level) return;
-
-    cudaMemoryManager->cudaCopyBladeCoordsHtoD(this);
-
-    vf::cuda::CudaGrid bladeGrid = vf::cuda::CudaGrid(para->getParH(level)->numberofthreads, this->nNodes);
-
-    interpolateVelocities<<< bladeGrid.grid, bladeGrid.threads >>>(
-        para->getParD(this->level)->coordinateX, para->getParD(this->level)->coordinateY, para->getParD(this->level)->coordinateZ,        
-        para->getParD(this->level)->neighborX, para->getParD(this->level)->neighborY, para->getParD(this->level)->neighborZ, para->getParD(this->level)->neighborInverse,
-        para->getParD(this->level)->velocityX, para->getParD(this->level)->velocityY, para->getParD(this->level)->velocityZ,
-        this->bladeCoordsXD, this->bladeCoordsYD, this->bladeCoordsZD,  
-        this->bladeVelocitiesXD, this->bladeVelocitiesYD, this->bladeVelocitiesZD,  
-        this->nBlades, this->nBladeNodes,
-        this->azimuth, this->yaw, this->omega, 
-        this->turbinePosX, this->turbinePosY, this->turbinePosZ,
-        this->bladeIndicesD, para->getVelocityRatio(), this->invDeltaX);
-
-    cudaMemoryManager->cudaCopyBladeVelocitiesDtoH(this);
-
-    this->calcBladeForces();
-
-    cudaMemoryManager->cudaCopyBladeForcesHtoD(this);
-
-    vf::cuda::CudaGrid sphereGrid = vf::cuda::CudaGrid(para->getParH(level)->numberofthreads, this->nIndices);
-
-    applyBodyForces<<<sphereGrid.grid, sphereGrid.threads>>>(
-        para->getParD(this->level)->coordinateX, para->getParD(this->level)->coordinateY, para->getParD(this->level)->coordinateZ,        
-        para->getParD(this->level)->forceX_SP, para->getParD(this->level)->forceY_SP, para->getParD(this->level)->forceZ_SP,        
-        this->bladeCoordsXD, this->bladeCoordsYD, this->bladeCoordsZD,  
-        this->bladeForcesXD, this->bladeForcesYD, this->bladeForcesZD,
-        this->nBlades, this->nBladeNodes,
-        this->azimuth, this->yaw, this->omega, 
-        this->turbinePosX, this->turbinePosY, this->turbinePosZ,
-        this->boundingSphereIndicesD, this->nIndices,
-        this->invEpsilonSqrd, this->factorGaussian);
-
-    this->azimuth = fmod(this->azimuth+this->omega*this->deltaT,c2Pi);
-}
-
-
-void ActuatorLine::free(Parameter* para, CudaMemoryManager* cudaMemoryManager)
-{
-    cudaMemoryManager->cudaFreeBladeRadii(this);
-    cudaMemoryManager->cudaFreeBladeCoords(this);
-    cudaMemoryManager->cudaFreeBladeVelocities(this);
-    cudaMemoryManager->cudaFreeBladeForces(this);
-    cudaMemoryManager->cudaFreeBladeIndices(this);
-    cudaMemoryManager->cudaFreeSphereIndices(this);
-}
-
-
-void ActuatorLine::calcForcesEllipticWing()
-{
-    uint node;
-    real u_rel, v_rel, u_rel_sq;
-    real phi;
-    real Cl = c1o1;
-    real Cd = c0o1;
-    real c0 = c1o1;
-
-    real c, Cn, Ct;
-
-    for( uint blade=0; blade<this->nBlades; blade++)
-    { 
-        for( uint bladeNode=0; bladeNode<this->nBladeNodes; bladeNode++)
-        {        
-            node = calcNode(bladeNode, this->nBladeNodes, blade, this->nBlades);
-
-            u_rel = this->bladeVelocitiesXH[node];
-            v_rel = this->bladeVelocitiesYH[node];
-            u_rel_sq = u_rel*u_rel+v_rel*v_rel;
-            phi = atan2(u_rel, v_rel);
-            
-            real tmp = c4o1*this->bladeRadiiH[bladeNode]/this->diameter-c1o1;
-            c = c0 * sqrt( c1o1- tmp*tmp );
-            Cn = Cl*cos(phi)+Cd*sin(phi);
-            Ct = Cl*sin(phi)-Cd*cos(phi);
-        
-            this->bladeForcesXH[node] = -c1o2*u_rel_sq*c*this->density*Cn;
-            this->bladeForcesYH[node] = -c1o2*u_rel_sq*c*this->density*Ct;
-            this->bladeForcesZH[node] = c0o1;
-        }
-    }
-}
-
-void ActuatorLine::calcBladeForces()
-{
-    this->calcForcesEllipticWing();
-}
-
-void ActuatorLine::initBladeRadii(CudaMemoryManager* cudaMemoryManager)
-{   
-    cudaMemoryManager->cudaAllocBladeRadii(this);
-
-    real dr = c1o2*this->diameter/this->nBladeNodes;  
-
-    for(uint node=0; node<this->nBladeNodes; node++)
-    {
-        this->bladeRadiiH[node] = dr*(node+1);
-    }
-    cudaMemoryManager->cudaCopyBladeRadiiHtoD(this);
-
-    real dxOPiSqrtEps = pow(this->deltaX/(this->epsilon*sqrt(cPi)),c3o1);
-    this->factorGaussian = dr*dxOPiSqrtEps/this->forceRatio;
-}
-
-void ActuatorLine::initBladeCoords(CudaMemoryManager* cudaMemoryManager)
-{   
-    cudaMemoryManager->cudaAllocBladeCoords(this);
-
-    for(uint blade=0; blade<this->nBlades; blade++)
-    {
-        for(uint bladeNode=0; bladeNode<this->nBladeNodes; bladeNode++)
-        {
-            uint node = calcNode(bladeNode, this->nBladeNodes, blade, this->nBlades);
-
-            this->bladeCoordsXH[node] = c0o1;
-            this->bladeCoordsYH[node] = c0o1;
-            this->bladeCoordsZH[node] = this->bladeRadiiH[bladeNode];
-        }
-    }
-    cudaMemoryManager->cudaCopyBladeCoordsHtoD(this);
-}
-
-void ActuatorLine::initBladeVelocities(CudaMemoryManager* cudaMemoryManager)
-{   
-    cudaMemoryManager->cudaAllocBladeVelocities(this);
-
-    for(uint node=0; node<this->nNodes; node++)
-    {
-        this->bladeVelocitiesXH[node] = c0o1;
-        this->bladeVelocitiesYH[node] = c0o1;
-        this->bladeVelocitiesZH[node] = c0o1;
-    }
-    cudaMemoryManager->cudaCopyBladeVelocitiesHtoD(this);
-}
-
-void ActuatorLine::initBladeForces(CudaMemoryManager* cudaMemoryManager)
-{   
-    cudaMemoryManager->cudaAllocBladeForces(this);
-
-    for(uint node=0; node<this->nNodes; node++)
-    {
-        this->bladeForcesXH[node] = c0o1;
-        this->bladeForcesYH[node] = c0o1;
-        this->bladeForcesZH[node] = c0o1;
-    }
-    cudaMemoryManager->cudaCopyBladeForcesHtoD(this);
-}
-
-void ActuatorLine::initBladeIndices(Parameter* para, CudaMemoryManager* cudaMemoryManager)
-{   
-    cudaMemoryManager->cudaAllocBladeIndices(this);
-
-    for(uint node=0; node<this->nNodes; node++)
-
-    {
-        this->bladeIndicesH[node] = 1;
-    }
-    cudaMemoryManager->cudaCopyBladeIndicesHtoD(this);
-}
-
-void ActuatorLine::initBoundingSphere(Parameter* para, CudaMemoryManager* cudaMemoryManager)
-{
-    // Actuator line exists only on 1 level
-    std::vector<int> nodesInSphere;
-    real sphereRadius = c1o2*this->diameter+c4o1*this->epsilon;
-    real sphereRadiusSqrd = sphereRadius*sphereRadius;
-
-    for (uint j = 1; j <= para->getParH(this->level)->numberOfNodes; j++)
-    {
-        const real distX = para->getParH(this->level)->coordinateX[j]-this->turbinePosX;
-        const real distY = para->getParH(this->level)->coordinateY[j]-this->turbinePosY;
-        const real distZ = para->getParH(this->level)->coordinateZ[j]-this->turbinePosZ;
-        if(distSqrd(distX,distY,distZ) < sphereRadiusSqrd) nodesInSphere.push_back(j);
-    }
-
-    this->nIndices = uint(nodesInSphere.size());
-    cudaMemoryManager->cudaAllocSphereIndices(this);
-    std::copy(nodesInSphere.begin(), nodesInSphere.end(), this->boundingSphereIndicesH);
-    cudaMemoryManager->cudaCopySphereIndicesHtoD(this);
-}
-
-void ActuatorLine::setBladeCoords(real* _bladeCoordsX, real* _bladeCoordsY, real* _bladeCoordsZ)
-{ 
-
-    for(uint node=0; node<this->nNodes; node++)
-    {
-        this->bladeCoordsXH[node] = _bladeCoordsX[node];
-        this->bladeCoordsYH[node] = _bladeCoordsY[node];
-        this->bladeCoordsZH[node] = _bladeCoordsZ[node];
-    }
-}
-
-void ActuatorLine::setBladeVelocities(real* _bladeVelocitiesX, real* _bladeVelocitiesY, real* _bladeVelocitiesZ)
-{ 
-    for(uint node=0; node<this->nNodes; node++)
-    {
-        this->bladeVelocitiesXH[node] = _bladeVelocitiesX[node];
-        this->bladeVelocitiesYH[node] = _bladeVelocitiesY[node];
-        this->bladeVelocitiesZH[node] = _bladeVelocitiesZ[node];
-    }
-}
-
-void ActuatorLine::setBladeForces(real* _bladeForcesX, real* _bladeForcesY, real* _bladeForcesZ)
-{ 
-    for(uint node=0; node<this->nNodes; node++)
-    {
-        this->bladeForcesXH[node] = _bladeForcesX[node];
-        this->bladeForcesYH[node] = _bladeForcesY[node];
-        this->bladeForcesZH[node] = _bladeForcesZ[node];
-    }
-}
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/ActuatorLine.h b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/ActuatorLine.h
deleted file mode 100644
index b44c89c5020eb206baa3bba1994b1e45f760c3bb..0000000000000000000000000000000000000000
--- a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/ActuatorLine.h
+++ /dev/null
@@ -1,120 +0,0 @@
-#ifndef ActuatorLine_H
-#define ActuatorLine_H
-
-#include "PreCollisionInteractor.h"
-#include "PointerDefinitions.h"
-#include "VirtualFluids_GPU_export.h"
-#include "lbm/constants/NumericConstants.h"
-
-class Parameter;
-class GridProvider;
-
-using namespace vf::lbm::constant;
-class VIRTUALFLUIDS_GPU_EXPORT ActuatorLine : public PreCollisionInteractor
-{
-public:
-    ActuatorLine(
-        const uint _nBlades,
-        const real _density,
-        const uint _nBladeNodes,
-        const real _epsilon,
-        real _turbinePosX, real _turbinePosY, real _turbinePosZ,
-        const real _diameter,
-        int _level,
-        const real _deltaT,
-        const real _deltaX
-    ) : nBlades(_nBlades),
-        density(_density),
-        nBladeNodes(_nBladeNodes), 
-        epsilon(_epsilon),
-        turbinePosX(_turbinePosX), turbinePosY(_turbinePosY), turbinePosZ(_turbinePosZ),
-        diameter(_diameter),
-        level(_level),
-        PreCollisionInteractor()
-    {
-        this->deltaT = _deltaT*exp2(-this->level);
-        this->deltaX = _deltaX*exp2(-this->level);
-        this->invDeltaX = c1o1/this->deltaX;
-        this->forceRatio = this->density*pow(this->deltaX,4)*pow(this->deltaT,-2);
-        this->invEpsilonSqrd = c1o1/(this->epsilon*this->epsilon);
-        this->nNodes = this->nBladeNodes*this->nBlades;
-        this->omega = c1o1;
-        this->azimuth = c0o1;
-        this->yaw = c0o1;
-    };
-
-    virtual ~ActuatorLine(){};
-
-    void init(Parameter* para, GridProvider* gridProvider, CudaMemoryManager* cudaMemoryManager) override;
-    void interact(Parameter* para, CudaMemoryManager* cudaMemoryManager, int level, uint t) override;
-    void free(Parameter* para, CudaMemoryManager* cudaMemoryManager) override;
-    void write(uint t);
-
-    uint getNBladeNodes(){ return this->nBladeNodes; };
-    uint getNBlades(){ return this->nBlades;};
-    uint getNIndices(){ return this->nIndices; };
-    uint getNNodes(){ return this->nNodes; };
-    real getOmega(){ return this->omega; };
-    real getAzimuth(){ return this->azimuth; };
-    real getYaw(){ return this->yaw; };
-    real getDensity(){ return this->density; };
-    real getPositionX(){ return this->turbinePosX; };
-    real getPositionY(){ return this->turbinePosY; };
-    real getPositionZ(){ return this->turbinePosZ; };
-    real* getBladeRadii(){ return this->bladeRadiiH; };
-    real* getBladeCoordsX(){ return this->bladeCoordsXH; };
-    real* getBladeCoordsY(){ return this->bladeCoordsYH; };
-    real* getBladeCoordsZ(){ return this->bladeCoordsZH; };
-    real* getBladeVelocitiesX(){ return this->bladeVelocitiesXH; };
-    real* getBladeVelocitiesY(){ return this->bladeVelocitiesYH; };
-    real* getBladeVelocitiesZ(){ return this->bladeVelocitiesZH; };
-    real* getBladeForcesX(){ return this->bladeForcesXH; };
-    real* getBladeForcesY(){ return this->bladeForcesYH; };
-    real* getBladeForcesZ(){ return this->bladeForcesZH; };
-
-    void setOmega(real _omega){ this->omega = _omega; };
-    void setAzimuth(real _azimuth){ this->azimuth = _azimuth; };
-    void setYaw(real _yaw){ this->yaw = _yaw; };
-    void setBladeCoords(real* _bladeCoordsX, real* _bladeCoordsY, real* _bladeCoordsZ);
-    void setBladeVelocities(real* _bladeVelocitiesX, real* _bladeVelocitiesY, real* _bladeVelocitiesZ);
-    void setBladeForces(real* _bladeForcesX, real* _bladeForcesY, real* _bladeForcesZ);
-    virtual void calcBladeForces();
-
-private:
-    void initBoundingSphere(Parameter* para, CudaMemoryManager* cudaMemoryManager);
-
-    void initBladeRadii(CudaMemoryManager* cudaMemoryManager);
-    void initBladeCoords(CudaMemoryManager* cudaMemoryManager);
-    void initBladeVelocities(CudaMemoryManager* cudaMemoryManager);
-    void initBladeForces(CudaMemoryManager* cudaMemoryManager);
-    void initBladeIndices(Parameter* para, CudaMemoryManager* cudaMemoryManager);
-
-    void calcForcesEllipticWing();
-
-public:
-    real* bladeRadiiH;
-    real* bladeRadiiD;
-    real* bladeCoordsXH, * bladeCoordsYH, * bladeCoordsZH;
-    real* bladeCoordsXD, * bladeCoordsYD, * bladeCoordsZD;
-    real* bladeVelocitiesXH, * bladeVelocitiesYH, * bladeVelocitiesZH;
-    real* bladeVelocitiesXD, * bladeVelocitiesYD, * bladeVelocitiesZD;
-    real* bladeForcesXH, * bladeForcesYH, * bladeForcesZH;
-    real* bladeForcesXD, * bladeForcesYD, * bladeForcesZD;
-    uint* bladeIndicesH;
-    uint* bladeIndicesD; 
-    uint* boundingSphereIndicesH;
-    uint* boundingSphereIndicesD;
-    
-private:
-    const real density;
-    real turbinePosX, turbinePosY, turbinePosZ;
-    real omega, azimuth, yaw, deltaT, deltaX, invDeltaX, forceRatio, factorGaussian, invEpsilonSqrd;
-    const real diameter;
-    const uint nBladeNodes;
-    const uint nBlades;
-    const real epsilon; // in m
-    const int level;
-    uint nIndices, nNodes;
-};
-
-#endif
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/PreCollisionInteractor.h b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/PreCollisionInteractor.h
index a9b233f3035890c2617d3a00b639f995be6c218f..f9a87f613e7607301e59a7c1e67eb556418892e4 100644
--- a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/PreCollisionInteractor.h
+++ b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/PreCollisionInteractor.h
@@ -33,6 +33,7 @@ public:
     virtual void init(Parameter *para, GridProvider *gridProvider, CudaMemoryManager *cudaMemoryManager) = 0;
     virtual void interact(Parameter *para, CudaMemoryManager *cudaMemoryManager, int level, uint t) = 0;
     virtual void free(Parameter *para, CudaMemoryManager *cudaMemoryManager) = 0;
+    virtual void getTaggedFluidNodes(Parameter *para, GridProvider* gridProvider) = 0;
 
 protected:
     uint updateInterval;
diff --git a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/PrecursorWriter.cu b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/PrecursorWriter.cu
new file mode 100644
index 0000000000000000000000000000000000000000..1a8260ef936e2707fb38fbbba71cdbfac692f350
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/PrecursorWriter.cu
@@ -0,0 +1,359 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file PrecursorWriter.cu
+//! \ingroup PreCollisionInteractor
+//! \author Henrik Asmuth, Henry Korb
+//======================================================================================
+#include "PrecursorWriter.h"
+#include "basics/writer/WbWriterVtkXmlImageBinary.h"
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <helper_cuda.h>
+#include "cuda/CudaGrid.h"
+#include "LBM/GPUHelperFunctions/KernelUtilities.h"
+
+#include "Core/StringUtilities/StringUtil.h"
+
+#include "Parameter/Parameter.h"
+#include "DataStructureInitializer/GridProvider.h"
+#include "GPU/CudaMemoryManager.h"
+
+using namespace vf::lbm::dir;
+using namespace vf::gpu;
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// TODO: check everything for multiple levels
+void index1d(int& idx, int y, int z, int ny, int nz) // flattens the pair (y, z) into idx (output by reference); nz is not used
+{
+    idx = y+ny*z; // y varies fastest: each step in z skips ny entries
+}
+
+void index2d(int idx, int& y, int& z, int ny, int nz) // splits idx into (y, z) (outputs by reference) for the layout idx = y+ny*z; nz is not used
+{
+    z = idx/ny; // integer division: number of complete groups of ny entries before idx
+    y = idx-ny*z; // remainder of idx within that group
+}
+
+__inline__ __host__ __device__ uint linearIdx(const uint component, const uint node, const uint timestep, const uint numberOfComponents, const uint numberOfNodes) // flat offset of (timestep, component, node)
+{
+    return node+numberOfNodes*(component+numberOfComponents*timestep); // node varies fastest, then component, then timestep
+}
+
+__inline__ __host__ __device__ uint linearIdx(const uint component, const uint node, const uint numberOfNodes) // flat offset of (component, node); overload without a timestep dimension
+{
+    return node+component*numberOfNodes; // each component occupies a contiguous run of numberOfNodes entries
+}
+
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+__global__ void fillArrayVelocities(const uint numberOfPrecursorNodes, // threads with nodeIndex >= this value return without writing
+                                    uint* indices, // indices[nodeIndex] selects the entry read from vx, vy and vz
+                                    real *precursorData, // output: three consecutive runs of numberOfPrecursorNodes entries (slots 0, 1, 2)
+                                    real *vx,
+                                    real *vy,
+                                    real *vz,
+                                    real velocityRatio) // factor multiplied onto every velocity before it is stored
+
+
+{
+    ////////////////////////////////////////////////////////////////////////////////
+    //! - Get node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
+    //!   (obtained through vf::gpu::getNodeIndex())
+    const unsigned nodeIndex = vf::gpu::getNodeIndex();
+
+    if(nodeIndex>=numberOfPrecursorNodes) return; // guard for surplus threads of the launch
+
+    precursorData[linearIdx(0u, nodeIndex, numberOfPrecursorNodes)] = vx[indices[nodeIndex]]*velocityRatio; // slot 0 <- scaled vx
+    precursorData[linearIdx(1u, nodeIndex, numberOfPrecursorNodes)] = vy[indices[nodeIndex]]*velocityRatio; // slot 1 <- scaled vy
+    precursorData[linearIdx(2u, nodeIndex, numberOfPrecursorNodes)] = vz[indices[nodeIndex]]*velocityRatio; // slot 2 <- scaled vz
+}
+
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+__global__ void fillArrayDistributions( uint numberOfPrecursorNodes, // threads with nodeIndex >= this value return without writing
+                                        uint* indices, // indices[nodeIndex] gives the node k_000 whose distributions are copied
+                                        real* precursorData, // output: one run of numberOfPrecursorNodes entries per Prec* slot
+                                        real* distributions, // forwarded to getPointersToDistributions
+                                        uint* neighborX, uint* neighborY, uint* neighborZ, // neighborX is only referenced by commented-out code below
+                                        bool isEvenTimestep, // forwarded to getPointersToDistributions
+                                        unsigned long numberOfLBnodes) // forwarded to getPointersToDistributions
+{
+    ////////////////////////////////////////////////////////////////////////////////
+    //! - Get node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
+    //!   (obtained through vf::gpu::getNodeIndex())
+    const unsigned nodeIndex = vf::gpu::getNodeIndex();
+
+    if(nodeIndex>=numberOfPrecursorNodes) return; // guard for surplus threads of the launch
+
+    Distributions27 dist;
+    getPointersToDistributions(dist, distributions, numberOfLBnodes, isEvenTimestep); // fills dist.f[...] used below
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    //! - Set neighbor indices (necessary for indirect addressing)
+    uint k_000 = indices[nodeIndex];
+    // uint k_M00 = neighborX[k_000];
+    uint k_0M0 = neighborY[k_000];
+    uint k_00M = neighborZ[k_000];
+    // uint k_MM0 = neighborY[k_M00];
+    // uint k_M0M = neighborZ[k_M00];
+    uint k_0MM = neighborZ[k_0M0];
+    // uint k_MMM = neighborZ[k_MM0];
+
+    ////////////////////////////////////////////////////////////////////////////////////
+    //! - Get local distributions in PX directions
+    //!   Nine dist.f[DIR_P..] values are stored; the node they are read from (k_000, k_0M0, k_00M or k_0MM) depends on the direction.
+    precursorData[linearIdx(PrecP00, nodeIndex, numberOfPrecursorNodes)] = (dist.f[DIR_P00])[k_000];
+    precursorData[linearIdx(PrecPP0, nodeIndex, numberOfPrecursorNodes)] = (dist.f[DIR_PP0])[k_000];
+    precursorData[linearIdx(PrecPM0, nodeIndex, numberOfPrecursorNodes)] = (dist.f[DIR_PM0])[k_0M0];
+    precursorData[linearIdx(PrecP0P, nodeIndex, numberOfPrecursorNodes)] = (dist.f[DIR_P0P])[k_000];
+    precursorData[linearIdx(PrecP0M, nodeIndex, numberOfPrecursorNodes)] = (dist.f[DIR_P0M])[k_00M];
+    precursorData[linearIdx(PrecPPP, nodeIndex, numberOfPrecursorNodes)] = (dist.f[DIR_PPP])[k_000];
+    precursorData[linearIdx(PrecPMP, nodeIndex, numberOfPrecursorNodes)] = (dist.f[DIR_PMP])[k_0M0];
+    precursorData[linearIdx(PrecPPM, nodeIndex, numberOfPrecursorNodes)] = (dist.f[DIR_PPM])[k_00M];
+    precursorData[linearIdx(PrecPMM, nodeIndex, numberOfPrecursorNodes)] = (dist.f[DIR_PMM])[k_0MM];
+}
+
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//! Finds, on every grid level, the fluid nodes of the yz-plane to record and prepares the per-level PrecursorStruct.
+void PrecursorWriter::init(Parameter* para, GridProvider* gridProvider, CudaMemoryManager* cudaManager)
+{
+    VF_LOG_INFO("PrecursorWriter: Start initializing...");
+    VF_LOG_INFO("Writing yz-planes at x={}m every {}. timestep, starting at t={}", this->xPos, this->tSave, this->tStartOut);
+
+    precursorStructs.resize(para->getMaxLevel()+1);
+    for(int level=0; level<=para->getMaxLevel(); level++)
+    {
+        // Grid spacing of this level, measured between node 1 and its neighbor in x-direction.
+        real dx = abs(para->getParH(level)->coordinateX[1]-para->getParH(level)->coordinateX[para->getParH(level)->neighborX[1]]);
+
+        // Bounding box of the selected nodes. Seeding with the opposite user limit is neutral because every selected
+        // node satisfies yMin<=y<=yMax and zMin<=z<=zMax, so the box never depends on unrelated grid nodes.
+        real lowestY = yMax, highestY = yMin;
+        real lowestZ = zMax, highestZ = zMin;
+
+        std::vector<uint> indicesOnGrid;
+        std::vector<int> indicesOnPlane;
+        std::vector<real> coordY, coordZ;
+
+        // Select fluid nodes in the slab xPos<=x<xPos+dx that lie inside the requested y- and z-range.
+        for(size_t pos = 1; pos < para->getParH(level)->numberOfNodes; pos++ )
+        {
+            real pointCoordX = para->getParH(level)->coordinateX[pos];
+            real pointCoordY = para->getParH(level)->coordinateY[pos];
+            real pointCoordZ = para->getParH(level)->coordinateZ[pos];
+            if( para->getParH(level)->typeOfGridNode[pos] == GEO_FLUID &&
+                pointCoordX < (dx+xPos) && pointCoordX >= xPos       &&
+                pointCoordY<=yMax && pointCoordY>=yMin               &&
+                pointCoordZ<=zMax && pointCoordZ>=zMin)
+            {
+                highestY = max(highestY, pointCoordY);
+                highestZ = max(highestZ, pointCoordZ);
+                lowestY = min(lowestY, pointCoordY);
+                lowestZ = min(lowestZ, pointCoordZ);
+
+                indicesOnGrid.push_back((uint)pos);
+                coordY.push_back(pointCoordY);
+                coordZ.push_back(pointCoordZ);
+            }
+        }
+        if(indicesOnGrid.size()==0)
+            throw std::runtime_error("PrecursorWriter did not find any points on the grid");
+
+        // Number of lattice points spanned by the bounding box in y and z.
+        int ny = int((highestY-lowestY)/dx)+1;
+        int nz = int((highestZ-lowestZ)/dx)+1;
+
+        // Position of every selected node inside the ny x nz plane (flattened by index1d).
+        for(uint i=0;i<indicesOnGrid.size(); i++)
+        {
+                int idxY = int((coordY[i]-lowestY)/dx);
+                int idxZ = int((coordZ[i]-lowestZ)/dx);
+                int idx;
+                index1d(idx, idxY, idxZ, ny, nz);
+                indicesOnPlane.push_back(idx);
+        }
+
+        precursorStructs[level] = SPtr<PrecursorStruct>(new PrecursorStruct);
+        precursorStructs[level]->numberOfPointsInBC = (uint)indicesOnGrid.size();
+        precursorStructs[level]->indicesOnPlane = (int*) malloc(precursorStructs[level]->numberOfPointsInBC*sizeof(int));
+        precursorStructs[level]->spacing = makeUbTuple(dx, dx, tSave*para->getTimeRatio()*pow(2,-level));
+        precursorStructs[level]->origin = makeUbTuple(lowestY, lowestZ);
+        precursorStructs[level]->extent = makeUbTuple(0, ny-1, 0, nz-1);
+        precursorStructs[level]->numberOfPointsInData = ny*nz;
+        precursorStructs[level]->numberOfTimestepsPerFile = min(para->getlimitOfNodesForVTK()/(ny*nz), maxtimestepsPerFile);
+        precursorStructs[level]->numberOfFilesWritten = 0;
+        precursorStructs[level]->numberOfTimestepsBuffered = 0;
+
+        // Values stored per node: three velocity components or nine distributions.
+        switch (outputVariable)
+        {
+        case OutputVariable::Velocities:
+            precursorStructs[level]->numberOfQuantities = 3;
+            break;
+        case OutputVariable::Distributions:
+            precursorStructs[level]->numberOfQuantities = 9;
+            break;
+        default:
+            break;
+        }
+
+        // indicesH is filled below, so it has to be provided by this call before the copies.
+        cudaManager->cudaAllocPrecursorWriter(this, level);
+
+        std::copy(indicesOnGrid.begin(), indicesOnGrid.end(), precursorStructs[level]->indicesH);
+        std::copy(indicesOnPlane.begin(), indicesOnPlane.end(), precursorStructs[level]->indicesOnPlane);
+        cudaManager->cudaCopyPrecursorWriterIndicesHtoD(this, level);
+
+        VF_LOG_INFO("Found {} points in precursor plane on level {}", precursorStructs[level]->numberOfPointsInBC, level);
+    }
+    VF_LOG_INFO("PrecursorWriter: Done initializing.");
+}
+
+
+//! Records one yz-plane on the given level whenever a save step is reached and hands full buffers to an asynchronous file write.
+void PrecursorWriter::interact(Parameter* para, CudaMemoryManager* cudaManager, int level, uint t)
+{
+    uint t_level         = para->getTimeStep(level, t, true);
+    uint tStartOut_level = tStartOut*pow(2, level);
+
+    // Record every tSave-th level timestep after tStartOut_level.
+    if(t_level>tStartOut_level && ((t_level-tStartOut_level) % tSave)==0)
+    {
+        vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(para->getParH(level)->numberofthreads, precursorStructs[level]->numberOfPointsInBC);
+        if(this->outputVariable==OutputVariable::Velocities)
+        {
+            fillArrayVelocities<<<grid.grid, grid.threads>>>(   precursorStructs[level]->numberOfPointsInBC, precursorStructs[level]->indicesD,
+                                                                precursorStructs[level]->bufferD,
+                                                                para->getParD(level)->velocityX, para->getParD(level)->velocityY, para->getParD(level)->velocityZ,
+                                                                para->getVelocityRatio());
+            getLastCudaError("In PrecursorWriter::interact fillArrayVelocities execution failed");
+        }
+        else if(this->outputVariable==OutputVariable::Distributions)
+        {
+            fillArrayDistributions<<<grid.grid, grid.threads>>>(precursorStructs[level]->numberOfPointsInBC, precursorStructs[level]->indicesD,
+                                                                precursorStructs[level]->bufferD,
+                                                                para->getParD(level)->distributions.f[0],
+                                                                para->getParD(level)->neighborX, para->getParD(level)->neighborY, para->getParD(level)->neighborZ,
+                                                                para->getEvenOrOdd(level), para->getParD(level)->numberOfNodes);
+            getLastCudaError("In PrecursorWriter::interact fillArrayDistributions execution failed");
+        }
+        cudaManager->cudaCopyPrecursorWriterOutputVariablesDtoH(this, level);
+
+        // switch device buffer and data pointer so precursor data is gathered in buffer and copied from bufferD to bufferH
+        real *tmp = precursorStructs[level]->bufferD;
+        precursorStructs[level]->bufferD = precursorStructs[level]->dataD;
+        precursorStructs[level]->dataD = tmp;
+
+        precursorStructs[level]->numberOfTimestepsBuffered++;
+
+        // Flush once the per-file capacity is reached or the last simulation timestep is processed.
+        if(precursorStructs[level]->numberOfTimestepsBuffered >= precursorStructs[level]->numberOfTimestepsPerFile || t == para->getTimestepEnd())
+        {
+            // switch host buffer and data pointer so precursor data is copied in buffer and written from data
+            tmp = precursorStructs[level]->bufferH;
+            precursorStructs[level]->bufferH = precursorStructs[level]->dataH;
+            precursorStructs[level]->dataH = tmp;
+            // One future is shared by all levels: wait for the previous write before launching the next one.
+            writeFuture.wait();
+            writeFuture = std::async(std::launch::async, [this](Parameter* para, uint level, uint timesteps){ this->write(para, level, timesteps); }, para, level, precursorStructs[level]->numberOfTimestepsBuffered);
+            precursorStructs[level]->numberOfTimestepsBuffered = 0;
+        }
+    }
+}
+
+
+void PrecursorWriter::free(Parameter* para, CudaMemoryManager* cudaManager)
+{
+    // Let a still running asynchronous write finish before the remaining timesteps are flushed and memory is released.
+    writeFuture.wait();
+    for(int level=0; level<=para->getMaxLevel(); level++)
+    {
+        const uint remainingTimesteps = getPrecursorStruct(level)->numberOfTimestepsBuffered;
+        if(remainingTimesteps>0) write(para, level, remainingTimesteps);
+        cudaManager->cudaFreePrecursorWriter(this, level);
+    }
+}
+
+
+//! Writes the given number of buffered timesteps of one level into a single image file, one node-data array per quantity.
+void PrecursorWriter::write(Parameter* para, int level, uint numberOfTimestepsBuffered)
+{
+    SPtr<PrecursorStruct> precursor = precursorStructs[level];
+    const uint pointsPerPlane = precursor->numberOfPointsInData;
+
+    std::string wholeName = outputPath + "/" + this->makeFileName(fileName, level, para->getMyProcessID(), precursor->numberOfFilesWritten) + getWriter()->getFileExtension();
+
+    // The third extent axis counts timesteps and continues where the previous file of this level ended.
+    int firstStep = precursor->numberOfFilesWritten*precursor->numberOfTimestepsPerFile;
+    int lastStep  = firstStep+(int)numberOfTimestepsBuffered-1;
+    UbTupleInt6 extent = makeUbTuple(val<1>(precursor->extent), val<2>(precursor->extent), val<3>(precursor->extent), val<4>(precursor->extent), firstStep, lastStep);
+    UbTupleFloat3 origin = makeUbTuple(val<1>(precursor->origin), val<2>(precursor->origin), 0.f);
+
+    // Plane points without a recorded node keep the NAN fill value.
+    std::vector<std::vector<double>> nodedata;
+    for(uint quant=0; quant<precursor->numberOfQuantities; quant++)
+    {
+        std::vector<double> planeValues(pointsPerPlane*numberOfTimestepsBuffered, NAN);
+        for(uint timestep=0; timestep<numberOfTimestepsBuffered; timestep++)
+        {
+            for(uint pos=0; pos<precursor->numberOfPointsInBC; pos++)
+            {
+                const int target = precursor->indicesOnPlane[pos]+timestep*pointsPerPlane;
+                planeValues[target] = double(precursor->dataH[linearIdx(quant, pos, timestep, precursor->numberOfQuantities, precursor->numberOfPointsInBC)]);
+            }
+        }
+        nodedata.push_back(planeValues);
+    }
+
+    // No cell data is written; the writer interface still expects the (empty) container.
+    std::vector<std::vector<double>> celldata;
+    getWriter()->writeData(wholeName, nodedatanames, celldatanames, nodedata, celldata, extent, origin, precursor->spacing, extent, this->writePrecision);
+    precursor->numberOfFilesWritten++;
+}
+
+std::string PrecursorWriter::makeFileName(std::string fileName, int level, int id, uint numberOfFilesWritten)
+{   // Pattern: <fileName>_lev_<level>_ID_<id>_File_<numberOfFilesWritten> (no file extension)
+    fileName.append("_lev_").append(StringUtil::toString<int>(level));
+    fileName.append("_ID_").append(StringUtil::toString<int>(id));
+    return fileName.append("_File_").append(StringUtil::toString<int>(numberOfFilesWritten));
+}
+
+//! Tags the recorded nodes of every level as CollisionTemplate::WriteMacroVars when velocities are written.
+void PrecursorWriter::getTaggedFluidNodes(Parameter *para, GridProvider* gridProvider)
+{
+    if(outputVariable!=OutputVariable::Velocities) return; // only the velocity output reads velocityX/Y/Z in interact()
+    // Levels are 0..getMaxLevel() inclusive, as in init(); the former '<' skipped the finest (or only) level.
+    for(int level=0; level<=para->getMaxLevel(); level++)
+    {
+        std::vector<uint> indices(precursorStructs[level]->indicesH, precursorStructs[level]->indicesH+precursorStructs[level]->numberOfPointsInBC);
+        gridProvider->tagFluidNodeIndices(indices, CollisionTemplate::WriteMacroVars, level);
+    }
+}
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/PrecursorWriter.h b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/PrecursorWriter.h
new file mode 100644
index 0000000000000000000000000000000000000000..264023b58ba6db46b50f6a85b334c530864a0b8f
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/PrecursorWriter.h
@@ -0,0 +1,161 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __         
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |        
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |        
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |        
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____    
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|   
+//      \    \  |    |   ________________________________________________________________    
+//       \    \ |    |  |  ______________________________________________________________|   
+//        \    \|    |  |  |         __          __     __     __     ______      _______    
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)   
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______    
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/   
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can 
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of 
+//  the License, or (at your option) any later version.
+//  
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT 
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
+//  for more details.
+//  
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file PrecursorWriter.h
+//! \author Henry Korb, Henrik Asmuth
+//! \date 05/12/2022
+//! \brief Probe writing planes of data to be used as inflow data in successor simulation using PrecursorBC
+//!
+//! The probe writes out yz-planes at a specific x position ( \param xPos ) of either velocity or distributions 
+//! that can be read by PrecursorBC as inflow data.
+//=======================================================================================
+
+
+#ifndef PRECURSORPROBE_H_
+#define PRECURSORPROBE_H_
+
+#include "PreCollisionInteractor.h"
+#include "WbWriterVtkXmlImageBinary.h"
+#include "LBM/LB.h"
+#include <string>
+#include <vector>
+#include <future>
+#include "PointerDefinitions.h"
+#include "Logger.h"
+
+class Parameter;
+class CudaMemoryManager;
+class GridProvider;
+
+enum class OutputVariable {
+    //! Record the velocities: three values per node, written as "vx", "vy", "vz"
+    Velocities,
+    //! Record distributions: nine values per node, written as "fP00" ... "fPMM"
+    Distributions    
+};
+
+static constexpr uint PrecP00 = 0; // Quantity slots of the nine stored distributions within one precursor record.
+static constexpr uint PrecPP0 = 1; // The numbering follows the node-data names "fP00" ... "fPMM" of PrecursorWriter.
+static constexpr uint PrecPM0 = 2;
+static constexpr uint PrecP0P = 3;
+static constexpr uint PrecP0M = 4;
+static constexpr uint PrecPPP = 5; // e.g. slot of dist.f[DIR_PPP] when the distributions are gathered
+static constexpr uint PrecPMP = 6;
+static constexpr uint PrecPPM = 7;
+static constexpr uint PrecPMM = 8;
+
+struct PrecursorStruct // Per-level bookkeeping of the PrecursorWriter
+{
+    uint numberOfPointsInBC, numberOfPointsInData, numberOfTimestepsPerFile, numberOfFilesWritten, numberOfTimestepsBuffered; // recorded nodes, ny*nz plane points, file capacity, file counter, timesteps currently buffered
+    uint *indicesH, *indicesD; // grid indices of the recorded nodes; indicesH is filled in init(), indicesD is passed to the kernels
+    real *dataH, *dataD; // swapped with the buffer pointers in PrecursorWriter::interact; write() reads from dataH
+    real *bufferH, *bufferD; // bufferD is the kernel output; NOTE(review): allocation and copies live in CudaMemoryManager - confirm there
+    uint numberOfQuantities; // values per node: 3 for velocities, 9 for distributions
+    UbTupleInt4 extent; // (0, ny-1, 0, nz-1)
+    UbTupleFloat2 origin; // (lowestY, lowestZ) of the recorded nodes
+    UbTupleFloat3 spacing; // (dx, dx, tSave*timeRatio*2^-level)
+    int* indicesOnPlane; // flattened plane index of every recorded node, malloc'ed in PrecursorWriter::init
+    cudaStream_t stream; // NOTE(review): not referenced by the PrecursorWriter member functions shown in PrecursorWriter.cu - confirm usage
+};
+
+//! PreCollisionInteractor that records yz-planes of velocities or distributions and writes them via WbWriterVtkXmlImageBinary.
+class PrecursorWriter : public PreCollisionInteractor
+{
+public:
+    PrecursorWriter(
+        const std::string _fileName,                // prefix of the written file names
+        const std::string _outputPath,              // directory; joined with the file name by "/"
+        real _xPos,                                 // nodes with xPos <= x < xPos+dx are recorded
+        real _yMin, real _yMax,                     // inclusive y-range of recorded nodes
+        real _zMin, real _zMax,                     // inclusive z-range of recorded nodes
+        uint _tStartOut,                            // recording starts after this timestep
+        uint _tSave,                                // a plane is recorded every tSave-th timestep
+        OutputVariable _outputVariable,             // velocities or distributions
+        uint _maxTimestepsPerFile=uint(1e4)         // upper bound for the timesteps stored in one file
+    ): 
+    fileName(_fileName), 
+    outputPath(_outputPath), 
+    xPos(_xPos),
+    yMin(_yMin),
+    yMax(_yMax),
+    zMin(_zMin),
+    zMax(_zMax),
+    tStartOut(_tStartOut), 
+    tSave(_tSave),
+    outputVariable(_outputVariable),
+    maxtimestepsPerFile(_maxTimestepsPerFile)
+    {
+        nodedatanames = determineNodeDataNames();
+        writeFuture = std::async([](){}); // no-op task, so writeFuture.wait() may be called before the first real write
+    };
+
+    void init(Parameter* para, GridProvider* gridProvider, CudaMemoryManager* cudaManager) override;
+    void interact(Parameter* para, CudaMemoryManager* cudaManager, int level, uint t) override;
+    void free(Parameter* para, CudaMemoryManager* cudaManager) override;
+    void getTaggedFluidNodes(Parameter *para, GridProvider* gridProvider) override;
+
+    OutputVariable getOutputVariable(){ return this->outputVariable; }
+
+    SPtr<PrecursorStruct> getPrecursorStruct(int level){return precursorStructs[level];}
+    static std::string makeFileName(std::string fileName, int level, int id, uint part);
+
+    void setWritePrecision(uint _writePrecision){ this->writePrecision=_writePrecision;} // value is forwarded to writeData()
+private:
+    WbWriterVtkXmlImageBinary* getWriter(){ return WbWriterVtkXmlImageBinary::getInstance(); };
+    void write(Parameter* para, int level, uint numberOfTimestepsBuffered);
+
+    std::vector<std::string> determineNodeDataNames() // names of the node-data arrays, one per recorded quantity
+    {
+        switch (outputVariable)
+        {
+        case OutputVariable::Velocities:
+            return {"vx", "vy", "vz"};
+            break;       
+        case OutputVariable::Distributions:
+            return {"fP00", "fPP0", "fPM0", "fP0P", "fP0M", "fPPP", "fPMP", "fPPM", "fPMM"};
+            break;
+        
+        default:
+            throw std::runtime_error("Invalid OutputVariable for PrecursorWriter");
+            break;
+        }
+    }
+
+private:
+    std::vector<SPtr<PrecursorStruct>> precursorStructs; // one entry per grid level, created in init()
+    std::string fileName, outputPath;
+    std::vector<std::string> nodedatanames;
+    std::vector<std::string> celldatanames; // never filled in this header; passed to writeData() as is
+    uint tStartOut, tSave, maxtimestepsPerFile;
+    real xPos, yMin, yMax, zMin, zMax;
+    OutputVariable outputVariable;
+    std::future<void> writeFuture; // handle of the most recent asynchronous file write (shared by all levels)
+    uint writePrecision = 8;
+};
+
+#endif //PRECURSORPROBE_H_
diff --git a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PlanarAverageProbe.cu b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PlanarAverageProbe.cu
index f5b520acfad74f6787e9e657fce3ccdceed9d539..e89d392b5d4bf5983f9bb47642fef81d0f06cc89 100644
--- a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PlanarAverageProbe.cu
+++ b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PlanarAverageProbe.cu
@@ -15,6 +15,7 @@
 #include "Parameter/Parameter.h"
 #include "DataStructureInitializer/GridProvider.h"
 #include "GPU/CudaMemoryManager.h"
+#include "GPU/GPU_Interface.h"
 
 #include <algorithm>
 
@@ -235,7 +236,7 @@ void PlanarAverageProbe::findPoints(Parameter* para, GridProvider* gridProvider,
                                 }
 
     // Find all points along the normal direction
-    for(uint j=1; j<para->getParH(level)->numberOfNodes; j++ )
+    for(size_t j = 1; j < para->getParH(level)->numberOfNodes; j++ )
     {
         if(para->getParH(level)->typeOfGridNode[j] == GEO_FLUID)
         {   
@@ -250,16 +251,16 @@ void PlanarAverageProbe::findPoints(Parameter* para, GridProvider* gridProvider,
     std::sort(pointCoordsNormal->begin(), pointCoordsNormal->end());
     
     // Find all pointCoords in the first plane 
-    for(uint j=1; j<para->getParH(level)->numberOfNodes; j++ )
+    for(size_t pos = 1; pos < para->getParH(level)->numberOfNodes; pos++ )
     {
-        if( para->getParH(level)->typeOfGridNode[j] == GEO_FLUID && pointCoordsNormal_par[j] == pointCoordsNormal->at(0)) 
+        if( para->getParH(level)->typeOfGridNode[pos] == GEO_FLUID && pointCoordsNormal_par[pos] == pointCoordsNormal->at(0)) 
         {
             //not needed in current state, might become relevant for two-point correlations
             // pointCoordsNormal->push_back( pointCoordsNormal_par[j] ); 
             // pointCoordsInplane1->push_back( pointCoordsInplane1_par[j] );
             // pointCoordsInplane2->push_back( pointCoordsInplane2_par[j] );
 
-            probeIndices_level.push_back(j);
+            probeIndices_level.push_back((int)pos);
         }
     }
 }
@@ -268,6 +269,23 @@ void PlanarAverageProbe::findPoints(Parameter* para, GridProvider* gridProvider,
 
 void PlanarAverageProbe::calculateQuantities(SPtr<ProbeStruct> probeStruct, Parameter* para, uint t_level, int level)
 {   
+    // Compute macroscopic variables in entire domain
+    CalcMacCompSP27(
+        para->getParD(level)->velocityX, 
+        para->getParD(level)->velocityY, 
+        para->getParD(level)->velocityZ,
+        para->getParD(level)->rho, 
+        para->getParD(level)->pressure, 
+        para->getParD(level)->typeOfGridNode,
+        para->getParD(level)->neighborX, 
+        para->getParD(level)->neighborY,
+        para->getParD(level)->neighborZ, 
+        para->getParD(level)->numberOfNodes,
+        para->getParD(level)->numberofthreads, 
+        para->getParD(level)->distributions.f[0],
+        para->getParD(level)->isEvenTimestep);
+    getLastCudaError("In PlanarAverageProbe Kernel CalcMacSP27 execution failed");
+
     // Definition of normal and inplane directions for moveIndices kernels
     uint *neighborNormal, *neighborInplane1, *neighborInplane2;
     if( this->planeNormal == 'x' )
diff --git a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PlanarAverageProbe.h b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PlanarAverageProbe.h
index d11f8e76e4d13113b201af5494b7d0cfcfe18353..3d3533f74501e776f9150c83c9d9101a0be7ecbc 100644
--- a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PlanarAverageProbe.h
+++ b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PlanarAverageProbe.h
@@ -74,6 +74,7 @@ public:
         planeNormal(_planeNormal)
 
     {   
+        if (_tStartTmpAvg<_tStartAvg)   throw std::runtime_error("Probe: tStartTmpAvg must be larger than tStartAvg!");
         if(!(_planeNormal == 'x' || _planeNormal == 'y' || _planeNormal == 'z')) 
             throw std::runtime_error("PlanarAverageProbe: planeNormal must be 'x', 'y' or 'z'!");
     }
diff --git a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PlaneProbe.cu b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PlaneProbe.cu
index 7d1c0205219737e4b28acbb1a893a0a6071ae9de..f55045505bff0e3b5b0b1426be4e9e1a3832d088 100644
--- a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PlaneProbe.cu
+++ b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PlaneProbe.cu
@@ -76,11 +76,11 @@ void PlaneProbe::findPoints(Parameter* para, GridProvider* gridProvider, std::ve
                             int level)
 {
     real dx = abs(para->getParH(level)->coordinateX[1]-para->getParH(level)->coordinateX[para->getParH(level)->neighborX[1]]);
-    for(uint j=1; j<para->getParH(level)->numberOfNodes; j++ )
+    for(size_t pos = 1; pos < para->getParH(level)->numberOfNodes; pos++ )
     {
-        real pointCoordX = para->getParH(level)->coordinateX[j];
-        real pointCoordY = para->getParH(level)->coordinateY[j];
-        real pointCoordZ = para->getParH(level)->coordinateZ[j];
+        real pointCoordX = para->getParH(level)->coordinateX[pos];
+        real pointCoordY = para->getParH(level)->coordinateY[pos];
+        real pointCoordZ = para->getParH(level)->coordinateZ[pos];
         real distX = pointCoordX - this->posX;
         real distY = pointCoordY - this->posY;
         real distZ = pointCoordZ - this->posZ;
@@ -88,7 +88,7 @@ void PlaneProbe::findPoints(Parameter* para, GridProvider* gridProvider, std::ve
         if( distX <= this->deltaX && distY <= this->deltaY && distZ <= this->deltaZ &&
             distX >=0.f && distY >=0.f && distZ >=0.f)
         {
-            probeIndices_level.push_back(j);
+            probeIndices_level.push_back((int)pos);
             distX_level.push_back( distX/dx );
             distY_level.push_back( distY/dx );
             distZ_level.push_back( distZ/dx );
@@ -106,4 +106,14 @@ void PlaneProbe::calculateQuantities(SPtr<ProbeStruct> probeStruct, Parameter* p
     para->getParD(level)->velocityX, para->getParD(level)->velocityY, para->getParD(level)->velocityZ, para->getParD(level)->rho, 
     para->getParD(level)->neighborX, para->getParD(level)->neighborY, para->getParD(level)->neighborZ, 
     probeStruct->quantitiesD, probeStruct->arrayOffsetsD, probeStruct->quantitiesArrayD);
+}
+
+void PlaneProbe::getTaggedFluidNodes(Parameter *para, GridProvider* gridProvider)
+{   // Tags all probe nodes of every level (0..getMaxLevel()) with CollisionTemplate::WriteMacroVars.
+    for(int level=0; level<=para->getMaxLevel(); level++)
+    {
+        SPtr<ProbeStruct> probeStruct = this->getProbeStruct(level);
+        std::vector<uint> probeIndices( probeStruct->pointIndicesH, probeStruct->pointIndicesH+probeStruct->nIndices); // copy of the nIndices host-side node indices
+        gridProvider->tagFluidNodeIndices( probeIndices, CollisionTemplate::WriteMacroVars, level);
+    }
 }
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PlaneProbe.h b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PlaneProbe.h
index 3440c01020f9b3505be7148024e47373b76648ff..180169707a6d7f3f7975f6a2bc4009f7c0aba527 100644
--- a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PlaneProbe.h
+++ b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PlaneProbe.h
@@ -54,7 +54,7 @@ public:
     ): Probe(_probeName, 
              _outputPath,
              _tStartAvg, 
-             0,
+             _tStartAvg+1,
              _tAvg,
              _tStartOut, 
              _tOut,
@@ -72,6 +72,8 @@ public:
         this->deltaZ = _deltaZ; 
     }
 
+    void getTaggedFluidNodes(Parameter *para, GridProvider* gridProvider) override;
+
 private:
     bool isAvailableStatistic(Statistic _variable) override;
 
diff --git a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PointProbe.cu b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PointProbe.cu
index e78a98f02ac2093fc46b4daa4a2485ed1395275b..89e1f6b87687ed42c079415a5340f1d385c8d62c 100644
--- a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PointProbe.cu
+++ b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PointProbe.cu
@@ -75,20 +75,20 @@ void PointProbe::findPoints(Parameter* para, GridProvider* gridProvider, std::ve
 {
 
     real dx = abs(para->getParH(level)->coordinateX[1]-para->getParH(level)->coordinateX[para->getParH(level)->neighborX[1]]);
-    for(uint j=1; j<para->getParH(level)->numberOfNodes; j++ )
+    for(size_t pos = 1; pos < para->getParH(level)->numberOfNodes; pos++ )
     {    
         for(uint point=0; point<this->pointCoordsX.size(); point++)
         {
             real pointCoordX = this->pointCoordsX[point];
             real pointCoordY = this->pointCoordsY[point];
             real pointCoordZ = this->pointCoordsZ[point];
-            real distX = pointCoordX-para->getParH(level)->coordinateX[j];
-            real distY = pointCoordY-para->getParH(level)->coordinateY[j];
-            real distZ = pointCoordZ-para->getParH(level)->coordinateZ[j];
+            real distX = pointCoordX-para->getParH(level)->coordinateX[pos];
+            real distY = pointCoordY-para->getParH(level)->coordinateY[pos];
+            real distZ = pointCoordZ-para->getParH(level)->coordinateZ[pos];
             if( distX <=dx && distY <=dx && distZ <=dx &&
                 distX >0.f && distY >0.f && distZ >0.f)
             {
-                probeIndices_level.push_back(j);
+                probeIndices_level.push_back((int)pos);
                 distX_level.push_back( distX/dx );
                 distY_level.push_back( distY/dx );
                 distZ_level.push_back( distZ/dx );
@@ -140,4 +140,14 @@ void PointProbe::addProbePointsFromXNormalPlane(real pos_x, real pos0_y, real po
     }
     printf("Added %u  points \n",  n_y*n_z);
 
+}
+
+void PointProbe::getTaggedFluidNodes(Parameter *para, GridProvider* gridProvider)
+{   // Tags all probe nodes of every level (0..getMaxLevel()) with CollisionTemplate::WriteMacroVars.
+    for(int level=0; level<=para->getMaxLevel(); level++)
+    {
+        SPtr<ProbeStruct> probeStruct = this->getProbeStruct(level);
+        std::vector<uint> probeIndices( probeStruct->pointIndicesH, probeStruct->pointIndicesH+probeStruct->nIndices); // copy of the nIndices host-side node indices
+        gridProvider->tagFluidNodeIndices( probeIndices, CollisionTemplate::WriteMacroVars, level);
+    }
 }
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PointProbe.h b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PointProbe.h
index 6a6fbe76f089acfafc22672dd3e9d71bd193a3b3..08c359705f03b20fbd3276fe209b6ff4d782a5e5 100644
--- a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PointProbe.h
+++ b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PointProbe.h
@@ -64,6 +64,7 @@ public:
 
     void addProbePointsFromList(std::vector<real>& _pointCoordsX, std::vector<real>& _pointCoordsY, std::vector<real>& _pointCoordsZ);
     void addProbePointsFromXNormalPlane(real pos_x, real pos0_y, real pos0_z, real pos1_y, real pos1_z, uint n_y, uint n_z);
+    void getTaggedFluidNodes(Parameter *para, GridProvider* gridProvider) override;
     
 private:
     bool isAvailableStatistic(Statistic _variable) override;
diff --git a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/Probe.cu b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/Probe.cu
index cc027b07bded01455437e65e08ccdcd51bcf7dc0..03c18f5a9a2133bec244053113209abc70469a2a 100644
--- a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/Probe.cu
+++ b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/Probe.cu
@@ -187,7 +187,7 @@ void Probe::init(Parameter* para, GridProvider* gridProvider, CudaMemoryManager*
     this->velocityRatio      = std::bind(&Parameter::getScaledVelocityRatio,        para, _1); 
     this->densityRatio       = std::bind(&Parameter::getScaledDensityRatio,         para, _1);
     this->forceRatio         = std::bind(&Parameter::getScaledForceRatio,           para, _1);
-    this->stressRatio        = std::bind(&Parameter::getScaledPressureRatio,        para, _1);
+    this->stressRatio        = std::bind(&Parameter::getScaledStressRatio,          para, _1);
     this->viscosityRatio     = std::bind(&Parameter::getScaledViscosityRatio,       para, _1);
     this->nondimensional     = std::bind(&Probe::getNondimensionalConversionFactor, this, _1);
 
@@ -315,6 +315,12 @@ void Probe::free(Parameter* para, CudaMemoryManager* cudaMemoryManager)
     }
 }
 
+void Probe::getTaggedFluidNodes(Parameter *para, GridProvider* gridProvider)
+{
+    // Default: a probe tags no fluid nodes. PlaneProbe and PointProbe override this to request WriteMacroVars.
+}
+
+
 void Probe::addStatistic(Statistic variable)
 {
     if (!this->isAvailableStatistic(variable)) throw std::runtime_error("Probe::addStatistic(): Statistic not available for this probe type!");
@@ -329,6 +335,22 @@ void Probe::addStatistic(Statistic variable)
     }
 }
 
+std::string Probe::makeParallelFileName(int id, int t)
+{   // Pattern: <probeName>_bin_ID_<id>_t_<t>.vtk
+    std::string name = this->probeName + "_bin_ID_";
+    name.append(StringUtil::toString<int>(id)).append("_t_");
+    return name.append(StringUtil::toString<int>(t)).append(".vtk");
+}
+
+std::string Probe::makeGridFileName(int level, int id, int t, uint part)
+{   // Pattern: <probeName>_bin_lev_<level>_ID_<id>_Part_<part>_t_<t>.vtk
+    std::string name = this->probeName + "_bin_lev_";
+    name.append(StringUtil::toString<int>(level)).append("_ID_");
+    name.append(StringUtil::toString<int>(id)).append("_Part_");
+    name.append(StringUtil::toString<int>(part)).append("_t_");
+    return name.append(StringUtil::toString<int>(t)).append(".vtk");
+}
+
 void Probe::addAllAvailableStatistics()
 {
     for( int var=0; var < int(Statistic::LAST); var++)
@@ -347,119 +369,76 @@ void Probe::write(Parameter* para, int level, int t)
     std::vector<std::string> fnames;
     for (uint i = 1; i <= numberOfParts; i++)
 	{
-        std::string fname = this->probeName + "_bin_lev_" + StringUtil::toString<int>(level)
-                                         + "_ID_" + StringUtil::toString<int>(para->getMyProcessID())
-                                         + "_Part_" + StringUtil::toString<int>(i);
-        if(!this->outputTimeSeries) fname += "_t_" + StringUtil::toString<int>(t_write);
-        fname += ".vtk";
-		fnames.push_back(fname);
-        this->fileNamesForCollectionFile.push_back(fname);
+        this->writeGridFile(para, level, t_write, i);
     }
-    this->writeGridFiles(para, level, fnames, t);
-
-    if(level == 0 && !this->outputTimeSeries) this->writeCollectionFile(para, t);
+    if(level == 0&& !this->outputTimeSeries) this->writeParallelFile(para, t);
 }
 
-void Probe::writeCollectionFile(Parameter* para, int t)
+void Probe::writeParallelFile(Parameter* para, int t)
 {
     int t_write = this->fileNameLU ? t: t/this->tOut; 
-    std::string filename = this->probeName + "_bin_ID_" + StringUtil::toString<int>(para->getMyProcessID()) 
-                                           + "_t_" + StringUtil::toString<int>(t_write) 
-                                           + ".vtk";
-
-    std::ofstream file;
-
-    file.open(this->outputPath + "/" + filename + ".pvtu" );
-
-    //////////////////////////////////////////////////////////////////////////
-    
-    file << "<VTKFile type=\"PUnstructuredGrid\" version=\"1.0\" byte_order=\"LittleEndian\" header_type=\"UInt64\">" << std::endl;
-    file << "  <PUnstructuredGrid GhostLevel=\"1\">" << std::endl;
-
-    file << "    <PPointData>" << std::endl;
-
-    for(std::string varName: this->getVarNames()) //TODO
-    {
-        file << "       <DataArray type=\"Float64\" Name=\""<< varName << "\" /> " << std::endl;
-    }
-    file << "    </PPointData>" << std::endl;
-
-    file << "    <PPoints>" << std::endl;
-    file << "      <PDataArray type=\"Float32\" Name=\"Points\" NumberOfComponents=\"3\"/>" << std::endl;
-    file << "    </PPoints>" << std::endl;
-
-    for( auto& fname : this->fileNamesForCollectionFile )
-    {
-        const auto filenameWithoutPath=fname.substr( fname.find_last_of('/') + 1 );
-        file << "    <Piece Source=\"" << filenameWithoutPath << ".bin.vtu\"/>" << std::endl;
-    }
-
-    file << "  </PUnstructuredGrid>" << std::endl;
-    file << "</VTKFile>" << std::endl;
+    std::string filename = this->outputPath + "/" + this->makeParallelFileName(para->getMyProcessID(), t_write);
 
-    //////////////////////////////////////////////////////////////////////////
+    std::vector<std::string> nodedatanames = this->getVarNames(); // node data names handed to the writer
+    std::vector<std::string> cellNames; // stays empty: no cell data names are handed to the writer
 
-    file.close();
+    getWriter()->writeParallelFile(filename, fileNamesForCollectionFile, nodedatanames, cellNames);
 
     this->fileNamesForCollectionFile.clear();
 }
 
-void Probe::writeGridFiles(Parameter* para, int level, std::vector<std::string>& fnames, int t)
+void Probe::writeGridFile(Parameter* para, int level, int t, uint part)
 {
+    std::string fname = this->outputPath + "/" + this->makeGridFileName(level, para->getMyProcessID(), t, part);
+
     std::vector< UbTupleFloat3 > nodes;
     std::vector< std::string > nodedatanames = this->getVarNames();
 
-    uint startpos = 0;
-    uint endpos = 0;
-    uint sizeOfNodes = 0;
     std::vector< std::vector< double > > nodedata(nodedatanames.size());
 
     SPtr<ProbeStruct> probeStruct = this->getProbeStruct(level);
 
-    for (uint part = 0; part < fnames.size(); part++)
-    {        
-        startpos = part * para->getlimitOfNodesForVTK();
-        uint nDataPoints = this->outputTimeSeries? this->tProbe: probeStruct->nPoints;
-        sizeOfNodes = min(para->getlimitOfNodesForVTK(), nDataPoints - startpos);
-        endpos = startpos + sizeOfNodes;
+    uint startpos = (part-1) * para->getlimitOfNodesForVTK(); // part is 1-based, so part 1 starts at point 0
+    uint sizeOfNodes = min(para->getlimitOfNodesForVTK(), probeStruct->nPoints - startpos); // the last part may be shorter than the limit; NOTE(review): the unsigned subtraction wraps if startpos > nPoints - confirm callers never request such a part
+    uint endpos = startpos + sizeOfNodes; // one past the last point written to this file
 
-        //////////////////////////////////////////////////////////////////////////
-        nodes.resize(sizeOfNodes);
+    //////////////////////////////////////////////////////////////////////////
+    nodes.resize(sizeOfNodes);
 
-        for (uint pos = startpos; pos < endpos; pos++)
-        {
-            nodes[pos-startpos] = makeUbTuple(  float(probeStruct->pointCoordsX[pos]),
-                                                float(probeStruct->pointCoordsY[pos]),
-                                                float(probeStruct->pointCoordsZ[pos]));
-        }
+    for (uint pos = startpos; pos < endpos; pos++)
+    {
+        nodes[pos-startpos] = makeUbTuple(  float(probeStruct->pointCoordsX[pos]),
+                                            float(probeStruct->pointCoordsY[pos]),
+                                            float(probeStruct->pointCoordsZ[pos]));
+    }
 
-        for( auto it=nodedata.begin(); it!=nodedata.end(); it++) it->resize(sizeOfNodes);
+    for( auto it=nodedata.begin(); it!=nodedata.end(); it++) it->resize(sizeOfNodes);
 
-        for( int var=0; var < int(Statistic::LAST); var++){           
-            if(this->quantities[var])
-            {
-                Statistic statistic = static_cast<Statistic>(var);
-                real coeff;
+    for( int var=0; var < int(Statistic::LAST); var++){           
+        if(this->quantities[var])
+        {
+            Statistic statistic = static_cast<Statistic>(var);
+            real coeff;
 
-                std::vector<PostProcessingVariable> postProcessingVariables = this->getPostProcessingVariables(statistic);
-                uint n_arrs = uint(postProcessingVariables.size());
+            std::vector<PostProcessingVariable> postProcessingVariables = this->getPostProcessingVariables(statistic);
+            uint n_arrs = uint(postProcessingVariables.size());
 
-                uint arrOff = probeStruct->arrayOffsetsH[var];
-                uint arrLen = probeStruct->nPoints;
+            uint arrOff = probeStruct->arrayOffsetsH[var];
+            uint arrLen = probeStruct->nPoints;
+
+            for(uint arr=0; arr<n_arrs; arr++)
+            {
+                coeff = postProcessingVariables[arr].conversionFactor(level);
                 
-                for(uint arr=0; arr<n_arrs; arr++)
+                for (uint pos = startpos; pos < endpos; pos++)
                 {
-                    coeff = postProcessingVariables[arr].conversionFactor(level);
-                    
-                    for (uint pos = startpos; pos < endpos; pos++)
-                    {
-                        nodedata[arrOff+arr][pos-startpos] = double(probeStruct->quantitiesArrayH[(arrOff+arr)*arrLen+pos]*coeff);
-                    }
+                    nodedata[arrOff+arr][pos-startpos] = double(probeStruct->quantitiesArrayH[(arrOff+arr)*arrLen+pos]*coeff);
                 }
             }
         }
-        WbWriterVtkXmlBinary::getInstance()->writeNodesWithNodeData(this->outputPath + "/" + fnames[part], nodes, nodedatanames, nodedata);
     }
+    std::string fullName = getWriter()->writeNodesWithNodeData(fname, nodes, nodedatanames, nodedata); // presumably the path actually written, including the writer's extension - confirm against the writer
+    this->fileNamesForCollectionFile.push_back(fullName.substr(fullName.find_last_of('/') + 1)); // remember only the part after the last '/' for the next parallel file
 }
 
 std::vector<std::string> Probe::getVarNames()
diff --git a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/Probe.h b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/Probe.h
index 9cb0bd43e27fb7a28cae9c363ce245fbd9cc5677..aaf294e87d23c64707a16692b9337d6e9ff9c896 100644
--- a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/Probe.h
+++ b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/Probe.h
@@ -49,6 +49,7 @@
 
 #include "PreCollisionInteractor/PreCollisionInteractor.h"
 #include "PointerDefinitions.h"
+#include "WbWriterVtkXmlBinary.h"
 
 //=======================================================================================
 //! \note How to add new Statistics 
@@ -153,12 +154,12 @@ public:
         PreCollisionInteractor()
     {
         if (_tStartOut<_tStartAvg)      throw std::runtime_error("Probe: tStartOut must be larger than tStartAvg!");
-        if (_tStartTmpAvg<_tStartAvg)   throw std::runtime_error("Probe: tStartTmpAvg must be larger than tStartAvg!");
     }
     
     void init(Parameter* para, GridProvider* gridProvider, CudaMemoryManager* cudaMemoryManager) override;
     void interact(Parameter* para, CudaMemoryManager* cudaMemoryManager, int level, uint t) override;
     void free(Parameter* para, CudaMemoryManager* cudaMemoryManager) override;
+    void getTaggedFluidNodes(Parameter *para, GridProvider* gridProvider) override; //!> the Probe implementation is an empty function
 
     SPtr<ProbeStruct> getProbeStruct(int level){ return this->probeParams[level]; }
 
@@ -171,6 +172,8 @@ public:
     void setFileNameToNOut(){this->fileNameLU = false;}
     void setTStartTmpAveraging(uint _tStartTmpAveraging){this->tStartTmpAveraging = _tStartTmpAveraging;}
 
+protected:
+    virtual WbWriterVtkXmlBinary* getWriter(){ return WbWriterVtkXmlBinary::getInstance(); }; //!> writer used by writeGridFile() and writeParallelFile(); returns WbWriterVtkXmlBinary::getInstance() unless overridden
     real getNondimensionalConversionFactor(int level);
 
 private:
@@ -188,12 +191,15 @@ private:
                         int level);
     virtual void calculateQuantities(SPtr<ProbeStruct> probeStruct, Parameter* para, uint t, int level) = 0;
 
-    void write(Parameter* para, int level, int t);
-    void writeCollectionFile(Parameter* para, int t);
-    void writeGridFiles(Parameter* para, int level, std::vector<std::string >& fnames, int t);
+    virtual void write(Parameter* para, int level, int t); //!> writes every grid file part of a level and, on level 0 when not outputting time series, the parallel file
+    virtual void writeParallelFile(Parameter* para, int t); //!> writes the parallel file listing the grid files collected so far, then clears that list
+    virtual void writeGridFile(Parameter* para, int level, int t, uint part); //!> writes one part (1-based) of a level's probe points with their node data and records the file name
+
     std::vector<std::string> getVarNames();
-    
-private:
+    std::string makeGridFileName(int level, int id, int t, uint part); //!> "<probeName>_bin_lev_<level>_ID_<id>_Part_<part>_t_<t>.vtk"
+    std::string makeParallelFileName(int id, int t); //!> "<probeName>_bin_ID_<id>_t_<t>.vtk"
+
+protected:
     const std::string probeName;
     const std::string outputPath;
 
@@ -202,7 +208,6 @@ private:
     bool hasDeviceQuantityArray;    //!> flag initiating memCopy in Point and PlaneProbe. Other probes are only based on thrust reduce functions and therefore dont need explict memCopy in interact()
     bool outputTimeSeries;          //!> flag initiating overwrite of output vtk files, skipping collection files and limiting the length of the written data to the current time step (currently only used for WallModelProbe)
     std::vector<std::string> fileNamesForCollectionFile;
-    std::vector<std::string> varNames;
 
     bool fileNameLU = true; //!> if true, written file name contains time step in LU, else is the number of the written probe files
 
@@ -215,7 +220,6 @@ protected:
 
     uint tProbe = 0; //!> counter for number of probe evaluations. Only used when outputting timeseries
 
-
     std::function<real(int)> velocityRatio;
     std::function<real(int)> densityRatio;
     std::function<real(int)> forceRatio;
diff --git a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/WallModelProbe.cu b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/WallModelProbe.cu
index 81da15595baae55aa562bc77e24442a9258d992f..3341111c134ace7ca6ff64eeb7f87b38f8014656 100644
--- a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/WallModelProbe.cu
+++ b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/WallModelProbe.cu
@@ -171,11 +171,11 @@ void WallModelProbe::findPoints(Parameter* para, GridProvider* gridProvider, std
     {
         if (!para->getIsBodyForce()) throw std::runtime_error("WallModelProbe::findPoints(): bodyforce not allocated!");
         // Find all fluid nodes
-        for(uint j=1; j<para->getParH(level)->numberOfNodes; j++ )
+        for(size_t pos = 1; pos < para->getParH(level)->numberOfNodes; pos++ )
         {
-            if( para->getParH(level)->typeOfGridNode[j] == GEO_FLUID) 
+            if( para->getParH(level)->typeOfGridNode[pos] == GEO_FLUID) 
             {
-                probeIndices_level.push_back(j);
+                probeIndices_level.push_back((int)pos);
             }
         }
     }
diff --git a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/WallModelProbe.h b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/WallModelProbe.h
index d6464c5ca2aa60310cc6bb7ca0a210bc12e755ff..4ea90f74c7a0d57af4995e1b5874234967f1e901 100644
--- a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/WallModelProbe.h
+++ b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/WallModelProbe.h
@@ -55,14 +55,17 @@ public:
         uint _tStartOut,
         uint _tOut
     ):  Probe(_probeName, 
-             _outputPath,
-             _tStartAvg,
-             _tStartTmpAvg,
-             _tAvg,
-             _tStartOut, 
-             _tOut,
-             false,
-             true){}
+            _outputPath,
+            _tStartAvg,
+            _tStartTmpAvg,
+            _tAvg,
+            _tStartOut, 
+            _tOut,
+            false,
+            true)
+    {
+        if (_tStartTmpAvg<_tStartAvg)   throw std::runtime_error("Probe: tStartTmpAvg must be larger than tStartAvg!");
+    }
 
 
     void setForceOutputToStress(bool _outputStress){ this->outputStress = _outputStress; }
diff --git a/src/gpu/VirtualFluids_GPU/PreProcessor/PreProcessorStrategy/InitCompAD27/InitCompAD27.cu b/src/gpu/VirtualFluids_GPU/PreProcessor/PreProcessorStrategy/InitCompAD27/InitCompAD27.cu
index e43fb54a6b56b4d9a501269544cea000df31cdb7..60dbb2228e6d01fdabf7a6e1bfca786e2104d5b0 100644
--- a/src/gpu/VirtualFluids_GPU/PreProcessor/PreProcessorStrategy/InitCompAD27/InitCompAD27.cu
+++ b/src/gpu/VirtualFluids_GPU/PreProcessor/PreProcessorStrategy/InitCompAD27/InitCompAD27.cu
@@ -2,6 +2,7 @@
 
 #include "InitCompAD27_Device.cuh"
 #include "Parameter/Parameter.h"
+#include "cuda/CudaGrid.h"
 
 std::shared_ptr<PreProcessorStrategy> InitCompAD27::getNewInstance(std::shared_ptr<Parameter> para)
 {
@@ -10,36 +11,21 @@ std::shared_ptr<PreProcessorStrategy> InitCompAD27::getNewInstance(std::shared_p
 
 void InitCompAD27::init(int level)
 {
-	int numberOfThreads = para->getParD(level)->numberofthreads;
-	int size_Mat = para->getParD(level)->numberOfNodes;
-
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	LB_Init_Comp_AD_27 << < grid, threads >> >(	para->getParD(level)->neighborX,
-											para->getParD(level)->neighborY,
-											para->getParD(level)->neighborZ,
-											para->getParD(level)->typeOfGridNode,
-											para->getParD(level)->Conc,
-											para->getParD(level)->velocityX,
-											para->getParD(level)->velocityY,
-											para->getParD(level)->velocityZ,
-											para->getParD(level)->numberOfNodes,
-											para->getParD(level)->distributionsAD27.f[0],
-											para->getParD(level)->isEvenTimestep);
-	getLastCudaError("InitAD27 execution failed");
+    const auto deviceParams = para->getParD(level); // looked up once and reused for every kernel argument
+    const vf::cuda::CudaGrid launchGrid(deviceParams->numberofthreads, deviceParams->numberOfNodes);
+    LB_Init_Comp_AD_27 <<< launchGrid.grid, launchGrid.threads >>>(
+        deviceParams->neighborX,
+        deviceParams->neighborY,
+        deviceParams->neighborZ,
+        deviceParams->typeOfGridNode,
+        deviceParams->Conc,
+        deviceParams->velocityX,
+        deviceParams->velocityY,
+        deviceParams->velocityZ,
+        deviceParams->numberOfNodes,
+        deviceParams->distributionsAD27.f[0],
+        deviceParams->isEvenTimestep);
+    getLastCudaError("LB_Init_Comp_AD_27 execution failed");
 }
 
 bool InitCompAD27::checkParameter()
diff --git a/src/gpu/VirtualFluids_GPU/PreProcessor/PreProcessorStrategy/InitCompAD7/InitCompAD7.cu b/src/gpu/VirtualFluids_GPU/PreProcessor/PreProcessorStrategy/InitCompAD7/InitCompAD7.cu
index 8a53dff5c14adef69aa012bdf1d870d62a9749b2..8097ee13d9064c4104ead8cd8eb5ba529d8972fc 100644
--- a/src/gpu/VirtualFluids_GPU/PreProcessor/PreProcessorStrategy/InitCompAD7/InitCompAD7.cu
+++ b/src/gpu/VirtualFluids_GPU/PreProcessor/PreProcessorStrategy/InitCompAD7/InitCompAD7.cu
@@ -2,6 +2,7 @@
 
 #include "InitCompAD7_Device.cuh"
 #include "Parameter/Parameter.h"
+#include "cuda/CudaGrid.h"
 
 std::shared_ptr<InitCompAD7> InitCompAD7::getNewInstance(std::shared_ptr<Parameter> para)
 {
@@ -10,36 +11,21 @@ std::shared_ptr<InitCompAD7> InitCompAD7::getNewInstance(std::shared_ptr<Paramet
 
 void InitCompAD7::init(int level)
 {
-	int numberOfThreads = para->getParD(level)->numberofthreads;
-	int size_Mat = para->getParD(level)->numberOfNodes;
-
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	LB_Init_Comp_AD_7 << < grid, threads >> >(	para->getParD(level)->neighborX,
-										para->getParD(level)->neighborY,
-										para->getParD(level)->neighborZ,
-										para->getParD(level)->typeOfGridNode,
-										para->getParD(level)->Conc,
-										para->getParD(level)->velocityX,
-										para->getParD(level)->velocityY,
-										para->getParD(level)->velocityZ,
-										para->getParD(level)->numberOfNodes,
-										para->getParD(level)->distributionsAD7.f[0],
-										para->getParD(level)->isEvenTimestep);
-	getLastCudaError("InitAD7 execution failed");
+    const auto deviceParams = para->getParD(level); // looked up once and reused for every kernel argument
+    const vf::cuda::CudaGrid launchGrid(deviceParams->numberofthreads, deviceParams->numberOfNodes);
+    LB_Init_Comp_AD_7 <<< launchGrid.grid, launchGrid.threads >>>(
+        deviceParams->neighborX,
+        deviceParams->neighborY,
+        deviceParams->neighborZ,
+        deviceParams->typeOfGridNode,
+        deviceParams->Conc,
+        deviceParams->velocityX,
+        deviceParams->velocityY,
+        deviceParams->velocityZ,
+        deviceParams->numberOfNodes,
+        deviceParams->distributionsAD7.f[0],
+        deviceParams->isEvenTimestep);
+    getLastCudaError("LB_Init_Comp_AD_7 execution failed");
 }
 
 bool InitCompAD7::checkParameter()
diff --git a/src/gpu/VirtualFluids_GPU/PreProcessor/PreProcessorStrategy/InitCompSP27/InitCompSP27.cu b/src/gpu/VirtualFluids_GPU/PreProcessor/PreProcessorStrategy/InitCompSP27/InitCompSP27.cu
index 23ec3e5293ec3a49bf632a720ab554d156dc9674..c4676f28f969e2db8ff7f1910ac784a1c0dab351 100644
--- a/src/gpu/VirtualFluids_GPU/PreProcessor/PreProcessorStrategy/InitCompSP27/InitCompSP27.cu
+++ b/src/gpu/VirtualFluids_GPU/PreProcessor/PreProcessorStrategy/InitCompSP27/InitCompSP27.cu
@@ -2,6 +2,7 @@
 
 #include "InitCompSP27_Device.cuh"
 #include "Parameter/Parameter.h"
+#include "cuda/CudaGrid.h"
 
 std::shared_ptr<PreProcessorStrategy> InitCompSP27::getNewInstance(std::shared_ptr<Parameter> para)
 {
@@ -10,27 +11,12 @@ std::shared_ptr<PreProcessorStrategy> InitCompSP27::getNewInstance(std::shared_p
 
 void InitCompSP27::init(int level)
 {
-	int numberOfThreads = para->getParD(level)->numberofthreads;
-	int size_Mat = para->getParD(level)->numberOfNodes;
-
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(para->getParD(level)->numberofthreads, para->getParD(level)->numberOfNodes);
 
     if( ! para->getUseInitNeq() )
     {
-        LB_Init_Comp_SP_27 <<< grid, threads >>> (para->getParD(level)->neighborX,
+        LB_Init_Comp_SP_27 <<< grid.grid, grid.threads >>> (
+            para->getParD(level)->neighborX,
             para->getParD(level)->neighborY,
             para->getParD(level)->neighborZ,
             para->getParD(level)->typeOfGridNode,
@@ -41,11 +27,12 @@ void InitCompSP27::init(int level)
             para->getParD(level)->numberOfNodes,
             para->getParD(level)->distributions.f[0],
             para->getParD(level)->isEvenTimestep);
-        getLastCudaError("LBInitSP27 execution failed");
+        getLastCudaError("LB_Init_Comp_SP_27 execution failed");
     }
     else
     {
-        LB_Init_Comp_Neq_SP_27 <<< grid, threads >>> (para->getParD(level)->neighborX,
+        LB_Init_Comp_Neq_SP_27 <<< grid.grid, grid.threads >>> (
+            para->getParD(level)->neighborX,
             para->getParD(level)->neighborY,
             para->getParD(level)->neighborZ,
             para->getParD(level)->neighborInverse,
@@ -59,7 +46,7 @@ void InitCompSP27::init(int level)
             para->getParD(level)->omega,
             para->getParD(level)->isEvenTimestep);
         cudaDeviceSynchronize();
-        getLastCudaError("LBInitNeqSP27 execution failed");
+        getLastCudaError("LB_Init_Comp_Neq_SP_27 execution failed");
     }
 
 
diff --git a/src/gpu/VirtualFluids_GPU/PreProcessor/PreProcessorStrategy/InitF3/InitF3.cu b/src/gpu/VirtualFluids_GPU/PreProcessor/PreProcessorStrategy/InitF3/InitF3.cu
index cb6b40b4371a206c6d1e031822338621c4907be1..14d6b725337aa8b9af279bf794ff1c0912516b64 100644
--- a/src/gpu/VirtualFluids_GPU/PreProcessor/PreProcessorStrategy/InitF3/InitF3.cu
+++ b/src/gpu/VirtualFluids_GPU/PreProcessor/PreProcessorStrategy/InitF3/InitF3.cu
@@ -2,6 +2,7 @@
 
 #include "InitF3_Device.cuh"
 #include "Parameter/Parameter.h"
+#include "cuda/CudaGrid.h"
 
 std::shared_ptr<PreProcessorStrategy> InitF3::getNewInstance(std::shared_ptr<Parameter> para)
 {
@@ -10,36 +11,21 @@ std::shared_ptr<PreProcessorStrategy> InitF3::getNewInstance(std::shared_ptr<Par
 
 void InitF3::init(int level)
 {
-	int numberOfThreads = para->getParD(level)->numberofthreads;
-	int size_Mat = para->getParD(level)->numberOfNodes;
-
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	LB_Init_F3 << < grid, threads >> >(	para->getParD(level)->neighborX,
-										para->getParD(level)->neighborY,
-										para->getParD(level)->neighborZ,
-										para->getParD(level)->typeOfGridNode,
-										para->getParD(level)->rho,
-										para->getParD(level)->velocityX,
-										para->getParD(level)->velocityY,
-										para->getParD(level)->velocityZ,
-										para->getParD(level)->numberOfNodes,
-										para->getParD(level)->g6.g[0],
-										para->getParD(level)->isEvenTimestep);
-	getLastCudaError("LBInitF3 execution failed");
+    const auto deviceParams = para->getParD(level); // looked up once and reused for every kernel argument
+    const vf::cuda::CudaGrid launchGrid(deviceParams->numberofthreads, deviceParams->numberOfNodes);
+    LB_Init_F3 <<< launchGrid.grid, launchGrid.threads >>>(
+        deviceParams->neighborX,
+        deviceParams->neighborY,
+        deviceParams->neighborZ,
+        deviceParams->typeOfGridNode,
+        deviceParams->rho,
+        deviceParams->velocityX,
+        deviceParams->velocityY,
+        deviceParams->velocityZ,
+        deviceParams->numberOfNodes,
+        deviceParams->g6.g[0],
+        deviceParams->isEvenTimestep);
+    getLastCudaError("LB_Init_F3 execution failed");
 }
 
 bool InitF3::checkParameter()
diff --git a/src/gpu/VirtualFluids_GPU/PreProcessor/PreProcessorStrategy/InitIncompAD27/InitIncompAD27.cu b/src/gpu/VirtualFluids_GPU/PreProcessor/PreProcessorStrategy/InitIncompAD27/InitIncompAD27.cu
index 419ae80b96be57f8dc9c4ebecaccac0d435f00e0..ea700010960b11a1facdda18c35f220f43eb6a66 100644
--- a/src/gpu/VirtualFluids_GPU/PreProcessor/PreProcessorStrategy/InitIncompAD27/InitIncompAD27.cu
+++ b/src/gpu/VirtualFluids_GPU/PreProcessor/PreProcessorStrategy/InitIncompAD27/InitIncompAD27.cu
@@ -2,6 +2,7 @@
 
 #include "InitIncompAD27_Device.cuh"
 #include "Parameter/Parameter.h"
+#include "cuda/CudaGrid.h"
 
 std::shared_ptr<PreProcessorStrategy> InitIncompAD27::getNewInstance(std::shared_ptr<Parameter> para)
 {
@@ -10,36 +11,21 @@ std::shared_ptr<PreProcessorStrategy> InitIncompAD27::getNewInstance(std::shared
 
 void InitIncompAD27::init(int level)
 {
-	int numberOfThreads = para->getParD(level)->numberofthreads;
-	int size_Mat = para->getParD(level)->numberOfNodes;
-
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	LB_Init_Incomp_AD_27 << < grid, threads >> >(	para->getParD(level)->neighborX,
-												para->getParD(level)->neighborY,
-												para->getParD(level)->neighborZ,
-												para->getParD(level)->typeOfGridNode,
-												para->getParD(level)->Conc,
-												para->getParD(level)->velocityX,
-												para->getParD(level)->velocityY,
-												para->getParD(level)->velocityZ,
-												para->getParD(level)->numberOfNodes,
-												para->getParD(level)->distributionsAD27.f[0],
-												para->getParD(level)->isEvenTimestep);
-	getLastCudaError("LBInitIncompAD27 execution failed");
+    const auto deviceParams = para->getParD(level); // looked up once and reused for every kernel argument
+    const vf::cuda::CudaGrid launchGrid(deviceParams->numberofthreads, deviceParams->numberOfNodes);
+    LB_Init_Incomp_AD_27 <<< launchGrid.grid, launchGrid.threads >>>(
+        deviceParams->neighborX,
+        deviceParams->neighborY,
+        deviceParams->neighborZ,
+        deviceParams->typeOfGridNode,
+        deviceParams->Conc,
+        deviceParams->velocityX,
+        deviceParams->velocityY,
+        deviceParams->velocityZ,
+        deviceParams->numberOfNodes,
+        deviceParams->distributionsAD27.f[0],
+        deviceParams->isEvenTimestep);
+    getLastCudaError("LB_Init_Incomp_AD_27 execution failed");
 }
 
 bool InitIncompAD27::checkParameter()
diff --git a/src/gpu/VirtualFluids_GPU/PreProcessor/PreProcessorStrategy/InitIncompAD7/InitIncompAD7.cu b/src/gpu/VirtualFluids_GPU/PreProcessor/PreProcessorStrategy/InitIncompAD7/InitIncompAD7.cu
index 795cd0496a207e0861e35e4f310481950a037caf..d7c08e6932cacf2fb5a946010c1855212f1631fc 100644
--- a/src/gpu/VirtualFluids_GPU/PreProcessor/PreProcessorStrategy/InitIncompAD7/InitIncompAD7.cu
+++ b/src/gpu/VirtualFluids_GPU/PreProcessor/PreProcessorStrategy/InitIncompAD7/InitIncompAD7.cu
@@ -2,6 +2,7 @@
 
 #include "InitIncompAD7_Device.cuh"
 #include "Parameter/Parameter.h"
+#include "cuda/CudaGrid.h"
 
 std::shared_ptr<PreProcessorStrategy> InitIncompAD7::getNewInstance(std::shared_ptr<Parameter> para)
 {
@@ -10,36 +11,21 @@ std::shared_ptr<PreProcessorStrategy> InitIncompAD7::getNewInstance(std::shared_
 
 void InitIncompAD7::init(int level)
 {
-	int numberOfThreads = para->getParD(level)->numberofthreads;
-	int size_Mat = para->getParD(level)->numberOfNodes;
-
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);	
-
-	LB_Init_Incomp_AD_7 << < grid, threads >> >(	para->getParD(level)->neighborX,
-												para->getParD(level)->neighborY,
-												para->getParD(level)->neighborZ,
-												para->getParD(level)->typeOfGridNode,
-												para->getParD(level)->Conc,
-												para->getParD(level)->velocityX,
-												para->getParD(level)->velocityY,
-												para->getParD(level)->velocityZ,
-												para->getParD(level)->numberOfNodes,
-												para->getParD(level)->distributionsAD27.f[0],
-												para->getParD(level)->isEvenTimestep);
-	getLastCudaError("LBInitIncompAD7 execution failed");
+    vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(para->getParD(level)->numberofthreads, para->getParD(level)->numberOfNodes); // launch configuration built from the level's thread count and node count
+
+    LB_Init_Incomp_AD_7 <<< grid.grid, grid.threads >>>(
+        para->getParD(level)->neighborX,
+        para->getParD(level)->neighborY,
+        para->getParD(level)->neighborZ,
+        para->getParD(level)->typeOfGridNode,
+        para->getParD(level)->Conc,
+        para->getParD(level)->velocityX,
+        para->getParD(level)->velocityY,
+        para->getParD(level)->velocityZ,
+        para->getParD(level)->numberOfNodes,
+        para->getParD(level)->distributionsAD27.f[0], // NOTE(review): this AD7 initializer passes the AD27 distributions, whereas InitCompAD7 passes distributionsAD7.f[0] - confirm this is intended
+        para->getParD(level)->isEvenTimestep);
+    getLastCudaError("LB_Init_Incomp_AD_7 execution failed");
 }
 
 bool InitIncompAD7::checkParameter()
diff --git a/src/gpu/VirtualFluids_GPU/PreProcessor/PreProcessorStrategy/InitSP27/InitSP27.cu b/src/gpu/VirtualFluids_GPU/PreProcessor/PreProcessorStrategy/InitSP27/InitSP27.cu
index 0538c7ab89eb750a40cfc47486dc0891d4493976..078ad24f24659bf10a3dc9ed90bfd62b5e021187 100644
--- a/src/gpu/VirtualFluids_GPU/PreProcessor/PreProcessorStrategy/InitSP27/InitSP27.cu
+++ b/src/gpu/VirtualFluids_GPU/PreProcessor/PreProcessorStrategy/InitSP27/InitSP27.cu
@@ -2,6 +2,7 @@
 
 #include "InitSP27_Device.cuh"
 #include "Parameter/Parameter.h"
+#include "cuda/CudaGrid.h"
 
 std::shared_ptr<PreProcessorStrategy> InitSP27::getNewInstance(std::shared_ptr<Parameter> para)
 {
@@ -10,36 +11,21 @@ std::shared_ptr<PreProcessorStrategy> InitSP27::getNewInstance(std::shared_ptr<P
 
 void InitSP27::init(int level)
 {
-	int numberOfThreads = para->getParD(level)->numberofthreads;
-	int size_Mat = para->getParD(level)->numberOfNodes;
-
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	LB_Init_SP_27 << < grid, threads >> >(	para->getParD(level)->neighborX,
-										para->getParD(level)->neighborY,
-										para->getParD(level)->neighborZ,
-										para->getParD(level)->typeOfGridNode,
-										para->getParD(level)->rho,
-										para->getParD(level)->velocityX,
-										para->getParD(level)->velocityY,
-										para->getParD(level)->velocityZ,
-										para->getParD(level)->numberOfNodes,
-										para->getParD(level)->distributions.f[0],
-										para->getParD(level)->isEvenTimestep);
-	getLastCudaError("LBInitSP27 execution failed");
+    const auto deviceParams = para->getParD(level); // looked up once and reused for every kernel argument
+    const vf::cuda::CudaGrid launchGrid(deviceParams->numberofthreads, deviceParams->numberOfNodes);
+    LB_Init_SP_27 <<< launchGrid.grid, launchGrid.threads >>>(
+        deviceParams->neighborX,
+        deviceParams->neighborY,
+        deviceParams->neighborZ,
+        deviceParams->typeOfGridNode,
+        deviceParams->rho,
+        deviceParams->velocityX,
+        deviceParams->velocityY,
+        deviceParams->velocityZ,
+        deviceParams->numberOfNodes,
+        deviceParams->distributions.f[0],
+        deviceParams->isEvenTimestep);
+    getLastCudaError("LB_Init_SP_27 execution failed");
 }
 
 bool InitSP27::checkParameter()
diff --git a/src/gpu/VirtualFluids_GPU/Restart/RestartObject.cpp b/src/gpu/VirtualFluids_GPU/Restart/RestartObject.cpp
index a38535f7bdff0d87a5af74a69f0ed8255c647382..15813b7967a84e45f44eb4d286c41aa99c4ff343 100644
--- a/src/gpu/VirtualFluids_GPU/Restart/RestartObject.cpp
+++ b/src/gpu/VirtualFluids_GPU/Restart/RestartObject.cpp
@@ -9,12 +9,12 @@ void RestartObject::deserialize(const std::string &filename, std::shared_ptr<Par
 {
     deserialize_internal(filename);
 
-    for (int j = para->getCoarse(); j <= para->getFine(); j++) {
+    for (int index1 = para->getCoarse(); index1 <= para->getFine(); index1++) {
         std::vector<real> vec;
         fs.push_back(vec);
 
-        for (unsigned int i = 0; i < (para->getD3Qxx() * para->getParH(j)->numberOfNodes); i++) {
-            para->getParH(j)->distributions.f[0][i] = fs[j][i];
+        for (size_t index2 = 0; index2 < (para->getD3Qxx() * para->getParH(index1)->numberOfNodes); index2++) {
+            para->getParH(index1)->distributions.f[0][index2] = fs[index1][index2];
         }
     }
 }
@@ -24,15 +24,15 @@ void RestartObject::serialize(const std::string &filename, const std::shared_ptr
     if (fs.size() > 0) {
         clear(para);
     }
-    for (int j = para->getCoarse(); j <= para->getFine(); j++) {
+    for (int index1 = para->getCoarse(); index1 <= para->getFine(); index1++) {
         std::vector<real> vec;
         fs.push_back(vec);
 
-        for (unsigned int i = 0; i < (para->getD3Qxx() * para->getParH(j)->numberOfNodes); i++) {
-            if (UbMath::isNaN(para->getParH(j)->distributions.f[0][i])) {
-                fs[j].push_back((real)0.0);
+        for (size_t index2 = 0; index2 < (para->getD3Qxx() * para->getParH(index1)->numberOfNodes); index2++) {
+            if (UbMath::isNaN(para->getParH(index1)->distributions.f[0][index2])) {
+                fs[index1].push_back((real)0.0);
             } else {
-                fs[j].push_back(para->getParH(j)->distributions.f[0][i]);
+                fs[index1].push_back(para->getParH(index1)->distributions.f[0][index2]);
             }
         }
     }
diff --git a/src/lbm/CMakeLists.txt b/src/lbm/CMakeLists.txt
index afa90bdd3f95bb71cf7f1eda6407f9b38766072a..7a9a96ace1c7377b7ad0c67937464d1f2c00cce6 100644
--- a/src/lbm/CMakeLists.txt
+++ b/src/lbm/CMakeLists.txt
@@ -1,12 +1,18 @@
-if(BUILD_VF_CPU)
-    project(lbm LANGUAGES CXX)
 
-    vf_add_library(NAME lbm PUBLIC_LINK basics)
-    target_link_libraries(lbm PRIVATE project_warnings)
-
-    vf_add_tests()
-endif()
+vf_add_library(PUBLIC_LINK basics)
 
 if(BUILD_VF_GPU OR BUILD_VF_GKS)
-    add_subdirectory(cuda)
+    # Target properties that are only set for GPU/GKS builds.
+    set_target_properties(lbm PROPERTIES
+        CUDA_SEPARABLE_COMPILATION ON
+        POSITION_INDEPENDENT_CODE ON)
+
+    # Mark these .cpp sources as CUDA sources so the CUDA compiler builds them.
+    set_source_files_properties(
+        KernelParameter.cpp
+        CumulantChimera.cpp
+        BGK.cpp
+        PROPERTIES LANGUAGE CUDA)
 endif()
+
+vf_add_tests()
\ No newline at end of file
diff --git a/src/lbm/KernelParameter.cpp b/src/lbm/KernelParameter.cpp
index e039214d218ef19f35e8adf927f36d3a6f1aa355..7bf5a369d0e5d4e673d79dcb30bc22fc2c330e68 100644
--- a/src/lbm/KernelParameter.cpp
+++ b/src/lbm/KernelParameter.cpp
@@ -5,11 +5,8 @@
 #include "MacroscopicQuantities.h"
 
 
-namespace vf
+namespace vf::lbm
 {
-namespace lbm
-{
-
 
 
 inline __host__ __device__ real Distribution27::getDensity_() const
@@ -17,8 +14,6 @@ inline __host__ __device__ real Distribution27::getDensity_() const
     return getDensity(f);
 }
 
-
-
 __host__ __device__ real abs_internal(real value)
 {
 #ifdef __CUDA_ARCH__
@@ -30,4 +25,3 @@ __host__ __device__ real abs_internal(real value)
 
 
 }
-}
diff --git a/src/lbm/KernelParameter.h b/src/lbm/KernelParameter.h
index 95226628110637f3794c8a1f7e6f6c1f6dda937b..18c4f2a4b20b84d9d519993f3ddb54cf612d4306 100644
--- a/src/lbm/KernelParameter.h
+++ b/src/lbm/KernelParameter.h
@@ -11,9 +11,7 @@
 #include <basics/Core/DataTypes.h>
 
 
-namespace vf
-{
-namespace lbm
+namespace vf::lbm
 {
 
 struct Distribution27
@@ -35,9 +33,6 @@ struct KernelParameter
 };
 
 
-
-
-}
 }
 
 #endif
diff --git a/src/lbm/constants/D3Q27.h b/src/lbm/constants/D3Q27.h
index f923695f3756712748e638aee59121a5537456c5..c799331815ff92b41b3daf8433bcc10d026a8738 100644
--- a/src/lbm/constants/D3Q27.h
+++ b/src/lbm/constants/D3Q27.h
@@ -6,92 +6,92 @@
 
 namespace vf::lbm::dir
 {
-    //real, double und float auf real
-static constexpr int STARTDIR = 0;
-static constexpr int ENDDIR   = 26;
+
+static constexpr size_t STARTDIR = 0;
+static constexpr size_t ENDDIR = 26;
 
 // used in the CPU and the GPU version
-static constexpr int DIR_000 = 0;
-static constexpr int DIR_P00 = 1;
-static constexpr int DIR_M00 = 2;
-static constexpr int DIR_0P0 = 3;
-static constexpr int DIR_0M0 = 4;
-static constexpr int DIR_00P = 5;
-static constexpr int DIR_00M = 6;
-static constexpr int DIR_PP0 = 7;
-static constexpr int DIR_MM0 = 8;
-static constexpr int DIR_PM0 = 9;
-static constexpr int DIR_MP0 = 10;
-static constexpr int DIR_P0P = 11;
-static constexpr int DIR_M0M = 12;
-static constexpr int DIR_P0M = 13;
-static constexpr int DIR_M0P = 14;
-static constexpr int DIR_0PP = 15;
-static constexpr int DIR_0MM = 16;
-static constexpr int DIR_0PM = 17;
-static constexpr int DIR_0MP = 18;
-static constexpr int DIR_PPP = 19;
-static constexpr int DIR_MPP = 20;
-static constexpr int DIR_PMP = 21;
-static constexpr int DIR_MMP = 22;
-static constexpr int DIR_PPM = 23;
-static constexpr int DIR_MPM = 24;
-static constexpr int DIR_PMM = 25;
-static constexpr int DIR_MMM = 26;
-
-static constexpr int INV_P00 = DIR_M00;
-static constexpr int INV_M00 = DIR_P00;
-static constexpr int INV_0P0 = DIR_0M0;
-static constexpr int INV_0M0 = DIR_0P0;
-static constexpr int INV_00P = DIR_00M;
-static constexpr int INV_00M = DIR_00P;
-static constexpr int INV_PP0 = DIR_MM0;
-static constexpr int INV_MM0 = DIR_PP0;
-static constexpr int INV_PM0 = DIR_MP0;
-static constexpr int INV_MP0 = DIR_PM0;
-static constexpr int INV_P0P = DIR_M0M;
-static constexpr int INV_M0M = DIR_P0P;
-static constexpr int INV_P0M = DIR_M0P;
-static constexpr int INV_M0P = DIR_P0M;
-static constexpr int INV_0PP = DIR_0MM;
-static constexpr int INV_0MM = DIR_0PP;
-static constexpr int INV_0PM = DIR_0MP;
-static constexpr int INV_0MP = DIR_0PM;
-static constexpr int INV_PPP = DIR_MMM;
-static constexpr int INV_MPP = DIR_PMM;
-static constexpr int INV_PMP = DIR_MPM;
-static constexpr int INV_MMP = DIR_PPM;
-static constexpr int INV_PPM = DIR_MMP;
-static constexpr int INV_MPM = DIR_PMP;
-static constexpr int INV_PMM = DIR_MPP;
-static constexpr int INV_MMM = DIR_PPP;
-
-static constexpr int SGD_P00 = 0;
-static constexpr int SGD_M00 = 1;
-static constexpr int SGD_0P0 = 2;
-static constexpr int SGD_0M0 = 3;
-static constexpr int SGD_00P = 4;
-static constexpr int SGD_00M = 5;
-static constexpr int SGD_PP0 = 6;
-static constexpr int SGD_MM0 = 7;
-static constexpr int SGD_PM0 = 8;
-static constexpr int SGD_MP0 = 9;
-static constexpr int SGD_P0P = 10;
-static constexpr int SGD_M0M = 11;
-static constexpr int SGD_P0M = 12;
-static constexpr int SGD_M0P = 13;
-static constexpr int SGD_0PP = 14;
-static constexpr int SGD_0MM = 15;
-static constexpr int SGD_0PM = 16;
-static constexpr int SGD_0MP = 17;
-static constexpr int SGD_PPP = 18;
-static constexpr int SGD_MPP = 19;
-static constexpr int SGD_PMP = 20;
-static constexpr int SGD_MMP = 21;
-static constexpr int SGD_PPM = 22;
-static constexpr int SGD_MPM = 23;
-static constexpr int SGD_PMM = 24;
-static constexpr int SGD_MMM = 25;
+static constexpr size_t DIR_000 = 0;
+static constexpr size_t DIR_P00 = 1;
+static constexpr size_t DIR_M00 = 2;
+static constexpr size_t DIR_0P0 = 3;
+static constexpr size_t DIR_0M0 = 4;
+static constexpr size_t DIR_00P = 5;
+static constexpr size_t DIR_00M = 6;
+static constexpr size_t DIR_PP0 = 7;
+static constexpr size_t DIR_MM0 = 8;
+static constexpr size_t DIR_PM0 = 9;
+static constexpr size_t DIR_MP0 = 10;
+static constexpr size_t DIR_P0P = 11;
+static constexpr size_t DIR_M0M = 12;
+static constexpr size_t DIR_P0M = 13;
+static constexpr size_t DIR_M0P = 14;
+static constexpr size_t DIR_0PP = 15;
+static constexpr size_t DIR_0MM = 16;
+static constexpr size_t DIR_0PM = 17;
+static constexpr size_t DIR_0MP = 18;
+static constexpr size_t DIR_PPP = 19;
+static constexpr size_t DIR_MPP = 20;
+static constexpr size_t DIR_PMP = 21;
+static constexpr size_t DIR_MMP = 22;
+static constexpr size_t DIR_PPM = 23;
+static constexpr size_t DIR_MPM = 24;
+static constexpr size_t DIR_PMM = 25;
+static constexpr size_t DIR_MMM = 26;
+
+static constexpr size_t INV_P00 = DIR_M00;
+static constexpr size_t INV_M00 = DIR_P00;
+static constexpr size_t INV_0P0 = DIR_0M0;
+static constexpr size_t INV_0M0 = DIR_0P0;
+static constexpr size_t INV_00P = DIR_00M;
+static constexpr size_t INV_00M = DIR_00P;
+static constexpr size_t INV_PP0 = DIR_MM0;
+static constexpr size_t INV_MM0 = DIR_PP0;
+static constexpr size_t INV_PM0 = DIR_MP0;
+static constexpr size_t INV_MP0 = DIR_PM0;
+static constexpr size_t INV_P0P = DIR_M0M;
+static constexpr size_t INV_M0M = DIR_P0P;
+static constexpr size_t INV_P0M = DIR_M0P;
+static constexpr size_t INV_M0P = DIR_P0M;
+static constexpr size_t INV_0PP = DIR_0MM;
+static constexpr size_t INV_0MM = DIR_0PP;
+static constexpr size_t INV_0PM = DIR_0MP;
+static constexpr size_t INV_0MP = DIR_0PM;
+static constexpr size_t INV_PPP = DIR_MMM;
+static constexpr size_t INV_MPP = DIR_PMM;
+static constexpr size_t INV_PMP = DIR_MPM;
+static constexpr size_t INV_MMP = DIR_PPM;
+static constexpr size_t INV_PPM = DIR_MMP;
+static constexpr size_t INV_MPM = DIR_PMP;
+static constexpr size_t INV_PMM = DIR_MPP;
+static constexpr size_t INV_MMM = DIR_PPP;
+
+static constexpr size_t SGD_P00 = 0;
+static constexpr size_t SGD_M00 = 1;
+static constexpr size_t SGD_0P0 = 2;
+static constexpr size_t SGD_0M0 = 3;
+static constexpr size_t SGD_00P = 4;
+static constexpr size_t SGD_00M = 5;
+static constexpr size_t SGD_PP0 = 6;
+static constexpr size_t SGD_MM0 = 7;
+static constexpr size_t SGD_PM0 = 8;
+static constexpr size_t SGD_MP0 = 9;
+static constexpr size_t SGD_P0P = 10;
+static constexpr size_t SGD_M0M = 11;
+static constexpr size_t SGD_P0M = 12;
+static constexpr size_t SGD_M0P = 13;
+static constexpr size_t SGD_0PP = 14;
+static constexpr size_t SGD_0MM = 15;
+static constexpr size_t SGD_0PM = 16;
+static constexpr size_t SGD_0MP = 17;
+static constexpr size_t SGD_PPP = 18;
+static constexpr size_t SGD_MPP = 19;
+static constexpr size_t SGD_PMP = 20;
+static constexpr size_t SGD_MMP = 21;
+static constexpr size_t SGD_PPM = 22;
+static constexpr size_t SGD_MPM = 23;
+static constexpr size_t SGD_PMM = 24;
+static constexpr size_t SGD_MMM = 25;
 
 struct countersForPointerChasing{
     uint counterInverse;
@@ -100,7 +100,7 @@ struct countersForPointerChasing{
     uint counterZ;
 };
 
-const std::map<const int, const countersForPointerChasing> mapForPointerChasing = 
+const std::map<const size_t, const countersForPointerChasing> mapForPointerChasing = 
 {
     {DIR_000, countersForPointerChasing{0, 0, 0, 0}},
     {DIR_P00, countersForPointerChasing{0, 1, 0, 0}},
diff --git a/src/lbm/constants/NumericConstants.h b/src/lbm/constants/NumericConstants.h
index fb7764255201dbd31ac1134e756fc4bfd6e3d982..e642c2c95171927a7f8dc8f1a911d98117af66a6 100644
--- a/src/lbm/constants/NumericConstants.h
+++ b/src/lbm/constants/NumericConstants.h
@@ -18,6 +18,7 @@ static constexpr double c1o8 = 0.125;
 static constexpr double c1o9 = 0.111111111111111;
 static constexpr double c2o9 = 0.222222222222222;
 static constexpr double c4o9 = 0.444444444444444;
+static constexpr double c4o10 = 0.4;
 static constexpr double c1o10 = 0.1;
 static constexpr double c1o12 = 0.083333333333333;
 static constexpr double c1o16 = 0.0625;
@@ -48,6 +49,7 @@ static constexpr double c99o100 = 0.99;
 static constexpr double c1o126 = 0.007936507936508;
 static constexpr double c1o216 = 0.004629629629630;
 static constexpr double c5o4 = 1.25;
+static constexpr double c4o3 = 1.333333333333333;
 static constexpr double c9o4 = 2.25;
 static constexpr double c5o2 = 2.5;
 static constexpr double c9o2 = 4.5;
@@ -99,15 +101,15 @@ static constexpr double c72o1 = 72.;
 static constexpr double c84o1 = 84.;
 static constexpr double c88o1 = 88.;
 static constexpr double c96o1 = 96.;
-static constexpr double c100o1 = 10.;
-static constexpr double c130o1 = 13.;
-static constexpr double c152o1 = 15.;
-static constexpr double c166o1 = 16.;
-static constexpr double c195o1 = 19.;
-static constexpr double c216o1 = 21.;
-static constexpr double c264o1 = 26.;
-static constexpr double c290o1 = 29.;
-static constexpr double c367o1 = 36.;
+static constexpr double c100o1 = 100.;
+static constexpr double c130o1 = 130.;
+static constexpr double c152o1 = 152.;
+static constexpr double c166o1 = 166.;
+static constexpr double c195o1 = 195.;
+static constexpr double c216o1 = 216.;
+static constexpr double c264o1 = 264.;
+static constexpr double c290o1 = 290.;
+static constexpr double c367o1 = 367.;
 
 static constexpr double Op0000002 = 0.0000002;
 static constexpr double c10eM30 = 1e-30;
@@ -137,6 +139,7 @@ static constexpr float c1o8 = 0.125f;
 static constexpr float c1o9 = (1.0f / 9.0f);
 static constexpr float c2o9 = (2.0f / 9.0f);
 static constexpr float c4o9 = (4.0f / 9.0f);
+static constexpr float c4o10 = 0.4f;
 static constexpr float c1o10 = 0.1f;
 static constexpr float c1o12 = (1.0f / 12.0f);
 static constexpr float c1o16 = 0.0625f;
@@ -167,6 +170,7 @@ static constexpr float c99o100 = 0.99f;
 static constexpr float c1o126 = (1.0f / 126.0f);
 static constexpr float c1o216 = (1.0f / 216.0f);
 static constexpr float c5o4 = 1.25f;
+static constexpr float c4o3 = (4.0f / 3.0f);
 static constexpr float c9o4 = 2.25f;
 static constexpr float c5o2 = 2.5f;
 static constexpr float c9o2 = 4.5f;
diff --git a/src/lbm/cuda/CMakeLists.txt b/src/lbm/cuda/CMakeLists.txt
deleted file mode 100644
index 4142b7c3b1c46275c3257e3dfd657cc6b30c841d..0000000000000000000000000000000000000000
--- a/src/lbm/cuda/CMakeLists.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-project(lbmCuda LANGUAGES CUDA CXX)
-
-
-vf_add_library(NAME lbmCuda BUILDTYPE static PUBLIC_LINK basics FOLDER ../../lbm)
-
-
-set_target_properties(lbmCuda PROPERTIES CUDA_SEPARABLE_COMPILATION ON POSITION_INDEPENDENT_CODE ON)
-
-
-set_source_files_properties(../KernelParameter.cpp PROPERTIES LANGUAGE CUDA)
-set_source_files_properties(../CumulantChimera.cpp PROPERTIES LANGUAGE CUDA)
-set_source_files_properties(../BGK.cpp PROPERTIES LANGUAGE CUDA)
diff --git a/utilities/setup_builder.py b/utilities/setup_builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..94e4f805b3e05024742ead72e3ffb28c477c282f
--- /dev/null
+++ b/utilities/setup_builder.py
@@ -0,0 +1,48 @@
+from setuptools import build_meta
+
+class builder(build_meta._BuildMetaBackend):
+    """PEP 517 build backend that hands the front-end's config_settings to setup.py.
+
+    The settings passed to build_wheel/build_sdist are stored in
+    ``self.extra_args`` and exposed to the executed setup script as the
+    global name ``config_args``.
+    """
+
+    def run_setup(self, setup_script='setup.py'):
+        """Execute ``setup_script`` with ``config_args`` available in its globals."""
+        # Note that we can reuse our build directory between calls
+        # Correctness comes first, then optimization later
+        __file__ = setup_script
+        __name__ = '__main__'
+
+        with build_meta._open_setup_script(__file__) as f:
+            code = f.read().replace(r'\r\n', r'\n')
+        # locals() carries every local bound so far (including __file__ and
+        # __name__) into the namespace the setup script is executed in.
+        args = locals()
+        args["config_args"] = self.extra_args
+        exec(code, args)
+
+
+    def add_settings(self, config_settings):
+        """Store ``config_settings`` in ``self.extra_args``; a falsy value becomes an empty dict."""
+        self.extra_args = dict()
+        if config_settings:
+            self.extra_args = config_settings
+
+    def build_wheel(self, wheel_directory, config_settings=None,
+                    metadata_directory=None):
+        """Record ``config_settings``, then build a wheel via the parent backend."""
+        self.add_settings(config_settings)
+        return super().build_wheel(wheel_directory, config_settings, metadata_directory)
+
+    def build_sdist(self, sdist_directory, config_settings=None):
+        """Record ``config_settings``, then build an sdist via the parent backend."""
+        self.add_settings(config_settings)
+        # Must delegate to build_sdist: calling build_wheel here would place a
+        # wheel in the sdist directory instead of a source distribution.
+        return super().build_sdist(sdist_directory, config_settings)
+
+build = builder()
+build_wheel = build.build_wheel
+build_sdist = build.build_sdist
\ No newline at end of file