diff --git a/.gitignore b/.gitignore
index f87c8efbbd3b3877bd77212d6c2184db2aa409f1..1e33ea527c553631ad2e37051501af109c407b53 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,6 +10,7 @@ dist/
 *.egg-info/
 __pycache__/
 .venv/
+pyfluids*
 
 # IDE
 .vscode/
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index b34c5a8f66c1340670b6acd80ea6a9901b2760d1..8e3857684f3d9049d87fba227c6833693ec539a7 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -132,7 +132,7 @@ gcc_9_python:
     - export CCACHE_DIR=$CI_PROJECT_DIR/cache
 
   script:
-    - python3 setup.py bdist_wheel build_ext --build-temp=build
+    - python3 setup.py bdist_wheel build_ext --build-temp=build -DBUILD_VF_CPU=ON
 
 ###############################################################################
 ##                            Container Upload                               ##
diff --git a/CMakePresets.json b/CMakePresets.json
index 0f360fd303cdcad923b01d56df5c6d48ad62ca2c..c53482ec72109f1a672b97797763d027a6ec80bf 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -9,26 +9,29 @@
         {
             "name": "default",
             "hidden": true,
-            "binaryDir": "${sourceDir}/build/",
-            "cacheVariables": {
-                "BUILD_VF_UNIT_TESTS": "ON"
-            }
+            "binaryDir": "${sourceDir}/build/"
         },
         {
-            "name": "default_make",
-            "inherits": "default",
+            "name": "msvc",
+            "hidden": true,
+            "generator": "Visual Studio 16 2019",
+            "architecture": "x64"
+        },
+        {
+            "name": "make",
             "hidden": true,
             "generator": "Unix Makefiles"
         },
         {
-            "name": "default_msvc",
-            "inherits": "default",
+            "name": "unit_tests",
             "hidden": true,
-            "generator": "Visual Studio 16 2019",
-            "architecture": "x64"
+            "cacheVariables": {
+                "BUILD_VF_UNIT_TESTS": "ON"
+            }
         },
         {
-            "name": "default_cpu",
+            "name": "cpu",
+            "inherits": "default",
             "hidden": true,
             "description": "CPU build of VirtualFluids",
             "cacheVariables": {
@@ -37,7 +40,8 @@
             }
         },
         {
-            "name": "default_gpu",
+            "name": "gpu",
+            "inherits": "default",
             "hidden": true,
             "description": "GPU build of VirtualFluids",
             "cacheVariables": {
@@ -46,92 +50,213 @@
             }
         },
         {
-            "name": "default_gpu_numerical_tests",
-            "inherits": [
-                "default_gpu"
-            ],
+            "name": "debug",
             "hidden": true,
-            "description": "GPU numerical tests of VirtualFluids",
             "cacheVariables": {
-                "BUILD_VF_DOUBLE_ACCURACY": "ON",
-                "BUILD_NUMERIC_TESTS": "ON"
+                "CMAKE_BUILD_TYPE": "Debug"
             }
         },
         {
-            "name": "default_all",
+            "name": "release",
             "hidden": true,
-            "description": "All build of VirtualFluids",
-            "inherits": [
-                "default_cpu",
-                "default_gpu"
-            ],
             "cacheVariables": {
-                "BUILD_VF_DOUBLE_ACCURACY": "ON"
+                "CMAKE_BUILD_TYPE": "Release"
             }
         },
         {
-            "name": "cpu_make",
-            "inherits": [
-                "default_make",
-                "default_cpu"
-            ],
-            "displayName": "cpu make configuration"
-        },
-        {
-            "name": "cpu_msvc",
-            "inherits": [
-                "default_msvc",
-                "default_cpu"
-            ],
-            "displayName": "cpu msvc configuration"
+            "name": "min_size_rel",
+            "hidden": true,
+            "cacheVariables": {
+                "CMAKE_BUILD_TYPE": "MinSizeRel"
+            }
         },
         {
-            "name": "gpu_make",
-            "inherits": [
-                "default_make",
-                "default_gpu"
-            ],
-            "displayName": "gpu make configuration"
+            "name": "rel_with_deb_info",
+            "hidden": true,
+            "cacheVariables": {
+                "CMAKE_BUILD_TYPE": "RelWithDebInfo"
+            }
         },
         {
-            "name": "gpu_msvc",
+            "name": "gpu_numerical_tests",
             "inherits": [
-                "default_msvc",
-                "default_gpu"
+                "gpu",
+                "unit_tests"
             ],
-            "displayName": "gpu msvc configuration"
+            "hidden": true,
+            "description": "GPU numerical tests of VirtualFluids",
+            "cacheVariables": {
+                "BUILD_VF_DOUBLE_ACCURACY": "ON",
+                "BUILD_NUMERIC_TESTS": "ON"
+            }
         },
         {
             "name": "all_make",
             "inherits": [
-                "default_make",
-                "default_all"
+                "cpu",
+                "gpu",
+                "unit_tests",
+                "make"
             ],
             "displayName": "all make configuration"
         },
         {
             "name": "all_msvc",
             "inherits": [
-                "default_msvc",
-                "default_all"
+                "cpu",
+                "gpu",
+                "unit_tests",
+                "msvc"
             ],
             "displayName": "all msvc configuration"
         },
         {
             "name": "gpu_numerical_tests_make",
             "inherits": [
-                "default_make",
-                "default_gpu_numerical_tests"
+                "gpu_numerical_tests",
+                "make"
             ],
             "displayName": "gpu numerical tests make configuration"
         },
         {
             "name": "gpu_numerical_tests_msvc",
             "inherits": [
-                "default_msvc",
-                "default_gpu_numerical_tests"
+                "msvc",
+                "gpu_numerical_tests"
             ],
             "displayName": "gpu numerical tests msvc configuration"
+        },
+        {
+            "name": "debug_make_gpu",
+            "displayName": "Debug GPU Make",
+            "inherits": [
+                "gpu",
+                "make",
+                "debug"
+            ]
+        },
+        {
+            "name": "release_make_gpu",
+            "displayName": "Release GPU Make",
+            "inherits": [
+                "gpu",
+                "make",
+                "release"
+            ]
+        },
+        {
+            "name": "min_size_rel_make_gpu",
+            "displayName": "MinSizeRel GPU Make",
+            "inherits": [
+                "gpu",
+                "make",
+                "min_size_rel"
+            ]
+        },
+        {
+            "name": "rel_with_deb_info_make_gpu",
+            "displayName": "RelWithDebInfo GPU Make",
+            "inherits": [
+                "gpu",
+                "make",
+                "rel_with_deb_info"
+            ]
+        },
+        {
+            "name": "debug_msvc_gpu",
+            "displayName": "Debug GPU MSVC",
+            "inherits": [
+                "gpu",
+                "msvc",
+                "debug"
+            ]
+        },
+        {
+            "name": "release_msvc_gpu",
+            "displayName": "Release GPU MSVC",
+            "inherits": [
+                "gpu",
+                "msvc",
+                "release"
+            ]
+        },
+        {
+            "name": "min_size_rel_msvc_gpu",
+            "displayName": "MinSizeRel GPU MSVC",
+            "inherits": [
+                "gpu",
+                "msvc",
+                "min_size_rel"
+            ]
+        },
+        {
+            "name": "rel_with_deb_info_msvc_gpu",
+            "displayName": "RelWithDebInfo GPU MSVC",
+            "inherits": [
+                "gpu",
+                "msvc",
+                "rel_with_deb_info"
+            ]
+        }
+    ],
+    "buildPresets": [
+        {
+            "name": "Default",
+            "hidden": true,
+            "configurePreset": "default",
+            "jobs": 4
+        },
+        {
+            "name": "GPU",
+            "hidden": true,
+            "configurePreset": "gpu",
+            "targets": [
+                "ActuatorLine",
+                "DrivenCavity",
+                "BoundaryLayer"
+            ],
+            "inherits": [
+                "Default"
+            ]
+        },
+        {
+            "name": "Release",
+            "hidden": true,
+            "configurePreset": "release"
+        },
+        {
+            "name": "Debug_Make_GPU",
+            "displayName": "Debug",
+            "description": "Compile GPU version with debug information",
+            "configurePreset": "debug_make_gpu",
+            "inherits": [
+                "GPU"
+            ]
+        },
+        {
+            "name": "MinSizeRel_Make_GPU",
+            "displayName": "MinSizeRel",
+            "configurePreset": "min_size_rel_make_gpu",
+            "inherits": [
+                "GPU"
+            ]
+        },
+        {
+            "name": "RelWithDebInfo_GPU",
+            "displayName": "RelWithDebInfo",
+            "configurePreset": "rel_with_deb_info_make_gpu",
+            "inherits": [
+                "GPU"
+            ]
+        },
+        {
+            "name": "Release_GPU",
+            "description": "Build release version of GPU",
+            "displayName": "Release GPU",
+            "configurePreset": "release_make_gpu",
+            "inherits": [
+                "GPU"
+            ]
         }
     ]
-}
+}
\ No newline at end of file
diff --git a/Python/actuator_line/actuator_line.py b/Python/actuator_line/actuator_line.py
index 6e3c8608617df1267535984d53307dea9184c6ab..ecd0fe0602bba83275798928fabce9339f20763e 100644
--- a/Python/actuator_line/actuator_line.py
+++ b/Python/actuator_line/actuator_line.py
@@ -4,20 +4,11 @@ from pathlib import Path
 from mpi4py import MPI
 from pyfluids import basics, gpu, logger
 #%%
-reference_diameter = 126
-
-length = np.array([29,6,6])*reference_diameter
-viscosity = 1.56e-5
-velocity = 9
-mach = 0.1
-nodes_per_diameter = 32
-
-sim_name = "ActuatorLine"
-config_file = Path(__file__).parent/Path("config.txt")
+sim_name = "ABL"
+config_file = Path(__file__).parent/"configActuatorLine.txt"
 output_path = Path(__file__).parent/Path("output")
 output_path.mkdir(exist_ok=True)
-t_out = 100.
-t_end = 500.
+
 
 #%%
 logger.Logger.initialize_logger()
@@ -25,87 +16,169 @@ basics.logger.Logger.add_stdout()
 basics.logger.Logger.set_debug_level(basics.logger.Level.INFO_LOW)
 basics.logger.Logger.time_stamp(basics.logger.TimeStamp.ENABLE)
 basics.logger.Logger.enable_printed_rank_numbers(True)
-# %%
-comm = gpu.Communicator.get_instance()
 #%%
 grid_factory = gpu.grid_generator.GridFactory.make()
 grid_builder = gpu.grid_generator.MultipleGridBuilder.make_shared(grid_factory)
+communicator = gpu.Communicator.get_instance()
 
-#%%
-dx = reference_diameter/nodes_per_diameter
-
-grid_builder.add_coarse_grid(0.0, 0.0, 0.0, *length, dx)
-grid_builder.set_periodic_boundary_condition(False, False, False)
-grid_builder.build_grids(basics.LbmOrGks.LBM, False)
-#%%
 config = basics.ConfigurationFile()
 config.load(str(config_file))
+
+para = gpu.Parameter(communicator.get_number_of_process(), communicator.get_pid(), config)
+bc_factory = gpu.BoundaryConditionFactory()
+
 #%%
-para = gpu.Parameter(config, comm.get_number_of_process(), comm.get_pid())
+turbine_diameter = config.get_float_value("turbineDiameter", 126)
+boundary_layer_height = config.get_float_value("boundaryLayerHeight", 1000)
+z0 = config.get_float_value("z0", 0.1)
+u_star = config.get_float_value("u_star", 0.4)
+
+kappa = config.get_float_value("vonKarmanConstant", 0.4) # von Karman constant
+
+viscosity = config.get_float_value("viscosity", 1.56e-5)
 
+velocity  = 0.5*u_star/kappa*np.log(boundary_layer_height/z0+1) #0.5 times max mean velocity at the top in m/s
+
+mach = config.get_float_value("Ma", 0.1)
+nodes_per_height = config.get_uint_value("nz", 64)
+
+
+turb_pos = np.array([3,3,3])*turbine_diameter
+epsilon = config.get_float_value("SmearingWidth", 5)
+density = config.get_float_value("Density", 1.225)
+level = 0
+n_blades = 3
+n_blade_nodes = config.get_int_value("NumberOfNodesPerAL", 32)
+
+read_precursor = config.get_bool_value("readPrecursor", False)
+
+if read_precursor:
+    nTReadPrecursor = config.get_int_value("nTimestepsReadPrecursor")
+    use_distributions = config.get_bool_value("useDistributions", False)
+    precursor_directory = config.get_string_value("precursorDirectory")
+
+# all in s
+t_start_out   = config.get_float_value("tStartOut")
+t_out        = config.get_float_value("tOut")
+t_end        = config.get_float_value("tEnd") # total time of simulation
+
+t_start_averaging     =  config.get_float_value("tStartAveraging")
+t_start_tmp_averaging  =  config.get_float_value("tStartTmpAveraging")
+t_averaging          =  config.get_float_value("tAveraging")
+t_start_out_probe      =  config.get_float_value("tStartOutProbe")
+t_out_probe           =  config.get_float_value("tOutProbe")
+
+#%%
+length = np.array([6,4,1])*boundary_layer_height
+dx = boundary_layer_height/nodes_per_height
 dt = dx * mach / (np.sqrt(3) * velocity)
-velocity_lb = velocity * dt / dx # LB units
-viscosity_lb = viscosity * dt / (dx * dx) # LB units
+velocity_ratio = dx/dt
+velocity_LB = velocity / velocity_ratio # LB units
+viscosity_LB = viscosity / (velocity_ratio * dx) # LB units
+pressure_gradient = u_star * u_star / boundary_layer_height
+pressure_gradient_LB = pressure_gradient * (dt*dt)/dx
+
+logger.vf_log_info(f"velocity  [dx/dt] = {velocity_LB}")
+logger.vf_log_info(f"dt   = {dt}")
+logger.vf_log_info(f"dx   = {dx}")
+logger.vf_log_info(f"viscosity [10^8 dx^2/dt] = {viscosity_LB*1e8}")
+logger.vf_log_info(f"u* /(dx/dt) = {u_star*dt/dx}")
+logger.vf_log_info(f"dpdx  = {pressure_gradient}")
+logger.vf_log_info(f"dpdx /(dx/dt^2) = {pressure_gradient_LB}")
+
 
 #%%
-para.set_devices([0])
 para.set_output_prefix(sim_name)
-para.set_output_path(str(output_path))
-para.set_f_name(para.get_output_path() + "/" + para.get_output_prefix())
 para.set_print_files(True)
-para.set_max_level(1)
-#%%
-para.set_velocity(velocity_lb)
-para.set_viscosity(viscosity_lb)    
+
+para.set_forcing(pressure_gradient_LB, 0, 0)
+para.set_velocity_LB(velocity_LB)
+para.set_viscosity_LB(viscosity_LB)    
 para.set_velocity_ratio(dx/dt)
 para.set_viscosity_ratio(dx*dx/dt)
-para.set_main_kernel("TurbulentViscosityCumulantK17CompChim")
-para.set_use_AMD(True)
-para.set_SGS_constant(0.083)
+para.set_density_ratio(1.0)
 
-def init_func(coord_x, coord_y, coord_z):
-    return [0.0, velocity_lb, 0.0, 0.0]
+para.set_main_kernel("TurbulentViscosityCumulantK17CompChim")
 
-para.set_initial_condition(init_func)
-para.set_t_out(int(t_out/dt))
-para.set_t_end(int(t_end/dt))
+para.set_timestep_start_out(int(t_start_out/dt))
+para.set_timestep_out(int(t_out/dt))
+para.set_timestep_end(int(t_end/dt))
 para.set_is_body_force(True)
-
 #%%
-grid_builder.set_velocity_boundary_condition(gpu.SideType.MX, velocity_lb, 0.0, 0.0)
-
-grid_builder.set_velocity_boundary_condition(gpu.SideType.MY, velocity_lb, 0.0, 0.0)
-grid_builder.set_velocity_boundary_condition(gpu.SideType.PY, velocity_lb, 0.0, 0.0)
-
-grid_builder.set_velocity_boundary_condition(gpu.SideType.MZ, velocity_lb, 0.0, 0.0)
-grid_builder.set_velocity_boundary_condition(gpu.SideType.PZ, velocity_lb, 0.0, 0.0)
-
-grid_builder.set_pressure_boundary_condition(gpu.SideType.PX, 0.0)
+tm_factory = gpu.TurbulenceModelFactory(para)
+tm_factory.read_config_file(config)
+#%%
+grid_builder.add_coarse_grid(0.0, 0.0, 0.0, *length, dx)
+grid_builder.set_periodic_boundary_condition(not read_precursor, True, False)
+grid_builder.build_grids(basics.LbmOrGks.LBM, False)
 
+sampling_offset = 2
+if read_precursor:
+    precursor = gpu.create_file_collection(precursor_directory + "/precursor", gpu.FileType.VTK)
+    grid_builder.set_precursor_boundary_condition(gpu.SideType.MX, precursor, nTReadPrecursor, 0, 0, 0)
+
+grid_builder.set_stress_boundary_condition(gpu.SideType.MZ, 0, 0, 1, sampling_offset, z0/dx)
+para.set_has_wall_monitor(True)
+grid_builder.set_slip_boundary_condition(gpu.SideType.PZ, 0, 0, -1)
+
+if read_precursor:
+    grid_builder.set_pressure_boundary_condition(gpu.SideType.PX, 0)
+bc_factory.set_stress_boundary_condition(gpu.StressBC.StressPressureBounceBack)
+bc_factory.set_slip_boundary_condition(gpu.SlipBC.SlipBounceBack) 
+bc_factory.set_pressure_boundary_condition(gpu.PressureBC.OutflowNonReflective)
+if read_precursor:
+    bc_factory.set_precursor_boundary_condition(gpu.PrecursorBC.DistributionsPrecursor if use_distributions else gpu.PrecursorBC.VelocityPrecursor)
+para.set_outflow_pressure_correction_factor(0.0); 
 #%%
-cuda_memory_manager = gpu.CudaMemoryManager(para)
-grid_generator = gpu.GridProvider.make_grid_generator(grid_builder, para, cuda_memory_manager, comm)
+# don't use python init functions, they are very slow! Just kept as an example.
+# Define lambda in bindings and set it here.
+# def init_func(coord_x, coord_y, coord_z):
+#     return [
+#         0.0, 
+#         (u_star/0.4 * np.log(np.maximum(coord_z,z0)/z0) + 2.0*np.sin(np.pi*16*coord_x/length[0])*np.sin(np.pi*8*coord_z/boundary_layer_height)/(np.square(coord_z/boundary_layer_height)+1))  * dt / dx, 
+#         2.0*np.sin(np.pi*16.*coord_x/length[0])*np.sin(np.pi*8.*coord_z/boundary_layer_height)/(np.square(coord_z/boundary_layer_height)+1.)  * dt / dx, 
+#         8.0*u_star/0.4*(np.sin(np.pi*8.0*coord_y/boundary_layer_height)*np.sin(np.pi*8.0*coord_z/boundary_layer_height)+np.sin(np.pi*8.0*coord_x/length[0]))/(np.square(length[2]/2.0-coord_z)+1.) * dt / dx]
+# para.set_initial_condition(init_func)
+para.set_initial_condition_perturbed_log_law(u_star, z0, length[0], length[2], boundary_layer_height, velocity_ratio)
+
 #%%
-turb_pos = np.array([3,3,3])*reference_diameter
+turb_pos = np.array([3,3,3])*turbine_diameter
 epsilon = 5
 density = 1.225
 level = 0
 n_blades = 3
 n_blade_nodes = 32
-alm = gpu.ActuatorLine(n_blades, density, n_blade_nodes, epsilon, *turb_pos, reference_diameter, level, dt, dx)
+alm = gpu.ActuatorLine(n_blades, density, n_blade_nodes, epsilon, *turb_pos, turbine_diameter, level, dt, dx, True)
 para.add_actuator(alm)
 #%%
-point_probe = gpu.probes.PointProbe("pointProbe", str(output_path), 100, 1, 500, 100)
-point_probe.add_probe_points_from_list(np.array([1,2,5])*reference_diameter, np.array([3,3,3])*reference_diameter, np.array([3,3,3])*reference_diameter)
-point_probe.add_statistic(gpu.probes.Statistic.Means)
-
-para.add_probe(point_probe)
-
-plane_probe = gpu.probes.PlaneProbe("planeProbe", str(output_path), 100, 1, 500, 100)
-plane_probe.set_probe_plane(5*reference_diameter, 0, 0, dx, length[1], length[2])
-para.add_probe(plane_probe)
+planar_average_probe = gpu.probes.PlanarAverageProbe("horizontalPlanes", para.get_output_path(), 0, int(t_start_tmp_averaging/dt), int(t_averaging/dt) , int(t_start_out_probe/dt), int(t_out_probe/dt), 'z')
+planar_average_probe.add_all_available_statistics()
+planar_average_probe.set_file_name_to_n_out()
+para.add_probe(planar_average_probe)
 #%%
-sim = gpu.Simulation(para, cuda_memory_manager, comm, grid_generator)
+wall_model_probe = gpu.probes.WallModelProbe("wallModelProbe", para.get_output_path(), 0, int(t_start_tmp_averaging/dt), int(t_averaging/dt/4), int(t_start_out_probe/dt), int(t_out_probe/dt))
+wall_model_probe.add_all_available_statistics()
+wall_model_probe.set_file_name_to_n_out()
+wall_model_probe.set_force_output_to_stress(True)
+if para.get_is_body_force():
+    wall_model_probe.set_evaluate_pressure_gradient(True)
+para.add_probe(wall_model_probe)
+
+plane_locs = [100,]
+if read_precursor: plane_locs.extend([1000, 1500, 2000, 2500, 0])
+
+for n_probe, probe_pos in enumerate(plane_locs):
+    plane_probe = gpu.probes.PlaneProbe(f"planeProbe_{n_probe+1}", para.get_output_path(), int(t_start_averaging/dt), 10, int(t_start_out_probe/dt), int(t_out_probe/dt))
+    plane_probe.set_probe_plane(probe_pos, 0, 0, dx, length[1], length[2])
+    plane_probe.add_all_available_statistics()
+    para.add_probe(plane_probe)
+#%%
+cuda_memory_manager = gpu.CudaMemoryManager(para)
+grid_generator = gpu.GridProvider.make_grid_generator(grid_builder, para, cuda_memory_manager, communicator)
+#%%
+#%%
+sim = gpu.Simulation(para, cuda_memory_manager, communicator, grid_generator, bc_factory, tm_factory)
 #%%
 sim.run()
-MPI.Finalize()
\ No newline at end of file
+MPI.Finalize()
+
diff --git a/Python/actuator_line/config.txt b/Python/actuator_line/config.txt
deleted file mode 100644
index e4c778c4cc048f54c0a32310e6bf4a7343a263fa..0000000000000000000000000000000000000000
--- a/Python/actuator_line/config.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-Path = .
-GridPath = .
diff --git a/Python/actuator_line/configActuatorLine.txt b/Python/actuator_line/configActuatorLine.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4b38bc41c1d5b510e7b262423fff861dc1a9c030
--- /dev/null
+++ b/Python/actuator_line/configActuatorLine.txt
@@ -0,0 +1,39 @@
+##################################################
+#information for writing
+##################################################
+Path = .
+##################################################
+#information for reading
+##################################################
+GridPath = .
+##################################################
+Devices = 0 
+##################################################
+tStartOut           = 0
+tOut                = 100000
+tEnd                = 300000
+##################################################
+tStartAveraging     = 0
+tStartTmpAveraging  = 100000
+tAveraging          = 200
+tStartOutProbe      = 0
+tOutProbe           = 1000 
+##################################################
+Ma = 0.1
+nz = 96 
+
+bodyForce = true
+SGSconstant = 0.333
+TurbulenceModel = QR
+
+QuadricLimiterP = 100000.0
+QuadricLimiterM = 100000.0
+QuadricLimiterD = 100000.0
+
+##################################################
+readPrecursor = false
+nTimestepsReadPrecursor = 10
+precursorFile = precursor/Precursor
+
+##################################################
+turbineDiameter = 126.0
diff --git a/Python/boundary_layer/__init__.py b/Python/boundary_layer/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/Python/boundary_layer/boundary_layer.py b/Python/boundary_layer/boundary_layer.py
index 1c01f50946b49bc0ddab7e50065a24aab4ae869f..269ac887440d93a64bc26e2117926a59371874d2 100644
--- a/Python/boundary_layer/boundary_layer.py
+++ b/Python/boundary_layer/boundary_layer.py
@@ -4,34 +4,11 @@ from pathlib import Path
 from mpi4py import MPI
 from pyfluids import basics, gpu, logger
 #%%
-reference_height = 1000 # boundary layer height in m
-
-length = np.array([6,4,1])*reference_height
-viscosity = 1.56e-5
-mach = 0.1
-nodes_per_height = 32
-
-z_0 = 0.1
-u_star = 0.4
-kappa = 0.4
-
-velocity = 0.5*u_star/kappa*np.log(length[2]/z_0+1)
-flow_through_time = length[0]/velocity
-use_AMD = True
-
-
-sim_name = "BoundaryLayer"
-config_file = Path(__file__).parent/Path("config.txt")
+sim_name = "ABL"
+config_file = Path(__file__).parent/"configBoundaryLayer.txt"
 output_path = Path(__file__).parent/Path("output")
 output_path.mkdir(exist_ok=True)
-t_out = 1000.
-t_end = 5000.
 
-t_start_averaging = 0
-t_start_tmp_averaging =  100_000
-t_averaging = 200
-t_start_out_probe = 0
-t_out_probe = 1000
 
 #%%
 logger.Logger.initialize_logger()
@@ -39,95 +16,161 @@ basics.logger.Logger.add_stdout()
 basics.logger.Logger.set_debug_level(basics.logger.Level.INFO_LOW)
 basics.logger.Logger.time_stamp(basics.logger.TimeStamp.ENABLE)
 basics.logger.Logger.enable_printed_rank_numbers(True)
-# %%
-comm = gpu.Communicator.get_instance()
 #%%
 grid_factory = gpu.grid_generator.GridFactory.make()
 grid_builder = gpu.grid_generator.MultipleGridBuilder.make_shared(grid_factory)
+communicator = gpu.Communicator.get_instance()
+
+config = basics.ConfigurationFile()
+config.load(str(config_file))
+
+para = gpu.Parameter(communicator.get_number_of_process(), communicator.get_pid(), config)
+bc_factory = gpu.BoundaryConditionFactory()
 
 #%%
-dx = reference_height/nodes_per_height
-dt = dx * mach / (np.sqrt(3) * velocity)
-velocity_lb = velocity * dt / dx # LB units
-viscosity_lb = viscosity * dt / (dx * dx) # LB units
+boundary_layer_height = config.get_float_value("boundaryLayerHeight", 1000)
+z0 = config.get_float_value("z0", 0.1)
+u_star = config.get_float_value("u_star", 0.4)
 
-pressure_gradient = u_star**2 / reference_height
-pressure_gradient_lb = pressure_gradient * dt**2 / dx
+kappa = config.get_float_value("vonKarmanConstant", 0.4) # von Karman constant
 
-logger.vf_log_info(f"velocity    = {velocity_lb:1.6} dx/dt")
-logger.vf_log_info(f"dt          = {dt:1.6}")
-logger.vf_log_info(f"dx          = {dx:1.6}")
-logger.vf_log_info(f"u*          = {u_star:1.6}")
-logger.vf_log_info(f"dpdx        = {pressure_gradient:1.6}")
-logger.vf_log_info(f"dpdx        = {pressure_gradient_lb:1.6} dx/dt^2")
-logger.vf_log_info(f"viscosity   = {viscosity_lb:1.6} dx^2/dt")
+viscosity = config.get_float_value("viscosity", 1.56e-5)
 
+velocity  = 0.5*u_star/kappa*np.log(boundary_layer_height/z0+1) #0.5 times max mean velocity at the top in m/s
 
-#%%
-config = basics.ConfigurationFile()
-config.load(str(config_file))
-#%%
-para = gpu.Parameter(config, comm.get_number_of_process(), comm.get_pid())
+mach = config.get_float_value("Ma", 0.1)
+nodes_per_height = config.get_uint_value("nz", 64)
+
+
+
+write_precursor = config.get_bool_value("writePrecursor", False)  # key as spelled in configBoundaryLayer.txt
+read_precursor = config.get_bool_value("readPrecursor", False)
+
+if write_precursor:
+    nTWritePrecursor      = config.get_int_value("nTimestepsWritePrecursor")
+    t_start_precursor      = config.get_float_value("tStartPrecursor")
+    pos_x_precursor        = config.get_float_value("posXPrecursor")
 
+if read_precursor:
+    nTReadPrecursor = config.get_int_value("nTimestepsReadPrecursor")
 
+if write_precursor or read_precursor:
+    use_distributions = config.get_bool_value("useDistributions", False)
+    precursor_directory = config.get_string_value("precursorDirectory")
+
+# all in s
+t_start_out   = config.get_float_value("tStartOut")
+t_out        = config.get_float_value("tOut")
+t_end        = config.get_float_value("tEnd") # total time of simulation
+
+t_start_averaging     =  config.get_float_value("tStartAveraging")
+t_start_tmp_averaging  =  config.get_float_value("tStartTmpAveraging")
+t_averaging          =  config.get_float_value("tAveraging")
+t_start_out_probe      =  config.get_float_value("tStartOutProbe")
+t_out_probe           =  config.get_float_value("tOutProbe")
+
+#%%
+length = np.array([6,4,1])*boundary_layer_height
+dx = boundary_layer_height/nodes_per_height
+dt = dx * mach / (np.sqrt(3) * velocity)
+velocity_LB = velocity * dt / dx # LB units
+viscosity_LB = viscosity * dt / (dx * dx) # LB units
+pressure_gradient = u_star * u_star / boundary_layer_height
+pressure_gradient_LB = pressure_gradient * (dt*dt)/dx
+
+logger.vf_log_info(f"velocity  [dx/dt] = {velocity_LB}")
+logger.vf_log_info(f"dt   = {dt}")
+logger.vf_log_info(f"dx   = {dx}")
+logger.vf_log_info(f"viscosity [10^8 dx^2/dt] = {viscosity_LB*1e8}")
+logger.vf_log_info(f"u* /(dx/dt) = {u_star*dt/dx}")
+logger.vf_log_info(f"dpdx  = {pressure_gradient}")
+logger.vf_log_info(f"dpdx /(dx/dt^2) = {pressure_gradient_LB}")
+    
+#%%
 
 #%%
-para.set_devices([0])
 para.set_output_prefix(sim_name)
-para.set_output_path(str(output_path))
-para.set_f_name(para.get_output_path() + "/" + para.get_output_prefix())
 para.set_print_files(True)
-para.set_max_level(1)
-#%%
-para.set_velocity(velocity_lb)
-para.set_viscosity(viscosity_lb)    
+
+para.set_forcing(pressure_gradient_LB, 0, 0)
+para.set_velocity_LB(velocity_LB)
+para.set_viscosity_LB(viscosity_LB)    
 para.set_velocity_ratio(dx/dt)
 para.set_viscosity_ratio(dx*dx/dt)
-para.set_use_AMD(use_AMD)
+para.set_density_ratio(1.0)
 
-para.set_main_kernel("TurbulentViscosityCumulantK17CompChim" if para.get_use_AMD() else "CummulantK17CompChim")
+para.set_main_kernel("TurbulentViscosityCumulantK17CompChim")
 
-para.set_SGS_constant(0.083)
+para.set_timestep_start_out(int(t_start_out/dt))
+para.set_timestep_out(int(t_out/dt))
+para.set_timestep_end(int(t_end/dt))
+para.set_is_body_force(config.get_bool_value("bodyForce"))
+#%%
+tm_factory = gpu.TurbulenceModelFactory(para)
+tm_factory.read_config_file(config)
+#%%
+grid_builder.add_coarse_grid(0.0, 0.0, 0.0, *length, dx)
+grid_builder.set_periodic_boundary_condition(not read_precursor, True, False)
+grid_builder.build_grids(basics.LbmOrGks.LBM, False)
 
+sampling_offset = 2
+if read_precursor:
+    precursor = gpu.create_file_collection(precursor_directory + "/precursor", gpu.FileType.VTK)
+    grid_builder.set_precursor_boundary_condition(gpu.SideType.MX, precursor, nTReadPrecursor, 0, 0, 0)
+
+grid_builder.set_stress_boundary_condition(gpu.SideType.MZ, 0, 0, 1, sampling_offset, z0/dx)
+para.set_has_wall_monitor(True)
+grid_builder.set_slip_boundary_condition(gpu.SideType.PZ, 0, 0, -1)
+
+if read_precursor:
+    grid_builder.set_pressure_boundary_condition(gpu.SideType.PX, 0)
+bc_factory.set_stress_boundary_condition(gpu.StressBC.StressPressureBounceBack)
+bc_factory.set_slip_boundary_condition(gpu.SlipBC.SlipBounceBack) 
+bc_factory.set_pressure_boundary_condition(gpu.PressureBC.OutflowNonReflective)
+if read_precursor: bc_factory.set_precursor_boundary_condition(gpu.PrecursorBC.DistributionsPrecursor if use_distributions else gpu.PrecursorBC.VelocityPrecursor)  # use_distributions is only assigned when a precursor is configured
+para.set_outflow_pressure_correction_factor(0.0); 
+#%%
 def init_func(coord_x, coord_y, coord_z):
     return [
         0.0, 
-        (u_star/kappa*np.log(max(coord_z/z_0,0)+1) + 2*np.sin(np.pi*16*coord_x/length[0])*np.sin(np.pi*8*coord_z/length[2]))/((coord_z/reference_height)**2+0.1)*dt/dx, 
-        2*np.sin(np.pi*16*coord_x/length[0])*np.sin(np.pi*8*coord_z/length[2])/((coord_z/reference_height)**2+0.1)*dt/dx, 
-        8*u_star/kappa*(np.sin(np.pi*8*coord_y/reference_height)*np.sin(np.pi*8*coord_z/reference_height)+np.sin(np.pi*8*coord_x/length[0]))/((length[2]/2-coord_z)**2+0.1)*dt/dx
-        ]
-
+        (u_star/0.4 * np.log(np.maximum(coord_z,z0)/z0) + 2.0*np.sin(np.pi*16*coord_x/length[0])*np.sin(np.pi*8*coord_z/boundary_layer_height)/(np.square(coord_z/boundary_layer_height)+1))  * dt / dx, 
+        2.0*np.sin(np.pi*16.*coord_x/length[0])*np.sin(np.pi*8.*coord_z/boundary_layer_height)/(np.square(coord_z/boundary_layer_height)+1.)  * dt / dx, 
+        8.0*u_star/0.4*(np.sin(np.pi*8.0*coord_y/boundary_layer_height)*np.sin(np.pi*8.0*coord_z/boundary_layer_height)+np.sin(np.pi*8.0*coord_x/length[0]))/(np.square(length[2]/2.0-coord_z)+1.) * dt / dx]
 para.set_initial_condition(init_func)
-para.set_t_out(int(t_out/dt))
-para.set_t_end(int(t_end/dt))
-para.set_is_body_force(True)
-para.set_has_wall_model_monitor(True)
-
 
-grid_builder.add_coarse_grid(0.0, 0.0, 0.0, *length, dx)
-grid_builder.set_periodic_boundary_condition(True, True, False)
-grid_builder.build_grids(basics.LbmOrGks.LBM, False)
 #%%
-sampling_offset = 2
-grid_builder.set_stress_boundary_condition(gpu.SideType.MZ, 0.0, 0.0, 1.0, sampling_offset, z_0/dx)
-grid_builder.set_slip_boundary_condition(gpu.SideType.PZ, 0.0, 0.0, 0.0)
+planar_average_probe = gpu.probes.PlanarAverageProbe("horizontalPlanes", para.get_output_path(), 0, int(t_start_tmp_averaging/dt), int(t_averaging/dt) , int(t_start_out_probe/dt), int(t_out_probe/dt), 'z')
+planar_average_probe.add_all_available_statistics()
+planar_average_probe.set_file_name_to_n_out()
+para.add_probe(planar_average_probe)
+#%%
+wall_model_probe = gpu.probes.WallModelProbe("wallModelProbe", para.get_output_path(), 0, int(t_start_tmp_averaging/dt), int(t_averaging/dt/4), int(t_start_out_probe/dt), int(t_out_probe/dt))
+wall_model_probe.add_all_available_statistics()
+wall_model_probe.set_file_name_to_n_out()
+wall_model_probe.set_force_output_to_stress(True)
+if para.get_is_body_force():
+    wall_model_probe.set_evaluate_pressure_gradient(True)
+para.add_probe(wall_model_probe)
+
+plane_locs = [100,]
+if read_precursor: plane_locs.extend([1000, 1500, 2000, 2500, 0])
+
+for n_probe, probe_pos in enumerate(plane_locs):
+    plane_probe = gpu.probes.PlaneProbe(f"planeProbe_{n_probe+1}", para.get_output_path(), int(t_start_averaging/dt), 10, int(t_start_out_probe/dt), int(t_out_probe/dt))
+    plane_probe.set_probe_plane(probe_pos, 0, 0, dx, length[1], length[2])
+    plane_probe.add_all_available_statistics()
+    para.add_probe(plane_probe)
+
+if write_precursor:
+    precursor_writer = gpu.PrecursorWriter("precursor", para.get_output_path() + precursor_directory, pos_x_precursor, 0,length[1], 0, length[2], t_start_precursor/dt, nTWritePrecursor, gpu.OutputVariable.Distributions if use_distributions else gpu.OutputVariable.Velocities)
+    para.add_probe(precursor_writer)
 
 #%%
 cuda_memory_manager = gpu.CudaMemoryManager(para)
-grid_generator = gpu.GridProvider.make_grid_generator(grid_builder, para, cuda_memory_manager, comm)
-
+grid_generator = gpu.GridProvider.make_grid_generator(grid_builder, para, cuda_memory_manager, communicator)
 #%%
-wall_probe = gpu.probes.WallModelProbe("wallModelProbe", str(output_path), int(t_start_averaging/dt), int(t_start_tmp_averaging/dt), int(t_averaging/dt/4), int(t_start_out_probe/dt), int(t_out_probe/dt))
-wall_probe.add_all_available_statistics()
-wall_probe.set_file_name_to_n_out()
-wall_probe.set_force_output_to_stress(True)
-if para.get_is_body_force():
-    wall_probe.set_evaluate_pressure_gradient(True)
-planar_probe = gpu.probes.PlanarAverageProbe("planarAverageProbe", str(output_path), int(t_start_averaging/dt), int(t_start_tmp_averaging/dt), int(t_averaging/dt), int(t_start_out_probe/dt), int(t_out_probe/dt), "z")
-para.add_probe(wall_probe)
-
 #%%
-sim = gpu.Simulation(para, cuda_memory_manager, comm, grid_generator)
+sim = gpu.Simulation(para, cuda_memory_manager, communicator, grid_generator, bc_factory, tm_factory)
 #%%
 sim.run()
 MPI.Finalize()
\ No newline at end of file
diff --git a/Python/boundary_layer/config.txt b/Python/boundary_layer/config.txt
deleted file mode 100644
index e4c778c4cc048f54c0a32310e6bf4a7343a263fa..0000000000000000000000000000000000000000
--- a/Python/boundary_layer/config.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-Path = .
-GridPath = .
diff --git a/Python/boundary_layer/configBoundaryLayer.txt b/Python/boundary_layer/configBoundaryLayer.txt
new file mode 100644
index 0000000000000000000000000000000000000000..83e7861a5fb85ea800d187699f1c6c1409422f0a
--- /dev/null
+++ b/Python/boundary_layer/configBoundaryLayer.txt
@@ -0,0 +1,42 @@
+##################################################
+#information for writing
+##################################################
+Path = .
+##################################################
+#information for reading
+##################################################
+GridPath = .
+##################################################
+Devices = 0 
+##################################################
+tStartOut           = 0
+tOut                = 100000
+tEnd                = 300000
+##################################################
+tStartAveraging     = 0
+tStartTmpAveraging  = 100000
+tAveraging          = 200
+tStartOutProbe      = 0
+tOutProbe           = 1000 
+##################################################
+Ma = 0.1
+nz = 96 
+
+bodyForce = true
+UseAMD = true
+SGSconstant = 0.2
+QuadricLimiterP = 100000.0
+QuadricLimiterM = 100000.0
+QuadricLimiterD = 100000.0
+
+##################################################
+readPrecursor = false
+nTimestepsReadPrecursor = 10
+precursorFile = precursor/Precursor
+
+##################################################
+writePrecursor = false
+nTimestepsWritePrecursor = 10
+
+tStartPrecursor = 100
+posXPrecursor = 3000
\ No newline at end of file
diff --git a/apps/gpu/LBM/ActuatorLine/ActuatorLine.cpp b/apps/gpu/LBM/ActuatorLine/ActuatorLine.cpp
index baa8424c8b1ae6f04c41ec0de52f8c1fd0a9e1d6..a66f0da22edc6268c39d7856307ad5ad91658414 100644
--- a/apps/gpu/LBM/ActuatorLine/ActuatorLine.cpp
+++ b/apps/gpu/LBM/ActuatorLine/ActuatorLine.cpp
@@ -28,12 +28,14 @@
 #include "GridGenerator/grid/GridBuilder/LevelGridBuilder.h"
 #include "GridGenerator/grid/GridBuilder/MultipleGridBuilder.h"
 #include "GridGenerator/grid/BoundaryConditions/Side.h"
+#include "GridGenerator/grid/BoundaryConditions/BoundaryCondition.h"
+
 #include "GridGenerator/grid/GridFactory.h"
 
 #include "GridGenerator/io/SimulationFileWriter/SimulationFileWriter.h"
 #include "GridGenerator/io/GridVTKWriter/GridVTKWriter.h"
-#include "GridGenerator/io/STLReaderWriter/STLReader.h"
-#include "GridGenerator/io/STLReaderWriter/STLWriter.h"
+#include "GridGenerator/VelocitySetter/VelocitySetter.h"
+
 
 //////////////////////////////////////////////////////////////////////////
 
@@ -49,6 +51,7 @@
 #include "VirtualFluids_GPU/PreCollisionInteractor/Probes/PointProbe.h"
 #include "VirtualFluids_GPU/PreCollisionInteractor/Probes/PlaneProbe.h"
 #include "VirtualFluids_GPU/Factories/BoundaryConditionFactory.h"
+#include "VirtualFluids_GPU/TurbulenceModels/TurbulenceModelFactory.h"
 
 #include "VirtualFluids_GPU/GPU/CudaMemoryManager.h"
 
@@ -99,7 +102,6 @@ void multipleLevel(const std::string& configPath)
     vf::gpu::Communicator& communicator = vf::gpu::Communicator::getInstance();
 
     auto gridFactory = GridFactory::make();
-    gridFactory->setTriangularMeshDiscretizationMethod(TriangularMeshDiscretizationMethod::POINT_IN_OBJECT);
     auto gridBuilder = MultipleGridBuilder::makeShared(gridFactory);
 
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -173,7 +175,10 @@ void multipleLevel(const std::string& configPath)
     gridBuilder->setPressureBoundaryCondition(SideType::PX, 0.0);
 
     bcFactory.setVelocityBoundaryCondition(BoundaryConditionFactory::VelocityBC::VelocityAndPressureCompressible);
-    bcFactory.setPressureBoundaryCondition(BoundaryConditionFactory::PressureBC::PressureNonEquilibriumCompressible);
+    bcFactory.setPressureBoundaryCondition(BoundaryConditionFactory::PressureBC::OutflowNonReflectivePressureCorrection);
+
+    SPtr<TurbulenceModelFactory> tmFactory = std::make_shared<TurbulenceModelFactory>(para);
+    tmFactory->readConfigFile(config);
 
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -192,28 +197,31 @@ void multipleLevel(const std::string& configPath)
     actuator_farm->addTurbine(turbPos[0], turbPos[1], turbPos[2], reference_diameter, omega, 0, 0, bladeRadii);
     para->addActuator( actuator_farm );
 
-    // SPtr<PointProbe> pointProbe = SPtr<PointProbe>( new PointProbe("pointProbe", para->getOutputPath(), 100, 1, 500, 100) );
-    // std::vector<real> probeCoordsX = {reference_diameter,2*reference_diameter,5*reference_diameter};
-    // std::vector<real> probeCoordsY = {3*reference_diameter,3*reference_diameter,3*reference_diameter};
-    // std::vector<real> probeCoordsZ = {3*reference_diameter,3*reference_diameter,3*reference_diameter};
-    // pointProbe->addProbePointsFromList(probeCoordsX, probeCoordsY, probeCoordsZ);
-    // // pointProbe->addProbePointsFromXNormalPlane(2*D, 0.0, 0.0, L_y, L_z, (uint)L_y/dx, (uint)L_z/dx);
+    SPtr<ActuatorLine> actuator_line = std::make_shared<ActuatorLine>(nBlades, density, nBladeNodes, epsilon, turbPos[0], turbPos[1], turbPos[2], reference_diameter, level, dt, dx, true);
+    para->addActuator( actuator_line );
+
+    SPtr<PointProbe> pointProbe = std::make_shared<PointProbe>("pointProbe", para->getOutputPath(), 100, 1, 500, 100);
+    std::vector<real> probeCoordsX = {reference_diameter,2*reference_diameter,5*reference_diameter};
+    std::vector<real> probeCoordsY = {3*reference_diameter,3*reference_diameter,3*reference_diameter};
+    std::vector<real> probeCoordsZ = {3*reference_diameter,3*reference_diameter,3*reference_diameter};
+    pointProbe->addProbePointsFromList(probeCoordsX, probeCoordsY, probeCoordsZ);
+    // pointProbe->addProbePointsFromXNormalPlane(2*D, 0.0, 0.0, L_y, L_z, (uint)L_y/dx, (uint)L_z/dx);
 
-    // pointProbe->addStatistic(Statistic::Means);
-    // pointProbe->addStatistic(Statistic::Variances);
-    // para->addProbe( pointProbe );
+    pointProbe->addStatistic(Statistic::Means);
+    pointProbe->addStatistic(Statistic::Variances);
+    para->addProbe( pointProbe );
 
-    // SPtr<PlaneProbe> planeProbe = SPtr<PlaneProbe>( new PlaneProbe("planeProbe", para->getOutputPath(), 100, 500, 100, 100) );
-    // planeProbe->setProbePlane(5*reference_diameter, 0, 0, dx, L_y, L_z);
-    // planeProbe->addStatistic(Statistic::Means);
-    // para->addProbe( planeProbe );
+    SPtr<PlaneProbe> planeProbe = std::make_shared<PlaneProbe>("planeProbe", para->getOutputPath(), 100, 500, 100, 100);
+    planeProbe->setProbePlane(5*reference_diameter, 0, 0, dx, L_y, L_z);
+    planeProbe->addStatistic(Statistic::Means);
+    para->addProbe( planeProbe );
 
 
     auto cudaMemoryManager = std::make_shared<CudaMemoryManager>(para);
 
     auto gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemoryManager, communicator);
 
-    Simulation sim(para, cudaMemoryManager, communicator, *gridGenerator, &bcFactory);
+    Simulation sim(para, cudaMemoryManager, communicator, *gridGenerator, &bcFactory, tmFactory);
     sim.run();
 }
 
diff --git a/apps/gpu/LBM/BoundaryLayer/BoundaryLayer.cpp b/apps/gpu/LBM/BoundaryLayer/BoundaryLayer.cpp
index 991025b649d69305c030fe2f1dd1763a2137af9b..1cd1ba068f6ee184f2550d13bf8b4896e5b9ff63 100644
--- a/apps/gpu/LBM/BoundaryLayer/BoundaryLayer.cpp
+++ b/apps/gpu/LBM/BoundaryLayer/BoundaryLayer.cpp
@@ -19,6 +19,7 @@
 #include "Core/VectorTypes.h"
 
 #include <basics/config/ConfigurationFile.h>
+#include "lbm/constants/NumericConstants.h"
 
 #include <logger/Logger.h>
 
@@ -28,12 +29,13 @@
 #include "GridGenerator/grid/GridBuilder/LevelGridBuilder.h"
 #include "GridGenerator/grid/GridBuilder/MultipleGridBuilder.h"
 #include "GridGenerator/grid/BoundaryConditions/Side.h"
+#include "GridGenerator/grid/BoundaryConditions/BoundaryCondition.h"
+
 #include "GridGenerator/grid/GridFactory.h"
 
 #include "GridGenerator/io/SimulationFileWriter/SimulationFileWriter.h"
 #include "GridGenerator/io/GridVTKWriter/GridVTKWriter.h"
-#include "GridGenerator/io/STLReaderWriter/STLReader.h"
-#include "GridGenerator/io/STLReaderWriter/STLWriter.h"
+#include "GridGenerator/VelocitySetter/VelocitySetter.h"
 
 //////////////////////////////////////////////////////////////////////////
 
@@ -44,11 +46,11 @@
 #include "VirtualFluids_GPU/DataStructureInitializer/GridReaderFiles/GridReader.h"
 #include "VirtualFluids_GPU/Parameter/Parameter.h"
 #include "VirtualFluids_GPU/Output/FileWriter.h"
-#include "VirtualFluids_GPU/PreCollisionInteractor/ActuatorLine.h"
 #include "VirtualFluids_GPU/PreCollisionInteractor/Probes/PointProbe.h"
 #include "VirtualFluids_GPU/PreCollisionInteractor/Probes/PlaneProbe.h"
 #include "VirtualFluids_GPU/PreCollisionInteractor/Probes/PlanarAverageProbe.h"
 #include "VirtualFluids_GPU/PreCollisionInteractor/Probes/WallModelProbe.h"
+#include "VirtualFluids_GPU/PreCollisionInteractor/PrecursorWriter.h"
 #include "VirtualFluids_GPU/Factories/BoundaryConditionFactory.h"
 #include "VirtualFluids_GPU/TurbulenceModels/TurbulenceModelFactory.h"
 
@@ -60,8 +62,9 @@
 
 std::string path(".");
 
-std::string simulationName("BoundayLayer");
+std::string simulationName("BoundaryLayer");
 
+using namespace vf::lbm::constant;
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -100,24 +103,48 @@ void multipleLevel(const std::string& configPath)
 
     LbmOrGks lbmOrGks = LBM;
 
-    const real H = 1000.0; // boundary layer height in m
+    const real H = config.getValue("boundaryLayerHeight", 1000.0); // boundary layer height in m
 
     const real L_x = 6*H;
     const real L_y = 4*H;
-    const real L_z = 1*H;
+    const real L_z = H;
+
+    const real z0  = config.getValue("z0", 0.1f); // roughness length in m
+    const real u_star = config.getValue("u_star", 0.4f); //friction velocity in m/s
+    const real kappa = config.getValue("vonKarmanConstant", 0.4f); // von Karman constant
 
-    const real z0  = 0.1; // roughness length in m
-    const real u_star = 0.4; //friction velocity in m/s
-    const real kappa = 0.4; // von Karman constant
+    const real viscosity = config.getValue("viscosity", 1.56e-5f);
 
-    const real viscosity = 1.56e-5;
+    const real velocity  = 0.5f*u_star/kappa*log(H/z0+1.f); //0.5 times max mean velocity at the top in m/s
 
-    const real velocity  = 0.5*u_star/kappa*log(L_z/z0); //0.5 times max mean velocity at the top in m/s
+    const real mach = config.getValue<real>("Ma", 0.1);
 
-    const real mach = config.contains("Ma")? config.getValue<real>("Ma"): 0.1;
+    const uint nodes_per_H = config.getValue<uint>("nz", 64);
 
-    const uint nodes_per_H = config.contains("nz")? config.getValue<uint>("nz"): 64;
+    const bool writePrecursor = config.getValue("writePrecursor", false);
+    bool useDistributions = false; // must be initialised: it is read when selecting the precursor BC even if no precursor is read or written
+    std::string precursorDirectory;
+    int nTWritePrecursor; real tStartPrecursor, posXPrecursor;
+    if(writePrecursor)
+    {
+        nTWritePrecursor      = config.getValue<int>("nTimestepsWritePrecursor");
+        tStartPrecursor      = config.getValue<real>("tStartPrecursor");
+        posXPrecursor        = config.getValue<real>("posXPrecursor");
+        useDistributions     = config.getValue<bool>("useDistributions", false);
+        precursorDirectory = config.getValue<std::string>("precursorDirectory");
 
+
+    }
+
+    const bool readPrecursor = config.getValue("readPrecursor", false);
+    int nTReadPrecursor;
+    if(readPrecursor)
+    {
+        nTReadPrecursor = config.getValue<int>("nTimestepsReadPrecursor");
+        precursorDirectory = config.getValue<std::string>("precursorDirectory");
+        useDistributions     = config.getValue<bool>("useDistributions", false);
+
+    }
     // all in s
     const float tStartOut   = config.getValue<real>("tStartOut");
     const float tOut        = config.getValue<real>("tOut");
@@ -130,7 +157,7 @@ void multipleLevel(const std::string& configPath)
     const float tOutProbe           =  config.getValue<real>("tOutProbe");
 
 
-    const real dx = L_z/real(nodes_per_H);
+    const real dx = H/real(nodes_per_H);
 
     const real dt = dx * mach / (sqrt(3) * velocity);
 
@@ -172,11 +199,8 @@ void multipleLevel(const std::string& configPath)
 
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-    SPtr<TurbulenceModelFactory> tmFactory = SPtr<TurbulenceModelFactory>( new TurbulenceModelFactory(para) );
+    SPtr<TurbulenceModelFactory> tmFactory = std::make_shared<TurbulenceModelFactory>(para);
     tmFactory->readConfigFile( config );
-    
-    // tmFactory->setTurbulenceModel(TurbulenceModel::AMD);
-    // tmFactory->setModelConstant(config.getValue<real>("SGSconstant"));
 
     /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -187,38 +211,64 @@ void multipleLevel(const std::string& configPath)
     // gridBuilder->addGrid( new Cuboid( 0.0, 0.0, 0.0, L_x,  L_y,  0.3*L_z) , 1 );
     // para->setMaxLevel(2);
 
-    gridBuilder->setPeriodicBoundaryCondition(true, true, false);
+    gridBuilder->setPeriodicBoundaryCondition(!readPrecursor, true, false);
 
 	gridBuilder->buildGrids(lbmOrGks, false); // buildGrids() has to be called before setting the BCs!!!!
 
     uint samplingOffset = 2;
-    // gridBuilder->setVelocityBoundaryCondition(SideType::MZ, 0.0, 0.0, 0.0);
-    gridBuilder->setStressBoundaryCondition(SideType::MZ,
+    
+    if(readPrecursor)
+    {
+        auto precursor = createFileCollection(precursorDirectory + "/precursor", FileType::VTK);
+        gridBuilder->setPrecursorBoundaryCondition(SideType::MX, precursor, nTReadPrecursor);
+
+        gridBuilder->setStressBoundaryCondition(SideType::MZ,
                                             0.0, 0.0, 1.0,              // wall normals
                                             samplingOffset, z0/dx);     // wall model settinng
-    para->setHasWallModelMonitor(true);
-    bcFactory.setStressBoundaryCondition(BoundaryConditionFactory::StressBC::StressPressureBounceBack);
+        para->setHasWallModelMonitor(true);
+        
+        gridBuilder->setSlipBoundaryCondition(SideType::PZ,  0.0f,  0.0f, -1.0f);
+
+        
+        gridBuilder->setPressureBoundaryCondition(SideType::PX, 0.f);
+    } 
+    else
+    {
+        gridBuilder->setSlipBoundaryCondition(SideType::PZ,  0.0,  0.0, -1.0);
 
-    gridBuilder->setSlipBoundaryCondition(SideType::PZ,  0.0,  0.0, 0.0);
+        gridBuilder->setStressBoundaryCondition(SideType::MZ,
+                                            0.0, 0.0, 1.0,              // wall normals
+                                            samplingOffset, z0/dx);     // wall model settinng
+        para->setHasWallModelMonitor(true);
+    }
+
+
+
+    bcFactory.setStressBoundaryCondition(BoundaryConditionFactory::StressBC::StressPressureBounceBack);
     bcFactory.setSlipBoundaryCondition(BoundaryConditionFactory::SlipBC::SlipBounceBack); 
-    
+    bcFactory.setPressureBoundaryCondition(BoundaryConditionFactory::PressureBC::OutflowNonReflective);
+    bcFactory.setPrecursorBoundaryCondition(useDistributions ? BoundaryConditionFactory::PrecursorBC::DistributionsPrecursor : BoundaryConditionFactory::PrecursorBC::VelocityPrecursor);
+    para->setOutflowPressureCorrectionFactor(0.0); 
+
+
 
-    real cPi = 3.1415926535897932384626433832795;
     para->setInitialCondition([&](real coordX, real coordY, real coordZ, real &rho, real &vx, real &vy, real &vz) {
         rho = (real)0.0;
-        vx  = (u_star/0.4 * log(coordZ/z0) + 2.0*sin(cPi*16.0f*coordX/L_x)*sin(cPi*8.0f*coordZ/H)/(pow(coordZ/H,c2o1)+c1o1))  * dt / dx; 
-        vy  = 2.0*sin(cPi*16.0f*coordX/L_x)*sin(cPi*8.0f*coordZ/H)/(pow(coordZ/H,c2o1)+c1o1)  * dt / dx; 
-        vz  = 8.0*u_star/0.4*(sin(cPi*8.0*coordY/H)*sin(cPi*8.0*coordZ/H)+sin(cPi*8.0*coordX/L_x))/(pow(L_z/2.0-coordZ, c2o1)+c1o1) * dt / dx;
+        vx  = rho = c0o1;
+        vx  = (u_star/c4o10 * log(coordZ/z0+c1o1) + c2o1*sin(cPi*c16o1*coordX/L_x)*sin(cPi*c8o1*coordZ/H)/(pow(coordZ/H,c2o1)+c1o1)) * dt/dx; 
+        vy  = c2o1*sin(cPi*c16o1*coordX/L_x)*sin(cPi*c8o1*coordZ/H)/(pow(coordZ/H,c2o1)+c1o1) * dt/dx; 
+        vz  = c8o1*u_star/c4o10*(sin(cPi*c8o1*coordY/H)*sin(cPi*c8o1*coordZ/H)+sin(cPi*c8o1*coordX/L_x))/(pow(c1o2*L_z-coordZ, c2o1)+c1o1) * dt/dx;
     });
+
+
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-    SPtr<PlanarAverageProbe> planarAverageProbe = SPtr<PlanarAverageProbe>( new PlanarAverageProbe("planeProbe", para->getOutputPath(), tStartAveraging/dt, tStartTmpAveraging/dt, tAveraging/dt , tStartOutProbe/dt, tOutProbe/dt, 'z') );
+    SPtr<PlanarAverageProbe> planarAverageProbe = SPtr<PlanarAverageProbe>( new PlanarAverageProbe("horizontalPlanes", para->getOutputPath(), 0, tStartTmpAveraging/dt, tAveraging/dt , tStartOutProbe/dt, tOutProbe/dt, 'z') );
     planarAverageProbe->addAllAvailableStatistics();
     planarAverageProbe->setFileNameToNOut();
     para->addProbe( planarAverageProbe );
 
-    para->setHasWallModelMonitor(true);
-    SPtr<WallModelProbe> wallModelProbe = SPtr<WallModelProbe>( new WallModelProbe("wallModelProbe", para->getOutputPath(), tStartAveraging/dt, tStartTmpAveraging/dt, tAveraging/dt/4.0 , tStartOutProbe/dt, tOutProbe/dt) );
+    SPtr<WallModelProbe> wallModelProbe = SPtr<WallModelProbe>( new WallModelProbe("wallModelProbe", para->getOutputPath(), 0, tStartTmpAveraging/dt, tAveraging/dt/4.0 , tStartOutProbe/dt, tOutProbe/dt) );
     wallModelProbe->addAllAvailableStatistics();
     wallModelProbe->setFileNameToNOut();
     wallModelProbe->setForceOutputToStress(true);
@@ -226,6 +276,46 @@ void multipleLevel(const std::string& configPath)
         wallModelProbe->setEvaluatePressureGradient(true);
     para->addProbe( wallModelProbe );
 
+    SPtr<PlaneProbe> planeProbe1 = SPtr<PlaneProbe>( new PlaneProbe("planeProbe_1", para->getOutputPath(), tStartAveraging/dt, 10, tStartOutProbe/dt, tOutProbe/dt) );
+    planeProbe1->setProbePlane(100.0, 0.0, 0, dx, L_y, L_z);
+    planeProbe1->addAllAvailableStatistics();
+    para->addProbe( planeProbe1 );
+
+    if(readPrecursor)
+    {
+        SPtr<PlaneProbe> planeProbe2 = SPtr<PlaneProbe>( new PlaneProbe("planeProbe_2", para->getOutputPath(), tStartAveraging/dt, 10, tStartOutProbe/dt, tOutProbe/dt) );
+        planeProbe2->setProbePlane(1000.0, 0.0, 0, dx, L_y, L_z);
+        planeProbe2->addAllAvailableStatistics();
+        para->addProbe( planeProbe2 );
+
+        SPtr<PlaneProbe> planeProbe3 = SPtr<PlaneProbe>( new PlaneProbe("planeProbe_3", para->getOutputPath(), tStartAveraging/dt, 10, tStartOutProbe/dt, tOutProbe/dt) );
+        planeProbe3->setProbePlane(1500.0, 0.0, 0, dx, L_y, L_z);
+        planeProbe3->addAllAvailableStatistics();
+        para->addProbe( planeProbe3 );
+
+        SPtr<PlaneProbe> planeProbe4 = SPtr<PlaneProbe>( new PlaneProbe("planeProbe_4", para->getOutputPath(), tStartAveraging/dt, 10, tStartOutProbe/dt, tOutProbe/dt) );
+        planeProbe4->setProbePlane(2000.0, 0.0, 0, dx, L_y, L_z);
+        planeProbe4->addAllAvailableStatistics();
+        para->addProbe( planeProbe4 );
+
+        SPtr<PlaneProbe> planeProbe5 = SPtr<PlaneProbe>( new PlaneProbe("planeProbe_5", para->getOutputPath(), tStartAveraging/dt, 10, tStartOutProbe/dt, tOutProbe/dt) );
+        planeProbe5->setProbePlane(2500.0, 0.0, 0, dx, L_y, L_z);
+        planeProbe5->addAllAvailableStatistics();
+        para->addProbe( planeProbe5 );
+
+        SPtr<PlaneProbe> planeProbe6 = SPtr<PlaneProbe>( new PlaneProbe("planeProbe_6", para->getOutputPath(), tStartAveraging/dt, 10, tStartOutProbe/dt, tOutProbe/dt) );
+        planeProbe6->setProbePlane(0.0, L_y/2.0, 0, L_x, dx, L_z);
+        planeProbe6->addAllAvailableStatistics();
+        para->addProbe( planeProbe6 );
+    }
+
+
+    if(writePrecursor)
+    {
+        SPtr<PrecursorWriter> precursorWriter = std::make_shared<PrecursorWriter>("precursor", para->getOutputPath()+precursorDirectory, posXPrecursor, 0, L_y, 0, L_z, tStartPrecursor/dt, nTWritePrecursor, useDistributions? OutputVariable::Distributions: OutputVariable::Velocities);
+        para->addProbe(precursorWriter);
+    }
+
     auto cudaMemoryManager = std::make_shared<CudaMemoryManager>(para);
     auto gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemoryManager, communicator);
 
diff --git a/apps/gpu/LBM/BoundaryLayer/configBoundaryLayer.txt b/apps/gpu/LBM/BoundaryLayer/configBoundaryLayer.txt
index a489f0ab89738a193b16fee41c212a5943f6525d..83e7861a5fb85ea800d187699f1c6c1409422f0a 100644
--- a/apps/gpu/LBM/BoundaryLayer/configBoundaryLayer.txt
+++ b/apps/gpu/LBM/BoundaryLayer/configBoundaryLayer.txt
@@ -7,7 +7,7 @@ Path = .
 ##################################################
 GridPath = .
 ##################################################
-Devices = 1 
+Devices = 0 
 ##################################################
 tStartOut           = 0
 tOut                = 100000
@@ -28,3 +28,15 @@ SGSconstant = 0.2
 QuadricLimiterP = 100000.0
 QuadricLimiterM = 100000.0
 QuadricLimiterD = 100000.0
+
+##################################################
+readPrecursor = false
+nTimestepsReadPrecursor = 10
+precursorFile = precursor/Precursor
+
+##################################################
+writePrecursor = false
+nTimestepsWritePrecursor = 10
+
+tStartPrecursor = 100
+posXPrecursor = 3000
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 8fcb7926102d188b44d8c74084235b6f175edf80..4353b019615408705c1896f632291c17f5720c07 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,2 +1,10 @@
 [build-system]
-requires = ["setuptools", "wheel", "scikit-build"]
\ No newline at end of file
+requires = [
+    "wheel",
+    "cmake>=3.1.0",
+    "setuptools",
+    "setuptools_scm[toml]",
+    "cmake_build_extension"
+]
+build-backend = "setup_builder"
+backend-path = ["utilities"]
\ No newline at end of file
diff --git a/pythonbindings/CMakeLists.txt b/pythonbindings/CMakeLists.txt
index 5a84adef027fdfa2953e016693bb64570e48c1ef..f56b2e89ee89bdac76d2f98773d47948bc360aa2 100644
--- a/pythonbindings/CMakeLists.txt
+++ b/pythonbindings/CMakeLists.txt
@@ -1,24 +1,27 @@
 project(VirtualFluidsPython LANGUAGES CUDA CXX)
 IF(BUILD_VF_GPU)
-    pybind11_add_module(pyfluids src/VirtualFluidsModulesGPU.cpp)
+    pybind11_add_module(python_bindings MODULE src/VirtualFluidsModulesGPU.cpp)
+    set_target_properties(python_bindings PROPERTIES
+    LIBRARY_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/pythonbindings/pyfluids
+    OUTPUT_NAME "bindings")
     set_source_files_properties(src/VirtualFluidsModulesGPU.cpp PROPERTIES LANGUAGE CUDA)
 
-    target_link_libraries(pyfluids PRIVATE GridGenerator VirtualFluids_GPU basics lbmCuda logger)
-    target_include_directories(pyfluids PRIVATE ${VF_THIRD_DIR}/cuda_samples/)
+    target_link_libraries(python_bindings PRIVATE GridGenerator VirtualFluids_GPU basics lbmCuda logger)
+    target_include_directories(python_bindings PRIVATE ${VF_THIRD_DIR}/cuda_samples/)
 
 ENDIF()
 IF(BUILD_VF_CPU)
-    pybind11_add_module(pyfluids src/VirtualFluidsModulesCPU.cpp)
+    pybind11_add_module(python_bindings src/VirtualFluidsModulesCPU.cpp)
     pybind11_add_module(pymuparser src/muParser.cpp)
 
     # TODO: Move this to MuParser CMakeLists.txt
     set_target_properties(muparser PROPERTIES POSITION_INDEPENDENT_CODE ON)
 
-    target_compile_definitions(pyfluids PRIVATE VF_METIS VF_MPI)
+    target_compile_definitions(python_bindings PRIVATE VF_METIS VF_MPI)
     target_compile_definitions(pymuparser PRIVATE VF_METIS VF_MPI)
 
-    target_link_libraries(pyfluids PRIVATE simulationconfig VirtualFluidsCore muparser basics)
+    target_link_libraries(python_bindings PRIVATE simulationconfig VirtualFluidsCore muparser basics)
     target_link_libraries(pymuparser PRIVATE muparser)
 ENDIF()
-target_include_directories(pyfluids PRIVATE ${CMAKE_SOURCE_DIR}/src/)
-target_include_directories(pyfluids PRIVATE ${CMAKE_BINARY_DIR})
\ No newline at end of file
+target_include_directories(python_bindings PRIVATE ${CMAKE_SOURCE_DIR}/src/)
+target_include_directories(python_bindings PRIVATE ${CMAKE_BINARY_DIR})
\ No newline at end of file
diff --git a/pythonbindings/src/VirtualFluidsModulesCPU.cpp b/pythonbindings/src/VirtualFluidsModulesCPU.cpp
index 2fba3da494f568f7d0d0a117a579a45c9c1b9245..9201a8ce9ab2f0e61b64ec0263185e5642feca18 100644
--- a/pythonbindings/src/VirtualFluidsModulesCPU.cpp
+++ b/pythonbindings/src/VirtualFluidsModulesCPU.cpp
@@ -5,7 +5,7 @@ namespace py_bindings
 {
     namespace py = pybind11;
 
-    PYBIND11_MODULE(pyfluids, m)
+    PYBIND11_MODULE(bindings, m)
     {
         cpu::makeModule(m);
     }
diff --git a/pythonbindings/src/VirtualFluidsModulesGPU.cpp b/pythonbindings/src/VirtualFluidsModulesGPU.cpp
index b96971caf381faada76ee676cf60469492d055c2..e0320115e1cf1fcb8c60d19af5a51f3fe92d7562 100644
--- a/pythonbindings/src/VirtualFluidsModulesGPU.cpp
+++ b/pythonbindings/src/VirtualFluidsModulesGPU.cpp
@@ -8,7 +8,7 @@ namespace py_bindings
 {
     namespace py = pybind11;
 
-    PYBIND11_MODULE(pyfluids, m)
+    PYBIND11_MODULE(bindings, m)
     {
         basics::makeModule(m);
         gpu::makeModule(m);
diff --git a/pythonbindings/src/basics/submodules/configuration_file.cpp b/pythonbindings/src/basics/submodules/configuration_file.cpp
index f5a2f87135a17f5eda34a7467d95f9db6b1c21d1..ad30864a41aa6038f3021bdd4d159ca7ee993ec5 100644
--- a/pythonbindings/src/basics/submodules/configuration_file.cpp
+++ b/pythonbindings/src/basics/submodules/configuration_file.cpp
@@ -1,5 +1,5 @@
 #include <pybind11/pybind11.h>
-#include <basics/config/ConfigurationFile.h>
+#include "basics/config/ConfigurationFile.h"
 
 namespace configuration
 {
@@ -9,6 +9,19 @@ namespace configuration
     {
         py::class_<vf::basics::ConfigurationFile>(parentModule, "ConfigurationFile")
         .def(py::init<>())
-        .def("load", &vf::basics::ConfigurationFile::load);
+        .def("load", &vf::basics::ConfigurationFile::load)
+        .def("contains", &vf::basics::ConfigurationFile::contains)
+        .def("get_int_value"   , static_cast<int         (vf::basics::ConfigurationFile::*)(const std::string&) const>(&vf::basics::ConfigurationFile::getValue))
+        .def("get_int_value"   , static_cast<int         (vf::basics::ConfigurationFile::*)(const std::string&, int        ) const>(&vf::basics::ConfigurationFile::getValue))
+        .def("get_uint_value"  , static_cast<uint        (vf::basics::ConfigurationFile::*)(const std::string&) const>(&vf::basics::ConfigurationFile::getValue))
+        .def("get_uint_value"  , static_cast<uint        (vf::basics::ConfigurationFile::*)(const std::string&, uint       ) const>(&vf::basics::ConfigurationFile::getValue))
+        .def("get_float_value" , static_cast<float       (vf::basics::ConfigurationFile::*)(const std::string&) const>(&vf::basics::ConfigurationFile::getValue))
+        .def("get_float_value" , static_cast<float       (vf::basics::ConfigurationFile::*)(const std::string&, float      ) const>(&vf::basics::ConfigurationFile::getValue))
+        .def("get_double_value", static_cast<double      (vf::basics::ConfigurationFile::*)(const std::string&) const>(&vf::basics::ConfigurationFile::getValue))
+        .def("get_double_value", static_cast<double      (vf::basics::ConfigurationFile::*)(const std::string&, double     ) const>(&vf::basics::ConfigurationFile::getValue))
+        .def("get_bool_value"  , static_cast<bool        (vf::basics::ConfigurationFile::*)(const std::string&) const>(&vf::basics::ConfigurationFile::getValue))
+        .def("get_bool_value"  , static_cast<bool        (vf::basics::ConfigurationFile::*)(const std::string&, bool       ) const>(&vf::basics::ConfigurationFile::getValue))
+        .def("get_string_value", static_cast<std::string (vf::basics::ConfigurationFile::*)(const std::string&) const>(&vf::basics::ConfigurationFile::getValue))
+        .def("get_string_value", static_cast<std::string (vf::basics::ConfigurationFile::*)(const std::string&, std::string) const>(&vf::basics::ConfigurationFile::getValue));
     }
 }
\ No newline at end of file
diff --git a/pythonbindings/src/gpu/gpu.cpp b/pythonbindings/src/gpu/gpu.cpp
index c99b59d153e1afc4bad15b74192212a96e45718b..be236654782c5538f9e50f4ead3185c169d7b65c 100644
--- a/pythonbindings/src/gpu/gpu.cpp
+++ b/pythonbindings/src/gpu/gpu.cpp
@@ -6,9 +6,12 @@
 #include "submodules/boundary_conditions.cpp"
 #include "submodules/communicator.cpp"
 #include "submodules/cuda_memory_manager.cpp"
+#include "submodules/probes.cpp"
+#include "submodules/precursor_writer.cpp"
 #include "submodules/grid_provider.cpp"
 #include "submodules/grid_generator.cpp"
-#include "submodules/probes.cpp"
+#include "submodules/turbulence_models.cpp"
+#include "submodules/velocity_setter.cpp"
 
 namespace gpu
 {
@@ -23,11 +26,14 @@ namespace gpu
         actuator_line::makeModule(gpuModule);
         actuator_farm::makeModule(gpuModule);
         boundary_conditions::makeModule(gpuModule);
+        velocity_setter::makeModule(gpuModule);
         communicator::makeModule(gpuModule); 
         cuda_memory_manager::makeModule(gpuModule);
-        grid_provider::makeModule(gpuModule);
         probes::makeModule(gpuModule);
+        precursor_writer::makeModule(gpuModule);
         grid_generator::makeModule(gpuModule);
+        grid_provider::makeModule(gpuModule);
+        turbulence_model::makeModule(gpuModule);
         return gpuModule;
     }
 }
\ No newline at end of file
diff --git a/pythonbindings/src/gpu/submodules/actuator_line.cpp b/pythonbindings/src/gpu/submodules/actuator_line.cpp
index 3207fadbc37df38e53e00adcb9a86f0b8e82ba98..c489654fd093881a068ebbd69294c4bd83847efb 100644
--- a/pythonbindings/src/gpu/submodules/actuator_line.cpp
+++ b/pythonbindings/src/gpu/submodules/actuator_line.cpp
@@ -1,8 +1,10 @@
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 #include <pybind11/numpy.h>
-#include <gpu/VirtualFluids_GPU/PreCollisionInteractor/ActuatorLine.h>
 #include <gpu/VirtualFluids_GPU/PreCollisionInteractor/PreCollisionInteractor.h>
+#include <gpu/VirtualFluids_GPU/PreCollisionInteractor/ActuatorLine.h>
+#include <cstdint>
+
 class PyActuatorLine : public ActuatorLine 
 {
 public:
@@ -12,12 +14,14 @@ public:
         PYBIND11_OVERRIDE_NAME(void, ActuatorLine, "calc_blade_forces", calcBladeForces,); 
     }
 };
+
 namespace actuator_line
 {
     namespace py = pybind11;
 
     void makeModule(py::module_ &parentModule)
     {
+
         using arr = py::array_t<float, py::array::c_style>;
         
         py::class_<ActuatorLine, PreCollisionInteractor, PyActuatorLine, std::shared_ptr<ActuatorLine>>(parentModule, "ActuatorLine", py::dynamic_attr())
@@ -29,7 +33,8 @@ namespace actuator_line
                         const real,
                         int,
                         const real,
-                        const real>(), 
+                        const real,
+                        const bool>(), 
                         "n_blades", 
                         "density", 
                         "n_blade_nodes", 
@@ -38,7 +43,8 @@ namespace actuator_line
                         "diameter", 
                         "level", 
                         "delta_t", 
-                        "delta_x")
+                        "delta_x",
+                        "use_host_arrays")
         .def_property("omega", &ActuatorLine::getOmega, &ActuatorLine::setOmega)
         .def_property("azimuth", &ActuatorLine::getAzimuth, &ActuatorLine::setAzimuth)
         .def_property("yaw", &ActuatorLine::getYaw, &ActuatorLine::setYaw)
@@ -47,6 +53,8 @@ namespace actuator_line
         .def_property_readonly("n_nodes", &ActuatorLine::getNNodes)
         .def_property_readonly("n_indices", &ActuatorLine::getNIndices)
         .def_property_readonly("density", &ActuatorLine::getDensity)
+        .def_property_readonly("delta_t", &ActuatorLine::getDeltaT)
+        .def_property_readonly("delta_x", &ActuatorLine::getDeltaX)
         .def_property_readonly("position_x", &ActuatorLine::getPositionX)
         .def_property_readonly("position_y", &ActuatorLine::getPositionY)
         .def_property_readonly("position_z", &ActuatorLine::getPositionZ)
@@ -61,12 +69,40 @@ namespace actuator_line
         .def("get_blade_forces_x", [](ActuatorLine& al){ return arr({al.getNBlades(), al.getNBladeNodes()}, al.getBladeForcesX()); } )
         .def("get_blade_forces_y", [](ActuatorLine& al){ return arr({al.getNBlades(), al.getNBladeNodes()}, al.getBladeForcesY()); } )
         .def("get_blade_forces_z", [](ActuatorLine& al){ return arr({al.getNBlades(), al.getNBladeNodes()}, al.getBladeForcesZ()); } )
-        .def("set_blade_coords", [](ActuatorLine& al, arr coordsX, arr coordsY, arr coordsZ){ 
-            al.setBladeCoords(static_cast<float *>(coordsX.request().ptr), static_cast<float *>(coordsY.request().ptr), static_cast<float *>(coordsZ.request().ptr)); } )
-        .def("set_blade_velocities", [](ActuatorLine& al, arr velocitiesX, arr velocitiesY, arr velocitiesZ){ 
-            al.setBladeVelocities(static_cast<float *>(velocitiesX.request().ptr), static_cast<float *>(velocitiesY.request().ptr), static_cast<float *>(velocitiesZ.request().ptr)); } )
-        .def("set_blade_forces", [](ActuatorLine& al, arr forcesX, arr forcesY, arr forcesZ){ 
-            al.setBladeForces(static_cast<float *>(forcesX.request().ptr), static_cast<float *>(forcesY.request().ptr), static_cast<float *>(forcesZ.request().ptr)); } )
+        .def("get_blade_coords_x_device", [](ActuatorLine& al) -> intptr_t { return reinterpret_cast<intptr_t>(al.getBladeCoordsXD()); }, py::return_value_policy::reference)
+        .def("get_blade_coords_y_device", [](ActuatorLine& al) -> intptr_t { return reinterpret_cast<intptr_t>(al.getBladeCoordsYD()); }, py::return_value_policy::reference)
+        .def("get_blade_coords_z_device", [](ActuatorLine& al) -> intptr_t { return reinterpret_cast<intptr_t>(al.getBladeCoordsZD()); }, py::return_value_policy::reference)        
+        .def("get_blade_velocities_x_device", [](ActuatorLine& al) -> intptr_t { return reinterpret_cast<intptr_t>(al.getBladeVelocitiesXD()); }, py::return_value_policy::reference)
+        .def("get_blade_velocities_y_device", [](ActuatorLine& al) -> intptr_t { return reinterpret_cast<intptr_t>(al.getBladeVelocitiesYD()); }, py::return_value_policy::reference)
+        .def("get_blade_velocities_z_device", [](ActuatorLine& al) -> intptr_t { return reinterpret_cast<intptr_t>(al.getBladeVelocitiesZD()); }, py::return_value_policy::reference)
+        .def("get_blade_forces_x_device", [](ActuatorLine& al)-> intptr_t { return reinterpret_cast<intptr_t>(al.getBladeForcesXD()); }, py::return_value_policy::reference )
+        .def("get_blade_forces_y_device", [](ActuatorLine& al)-> intptr_t { return reinterpret_cast<intptr_t>(al.getBladeForcesYD()); }, py::return_value_policy::reference )
+        .def("get_blade_forces_z_device", [](ActuatorLine& al)-> intptr_t { return reinterpret_cast<intptr_t>(al.getBladeForcesZD()); }, py::return_value_policy::reference )
+        .def("set_preinit_blade_radii", [](ActuatorLine& al, arr radii){ al.setPreInitBladeRadii(static_cast<float *>(radii.request().ptr)); } )
+        .def("set_blade_coords", [](ActuatorLine& al, arr coordsX, arr coordsY, arr coordsZ)
+        { 
+            al.setBladeCoords(static_cast<float *>(coordsX.request().ptr), static_cast<float *>(coordsY.request().ptr), static_cast<float *>(coordsZ.request().ptr)); 
+        })
+        .def("set_blade_velocities", [](ActuatorLine& al, arr velocitiesX, arr velocitiesY, arr velocitiesZ)
+        { 
+            al.setBladeVelocities(static_cast<float *>(velocitiesX.request().ptr), static_cast<float *>(velocitiesY.request().ptr), static_cast<float *>(velocitiesZ.request().ptr)); 
+        })
+        .def("set_blade_forces", [](ActuatorLine& al, arr forcesX, arr forcesY, arr forcesZ)
+        { 
+            al.setBladeForces(static_cast<float *>(forcesX.request().ptr), static_cast<float *>(forcesY.request().ptr), static_cast<float *>(forcesZ.request().ptr));
+        })
+        // .def("set_blade_coords_device", [](ActuatorLine& al, arr coordsX, arr coordsY, arr coordsZ)
+        // { 
+        //     al.setBladeCoordsD(static_cast<float *>(coordsX.request().ptr), static_cast<float *>(coordsY.request().ptr), static_cast<float *>(coordsZ.request().ptr)); 
+        // })
+        // .def("set_blade_velocities_device", [](ActuatorLine& al, arr velocitiesX, arr velocitiesY, arr velocitiesZ)
+        // { 
+        //     al.setBladeVelocitiesD(static_cast<float *>(velocitiesX.request().ptr), static_cast<float *>(velocitiesY.request().ptr), static_cast<float *>(velocitiesZ.request().ptr)); 
+        // })
+        // .def("set_blade_forces_device", [](ActuatorLine& al, arr forcesX, arr forcesY, arr forcesZ)
+        // { 
+        //     al.setBladeForcesD(static_cast<float *>(forcesX.request().ptr), static_cast<float *>(forcesY.request().ptr), static_cast<float *>(forcesZ.request().ptr)); 
+        // })
         .def("calc_blade_forces", &ActuatorLine::calcBladeForces);
     }
 }
\ No newline at end of file
diff --git a/pythonbindings/src/gpu/submodules/boundary_conditions.cpp b/pythonbindings/src/gpu/submodules/boundary_conditions.cpp
index 8f941a8705c225275d25291205ebdaeef8de5c9e..9ab758ffd6e9fc68e03eef40676508c093567df2 100644
--- a/pythonbindings/src/gpu/submodules/boundary_conditions.cpp
+++ b/pythonbindings/src/gpu/submodules/boundary_conditions.cpp
@@ -1,5 +1,6 @@
 #include <pybind11/pybind11.h>
 #include <gpu/GridGenerator/grid/BoundaryConditions/Side.h>
+#include "gpu/VirtualFluids_GPU/BoundaryConditions/BoundaryConditionFactory.h"
 
 namespace boundary_conditions
 {
@@ -14,7 +15,59 @@ namespace boundary_conditions
         .value("PY", SideType::PY)
         .value("MZ", SideType::MZ)
         .value("PZ", SideType::PZ)
-        .value("GEOMETRY", SideType::GEOMETRY)
-        .export_values();
+        .value("GEOMETRY", SideType::GEOMETRY);
+
+        py::class_<BoundaryConditionFactory>(parentModule, "BoundaryConditionFactory")
+        .def(py::init<>())
+        .def("set_velocity_boundary_condition", &BoundaryConditionFactory::setVelocityBoundaryCondition)
+        .def("set_no_slip_boundary_condition", &BoundaryConditionFactory::setNoSlipBoundaryCondition)
+        .def("set_slip_boundary_condition", &BoundaryConditionFactory::setSlipBoundaryCondition)
+        .def("set_pressure_boundary_condition", &BoundaryConditionFactory::setPressureBoundaryCondition)
+        .def("set_stress_boundary_condition", &BoundaryConditionFactory::setStressBoundaryCondition)
+        .def("set_precursor_boundary_condition", &BoundaryConditionFactory::setPrecursorBoundaryCondition)
+        .def("set_geometry_boundary_condition", &BoundaryConditionFactory::setGeometryBoundaryCondition);
+
+        py::enum_<BoundaryConditionFactory::VelocityBC>(parentModule, "VelocityBC")
+        .value("VelocitySimpleBounceBackCompressible", BoundaryConditionFactory::VelocityBC::VelocitySimpleBounceBackCompressible)
+        .value("VelocityIncompressible", BoundaryConditionFactory::VelocityBC::VelocityIncompressible)
+        .value("VelocityCompressible", BoundaryConditionFactory::VelocityBC::VelocityCompressible)
+        .value("VelocityAndPressureCompressible", BoundaryConditionFactory::VelocityBC::VelocityAndPressureCompressible)
+        .value("NotSpecified", BoundaryConditionFactory::VelocityBC::NotSpecified);
+
+
+        py::enum_<BoundaryConditionFactory::NoSlipBC>(parentModule, "NoSlipBC")
+        .value("NoSlipImplicitBounceBack", BoundaryConditionFactory::NoSlipBC::NoSlipImplicitBounceBack)
+        .value("NoSlipBounceBack", BoundaryConditionFactory::NoSlipBC::NoSlipBounceBack)
+        .value("NoSlipIncompressible", BoundaryConditionFactory::NoSlipBC::NoSlipIncompressible)
+        .value("NoSlipCompressible", BoundaryConditionFactory::NoSlipBC::NoSlipCompressible)
+        .value("NoSlip3rdMomentsCompressible", BoundaryConditionFactory::NoSlipBC::NoSlip3rdMomentsCompressible);
+
+        py::enum_<BoundaryConditionFactory::SlipBC>(parentModule, "SlipBC")
+        .value("SlipIncompressible", BoundaryConditionFactory::SlipBC::SlipIncompressible)
+        .value("SlipCompressible", BoundaryConditionFactory::SlipBC::SlipCompressible)
+        .value("SlipBounceBack", BoundaryConditionFactory::SlipBC::SlipBounceBack)
+        .value("SlipCompressibleTurbulentViscosity", BoundaryConditionFactory::SlipBC::SlipCompressibleTurbulentViscosity)
+        .value("SlipPressureCompressibleTurbulentViscosity", BoundaryConditionFactory::SlipBC::SlipPressureCompressibleTurbulentViscosity)
+        .value("NotSpecified", BoundaryConditionFactory::SlipBC::NotSpecified);
+
+        py::enum_<BoundaryConditionFactory::PressureBC>(parentModule, "PressureBC")
+        .value("PressureEquilibrium", BoundaryConditionFactory::PressureBC::PressureEquilibrium)
+        .value("PressureEquilibrium2", BoundaryConditionFactory::PressureBC::PressureEquilibrium2)
+        .value("PressureNonEquilibriumIncompressible", BoundaryConditionFactory::PressureBC::PressureNonEquilibriumIncompressible)
+        .value("PressureNonEquilibriumCompressible", BoundaryConditionFactory::PressureBC::PressureNonEquilibriumCompressible)
+        .value("OutflowNonReflective", BoundaryConditionFactory::PressureBC::OutflowNonReflective)
+        .value("OutflowNonReflectivePressureCorrection", BoundaryConditionFactory::PressureBC::OutflowNonReflectivePressureCorrection)
+        .value("NotSpecified", BoundaryConditionFactory::PressureBC::NotSpecified);
+
+        py::enum_<BoundaryConditionFactory::StressBC>(parentModule, "StressBC")
+        .value("StressCompressible", BoundaryConditionFactory::StressBC::StressCompressible)
+        .value("StressBounceBack", BoundaryConditionFactory::StressBC::StressBounceBack)
+        .value("StressPressureBounceBack", BoundaryConditionFactory::StressBC::StressPressureBounceBack)
+        .value("NotSpecified", BoundaryConditionFactory::StressBC::NotSpecified);
+
+        py::enum_<BoundaryConditionFactory::PrecursorBC>(parentModule, "PrecursorBC")
+        .value("VelocityPrecursor", BoundaryConditionFactory::PrecursorBC::VelocityPrecursor)
+        .value("DistributionsPrecursor", BoundaryConditionFactory::PrecursorBC::DistributionsPrecursor)
+        .value("NotSpecified", BoundaryConditionFactory::PrecursorBC::NotSpecified);
     }
 }
\ No newline at end of file
diff --git a/pythonbindings/src/gpu/submodules/grid_generator.cpp b/pythonbindings/src/gpu/submodules/grid_generator.cpp
index 579c06c4e00cae9646ced8b554d71631eeb7e793..a62247aa9603f544ffadbe12442597746f75374d 100644
--- a/pythonbindings/src/gpu/submodules/grid_generator.cpp
+++ b/pythonbindings/src/gpu/submodules/grid_generator.cpp
@@ -51,6 +51,7 @@ namespace grid_generator
         .def("set_pressure_boundary_condition", &LevelGridBuilder::setPressureBoundaryCondition)
         .def("set_periodic_boundary_condition", &LevelGridBuilder::setPeriodicBoundaryCondition)
         .def("set_no_slip_boundary_condition", &LevelGridBuilder::setNoSlipBoundaryCondition)
+        .def("set_precursor_boundary_condition", &LevelGridBuilder::setPrecursorBoundaryCondition)
         .def("set_stress_boundary_condition", &LevelGridBuilder::setStressBoundaryCondition);
 
         py::class_<MultipleGridBuilder, LevelGridBuilder, std::shared_ptr<MultipleGridBuilder>>(gridGeneratorModule, "MultipleGridBuilder")
diff --git a/pythonbindings/src/gpu/submodules/grid_provider.cpp b/pythonbindings/src/gpu/submodules/grid_provider.cpp
index 02ff273e2cd1a2022943e19c9a48a447d9dfe54b..bcfff5a15d3fa2306a3a1a3d1083a99f39e1d977 100644
--- a/pythonbindings/src/gpu/submodules/grid_provider.cpp
+++ b/pythonbindings/src/gpu/submodules/grid_provider.cpp
@@ -1,8 +1,5 @@
 #include <pybind11/pybind11.h>
 #include "gpu/VirtualFluids_GPU/DataStructureInitializer/GridProvider.h"
-// #include <gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h>
-// #include <gpu/VirtualFluids_GPU/Parameter/Parameter.h>
-// #include "gpu/GridGenerator/grid/GridBuilder/GridBuilder.h"
 
 namespace grid_provider
 {
diff --git a/pythonbindings/src/gpu/submodules/parameter.cpp b/pythonbindings/src/gpu/submodules/parameter.cpp
index 7b4e67f101e3928abbd4262557864ea1d0f45b02..ba3e1b7cf94dee503deca96f32024509bc13c7d8 100644
--- a/pythonbindings/src/gpu/submodules/parameter.cpp
+++ b/pythonbindings/src/gpu/submodules/parameter.cpp
@@ -13,42 +13,42 @@ namespace parameter
     {
         py::class_<Parameter, std::shared_ptr<Parameter>>(parentModule, "Parameter")
         .def(py::init<
-                const vf::basics::ConfigurationFile&, 
                 int,
-                int
-                >(),
-                "config_data",
+                int,
+                std::optional<const vf::basics::ConfigurationFile*>>(),
+                "number_of_processes",
+                "my_ID",
+                "config_data")
+        .def(py::init<int, int>(),
                 "number_of_processes",
                 "my_ID")
+        .def(py::init<const vf::basics::ConfigurationFile*>(), "config_data")
         .def("set_forcing", &Parameter::setForcing)
+        .def("set_quadric_limiters", &Parameter::setQuadricLimiters)
         .def("set_diff_on", &Parameter::setDiffOn)
         .def("set_comp_on", &Parameter::setCompOn)
         .def("set_max_level", &Parameter::setMaxLevel)
-        .def("set_t_end", &Parameter::setTEnd)
-        .def("set_t_out", &Parameter::setTOut)
-        .def("set_t_start_out", &Parameter::setTStartOut)
+        .def("set_timestep_end", &Parameter::setTimestepEnd)
+        .def("set_timestep_out", &Parameter::setTimestepOut)
+        .def("set_timestep_start_out", &Parameter::setTimestepStartOut)
         .def("set_timestep_of_coarse_level", &Parameter::setTimestepOfCoarseLevel)
+        .def("set_calc_turbulence_intensity", &Parameter::setCalcTurbulenceIntensity)
         .def("set_output_path", &Parameter::setOutputPath)
         .def("set_output_prefix", &Parameter::setOutputPrefix)
-        .def("set_f_name", &Parameter::setFName)
         .def("set_print_files", &Parameter::setPrintFiles)
         .def("set_temperature_init", &Parameter::setTemperatureInit)
         .def("set_temperature_BC", &Parameter::setTemperatureBC)
-        .def("set_viscosity", &Parameter::setViscosity)
-        .def("set_velocity", &Parameter::setVelocity)
+        .def("set_viscosity_LB", &Parameter::setViscosityLB)
+        .def("set_velocity_LB", &Parameter::setVelocityLB)
         .def("set_viscosity_ratio", &Parameter::setViscosityRatio)
         .def("set_velocity_ratio", &Parameter::setVelocityRatio)
         .def("set_density_ratio", &Parameter::setDensityRatio)
         .def("set_devices", &Parameter::setDevices)
         .def("set_is_body_force", &Parameter::setIsBodyForce)
-        .def("set_use_AMD", &Parameter::setUseAMD)
-        .def("set_use_Wale", &Parameter::setUseWale)
-        .def("set_SGS_constant", &Parameter::setSGSConstant)
         .def("set_main_kernel", &Parameter::setMainKernel)
         .def("set_AD_kernel", &Parameter::setADKernel)
-        .def("set_use_AMD", &Parameter::setUseAMD)
-        .def("set_use_Wale", &Parameter::setUseWale)
-        .def("set_SGS_constant", &Parameter::setSGSConstant)
+        .def("set_has_wall_monitor", &Parameter::setHasWallModelMonitor)
+        .def("set_outflow_pressure_correction_factor", &Parameter::setOutflowPressureCorrectionFactor)
         .def("set_initial_condition", [](Parameter &para, std::function<std::vector<float>(real, real, real)> &init_func)
         {
             para.setInitialCondition([init_func](real coordX, real coordY, real coordZ, real& rho, real& vx, real& vy, real& vz)
@@ -60,6 +60,46 @@ namespace parameter
                 vz = values[3];
             });
         })
+        // Uniform initial state: rho is set to c0o1 and the velocity to the given components everywhere.
+        .def("set_initial_condition_uniform", [](Parameter &para, real velocity_x, real velocity_y, real velocity_z)
+        {
+            para.setInitialCondition([velocity_x, velocity_y, velocity_z](real coordX, real coordY, real coordZ, real& rho, real& vx, real& vy, real& vz) // must capture values explicitly!
+            {
+                rho = c0o1;
+                vx = velocity_x;
+                vy = velocity_y;
+                vz = velocity_z;
+            });
+        })
+        // Log-law profile: vx = u_star/c4o10 * log(z/z0 + c1o1) / velocityRatio with z clamped from below at c0o1; vy and vz are c0o1.
+        .def("set_initial_condition_log_law", [](Parameter &para, real u_star, real z0, real velocityRatio)
+        {
+            para.setInitialCondition(
+                [u_star, z0, velocityRatio](real coordX, real coordY, real coordZ, real& rho, real& vx, real& vy, real& vz)
+                {
+                    coordZ = coordZ > c0o1 ? coordZ : c0o1;
+
+                    rho = c0o1;
+                    vx  = u_star/c4o10 * log(coordZ/z0+c1o1) / velocityRatio;
+                    vy = c0o1;
+                    vz = c0o1;
+                }
+            );
+        })
+        // Log-law vx profile with additional sine-shaped perturbation terms in all three velocity components.
+        .def("set_initial_condition_perturbed_log_law", [](Parameter &para, real u_star, real z0, real L_x, real L_z, real H, real velocityRatio)
+        {
+            para.setInitialCondition(
+                [u_star, z0, L_x, L_z, H, velocityRatio](real coordX, real coordY, real coordZ, real& rho, real& vx, real& vy, real& vz)
+                {
+                    coordZ = coordZ > c0o1 ? coordZ : c0o1;
+                    rho = c0o1;
+                    vx  = (u_star/c4o10 * log(coordZ/z0+c1o1) + c2o1*sin(cPi*c16o1*coordX/L_x)*sin(cPi*c8o1*coordZ/H)/(pow(coordZ/H,c2o1)+c1o1)) / velocityRatio;
+                    vy  = c2o1*sin(cPi*c16o1*coordX/L_x)*sin(cPi*c8o1*coordZ/H)/(pow(coordZ/H,c2o1)+c1o1) / velocityRatio;
+                    vz  = c8o1*u_star/c4o10*(sin(cPi*c8o1*coordY/H)*sin(cPi*c8o1*coordZ/H)+sin(cPi*c8o1*coordX/L_x))/(pow(c1o2*L_z-coordZ, c2o1)+c1o1) / velocityRatio;
+                }
+            );
+        })
         .def("add_actuator", &Parameter::addActuator)
         .def("add_probe", &Parameter::addProbe)
         .def("get_output_path", &Parameter::getOutputPath)
@@ -70,11 +110,10 @@ namespace parameter
         .def("get_viscosity_ratio", &Parameter::getViscosityRatio)
         .def("get_density_ratio", &Parameter::getDensityRatio)
         .def("get_force_ratio", &Parameter::getForceRatio)
-        .def("get_use_AMD", &Parameter::getUseAMD)
-        .def("get_use_Wale", &Parameter::getUseWale)
         .def("get_SGS_constant", &Parameter::getSGSConstant)
         .def("get_is_body_force", &Parameter::getIsBodyForce)
         .def("set_has_wall_model_monitor", &Parameter::setHasWallModelMonitor)
         ;
+
     }
 }
\ No newline at end of file
diff --git a/pythonbindings/src/gpu/submodules/precursor_writer.cpp b/pythonbindings/src/gpu/submodules/precursor_writer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0e45e65d4d81246d47a76cf19bc14c74ec17a4af
--- /dev/null
+++ b/pythonbindings/src/gpu/submodules/precursor_writer.cpp
@@ -0,0 +1,38 @@
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <pybind11/numpy.h>
+#include <gpu/VirtualFluids_GPU/PreCollisionInteractor/PreCollisionInteractor.h>
+#include <gpu/VirtualFluids_GPU/PreCollisionInteractor/PrecursorWriter.h>
+
+namespace precursor_writer
+{
+    namespace py = pybind11;
+
+    // Registers the OutputVariable enum and the PrecursorWriter class on parentModule.
+    void makeModule(py::module_ &parentModule)
+    {
+        py::enum_<OutputVariable>(parentModule, "OutputVariable")
+        .value("Velocities", OutputVariable::Velocities)
+        .value("Distributions", OutputVariable::Distributions);
+
+        py::class_<PrecursorWriter, PreCollisionInteractor, std::shared_ptr<PrecursorWriter>>(parentModule, "PrecursorWriter")
+        .def(py::init < std::string,
+                        std::string,
+                        real,
+                        real, real,
+                        real, real,
+                        uint, uint,
+                        OutputVariable,
+                        uint>(),
+                        // One label per constructor argument, in declaration order.
+                        // NOTE(review): pybind11 treats bare strings as docstrings; use py::arg(...) if keyword arguments are wanted.
+                        "filename",
+                        "output_path",
+                        "x_pos",
+                        "y_min", "y_max",
+                        "z_min", "z_max",
+                        "t_start_out", "t_save",
+                        "output_variable",
+                        "max_timesteps_per_file");
+    }
+}
\ No newline at end of file
diff --git a/pythonbindings/src/gpu/submodules/simulation.cpp b/pythonbindings/src/gpu/submodules/simulation.cpp
index b775d604ba41530223f22738c72785b2c15348b3..88716ebb38f765a94ae4c6c42e27eae4c93e1adb 100644
--- a/pythonbindings/src/gpu/submodules/simulation.cpp
+++ b/pythonbindings/src/gpu/submodules/simulation.cpp
@@ -8,6 +8,8 @@
 #include <gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h>
 #include <gpu/VirtualFluids_GPU/DataStructureInitializer/GridProvider.h>
 #include <gpu/VirtualFluids_GPU/Output/DataWriter.h>
+#include "gpu/VirtualFluids_GPU/BoundaryConditions/BoundaryConditionFactory.h"
+#include "gpu/VirtualFluids_GPU/TurbulenceModels/TurbulenceModelFactory.h"
 
 namespace simulation
 {
@@ -20,11 +22,25 @@ namespace simulation
         .def(py::init<  std::shared_ptr<Parameter>,
                         std::shared_ptr<CudaMemoryManager>,
                         vf::gpu::Communicator &,
-                        GridProvider &>(), 
+                        GridProvider &,
+                        BoundaryConditionFactory*>(), 
                         "parameter",
                         "memoryManager",
                         "communicator",
-                        "gridProvider")
+                        "gridProvider",
+                        "bcFactory")
+        .def(py::init<  std::shared_ptr<Parameter>,
+                        std::shared_ptr<CudaMemoryManager>,
+                        vf::gpu::Communicator &,
+                        GridProvider &,
+                        BoundaryConditionFactory*,
+                        std::shared_ptr<TurbulenceModelFactory>>(), 
+                        "parameter",
+                        "memoryManager",
+                        "communicator",
+                        "gridProvider",
+                        "bcFactory",
+                        "tmFactory")
         .def("run", &Simulation::run)
         .def("addKineticEnergyAnalyzer", &Simulation::addKineticEnergyAnalyzer)
         .def("addEnstrophyAnalyzer", &Simulation::addEnstrophyAnalyzer);
diff --git a/pythonbindings/src/gpu/submodules/turbulence_models.cpp b/pythonbindings/src/gpu/submodules/turbulence_models.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..be9173c0ec206cecb9f602276d9234a2e9064372
--- /dev/null
+++ b/pythonbindings/src/gpu/submodules/turbulence_models.cpp
@@ -0,0 +1,27 @@
+#include "pybind11/pybind11.h"
+#include "gpu/VirtualFluids_GPU/TurbulenceModels/TurbulenceModelFactory.h"
+#include "gpu/VirtualFluids_GPU/LBM/LB.h"
+
+namespace turbulence_model
+{
+    namespace py = pybind11;
+
+    // Registers the TurbulenceModel enum and the TurbulenceModelFactory class on parentModule.
+    void makeModule(py::module_ &parentModule)
+    {
+        // NOTE(review): "None" is a Python keyword, so this member is not reachable as
+        // TurbulenceModel.None from Python code; getattr(TurbulenceModel, "None") is needed.
+        py::enum_<TurbulenceModel> modelEnum(parentModule, "TurbulenceModel");
+        modelEnum.value("Smagorinsky", TurbulenceModel::Smagorinsky);
+        modelEnum.value("AMD", TurbulenceModel::AMD);
+        modelEnum.value("QR", TurbulenceModel::QR);
+        modelEnum.value("None", TurbulenceModel::None);
+
+        using FactoryPtr = std::shared_ptr<TurbulenceModelFactory>;
+        py::class_<TurbulenceModelFactory, FactoryPtr> factoryClass(parentModule, "TurbulenceModelFactory");
+        factoryClass.def(py::init<std::shared_ptr<Parameter>>(), "para");
+        factoryClass.def("set_turbulence_model", &TurbulenceModelFactory::setTurbulenceModel);
+        factoryClass.def("set_model_constant", &TurbulenceModelFactory::setModelConstant);
+        factoryClass.def("read_config_file", &TurbulenceModelFactory::readConfigFile);
+    }
+}
\ No newline at end of file
diff --git a/pythonbindings/src/gpu/submodules/velocity_setter.cpp b/pythonbindings/src/gpu/submodules/velocity_setter.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..17114961b5b01e8a6e52144e3a89307708cc5a0b
--- /dev/null
+++ b/pythonbindings/src/gpu/submodules/velocity_setter.cpp
@@ -0,0 +1,24 @@
+#include <pybind11/pybind11.h>
+#include <gpu/GridGenerator/VelocitySetter/VelocitySetter.h>
+
+namespace velocity_setter
+{
+    namespace py = pybind11;
+
+    // Registers FileType, create_file_collection and the file collection classes on parentModule.
+    void makeModule(py::module_ &parentModule)
+    {
+        py::enum_<FileType> fileTypeEnum(parentModule, "FileType");
+        fileTypeEnum.value("VTK", FileType::VTK);
+
+        parentModule.def("create_file_collection", &createFileCollection);
+
+        // Base class is registered without methods; VTKFileCollection derives from it.
+        using BasePtr = std::shared_ptr<VelocityFileCollection>;
+        py::class_<VelocityFileCollection, BasePtr> baseClass(parentModule, "VelocityFileCollection");
+
+        using VtkPtr = std::shared_ptr<VTKFileCollection>;
+        py::class_<VTKFileCollection, VelocityFileCollection, VtkPtr> vtkClass(parentModule, "VTKFileCollection");
+        vtkClass.def(py::init<std::string>(), "prefix");
+    }
+}
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..cf060397a1b4e9b38d16683d0ff98ee9532e4a7e
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,21 @@
+[metadata]
+name = pyfluids
+description = Python binding for VirtualFluids
+long_description = file: README.md
+long_description_content_type = text/markdown
+platforms = any
+url = https://git.rz.tu-bs.de/irmb/virtualfluids
+version = 0.0.1
+
+[options]
+zip_safe = False
+packages = find:
+package_dir =
+    =pythonbindings
+python_requires = >=3.6
+install_requires =
+    cmake-build-extension
+
+[options.packages.find]
+where = pythonbindings
+
diff --git a/setup.py b/setup.py
index b26e1c13d09447d17f8e9fd6e2cd0d0671595bf3..b5cd97056bd4696df8cfd1df1794f3689c23376f 100644
--- a/setup.py
+++ b/setup.py
@@ -1,137 +1,54 @@
-import os
-import re
+import inspect
 import sys
-import platform
-import subprocess
+from pathlib import Path
 
-from setuptools import setup, Extension
-from setuptools.command.build_ext import build_ext
-from setuptools.command.install import install
-from setuptools.command.develop import develop
-from distutils.version import LooseVersion
+import cmake_build_extension
+import setuptools
 
 """
 Install python wrapper of virtual fluids
-Install GPU backend with option --GPU
-(pass to pip via --install-option="--GPU")
+install via python:
+    python setup.py install build_ext
+    set CMAKE Flags via -DBUILD_VF_GPU:BOOL=1
+or install via pip:
+    pip install -e .
+    set CMAKE Flags via --configure-settings -DBUILD_VF_GPU=1
 """
 
-vf_cmake_args = [
-    "-DBUILD_VF_PYTHON_BINDINGS=ON",
-    "-DCMAKE_CXX_COMPILER_LAUNCHER=ccache",
-    "-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache",
-    "-DCMAKE_C_COMPILER_LAUNCHER=ccache",
-    "-DBUILD_SHARED_LIBS=OFF",
-    "-DBUILD_WARNINGS_AS_ERRORS=OFF"
-]
-
-vf_cpu_cmake_args = [
-    "-DBUILD_VF_DOUBLE_ACCURACY=ON",
-    "-DBUILD_VF_CPU:BOOL=ON",
-    "-DBUILD_VF_UNIT_TESTS:BOOL=ON",
-    "-DUSE_METIS=ON",
-    "-DUSE_MPI=ON"
-]
-
-vf_gpu_cmake_args = [
-    "-DBUILD_VF_DOUBLE_ACCURACY=OFF",
-    "-DBUILD_VF_GPU:BOOL=ON",
-    "-DBUILD_VF_UNIT_TESTS:BOOL=OFF",
-]
-
-GPU = False
-
-class CommandMixin:
-    user_options = [
-        ('GPU', None, 'compile pyfluids with GPU backend'),
-    ]
-
-    def initialize_options(self):
-        super().initialize_options()
-        self.GPU = False
-
-    def finalize_options(self):
-        super().finalize_options()
-
-    def run(self):
-        global GPU
-        GPU = GPU or self.GPU
-        super().run()
-
-
-class InstallCommand(CommandMixin, install):
-    user_options = getattr(install, 'user_options', []) + CommandMixin.user_options
-
-
-class DevelopCommand(CommandMixin, develop):
-    user_options = getattr(develop, 'user_options', []) + CommandMixin.user_options
-
-
-class CMakeExtension(Extension):
-    def __init__(self, name, sourcedir=''):
-        Extension.__init__(self, name, sources=[])
-        self.sourcedir = os.path.abspath(sourcedir)
-
-
-class CMakeBuild(CommandMixin, build_ext):
-    user_options = getattr(build_ext, 'user_options', []) + CommandMixin.user_options
-
-    def run(self):
-        super().run()
-        try:
-            out = subprocess.check_output(['cmake', '--version'])
-        except OSError:
-            raise RuntimeError("CMake must be installed to build the following extensions: " +
-                               ", ".join(e.name for e in self.extensions))
-
-        if platform.system() == "Windows":
-            cmake_version = LooseVersion(re.search(r'version\s*([\d.]+)', out.decode()).group(1))
-            if cmake_version < '3.1.0':
-                raise RuntimeError("CMake >= 3.1.0 is required on Windows")
-
-        for ext in self.extensions:
-            self.build_extension(ext)
-
-    def build_extension(self, ext):
-        extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name)))
-        # required for auto-detection of auxiliary "native" libs
-        if not extdir.endswith(os.path.sep):
-            extdir += os.path.sep
-
-        cmake_args = ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir,
-                      '-DPYTHON_EXECUTABLE=' + sys.executable]
-
-        cfg = 'Debug' if self.debug else 'Release'
-        build_args = ['--config', cfg]
-
-        if platform.system() == "Windows":
-            cmake_args += ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}'.format(cfg.upper(), extdir)]
-            if sys.maxsize > 2**32:
-                cmake_args += ['-A', 'x64']
-            build_args += ['--', '/m']
-        else:
-            cmake_args += ['-DCMAKE_BUILD_TYPE=' + cfg]
-            build_args += ['--', '-j2']
-
-        cmake_args.extend(vf_cmake_args)
-        cmake_args.extend(vf_gpu_cmake_args if GPU else vf_cpu_cmake_args)
-
-        env = os.environ.copy()
-        env['CXXFLAGS'] = '{} -DVERSION_INFO=\\"{}\\"'.format(env.get('CXXFLAGS', ''),
-                                                              self.distribution.get_version())
-        if not os.path.exists(self.build_temp):
-            os.makedirs(self.build_temp)
-        cmake_cache_file = self.build_temp+"/CMakeCache.txt"
-        if os.path.exists(cmake_cache_file):
-            os.remove(cmake_cache_file)
-        subprocess.check_call(['cmake', ext.sourcedir] + cmake_args, cwd=self.build_temp, env=env)
-        subprocess.check_call(['cmake', '--build', '.'] + build_args, cwd=self.build_temp)
-
+init_py = inspect.cleandoc(
+    """
+    import cmake_build_extension
+    with cmake_build_extension.build_extension_env():
+        from .bindings import *
+    """
+)
 
-setup(
-    name='pyfluids',
-    version='0.0.1',
-    ext_modules=[CMakeExtension('pyfluids')],
-    cmdclass={"install": InstallCommand, "develop": DevelopCommand, "build_ext": CMakeBuild},
-    zip_safe=False,
+# Extra CMake options are read from an optional ``cmake_args`` mapping in this namespace.
+# NOTE(review): nothing in this file defines ``cmake_args``; confirm who is meant to inject it.
+extra_args = [f"{key}={value}" for key, value in locals().get("cmake_args", {}).items()]
+
+# Build the pyfluids extension through CMake using cmake-build-extension.
+setuptools.setup(
+    ext_modules=[
+        cmake_build_extension.CMakeExtension(
+            name="pyfluids",
+            install_prefix="pyfluids",
+            write_top_level_init=init_py,
+            source_dir=str(Path(__file__).parent.absolute()),
+            cmake_configure_options=[
+                f"-DPython3_ROOT_DIR={Path(sys.prefix)}",
+                "-DCALL_FROM_SETUP_PY:BOOL=ON",
+                "-DBUILD_VF_PYTHON_BINDINGS=ON",
+                "-DCMAKE_CXX_COMPILER_LAUNCHER=ccache",
+                "-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache",
+                "-DCMAKE_C_COMPILER_LAUNCHER=ccache",
+                "-DBUILD_SHARED_LIBS=OFF",
+                "-DBUILD_VF_DOUBLE_ACCURACY=OFF",
+                "-DBUILD_VF_UNIT_TESTS:BOOL=OFF",
+                "-DBUILD_WARNINGS_AS_ERRORS=OFF",
+                *extra_args,
+            ],
+        )
+    ],
+    cmdclass={"build_ext": cmake_build_extension.BuildExtension},
 )
diff --git a/src/basics/basics/utilities/UbTuple.h b/src/basics/basics/utilities/UbTuple.h
index fe9c787cead38621beafab3d082122277bdcff73..228ab48898e5e61777d2fcc0061eb6f0434d5cad 100644
--- a/src/basics/basics/utilities/UbTuple.h
+++ b/src/basics/basics/utilities/UbTuple.h
@@ -597,6 +597,8 @@ inline UbTuple<T1, T2, T3, T4, T5, T6, T7, T8> makeUbTuple(T1 const &a1, T2 cons
 // some typedefs
 using UbTupleFloat2        = UbTuple<float, float>;
 using UbTupleFloat3        = UbTuple<float, float, float>;
+using UbTupleFloat4        = UbTuple<float, float, float, float>;
+using UbTupleFloat6        = UbTuple<float, float, float,float, float, float>;
 using UbTupleInt2          = UbTuple<int, int>;
 using UbTupleInt3          = UbTuple<int, int, int>;
 using UbTupleInt4          = UbTuple<int, int, int, int>;
diff --git a/src/basics/basics/writer/WbWriterVtkXmlImageBinary.cpp b/src/basics/basics/writer/WbWriterVtkXmlImageBinary.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3d77af8747fbdae366c372749be6014033797501
--- /dev/null
+++ b/src/basics/basics/writer/WbWriterVtkXmlImageBinary.cpp
@@ -0,0 +1,360 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file WbWriterVtkXmlImageBinary.cpp
+//! \ingroup writer
+//! \author Soeren Freudiger, Sebastian Geller
+//=======================================================================================
+#include <basics/utilities/UbLogger.h>
+#include <basics/writer/WbWriterVtkXmlImageBinary.h>
+#include <cstring>
+
+using namespace std;
+
+/*===============================================================================*/
+const std::string WbWriterVtkXmlImageBinary::pvdEndTag = "   </Collection>\n</VTKFile>";
+/*===============================================================================*/
+string WbWriterVtkXmlImageBinary::writeCollection(const string &filename, const vector<string> &filenames,
+                                             const double &timeStep, const bool &sepGroups)
+{
+    // Writes "<filename>.pvd", a VTK "Collection" file with one <DataSet> entry per element of
+    // filenames, and returns the name of the written file. Throws UbException if it cannot be opened.
+    const string pvdFileName = filename + ".pvd";
+    ofstream pvd(pvdFileName.c_str());
+    if (!pvd) {
+        pvd.clear(); // reset the state flags (otherwise the if(!pvd) below would still report failure)
+        const string directory = UbSystem::getPathFromString(pvdFileName);
+        if (directory.size() > 0) {
+            // first attempt failed: create the directory and try to open the file once more
+            UbSystem::makeDirectory(directory);
+            pvd.open(pvdFileName.c_str());
+        }
+        if (!pvd)
+            throw UbException(UB_EXARGS, "couldn't open file " + pvdFileName);
+    }
+
+    const string byteOrder = UbSystem::isLittleEndian() ? "LittleEndian" : "BigEndian";
+    pvd << "<VTKFile type=\"Collection\" version=\"0.1\" byte_order=\"" << byteOrder << "\" >" << endl;
+    pvd << "   <Collection>" << endl;
+
+    // sepGroups == true : every file gets its own group number, part stays 0
+    // sepGroups == false: all files share group 0 and get consecutive part numbers
+    int groupId = 0;
+    int partId  = 0;
+    for (size_t f = 0; f < filenames.size(); ++f) {
+        pvd << "       <DataSet timestep=\"" << timeStep << "\" group=\"" << groupId << "\" part=\"" << partId
+            << "\" file=\"" << filenames[f] << "\"/>" << endl;
+        int &counter = sepGroups ? groupId : partId;
+        ++counter;
+    }
+    pvd << pvdEndTag;
+    pvd.close();
+
+    return pvdFileName;
+}
+/*===============================================================================*/
+string WbWriterVtkXmlImageBinary::addFilesToCollection(const string &filename, const vector<string> &filenames,
+                                                  const double &timeStep, const bool &sepGroups)
+{
+    // Appends one <DataSet> entry per element of filenames to an existing collection file, trying
+    // "filename" first and "filename.pvd" second; if neither can be read, writeCollection() is used.
+    string collectionFile = filename;
+    fstream probe(collectionFile.c_str(), ios::in);
+    if (!probe) {
+        probe.clear();
+        collectionFile += ".pvd";
+        probe.open(collectionFile.c_str(), ios::in);
+        if (!probe)
+            return this->writeCollection(filename, filenames, timeStep, sepGroups);
+    }
+
+    // step back over the closing tags (plus one more character) so that the new entries overwrite them
+    fstream collection(collectionFile.c_str(), ios::in | ios::out);
+    collection.seekp(-(int)pvdEndTag.size() - 1, ios_base::end);
+    int groupId = 0;
+    for (size_t part = 0; part < filenames.size(); ++part) {
+        collection << "       <DataSet timestep=\"" << timeStep << "\" group=\"" << groupId << "\" part=\"" << part << "\" file=\""
+                   << filenames[part] << "\"/>" << endl;
+        groupId += sepGroups ? 1 : 0; // the group number only advances when sepGroups is set
+    }
+    collection << pvdEndTag; // terminate the document again
+    return collectionFile;
+}
+/*===============================================================================*/
+string WbWriterVtkXmlImageBinary::writeParallelFile(const string &filename, const UbTupleInt6 &wholeExtent, const UbTupleFloat3 &origin, const UbTupleFloat3 &spacing, 
+                                                vector<string> &pieceSources, vector<UbTupleInt6> &pieceExtents,
+                                                vector<string> &pointDataNames, vector<string> &cellDataNames)
+{   // Writes the meta file "<filename>.pvti" (type PImageData) that lists the given pieces, and returns its name.
+    string vtkfilename = filename + ".pvti";
+    UBLOG(logDEBUG1, "WbWriterVtkXmlImageBinary::writeParallelFile to " << vtkfilename << " - start");
+
+    ofstream out(vtkfilename.c_str());
+    if (!out) {
+        out.clear(); // reset the state flags (otherwise the if(!out) below would still report failure)
+        string path = UbSystem::getPathFromString(vtkfilename);
+        if (path.size() > 0) {
+            UbSystem::makeDirectory(path); // create the directory, then retry opening the file once
+            out.open(vtkfilename.c_str());
+        }
+        if (!out)
+            throw UbException(UB_EXARGS, "couldn't open file " + vtkfilename);
+    }
+
+    // VTK FILE (the byte order is written as LittleEndian unconditionally)
+    out << "<VTKFile type=\"PImageData\" version=\"0.1\" byte_order=\"LittleEndian\">"
+        << "\n";
+    out << "  <PImageData "
+            << "WholeExtent=\"" << val<1>(wholeExtent) << " "
+                                << val<2>(wholeExtent) << " " 
+                                << val<3>(wholeExtent) << " " 
+                                << val<4>(wholeExtent) << " " 
+                                << val<5>(wholeExtent) << " "
+                                << val<6>(wholeExtent) << "\" "
+            << "GhostLevel=\"0\" "
+            << "Origin=\""  << val<1>(origin) << " "
+                            << val<2>(origin) << " "
+                            << val<3>(origin) << "\" "
+            << "Spacing=\"" << val<1>(spacing) << " "
+                            << val<2>(spacing) << " "
+                            << val<3>(spacing) << "\" "
+        << "> \n";
+    out << "    <PPointData>\n"; // this section is written even if pointDataNames is empty
+    for (size_t s = 0; s < pointDataNames.size(); s++)
+        out << "      <PDataArray type=\"Float32\" Name=\"" << pointDataNames[s] << "\"/>\n"; // NOTE(review): Float32 here, but writeData() in this file declares Float64 arrays - confirm which is intended
+    out << "    </PPointData>\n";
+    if (cellDataNames.size() > 0) { // unlike the point section, the cell section is skipped when there are no names
+        out << "    <PCellData>\n";
+        for (size_t s = 0; s < cellDataNames.size(); s++)
+            out << "      <PDataArray type=\"Float32\" Name=\"" << cellDataNames[s] << "\"/>\n";
+        out << "    </PCellData>\n";
+    }
+    for (size_t s = 0; s < pieceSources.size(); s++) // one <Piece> per source; pieceExtents[s] is read for every such s
+        out << "    <Piece Extent=\""   << val<1>(pieceExtents[s]) << " " 
+                                        << val<2>(pieceExtents[s]) << " " 
+                                        << val<3>(pieceExtents[s]) << " " 
+                                        << val<4>(pieceExtents[s]) << " " 
+                                        << val<5>(pieceExtents[s]) << " "
+                                        << val<6>(pieceExtents[s]) << "\" Source=\"" << pieceSources[s] << "\"/>\n";
+    out << "  </PImageData>\n";
+    out << "</VTKFile>";
+    out << endl;
+    out.close();
+    UBLOG(logDEBUG1, "WbWriterVtkXmlImageBinary::writeParallelFile to " << vtkfilename << " - end");
+
+    return vtkfilename;
+}
+/*===============================================================================*/
+string WbWriterVtkXmlImageBinary::writeOctsWithCellData(const string &filename, vector<UbTupleFloat3> &nodes,
+                                                   vector<UbTupleInt8> &cells, vector<string> &datanames,
+                                                   vector<vector<double>> &celldata)
+{
+    // Writes celldata as cell arrays of one image-data file named filename + getFileExtension().
+    // Origin, spacing and extent are derived from nodes; cells is not used.
+    const string outputFile = filename + getFileExtension();
+    UBLOG(logDEBUG1, "WbWriterVtkXmlImageBinary::writeOctsWithCellData to " << outputFile << " - start");
+
+    UbTupleFloat3 imageOrigin, imageSpacing;
+    UbTupleInt6 imageExtent;
+    getMetaDataOfImage(nodes, imageOrigin, imageSpacing, imageExtent);
+
+    // this variant has no point data: hand over empty name and value lists
+    vector<string> noPointNames;
+    vector<vector<double>> noPointData;
+    this->writeData(outputFile, noPointNames, datanames, noPointData, celldata, imageExtent, imageOrigin, imageSpacing, imageExtent);
+    UBLOG(logDEBUG1, "WbWriterVtkXmlImageBinary::writeOctsWithCellData to " << outputFile << " - end");
+    return outputFile;
+}
+/*===============================================================================*/
+string WbWriterVtkXmlImageBinary::writeOctsWithNodeData(const string &filename, vector<UbTupleFloat3> &nodes,
+                                                   vector<UbTupleUInt8> &cells, vector<string> &datanames,
+                                                   vector<vector<double>> &nodedata)
+{
+    // Writes nodedata as point arrays of one image-data file named filename + getFileExtension().
+    // Origin, spacing and extent are derived from nodes; cells is not used.
+    const string outputFile = filename + getFileExtension();
+    UBLOG(logDEBUG1, "WbWriterVtkXmlImageBinary::writeOctsWithNodeData to " << outputFile << " - start");
+
+    UbTupleFloat3 imageOrigin, imageSpacing;
+    UbTupleInt6 imageExtent;
+    getMetaDataOfImage(nodes, imageOrigin, imageSpacing, imageExtent);
+
+    // this variant has no cell data: hand over empty name and value lists
+    vector<string> noCellNames;
+    vector<vector<double>> noCellData;
+    this->writeData(outputFile, datanames, noCellNames, nodedata, noCellData, imageExtent, imageOrigin, imageSpacing, imageExtent);
+
+    UBLOG(logDEBUG1, "WbWriterVtkXmlImageBinary::writeOctsWithNodeData to " << outputFile << " - end");
+    return outputFile;
+}
+/*===============================================================================*/
+string WbWriterVtkXmlImageBinary::writeNodesWithNodeData(const string &filename, vector<UbTupleFloat3> &nodes,
+                                                    vector<string> &datanames,
+                                                    vector<vector<double>> &nodedata)
+{
+    // Writes nodedata as point arrays of one image-data file named filename + getFileExtension().
+    // Origin, spacing and extent are derived from nodes.
+    const string outputFile = filename + getFileExtension();
+    UBLOG(logDEBUG1, "WbWriterVtkXmlImageBinary::writeNodesWithNodeData to " << outputFile << " - start");
+
+    UbTupleFloat3 imageOrigin, imageSpacing;
+    UbTupleInt6 imageExtent;
+    getMetaDataOfImage(nodes, imageOrigin, imageSpacing, imageExtent);
+
+    vector<string> noCellNames;       // no cell data in this variant
+    vector<vector<double>> noCellData;
+    this->writeData(outputFile, datanames, noCellNames, nodedata, noCellData, imageExtent, imageOrigin, imageSpacing, imageExtent);
+    return outputFile;
+}
+
+void WbWriterVtkXmlImageBinary::getMetaDataOfImage(vector<UbTupleFloat3> &nodes, UbTupleFloat3& origin, UbTupleFloat3& spacing, UbTupleInt6& extent)
+{
+    // Derives origin (= nodes[0]), spacing and index extent from the nodes of a regular grid. The nodes are assumed to be
+    // stored x-fastest, then y, then z (the x spacing is nodes[1]-nodes[0]). A row holds (intervals + 1) nodes, so the second
+    // row starts at nodes[nodesX] and the second layer at nodes[nodesXY]; interval counts are rounded, spacings must be non-zero.
+    if (nodes.size() < 2)
+        throw UbException(UB_EXARGS, "at least two nodes are needed to derive the image meta data");
+    const UbTupleFloat3 &first = nodes.front(), &last = nodes.back();
+    val<1>(origin) = val<1>(first); val<2>(origin) = val<2>(first); val<3>(origin) = val<3>(first);
+    val<1>(spacing) = val<1>(nodes[1]) - val<1>(first);
+    const size_t nodesX = (size_t)((val<1>(last) - val<1>(first)) / val<1>(spacing) + 0.5f) + 1;
+    if (nodesX >= nodes.size()) throw UbException(UB_EXARGS, "nodes do not contain a second row");
+    val<2>(spacing) = val<2>(nodes[nodesX]) - val<2>(first);
+    const size_t nodesXY = nodesX * ((size_t)((val<2>(last) - val<2>(first)) / val<2>(spacing) + 0.5f) + 1);
+    if (nodesXY >= nodes.size()) throw UbException(UB_EXARGS, "nodes do not contain a second layer");
+    val<3>(spacing) = val<3>(nodes[nodesXY]) - val<3>(first);
+
+    val<1>(extent) = val<1>(origin)/val<1>(spacing); val<2>(extent) = val<1>(last)/val<1>(spacing); // NOTE(review): indices are coordinate/spacing,
+    val<3>(extent) = val<2>(origin)/val<2>(spacing); val<4>(extent) = val<2>(last)/val<2>(spacing); // truncated toward zero on conversion to int
+    val<5>(extent) = val<3>(origin)/val<3>(spacing); val<6>(extent) = val<3>(last)/val<3>(spacing);
+}
+
+void WbWriterVtkXmlImageBinary::writeData(const string &vtkfilename,
+                                            vector<string> &pointDataNames, vector<string> &cellDataNames,
+                                            vector<vector<double>> &nodedata, vector<vector<double>> &celldata,
+                                            UbTupleInt6& wholeExtent,
+                                            UbTupleFloat3& origin, UbTupleFloat3& spacing, UbTupleInt6& extent)
+{
+    ofstream out(vtkfilename.c_str(), ios::out | ios::binary);
+    if (!out) {
+        out.clear(); // reset the state flags (otherwise the if(!out) below would still report failure)
+        string path = UbSystem::getPathFromString(vtkfilename);
+        if (path.size() > 0) {
+            UbSystem::makeDirectory(path);
+            out.open(vtkfilename.c_str(), ios::out | ios::binary);
+        }
+        if (!out)
+            throw UbException(UB_EXARGS, "couldn't open file " + vtkfilename);
+    }
+    // Writes one ImageData file: the XML header first, then all point and cell arrays as raw appended data.
+    // The byte count of every array is taken from the array itself, so arrays of different lengths (e.g. point
+    // vs. cell data) get correct offsets and length fields, and empty input is never dereferenced.
+    int bytesPerByteVal      = 4; //==sizeof(int)
+    if (nodedata.size() < pointDataNames.size() || celldata.size() < cellDataNames.size())
+        throw UbException(UB_EXARGS, "fewer data arrays than data names for file " + vtkfilename);
+
+    int offset = 0;
+
+    // VTK FILE
+    out << "<?xml version=\"1.0\"?>\n";
+    out << "<VTKFile type=\"ImageData\" version=\"0.1\" byte_order=\"LittleEndian\" >"
+        << "\n";
+    out << "   <ImageData "
+            << "WholeExtent=\"" << val<1>(wholeExtent) << " " 
+                                << val<2>(wholeExtent) << " " 
+                                << val<3>(wholeExtent) << " " 
+                                << val<4>(wholeExtent) << " " 
+                                << val<5>(wholeExtent) << " "
+                                << val<6>(wholeExtent) << "\" "
+            << "Origin=\""  << val<1>(origin) << " " 
+                            << val<2>(origin) << " "
+                            << val<3>(origin) << "\" "
+            << "Spacing=\"" << val<1>(spacing) << " " 
+                            << val<2>(spacing) << " " 
+                            << val<3>(spacing) << "\""
+        << "> \n";
+    out << "      <Piece Extent=\"" << val<1>(extent) << " " 
+                                    << val<2>(extent) << " " 
+                                    << val<3>(extent) << " " 
+                                    << val<4>(extent) << " " 
+                                    << val<5>(extent) << " "
+                                    << val<6>(extent) << "\">\n";
+
+    // DATA SECTION
+    if (pointDataNames.size()>0)
+    {
+        out << "         <PointData>\n";
+        for (size_t s = 0; s < pointDataNames.size(); ++s) {
+            out << "            <DataArray type=\"Float64\" Name=\"" << pointDataNames[s] << "\" format=\"appended\" offset=\""
+                << offset << "\" /> \n";
+            offset += bytesPerByteVal + (int)(nodedata[s].size() * sizeof(double));
+        }
+        out << "         </PointData>\n";
+    }
+
+    if (cellDataNames.size()>0)
+    {
+        out << "         <CellData>\n";
+        for (size_t s = 0; s < cellDataNames.size(); ++s) {
+            out << "            <DataArray type=\"Float64\" Name=\"" << cellDataNames[s] << "\" format=\"appended\" offset=\""
+                << offset << "\" /> \n";
+            offset += bytesPerByteVal + (int)(celldata[s].size() * sizeof(double));
+        }
+        out << "         </CellData>\n";
+    }
+
+    out << "      </Piece>\n";
+    out << "   </ImageData>\n";
+
+    // AppendedData SECTION
+    out << "   <AppendedData encoding=\"raw\">\n";
+    out << "_";
+
+
+    // DATA SECTION
+    // Every array is stored as a 4-byte length field followed by its values as raw doubles; the
+    // length comes from the array itself and therefore always matches the offsets written above.
+
+    // pointData
+    for (size_t s = 0; s < pointDataNames.size(); ++s) {
+        // length field first, then all values of the array in a single write
+        const int bytesOfArray = (int)(nodedata[s].size() * sizeof(double));
+        out.write((const char *)&bytesOfArray, bytesPerByteVal);
+        out.write((const char *)nodedata[s].data(), bytesOfArray);
+    }
+
+    // cellData
+    for (size_t s = 0; s < cellDataNames.size(); ++s) {
+        const int bytesOfArray = (int)(celldata[s].size() * sizeof(double));
+        out.write((const char *)&bytesOfArray, bytesPerByteVal);
+        out.write((const char *)celldata[s].data(), bytesOfArray);
+    }
+    out << "\n   </AppendedData>\n";
+    out << "</VTKFile>";
+    out << endl;
+    out.close();
+}
\ No newline at end of file
diff --git a/src/basics/basics/writer/WbWriterVtkXmlImageBinary.h b/src/basics/basics/writer/WbWriterVtkXmlImageBinary.h
new file mode 100644
index 0000000000000000000000000000000000000000..a45b51143accccf47147483dc0034e3ad77ca33d
--- /dev/null
+++ b/src/basics/basics/writer/WbWriterVtkXmlImageBinary.h
@@ -0,0 +1,110 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file WbWriterVtkXmlImageBinary.h
+//! \ingroup writer
+//! \author Soeren Freudiger, Sebastian Geller
+//=======================================================================================
+#ifndef WBWRITERVTKXMLIMAGEBINARY_H
+#define WBWRITERVTKXMLIMAGEBINARY_H
+
+#include <string>
+
+#include <basics/writer/WbWriter.h>
+
+#include "basics_export.h"
+
+class BASICS_EXPORT WbWriterVtkXmlImageBinary : public WbWriter // WbWriter for image-data files with the extension ".bin.vti"
+{
+public:
+    static WbWriterVtkXmlImageBinary *getInstance() // access point: returns the address of a function-local static instance
+    {
+        static WbWriterVtkXmlImageBinary instance;
+        return &instance;
+    }
+
+    WbWriterVtkXmlImageBinary(const WbWriterVtkXmlImageBinary &) = delete; // not copyable
+    const WbWriterVtkXmlImageBinary &operator=(const WbWriterVtkXmlImageBinary &) = delete; // not assignable
+
+private:
+    WbWriterVtkXmlImageBinary() : WbWriter() // private: instances are only created inside getInstance()
+    {
+        if (sizeof(unsigned char) != 1) // the checks below throw UbException if a basic type has an unexpected size
+            throw UbException(UB_EXARGS, "machine error char  type mismatch");
+        if (sizeof(int) != 4)
+            throw UbException(UB_EXARGS, "machine error int   type mismatch");
+        if (sizeof(float) != 4)
+            throw UbException(UB_EXARGS, "machine error float type mismatch");
+    }
+
+    static const std::string pvdEndTag; // closing tags of a .pvd collection file (value is defined in the .cpp)
+
+public:
+    std::string getFileExtension() override { return ".bin.vti"; }
+
+    // write a metafile (.pvd collection or .pvti parallel file); each function returns the name of the file it wrote
+    std::string writeCollection(const std::string &filename, const std::vector<std::string> &filenames,
+                                const double &timestep, const bool &sepGroups);
+    std::string addFilesToCollection(const std::string &filename, const std::vector<std::string> &filenames,
+                                     const double &timestep, const bool &sepGroups);
+    std::string writeParallelFile(const std::string &filename, const UbTupleInt6 &wholeExtent, const UbTupleFloat3 &origin, const UbTupleFloat3 &spacing, 
+                                std::vector<std::string> &pieceSources, std::vector<UbTupleInt6> &pieceExtents,
+                                std::vector<std::string> &pointDataNames, std::vector<std::string> &cellDataNames);
+
+    //////////////////////////////////////////////////////////////////////////
+    // nodes
+    std::string writeNodesWithNodeData(const std::string &filename, std::vector<UbTupleFloat3> &nodes,
+                                       std::vector<std::string> &datanames,
+                                       std::vector<std::vector<double>> &nodedata) override;
+
+    //////////////////////////////////////////////////////////////////////////
+    // octs
+    //     7 ---- 6
+    //    /|     /|
+    //   4 +--- 5 |
+    //   | |    | |
+    //   | 3 ---+ 2
+    //   |/     |/
+    //   0 ---- 1
+    std::string writeOctsWithCellData(const std::string &filename, std::vector<UbTupleFloat3> &nodes,
+                                      std::vector<UbTupleInt8> &cells, std::vector<std::string> &datanames,
+                                      std::vector<std::vector<double>> &celldata) override;
+    std::string writeOctsWithNodeData(const std::string &filename, std::vector<UbTupleFloat3> &nodes,
+                                      std::vector<UbTupleUInt8> &cells, std::vector<std::string> &datanames,
+                                      std::vector<std::vector<double>> &nodedata) override;
+    void writeData(const std::string &vtkfilename, // writes header and appended raw arrays of one image-data file
+                                            std::vector<std::string> &pointDataNames, std::vector<std::string> &cellDataNames,
+                                            std::vector<std::vector<double>> &nodedata, std::vector<std::vector<double>> &celldata, 
+                                            UbTupleInt6 &wholeExtent,
+                                            UbTupleFloat3 &origin, UbTupleFloat3 &spacing, UbTupleInt6 &extent);
+
+private:
+    void getMetaDataOfImage(std::vector<UbTupleFloat3> &nodes, UbTupleFloat3& origin, UbTupleFloat3& spacing, UbTupleInt6& extent); // fills origin, spacing and extent from nodes
+};
+
+#endif // WBWRITERVTKXMLIMAGEBINARY_H
diff --git a/src/basics/config/ConfigurationFile.h b/src/basics/config/ConfigurationFile.h
index ef7e7c9f06f94cabb3ba9cbefe95c8ee75736958..4a53f7add85b9c6461fda0bab20fa6656eebc5d3 100644
--- a/src/basics/config/ConfigurationFile.h
+++ b/src/basics/config/ConfigurationFile.h
@@ -64,6 +64,10 @@ public:
    template<class T>
    T getValue(const std::string& key) const;
 
+   //! get value with key and default value
+   template<class T>
+   T getValue(const std::string& key, T defaultValue) const;
+
 private:
    //! the container
    std::map<std::string, std::string> data;
@@ -138,6 +142,19 @@ T ConfigurationFile::getValue(const std::string& key) const
    return x;
 }
 
+//! Look up \a key and convert its value with the single-argument getValue<T>();
+//! \a defaultValue is returned unchanged when contains(key) is false.
+template<class T>
+T ConfigurationFile::getValue(const std::string& key, T defaultValue) const
+{
+   const bool keyIsPresent = contains(key);
+   if (!keyIsPresent)
+   {
+      return defaultValue; // nothing is stored under this key
+   }
+   return getValue<T>(key);
+}
+
 }
 
 #endif
diff --git a/src/gpu/GridGenerator/VelocitySetter/VelocitySetter.cpp b/src/gpu/GridGenerator/VelocitySetter/VelocitySetter.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ed1335d2314f6fe4459f711872c1af968b4a600d
--- /dev/null
+++ b/src/gpu/GridGenerator/VelocitySetter/VelocitySetter.cpp
@@ -0,0 +1,425 @@
+#include "VelocitySetter.h"
+#include "GridGenerator/grid/Grid.h"
+#include "GridGenerator/grid/BoundaryConditions/BoundaryCondition.h"
+#include <logger/Logger.h>
+
+
+#include <math.h>
+#include <sstream>
+#include <fstream>
+#include <iostream>
+#include <algorithm>
+
+SPtr<VelocityFileCollection> createFileCollection(std::string prefix, FileType type)
+{
+    // Factory for the file collection that matches `type`; only FileType::VTK is
+    // handled, every other value yields a null pointer.
+    if(type == FileType::VTK)
+    {
+        return std::make_shared<VTKFileCollection>(prefix);
+    }
+
+    return nullptr;
+}
+
+SPtr<VelocityReader> createReaderForCollection(SPtr<VelocityFileCollection> fileCollection)
+{
+    // Factory for the reader that matches the file type of the collection; only
+    // FileType::VTK is handled, every other value yields a null pointer.
+    if(fileCollection->getFileType() != FileType::VTK)
+    {
+        return nullptr;
+    }
+
+    return std::make_shared<VTKReader>(std::static_pointer_cast<VTKFileCollection>(fileCollection));
+}
+template<typename T>
+std::vector<T> readStringToVector(std::string s)
+{
+    // Parses the whitespace-separated numbers in `s` into a vector; parsing stops at the first
+    // token that cannot be read as a T. Tokens are extracted as T itself (not via float), so
+    // integers that a float cannot represent exactly keep their value.
+    std::vector<T> out;
+    std::stringstream input(s);
+    T num;
+    while(input >> num) out.push_back(num);
+    return out;
+}
+std::string readElement(std::string line)
+{
+    // Returns the text between the first '<' and the next blank (up to the end of the line if no blank follows).
+    const std::string::size_type nameBegin = line.find('<') + 1;
+    const std::string::size_type nameEnd   = line.find(' ', nameBegin);
+    return line.substr(nameBegin, nameEnd - nameBegin);
+}
+
+std::string readAttribute(std::string line, std::string attributeName)
+{   // Returns the quoted value of the attribute, "" if it is absent. Only a whole name preceded by a blank matches, so "Extent" no longer hits "WholeExtent".
+    const size_t pos = (" " + line).find(" " + attributeName + "=\""); // the prepended blank lets an attribute at the very start match as well
+    const size_t attributeStart = pos + attributeName.size() + 2;      // index in `line` of the first character behind the opening quote
+    return pos == std::string::npos ? std::string() : line.substr(attributeStart, line.find("\"", attributeStart) - attributeStart);
+}
+
+void VTKFile::readHeader()
+{
+    // Parses the XML header of the file: grid spacing, extent of the piece and, for every <DataArray> of
+    // the first data section, its name and the absolute byte position of its values inside the file.
+    //TODO make this more flexible
+    std::ifstream file(this->fileName, std::ios::in | std::ios::binary); // binary: tellg() below has to be an exact byte offset
+    std::string line;
+
+    getline(file, line); // VTKFile
+    if(line.size()>1 && line[1]=='?') getline(file, line); // ignore first line if xml version
+
+    getline(file, line); // ImageData
+    // WholeExtent is not evaluated; only the extent of the piece itself is needed
+    std::vector<float> origin = readStringToVector<float>(readAttribute(line, "Origin"));
+    std::vector<float> spacing = readStringToVector<float>(readAttribute(line, "Spacing"));
+
+    getline(file, line); // Piece 
+    std::vector<int> pieceExtent = readStringToVector<int>(readAttribute(line, "Extent"));
+    getline(file, line); // PointData
+
+    getline(file, line);
+    while(readElement(line) == "DataArray")
+    {
+        Quantity quant = Quantity();
+        quant.name = readAttribute(line, "Name");
+        quant.offset = std::stoi(readAttribute(line, "offset"));
+        this->quantities.push_back( quant );
+        getline(file, line);
+    }
+    getline(file, line); // </Piece
+    getline(file, line); // </ImageData
+    getline(file, line); // AppendedData
+
+    int offset = int(file.tellg())+sizeof(char)+4; // skip underscore and bytesPerVal
+
+    for(auto& quantity: this->quantities)
+    {
+        quantity.offset += offset; // offsets in the header are relative to the start of the appended data
+    }
+
+    file.close();
+    // .at() turns a malformed header (too few numbers) into std::out_of_range instead of an out-of-bounds read
+    this->deltaX = spacing.at(0);
+    this->deltaY = spacing.at(1);
+    this->deltaZ = spacing.at(2);
+
+    this->nx = pieceExtent.at(1)-pieceExtent.at(0)+1;
+    this->ny = pieceExtent.at(3)-pieceExtent.at(2)+1;
+    this->nz = pieceExtent.at(5)-pieceExtent.at(4)+1;
+
+    this->minX = origin.at(0)+this->deltaX*pieceExtent.at(0); this->maxX = (this->nx-1)*this->deltaX+this->minX;
+    this->minY = origin.at(1)+this->deltaY*pieceExtent.at(2); this->maxY = (this->ny-1)*this->deltaY+this->minY;
+    this->minZ = origin.at(2)+this->deltaZ*pieceExtent.at(4); this->maxZ = (this->nz-1)*this->deltaZ+this->minZ;
+    // printFileInfo();
+}
+
+bool VTKFile::markNANs(std::vector<uint> readIndices)
+{
+    // Returns true if any of the first readIndices.size() values of the first quantity is NaN.
+    // The buffer has to be SIZED, not merely reserved: reading into reserved-only storage is
+    // undefined behaviour and leaves begin()==end(), so a NaN could never be found.
+    if(this->quantities.empty() || readIndices.empty()) return false;
+    std::ifstream buf(fileName.c_str(), std::ios::in | std::ios::binary);
+    std::vector<double> tmp(readIndices.size());
+    buf.seekg(this->quantities[0].offset);
+    buf.read((char*) tmp.data(), sizeof(double)*readIndices.size());
+    return std::any_of(tmp.begin(), tmp.end(), [](double value){ return isnan(value); });
+}
+
+void VTKFile::loadFile()
+{
+    // Reads getNumberOfPoints() doubles per quantity, starting at the quantity's byte offset, and marks the file as loaded.
+    std::ifstream stream(this->fileName.c_str(), std::ios::in | std::ios::binary);
+    for(auto& quantity: this->quantities)
+    {
+        quantity.values.resize(getNumberOfPoints());
+        char* const destination = reinterpret_cast<char*>(quantity.values.data());
+        stream.seekg(quantity.offset);
+        stream.read(destination, this->getNumberOfPoints()*sizeof(double));
+    }
+    stream.close();
+    this->loaded = true;
+}
+
+void VTKFile::unloadFile()
+{
+    // Replaces the values of every quantity by an empty vector (via swap with a
+    // temporary) and marks the file as not loaded.
+    for(auto& quantity : this->quantities)
+        std::vector<double>().swap(quantity.values);
+
+    this->loaded = false;
+}
+
+void VTKFile::getData(real* data, uint numberOfNodes, std::vector<uint> readIndeces, std::vector<uint> writeIndices, uint offsetRead, uint offsetWrite)
+{
+    // Copies selected values of every quantity into `data`; quantity q uses the slice that starts at
+    // data[q*numberOfNodes]. Pair k maps the file value readIndeces[k]+offsetRead to the slot
+    // offsetWrite+writeIndices[k]. The file is loaded first if that has not happened yet.
+    if(!this->loaded) loadFile();
+    const size_t numberOfPairs = writeIndices.size();
+    for(size_t q=0; q<this->quantities.size(); q++)
+    {
+        const auto& source = this->quantities[q].values;
+        real* const target = &data[q*numberOfNodes];
+        for(size_t k=0; k<numberOfPairs; k++)
+            target[offsetWrite+writeIndices[k]] = source[readIndeces[k]+offsetRead];
+    }
+}
+
+void VTKFile::printFileInfo()
+{
+    // Prints the file name, grid dimensions, origin, spacing and the name and offset of every quantity.
+    printf("file %s with \n nx %i ny %i nz %i \n origin %f %f %f \n spacing %f %f %f \n", 
+            fileName.c_str(), nx, ny, nz, minX, minY, minZ, deltaX, deltaY, deltaZ);
+    // iterate by const reference: a by-value loop variable would copy every Quantity,
+    // including its (possibly loaded) values vector, just to print two fields
+    for(const auto& quantity: this->quantities)
+        printf("\t quantity %s offset %i \n", quantity.name.c_str(), quantity.offset);
+}
+
+
+void VTKFileCollection::findFiles()
+{   // Fills `files` (level -> id -> part) by probing makeFileName(level, id, part) until a file cannot be opened.
+    bool foundLastLevel = false;
+
+    while(!foundLastLevel) // one pass per level; files.size() is the index of the level being probed
+    {
+        bool foundLastID = false;
+        std::vector<std::vector<VTKFile>> filesOnThisLevel;
+        while(!foundLastID) // one pass per id; filesOnThisLevel.size() is the id being probed
+        {
+            bool foundLastPart = false;
+            std::vector<VTKFile> filesWithThisId;
+            while (!foundLastPart) // one pass per part; filesWithThisId.size() is the part being probed
+            {
+                std::string fname = makeFileName((int)files.size(), (int)filesOnThisLevel.size(), (int)filesWithThisId.size());
+                std::ifstream f(fname); // only used to test whether the file can be opened
+                if(f.good())
+                    filesWithThisId.emplace_back(fname); // constructs a VTKFile from the file name
+                else
+                    foundLastPart = true; // first part that cannot be opened ends the search for this id
+                
+            }
+            if(!filesWithThisId.empty())
+                filesOnThisLevel.push_back(filesWithThisId);
+            else foundLastID = true; // an id without any part ends the search on this level
+        }
+        if(!filesOnThisLevel.empty())
+            files.push_back(filesOnThisLevel);
+        else foundLastLevel = true; // a level without any file ends the whole search
+    }
+}
+    
+void VelocityReader::getNeighbors(uint* neighborNT, uint* neighborNB, uint* neighborST, uint* neighborSB)
+{   // Copies the four planeNeighbor* vectors into the given arrays, starting at element writingOffset of each array.
+    std::copy(planeNeighborNT.cbegin(), planeNeighborNT.cend(), neighborNT + writingOffset);
+    std::copy(planeNeighborNB.cbegin(), planeNeighborNB.cend(), neighborNB + writingOffset);
+    std::copy(planeNeighborST.cbegin(), planeNeighborST.cend(), neighborST + writingOffset);
+    std::copy(planeNeighborSB.cbegin(), planeNeighborSB.cend(), neighborSB + writingOffset);
+}
+
+void VelocityReader::getWeights(real* _weightsNT, real* _weightsNB, real* _weightsST, real* _weightsSB)
+{   // Copies the four weight lists into the caller's arrays, starting at element writingOffset.
+    std::copy(weightsNT.cbegin(), weightsNT.cend(), _weightsNT + writingOffset);
+    std::copy(weightsNB.cbegin(), weightsNB.cend(), _weightsNB + writingOffset);
+    std::copy(weightsST.cbegin(), weightsST.cend(), _weightsST + writingOffset);
+    std::copy(weightsSB.cbegin(), weightsSB.cend(), _weightsSB + writingOffset);
+}
+
+
+void VTKReader::initializeIndexVectors()
+{   // Shapes the index bookkeeping like fileCollection->files: one slot per level, then one per file ID.
+    const auto& files = this->fileCollection->files;
+    this->readIndices.resize(files.size());
+    this->writeIndices.resize(files.size());
+    this->nFile.resize(files.size());
+    for(size_t lev=0; lev<files.size(); lev++){
+        this->readIndices[lev].resize(files[lev].size());
+        this->writeIndices[lev].resize(files[lev].size());
+        this->nFile[lev].resize(files[lev].size());
+    }
+}
+
+void VTKReader::fillArrays(std::vector<real>& coordsY, std::vector<real>& coordsZ)
+{
+    // For every point (coordsY[i], coordsZ[i]) look up the four surrounding precursor nodes (NT, NB, ST, SB) and
+    // store their write index plus the weight 1/(squared distance + eps); throws if a point lacks any of them.
+    this->nPoints = (uint)coordsY.size();
+    this->initializeIndexVectors();
+    real max_diff = 1e-4; // maximum distance between point on grid and precursor plane to count as exact match
+    real eps = 1e-7; // small number to avoid division by zero
+    bool perfect_match = true;
+
+    this->weightsNT.reserve(this->nPoints);
+    this->weightsNB.reserve(this->nPoints);
+    this->weightsST.reserve(this->nPoints);
+    this->weightsSB.reserve(this->nPoints);
+
+    this->planeNeighborNT.reserve(this->nPoints);
+    this->planeNeighborNB.reserve(this->nPoints);
+    this->planeNeighborST.reserve(this->nPoints);
+    this->planeNeighborSB.reserve(this->nPoints);
+
+    for(uint i=0; i<nPoints; i++)
+    {
+        real posY = coordsY[i];
+        real posZ = coordsZ[i];
+        bool foundNT = false, foundNB = false, foundST = false, foundSB = false, foundAll = false;
+
+        for(int level = (int)this->fileCollection->files.size()-1; level>=0; level--) // go backwards to find finest nodes first
+        {
+            for(int fileId=0; fileId<(int)this->fileCollection->files[level].size(); fileId++)
+            {
+                VTKFile& file = this->fileCollection->files[level][fileId][0]; // reference: no copy of the file object per point
+                if(!file.inBoundingBox(posY, posZ, 0.0f)) continue;
+                // y in simulation is x in precursor/file, z in simulation is y in precursor/file 
+                // simulation -> file: N -> E, S -> W, T -> N, B -> S
+                int idx = file.findNeighborWSB(posY, posZ, 0.f);
+                if(idx!=-1)
+                {
+                    // Filter for exact matches
+                    if(abs(posY-file.getX(idx)) < max_diff && abs(posZ-file.getY(idx)) < max_diff) 
+                    {
+                        // Exact hit: all four neighbors point at the same node and only NT carries a weight.
+                        this->weightsNT.emplace_back(1e6f);
+                        this->weightsNB.emplace_back(0.f);
+                        this->weightsST.emplace_back(0.f);
+                        this->weightsSB.emplace_back(0.f);
+                        uint writeIdx = this->getWriteIndex(level, fileId, idx);
+                        this->planeNeighborNT.push_back(writeIdx);
+                        this->planeNeighborNB.push_back(writeIdx);
+                        this->planeNeighborST.push_back(writeIdx);
+                        this->planeNeighborSB.push_back(writeIdx);
+                        foundNT = true; foundNB = true; foundSB = true; foundST = true;
+                    } 
+                    else
+                    {
+                        perfect_match = false;
+                    }
+                    // NOTE(review): the exact-match branch appends to all four lists even if some neighbors were already found in an earlier file - confirm that cannot happen.
+                    if(!foundSB)
+                    {
+                        foundSB = true;
+                        real dy = file.getX(idx)-posY;
+                        real dz = file.getY(idx)-posZ;
+                        this->weightsSB.emplace_back(1.f/(dy*dy+dz*dz+eps));
+                        this->planeNeighborSB.emplace_back(getWriteIndex(level, fileId, idx));
+                    }
+                } 
+
+                if(!foundNT) //NT in simulation is EN in precursor
+                {
+                    int idx = file.findNeighborENB(posY, posZ, 0.f);
+                    if(idx!=-1)
+                    {
+                        foundNT = true;
+                        real dy = file.getX(idx)-posY;
+                        real dz = file.getY(idx)-posZ;
+                        this->weightsNT.emplace_back(1.f/(dy*dy+dz*dz+eps));
+                        this->planeNeighborNT.emplace_back(getWriteIndex(level, fileId, idx));
+                    }
+                }
+
+                if(!foundNB) //NB in simulation is ES in precursor
+                {
+                    int idx = file.findNeighborESB(posY, posZ, 0.f);
+                    if(idx!=-1)
+                    {
+                        foundNB = true;
+                        real dy = file.getX(idx)-posY;
+                        real dz = file.getY(idx)-posZ;
+                        this->weightsNB.emplace_back(1.f/(dy*dy+dz*dz+eps));
+                        // The index belongs in the NB list so that planeNeighborNB stays aligned with weightsNB.
+                        this->planeNeighborNB.emplace_back(getWriteIndex(level, fileId, idx));
+                    }
+                }
+
+                if(!foundST) //ST in simulation is WN in precursor
+                {
+                    int idx = file.findNeighborWNB(posY, posZ, 0.f);
+                    if(idx!=-1)
+                    {
+                        foundST = true;
+                        real dy = file.getX(idx)-posY;
+                        real dz = file.getY(idx)-posZ;
+                        this->weightsST.emplace_back(1.f/(dy*dy+dz*dz+eps));
+                        this->planeNeighborST.emplace_back(getWriteIndex(level, fileId, idx));
+                    }
+                }
+
+                foundAll = foundNT && foundNB && foundST && foundSB;
+
+                if(foundAll) break;
+            }
+            if(foundAll) break;
+        }
+
+        if(!foundAll)
+            throw std::runtime_error("Did not find neighbors in the VelocityFileCollection for all points");
+    }
+
+    if(perfect_match)
+        printf("Precursor was a perfect match \n");
+
+    // Every file node registered for reading is checked via markNANs(); a reported NAN aborts the setup.
+    for(size_t level=0; level<this->fileCollection->files.size(); level++){
+        for(size_t id=0; id<this->fileCollection->files[level].size(); id++){
+            if(this->fileCollection->files[level][id][0].markNANs(this->readIndices[level][id]))
+                throw std::runtime_error("Found a NAN in the precursor where a velocity is needed");
+    }}
+}
+
+uint VTKReader::getWriteIndex(int level, int id, int linearIndex)
+{   // Returns the buffer slot that receives file node linearIndex of file (level, id), registering it on first use.
+    std::vector<uint>& reads  = this->readIndices[level][id];  // file-side linear indices registered so far
+    std::vector<uint>& writes = this->writeIndices[level][id]; // buffer slots, parallel to reads
+    // Search the READ indices: writeIndices holds buffer slots, which are not comparable to a file linear index.
+    auto it = std::find(reads.begin(), reads.end(), (uint)linearIndex);
+    if(it!=reads.end()) return writes[it-reads.begin()]; // already registered: reuse its slot
+    const uint slot = this->nPointsRead++; // slots are numbered consecutively across all levels and file IDs
+    writes.push_back(slot);
+    reads.push_back((uint)linearIndex);
+    return slot;
+}
+
+
+void VTKReader::getNextData(real* data, uint numberOfNodes, real time)
+{
+    // For every (level, file ID): advance to the next file part once time leaves the current part's z-range,
+    // then copy the XY plane whose z is closest to time into data.
+    for(size_t level=0; level<this->fileCollection->files.size(); level++)
+    {
+        for(size_t id=0; id<this->fileCollection->files[level].size(); id++)
+        {
+            std::vector<VTKFile>& parts = this->fileCollection->files[level][id];
+            size_t nF = this->nFile[level][id];
+
+            if(!parts[nF].inZBounds(time))
+            {
+                nF++;
+                printf("switching to precursor file no. %zu\n", nF); // %zu: nF is an unsigned size_t
+                if(nF == parts.size())
+                    throw std::runtime_error("Not enough Precursor Files to read");
+
+                // A prefetch started earlier may still be loading parts[nF]; finish it before any file is touched,
+                // otherwise getData() below could call loadFile() on the same file concurrently.
+                read.wait();
+                parts[nF-1].unloadFile();
+                if(nF+1<parts.size() && !parts[nF+1].isLoaded())
+                {
+                    // Prefetch the part after the new current one in the background.
+                    read = std::async(std::launch::async, [](VTKFile* file){ file->loadFile(); }, &parts[nF+1]);
+                }
+            }
+
+            VTKFile* file = &parts[nF];
+
+            // time is looked up along the file's z axis; off is the start of the selected XY plane.
+            int off = file->getClosestIdxZ(time)*file->getNumberOfPointsInXYPlane();
+            file->getData(data, numberOfNodes, this->readIndices[level][id], this->writeIndices[level][id], off, this->writingOffset);
+            this->nFile[level][id] = nF;
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/gpu/GridGenerator/VelocitySetter/VelocitySetter.h b/src/gpu/GridGenerator/VelocitySetter/VelocitySetter.h
new file mode 100644
index 0000000000000000000000000000000000000000..fe8fbbacf51843c599769a58d31c1d8e1fa5b0d6
--- /dev/null
+++ b/src/gpu/GridGenerator/VelocitySetter/VelocitySetter.h
@@ -0,0 +1,199 @@
+#ifndef VELOCITY_SETTER_H_
+#define VELOCITY_SETTER_H_
+
+#include "Core/DataTypes.h"
+#include <Core/StringUtilities/StringUtil.h>
+#include "PointerDefinitions.h"
+
+#include <string>
+#include <vector>
+#include <math.h>
+#include <sstream>
+#include <future>
+class Grid;
+namespace gg
+{
+    class BoundaryCondition;
+}
+
+
+enum class FileType // format tag returned by VelocityFileCollection::getFileType()
+{
+    VTK // currently the only enumerator; returned by VTKFileCollection
+};
+
+struct Quantity // one named data array of a VTKFile
+{
+    std::string name; // printed by VTKFile::printFileInfo
+    int offset; // printed by printFileInfo; NOTE(review): presumably the array's data offset inside the file - confirm in readHeader
+    std::vector<double> values; // indexed by linear node index in VTKFile::getData
+};
+
+// One precursor VTK file on a regular nx*ny*nz grid. The constructor calls readHeader() and marks the file as
+// not loaded; getData() calls loadFile() on demand.
+class VTKFile
+{
+public: 
+    VTKFile(std::string _fileName): 
+    fileName(_fileName)
+    {
+        readHeader();
+        this->loaded = false;
+        // printFileInfo();
+    };
+
+    void getData(real* data, uint numberOfNodes, std::vector<uint> readIndeces, std::vector<uint> writeIndices, uint offsetRead, uint offsetWrite);
+    bool markNANs(std::vector<uint> readIndices);
+    bool inBoundingBox(real posX, real posY, real posZ){return  inXBounds(posX) && inYBounds(posY) && inZBounds(posZ); };
+    bool inXBounds(real posX){ return posX<=maxX && posX>=minX; };
+    bool inYBounds(real posY){ return posY<=maxY && posY>=minY; };
+    bool inZBounds(real posZ){ return posZ<=maxZ && posZ>=minZ; };
+    // NOTE(review): findNeighbor* range-check only the linear index, so a per-axis index of n (e.g. idxX+1 == nx) is not rejected by itself.
+    int findNeighborWSB(real posX, real posY, real posZ){ int idx = getLinearIndex(getIdxWX(posX)  , getIdxSY(posY)  , getIdxBZ(posZ)  ); return (idx>=0) && (idx<nx*ny*nz) ? idx : -1; };
+    int findNeighborWST(real posX, real posY, real posZ){ int idx = getLinearIndex(getIdxWX(posX)  , getIdxSY(posY)  , getIdxBZ(posZ)+1); return (idx>=0) && (idx<nx*ny*nz) ? idx : -1; };
+    int findNeighborWNB(real posX, real posY, real posZ){ int idx = getLinearIndex(getIdxWX(posX)  , getIdxSY(posY)+1, getIdxBZ(posZ)  ); return (idx>=0) && (idx<nx*ny*nz) ? idx : -1; };
+    int findNeighborWNT(real posX, real posY, real posZ){ int idx = getLinearIndex(getIdxWX(posX)  , getIdxSY(posY)+1, getIdxBZ(posZ)+1); return (idx>=0) && (idx<nx*ny*nz) ? idx : -1; };
+    int findNeighborESB(real posX, real posY, real posZ){ int idx = getLinearIndex(getIdxWX(posX)+1, getIdxSY(posY)  , getIdxBZ(posZ)  ); return (idx>=0) && (idx<nx*ny*nz) ? idx : -1; };
+    int findNeighborEST(real posX, real posY, real posZ){ int idx = getLinearIndex(getIdxWX(posX)+1, getIdxSY(posY)  , getIdxBZ(posZ)+1); return (idx>=0) && (idx<nx*ny*nz) ? idx : -1; };
+    int findNeighborENB(real posX, real posY, real posZ){ int idx = getLinearIndex(getIdxWX(posX)+1, getIdxSY(posY)+1, getIdxBZ(posZ)  ); return (idx>=0) && (idx<nx*ny*nz) ? idx : -1; };
+    int findNeighborENT(real posX, real posY, real posZ){ int idx = getLinearIndex(getIdxWX(posX)+1, getIdxSY(posY)+1, getIdxBZ(posZ)+1); return (idx>=0) && (idx<nx*ny*nz) ? idx : -1; };
+    int getIdxX(int linearIdx){ return linearIdx%nx;};
+    int getIdxY(int linearIdx){ return (linearIdx/nx)%ny;};
+    int getIdxZ(int linearIdx){ return linearIdx/(nx*ny); };
+    real getX(int linearIdx){ return getIdxX(linearIdx)*deltaX+minX; };
+    real getY(int linearIdx){ return getIdxY(linearIdx)*deltaY+minY; };
+    real getZ(int linearIdx){ return getIdxZ(linearIdx)*deltaZ+minZ; };
+    int getIdxWX(real posX){ return (posX-minX)/deltaX; };
+    int getIdxSY(real posY){ return (posY-minY)/deltaY; };
+    int getIdxBZ(real posZ){ return (posZ-minZ)/deltaZ; };
+    // Nearest node index per axis, clamped to the valid range [0, n-1] (index n would address past the last node).
+    int getClosestIdxX(real posX){ int x = round((posX-minX)/deltaX); if(x>nx-1) x = nx-1; return x<0 ? 0 : x;};
+    int getClosestIdxY(real posY){ int y = round((posY-minY)/deltaY); if(y>ny-1) y = ny-1; return y<0 ? 0 : y;};
+    int getClosestIdxZ(real posZ){ int z = round((posZ-minZ)/deltaZ); if(z>nz-1) z = nz-1; return z<0 ? 0 : z;};
+    int getLinearIndex(int idxX, int idxY, int idxZ){ return idxX + nx*(idxY+ny*idxZ); };
+    int getNumberOfPointsInXYPlane(){ return nx*ny; }
+    int getNumberOfPointsInYZPlane(){ return ny*nz; }
+    int getNumberOfPointsInXZPlane(){ return nx*nz; }
+    int getNumberOfPoints(){ return nx*ny*nz; }
+    size_t getNumberOfQuantities(){ return quantities.size(); }
+    void loadFile();
+    void unloadFile();
+    bool isLoaded(){return loaded;};
+
+private:
+    void readHeader();
+    void printFileInfo();
+
+    std::string fileName;
+    real minX, maxX, minY, maxY, minZ, maxZ;
+    real deltaX, deltaY, deltaZ;
+    int nx, ny, nz;
+    std::vector<Quantity> quantities;
+    bool loaded;
+};
+
+// Abstract base for a collection of precursor files identified by a file-name prefix.
+class VelocityFileCollection
+{
+public:
+    VelocityFileCollection(std::string _prefix): 
+    prefix(_prefix){};
+    virtual ~VelocityFileCollection() = default;
+    // Number of quantities provided by the files of the concrete collection.
+    virtual size_t getNumberOfQuantities()=0;
+    // Format tag of the concrete collection.
+    virtual FileType getFileType()=0;
+
+protected:
+    std::string prefix;
+};
+
+
+// Collection of VTK precursor files named <prefix>_lev_<level>_ID_<id>_File_<part>.bin.vti,
+// discovered on construction and stored as files[level][id][part].
+class VTKFileCollection : public VelocityFileCollection
+{
+public:
+    VTKFileCollection(std::string _prefix): 
+    VelocityFileCollection(_prefix)
+    {
+        findFiles();
+    };
+
+    FileType getFileType() override { return FileType::VTK; };
+    size_t getNumberOfQuantities() override { return files[0][0][0].getNumberOfQuantities(); } // NOTE(review): unchecked access; undefined if findFiles() found no file
+
+private:
+    void findFiles();
+    std::string makeFileName(int level, int id, int part)
+    { 
+        return prefix + "_lev_" + StringUtil::toString<int>(level)
+                    + "_ID_" +    StringUtil::toString<int>(id)
+                    + "_File_" +  StringUtil::toString<int>(part) 
+                    + ".bin." + suffix;
+    };
+
+public:
+    static const inline std::string suffix = "vti";
+    std::vector<std::vector<std::vector<VTKFile>>> files;
+};
+
+
+// Abstract reader that maps boundary-plane points onto precursor data: fillArrays() builds the neighbor and
+// weight lists, getNextData() fills a value buffer for a given time.
+class VelocityReader
+{
+public:
+    VelocityReader(): nPoints(0), nPointsRead(0), writingOffset(0) {}
+    virtual ~VelocityReader() = default;
+
+    // Writes the precursor values needed at `time` into data (numberOfNodes entries per quantity).
+    virtual void getNextData(real* data, uint numberOfNodes, real time)=0;
+    // Builds neighbor indices and interpolation weights for the points (coordsY[i], coordsZ[i]).
+    virtual void fillArrays(std::vector<real>& coordsY, std::vector<real>& coordsZ)=0;
+    uint getNPoints(){return nPoints; };
+    uint getNPointsRead(){return nPointsRead; };
+    size_t getNumberOfQuantities(){ return nQuantities; };
+    // Start element used by getNeighbors()/getWeights() in the caller's arrays.
+    void setWritingOffset(uint offset){ this->writingOffset = offset; }
+    void getNeighbors(uint* neighborNT, uint* neighborNB, uint* neighborST, uint* neighborSB);
+    void getWeights(real* _weightsNT, real* _weightsNB, real* _weightsST, real* _weightsSB);
+
+public:
+    std::vector<uint> planeNeighborNT,  planeNeighborNB, planeNeighborST, planeNeighborSB;
+    std::vector<real> weightsNT, weightsNB, weightsST,  weightsSB;
+
+protected:
+    uint nPoints, nPointsRead, writingOffset;
+    uint nReads=0;
+    size_t nQuantities=0;
+};
+
+
+class VTKReader : public VelocityReader // VelocityReader implementation backed by a VTKFileCollection
+{
+public:
+    VTKReader(SPtr<VTKFileCollection> _fileCollection):
+    fileCollection(_fileCollection)    
+    {
+        this->nQuantities = fileCollection->getNumberOfQuantities();
+        read = std::async([](){}); // no-op task so that `read` is a valid future before the first read.wait()
+    };
+    void getNextData(real* data, uint numberOfNodes, real time) override;
+    void fillArrays(std::vector<real>& coordsY, std::vector<real>& coordsZ) override;
+private:  
+    uint getWriteIndex(int level, int id, int linearIdx);
+    void initializeIndexVectors();
+    // readIndices / writeIndices are indexed [level][id][k]: file node to read and position to write it to.
+private:
+    std::vector<std::vector<std::vector<uint>>> readIndices, writeIndices;
+    std::vector<std::vector<size_t>> nFile; // [level][id]: index of the file part currently in use
+    SPtr<VTKFileCollection> fileCollection;
+    std::future<void> read; // handle of the most recently started background loadFile()
+};
+
+
+SPtr<VelocityFileCollection> createFileCollection(std::string prefix, FileType type);
+SPtr<VelocityReader> createReaderForCollection(SPtr<VelocityFileCollection> fileCollection);
+
+#endif //VELOCITY_SETTER_H_
\ No newline at end of file
diff --git a/src/gpu/GridGenerator/grid/BoundaryConditions/BoundaryCondition.cpp b/src/gpu/GridGenerator/grid/BoundaryConditions/BoundaryCondition.cpp
index 5102f60fc295aadf4323a4b332bf3dd8f7f21dbf..a0cc56b08d3e04ab3c04ed7ddca017336abf3c21 100644
--- a/src/gpu/GridGenerator/grid/BoundaryConditions/BoundaryCondition.cpp
+++ b/src/gpu/GridGenerator/grid/BoundaryConditions/BoundaryCondition.cpp
@@ -36,12 +36,12 @@
 
 #include "grid/BoundaryConditions/Side.h"
 #include "grid/Grid.h"
+#include "GridGenerator/VelocitySetter/VelocitySetter.h"
 
 bool gg::BoundaryCondition::isSide( SideType side ) const
 {
     return this->side->whoAmI() == side;
 }
-
 //////////////////////////////////////////////////////////////////////////
 
 void VelocityBoundaryCondition::setVelocityProfile(
@@ -124,5 +124,4 @@ void StressBoundaryCondition::fillSamplingIndices(std::vector<SPtr<Grid> > grid,
         this->velocitySamplingIndices.push_back( grid[level]->transCoordToIndex(x_sampling, y_sampling, z_sampling) );
     }
     
-}
-
+}
\ No newline at end of file
diff --git a/src/gpu/GridGenerator/grid/BoundaryConditions/BoundaryCondition.h b/src/gpu/GridGenerator/grid/BoundaryConditions/BoundaryCondition.h
index 4a3990d9f815042297be76ae83a61268c8ad6815..f70aa0cf886019e6a97ca5c86a0cdafa1296b141 100644
--- a/src/gpu/GridGenerator/grid/BoundaryConditions/BoundaryCondition.h
+++ b/src/gpu/GridGenerator/grid/BoundaryConditions/BoundaryCondition.h
@@ -45,6 +45,8 @@ class Grid;
 class Side;
 enum class SideType;
 
+class VelocityReader;
+
 namespace gg
 {
 class BoundaryCondition
@@ -63,6 +65,8 @@ public:
     bool isSide(SideType side) const;
 
     real getQ(uint index, uint dir) { return this->qs[index][dir]; }
+
+    void getCoords( SPtr<Grid> grid, std::vector<real>& x, std::vector<real>& y, std::vector<real>& z);
 };
 
 }
@@ -246,6 +250,7 @@ public:
     real getVy(uint index) { return this->vyList[index]; }
     real getVz(uint index) { return this->vzList[index]; }
 
+
     void setVelocityProfile( SPtr<Grid> grid, std::function<void(real,real,real,real&,real&,real&)> velocityProfile );
 };
 
@@ -329,5 +334,32 @@ public:
     real getNormalz(uint index) { return this->normalZList[index]; }
 };
 
+class PrecursorBoundaryCondition : public gg::BoundaryCondition // boundary condition fed by a VelocityReader
+{
+public:
+    static SPtr<PrecursorBoundaryCondition> make(SPtr<VelocityReader> reader, int nTRead, real velocityX, real velocityY, real velocityZ)
+    {
+        return SPtr<PrecursorBoundaryCondition>(new PrecursorBoundaryCondition(reader, nTRead, velocityX, velocityY, velocityZ));
+    }
 
+    SPtr<VelocityReader> getReader(){ return reader; }
+    real getVelocityX() { return velocityX; }
+    real getVelocityY() { return velocityY; }
+    real getVelocityZ() { return velocityZ; }
+
+private:
+    // Initializers follow the member declaration order (nTRead, velocities, reader), the order in which they run.
+    PrecursorBoundaryCondition(SPtr<VelocityReader> _reader, uint _nTRead, real vx, real vy, real vz) : nTRead(_nTRead), velocityX(vx), velocityY(vy), velocityZ(vz), reader(_reader) { };
+    virtual char getType() const override
+    {
+        return vf::gpu::BC_VELOCITY;
+    }
+public:
+    uint nTRead;
+private:
+    real velocityX = 0.0;
+    real velocityY = 0.0;
+    real velocityZ = 0.0;
+    SPtr<VelocityReader> reader;
+};
 #endif
\ No newline at end of file
diff --git a/src/gpu/GridGenerator/grid/BoundaryConditions/Side.cpp b/src/gpu/GridGenerator/grid/BoundaryConditions/Side.cpp
index 6c7bf8ca1853826d83fb6a713ffe03716bd2cf9a..270aff9d8ff9639b8ae0c19451ca90990eba9c63 100644
--- a/src/gpu/GridGenerator/grid/BoundaryConditions/Side.cpp
+++ b/src/gpu/GridGenerator/grid/BoundaryConditions/Side.cpp
@@ -40,6 +40,18 @@
 
 using namespace gg;
 
+std::vector<real> Side::getNormal()
+{   // Axis-aligned normal: getDirection() along the side's coordinate axis, 0 in the other two components.
+    std::vector<real> normal(3, real(0.0)); // always three components, so callers may index [0..2] unconditionally
+    if(this->getCoordinate()==X_INDEX)
+        normal[0] = (real)this->getDirection();
+    else if(this->getCoordinate()==Y_INDEX)
+        normal[1] = (real)this->getDirection();
+    else if(this->getCoordinate()==Z_INDEX)
+        normal[2] = (real)this->getDirection();
+    return normal;
+}
+
 void Side::addIndices(SPtr<Grid> grid, SPtr<BoundaryCondition> boundaryCondition, std::string coord, real constant,
                       real startInner, real endInner, real startOuter, real endOuter)
 {
@@ -49,11 +61,20 @@ void Side::addIndices(SPtr<Grid> grid, SPtr<BoundaryCondition> boundaryCondition
         {
             const uint index = getIndex(grid, coord, constant, v1, v2);
 
-            if ((index != INVALID_INDEX) && (  grid->getFieldEntry(index) == vf::gpu::FLUID
-                                            || grid->getFieldEntry(index) == vf::gpu::FLUID_CFC
-                                            || grid->getFieldEntry(index) == vf::gpu::FLUID_CFF
-                                            || grid->getFieldEntry(index) == vf::gpu::FLUID_FCC
-                                            || grid->getFieldEntry(index) == vf::gpu::FLUID_FCF ))
+            if ((index != INVALID_INDEX) && (   grid->getFieldEntry(index) == vf::gpu::FLUID
+                                            ||  grid->getFieldEntry(index) == vf::gpu::FLUID_CFC
+                                            ||  grid->getFieldEntry(index) == vf::gpu::FLUID_CFF
+                                            ||  grid->getFieldEntry(index) == vf::gpu::FLUID_FCC
+                                            ||  grid->getFieldEntry(index) == vf::gpu::FLUID_FCF 
+                                            ||  grid->getFieldEntry(index) == vf::gpu::FLUID_FCF
+                                            
+                                            //! Enforce overlap of BCs on edge nodes
+                                            ||  grid->getFieldEntry(index)  == vf::gpu::BC_PRESSURE
+                                            ||  grid->getFieldEntry(index)  == vf::gpu::BC_VELOCITY 
+                                            ||  grid->getFieldEntry(index)  == vf::gpu::BC_NOSLIP   
+                                            ||  grid->getFieldEntry(index)  == vf::gpu::BC_SLIP     
+                                            ||  grid->getFieldEntry(index)  == vf::gpu::BC_STRESS )
+                                            /*&& boundaryCondition->getType()!=vf::gpu::BC_STRESS*/ )
             {
                 grid->setFieldEntry(index, boundaryCondition->getType());
                 boundaryCondition->indices.push_back(index);
@@ -64,6 +85,24 @@ void Side::addIndices(SPtr<Grid> grid, SPtr<BoundaryCondition> boundaryCondition
 
                 boundaryCondition->patches.push_back(0);
             }
+            // else if(boundaryCondition->getType()==vf::gpu::BC_STRESS && (index != INVALID_INDEX) && (   grid->getFieldEntry(index) == vf::gpu::FLUID
+            //                                 ||  grid->getFieldEntry(index) == vf::gpu::FLUID_CFC
+            //                                 ||  grid->getFieldEntry(index) == vf::gpu::FLUID_CFF
+            //                                 ||  grid->getFieldEntry(index) == vf::gpu::FLUID_FCC
+            //                                 ||  grid->getFieldEntry(index) == vf::gpu::FLUID_FCF 
+            //                                 ||  grid->getFieldEntry(index) == vf::gpu::FLUID_FCF
+            //                                 ||  grid->getFieldEntry(index)  == vf::gpu::BC_PRESSURE
+            //                                 ))
+            // {
+            //     grid->setFieldEntry(index, boundaryCondition->getType());
+            //     boundaryCondition->indices.push_back(index);
+            //     setPressureNeighborIndices(boundaryCondition, grid, index);
+            //     setStressSamplingIndices(boundaryCondition, grid, index);
+
+            //     setQs(grid, boundaryCondition, index);
+
+            //     boundaryCondition->patches.push_back(0);
+            // }
 
         }
     }
@@ -152,16 +191,28 @@ void Side::setQs(SPtr<Grid> grid, SPtr<BoundaryCondition> boundaryCondition, uin
             else                neighborZ = grid->getLastFluidNode ( coords, 2, grid->getEndZ() );
         }
 
+        //! Only setting q's that partially point in the Side-normal direction
+        bool alignedWithNormal = (this->getNormal()[0]*grid->getDirection()[dir * DIMENSION + 0]+
+                                  this->getNormal()[1]*grid->getDirection()[dir * DIMENSION + 1]+
+                                  this->getNormal()[2]*grid->getDirection()[dir * DIMENSION + 2] ) > 0;
+        
+        // if(boundaryCondition->getType()==vf::gpu::BC_VELOCITY && z < 8.0 )
+        // {
+        //     alignedWithNormal = true;
+        //     printf("XYZ: %f \t %f \t %f \n", x,y,z);
+        //     printf("dir: %d \t %d \t %d \n\n", grid->getDirection()[dir * DIMENSION + 0], grid->getDirection()[dir * DIMENSION + 1], grid->getDirection()[dir * DIMENSION + 2]);
+        // }
+
         uint neighborIndex = grid->transCoordToIndex( neighborX, neighborY, neighborZ );
-        if( grid->getFieldEntry(neighborIndex) == vf::gpu::STOPPER_OUT_OF_GRID_BOUNDARY ||
-            grid->getFieldEntry(neighborIndex) == vf::gpu::STOPPER_OUT_OF_GRID ||
-            grid->getFieldEntry(neighborIndex) == vf::gpu::STOPPER_SOLID )
+        if((grid->getFieldEntry(neighborIndex) == vf::gpu::STOPPER_OUT_OF_GRID_BOUNDARY ||
+            grid->getFieldEntry(neighborIndex) == vf::gpu::STOPPER_OUT_OF_GRID          ||
+            grid->getFieldEntry(neighborIndex) == vf::gpu::STOPPER_SOLID)               &&
+            alignedWithNormal )
             qNode[dir] = 0.5;
         else
             qNode[dir] = -1.0;
-
     }
-
+    
     boundaryCondition->qs.push_back(qNode);
 }
 
@@ -260,7 +311,7 @@ void MY::addIndices(std::vector<SPtr<Grid> > grid, uint level, SPtr<BoundaryCond
     real coordinateNormal = grid[level]->getStartY() + grid[level]->getDelta();
 
     if( coordinateNormal > grid[0]->getStartY() + grid[0]->getDelta() ) return;
-
+    
     Side::addIndices(grid[level], boundaryCondition, "y", coordinateNormal, startInner, endInner, startOuter, endOuter);
 }
 
diff --git a/src/gpu/GridGenerator/grid/BoundaryConditions/Side.h b/src/gpu/GridGenerator/grid/BoundaryConditions/Side.h
index 6df6bfccc9a39b80de3ac43d057a03945d035b34..53a763bc562ee978042b28d24856fbcca256c5f9 100644
--- a/src/gpu/GridGenerator/grid/BoundaryConditions/Side.h
+++ b/src/gpu/GridGenerator/grid/BoundaryConditions/Side.h
@@ -72,15 +72,17 @@ public:
 
     virtual SideType whoAmI() const = 0;
 
+    std::vector<real> getNormal();
+
 protected:
-    static void addIndices(SPtr<Grid> grid, SPtr<gg::BoundaryCondition> boundaryCondition, std::string coord, real constant,
+    void addIndices(SPtr<Grid> grid, SPtr<gg::BoundaryCondition> boundaryCondition, std::string coord, real constant,
                            real startInner, real endInner, real startOuter, real endOuter);
 
     static void setPressureNeighborIndices(SPtr<gg::BoundaryCondition> boundaryCondition, SPtr<Grid> grid, const uint index);
 
     static void setStressSamplingIndices(SPtr<gg::BoundaryCondition> boundaryCondition, SPtr<Grid> grid, const uint index);
 
-    static void setQs(SPtr<Grid> grid, SPtr<gg::BoundaryCondition> boundaryCondition, uint index);
+    void setQs(SPtr<Grid> grid, SPtr<gg::BoundaryCondition> boundaryCondition, uint index);
 
 private:
     static uint getIndex(SPtr<Grid> grid, std::string coord, real constant, real v1, real v2);
diff --git a/src/gpu/GridGenerator/grid/GridBuilder/GridBuilder.h b/src/gpu/GridGenerator/grid/GridBuilder/GridBuilder.h
index 739aef59f76a33fa67d472a77ef258469f5e411c..8e86e8bae6c4801839486767e2c85ab100b8588d 100644
--- a/src/gpu/GridGenerator/grid/GridBuilder/GridBuilder.h
+++ b/src/gpu/GridGenerator/grid/GridBuilder/GridBuilder.h
@@ -54,6 +54,7 @@ class GridWrapper;
 class Transformator;
 class ArrowTransformator;
 class PolyDataWriterWrapper;
+class VelocityReader;
 
 class BoundingBox;
 class Grid;
@@ -113,6 +114,15 @@ public:
     virtual void getPressureValues(real *rho, int *indices, int *neighborIndices, int level) const = 0;
     virtual void getPressureQs(real *qs[27], int level) const                                      = 0;
 
+    virtual uint getPrecursorSize(int level) const              = 0;
+    virtual void getPrecursorValues(uint* neighborNT, uint* neighborNB, uint* neighborST, uint* neighborSB, 
+                                    real* weightsNT, real* weightsNB, real* weightsST, real* weightsSB, 
+                                    int* indices, std::vector<SPtr<VelocityReader>>& reader, 
+                                    int& numberOfPrecursorNodes, size_t& numberOfQuantities, uint& nTRead, 
+                                    real& velocityX, real& velocityY, real& velocityZ, int level) const = 0;
+
+    virtual void getPrecursorQs(real* qs[27], int level) const  = 0;
+
     virtual uint getGeometrySize(int level) const                                 = 0;
     virtual void getGeometryIndices(int *indices, int level) const                = 0;
     virtual void getGeometryQs(real *qs[27], int level) const                     = 0;
diff --git a/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.cpp b/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.cpp
index 083b9a51e0b151f49922df456e968c4b204e4af7..0fd97f95817a73992347d33279978afdef87b0fc 100644
--- a/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.cpp
+++ b/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.cpp
@@ -52,6 +52,8 @@
 #include "io/QLineWriter.h"
 #include "io/SimulationFileWriter/SimulationFileWriter.h"
 
+#include "VelocitySetter/VelocitySetter.h"
+
 #include "utilities/communication.h"
 #include "utilities/transformator/ArrowTransformator.h"
 
@@ -243,6 +245,24 @@ void LevelGridBuilder::setNoSlipGeometryBoundaryCondition()
     }
 }
 
+void LevelGridBuilder::setPrecursorBoundaryCondition(SideType sideType, SPtr<VelocityFileCollection> fileCollection, int nTRead, real velocityX, real velocityY, real velocityZ)
+{
+    // Each grid level gets its own reader and its own precursor boundary condition on the requested side.
+    for (uint level = 0; level < getNumberOfGridLevels(); level++)
+    {
+        auto levelReader = createReaderForCollection(fileCollection);
+        SPtr<PrecursorBoundaryCondition> bc = PrecursorBoundaryCondition::make( levelReader, nTRead, velocityX, velocityY, velocityZ);
+
+        // The side collects the boundary node indices of this level into the boundary condition.
+        bc->side = SideFactory::make(sideType);
+        bc->side->addIndices(grids, level, bc);
+
+        boundaryConditions[level]->precursorBoundaryConditions.push_back(bc);
+
+        *logging::out << logging::Logger::INFO_INTERMEDIATE << "Set Precursor BC on level " << level << " with " << (int)bc->indices.size() << "\n";
+    }
+}
+
 GRIDGENERATOR_EXPORT void LevelGridBuilder::setEnableFixRefinementIntoTheWall(bool enableFixRefinementIntoTheWall)
 {
     for (uint level = 0; level < this->grids.size(); level++)
@@ -594,6 +614,87 @@ void LevelGridBuilder::getPressureQs(real* qs[27], int level) const
     }
 }
 
+uint LevelGridBuilder::getPrecursorSize(int level) const
+{
+    // total number of boundary nodes over all precursor BCs registered on this level
+    uint numberOfNodes = 0;
+    const auto& precursorBCs = boundaryConditions[level]->precursorBoundaryConditions;
+    for (const auto& precursorBC : precursorBCs)
+        numberOfNodes += uint(precursorBC->indices.size());
+    return numberOfNodes;
+}
+
+void LevelGridBuilder::getPrecursorValues(  uint* neighborNT, uint* neighborNB, uint* neighborST, uint* neighborSB, 
+                                            real* weightsNT, real* weightsNB, real* weightsST, real* weightsSB, 
+                                            int* indices, std::vector<SPtr<VelocityReader>>& reader, 
+                                            int& numberOfPrecursorNodes, size_t& numberOfQuantities, uint& nTRead, 
+                                            real& velocityX, real& velocityY, real& velocityZ, int level) const
+{
+    // Gathers node indices and readers of all precursor BCs of one level; the readers fill the neighbor and weight arrays.
+    int nodeOffset = 0;
+    int pointsRead = 0;
+    uint commonNTRead = 0;
+    size_t commonNQuantities = 0;
+    for (const auto& precursorBC : boundaryConditions[level]->precursorBoundaryConditions)
+    {
+        // 0 marks "not taken from a boundary condition yet"; afterwards every BC has to agree with the stored value
+        if (commonNTRead == 0)
+            commonNTRead = precursorBC->nTRead;
+        if (commonNTRead != precursorBC->nTRead)
+            throw std::runtime_error("All precursor boundary conditions must have the same NTRead value");
+        auto bcReader = precursorBC->getReader();
+        bcReader->setWritingOffset(nodeOffset);
+        reader.push_back(bcReader);
+
+        std::vector<real> yCoords, zCoords;
+        for (uint i = 0; i < precursorBC->indices.size(); i++, nodeOffset++)
+        {
+            real x, y, z;
+            const auto gridIndex = precursorBC->indices[i];
+            indices[nodeOffset] = grids[level]->getSparseIndex(gridIndex) + 1;
+            grids[level]->transIndexToCoords(gridIndex, x, y, z);
+            yCoords.push_back(y);
+            zCoords.push_back(z);
+        }
+        bcReader->fillArrays(yCoords, zCoords);
+        bcReader->getNeighbors(neighborNT, neighborNB, neighborST, neighborSB);
+        bcReader->getWeights(weightsNT, weightsNB, weightsST, weightsSB);
+        const auto readerNQuantities = bcReader->getNumberOfQuantities();
+        if (commonNQuantities == 0)
+            commonNQuantities = readerNQuantities;
+        if (commonNQuantities != readerNQuantities)
+            throw std::runtime_error("All precursor files must have the same quantities.");
+        pointsRead += bcReader->getNPointsRead();
+        // overwritten on every pass, so the velocities of the last boundary condition are the ones returned
+        velocityX = precursorBC->getVelocityX();
+        velocityY = precursorBC->getVelocityY();
+        velocityZ = precursorBC->getVelocityZ();
+    }
+    numberOfPrecursorNodes = pointsRead;
+    if (commonNTRead == 0)
+        throw std::runtime_error("NTRead of precursor needs to be larger than 0.");
+    nTRead = commonNTRead;
+    if (commonNQuantities == 0)
+        throw std::runtime_error("Number of quantities in precursor needs to be larger than 0.");
+    numberOfQuantities = commonNQuantities;
+}
+
+void LevelGridBuilder::getPrecursorQs(real* qs[27], int level) const
+{
+    // Copies the q values of all precursor BCs on this level into qs[direction][node],
+    // numbering the nodes consecutively across the boundary conditions.
+    int nodeCounter = 0;
+    for (const auto& precursorBC : boundaryConditions[level]->precursorBoundaryConditions)
+    {
+        for (uint node = 0; node < precursorBC->indices.size(); node++, nodeCounter++)
+        {
+            const auto& nodeQs = precursorBC->qs[node];
+            for (int direction = 0; direction <= grids[level]->getEndDirection(); direction++)
+                qs[direction][nodeCounter] = nodeQs[direction];
+        }
+    }
+}
+
 uint LevelGridBuilder::getGeometrySize(int level) const
 {
     if (boundaryConditions[level]->geometryBoundaryCondition)
diff --git a/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.h b/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.h
index 4d73b8b0464b9823f7fad0ac011450a23b4054f1..7240f466b077bf612361dd3b0465faf6ec97420b 100644
--- a/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.h
+++ b/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.h
@@ -58,8 +58,12 @@ class SlipBoundaryCondition;
 class StressBoundaryCondition;
 class PressureBoundaryCondition;
 class GeometryBoundaryCondition;
+class PrecursorBoundaryCondition;
 enum class SideType;
 
+class VelocityReader;
+class VelocityFileCollection;
+
 
 
 class LevelGridBuilder : public GridBuilder
@@ -80,6 +84,7 @@ public:
     GRIDGENERATOR_EXPORT void setPressureBoundaryCondition(SideType sideType, real rho);
     GRIDGENERATOR_EXPORT void setPeriodicBoundaryCondition(bool periodic_X, bool periodic_Y, bool periodic_Z);
     GRIDGENERATOR_EXPORT void setNoSlipBoundaryCondition(SideType sideType);
+    GRIDGENERATOR_EXPORT void setPrecursorBoundaryCondition(SideType sideType, SPtr<VelocityFileCollection> fileCollection, int nTRead, real velocityX=0.0f, real velocityY=0.0f, real velocityZ=0.0f);
 
     GRIDGENERATOR_EXPORT void setEnableFixRefinementIntoTheWall(bool enableFixRefinementIntoTheWall);
 
@@ -121,6 +126,14 @@ public:
     GRIDGENERATOR_EXPORT void getPressureValues(real* rho, int* indices, int* neighborIndices, int level) const override;
     GRIDGENERATOR_EXPORT virtual void getPressureQs(real* qs[27], int level) const override;
 
+    GRIDGENERATOR_EXPORT uint getPrecursorSize(int level) const override;
+    GRIDGENERATOR_EXPORT void getPrecursorValues(   uint* neighborNT, uint* neighborNB, uint* neighborST, uint* neighborSB, 
+                                                    real* weightsNT, real* weightsNB, real* weightsST, real* weightsSB, 
+                                                    int* indices, std::vector<SPtr<VelocityReader>>& reader, 
+                                                    int& numberOfPrecursorNodes, size_t& numberOfQuantities, uint& nTRead,
+                                                    real& velocityX, real& velocityY, real& velocityZ, int level) const override;
+    GRIDGENERATOR_EXPORT virtual void getPrecursorQs(real* qs[27], int level) const override;
+
     GRIDGENERATOR_EXPORT virtual void getGeometryQs(real *qs[27], int level) const override;
     GRIDGENERATOR_EXPORT virtual uint getGeometrySize(int level) const override;
     GRIDGENERATOR_EXPORT virtual void getGeometryIndices(int *indices, int level) const override;
@@ -149,6 +162,8 @@ protected:
 
         std::vector<SPtr<VelocityBoundaryCondition>> noSlipBoundaryConditions;
 
+        std::vector<SPtr<PrecursorBoundaryCondition>> precursorBoundaryConditions;
+
         SPtr<GeometryBoundaryCondition> geometryBoundaryCondition;
     };
     bool geometryHasValues = false;
diff --git a/src/gpu/VirtualFluids_GPU/CMakeLists.txt b/src/gpu/VirtualFluids_GPU/CMakeLists.txt
index 759528e5346ba8d9899cb90eb64503b20a44c4fc..fd84df4a667f6506d95a1afb3ff15cae8fdb0d15 100644
--- a/src/gpu/VirtualFluids_GPU/CMakeLists.txt
+++ b/src/gpu/VirtualFluids_GPU/CMakeLists.txt
@@ -24,5 +24,8 @@ if(BUILD_VF_UNIT_TESTS)
 	set_source_files_properties(DataStructureInitializer/GridReaderGenerator/IndexRearrangementForStreamsTest.cpp PROPERTIES LANGUAGE CUDA)
     set_source_files_properties(Communication/ExchangeData27Test.cpp PROPERTIES LANGUAGE CUDA)
     set_source_files_properties(BoundaryConditions/BoundaryConditionFactoryTest.cpp PROPERTIES LANGUAGE CUDA)
+    set_source_files_properties(KernelManager/BCKernelManagerTest.cpp PROPERTIES LANGUAGE CUDA)
+    set_source_files_properties(Parameter/ParameterTest.cpp PROPERTIES LANGUAGE CUDA)
+    set_source_files_properties(Parameter/EdgeNodeFinderTest.cpp PROPERTIES LANGUAGE CUDA)
     target_include_directories(VirtualFluids_GPUTests PRIVATE "${VF_THIRD_DIR}/cuda_samples/")
 endif()
diff --git a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp
index 296ab819c5538a6b6d6a6827b5c28cbc475af838..b7e8f595c678e571be8894a611d5f7386ca54bd0 100644
--- a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp
+++ b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp
@@ -28,7 +28,7 @@ void UpdateGrid27::updateGrid(int level, unsigned int t)
 
     //////////////////////////////////////////////////////////////////////////
 
-    postCollisionBC(level);
+    postCollisionBC(level, t);
 
     //////////////////////////////////////////////////////////////////////////
 
@@ -227,9 +227,10 @@ void UpdateGrid27::exchangeMultiGPUAfterFtoC(int level, int streamIndex)
     }
 }
 
-void UpdateGrid27::postCollisionBC(int level)
+void UpdateGrid27::postCollisionBC(int level, uint t)
 {
     //////////////////////////////////////////////////////////////////////////
+    // G E O M E T R Y
     // V E L O C I T Y (I N F L O W)
     this->bcKernelManager->runVelocityBCKernelPost(level);
 
@@ -257,6 +258,10 @@ void UpdateGrid27::postCollisionBC(int level)
     // P R E S S U R E
     this->bcKernelManager->runPressureBCKernelPost(level);
 
+    //////////////////////////////////////////////////////////////////////////
+    // P R E C U R S O R
+    this->bcKernelManager->runPrecursorBCKernelPost(level, t, cudaMemoryManager.get());
+
     //////////////////////////////////////////////////////////////////////////
     // A D V E C T I O N    D I F F U S I O N
     if (para->getDiffOn())
diff --git a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.h b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.h
index 8110923bf066412e2bb09ffa1f10efe3ddc983c7..bb26a6237aadb71514e5e100dc04318100be1f7b 100644
--- a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.h
+++ b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.h
@@ -34,7 +34,7 @@ private:
     void collisionUsingIndices(int level, unsigned int t, uint *fluidNodeIndices = nullptr, uint numberOfFluidNodes = 0, int stream = -1);
     void collisionAdvectionDiffusion(int level);
 
-    void postCollisionBC(int level);
+    void postCollisionBC(int level, unsigned int t);
     void preCollisionBC(int level, unsigned int t);
     void collisionPorousMedia(int level);
 
diff --git a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.cpp b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.cpp
index aecd306373eff65a80c708c9c8783c155e1ebe48..3edeb199ef02cd2b633d25c9c0b3a1b9f94d3187 100644
--- a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.cpp
+++ b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.cpp
@@ -10,6 +10,7 @@
 #include <algorithm>
 #include "utilities/math/Math.h"
 #include "Output/QDebugWriter.hpp"
+#include "GridGenerator/VelocitySetter/VelocitySetter.h"
 
 #include "utilities/communication.h"
 #include "Communication/Communicator.h"
@@ -123,6 +124,7 @@ void GridGenerator::allocArrays_BoundaryValues()
 
         ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
         para->getParH(level)->pressureBC.numberOfBCnodes = 0;
+        para->getParD(level)->outflowPressureCorrectionFactor = para->getOutflowPressureCorrectionFactor();
         if (numberOfPressureValues > 1)
         {
             blocks = (numberOfPressureValues / para->getParH(level)->numberofthreads) + 1;
@@ -236,6 +238,99 @@ void GridGenerator::allocArrays_BoundaryValues()
         para->getParD(level)->numberOfVeloBCnodesRead = para->getParH(level)->velocityBC.numberOfBCnodes * para->getD3Qxx();
     }
 
+    for (uint level = 0; level < builder->getNumberOfGridLevels(); level++) {
+        // Per level: size and allocate the precursor BC arrays, fetch their content from the grid builder and preload three precursor time steps (0, nextTime, 2*nextTime).
+        const auto numberOfPrecursorValues = int(builder->getPrecursorSize(level));
+        std::cout << "size precursor level " << level << " : " << numberOfPrecursorValues << std::endl;
+        ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+        int blocks = (numberOfPrecursorValues / para->getParH(level)->numberofthreads) + 1;
+        para->getParH(level)->precursorBC.sizeQ = blocks * para->getParH(level)->numberofthreads;
+        para->getParD(level)->precursorBC.sizeQ = para->getParH(level)->precursorBC.sizeQ;
+        ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+        para->getParH(level)->precursorBC.numberOfBCnodes = numberOfPrecursorValues;
+        para->getParD(level)->precursorBC.numberOfBCnodes = numberOfPrecursorValues;
+        para->getParH(level)->numberOfPrecursorBCnodesRead = numberOfPrecursorValues * para->getD3Qxx();
+        para->getParD(level)->numberOfPrecursorBCnodesRead = numberOfPrecursorValues * para->getD3Qxx();
+
+        if (numberOfPrecursorValues > 0)
+        {
+            // advection diffusion is not implemented for the precursor BC; only reject it when this level really has precursor nodes
+            if (para->getDiffOn()==true){
+                throw std::runtime_error(" Advection Diffusion not implemented for Precursor!");
+            }
+            ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+            cudaMemoryManager->cudaAllocPrecursorBC(level);
+            ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+            builder->getPrecursorValues(
+                    para->getParH(level)->precursorBC.planeNeighborNT, para->getParH(level)->precursorBC.planeNeighborNB, 
+                    para->getParH(level)->precursorBC.planeNeighborST, para->getParH(level)->precursorBC.planeNeighborSB, 
+                    para->getParH(level)->precursorBC.weightsNT, para->getParH(level)->precursorBC.weightsNB, 
+                    para->getParH(level)->precursorBC.weightsST, para->getParH(level)->precursorBC.weightsSB, 
+                    para->getParH(level)->precursorBC.k, para->getParH(level)->velocityReader, para->getParH(level)->precursorBC.numberOfPrecursorNodes, 
+                    para->getParH(level)->precursorBC.numberOfQuantities, para->getParH(level)->precursorBC.nTRead, 
+                    para->getParH(level)->precursorBC.velocityX, para->getParH(level)->precursorBC.velocityY, para->getParH(level)->precursorBC.velocityZ,
+                    level);
+            ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+            para->getParD(level)->precursorBC.numberOfPrecursorNodes = para->getParH(level)->precursorBC.numberOfPrecursorNodes;
+            para->getParD(level)->precursorBC.numberOfQuantities = para->getParH(level)->precursorBC.numberOfQuantities;
+            para->getParD(level)->precursorBC.nTRead = para->getParH(level)->precursorBC.nTRead;
+            para->getParD(level)->precursorBC.velocityX = para->getParH(level)->precursorBC.velocityX;
+            para->getParD(level)->precursorBC.velocityY = para->getParH(level)->precursorBC.velocityY;
+            para->getParD(level)->precursorBC.velocityZ = para->getParH(level)->precursorBC.velocityZ;
+
+            for(auto reader : para->getParH(level)->velocityReader)
+            {
+                if(reader->getNumberOfQuantities() != para->getParD(level)->precursorBC.numberOfQuantities) throw std::runtime_error("Number of quantities in reader and number of quantities needed for precursor don't match!");
+            }
+
+            cudaMemoryManager->cudaCopyPrecursorBC(level);
+            cudaMemoryManager->cudaAllocPrecursorData(level);
+
+            // read first timestep of precursor into next and copy to next on device
+            // NOTE(review): all three reads below refill the same host buffer precursorBC.next, which cudaCopyPrecursorData uploads with
+            // cudaMemcpyAsync; confirm that getNextData cannot overwrite the data of an upload that is still in flight.
+            for(auto reader : para->getParH(level)->velocityReader)
+            {   
+                reader->getNextData(para->getParH(level)->precursorBC.next, para->getParH(level)->precursorBC.numberOfPrecursorNodes, 0);
+            }
+
+            cudaMemoryManager->cudaCopyPrecursorData(level);
+
+            //switch next with last pointers
+            real* tmp = para->getParD(level)->precursorBC.last;
+            para->getParD(level)->precursorBC.last = para->getParD(level)->precursorBC.next;
+            para->getParD(level)->precursorBC.next = tmp;
+
+            //read second timestep of precursor into next and copy next to device
+            real nextTime = para->getParD(level)->precursorBC.nTRead*pow(2,-((real)level))*para->getTimeRatio();
+            for(auto reader : para->getParH(level)->velocityReader)
+            {   
+                reader->getNextData(para->getParH(level)->precursorBC.next, para->getParH(level)->precursorBC.numberOfPrecursorNodes, nextTime);
+            }
+
+            cudaMemoryManager->cudaCopyPrecursorData(level);
+
+            para->getParD(level)->precursorBC.nPrecursorReads = 1;
+
+
+            //switch next with current pointers
+            tmp = para->getParD(level)->precursorBC.current;
+            para->getParD(level)->precursorBC.current = para->getParD(level)->precursorBC.next;
+            para->getParD(level)->precursorBC.next = tmp;
+
+            //start usual cycle of loading, i.e. read velocities of timestep after current and copy asynchronously to device
+            for(auto reader : para->getParH(level)->velocityReader)
+            {   
+                reader->getNextData(para->getParH(level)->precursorBC.next, para->getParH(level)->precursorBC.numberOfPrecursorNodes, 2*nextTime);
+            }
+
+            cudaMemoryManager->cudaCopyPrecursorData(level);
+
+            para->getParD(level)->precursorBC.nPrecursorReads = 2;
+        }
+    }
+
 
 
     if (builder->hasGeometryValues()) {
@@ -868,6 +963,50 @@ void GridGenerator::allocArrays_BoundaryQs()
         }
     }
 
+    for (uint i = 0; i < builder->getNumberOfGridLevels(); i++) {
+        const auto numberOfPrecursorNodes = int(builder->getPrecursorSize(i));
+        if (numberOfPrecursorNodes > 0)
+        {
+            std::cout << "size precursor level " << i << " : " << numberOfPrecursorNodes << std::endl;
+            // getPointersToBoundaryConditions presumably maps Q.q27[...] onto the flat host array q27[0] - confirm; the builder then fills the q values
+            //preprocessing
+            real* QQ = para->getParH(i)->precursorBC.q27[0];
+            unsigned int sizeQ = para->getParH(i)->precursorBC.numberOfBCnodes;
+            QforBoundaryConditions Q;
+            getPointersToBoundaryConditions(Q, QQ, sizeQ);
+
+            builder->getPrecursorQs(Q.q27, i);
+
+            if (para->getDiffOn()) {
+                throw std::runtime_error("Advection diffusion not implemented for Precursor!");
+                //////////////////////////////////////////////////////////////////////////
+                // para->getParH(i)->TempVel.kTemp = numberOfVelocityNodes;
+                // para->getParD(i)->TempVel.kTemp = numberOfVelocityNodes;
+                // std::cout << "Groesse TempVel.kTemp = " << para->getParH(i)->TempPress.kTemp << std::endl;
+                // std::cout << "getTemperatureInit = " << para->getTemperatureInit() << std::endl;
+                // std::cout << "getTemperatureBC = " << para->getTemperatureBC() << std::endl;
+                // //////////////////////////////////////////////////////////////////////////
+                // cudaMemoryManager->cudaAllocTempVeloBC(i);
+                // //cout << "nach alloc " << std::endl;
+                // //////////////////////////////////////////////////////////////////////////
+                // for (int m = 0; m < numberOfVelocityNodes; m++)
+                // {
+                //     para->getParH(i)->TempVel.temp[m] = para->getTemperatureInit();
+                //     para->getParH(i)->TempVel.tempPulse[m] = para->getTemperatureBC();
+                //     para->getParH(i)->TempVel.velo[m] = para->getVelocity();
+                //     para->getParH(i)->TempVel.k[m] = para->getParH(i)->Qinflow.k[m];
+                // }
+                // //////////////////////////////////////////////////////////////////////////
+                // //cout << "vor copy " << std::endl;
+                // cudaMemoryManager->cudaCopyTempVeloBCHD(i);
+                // //cout << "nach copy " << std::endl;
+                //////////////////////////////////////////////////////////////////////////
+            }
+            cudaMemoryManager->cudaCopyPrecursorBC(i);
+        }
+    }
+
+
 
     for (uint i = 0; i < builder->getNumberOfGridLevels(); i++) {
         const int numberOfGeometryNodes = builder->getGeometrySize(i);
diff --git a/src/gpu/VirtualFluids_GPU/Factories/BoundaryConditionFactory.cpp b/src/gpu/VirtualFluids_GPU/Factories/BoundaryConditionFactory.cpp
index bff054eb174a0f5fa34119deedde6f1c9733d83c..b1c398638cff1ec1b6d52f59f8e773183e270331 100644
--- a/src/gpu/VirtualFluids_GPU/Factories/BoundaryConditionFactory.cpp
+++ b/src/gpu/VirtualFluids_GPU/Factories/BoundaryConditionFactory.cpp
@@ -35,6 +35,11 @@ void BoundaryConditionFactory::setStressBoundaryCondition(const StressBC boundar
     this->stressBoundaryCondition = boundaryConditionType;
 }
 
+void BoundaryConditionFactory::setPrecursorBoundaryCondition(const PrecursorBC boundaryConditionType)
+{
+    this->precursorBoundaryCondition = boundaryConditionType; // only stored here; getPrecursorBoundaryConditionPost() switches on it
+}
+
 boundaryCondition BoundaryConditionFactory::getVelocityBoundaryConditionPost(bool isGeometryBC) const
 {
     const VelocityBC &boundaryCondition =
@@ -132,6 +137,22 @@ boundaryCondition BoundaryConditionFactory::getPressureBoundaryConditionPre() co
         case PressureBC::OutflowNonReflective:
             return QPressNoRhoDev27;
             break;
+        case PressureBC::OutflowNonReflectivePressureCorrection:
+            return QPressZeroRhoOutflowDev27;
+        default:
+            return nullptr;
+    }
+}
+
+precursorBoundaryConditionFunc BoundaryConditionFactory::getPrecursorBoundaryConditionPost() const
+{ // returns the function for the stored precursor BC type; any value without its own case (e.g. NotSpecified) yields nullptr
+    switch (this->precursorBoundaryCondition) {
+        case PrecursorBC::VelocityPrecursor:
+            return QPrecursorDevCompZeroPress;
+            break; // not reached, the case returns above
+        case PrecursorBC::DistributionsPrecursor:
+            return PrecursorDevDistributions;
+            break; // not reached, the case returns above
         default:
             return nullptr;
     }
diff --git a/src/gpu/VirtualFluids_GPU/Factories/BoundaryConditionFactory.h b/src/gpu/VirtualFluids_GPU/Factories/BoundaryConditionFactory.h
index 9d6872c4847be72dff4be7137b774c8082e39e34..fcd309c1690d6d326ea6796a016514aba263527a 100644
--- a/src/gpu/VirtualFluids_GPU/Factories/BoundaryConditionFactory.h
+++ b/src/gpu/VirtualFluids_GPU/Factories/BoundaryConditionFactory.h
@@ -47,6 +47,7 @@ class Parameter;
 
 using boundaryCondition = std::function<void(LBMSimulationParameter *, QforBoundaryConditions *)>;
 using boundaryConditionWithParameter = std::function<void(Parameter *, QforBoundaryConditions *, const int level)>;
+using precursorBoundaryConditionFunc = std::function<void(LBMSimulationParameter *, QforPrecursorBoundaryConditions *, real tRatio, real velocityRatio)>;
 
 class BoundaryConditionFactory
 {
@@ -109,6 +110,8 @@ public:
         PressureNonEquilibriumCompressible,
         //! - OutflowNonReflective = outflow boundary condition, should be combined with VelocityAndPressureCompressible
         OutflowNonReflective,
+        //! - OutflowNonReflectivePressureCorrection = like OutflowNonReflective, but also reduces pressure overshoot
+        OutflowNonReflectivePressureCorrection,
         //! - NotSpecified =  the user did not set a boundary condition
         NotSpecified
     };
@@ -128,11 +131,21 @@ public:
     // enum class OutflowBoundaryCondition {};  // TODO:
     // https://git.rz.tu-bs.de/m.schoenherr/VirtualFluids_dev/-/issues/16
 
+    enum class PrecursorBC {
+        //! - VelocityPrecursor = getPrecursorBoundaryConditionPost() returns QPrecursorDevCompZeroPress
+        VelocityPrecursor,
+        //! - DistributionsPrecursor = getPrecursorBoundaryConditionPost() returns PrecursorDevDistributions
+        DistributionsPrecursor,
+        //! - NotSpecified =  the user did not set a boundary condition
+        NotSpecified
+    };
+
     void setVelocityBoundaryCondition(const BoundaryConditionFactory::VelocityBC boundaryConditionType);
     void setNoSlipBoundaryCondition(const BoundaryConditionFactory::NoSlipBC boundaryConditionType);
     void setSlipBoundaryCondition(const BoundaryConditionFactory::SlipBC boundaryConditionType);
     void setPressureBoundaryCondition(const BoundaryConditionFactory::PressureBC boundaryConditionType);
     void setStressBoundaryCondition(const BoundaryConditionFactory::StressBC boundaryConditionType);
+    void setPrecursorBoundaryCondition(const BoundaryConditionFactory::PrecursorBC boundaryConditionType);
     //! \brief set a boundary condition for the geometry
     //! param boundaryConditionType: a velocity, no-slip or slip boundary condition
     //! \details suggestions for boundaryConditionType:
@@ -152,6 +165,8 @@ public:
     [[nodiscard]] boundaryCondition getSlipBoundaryConditionPost(bool isGeometryBC = false) const;
     [[nodiscard]] boundaryCondition getPressureBoundaryConditionPre() const;
     [[nodiscard]] boundaryCondition getGeometryBoundaryConditionPost() const;
+    [[nodiscard]] precursorBoundaryConditionFunc getPrecursorBoundaryConditionPost() const;
+
 
     [[nodiscard]] boundaryConditionWithParameter getStressBoundaryConditionPost() const;
 
@@ -162,6 +177,7 @@ private:
     PressureBC pressureBoundaryCondition = PressureBC::NotSpecified;
     std::variant<VelocityBC, NoSlipBC, SlipBC> geometryBoundaryCondition = NoSlipBC::NoSlipImplicitBounceBack;
     StressBC stressBoundaryCondition = StressBC::NotSpecified;
+    PrecursorBC precursorBoundaryCondition = PrecursorBC::NotSpecified;
 
     // OutflowBoundaryConditon outflowBC // TODO: https://git.rz.tu-bs.de/m.schoenherr/VirtualFluids_dev/-/issues/16
 };
diff --git a/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.cpp b/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.cpp
index ea385fd7e39c2c2b2e9bddb462229e163b541797..b84f7c7403c462fb74d7d9eccf94cd82b29818fe 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.cpp
+++ b/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.cpp
@@ -5,15 +5,18 @@
 #include <math.h>
 
 #include <Parameter/Parameter.h>
+
 #include "Parameter/CudaStreamManager.h"
 #include "PreCollisionInteractor/ActuatorLine.h"
 #include "PreCollisionInteractor/ActuatorFarm.h"
 #include "PreCollisionInteractor/Probes/Probe.h"
+#include <PreCollisionInteractor/PrecursorWriter.h>
 
 #include "Calculation/PorousMedia.h"
 
 #include "lbm/constants/NumericConstants.h"
 
+
 void CudaMemoryManager::cudaAllocFull(int lev)
 {
     checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->geo      ), parameter->getParH(lev)->mem_size_int  ));
@@ -240,6 +243,7 @@ void CudaMemoryManager::cudaCopyVeloBC(int lev)
 	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->velocityBC.deltaVz, parameter->getParH(lev)->velocityBC.deltaVz,            mem_size_inflow_Q_q,  cudaMemcpyHostToDevice));
 
 }
+
 void CudaMemoryManager::cudaFreeVeloBC(int lev)
 {
 	checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->velocityBC.q27[0] ));
@@ -1656,6 +1660,133 @@ void CudaMemoryManager::cudaFreeWallModel(int lev, bool hasWallModelMonitor)
     }
 }
 
+
+//Precursor BC
+void CudaMemoryManager::cudaAllocPrecursorBC(int lev)
+{
+    // Allocates the per-node precursor BC arrays (k, q27, four neighbor and four weight arrays) on host (pinned) and device.
+    // Byte counts are kept in size_t (as in cudaAllocPrecursorData) so that they are not truncated to 32 bit.
+    const size_t memSizeQInt = parameter->getParH(lev)->precursorBC.numberOfBCnodes*sizeof(int);
+    const size_t memSizeQUint = parameter->getParH(lev)->precursorBC.numberOfBCnodes*sizeof(uint);
+    const size_t memSizeQReal = parameter->getParH(lev)->precursorBC.numberOfBCnodes*sizeof(real);
+
+    checkCudaErrors( cudaMallocHost((void**) &parameter->getParH(lev)->precursorBC.k, memSizeQInt));
+    checkCudaErrors( cudaMallocHost((void**) &parameter->getParH(lev)->precursorBC.q27[0], parameter->getD3Qxx()*memSizeQReal));
+
+    checkCudaErrors( cudaMallocHost((void**) &parameter->getParH(lev)->precursorBC.planeNeighborNT, memSizeQUint));
+    checkCudaErrors( cudaMallocHost((void**) &parameter->getParH(lev)->precursorBC.planeNeighborNB, memSizeQUint));
+    checkCudaErrors( cudaMallocHost((void**) &parameter->getParH(lev)->precursorBC.planeNeighborST, memSizeQUint));
+    checkCudaErrors( cudaMallocHost((void**) &parameter->getParH(lev)->precursorBC.planeNeighborSB, memSizeQUint));
+
+    checkCudaErrors( cudaMallocHost((void**) &parameter->getParH(lev)->precursorBC.weightsNT, memSizeQReal));
+    checkCudaErrors( cudaMallocHost((void**) &parameter->getParH(lev)->precursorBC.weightsNB, memSizeQReal));
+    checkCudaErrors( cudaMallocHost((void**) &parameter->getParH(lev)->precursorBC.weightsST, memSizeQReal));
+    checkCudaErrors( cudaMallocHost((void**) &parameter->getParH(lev)->precursorBC.weightsSB, memSizeQReal));
+
+    checkCudaErrors( cudaMalloc((void**) &parameter->getParD(lev)->precursorBC.k, memSizeQInt));
+    checkCudaErrors( cudaMalloc((void**) &parameter->getParD(lev)->precursorBC.q27[0], parameter->getD3Qxx()*memSizeQReal));
+
+    checkCudaErrors( cudaMalloc((void**) &parameter->getParD(lev)->precursorBC.planeNeighborNT, memSizeQUint));
+    checkCudaErrors( cudaMalloc((void**) &parameter->getParD(lev)->precursorBC.planeNeighborNB, memSizeQUint));
+    checkCudaErrors( cudaMalloc((void**) &parameter->getParD(lev)->precursorBC.planeNeighborST, memSizeQUint));
+    checkCudaErrors( cudaMalloc((void**) &parameter->getParD(lev)->precursorBC.planeNeighborSB, memSizeQUint));
+
+    checkCudaErrors( cudaMalloc((void**) &parameter->getParD(lev)->precursorBC.weightsNT, memSizeQReal));
+    checkCudaErrors( cudaMalloc((void**) &parameter->getParD(lev)->precursorBC.weightsNB, memSizeQReal));
+    checkCudaErrors( cudaMalloc((void**) &parameter->getParD(lev)->precursorBC.weightsST, memSizeQReal));
+    checkCudaErrors( cudaMalloc((void**) &parameter->getParD(lev)->precursorBC.weightsSB, memSizeQReal));
+
+    // bytes of one set of the arrays above (k, 4 neighbor arrays, 4 weight arrays, q27); double avoids the precision loss of a float 'real'
+    const double memSize = double(memSizeQInt+4*memSizeQUint+(4+parameter->getD3Qxx())*memSizeQReal);
+    setMemsizeGPU(memSize, false);
+}
+
+
+void CudaMemoryManager::cudaAllocPrecursorData(int lev)
+{
+    auto& hostBC   = parameter->getParH(lev)->precursorBC;
+    auto& deviceBC = parameter->getParD(lev)->precursorBC;
+    const size_t bufferBytes = hostBC.numberOfPrecursorNodes*sizeof(real)*hostBC.numberOfQuantities; // size of each of the three buffers
+    checkCudaErrors( cudaStreamCreate(&hostBC.stream) ); // the stream handle is kept in the host-side struct
+    checkCudaErrors( cudaMallocHost((void**) &hostBC.last, bufferBytes));
+    checkCudaErrors( cudaMallocHost((void**) &hostBC.current, bufferBytes));
+    checkCudaErrors( cudaMallocHost((void**) &hostBC.next, bufferBytes));
+    checkCudaErrors( cudaMalloc((void**) &deviceBC.last, bufferBytes));
+    checkCudaErrors( cudaMalloc((void**) &deviceBC.current, bufferBytes));
+    checkCudaErrors( cudaMalloc((void**) &deviceBC.next, bufferBytes));
+    setMemsizeGPU(3*bufferBytes, false);
+}
+
+
+void CudaMemoryManager::cudaCopyPrecursorBC(int lev)
+{
+    // Copies k, q27 and the neighbor/weight arrays from host to device; byte counts are size_t so they are not truncated to 32 bit.
+    const size_t memSizeQInt = parameter->getParH(lev)->precursorBC.numberOfBCnodes*sizeof(int);
+    const size_t memSizeQUint = parameter->getParH(lev)->precursorBC.numberOfBCnodes*sizeof(uint);
+    const size_t memSizeQReal = parameter->getParH(lev)->precursorBC.numberOfBCnodes*sizeof(real);
+
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->precursorBC.k, parameter->getParH(lev)->precursorBC.k, memSizeQInt, cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->precursorBC.q27[0], parameter->getParH(lev)->precursorBC.q27[0], memSizeQReal*parameter->getD3Qxx(), cudaMemcpyHostToDevice));
+
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->precursorBC.planeNeighborNT, parameter->getParH(lev)->precursorBC.planeNeighborNT, memSizeQUint, cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->precursorBC.planeNeighborNB, parameter->getParH(lev)->precursorBC.planeNeighborNB, memSizeQUint, cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->precursorBC.planeNeighborST, parameter->getParH(lev)->precursorBC.planeNeighborST, memSizeQUint, cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->precursorBC.planeNeighborSB, parameter->getParH(lev)->precursorBC.planeNeighborSB, memSizeQUint, cudaMemcpyHostToDevice));
+
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->precursorBC.weightsNT, parameter->getParH(lev)->precursorBC.weightsNT, memSizeQReal, cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->precursorBC.weightsNB, parameter->getParH(lev)->precursorBC.weightsNB, memSizeQReal, cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->precursorBC.weightsST, parameter->getParH(lev)->precursorBC.weightsST, memSizeQReal, cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->precursorBC.weightsSB, parameter->getParH(lev)->precursorBC.weightsSB, memSizeQReal, cudaMemcpyHostToDevice));
+}
+void CudaMemoryManager::cudaCopyPrecursorData(int lev)
+{ // Queues a host-to-device copy of the host `next` buffer into the device `next` buffer of level `lev` on the precursor stream.
+    auto prec = &parameter->getParH(lev)->precursorBC; // host-side precursor struct; it also holds the stream used below
+    size_t memSize = prec->numberOfPrecursorNodes*sizeof(real)*prec->numberOfQuantities; // same byte count as the buffers allocated in cudaAllocPrecursorData
+    checkCudaErrors( cudaStreamSynchronize(prec->stream) ); // wait for work previously queued on this stream before queuing the next copy
+    checkCudaErrors( cudaMemcpyAsync(parameter->getParD(lev)->precursorBC.next, prec->next, memSize, cudaMemcpyHostToDevice, prec->stream)) ; // no synchronization follows here; NOTE(review): consumers of the device `next` buffer presumably synchronize on prec->stream first - confirm
+}
+
+// Releases the precursor boundary-condition arrays of level `lev`: first the host copies (ParH, cudaFreeHost), then the device copies (ParD, cudaFree).
+void CudaMemoryManager::cudaFreePrecursorBC(int lev)
+{
+    checkCudaErrors( cudaFreeHost( parameter->getParH(lev)->precursorBC.k));
+
+    checkCudaErrors( cudaFreeHost( parameter->getParH(lev)->precursorBC.q27[0])); // only q27[0] is freed; presumably the other q27 entries point into the same allocation - TODO confirm
+
+    checkCudaErrors( cudaFreeHost( parameter->getParH(lev)->precursorBC.planeNeighborNT));
+    checkCudaErrors( cudaFreeHost( parameter->getParH(lev)->precursorBC.planeNeighborNB));
+    checkCudaErrors( cudaFreeHost( parameter->getParH(lev)->precursorBC.planeNeighborST));
+    checkCudaErrors( cudaFreeHost( parameter->getParH(lev)->precursorBC.planeNeighborSB));
+
+    checkCudaErrors( cudaFreeHost( parameter->getParH(lev)->precursorBC.weightsNT));
+    checkCudaErrors( cudaFreeHost( parameter->getParH(lev)->precursorBC.weightsNB));
+    checkCudaErrors( cudaFreeHost( parameter->getParH(lev)->precursorBC.weightsST));
+    checkCudaErrors( cudaFreeHost( parameter->getParH(lev)->precursorBC.weightsSB));
+    // device-side counterparts (ParD) of the arrays freed above, in the same order
+    checkCudaErrors( cudaFree( parameter->getParD(lev)->precursorBC.k));
+
+    checkCudaErrors( cudaFree( parameter->getParD(lev)->precursorBC.q27[0]));
+
+    checkCudaErrors( cudaFree( parameter->getParD(lev)->precursorBC.planeNeighborNT));
+    checkCudaErrors( cudaFree( parameter->getParD(lev)->precursorBC.planeNeighborNB));
+    checkCudaErrors( cudaFree( parameter->getParD(lev)->precursorBC.planeNeighborST));
+    checkCudaErrors( cudaFree( parameter->getParD(lev)->precursorBC.planeNeighborSB));
+
+    checkCudaErrors( cudaFree( parameter->getParD(lev)->precursorBC.weightsNT));
+    checkCudaErrors( cudaFree( parameter->getParD(lev)->precursorBC.weightsNB));
+    checkCudaErrors( cudaFree( parameter->getParD(lev)->precursorBC.weightsST));
+    checkCudaErrors( cudaFree( parameter->getParD(lev)->precursorBC.weightsSB));
+}
+// Releases the last / current / next precursor data buffers of level `lev` on host (ParH) and device (ParD).
+void CudaMemoryManager::cudaFreePrecursorData(int lev)
+{
+    checkCudaErrors( cudaFreeHost( parameter->getParH(lev)->precursorBC.last)); // host buffers allocated with cudaMallocHost in cudaAllocPrecursorData
+    checkCudaErrors( cudaFreeHost( parameter->getParH(lev)->precursorBC.current));
+    checkCudaErrors( cudaFreeHost( parameter->getParH(lev)->precursorBC.next));
+    // NOTE(review): precursorBC.stream, created in cudaAllocPrecursorData, is not destroyed in this function - confirm cudaStreamDestroy is called elsewhere, otherwise the stream leaks.
+    checkCudaErrors( cudaFree( parameter->getParD(lev)->precursorBC.last)); // device buffers allocated with cudaMalloc in cudaAllocPrecursorData
+    checkCudaErrors( cudaFree( parameter->getParD(lev)->precursorBC.current));
+    checkCudaErrors( cudaFree( parameter->getParD(lev)->precursorBC.next));
+}
 //Test roundoff error
 void CudaMemoryManager::cudaAllocTestRE(int lev, unsigned int size)
 {
@@ -3479,8 +3610,11 @@ void CudaMemoryManager::cudaCopyProbeQuantityArrayHtoD(Probe* probe, int level)
 }
 void CudaMemoryManager::cudaCopyProbeQuantityArrayDtoH(Probe* probe, int level)
 {
-    checkCudaErrors( cudaMemcpy(probe->getProbeStruct(level)->quantitiesArrayH, probe->getProbeStruct(level)->quantitiesArrayD, probe->getProbeStruct(level)->nArrays*sizeof(real)*probe->getProbeStruct(level)->nPoints, cudaMemcpyDeviceToHost) );
+    auto probeStruct = probe->getProbeStruct(level); // fetched once instead of calling the getter for every field
+    // copies nArrays * nPoints reals from quantitiesArrayD (device) to quantitiesArrayH (host)
+    checkCudaErrors( cudaMemcpy(probeStruct->quantitiesArrayH, probeStruct->quantitiesArrayD, probeStruct->nArrays*sizeof(real)*probeStruct->nPoints, cudaMemcpyDeviceToHost) );
 }
+
 void CudaMemoryManager::cudaFreeProbeQuantityArray(Probe* probe, int level)
 {
     checkCudaErrors( cudaFreeHost(probe->getProbeStruct(level)->quantitiesArrayH) );
@@ -3510,6 +3644,7 @@ void CudaMemoryManager::cudaCopyProbeQuantitiesAndOffsetsDtoH(Probe* probe, int
     checkCudaErrors( cudaMemcpy(probe->getProbeStruct(level)->quantitiesH, probe->getProbeStruct(level)->quantitiesD, int(Statistic::LAST)*sizeof(bool), cudaMemcpyDeviceToHost) );
     checkCudaErrors( cudaMemcpy(probe->getProbeStruct(level)->arrayOffsetsH, probe->getProbeStruct(level)->arrayOffsetsD, int(Statistic::LAST)*sizeof(int), cudaMemcpyDeviceToHost) );
 }
+
 void CudaMemoryManager::cudaFreeProbeQuantitiesAndOffsets(Probe* probe, int level)
 {
     checkCudaErrors( cudaFreeHost(probe->getProbeStruct(level)->quantitiesH) );
@@ -3518,23 +3653,51 @@ void CudaMemoryManager::cudaFreeProbeQuantitiesAndOffsets(Probe* probe, int leve
     checkCudaErrors( cudaFree    (probe->getProbeStruct(level)->arrayOffsetsD) );
 }
 
+void CudaMemoryManager::cudaAllocPrecursorWriter(PrecursorWriter* writer, int level)
+{ // Creates the writer's stream for `level` and allocates its index, data and buffer arrays on host and device.
+    auto prec =  writer->getPrecursorStruct(level); // per-level struct holding all pointers, sizes and the stream used below
+    size_t indSize = prec->nPoints*sizeof(uint); // bytes for one uint index per point
 
+    checkCudaErrors( cudaStreamCreate(&prec->stream) ); // the stream cudaCopyPrecursorWriterOutputVariablesDtoH() copies on
 
+    checkCudaErrors( cudaMallocHost((void**) &prec->indicesH, indSize));
+    checkCudaErrors( cudaMalloc((void**) &prec->indicesD, indSize));
 
+    size_t dataSize  = prec->nPoints*sizeof(real)*prec->nQuantities; // bytes of one timestep: points * quantities * sizeof(real)
+    size_t dataSizeH = dataSize * prec->timestepsPerFile; // host arrays hold timestepsPerFile timesteps
+    // host arrays use dataSizeH, device arrays use dataSize (a single timestep)
+    checkCudaErrors( cudaMallocHost((void**) &prec->dataH, dataSizeH));
+    checkCudaErrors( cudaMallocHost((void**) &prec->bufferH, dataSizeH));
+    checkCudaErrors( cudaMalloc((void**) &prec->dataD, dataSize));
+    checkCudaErrors( cudaMalloc((void**) &prec->bufferD, dataSize));
 
+    setMemsizeGPU(indSize+2*dataSize, false); // matches the device allocations above: indicesD + dataD + bufferD
+}
 
+void CudaMemoryManager::cudaCopyPrecursorWriterIndicesHtoD(PrecursorWriter* writer, int level)
+{ // Copies the nPoints uint indices of `level` from indicesH (host) to indicesD (device) with cudaMemcpy.
+    checkCudaErrors( cudaMemcpy(writer->getPrecursorStruct(level)->indicesD, writer->getPrecursorStruct(level)->indicesH, writer->getPrecursorStruct(level)->nPoints*sizeof(uint), cudaMemcpyHostToDevice) );
+}
 
+void CudaMemoryManager::cudaCopyPrecursorWriterOutputVariablesDtoH(PrecursorWriter* writer, int level)
+{ // Queues a device-to-host copy of one timestep from bufferD into the slot of bufferH selected by timestepsBuffered.
+    auto prec =  writer->getPrecursorStruct(level);
+    int sizeTimestep = prec->nPoints*prec->nQuantities; // reals per timestep (element count, not bytes)
 
+    checkCudaErrors( cudaStreamSynchronize(prec->stream) ); // wait for work previously queued on this stream before queuing the next copy
+    checkCudaErrors( cudaMemcpyAsync( &prec->bufferH[prec->timestepsBuffered*sizeTimestep], prec->bufferD, sizeof(real)*sizeTimestep, cudaMemcpyDeviceToHost, prec->stream)); // no synchronization follows here; NOTE(review): readers of bufferH presumably synchronize on prec->stream first - confirm
+}
 
+void CudaMemoryManager::cudaFreePrecursorWriter(PrecursorWriter* writer, int level)
+{ // Releases the host and device arrays allocated by cudaAllocPrecursorWriter for `level`.
+    checkCudaErrors( cudaFreeHost(writer->getPrecursorStruct(level)->indicesH));
+    checkCudaErrors( cudaFree(writer->getPrecursorStruct(level)->indicesD));
 
-
-
-
-
-
-
-
-
+    checkCudaErrors( cudaFreeHost(writer->getPrecursorStruct(level)->dataH));
+    checkCudaErrors( cudaFreeHost(writer->getPrecursorStruct(level)->bufferH));
+    checkCudaErrors( cudaFree(writer->getPrecursorStruct(level)->dataD));
+    checkCudaErrors( cudaFree(writer->getPrecursorStruct(level)->bufferD)); // NOTE(review): the stream created in cudaAllocPrecursorWriter is not destroyed in this function - confirm cudaStreamDestroy is called elsewhere
+}
 
 
 CudaMemoryManager::CudaMemoryManager(std::shared_ptr<Parameter> parameter) : parameter(parameter)
diff --git a/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h b/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h
index beb87ba639e160cc6be6e036f615dabe80d0b865..7d4b3af414f919c099f06d2387c896a1763f9231 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h
+++ b/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h
@@ -21,6 +21,8 @@ class PorousMedia;
 class ActuatorLine;
 class ActuatorFarm;
 class Probe;
+class VelocitySetter;
+class PrecursorWriter;
 
 class VIRTUALFLUIDS_GPU_EXPORT CudaMemoryManager
 {
@@ -183,6 +185,13 @@ public:
     void cudaCopyStressBC(int lev);
     void cudaFreeStressBC(int lev);
 
+    void cudaAllocPrecursorBC(int lev); // allocates the precursor BC arrays of level lev (presumably the ones copied by cudaCopyPrecursorBC - confirm in the .cpp)
+    void cudaAllocPrecursorData(int lev); // creates the precursor stream and allocates the last/current/next buffers on host and device
+    void cudaCopyPrecursorBC(int lev); // host-to-device copy of k, q27, plane neighbors and weights
+    void cudaCopyPrecursorData(int lev); // asynchronous host-to-device copy of the `next` buffer on the precursor stream
+    void cudaFreePrecursorBC(int lev); // frees the host and device precursor BC arrays
+    void cudaFreePrecursorData(int lev); // frees the host and device last/current/next buffers
+
     void cudaAllocWallModel(int lev, bool hasWallModelMonitor);
     void cudaCopyWallModel(int lev,  bool hasWallModelMonitor);
     void cudaFreeWallModel(int lev,  bool hasWallModelMonitor);
@@ -436,6 +445,12 @@ public:
     void cudaCopyProbeQuantitiesAndOffsetsDtoH(Probe* probe, int level);
     void cudaFreeProbeQuantitiesAndOffsets(Probe* probe, int level);
 
+    //Precursor Writer: memory management for the per-level struct returned by PrecursorWriter::getPrecursorStruct(level)
+    void cudaAllocPrecursorWriter(PrecursorWriter* writer, int level); // creates the stream and allocates indices, data and buffer arrays on host and device
+    void cudaCopyPrecursorWriterIndicesHtoD(PrecursorWriter* writer, int level); // host-to-device copy of the point indices
+    void cudaCopyPrecursorWriterOutputVariablesDtoH(PrecursorWriter* writer, int level); // asynchronous device-to-host copy of one timestep into bufferH
+    void cudaFreePrecursorWriter(PrecursorWriter* writer, int level); // frees the arrays allocated by cudaAllocPrecursorWriter
+
 private:
     std::shared_ptr<Parameter> parameter;
     double memsizeGPU = 0.0;
diff --git a/src/gpu/VirtualFluids_GPU/GPU/GPU_Interface.h b/src/gpu/VirtualFluids_GPU/GPU/GPU_Interface.h
index ee987ae23402ef304220349db77084cc341ccd5a..ceb70fb123c52c282200137a00522ff2b9905f86 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/GPU_Interface.h
+++ b/src/gpu/VirtualFluids_GPU/GPU/GPU_Interface.h
@@ -900,6 +900,8 @@ void QPressDevDirDepBot27(unsigned int numberOfThreads,
 
 void QPressNoRhoDev27(LBMSimulationParameter* parameterDevice, QforBoundaryConditions* boundaryCondition);
 
+void QPressZeroRhoOutflowDev27(LBMSimulationParameter* parameterDevice, QforBoundaryConditions* boundaryCondition);
+
 void QInflowScaleByPressDev27(LBMSimulationParameter* parameterDevice, QforBoundaryConditions* boundaryCondition);
 
 void QPressDevOld27(unsigned int numberOfThreads,
@@ -1007,6 +1009,14 @@ void VelSchlaffer27(  unsigned int numberOfThreads,
                                  unsigned int size_Mat, 
                                  bool isEvenTimestep);
 
+void QPrecursorDevCompZeroPress(LBMSimulationParameter* parameterDevice, QforPrecursorBoundaryConditions* boundaryCondition, real tRatio, real velocityRatio);
+
+void PrecursorDevEQ27(LBMSimulationParameter* parameterDevice, QforPrecursorBoundaryConditions* boundaryCondition, real tRatio, real velocityRatio);
+
+void PrecursorDevDistributions(LBMSimulationParameter* parameterDevice, QforPrecursorBoundaryConditions* boundaryCondition, real tRatio, real velocityRatio);
+
+void QPrecursorDevDistributions(LBMSimulationParameter* parameterDevice, QforPrecursorBoundaryConditions* boundaryCondition, real tRatio, real velocityRatio);
+
 void QADDev7(unsigned int numberOfThreads,
                         real* DD, 
                         real* DD7,
diff --git a/src/gpu/VirtualFluids_GPU/GPU/GPU_Kernels.cuh b/src/gpu/VirtualFluids_GPU/GPU/GPU_Kernels.cuh
index 94b9704b7ca57df4cd985f5aff9521b8a087b97f..b35e01eb997723eb12f5645857bc230536fe97fe 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/GPU_Kernels.cuh
+++ b/src/gpu/VirtualFluids_GPU/GPU/GPU_Kernels.cuh
@@ -1080,7 +1080,7 @@ __global__ void QPressDeviceDirDepBot27(  real* rhoBC,
                                                      bool isEvenTimestep);
 
 __global__ void QPressNoRhoDevice27(  real* rhoBC,
-												 real* DD,
+												 real* distributions,
 												 int* k_Q,
 												 int* k_N,
 												 int numberOfBCnodes,
@@ -1088,8 +1088,23 @@ __global__ void QPressNoRhoDevice27(  real* rhoBC,
 												 unsigned int* neighborX,
 												 unsigned int* neighborY,
 												 unsigned int* neighborZ,
-												 unsigned int size_Mat,
-												 bool isEvenTimestep);
+												 unsigned int numberOfLBnodes,
+												 bool isEvenTimestep,
+												 int direction);
+
+__global__ void QPressZeroRhoOutflowDevice27(  real* rhoBC,
+											real* distributions, 
+											int* k_Q, 
+											int* k_N, 
+											int numberOfBCnodes, 
+											real om1, 
+											unsigned int* neighborX,
+											unsigned int* neighborY,
+											unsigned int* neighborZ,
+											unsigned int numberOfLBnodes, 
+											bool isEvenTimestep,
+											int direction,
+											real densityCorrectionFactor);
 
 __global__ void QInflowScaleByPressDevice27(  real* rhoBC,
 														 real* DD,
@@ -1228,6 +1243,103 @@ __global__ void VelSchlaff27(  int t,
                                           unsigned int size_Mat,
                                           bool isEvenTimestep);
 
+__global__ void QPrecursorDeviceCompZeroPress( 	int* subgridDistanceIndices,
+                                                int numberOfBCnodes,
+                                                int numberOfPrecursorNodes,
+                                                int sizeQ,
+                                                real omega,
+                                                real* distributions,
+                                                real* subgridDistances,
+                                                uint* neighborX, 
+                                                uint* neighborY, 
+                                                uint* neighborZ,
+                                                uint* neighborsNT, 
+                                                uint* neighborsNB,
+                                                uint* neighborsST,
+                                                uint* neighborsSB,
+                                                real* weightsNT, 
+                                                real* weightsNB,
+                                                real* weightsST,
+                                                real* weightsSB,
+                                                real* vLast, 
+                                                real* vCurrent,
+                                                real velocityX,
+                                                real velocityY,
+                                                real velocityZ,
+                                                real tRatio,
+                                                real velocityRatio,
+                                                unsigned long long numberOfLBnodes,
+                                                bool isEvenTimestep);
+
+__global__ void PrecursorDeviceEQ27( 	int* subgridDistanceIndices,
+                                        int numberOfBCnodes,
+                                        int numberOfPrecursorNodes,
+                                        real omega,
+                                        real* distributions,
+                                        uint* neighborX, 
+                                        uint* neighborY, 
+                                        uint* neighborZ,
+                                        uint* neighborsNT, 
+                                        uint* neighborsNB,
+                                        uint* neighborsST,
+                                        uint* neighborsSB,
+                                        real* weightsNT, 
+                                        real* weightsNB,
+                                        real* weightsST,
+                                        real* weightsSB,
+                                        real* vLast, 
+                                        real* vCurrent,
+                                        real velocityX,
+                                        real velocityY,
+                                        real velocityZ,
+                                        real tRatio,
+                                        real velocityRatio,
+                                        unsigned long long numberOfLBnodes,
+                                        bool isEvenTimestep);
+
+__global__ void PrecursorDeviceDistributions( 	int* subgridDistanceIndices,
+												int numberOfBCNodes,
+												int numberOfPrecursorNodes,
+												real* distributions,
+												uint* neighborX, 
+												uint* neighborY, 
+												uint* neighborZ,
+												uint* neighborsNT, 
+												uint* neighborsNB,
+												uint* neighborsST,
+												uint* neighborsSB,
+												real* weightsNT, 
+												real* weightsNB,
+												real* weightsST,
+												real* weightsSB,
+												real* fsLast, 
+												real* fsNext,
+												real tRatio,
+												unsigned long long numberOfLBnodes,
+												bool isEvenTimestep);
+__global__ void QPrecursorDeviceDistributions( 	int* subgridDistanceIndices,
+												real* subgridDistances,
+												int sizeQ,
+												int numberOfBCNodes,
+												int numberOfPrecursorNodes,
+												real* distributions,
+												uint* neighborX, 
+												uint* neighborY, 
+												uint* neighborZ,
+												uint* neighborsNT, 
+												uint* neighborsNB,
+												uint* neighborsST,
+												uint* neighborsSB,
+												real* weightsNT, 
+												real* weightsNB,
+												real* weightsST,
+												real* weightsSB,
+												real* fsLast, 
+												real* fsNext,
+												real tRatio,
+												unsigned long long numberOfLBnodes,
+												bool isEvenTimestep);
+												
 //Advection / Diffusion BCs
 __global__ void QAD7( real* DD,
                                  real* DD7,
diff --git a/src/gpu/VirtualFluids_GPU/GPU/LBMKernel.cu b/src/gpu/VirtualFluids_GPU/GPU/LBMKernel.cu
index 63fc5be0ebe5d4a26d4662ee8c0dddbc3098247a..489eb0a60ddb8bf9e1605a68e6d0f62211e26575 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/LBMKernel.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/LBMKernel.cu
@@ -54,22 +54,9 @@ void KernelCasSP27( unsigned int numberOfThreads,
                                int size_Mat,
                                bool EvenOrOdd)
 {
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      LB_Kernel_Casc_SP_27<<< grid, threads >>>(s9,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Mat);
+
+      LB_Kernel_Casc_SP_27<<< grid.grid, grid.threads >>>(s9,
                                                 bcMatD,
                                                 neighborX,
                                                 neighborY,
@@ -90,22 +77,9 @@ void KernelCasSPMS27( unsigned int numberOfThreads,
                                  int size_Mat,
                                  bool EvenOrOdd)
 {
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      LB_Kernel_Casc_SP_MS_27<<< grid, threads >>>(s9,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Mat);
+
+      LB_Kernel_Casc_SP_MS_27<<< grid.grid, grid.threads >>>(s9,
                                                    bcMatD,
                                                    neighborX,
                                                    neighborY,
@@ -126,22 +100,9 @@ void KernelCasSPMSOHM27( unsigned int numberOfThreads,
                                     int size_Mat,
                                     bool EvenOrOdd)
 {
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      LB_Kernel_Casc_SP_MS_OHM_27<<< grid, threads >>>(  s9,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Mat);
+
+      LB_Kernel_Casc_SP_MS_OHM_27<<< grid.grid, grid.threads >>>(  s9,
                                                          bcMatD,
                                                          neighborX,
                                                          neighborY,
@@ -165,22 +126,9 @@ void KernelKumCompSRTSP27(
 	real* forces,
 	bool EvenOrOdd)
 {
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-   LB_Kernel_Kum_New_Comp_SRT_SP_27 <<< grid, threads >>>(
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Mat);
+
+   LB_Kernel_Kum_New_Comp_SRT_SP_27 <<< grid.grid, grid.threads >>>(
 	   omega,
 	   bcMatD,
 	   neighborX,
@@ -209,22 +157,9 @@ void KernelKum1hSP27(    unsigned int numberOfThreads,
 									int size_Mat,
 									bool EvenOrOdd)
 {
-	int Grid = (size_Mat / numberOfThreads)+1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid/Grid1)+1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1 );
-
-		LB_Kernel_Kum_1h_SP_27<<< grid, threads >>>(omega,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Mat);
+
+		LB_Kernel_Kum_1h_SP_27<<< grid.grid, grid.threads >>>(omega,
 													deltaPhi,
 													angularVelocity,
 													bcMatD,
@@ -250,22 +185,9 @@ void KernelCascadeSP27(  unsigned int numberOfThreads,
 									int size_Mat,
 									bool EvenOrOdd)
 {
-	int Grid = (size_Mat / numberOfThreads)+1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid/Grid1)+1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1 );
-
-		LB_Kernel_Cascade_SP_27<<< grid, threads >>>(s9,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Mat);
+
+		LB_Kernel_Cascade_SP_27<<< grid.grid, grid.threads >>>(s9,
 													bcMatD,
 													neighborX,
 													neighborY,
@@ -286,22 +208,10 @@ void KernelKumNewSP27(   unsigned int numberOfThreads,
 									int size_Mat,
 									bool EvenOrOdd)
 {
-	int Grid = (size_Mat / numberOfThreads)+1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid/Grid1)+1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1 );
-
-		LB_Kernel_Kum_New_SP_27<<< grid, threads >>>(s9,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Mat);
+
+
+		LB_Kernel_Kum_New_SP_27<<< grid.grid, grid.threads >>>(s9,
 													bcMatD,
 													neighborX,
 													neighborY,
@@ -329,22 +239,9 @@ void KernelKumNewCompSP27(unsigned int numberOfThreads,
 	//dim3 grid(Grid, 1, 1);
 	//dim3 threads(numberOfThreads, 1, 1 );
 
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid > 512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2, 1);
-	dim3 threads(numberOfThreads, 1, 1);
-
-		//LB_Kernel_Kum_New_Comp_SP_27<<< grid, threads >>>(	s9,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Mat);
+
+		//LB_Kernel_Kum_New_Comp_SP_27<<< grid.grid, grid.threads >>>(	s9,
 		//													bcMatD,
 		//													neighborX,
 		//													neighborY,
@@ -375,22 +272,10 @@ void CumulantOnePreconditionedErrorDiffusionChimCompSP27(unsigned int numberOfTh
 	//dim3 grid(Grid, 1, 1);
 	//dim3 threads(numberOfThreads, 1, 1 );
 
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid > 512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2, 1);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	Cumulant_One_preconditioned_errorDiffusion_chim_Comp_SP_27 <<< grid, threads >>>(	s9,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Mat);
+
+
+	Cumulant_One_preconditioned_errorDiffusion_chim_Comp_SP_27 <<< grid.grid, grid.threads >>>(	s9,
 																						bcMatD,
 																						neighborX,
 																						neighborY,
@@ -420,22 +305,10 @@ void CumulantOnePreconditionedChimCompSP27(  unsigned int numberOfThreads,
 	//dim3 grid(Grid, 1, 1);
 	//dim3 threads(numberOfThreads, 1, 1 );
 
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid > 512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2, 1);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	Cumulant_One_preconditioned_chim_Comp_SP_27 <<< grid, threads >>>(	s9,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Mat);
+
+
+	Cumulant_One_preconditioned_chim_Comp_SP_27 <<< grid.grid, grid.threads >>>(	s9,
 																		bcMatD,
 																		neighborX,
 																		neighborY,
@@ -465,22 +338,10 @@ void CumulantOneChimCompSP27(unsigned int numberOfThreads,
 	//dim3 grid(Grid, 1, 1);
 	//dim3 threads(numberOfThreads, 1, 1 );
 
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid > 512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2, 1);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	Cumulant_One_chim_Comp_SP_27 <<< grid, threads >>>(	s9,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Mat);
+
+
+	Cumulant_One_chim_Comp_SP_27 <<< grid.grid, grid.threads >>>(	s9,
 														bcMatD,
 														neighborX,
 														neighborY,
@@ -506,22 +367,10 @@ void KernelKumIsoTestSP27(unsigned int numberOfThreads,
 									 int size_Mat,
 									 bool EvenOrOdd)
 {
-	int Grid = (size_Mat / numberOfThreads)+1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid/Grid1)+1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1 );
-
-	LB_Kernel_Kum_IsoTest_SP_27<<< grid, threads >>>(s9,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Mat);
+
+
+	LB_Kernel_Kum_IsoTest_SP_27<<< grid.grid, grid.threads >>>(s9,
 													bcMatD,
 													neighborX,
 													neighborY,
@@ -545,22 +394,10 @@ void KernelKumCompSP27(  unsigned int numberOfThreads,
 									int size_Mat,
 									bool EvenOrOdd)
 {
-	int Grid = (size_Mat / numberOfThreads)+1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid/Grid1)+1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1 );
-
-		LB_Kernel_Kum_Comp_SP_27<<< grid, threads >>>(s9,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Mat);
+
+
+		LB_Kernel_Kum_Comp_SP_27<<< grid.grid, grid.threads >>>(s9,
 													bcMatD,
 													neighborX,
 													neighborY,
@@ -587,22 +424,10 @@ void KernelPMCumOneCompSP27(unsigned int numberOfThreads,
 									   unsigned int* nodeIdsPorousMedia,
 									   bool EvenOrOdd)
 {
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid > 512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2, 1);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	LB_Kernel_PM_Cum_One_Comp_SP_27 <<< grid, threads >>>(omega,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Mat);
+
+
+	LB_Kernel_PM_Cum_One_Comp_SP_27 <<< grid.grid, grid.threads >>>(omega,
 														  neighborX,
 														  neighborY,
 														  neighborZ,
@@ -642,22 +467,10 @@ void KernelWaleBySoniMalavCumAA2016CompSP27(
 	//dim3 grid(Grid, 1, 1);
 	//dim3 threads(numberOfThreads, 1, 1 );
 
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid > 512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2, 1);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	LB_Kernel_WaleBySoniMalav_Cum_AA2016_Comp_SP_27 << < grid, threads >> >(
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Mat);
+
+
+	LB_Kernel_WaleBySoniMalav_Cum_AA2016_Comp_SP_27 << < grid.grid, grid.threads >> >(
 		s9,
 		bcMatD,
 		neighborX,
@@ -687,22 +500,9 @@ void KernelADincomp7(   unsigned int numberOfThreads,
 								   int size_Mat,
 								   bool EvenOrOdd)
 {
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      LB_Kernel_AD_Incomp_7<<< grid, threads >>>( diffusivity,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Mat);
+
+      LB_Kernel_AD_Incomp_7<<< grid.grid, grid.threads >>>( diffusivity,
 												  bcMatD,
 												  neighborX,
 												  neighborY,
@@ -725,22 +525,9 @@ void KernelADincomp27( unsigned int numberOfThreads,
 								  int size_Mat,
 								  bool EvenOrOdd)
 {
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      LB_Kernel_AD_Incomp_27<<< grid, threads >>>( diffusivity,
+   	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Mat);
+
+	LB_Kernel_AD_Incomp_27<<< grid.grid, grid.threads >>>( diffusivity,
 													bcMatD,
 													neighborX,
 													neighborY,
@@ -749,7 +536,7 @@ void KernelADincomp27( unsigned int numberOfThreads,
 													DD27,
 													size_Mat,
 													EvenOrOdd);
-      getLastCudaError("LB_Kernel_AD_Incomp_27 execution failed");
+	getLastCudaError("LB_Kernel_AD_Incomp_27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void Init27( int myid,
@@ -771,7 +558,7 @@ void Init27( int myid,
    dim3 threads       ( grid_nx, 1, 1 );
    dim3 grid          ( grid_ny, grid_nz );   // Gitter fuer Kollision und Propagation
 
-      LBInit27<<< grid, threads >>> (  myid,
+	LBInit27<<< grid, threads >>> (  myid,
                                        numprocs,
                                        u0,
                                        geoD,
@@ -786,7 +573,7 @@ void Init27( int myid,
                                        DD,
                                        level,
                                        maxlevel);
-      getLastCudaError("LBInit27 execution failed");
+	getLastCudaError("LBInit27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void InitNonEqPartSP27( unsigned int numberOfThreads,
@@ -804,22 +591,9 @@ void InitNonEqPartSP27( unsigned int numberOfThreads,
                                    real omega,
                                    bool EvenOrOdd)
 {
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      LBInitNonEqPartSP27<<< grid, threads >>>( neighborX,
+   	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Mat);
+
+	LBInitNonEqPartSP27<<< grid.grid, grid.threads >>>( neighborX,
                                                 neighborY,
                                                 neighborZ,
                                                 neighborWSB,
@@ -832,7 +606,7 @@ void InitNonEqPartSP27( unsigned int numberOfThreads,
                                                 DD,
                                                 omega,
                                                 EvenOrOdd);
-      getLastCudaError("LBInitNonEqPartSP27 execution failed");
+	getLastCudaError("LBInitNonEqPartSP27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void InitThS7(     unsigned int numberOfThreads,
@@ -848,22 +622,9 @@ void InitThS7(     unsigned int numberOfThreads,
                               real* DD7,
                               bool EvenOrOdd)
 {
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      InitAD7<<< grid, threads >>>( neighborX,
+   	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Mat);
+
+	InitAD7<<< grid.grid, grid.threads >>>( neighborX,
                                        neighborY,
                                        neighborZ,
                                        geoD,
@@ -874,7 +635,7 @@ void InitThS7(     unsigned int numberOfThreads,
                                        size_Mat,
                                        DD7,
                                        EvenOrOdd);
-      getLastCudaError("InitAD7 execution failed");
+	getLastCudaError("InitAD7 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void InitADDev27( unsigned int numberOfThreads,
@@ -890,22 +651,9 @@ void InitADDev27( unsigned int numberOfThreads,
                            real* DD27,
                            bool EvenOrOdd)
 {
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      InitAD27<<< grid, threads >>>(neighborX,
+   	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Mat);
+
+	InitAD27<<< grid.grid, grid.threads >>>(neighborX,
                                        neighborY,
                                        neighborZ,
                                        geoD,
@@ -916,7 +664,7 @@ void InitADDev27( unsigned int numberOfThreads,
                                        size_Mat,
                                        DD27,
                                        EvenOrOdd);
-      getLastCudaError("InitAD27 execution failed");
+	getLastCudaError("InitAD27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void PostProcessorF3_2018Fehlberg(
@@ -937,22 +685,9 @@ void PostProcessorF3_2018Fehlberg(
 	real* forces,
 	bool EvenOrOdd)
 {
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	  LB_PostProcessor_F3_2018_Fehlberg <<< grid, threads >>> (   omega,
+   	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Mat);
+
+	LB_PostProcessor_F3_2018_Fehlberg <<< grid.grid, grid.threads >>> (   omega,
 																  bcMatD,
 																  neighborX,
 																  neighborY,
@@ -967,7 +702,7 @@ void PostProcessorF3_2018Fehlberg(
 																  level,
 																  forces,
 																  EvenOrOdd);
-      getLastCudaError("LB_PostProcessor_F3_2018_Fehlberg execution failed");
+	getLastCudaError("LB_PostProcessor_F3_2018_Fehlberg execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void CalcMac27( real* vxD,
@@ -988,7 +723,7 @@ void CalcMac27( real* vxD,
    dim3 threads       ( grid_nx, 1, 1 );
    dim3 grid          ( grid_ny, grid_nz );
 
-      LBCalcMac27<<< grid, threads >>> (  vxD,
+	LBCalcMac27<<< grid, threads >>> (  vxD,
                                           vyD,
                                           vzD,
                                           rhoD,
@@ -999,7 +734,7 @@ void CalcMac27( real* vxD,
                                           size_Mat,
                                           DD,
                                           isEvenTimestep);
-      getLastCudaError("LBCalcMac27 execution failed");
+	getLastCudaError("LBCalcMac27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void CalcMacSP27( real* vxD,
@@ -1016,22 +751,9 @@ void CalcMacSP27( real* vxD,
                              real* DD,
                              bool isEvenTimestep)
 {
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      LBCalcMacSP27<<< grid, threads >>> (   vxD,
+   	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Mat);
+
+	LBCalcMacSP27<<< grid.grid, grid.threads >>> (   vxD,
                                              vyD,
                                              vzD,
                                              rhoD,
@@ -1043,7 +765,7 @@ void CalcMacSP27( real* vxD,
                                              size_Mat,
                                              DD,
                                              isEvenTimestep);
-      getLastCudaError("LBCalcMacSP27 execution failed");
+	getLastCudaError("LBCalcMacSP27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void CalcMacCompSP27( real* vxD,
@@ -1060,22 +782,9 @@ void CalcMacCompSP27( real* vxD,
 								 real* DD,
 								 bool isEvenTimestep)
 {
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      LBCalcMacCompSP27<<< grid, threads >>> (   vxD,
+   	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Mat);
+
+	LBCalcMacCompSP27<<< grid.grid, grid.threads >>> (   vxD,
 												 vyD,
 												 vzD,
 												 rhoD,
@@ -1087,7 +796,7 @@ void CalcMacCompSP27( real* vxD,
 												 size_Mat,
 												 DD,
 												 isEvenTimestep);
-      getLastCudaError("LBCalcMacSP27 execution failed");
+	getLastCudaError("LBCalcMacCompSP27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void CalcMacThS7(  real* Conc,
@@ -1100,22 +809,9 @@ void CalcMacThS7(  real* Conc,
                               real* DD7,
                               bool isEvenTimestep)
 {
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      CalcConc7<<< grid, threads >>> (Conc,
+   	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Mat);
+
+	CalcConc7<<< grid.grid, grid.threads >>> (Conc,
                                           geoD,
                                           neighborX,
                                           neighborY,
@@ -1123,7 +819,7 @@ void CalcMacThS7(  real* Conc,
                                           size_Mat,
                                           DD7,
                                           isEvenTimestep);
-      getLastCudaError("CalcConc7 execution failed");
+	getLastCudaError("CalcConc7 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void PlaneConcThS7(real* Conc,
@@ -1138,22 +834,9 @@ void PlaneConcThS7(real* Conc,
 							  real* DD7,
 							  bool isEvenTimestep)
 {
-   int Grid = (numberOfPointskPC / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      GetPlaneConc7<<< grid, threads >>> (	Conc,
+   	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfPointskPC);
+
+	GetPlaneConc7<<< grid.grid, grid.threads >>> (	Conc,
 												kPC,
 												numberOfPointskPC,
 												geoD,
@@ -1163,7 +846,7 @@ void PlaneConcThS7(real* Conc,
 												size_Mat,
 												DD7,
 												isEvenTimestep);
-      getLastCudaError("GetPlaneConc7 execution failed");
+	getLastCudaError("GetPlaneConc7 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void PlaneConcThS27(real* Conc,
@@ -1178,22 +861,9 @@ void PlaneConcThS27(real* Conc,
 							   real* DD27,
 							   bool isEvenTimestep)
 {
-   int Grid = (numberOfPointskPC / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      GetPlaneConc27<<< grid, threads >>> (	Conc,
+   	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfPointskPC);
+
+	GetPlaneConc27<<< grid.grid, grid.threads >>> (	Conc,
 												kPC,
 												numberOfPointskPC,
 												geoD,
@@ -1203,7 +873,7 @@ void PlaneConcThS27(real* Conc,
 												size_Mat,
 												DD27,
 												isEvenTimestep);
-      getLastCudaError("GetPlaneConc27 execution failed");
+	getLastCudaError("GetPlaneConc27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void CalcConcentration27( unsigned int numberOfThreads,
@@ -1216,22 +886,9 @@ void CalcConcentration27( unsigned int numberOfThreads,
                                      real* DD27,
                                      bool isEvenTimestep)
 {
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      CalcConc27<<< grid, threads >>> (  Conc,
+   	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Mat);
+
+	CalcConc27<<< grid.grid, grid.threads >>> (  Conc,
                                              geoD,
                                              neighborX,
                                              neighborY,
@@ -1239,7 +896,7 @@ void CalcConcentration27( unsigned int numberOfThreads,
                                              size_Mat,
                                              DD27,
                                              isEvenTimestep);
-      getLastCudaError("CalcConc27 execution failed");
+	getLastCudaError("CalcConc27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void CalcMedSP27(  real* vxD,
@@ -1256,22 +913,9 @@ void CalcMedSP27(  real* vxD,
                               real* DD,
                               bool isEvenTimestep)
 {
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      LBCalcMedSP27<<< grid, threads >>> (   vxD,
+   	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Mat);
+
+	LBCalcMedSP27<<< grid.grid, grid.threads >>> (   vxD,
                                              vyD,
                                              vzD,
                                              rhoD,
@@ -1283,7 +927,7 @@ void CalcMedSP27(  real* vxD,
                                              size_Mat,
                                              DD,
                                              isEvenTimestep);
-      getLastCudaError("LBCalcMedSP27 execution failed");
+	getLastCudaError("LBCalcMedSP27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void CalcMedCompSP27(  real* vxD,
@@ -1300,22 +944,9 @@ void CalcMedCompSP27(  real* vxD,
 								  real* DD,
 								  bool isEvenTimestep)
 {
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      LBCalcMedCompSP27<<< grid, threads >>> (   vxD,
+   	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Mat);
+
+	LBCalcMedCompSP27<<< grid.grid, grid.threads >>> (   vxD,
 												 vyD,
 												 vzD,
 												 rhoD,
@@ -1327,7 +958,7 @@ void CalcMedCompSP27(  real* vxD,
 												 size_Mat,
 												 DD,
 												 isEvenTimestep);
-      getLastCudaError("LBCalcMedSP27 execution failed");
+	getLastCudaError("LBCalcMedCompSP27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void CalcMedCompAD27(
@@ -1347,22 +978,9 @@ void CalcMedCompAD27(
 	real* DD_AD,
 	bool isEvenTimestep)
 {
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid > 512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	LBCalcMedCompAD27 <<< grid, threads >>> (
+   	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Mat);
+
+	LBCalcMedCompAD27 <<< grid.grid, grid.threads >>> (
 		vxD,
 		vyD,
 		vzD,
@@ -1394,22 +1012,9 @@ void CalcMacMedSP27(  real* vxD,
                                  unsigned int numberOfThreads,
                                  bool isEvenTimestep)
 {
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      LBCalcMacMedSP27<<< grid, threads >>> (   vxD,
+   	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Mat);
+
+	LBCalcMacMedSP27<<< grid.grid, grid.threads >>> (   vxD,
                                                 vyD,
                                                 vzD,
                                                 rhoD,
@@ -1421,7 +1026,7 @@ void CalcMacMedSP27(  real* vxD,
                                                 tdiff,
                                                 size_Mat,
                                                 isEvenTimestep);
-      getLastCudaError("LBCalcMacMedSP27 execution failed");
+	getLastCudaError("LBCalcMacMedSP27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void ResetMedianValuesSP27(
@@ -1434,22 +1039,10 @@ void ResetMedianValuesSP27(
 	unsigned int numberOfThreads,
 	bool isEvenTimestep)
 {
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	LBResetMedianValuesSP27 << < grid, threads >> > (
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Mat);
+
+
+	LBResetMedianValuesSP27 << < grid.grid, grid.threads >> > (
 		vxD,
 		vyD,
 		vzD,
@@ -1471,22 +1064,9 @@ void ResetMedianValuesAD27(
 	unsigned int numberOfThreads,
 	bool isEvenTimestep)
 {
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid > 512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	LBResetMedianValuesAD27 << < grid, threads >> > (
+	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Mat);
+
+	LBResetMedianValuesAD27 << < grid.grid, grid.threads >> > (
 		vxD,
 		vyD,
 		vzD,
@@ -1512,22 +1092,9 @@ void Calc2ndMomentsIncompSP27(real* kxyFromfcNEQ,
 										 real* DD,
 										 bool isEvenTimestep)
 {
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      LBCalc2ndMomentsIncompSP27<<< grid, threads >>> (  kxyFromfcNEQ,
+   	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Mat);
+
+	LBCalc2ndMomentsIncompSP27<<< grid.grid, grid.threads >>> (  kxyFromfcNEQ,
 														 kyzFromfcNEQ,
 														 kxzFromfcNEQ,
 														 kxxMyyFromfcNEQ,
@@ -1539,7 +1106,7 @@ void Calc2ndMomentsIncompSP27(real* kxyFromfcNEQ,
 														 size_Mat,
 														 DD,
 														 isEvenTimestep);
-      getLastCudaError("LBCalc2ndMomentsIncompSP27 execution failed");
+	getLastCudaError("LBCalc2ndMomentsIncompSP27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void Calc2ndMomentsCompSP27( real* kxyFromfcNEQ,
@@ -1556,22 +1123,9 @@ void Calc2ndMomentsCompSP27( real* kxyFromfcNEQ,
 										real* DD,
 										bool isEvenTimestep)
 {
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      LBCalc2ndMomentsCompSP27<<< grid, threads >>> (kxyFromfcNEQ,
+   	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Mat);
+
+	LBCalc2ndMomentsCompSP27<<< grid.grid, grid.threads >>> (kxyFromfcNEQ,
 													 kyzFromfcNEQ,
 													 kxzFromfcNEQ,
 													 kxxMyyFromfcNEQ,
@@ -1583,7 +1137,7 @@ void Calc2ndMomentsCompSP27( real* kxyFromfcNEQ,
 													 size_Mat,
 													 DD,
 													 isEvenTimestep);
-      getLastCudaError("LBCalc2ndMomentsCompSP27 execution failed");
+	getLastCudaError("LBCalc2ndMomentsCompSP27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void Calc3rdMomentsIncompSP27(real* CUMbbb,
@@ -1602,22 +1156,9 @@ void Calc3rdMomentsIncompSP27(real* CUMbbb,
 										 real* DD,
 										 bool isEvenTimestep)
 {
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      LBCalc3rdMomentsIncompSP27<<< grid, threads >>> (  CUMbbb,
+   	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Mat);
+
+	LBCalc3rdMomentsIncompSP27<<< grid.grid, grid.threads >>> (  CUMbbb,
 														 CUMabc,
 														 CUMbac,
 														 CUMbca,
@@ -1631,7 +1172,7 @@ void Calc3rdMomentsIncompSP27(real* CUMbbb,
 														 DD,
 														 size_Mat,
 														 isEvenTimestep);
-      getLastCudaError("LBCalc3rdMomentsIncompSP27 execution failed");
+	getLastCudaError("LBCalc3rdMomentsIncompSP27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void Calc3rdMomentsCompSP27( real* CUMbbb,
@@ -1650,22 +1191,9 @@ void Calc3rdMomentsCompSP27( real* CUMbbb,
 										real* DD,
 										bool isEvenTimestep)
 {
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      LBCalc3rdMomentsCompSP27<<< grid, threads >>> (CUMbbb,
+	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Mat);
+
+	LBCalc3rdMomentsCompSP27<<< grid.grid, grid.threads >>> (CUMbbb,
 													 CUMabc,
 													 CUMbac,
 													 CUMbca,
@@ -1679,7 +1207,7 @@ void Calc3rdMomentsCompSP27( real* CUMbbb,
 													 DD,
 													 size_Mat,
 													 isEvenTimestep);
-      getLastCudaError("LBCalc3rdMomentsCompSP27 execution failed");
+	getLastCudaError("LBCalc3rdMomentsCompSP27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void CalcHigherMomentsIncompSP27(real* CUMcbb,
@@ -1701,22 +1229,9 @@ void CalcHigherMomentsIncompSP27(real* CUMcbb,
 											real* DD,
 											bool isEvenTimestep)
 {
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      LBCalcHigherMomentsIncompSP27<<< grid, threads >>> (CUMcbb,
+   	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Mat);
+
+	LBCalcHigherMomentsIncompSP27<<< grid.grid, grid.threads >>> (CUMcbb,
 														  CUMbcb,
 														  CUMbbc,
 														  CUMcca,
@@ -1733,7 +1248,7 @@ void CalcHigherMomentsIncompSP27(real* CUMcbb,
 														  DD,
 														  size_Mat,
 														  isEvenTimestep);
-      getLastCudaError("LBCalcHigherMomentsIncompSP27 execution failed");
+	getLastCudaError("LBCalcHigherMomentsIncompSP27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void CalcHigherMomentsCompSP27(  real* CUMcbb,
@@ -1755,22 +1270,9 @@ void CalcHigherMomentsCompSP27(  real* CUMcbb,
 											real* DD,
 											bool isEvenTimestep)
 {
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      LBCalcHigherMomentsCompSP27<<< grid, threads >>> (  CUMcbb,
+   	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Mat);
+
+	LBCalcHigherMomentsCompSP27<<< grid.grid, grid.threads >>> (  CUMcbb,
 														  CUMbcb,
 														  CUMbbc,
 														  CUMcca,
@@ -1787,7 +1289,7 @@ void CalcHigherMomentsCompSP27(  real* CUMcbb,
 														  DD,
 														  size_Mat,
 														  isEvenTimestep);
-      getLastCudaError("LBCalcHigherMomentsCompSP27 execution failed");
+	getLastCudaError("LBCalcHigherMomentsCompSP27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void LBCalcMeasurePoints27(real* vxMP,
@@ -1807,22 +1309,9 @@ void LBCalcMeasurePoints27(real* vxMP,
                                       unsigned int numberOfThreads,
                                       bool isEvenTimestep)
 {
-   int Grid = (numberOfPointskMP / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      LBCalcMeasurePoints<<< grid, threads >>> (vxMP,
+   	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfPointskMP);
+
+	LBCalcMeasurePoints<<< grid.grid, grid.threads >>> (vxMP,
                                                 vyMP,
                                                 vzMP,
                                                 rhoMP,
@@ -1837,7 +1326,7 @@ void LBCalcMeasurePoints27(real* vxMP,
                                                 size_Mat,
                                                 DD,
                                                 isEvenTimestep);
-      getLastCudaError("LBCalcMeasurePoints execution failed");
+	getLastCudaError("LBCalcMeasurePoints execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void BcPress27( int nx,
@@ -1853,10 +1342,10 @@ void BcPress27( int nx,
                            unsigned int size_Mat,
                            bool isEvenTimestep)
 {
-   dim3 threads       ( grid_nx, 1, 1 );
-   dim3 grid          ( grid_ny, 1 );
+	dim3 threads       ( grid_nx, 1, 1 );
+	dim3 grid          ( grid_ny, 1 );
 
-      LB_BC_Press_East27<<< grid, threads >>> ( nx,
+	LB_BC_Press_East27<<< grid, threads >>> ( nx,
                                                 ny,
                                                 tz,
                                                 bcMatD,
@@ -1866,7 +1355,7 @@ void BcPress27( int nx,
                                                 DD,
                                                 size_Mat,
                                                 isEvenTimestep);
-      getLastCudaError("LB_BC_Press_East27 execution failed");
+	getLastCudaError("LB_BC_Press_East27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void BcVel27(int nx,
@@ -1885,10 +1374,10 @@ void BcVel27(int nx,
                         real u0x,
                         real om)
 {
-   dim3 threads       ( grid_nx, 1, 1 );
-   dim3 grid          ( grid_ny, 1 );
+	dim3 threads       ( grid_nx, 1, 1 );
+	dim3 grid          ( grid_ny, 1 );
 
-      LB_BC_Vel_West_27<<< grid, threads >>> (  nx,
+	LB_BC_Vel_West_27<<< grid, threads >>> (  nx,
                                                 ny,
                                                 nz,
                                                 itz,
@@ -1903,7 +1392,7 @@ void BcVel27(int nx,
                                                 grid_nx,
                                                 grid_ny,
                                                 om);
-      getLastCudaError("LB_BC_Vel_West_27 execution failed");
+	getLastCudaError("LB_BC_Vel_West_27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void QADPressDev7( unsigned int numberOfThreads,
@@ -1922,22 +1411,9 @@ void QADPressDev7( unsigned int numberOfThreads,
                               unsigned int size_Mat,
                               bool isEvenTimestep)
 {
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QADPress7<<< gridQ, threads >>>( DD,
+   	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes);
+
+      QADPress7<<< grid.grid, grid.threads >>>( DD,
                                        DD7,
                                        temp,
                                        velo,
@@ -1951,7 +1427,7 @@ void QADPressDev7( unsigned int numberOfThreads,
                                        neighborZ,
                                        size_Mat,
                                        isEvenTimestep);
-      getLastCudaError("QADPress7 execution failed");
+	getLastCudaError("QADPress7 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void QADPressDev27(unsigned int numberOfThreads,
@@ -1970,22 +1446,9 @@ void QADPressDev27(unsigned int numberOfThreads,
                               unsigned int size_Mat,
                               bool isEvenTimestep)
 {
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QADPress27<<< gridQ, threads >>>(   DD,
+   	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes);
+
+      QADPress27<<< grid.grid, grid.threads >>>(   DD,
                                           DD27,
                                           temp,
                                           velo,
@@ -1999,7 +1462,7 @@ void QADPressDev27(unsigned int numberOfThreads,
                                           neighborZ,
                                           size_Mat,
                                           isEvenTimestep);
-      getLastCudaError("QADPress27 execution failed");
+	getLastCudaError("QADPress27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void QADPressNEQNeighborDev27(
@@ -2017,22 +1480,9 @@ void QADPressNEQNeighborDev27(
 										)
 {
 
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-   QADPressNEQNeighbor27<<< gridQ, threads >>>(
+	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes);
+
+	QADPressNEQNeighbor27<<< grid.grid, grid.threads >>>(
 												DD,
 												DD27,
 												k_Q,
@@ -2044,7 +1494,7 @@ void QADPressNEQNeighborDev27(
 												size_Mat,
 												isEvenTimestep
 											  );
-   getLastCudaError("QADPressNEQNeighbor27 execution failed");
+   	getLastCudaError("QADPressNEQNeighbor27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void QADVelDev7(unsigned int numberOfThreads,
@@ -2063,22 +1513,9 @@ void QADVelDev7(unsigned int numberOfThreads,
                            unsigned int size_Mat,
                            bool isEvenTimestep)
 {
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QADVel7<<< gridQ, threads >>> (  
+	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes);
+
+      QADVel7<<< grid.grid, grid.threads >>> (  
                                        DD,
                                        DD7,
                                        temp,
@@ -2093,7 +1530,7 @@ void QADVelDev7(unsigned int numberOfThreads,
                                        neighborZ,
                                        size_Mat,
                                        isEvenTimestep);
-      getLastCudaError("QADVel7 execution failed");
+	getLastCudaError("QADVel7 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void QADVelDev27(  unsigned int numberOfThreads,
@@ -2112,22 +1549,9 @@ void QADVelDev27(  unsigned int numberOfThreads,
                               unsigned int size_Mat,
                               bool isEvenTimestep)
 {
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QADVel27<<< gridQ, threads >>> ( DD,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes);
+
+      QADVel27<<< grid.grid, grid.threads >>> ( DD,
                                       DD27,
                                       temp,
                                       velo,
@@ -2159,22 +1583,9 @@ void QADDev7(unsigned int numberOfThreads,
                         unsigned int size_Mat,
                         bool isEvenTimestep)
 {
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QAD7<<< gridQ, threads >>> (     DD,
+	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes);
+
+      QAD7<<< grid.grid, grid.threads >>> (     DD,
                                        DD7,
                                        temp,
                                        diffusivity,
@@ -2242,11 +1653,9 @@ void ADSlipVelDevComp(
 	uint size_Mat,
 	bool isEvenTimestep)
 {
-	int Grid = (numberOfBCnodes / numberOfThreads) + 1;
-	dim3 gridQ(Grid, 1, 1);
-	dim3 threads(numberOfThreads, 1, 1);
+   	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes);
 
-	AD_SlipVelDeviceComp << < gridQ, threads >> > (
+	AD_SlipVelDeviceComp << < grid.grid, grid.threads >> > (
 		normalX,
 		normalY,
 		normalZ,
@@ -2280,22 +1689,9 @@ void QADDirichletDev27( unsigned int numberOfThreads,
 								   unsigned int size_Mat,
 								   bool isEvenTimestep)
 {
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QADDirichlet27<<< gridQ, threads >>> (
+   	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes);
+
+      QADDirichlet27<<< grid.grid, grid.threads >>> (
 											   DD,
 											   DD27,
 											   temp,
@@ -2327,22 +1723,9 @@ void QADBBDev27(unsigned int numberOfThreads,
                            unsigned int size_Mat,
                            bool isEvenTimestep)
 {
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QADBB27<<< gridQ, threads >>> (  DD,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes);
+
+      QADBB27<<< grid.grid, grid.threads >>> (  DD,
                                        DD27,
                                        temp,
                                        diffusivity,
@@ -2373,22 +1756,9 @@ void QNoSlipADincompDev7(unsigned int numberOfThreads,
 									unsigned int size_Mat,
 									bool isEvenTimestep)
 {
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QNoSlipADincomp7<<< gridQ, threads >>> (
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes);
+
+      QNoSlipADincomp7<<< grid.grid, grid.threads >>> (
 											   DD,
 											   DD7,
 											   temp,
@@ -2420,22 +1790,9 @@ void QNoSlipADincompDev27(  unsigned int numberOfThreads,
 									   unsigned int size_Mat,
 									   bool isEvenTimestep)
 {
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QNoSlipADincomp27<<< gridQ, threads >>> (
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes);
+
+      QNoSlipADincomp27<<< grid.grid, grid.threads >>> (
 											   DD,
 											   DD27,
 											   temp,
@@ -2468,24 +1825,10 @@ void QADVeloIncompDev7( unsigned int numberOfThreads,
 								   unsigned int size_Mat,
 								   bool isEvenTimestep)
 {
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QADVeloIncomp7<<< gridQ, threads >>> ( 
-											   DD,
-											   DD7,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes);
+
+      QADVeloIncomp7<<< grid.grid, grid.threads >>> ( DD,
+	  										   DD7,
 											   temp,
 											   velo,
 											   diffusivity,
@@ -2517,22 +1860,9 @@ void QADVeloIncompDev27(   unsigned int numberOfThreads,
 									  unsigned int size_Mat,
 									  bool isEvenTimestep)
 {
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QADVeloIncomp27<<< gridQ, threads >>> (
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes);
+
+      QADVeloIncomp27<<< grid.grid, grid.threads >>> (
 											  DD,
 											  DD27,
 											  temp,
@@ -2566,22 +1896,9 @@ void QADPressIncompDev7( unsigned int numberOfThreads,
 									  unsigned int size_Mat,
 									  bool isEvenTimestep)
 {
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QADPressIncomp7<<< gridQ, threads >>>(
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes);
+
+      QADPressIncomp7<<< grid.grid, grid.threads >>>(
 											   DD,
 											   DD7,
 											   temp,
@@ -2615,24 +1932,10 @@ void QADPressIncompDev27(  unsigned int numberOfThreads,
 									  unsigned int size_Mat,
 									  bool isEvenTimestep)
 {
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QADPressIncomp27<<< gridQ, threads >>>(
-											  DD,
-											  DD27,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes);
+
+      QADPressIncomp27<<< grid.grid, grid.threads >>>(DD, 
+	  										  DD27, 
 											  temp,
 											  velo,
 											  diffusivity,
@@ -2701,22 +2004,9 @@ void QDevCompThinWalls27(unsigned int numberOfThreads,
 									unsigned int size_Mat,
 									bool isEvenTimestep)
 {
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-   QDeviceCompThinWallsPartOne27 <<< gridQ, threads >>> (DD,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes);
+
+   QDeviceCompThinWallsPartOne27 <<< grid.grid, grid.threads >>> (DD,
 														 k_Q,
 														 QQ,
 														 numberOfBCnodes,
@@ -2728,7 +2018,7 @@ void QDevCompThinWalls27(unsigned int numberOfThreads,
 														 isEvenTimestep);
    getLastCudaError("QDeviceCompThinWallsPartOne27 execution failed");
 
-   QThinWallsPartTwo27 <<< gridQ, threads >>> ( DD,
+   QThinWallsPartTwo27 <<< grid.grid, grid.threads >>> ( DD,
 												k_Q,
 												QQ,
 												numberOfBCnodes,
@@ -2774,22 +2064,9 @@ void QDevIncompHighNu27( unsigned int numberOfThreads,
 									unsigned int size_Mat,
 									bool isEvenTimestep)
 {
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QDeviceIncompHighNu27<<< gridQ, threads >>> (
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes);
+
+      QDeviceIncompHighNu27<<< grid.grid, grid.threads >>> (
 												   DD,
 												   k_Q,
 												   QQ,
@@ -2815,22 +2092,9 @@ void QDevCompHighNu27(   unsigned int numberOfThreads,
 									unsigned int size_Mat,
 									bool isEvenTimestep)
 {
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QDeviceCompHighNu27<<< gridQ, threads >>> (
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes);
+
+      QDeviceCompHighNu27<<< grid.grid, grid.threads >>> (
 												   DD,
 												   k_Q,
 												   QQ,
@@ -2880,22 +2144,9 @@ void QVelDeviceCouette27(unsigned int numberOfThreads,
 									unsigned int size_Mat,
 									bool isEvenTimestep)
 {
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QVelDevCouette27<<< gridQ, threads >>> ( vx,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes);
+
+      QVelDevCouette27<<< grid.grid, grid.threads >>> ( vx,
 												vy,
 												vz,
 												DD,
@@ -2933,22 +2184,9 @@ void QVelDevice1h27(   unsigned int numberOfThreads,
 								  unsigned int size_Mat,
 								  bool isEvenTimestep)
 {
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QVelDev1h27<<< gridQ, threads >>> (nx,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes);
+
+   QVelDev1h27<<< grid.grid, grid.threads >>> (nx,
                                           ny,
                                           vx,
                                           vy,
@@ -3010,22 +2248,9 @@ void QVelDevCompPlusSlip27(unsigned int numberOfThreads,
 									  unsigned int size_Mat,
 									  bool isEvenTimestep)
 {
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QVelDeviceCompPlusSlip27<<< gridQ, threads >>> (
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes);
+
+      QVelDeviceCompPlusSlip27<<< grid.grid, grid.threads >>> (
 													  vx,
 													  vy,
 													  vz,
@@ -3081,22 +2306,9 @@ void QVelDevCompThinWalls27(unsigned int numberOfThreads,
 							           unsigned int size_Mat,
 							           bool isEvenTimestep)
 {
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-   QVelDeviceCompThinWallsPartOne27<<< gridQ, threads >>> (vx,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes);
+
+   QVelDeviceCompThinWallsPartOne27<<< grid.grid, grid.threads >>> (vx,
 											                  vy,
 											                  vz,
 											                  DD,
@@ -3111,7 +2323,7 @@ void QVelDevCompThinWalls27(unsigned int numberOfThreads,
 											                  isEvenTimestep);
    getLastCudaError("QVelDeviceCompThinWallsPartOne27 execution failed");
 
-	QThinWallsPartTwo27 <<< gridQ, threads >>> (
+	QThinWallsPartTwo27 <<< grid.grid, grid.threads >>> (
        DD,
        k_Q,
        QQ,
@@ -3163,22 +2375,9 @@ void QVelDevIncompHighNu27(unsigned int numberOfThreads,
 									  unsigned int size_Mat,
 									  bool isEvenTimestep)
 {
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QVelDeviceIncompHighNu27<<< gridQ, threads >>> (
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes);
+
+      QVelDeviceIncompHighNu27<<< grid.grid, grid.threads >>> (
 													  vx,
 													  vy,
 													  vz,
@@ -3210,22 +2409,9 @@ void QVelDevCompHighNu27(  unsigned int numberOfThreads,
 									  unsigned int size_Mat,
 									  bool isEvenTimestep)
 {
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QVelDeviceCompHighNu27<<< gridQ, threads >>> (
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes);
+
+      QVelDeviceCompHighNu27<<< grid.grid, grid.threads >>> (
 													  vx,
 													  vy,
 													  vz,
@@ -3256,22 +2442,9 @@ void QVeloDevEQ27(unsigned int numberOfThreads,
 							 unsigned int size_Mat,
 							 bool isEvenTimestep)
 {
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QVeloDeviceEQ27<<< gridQ, threads >>> (VeloX,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes);
+
+   QVeloDeviceEQ27<<< grid.grid, grid.threads >>> (VeloX,
 											 VeloY,
 											 VeloZ,
 											 DD,
@@ -3301,22 +2474,9 @@ void QVeloStreetDevEQ27(
 	uint  size_Mat,
 	bool  isEvenTimestep)
 {
-	int Grid = (numberOfStreetNodes / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid > 512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 gridQ(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	QVeloStreetDeviceEQ27 << < gridQ, threads >> > (
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfStreetNodes);
+
+	QVeloStreetDeviceEQ27 << < grid.grid, grid.threads >> > (
 		veloXfraction,
 		veloYfraction,
 		naschVelo,
@@ -3355,7 +2515,7 @@ void QSlipDevCompTurbulentViscosity27(LBMSimulationParameter* parameterDevice, Q
 {
    dim3 grid = vf::cuda::getCudaGrid( parameterDevice->numberofthreads, boundaryCondition->numberOfBCnodes);
    dim3 threads(parameterDevice->numberofthreads, 1, 1 );
-
+   
    QSlipDeviceComp27TurbViscosity<<< grid, threads >>> (
          parameterDevice->distributions.f[0],
          boundaryCondition->k,
@@ -3395,7 +2555,7 @@ void QSlipDevComp27(LBMSimulationParameter* parameterDevice, QforBoundaryConditi
 {
    dim3 grid = vf::cuda::getCudaGrid( parameterDevice->numberofthreads, boundaryCondition->numberOfBCnodes);
    dim3 threads(parameterDevice->numberofthreads, 1, 1 );
-
+   
    QSlipDeviceComp27<<< grid, threads >>> (
          parameterDevice->distributions.f[0],
          boundaryCondition->k,
@@ -3444,22 +2604,9 @@ void QSlipGeomDevComp27(unsigned int numberOfThreads,
 								   unsigned int size_Mat,
 								   bool isEvenTimestep)
 {
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QSlipGeomDeviceComp27<<< gridQ, threads >>> (DD,
+	vf::cuda::CudaGrid grid(numberOfThreads, numberOfBCnodes);
+
+   QSlipGeomDeviceComp27<<< grid.grid, grid.threads >>> (DD,
 												   k_Q,
 												   QQ,
 												   numberOfBCnodes,
@@ -3472,7 +2619,7 @@ void QSlipGeomDevComp27(unsigned int numberOfThreads,
 												   neighborZ,
 												   size_Mat,
 												   isEvenTimestep);
-      getLastCudaError("QSlipGeomDeviceComp27 execution failed");
+   getLastCudaError("QSlipGeomDeviceComp27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void QSlipNormDevComp27(unsigned int numberOfThreads,
@@ -3490,22 +2637,9 @@ void QSlipNormDevComp27(unsigned int numberOfThreads,
 								   unsigned int size_Mat,
 								   bool isEvenTimestep)
 {
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QSlipNormDeviceComp27<<< gridQ, threads >>> (DD,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes);
+
+   QSlipNormDeviceComp27<<< grid.grid, grid.threads >>> (DD,
 												   k_Q,
 												   QQ,
 												   numberOfBCnodes,
@@ -3676,22 +2810,9 @@ void QPressDevAntiBB27(  unsigned int numberOfThreads,
 									unsigned int size_Mat,
 									bool isEvenTimestep)
 {
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-    QPressDeviceAntiBB27<<< gridQ, threads >>>( rhoBC,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes);
+
+   QPressDeviceAntiBB27<<< grid.grid, grid.threads >>>( rhoBC,
 												vx,
 												vy,
 												vz,
@@ -3705,7 +2826,7 @@ void QPressDevAntiBB27(  unsigned int numberOfThreads,
 												neighborZ,
 												size_Mat,
 												isEvenTimestep);
-    getLastCudaError("QPressDeviceAntiBB27 execution failed");
+   getLastCudaError("QPressDeviceAntiBB27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void QPressDevFixBackflow27( unsigned int numberOfThreads,
@@ -3720,22 +2841,9 @@ void QPressDevFixBackflow27( unsigned int numberOfThreads,
                                         unsigned int size_Mat,
                                         bool isEvenTimestep)
 {
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QPressDeviceFixBackflow27<<< gridQ, threads >>> (  rhoBC,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes);
+
+   QPressDeviceFixBackflow27<<< grid.grid, grid.threads >>> (  rhoBC,
                                                          DD,
                                                          k_Q,
                                                          numberOfBCnodes,
@@ -3745,7 +2853,7 @@ void QPressDevFixBackflow27( unsigned int numberOfThreads,
                                                          neighborZ,
                                                          size_Mat,
                                                          isEvenTimestep);
-      getLastCudaError("QPressDeviceFixBackflow27 execution failed");
+   getLastCudaError("QPressDeviceFixBackflow27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void QPressDevDirDepBot27(  unsigned int numberOfThreads,
@@ -3760,22 +2868,9 @@ void QPressDevDirDepBot27(  unsigned int numberOfThreads,
                                        unsigned int size_Mat,
                                        bool isEvenTimestep)
 {
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QPressDeviceDirDepBot27<<< gridQ, threads >>> ( rhoBC,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes);
+
+   QPressDeviceDirDepBot27<<< grid.grid, grid.threads >>> ( rhoBC,
                                                       DD,
                                                       k_Q,
                                                       numberOfBCnodes,
@@ -3785,7 +2880,7 @@ void QPressDevDirDepBot27(  unsigned int numberOfThreads,
                                                       neighborZ,
                                                       size_Mat,
                                                       isEvenTimestep);
-      getLastCudaError("QPressDeviceDirDepBot27 execution failed");
+   getLastCudaError("QPressDeviceDirDepBot27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void QPressNoRhoDev27(LBMSimulationParameter* parameterDevice, QforBoundaryConditions* boundaryCondition)
@@ -3804,10 +2899,33 @@ void QPressNoRhoDev27(LBMSimulationParameter* parameterDevice, QforBoundaryCondi
          parameterDevice->neighborY,
          parameterDevice->neighborZ,
          parameterDevice->numberOfNodes,
-         parameterDevice->isEvenTimestep);
+         parameterDevice->isEvenTimestep,
+         vf::lbm::dir::DIR_P00);
    getLastCudaError("QPressNoRhoDevice27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
+void QPressZeroRhoOutflowDev27(LBMSimulationParameter* parameterDevice, QforBoundaryConditions* boundaryCondition)
+{ // Host-side wrapper: sets up the launch configuration and starts the QPressZeroRhoOutflowDevice27 kernel.
+   dim3 grid = vf::cuda::getCudaGrid( parameterDevice->numberofthreads,  boundaryCondition->numberOfBCnodes); // grid derived from the thread count and the number of boundary nodes
+   dim3 threads(parameterDevice->numberofthreads, 1, 1 ); // block uses only the first dimension
+   // Kernel arguments come from the boundary condition (RhoBC, k, kN, node count) and the simulation parameters.
+   QPressZeroRhoOutflowDevice27<<< grid, threads >>> (
+         boundaryCondition->RhoBC,
+         parameterDevice->distributions.f[0],
+         boundaryCondition->k,
+         boundaryCondition->kN,
+         boundaryCondition->numberOfBCnodes,
+         parameterDevice->omega,
+         parameterDevice->neighborX,
+         parameterDevice->neighborY,
+         parameterDevice->neighborZ,
+         parameterDevice->numberOfNodes,
+         parameterDevice->isEvenTimestep,
+         vf::lbm::dir::DIR_P00, // direction constant is fixed here rather than supplied by the caller
+         parameterDevice->outflowPressureCorrectionFactor);
+   getLastCudaError("QPressZeroRhoOutflowDev27 execution failed"); // NOTE(review): message names this wrapper; sibling wrappers name the launched kernel - confirm which is intended
+}
+//////////////////////////////////////////////////////////////////////////
 void QInflowScaleByPressDev27(LBMSimulationParameter* parameterDevice, QforBoundaryConditions* boundaryCondition)
 {
    dim3 grid = vf::cuda::getCudaGrid( parameterDevice->numberofthreads,  boundaryCondition->numberOfBCnodes);
@@ -3841,22 +2959,9 @@ void QPressDevOld27(  unsigned int numberOfThreads,
                                      unsigned int size_Mat,
                                      bool isEvenTimestep)
 {
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QPressDeviceOld27<<< gridQ, threads >>> ( rhoBC,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes);
+
+   QPressDeviceOld27<<< grid.grid, grid.threads >>> ( rhoBC,
                                                 DD,
                                                 k_Q,
                                                 k_N,
@@ -3867,7 +2972,7 @@ void QPressDevOld27(  unsigned int numberOfThreads,
                                                 neighborZ,
                                                 size_Mat,
                                                 isEvenTimestep);
-      getLastCudaError("QPressDeviceOld27 execution failed");
+   getLastCudaError("QPressDeviceOld27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
 void QPressDevIncompNEQ27(LBMSimulationParameter* parameterDevice, QforBoundaryConditions* boundaryCondition)
@@ -3941,22 +3046,9 @@ void QPressDevZero27(unsigned int numberOfThreads,
                                 unsigned int size_Mat,
                                 bool isEvenTimestep)
 {
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QPressDeviceZero27<<< gridQ, threads >>> (DD,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes);
+
+   QPressDeviceZero27<<< grid.grid, grid.threads >>> (DD,
                                                 k_Q,
                                                 numberOfBCnodes,
                                                 neighborX,
@@ -3964,7 +3056,7 @@ void QPressDevZero27(unsigned int numberOfThreads,
                                                 neighborZ,
                                                 size_Mat,
                                                 isEvenTimestep);
-      getLastCudaError("QPressDeviceOld27 execution failed");
+   getLastCudaError("QPressDeviceZero27 execution failed"); // message must name the kernel this wrapper launches (QPressDeviceZero27)
 }
 //////////////////////////////////////////////////////////////////////////
 void QPressDevFake27(     unsigned int numberOfThreads,
@@ -3980,22 +3072,10 @@ void QPressDevFake27(     unsigned int numberOfThreads,
                                      unsigned int size_Mat,
                                      bool isEvenTimestep)
 {
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      QPressDeviceFake27<<< gridQ, threads >>> (rhoBC,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes);
+
+
+      QPressDeviceFake27<<< grid.grid, grid.threads >>> (rhoBC,
                                                 DD,
                                                 k_Q,
                                                 k_N,
@@ -4040,22 +3120,9 @@ void QPressDev27_IntBB(  unsigned int numberOfThreads,
 									unsigned int size_Mat,
 									bool isEvenTimestep)
 {
-	int Grid = (numberOfBCnodes / numberOfThreads)+1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid/Grid1)+1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 gridQ(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1 );
-
-		QPressDevice27_IntBB<<< gridQ, threads >>> (rho,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes);
+
+	QPressDevice27_IntBB<<< grid.grid, grid.threads >>> (rho,
 													DD,
 													k_Q,
 													QQ,
@@ -4066,7 +3133,7 @@ void QPressDev27_IntBB(  unsigned int numberOfThreads,
 													neighborZ,
 													size_Mat,
 													isEvenTimestep);
-		getLastCudaError("QPressDevice27_IntBB execution failed");
+	getLastCudaError("QPressDevice27_IntBB execution failed");
 }
 // TODO: https://git.rz.tu-bs.de/irmb/VirtualFluids_dev/-/issues/29
 //////////////////////////////////////////////////////////////////////////
@@ -4087,22 +3154,9 @@ void PressSchlaffer27(unsigned int numberOfThreads,
                                  unsigned int size_Mat,
                                  bool isEvenTimestep)
 {
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      PressSchlaff27<<< gridQ, threads >>>(  rhoBC,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes);
+
+   PressSchlaff27<<< grid.grid, grid.threads >>>(  rhoBC,
                                              DD,
                                              vx0,
                                              vy0,
@@ -4117,7 +3171,7 @@ void PressSchlaffer27(unsigned int numberOfThreads,
                                              neighborZ,
                                              size_Mat,
                                              isEvenTimestep);
-      getLastCudaError("PressSchlaff27 execution failed");
+   getLastCudaError("PressSchlaff27 execution failed");
 }
 // TODO: https://git.rz.tu-bs.de/irmb/VirtualFluids_dev/-/issues/29
 //////////////////////////////////////////////////////////////////////////
@@ -4136,22 +3190,9 @@ void VelSchlaffer27(  unsigned int numberOfThreads,
                                  unsigned int size_Mat,
                                  bool isEvenTimestep)
 {
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      VelSchlaff27<<< gridQ, threads >>>( t,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes);
+
+   VelSchlaff27<<< grid.grid, grid.threads >>>( t,
                                           DD,
                                           vz0,
                                           deltaVz0,
@@ -4167,7 +3208,71 @@ void VelSchlaffer27(  unsigned int numberOfThreads,
       getLastCudaError("VelSchlaff27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
-void PropVelo(   unsigned int numberOfThreads,
+// Host-side launcher for the QPrecursorDeviceCompZeroPress kernel; forwards boundary-condition data and simulation fields unchanged.
+void QPrecursorDevCompZeroPress(LBMSimulationParameter* parameterDevice, QforPrecursorBoundaryConditions* boundaryCondition, real tRatio, real velocityRatio)
+{
+	// Launch configuration is built from the configured thread count and the number of boundary nodes.
+	vf::cuda::CudaGrid launchConfig(parameterDevice->numberofthreads, boundaryCondition->numberOfBCnodes);
+	QPrecursorDeviceCompZeroPress<<< launchConfig.grid, launchConfig.threads >>>(
+		boundaryCondition->k, boundaryCondition->numberOfBCnodes, boundaryCondition->numberOfPrecursorNodes, boundaryCondition->sizeQ,
+		parameterDevice->omega, parameterDevice->distributions.f[0], boundaryCondition->q27[0],
+		parameterDevice->neighborX, parameterDevice->neighborY, parameterDevice->neighborZ,
+		boundaryCondition->planeNeighborNT, boundaryCondition->planeNeighborNB, boundaryCondition->planeNeighborST, boundaryCondition->planeNeighborSB,
+		boundaryCondition->weightsNT, boundaryCondition->weightsNB, boundaryCondition->weightsST, boundaryCondition->weightsSB,
+		boundaryCondition->last, boundaryCondition->current, boundaryCondition->velocityX, boundaryCondition->velocityY, boundaryCondition->velocityZ,
+		tRatio, velocityRatio, parameterDevice->numberOfNodes, parameterDevice->isEvenTimestep);
+	// The message names the kernel launched above.
+	getLastCudaError("QPrecursorDeviceCompZeroPress execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+// Host-side launcher for the PrecursorDeviceEQ27 kernel; forwards boundary-condition data, simulation fields, tRatio and velocityRatio.
+void PrecursorDevEQ27( LBMSimulationParameter* parameterDevice, QforPrecursorBoundaryConditions* boundaryCondition, real tRatio, real velocityRatio)
+{
+	// Launch configuration is built from the configured thread count and the number of boundary nodes.
+	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(parameterDevice->numberofthreads, boundaryCondition->numberOfBCnodes);
+	// Fix: the neighbor arguments are X, Y, Z as in the sibling precursor launchers (neighborX was passed three times).
+	PrecursorDeviceEQ27<<< grid.grid, grid.threads >>>(boundaryCondition->k, boundaryCondition->numberOfBCnodes, boundaryCondition->numberOfPrecursorNodes, parameterDevice->omega, parameterDevice->distributions.f[0],
+		parameterDevice->neighborX, parameterDevice->neighborY, parameterDevice->neighborZ,
+		boundaryCondition->planeNeighborNT, boundaryCondition->planeNeighborNB, boundaryCondition->planeNeighborST, boundaryCondition->planeNeighborSB,
+		boundaryCondition->weightsNT, boundaryCondition->weightsNB, boundaryCondition->weightsST, boundaryCondition->weightsSB,
+		boundaryCondition->last, boundaryCondition->current,
+		boundaryCondition->velocityX, boundaryCondition->velocityY, boundaryCondition->velocityZ,
+		tRatio, velocityRatio, parameterDevice->numberOfNodes, parameterDevice->isEvenTimestep);
+	getLastCudaError("PrecursorDeviceEQ27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+// Host-side launcher for the PrecursorDeviceDistributions kernel. velocityRatio is accepted but not forwarded to the kernel.
+void PrecursorDevDistributions( LBMSimulationParameter* parameterDevice, QforPrecursorBoundaryConditions* boundaryCondition, real tRatio, real velocityRatio)
+{
+	// Launch configuration is built from the configured thread count and the number of boundary nodes.
+	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(parameterDevice->numberofthreads, boundaryCondition->numberOfBCnodes);
+	PrecursorDeviceDistributions<<< grid.grid, grid.threads >>>(boundaryCondition->k, boundaryCondition->numberOfBCnodes, boundaryCondition->numberOfPrecursorNodes, parameterDevice->distributions.f[0],
+		parameterDevice->neighborX, parameterDevice->neighborY, parameterDevice->neighborZ,
+		boundaryCondition->planeNeighborNT, boundaryCondition->planeNeighborNB, boundaryCondition->planeNeighborST, boundaryCondition->planeNeighborSB,
+		boundaryCondition->weightsNT, boundaryCondition->weightsNB, boundaryCondition->weightsST, boundaryCondition->weightsSB,
+		boundaryCondition->last, boundaryCondition->current,
+		tRatio, parameterDevice->numberOfNodes, parameterDevice->isEvenTimestep);
+	// Fix: the message now names the kernel launched here (it previously named QPrecursorDeviceCompZeroPress).
+	getLastCudaError("PrecursorDeviceDistributions execution failed");
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Host-side launcher for the QPrecursorDeviceDistributions kernel. velocityRatio is accepted but not forwarded to the kernel.
+void QPrecursorDevDistributions( LBMSimulationParameter* parameterDevice, QforPrecursorBoundaryConditions* boundaryCondition, real tRatio, real velocityRatio)
+{
+	// Launch configuration is built from the configured thread count and the number of boundary nodes.
+	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(parameterDevice->numberofthreads, boundaryCondition->numberOfBCnodes);
+	QPrecursorDeviceDistributions<<< grid.grid, grid.threads >>>(boundaryCondition->k, boundaryCondition->q27[0], boundaryCondition->sizeQ, boundaryCondition->numberOfBCnodes, boundaryCondition->numberOfPrecursorNodes, parameterDevice->distributions.f[0],
+		parameterDevice->neighborX, parameterDevice->neighborY, parameterDevice->neighborZ,
+		boundaryCondition->planeNeighborNT, boundaryCondition->planeNeighborNB, boundaryCondition->planeNeighborST, boundaryCondition->planeNeighborSB,
+		boundaryCondition->weightsNT, boundaryCondition->weightsNB, boundaryCondition->weightsST, boundaryCondition->weightsSB,
+		boundaryCondition->last, boundaryCondition->current,
+		tRatio, parameterDevice->numberOfNodes, parameterDevice->isEvenTimestep);
+	// Fix: the message now names the kernel launched here (it previously named QPrecursorDeviceCompZeroPress).
+	getLastCudaError("QPrecursorDeviceDistributions execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
+extern "C" void PropVelo(   unsigned int numberOfThreads,
                             unsigned int* neighborX,
                             unsigned int* neighborY,
                             unsigned int* neighborZ,
@@ -4182,22 +3287,9 @@ void PropVelo(   unsigned int numberOfThreads,
                             real* DD,
                             bool EvenOrOdd)
 {
-   int Grid = (size_Prop / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 grid(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      PropellerBC<<< grid, threads >>>(neighborX,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Prop);
+
+      PropellerBC<<< grid.grid, grid.threads >>>(neighborX,
                                        neighborY,
                                        neighborZ,
                                        rho,
@@ -4236,22 +3328,9 @@ void ScaleCF27( real* DC,
                         unsigned int nyF,
                         unsigned int numberOfThreads)
 {
-   int Grid = (kCF / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_CF(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleCF27<<< gridINT_CF, threads >>> ( DC,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kCF);
+   
+      scaleCF27<<< grid.grid, grid.threads >>> ( DC,
                                              DF,
                                              neighborCX,
                                              neighborCY,
@@ -4299,22 +3378,9 @@ void ScaleCFEff27(real* DC,
                              unsigned int numberOfThreads,
                              OffCF offCF)
 {
-   int Grid = (kCF / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_CF(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleCFEff27<<< gridINT_CF, threads >>> ( DC,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kCF);
+
+      scaleCFEff27<<< grid.grid, grid.threads >>> ( DC,
                                                 DF,
                                                 neighborCX,
                                                 neighborCY,
@@ -4363,22 +3429,9 @@ void ScaleCFLast27(real* DC,
                               unsigned int numberOfThreads,
                               OffCF offCF)
 {
-   int Grid = (kCF / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_CF(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleCFLast27<<< gridINT_CF, threads >>> (DC,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kCF);
+
+      scaleCFLast27<<< grid.grid, grid.threads >>> (DC,
                                                 DF,
                                                 neighborCX,
                                                 neighborCY,
@@ -4427,22 +3480,9 @@ void ScaleCFpress27(  real* DC,
                                  unsigned int numberOfThreads,
                                  OffCF offCF)
 {
-   int Grid = (kCF / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_CF(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleCFpress27<<< gridINT_CF, threads >>>(DC,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kCF);
+
+      scaleCFpress27<<< grid.grid, grid.threads >>>(DC,
                                                 DF,
                                                 neighborCX,
                                                 neighborCY,
@@ -4491,22 +3531,9 @@ void ScaleCF_Fix_27(  real* DC,
                                  unsigned int numberOfThreads,
                                  OffCF offCF)
 {
-   int Grid = (kCF / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_CF(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleCF_Fix_27<<< gridINT_CF, threads >>>(DC,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kCF);
+
+      scaleCF_Fix_27<<< grid.grid, grid.threads >>>(DC,
                                                 DF,
                                                 neighborCX,
                                                 neighborCY,
@@ -4555,22 +3582,9 @@ void ScaleCF_Fix_comp_27( real* DC,
 									 unsigned int numberOfThreads,
 									 OffCF offCF)
 {
-   int Grid = (kCF / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_CF(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleCF_Fix_comp_27<<< gridINT_CF, threads >>>(   DC,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kCF);
+
+      scaleCF_Fix_comp_27<<< grid.grid, grid.threads >>>(   DC,
 														DF,
 														neighborCX,
 														neighborCY,
@@ -4620,22 +3634,9 @@ void ScaleCF_0817_comp_27(real* DC,
 									 OffCF offCF,
                             CUstream_st *stream)
 {
-   int Grid = (kCF / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_CF(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleCF_0817_comp_27<<< gridINT_CF, threads, 0, stream >>>(  DC,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kCF);
+
+      scaleCF_0817_comp_27<<< grid.grid, grid.threads, 0, stream >>>(  DC,
 														DF,
 														neighborCX,
 														neighborCY,
@@ -4685,22 +3686,9 @@ void ScaleCF_comp_D3Q27F3_2018(real* DC,
 										  unsigned int numberOfThreads,
 										  OffCF offCF)
 {
-   int Grid = (kCF / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_CF(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleCF_comp_D3Q27F3_2018 <<< gridINT_CF, threads >>>(DC,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kCF);
+
+      scaleCF_comp_D3Q27F3_2018 <<< grid.grid, grid.threads >>>(DC,
 															DF,
 															G6,
 															neighborCX,
@@ -4752,22 +3740,9 @@ void ScaleCF_comp_D3Q27F3(real* DC,
 									 OffCF offCF,
                             CUstream_st *stream)
 {
-   int Grid = (kCF / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_CF(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleCF_comp_D3Q27F3 <<< gridINT_CF, threads, 0, stream >>>( DC,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kCF);
+
+      scaleCF_comp_D3Q27F3 <<< grid.grid, grid.threads, 0, stream >>>( DC,
 														DF,
 														G6,
 														neighborCX,
@@ -4817,22 +3792,9 @@ void ScaleCF_staggered_time_comp_27(  real* DC,
 												 unsigned int numberOfThreads,
 												 OffCF offCF)
 {
-   int Grid = (kCF / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_CF(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleCF_staggered_time_comp_27<<< gridINT_CF, threads >>>(    DC,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kCF);
+
+      scaleCF_staggered_time_comp_27<<< grid.grid, grid.threads >>>(    DC,
 																	DF,
 																	neighborCX,
 																	neighborCY,
@@ -4940,22 +3902,9 @@ void ScaleCF_RhoSq_3rdMom_comp_27(real* DC,
 											 OffCF offCF,
                                   CUstream_st *stream)
 {
-   int Grid = (kCF / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_CF(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleCF_RhoSq_3rdMom_comp_27<<< gridINT_CF, threads, 0, stream >>>(  DC,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kCF);
+
+      scaleCF_RhoSq_3rdMom_comp_27<<< grid.grid, grid.threads, 0, stream >>>(  DC,
 																DF,
 																neighborCX,
 																neighborCY,
@@ -5005,22 +3954,9 @@ void ScaleCF_AA2016_comp_27(real* DC,
 									   OffCF offCF,
                               CUstream_st *stream)
 {
-   int Grid = (kCF / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_CF(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleCF_AA2016_comp_27<<< gridINT_CF, threads, 0, stream >>>(DC,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kCF);
+
+      scaleCF_AA2016_comp_27<<< grid.grid, grid.threads, 0, stream >>>(DC,
 														DF,
 														neighborCX,
 														neighborCY,
@@ -5069,22 +4005,9 @@ void ScaleCF_NSPress_27(  real* DC,
 									 unsigned int numberOfThreads,
 									 OffCF offCF)
 {
-   int Grid = (kCF / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_CF(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleCF_NSPress_27<<< gridINT_CF, threads >>>(DC,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kCF);
+
+      scaleCF_NSPress_27<<< grid.grid, grid.threads >>>(DC,
 													DF,
 													neighborCX,
 													neighborCY,
@@ -5130,22 +4053,9 @@ void ScaleCFThSMG7(   real* DC,
                                  unsigned int numberOfThreads,
                                  OffCF offCF)
 {
-   int Grid = (kCF / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_CF(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleCFThSMG7<<< gridINT_CF, threads >>> (DC,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kCF);
+
+      scaleCFThSMG7<<< grid.grid, grid.threads >>> (DC,
                                                 DF,
                                                 DD7C,
                                                 DD7F,
@@ -5187,22 +4097,9 @@ void ScaleCFThS7(  real* DC,
                               real diffusivity_fine,
                               unsigned int numberOfThreads)
 {
-   int Grid = (kCF / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_CF(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleCFThS7<<< gridINT_CF, threads >>> (  DC,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kCF);
+
+      scaleCFThS7<<< grid.grid, grid.threads >>> (  DC,
                                                 DF,
                                                 DD7C,
                                                 DD7F,
@@ -5244,22 +4141,9 @@ void ScaleCFThS27( real* DC,
                               unsigned int numberOfThreads,
 							  OffCF offCF)
 {
-   int Grid = (kCF / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_CF(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleCFThS27<<< gridINT_CF, threads >>> ( DC,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kCF);
+
+      scaleCFThS27<<< grid.grid, grid.threads >>> ( DC,
                                                 DF,
                                                 DD27C,
                                                 DD27F,
@@ -5304,22 +4188,10 @@ void ScaleFC27( real* DC,
                            unsigned int nyF,
                            unsigned int numberOfThreads)
 {
-   int Grid = (kFC / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_FC(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleFC27<<< gridINT_FC, threads >>> ( DC,
+   
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kFC);
+
+      scaleFC27<<< grid.grid, grid.threads >>> ( DC,
                                              DF,
                                              neighborCX,
                                              neighborCY,
@@ -5367,22 +4239,9 @@ void ScaleFCEff27(real* DC,
                              unsigned int numberOfThreads,
                              OffFC offFC)
 {
-   int Grid = (kFC / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_FC(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleFCEff27<<< gridINT_FC, threads >>> ( DC,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kFC);
+
+      scaleFCEff27<<< grid.grid, grid.threads >>> ( DC,
                                                 DF,
                                                 neighborCX,
                                                 neighborCY,
@@ -5431,22 +4290,9 @@ void ScaleFCLast27(real* DC,
                               unsigned int numberOfThreads,
                               OffFC offFC)
 {
-   int Grid = (kFC / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_FC(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleFCLast27<<< gridINT_FC, threads >>> (DC,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kFC);
+
+      scaleFCLast27<<< grid.grid, grid.threads >>> (DC,
                                                 DF,
                                                 neighborCX,
                                                 neighborCY,
@@ -5495,22 +4341,9 @@ void ScaleFCpress27(real* DC,
                               unsigned int numberOfThreads,
                               OffFC offFC)
 {
-   int Grid = (kFC / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_FC(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleFCpress27<<< gridINT_FC, threads >>> (  DC,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kFC);
+
+      scaleFCpress27<<< grid.grid, grid.threads >>> (  DC,
                                                    DF,
                                                    neighborCX,
                                                    neighborCY,
@@ -5559,22 +4392,9 @@ void ScaleFC_Fix_27(real* DC,
                               unsigned int numberOfThreads,
                               OffFC offFC)
 {
-   int Grid = (kFC / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_FC(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleFC_Fix_27<<< gridINT_FC, threads >>> (  DC,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kFC);
+
+      scaleFC_Fix_27<<< grid.grid, grid.threads >>> (  DC,
                                                    DF,
                                                    neighborCX,
                                                    neighborCY,
@@ -5623,22 +4443,9 @@ void ScaleFC_Fix_comp_27(  real* DC,
 									  unsigned int numberOfThreads,
 									  OffFC offFC)
 {
-   int Grid = (kFC / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_FC(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleFC_Fix_comp_27<<< gridINT_FC, threads >>> ( DC,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kFC);
+
+      scaleFC_Fix_comp_27<<< grid.grid, grid.threads >>> ( DC,
 													   DF,
 													   neighborCX,
 													   neighborCY,
@@ -5688,22 +4495,9 @@ void ScaleFC_0817_comp_27( real* DC,
 									  OffFC offFC,
                              CUstream_st *stream)
 {
-   int Grid = (kFC / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_FC(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleFC_0817_comp_27<<< gridINT_FC, threads, 0, stream >>> (DC,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kFC);
+
+      scaleFC_0817_comp_27<<< grid.grid, grid.threads, 0, stream >>> (DC,
 													   DF,
 													   neighborCX,
 													   neighborCY,
@@ -5753,22 +4547,9 @@ void ScaleFC_comp_D3Q27F3_2018( real* DC,
 										   unsigned int numberOfThreads,
 										   OffFC offFC)
 {
-   int Grid = (kFC / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_FC(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-     scaleFC_comp_D3Q27F3_2018 <<< gridINT_FC, threads >>> (DC,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kFC);
+
+     scaleFC_comp_D3Q27F3_2018 <<< grid.grid, grid.threads >>> (DC,
 															DF,
 															G6,
 															neighborCX,
@@ -5820,22 +4601,9 @@ void ScaleFC_comp_D3Q27F3( real* DC,
 									  OffFC offFC,
                              CUstream_st *stream)
 {
-   int Grid = (kFC / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_FC(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-     scaleFC_comp_D3Q27F3 <<< gridINT_FC, threads, 0, stream >>> (DC,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kFC);
+
+     scaleFC_comp_D3Q27F3 <<< grid.grid, grid.threads, 0, stream >>> (DC,
 													   DF,
 													   G6,
 													   neighborCX,
@@ -5885,22 +4653,9 @@ void ScaleFC_staggered_time_comp_27(   real* DC,
 												  unsigned int numberOfThreads,
 												  OffFC offFC)
 {
-   int Grid = (kFC / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_FC(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleFC_staggered_time_comp_27<<< gridINT_FC, threads >>> (  DC,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kFC);
+
+      scaleFC_staggered_time_comp_27<<< grid.grid, grid.threads >>> (  DC,
 																   DF,
 																   neighborCX,
 																   neighborCY,
@@ -6007,22 +4762,9 @@ void ScaleFC_RhoSq_3rdMom_comp_27( real* DC,
 											  OffFC offFC,
                                    CUstream_st *stream)
 {
-   int Grid = (kFC / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_FC(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleFC_RhoSq_3rdMom_comp_27<<< gridINT_FC, threads, 0, stream >>>(DC,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kFC);
+
+      scaleFC_RhoSq_3rdMom_comp_27<<< grid.grid, grid.threads, 0, stream >>>(DC,
 															  DF,
 															  neighborCX,
 															  neighborCY,
@@ -6072,22 +4814,9 @@ void ScaleFC_AA2016_comp_27( real* DC,
 										OffFC offFC,
                               CUstream_st *stream)
 {
-   int Grid = (kFC / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_FC(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleFC_AA2016_comp_27<<< gridINT_FC, threads, 0, stream >>>(DC,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kFC);
+
+      scaleFC_AA2016_comp_27<<< grid.grid, grid.threads, 0, stream >>>(DC,
 														DF,
 														neighborCX,
 														neighborCY,
@@ -6136,22 +4865,9 @@ void ScaleFC_NSPress_27(real* DC,
 								  unsigned int numberOfThreads,
 								  OffFC offFC)
 {
-   int Grid = (kFC / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_FC(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleFC_NSPress_27<<< gridINT_FC, threads >>> (  DC,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kFC);
+
+      scaleFC_NSPress_27<<< grid.grid, grid.threads >>> (  DC,
 													   DF,
 													   neighborCX,
 													   neighborCY,
@@ -6197,22 +4913,9 @@ void ScaleFCThSMG7(real* DC,
                               unsigned int numberOfThreads,
                               OffFC offFC)
 {
-   int Grid = (kFC / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_FC(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleFCThSMG7<<< gridINT_FC, threads >>>( DC,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kFC);
+
+      scaleFCThSMG7<<< grid.grid, grid.threads >>>( DC,
                                                 DF,
                                                 DD7C,
                                                 DD7F,
@@ -6254,22 +4957,9 @@ void ScaleFCThS7(  real* DC,
                               real diffusivity_coarse,
                               unsigned int numberOfThreads)
 {
-   int Grid = (kFC / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_FC(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleFCThS7<<< gridINT_FC, threads >>>(DC,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kFC);
+
+      scaleFCThS7<<< grid.grid, grid.threads >>>(DC,
                                              DF,
                                              DD7C,
                                              DD7F,
@@ -6311,22 +5001,9 @@ void ScaleFCThS27( real* DC,
                               unsigned int numberOfThreads,
 							  OffFC offFC)
 {
-   int Grid = (kFC / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridINT_FC(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      scaleFCThS27<<< gridINT_FC, threads >>>(  DC,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, kFC);
+
+      scaleFCThS27<<< grid.grid, grid.threads >>>(  DC,
                                                 DF,
                                                 DD27C,
                                                 DD27F,
@@ -6362,22 +5039,9 @@ void DragLiftPostD27(real* DD,
 								bool isEvenTimestep,
 								unsigned int numberOfThreads)
 {
-	int Grid = (numberOfBCnodes / numberOfThreads)+1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid/Grid1)+1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1 );
-
-	DragLiftPost27<<< grid, threads >>>(DD,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes);
+
+	DragLiftPost27<<< grid.grid, grid.threads >>>(DD,
 										k_Q,
 										QQ,
 										numberOfBCnodes,
@@ -6406,22 +5070,9 @@ void DragLiftPreD27( real* DD,
 								bool isEvenTimestep,
 								unsigned int numberOfThreads)
 {
-	int Grid = (numberOfBCnodes / numberOfThreads)+1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid/Grid1)+1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1 );
-
-	DragLiftPre27<<< grid, threads >>>( DD,
+	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes);
+
+	DragLiftPre27<<< grid.grid, grid.threads >>>( DD,
 										k_Q,
 										QQ,
 										numberOfBCnodes,
@@ -6447,22 +5098,9 @@ void CalcCPtop27(real* DD,
 							bool isEvenTimestep,
 							unsigned int numberOfThreads)
 {
-	int Grid = (nonCp / numberOfThreads)+1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid/Grid1)+1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1 );
-
-	CalcCP27<<< grid, threads >>>(DD,
+	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, nonCp);
+
+	CalcCP27<<< grid.grid, grid.threads >>>(DD,
 								  cpIndex,
 								  nonCp,
 								  cpPress,
@@ -6485,22 +5123,9 @@ void CalcCPbottom27( real* DD,
 								bool isEvenTimestep,
 								unsigned int numberOfThreads)
 {
-	int Grid = (nonCp / numberOfThreads)+1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid/Grid1)+1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1 );
-
-	CalcCP27<<< grid, threads >>>(DD,
+	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, nonCp);
+
+	CalcCP27<<< grid.grid, grid.threads >>>(DD,
 								  cpIndex,
 								  nonCp,
 								  cpPress,
@@ -6524,22 +5149,9 @@ void GetSendFsPreDev27(real* DD,
 								  unsigned int numberOfThreads,
 								  cudaStream_t stream)
 {
-	int Grid = (buffmax / numberOfThreads)+1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid/Grid1)+1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1 );
-
-	getSendFsPre27<<< grid, threads, 0, stream >>>(DD,
+	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, buffmax);
+
+	getSendFsPre27<<< grid.grid, grid.threads, 0, stream >>>(DD,
 										bufferFs,
 										sendIndex,
 										buffmax,
@@ -6563,22 +5175,9 @@ void GetSendFsPostDev27(real* DD,
 								   unsigned int numberOfThreads,
 								   cudaStream_t stream)
 {
-	int Grid = (buffmax / numberOfThreads)+1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid/Grid1)+1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1 );
-
-	getSendFsPost27<<< grid, threads, 0, stream >>>(DD,
+	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, buffmax);
+
+	getSendFsPost27<<< grid.grid, grid.threads, 0, stream >>>(DD,
 										 bufferFs,
 										 sendIndex,
 										 buffmax,
@@ -6602,22 +5201,9 @@ void SetRecvFsPreDev27(real* DD,
 								  unsigned int numberOfThreads,
 	                              cudaStream_t stream)
 {
-	int Grid = (buffmax / numberOfThreads)+1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid/Grid1)+1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1 );
-
-	setRecvFsPre27<<< grid, threads, 0, stream >>>(DD,
+	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, buffmax);
+
+	setRecvFsPre27<<< grid.grid, grid.threads, 0, stream >>>(DD,
 										bufferFs,
 										recvIndex,
 										buffmax,
@@ -6641,22 +5227,9 @@ void SetRecvFsPostDev27(real* DD,
 	                               unsigned int numberOfThreads,
 	                               cudaStream_t stream)
 {
-	int Grid = (buffmax / numberOfThreads)+1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid/Grid1)+1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1 );
-
-	setRecvFsPost27<<< grid, threads, 0, stream >>>(DD,
+	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, buffmax);
+
+	setRecvFsPost27<<< grid.grid, grid.threads, 0, stream >>>(DD,
 										 bufferFs,
 										 recvIndex,
 										 buffmax,
@@ -6680,22 +5253,9 @@ void getSendGsDevF3(
 	bool isEvenTimestep,
 	unsigned int numberOfThreads)
 {
-	int Grid = (buffmax / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid > 512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	getSendGsF3 <<< grid, threads >>> (
+	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, buffmax);
+
+	getSendGsF3 <<< grid.grid, grid.threads >>> (
 		G6,
 		bufferGs,
 		sendIndex,
@@ -6720,22 +5280,9 @@ void setRecvGsDevF3(
 	bool isEvenTimestep,
 	unsigned int numberOfThreads)
 {
-	int Grid = (buffmax / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid > 512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	setRecvGsF3 <<< grid, threads >>> (
+	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, buffmax);
+
+	setRecvGsF3 <<< grid.grid, grid.threads >>> (
 		G6,
 		bufferGs,
 		recvIndex,
@@ -6763,22 +5310,9 @@ void WallFuncDev27(unsigned int numberOfThreads,
 							  unsigned int size_Mat,
 							  bool isEvenTimestep)
 {
-   int Grid = (numberOfBCnodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      WallFunction27<<< gridQ, threads >>> (
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfBCnodes);
+
+      WallFunction27<<< grid.grid, grid.threads >>> (
 											  vx,
 											  vy,
 											  vz,
@@ -6814,22 +5348,9 @@ void SetOutputWallVelocitySP27(unsigned int numberOfThreads,
 										  real* DD,
 										  bool isEvenTimestep)
 {
-   int Grid = (numberOfWallNodes / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      LBSetOutputWallVelocitySP27<<< gridQ, threads >>> (	vxD,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfWallNodes);
+
+      LBSetOutputWallVelocitySP27<<< grid.grid, grid.threads >>> (	vxD,
 															vyD,
 															vzD,
 															vxWall,
@@ -6862,22 +5383,9 @@ void GetVelotoForce27(unsigned int numberOfThreads,
 								 unsigned int size_Mat,
 								 bool isEvenTimestep)
 {
-   int Grid = (nonAtBC / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-      GetVeloforForcing27<<< gridQ, threads >>> (DD,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, nonAtBC);
+
+      GetVeloforForcing27<<< grid.grid, grid.threads >>> (DD,
 												bcIndex,
 												nonAtBC,
 												Vx,
@@ -6911,27 +5419,14 @@ void InitParticlesDevice(real* coordX,
 									unsigned int* neighborY,
 									unsigned int* neighborZ,
 									unsigned int* neighborWSB,
-							        int level,
+									int level,
 									unsigned int numberOfParticles,
 									unsigned int size_Mat,
 									unsigned int numberOfThreads)
 {
-   int Grid = (numberOfParticles / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-   InitParticles<<< gridQ, threads >>> (coordX,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfParticles);
+
+   InitParticles<<< grid.grid, grid.threads >>> (coordX,
 										coordY,
 										coordZ,
 										coordParticleXlocal,
@@ -6986,22 +5481,9 @@ void MoveParticlesDevice(real* coordX,
 									unsigned int numberOfThreads,
 									bool isEvenTimestep)
 {
-   int Grid = (numberOfParticles / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-   MoveParticles<<< gridQ, threads >>> (coordX,
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, numberOfParticles);
+
+   MoveParticles<<< grid.grid, grid.threads >>> (coordX,
 										coordY,
 										coordZ,
 										coordParticleXlocal,
@@ -7035,22 +5517,8 @@ void initRandomDevice(curandState* state,
 								 unsigned int size_Mat,
 								 unsigned int numberOfThreads)
 {
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-   initRandom<<< gridQ, threads >>> (state);
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Mat);
+   initRandom<<< grid.grid, grid.threads >>> (state);
    getLastCudaError("initRandom execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
@@ -7059,22 +5527,8 @@ void generateRandomValuesDevice( curandState* state,
 											real* randArray,
 											unsigned int numberOfThreads)
 {
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-   generateRandomValues<<< gridQ, threads >>> (state,randArray);
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Mat);
+   generateRandomValues<<< grid.grid, grid.threads >>> (state,randArray);
    getLastCudaError("generateRandomValues execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
@@ -7097,22 +5551,8 @@ void CalcTurbulenceIntensityDevice(
    bool isEvenTimestep,
    uint numberOfThreads)
 {
-   int Grid = (size_Mat / numberOfThreads)+1;
-   int Grid1, Grid2;
-   if (Grid>512)
-   {
-      Grid1 = 512;
-      Grid2 = (Grid/Grid1)+1;
-   }
-   else
-   {
-      Grid1 = 1;
-      Grid2 = Grid;
-   }
-   dim3 gridQ(Grid1, Grid2);
-   dim3 threads(numberOfThreads, 1, 1 );
-
-   CalcTurbulenceIntensity<<<gridQ, threads>>>(
+   vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(numberOfThreads, size_Mat);
+   CalcTurbulenceIntensity<<<grid.grid, grid.threads>>>(
      vxx,
      vyy,
      vzz,
diff --git a/src/gpu/VirtualFluids_GPU/GPU/PrecursorBCs27.cu b/src/gpu/VirtualFluids_GPU/GPU/PrecursorBCs27.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a0daa5c229aabac360a71ae0f538f74124e3e963
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/GPU/PrecursorBCs27.cu
@@ -0,0 +1,966 @@
+#include "LBM/LB.h" 
+#include <lbm/constants/NumericConstants.h>
+#include <lbm/constants/D3Q27.h>
+#include <lbm/MacroscopicQuantities.h>
+
+#include "VirtualFluids_GPU/Kernel/Utilities/DistributionHelper.cuh"
+#include "VirtualFluids_GPU/GPU/KernelUtilities.h"
+
+using namespace vf::lbm::constant;
+using namespace vf::lbm::dir;
+
+//! \brief Velocity boundary condition whose imposed velocity is taken from precursor data (vLast, vCurrent).
+//! The precursor velocity is a weighted average over up to four precursor nodes (NT, NB, ST, SB), blended linearly
+//! between vLast and vCurrent with tRatio, offset by (velocityX, velocityY, velocityZ) and divided by velocityRatio.
+//! vLast and vCurrent each hold the x-, y- and z-components as three consecutive blocks of numberOfPrecursorNodes values.
+//! \note sizeQ is not used inside this kernel.
+__global__ void QPrecursorDeviceCompZeroPress( 	int* subgridDistanceIndices,
+                                                int numberOfBCnodes,
+                                                int numberOfPrecursorNodes,
+                                                int sizeQ,
+                                                real omega,
+                                                real* distributions,
+                                                real* subgridDistances,
+                                                uint* neighborX,
+                                                uint* neighborY,
+                                                uint* neighborZ,
+                                                uint* neighborsNT,
+                                                uint* neighborsNB,
+                                                uint* neighborsST,
+                                                uint* neighborsSB,
+                                                real* weightsNT,
+                                                real* weightsNB,
+                                                real* weightsST,
+                                                real* weightsSB,
+                                                real* vLast,
+                                                real* vCurrent,
+                                                real velocityX,
+                                                real velocityY,
+                                                real velocityZ,
+                                                real tRatio,
+                                                real velocityRatio,
+                                                unsigned long long numberOfLBnodes,
+                                                bool isEvenTimestep)
+{
+    const unsigned k = vf::gpu::getNodeIndex();
+
+    if(k>=numberOfBCnodes) return;
+
+    ////////////////////////////////////////////////////////////////////////////////
+    //! - Interpolate the precursor velocities (vLast and vCurrent) for this boundary node
+    real vxLastInterpd, vyLastInterpd, vzLastInterpd;
+    real vxNextInterpd, vyNextInterpd, vzNextInterpd;
+
+    uint kNT = neighborsNT[k];
+    real dNT = weightsNT[k];
+
+    real* vxLast = vLast;
+    real* vyLast = &vLast[numberOfPrecursorNodes];
+    real* vzLast = &vLast[2*numberOfPrecursorNodes];
+
+    real* vxCurrent = vCurrent;
+    real* vyCurrent = &vCurrent[numberOfPrecursorNodes];
+    real* vzCurrent = &vCurrent[2*numberOfPrecursorNodes];
+
+    if(dNT < 1e6) // weighted average over the four precursor neighbors
+    {
+        uint kNB = neighborsNB[k];
+        uint kST = neighborsST[k];
+        uint kSB = neighborsSB[k];
+
+        real dNB = weightsNB[k];
+        real dST = weightsST[k];
+        real dSB = weightsSB[k];
+
+        real invWeightSum = 1.f/(dNT+dNB+dST+dSB);
+
+        vxLastInterpd = (vxLast[kNT]*dNT + vxLast[kNB]*dNB + vxLast[kST]*dST + vxLast[kSB]*dSB)*invWeightSum;
+        vyLastInterpd = (vyLast[kNT]*dNT + vyLast[kNB]*dNB + vyLast[kST]*dST + vyLast[kSB]*dSB)*invWeightSum;
+        vzLastInterpd = (vzLast[kNT]*dNT + vzLast[kNB]*dNB + vzLast[kST]*dST + vzLast[kSB]*dSB)*invWeightSum;
+
+        vxNextInterpd = (vxCurrent[kNT]*dNT + vxCurrent[kNB]*dNB + vxCurrent[kST]*dST + vxCurrent[kSB]*dSB)*invWeightSum;
+        vyNextInterpd = (vyCurrent[kNT]*dNT + vyCurrent[kNB]*dNB + vyCurrent[kST]*dST + vyCurrent[kSB]*dSB)*invWeightSum;
+        vzNextInterpd = (vzCurrent[kNT]*dNT + vzCurrent[kNB]*dNB + vzCurrent[kST]*dST + vzCurrent[kSB]*dSB)*invWeightSum;
+    }
+    else // weight of 1e6 or more: take the value at kNT directly (presumably a coincident precursor node - TODO confirm)
+    {
+        vxLastInterpd = vxLast[kNT];
+        vyLastInterpd = vyLast[kNT];
+        vzLastInterpd = vzLast[kNT];
+
+        vxNextInterpd = vxCurrent[kNT];
+        vyNextInterpd = vyCurrent[kNT];
+        vzNextInterpd = vzCurrent[kNT];
+    }
+
+    real VeloX = (velocityX + (1.f-tRatio)*vxLastInterpd + tRatio*vxNextInterpd)/velocityRatio;
+    real VeloY = (velocityY + (1.f-tRatio)*vyLastInterpd + tRatio*vyNextInterpd)/velocityRatio;
+    real VeloZ = (velocityZ + (1.f-tRatio)*vzLastInterpd + tRatio*vzNextInterpd)/velocityRatio;
+    // From here on just a copy of QVelDeviceCompZeroPress
+    ////////////////////////////////////////////////////////////////////////////////
+
+    Distributions27 dist;
+    getPointersToDistributions(dist, distributions, numberOfLBnodes, isEvenTimestep);
+
+    unsigned int KQK  = subgridDistanceIndices[k];
+    unsigned int kzero= KQK;
+    unsigned int ke   = KQK;
+    unsigned int kw   = neighborX[KQK];
+    unsigned int kn   = KQK;
+    unsigned int ks   = neighborY[KQK];
+    unsigned int kt   = KQK;
+    unsigned int kb   = neighborZ[KQK];
+    unsigned int ksw  = neighborY[kw];
+    unsigned int kne  = KQK;
+    unsigned int kse  = ks;
+    unsigned int knw  = kw;
+    unsigned int kbw  = neighborZ[kw];
+    unsigned int kte  = KQK;
+    unsigned int kbe  = kb;
+    unsigned int ktw  = kw;
+    unsigned int kbs  = neighborZ[ks];
+    unsigned int ktn  = KQK;
+    unsigned int kbn  = kb;
+    unsigned int kts  = ks;
+    unsigned int ktse = ks;
+    unsigned int kbnw = kbw;
+    unsigned int ktnw = kw;
+    unsigned int kbse = kbs;
+    unsigned int ktsw = ksw;
+    unsigned int kbne = kb;
+    unsigned int ktne = KQK;
+    unsigned int kbsw = neighborZ[ksw];
+
+    ////////////////////////////////////////////////////////////////////////////////
+    //! - Set local distributions
+    //!
+    real f_W    = (dist.f[DIR_P00   ])[ke   ];
+    real f_E    = (dist.f[DIR_M00   ])[kw   ];
+    real f_S    = (dist.f[DIR_0P0   ])[kn   ];
+    real f_N    = (dist.f[DIR_0M0   ])[ks   ];
+    real f_B    = (dist.f[DIR_00P   ])[kt   ];
+    real f_T    = (dist.f[DIR_00M   ])[kb   ];
+    real f_SW   = (dist.f[DIR_PP0  ])[kne  ];
+    real f_NE   = (dist.f[DIR_MM0  ])[ksw  ];
+    real f_NW   = (dist.f[DIR_PM0  ])[kse  ];
+    real f_SE   = (dist.f[DIR_MP0  ])[knw  ];
+    real f_BW   = (dist.f[DIR_P0P  ])[kte  ];
+    real f_TE   = (dist.f[DIR_M0M  ])[kbw  ];
+    real f_TW   = (dist.f[DIR_P0M  ])[kbe  ];
+    real f_BE   = (dist.f[DIR_M0P  ])[ktw  ];
+    real f_BS   = (dist.f[DIR_0PP  ])[ktn  ];
+    real f_TN   = (dist.f[DIR_0MM  ])[kbs  ];
+    real f_TS   = (dist.f[DIR_0PM  ])[kbn  ];
+    real f_BN   = (dist.f[DIR_0MP  ])[kts  ];
+    real f_BSW  = (dist.f[DIR_PPP ])[ktne ];
+    real f_BNE  = (dist.f[DIR_MMP ])[ktsw ];
+    real f_BNW  = (dist.f[DIR_PMP ])[ktse ];
+    real f_BSE  = (dist.f[DIR_MPP ])[ktnw ];
+    real f_TSW  = (dist.f[DIR_PPM ])[kbne ];
+    real f_TNE  = (dist.f[DIR_MMM ])[kbsw ];
+    real f_TNW  = (dist.f[DIR_PMM ])[kbse ];
+    real f_TSE  = (dist.f[DIR_MPM ])[kbnw ];
+
+    SubgridDistances27 subgridD;
+    getPointersToSubgridDistances(subgridD, subgridDistances, numberOfBCnodes);
+
+    //! - Calculate macroscopic quantities (drho as sum of all distributions; vx1, vx2, vx3 divided by (1 + drho))
+    real drho   =  f_TSE + f_TNW + f_TNE + f_TSW + f_BSE + f_BNW + f_BNE + f_BSW +
+                   f_BN + f_TS + f_TN + f_BS + f_BE + f_TW + f_TE + f_BW + f_SE + f_NW + f_NE + f_SW +
+                   f_T + f_B + f_N + f_S + f_E + f_W + ((dist.f[DIR_000])[kzero]);
+
+    real vx1    =  (((f_TSE - f_BNW) - (f_TNW - f_BSE)) + ((f_TNE - f_BSW) - (f_TSW - f_BNE)) +
+                    ((f_BE - f_TW)   + (f_TE - f_BW))   + ((f_SE - f_NW)   + (f_NE - f_SW)) +
+                    (f_E - f_W)) / (c1o1 + drho);
+
+    real vx2    =   ((-(f_TSE - f_BNW) + (f_TNW - f_BSE)) + ((f_TNE - f_BSW) - (f_TSW - f_BNE)) +
+                     ((f_BN - f_TS)   + (f_TN - f_BS))    + (-(f_SE - f_NW)  + (f_NE - f_SW)) +
+                     (f_N - f_S)) / (c1o1 + drho);
+
+    real vx3    =   (((f_TSE - f_BNW) + (f_TNW - f_BSE)) + ((f_TNE - f_BSW) + (f_TSW - f_BNE)) +
+                     (-(f_BN - f_TS)  + (f_TN - f_BS))   + ((f_TE - f_BW)   - (f_BE - f_TW)) +
+                     (f_T - f_B)) / (c1o1 + drho);
+
+    real cu_sq=c3o2*(vx1*vx1+vx2*vx2+vx3*vx3) * (c1o1 + drho);
+    //////////////////////////////////////////////////////////////////////////
+
+    ////////////////////////////////////////////////////////////////////////////////
+    //! - Update distributions with subgrid distance (q) between zero and one
+    real feq, q, velocityLB, velocityBC;
+    q = (subgridD.q[DIR_P00])[k];
+    if (q>=c0o1 && q<=c1o1) // only update distribution for q between zero and one
+    {
+        velocityLB = vx1;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c2o27);
+        velocityBC = VeloX;
+        (dist.f[DIR_M00])[kw] = getInterpolatedDistributionForVeloWithPressureBC(q, f_E, f_W, feq, omega, drho, velocityBC, c2o27);
+    }
+
+    q = (subgridD.q[DIR_M00])[k];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = -vx1;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c2o27);
+        velocityBC = -VeloX;
+        (dist.f[DIR_P00])[ke] = getInterpolatedDistributionForVeloWithPressureBC(q, f_W, f_E, feq, omega, drho, velocityBC, c2o27);
+    }
+
+    q = (subgridD.q[DIR_0P0])[k];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = vx2;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c2o27);
+        velocityBC = VeloY;
+        (dist.f[DIR_0M0])[ks] = getInterpolatedDistributionForVeloWithPressureBC(q, f_N, f_S, feq, omega, drho, velocityBC, c2o27);
+    }
+
+    q = (subgridD.q[DIR_0M0])[k];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = -vx2;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c2o27);
+        velocityBC = -VeloY;
+        (dist.f[DIR_0P0])[kn] = getInterpolatedDistributionForVeloWithPressureBC(q, f_S, f_N, feq, omega, drho, velocityBC, c2o27);
+    }
+
+    q = (subgridD.q[DIR_00P])[k];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = vx3;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c2o27);
+        velocityBC = VeloZ;
+        (dist.f[DIR_00M])[kb] = getInterpolatedDistributionForVeloWithPressureBC(q, f_T, f_B, feq, omega, drho, velocityBC, c2o27);
+    }
+
+    q = (subgridD.q[DIR_00M])[k];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = -vx3;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c2o27);
+        velocityBC = -VeloZ;
+        (dist.f[DIR_00P])[kt] = getInterpolatedDistributionForVeloWithPressureBC(q, f_B, f_T, feq, omega, drho, velocityBC, c2o27);
+    }
+
+    q = (subgridD.q[DIR_PP0])[k];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = vx1 + vx2;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
+        velocityBC = VeloX + VeloY;
+        (dist.f[DIR_MM0])[ksw] = getInterpolatedDistributionForVeloWithPressureBC(q, f_NE, f_SW, feq, omega, drho, velocityBC, c1o54);
+    }
+
+    q = (subgridD.q[DIR_MM0])[k];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = -vx1 - vx2;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
+        velocityBC = -VeloX - VeloY;
+        (dist.f[DIR_PP0])[kne] = getInterpolatedDistributionForVeloWithPressureBC(q, f_SW, f_NE, feq, omega, drho, velocityBC, c1o54);
+    }
+
+    q = (subgridD.q[DIR_PM0])[k];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = vx1 - vx2;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
+        velocityBC = VeloX - VeloY;
+        (dist.f[DIR_MP0])[knw] = getInterpolatedDistributionForVeloWithPressureBC(q, f_SE, f_NW, feq, omega, drho, velocityBC, c1o54);
+    }
+
+    q = (subgridD.q[DIR_MP0])[k];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = -vx1 + vx2;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
+        velocityBC = -VeloX + VeloY;
+        (dist.f[DIR_PM0])[kse] = getInterpolatedDistributionForVeloWithPressureBC(q, f_NW, f_SE, feq, omega, drho, velocityBC, c1o54);
+    }
+
+    q = (subgridD.q[DIR_P0P])[k];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = vx1 + vx3;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
+        velocityBC = VeloX + VeloZ;
+        (dist.f[DIR_M0M])[kbw] = getInterpolatedDistributionForVeloWithPressureBC(q, f_TE, f_BW, feq, omega, drho, velocityBC, c1o54);
+    }
+
+    q = (subgridD.q[DIR_M0M])[k];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = -vx1 - vx3;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
+        velocityBC = -VeloX - VeloZ;
+        (dist.f[DIR_P0P])[kte] = getInterpolatedDistributionForVeloWithPressureBC(q, f_BW, f_TE, feq, omega, drho, velocityBC, c1o54);
+    }
+
+    q = (subgridD.q[DIR_P0M])[k];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = vx1 - vx3;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
+        velocityBC = VeloX - VeloZ;
+        (dist.f[DIR_M0P])[ktw] = getInterpolatedDistributionForVeloWithPressureBC(q, f_BE, f_TW, feq, omega, drho, velocityBC, c1o54);
+    }
+
+    q = (subgridD.q[DIR_M0P])[k];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = -vx1 + vx3;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
+        velocityBC = -VeloX + VeloZ;
+        (dist.f[DIR_P0M])[kbe] = getInterpolatedDistributionForVeloWithPressureBC(q, f_TW, f_BE, feq, omega, drho, velocityBC, c1o54);
+    }
+
+    q = (subgridD.q[DIR_0PP])[k];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = vx2 + vx3;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
+        velocityBC = VeloY + VeloZ;
+        (dist.f[DIR_0MM])[kbs] = getInterpolatedDistributionForVeloWithPressureBC(q, f_TN, f_BS, feq, omega, drho, velocityBC, c1o54);
+    }
+
+    q = (subgridD.q[DIR_0MM])[k];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = -vx2 - vx3;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
+        velocityBC = -VeloY - VeloZ;
+        (dist.f[DIR_0PP])[ktn] = getInterpolatedDistributionForVeloWithPressureBC(q, f_BS, f_TN, feq, omega, drho, velocityBC, c1o54);
+    }
+
+    q = (subgridD.q[DIR_0PM])[k];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = vx2 - vx3;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
+        velocityBC = VeloY - VeloZ;
+        (dist.f[DIR_0MP])[kts] = getInterpolatedDistributionForVeloWithPressureBC(q, f_BN, f_TS, feq, omega, drho, velocityBC, c1o54);
+    }
+
+    q = (subgridD.q[DIR_0MP])[k];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = -vx2 + vx3;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
+        velocityBC = -VeloY + VeloZ;
+        (dist.f[DIR_0PM])[kbn] = getInterpolatedDistributionForVeloWithPressureBC(q, f_TS, f_BN, feq, omega, drho, velocityBC, c1o54);
+    }
+
+    q = (subgridD.q[DIR_PPP])[k];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = vx1 + vx2 + vx3;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o216);
+        velocityBC = VeloX + VeloY + VeloZ;
+        (dist.f[DIR_MMM])[kbsw] = getInterpolatedDistributionForVeloWithPressureBC(q, f_TNE, f_BSW, feq, omega, drho, velocityBC, c1o216);
+    }
+
+    q = (subgridD.q[DIR_MMM])[k];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = -vx1 - vx2 - vx3;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o216);
+        velocityBC = -VeloX - VeloY - VeloZ;
+        (dist.f[DIR_PPP])[ktne] = getInterpolatedDistributionForVeloWithPressureBC(q, f_BSW, f_TNE, feq, omega, drho, velocityBC, c1o216);
+    }
+
+    q = (subgridD.q[DIR_PPM])[k];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = vx1 + vx2 - vx3;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o216);
+        velocityBC = VeloX + VeloY - VeloZ;
+        (dist.f[DIR_MMP])[ktsw] = getInterpolatedDistributionForVeloWithPressureBC(q, f_BNE, f_TSW, feq, omega, drho, velocityBC, c1o216);
+    }
+
+    q = (subgridD.q[DIR_MMP])[k];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = -vx1 - vx2 + vx3;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o216);
+        velocityBC = -VeloX - VeloY + VeloZ;
+        (dist.f[DIR_PPM])[kbne] = getInterpolatedDistributionForVeloWithPressureBC(q, f_TSW, f_BNE, feq, omega, drho, velocityBC, c1o216);
+    }
+
+    q = (subgridD.q[DIR_PMP])[k];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = vx1 - vx2 + vx3;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o216);
+        velocityBC = VeloX - VeloY + VeloZ;
+        (dist.f[DIR_MPM])[kbnw] = getInterpolatedDistributionForVeloWithPressureBC(q, f_TSE, f_BNW, feq, omega, drho, velocityBC, c1o216);
+    }
+
+    q = (subgridD.q[DIR_MPM])[k];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = -vx1 + vx2 - vx3;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o216);
+        velocityBC = -VeloX + VeloY - VeloZ;
+        (dist.f[DIR_PMP])[ktse] = getInterpolatedDistributionForVeloWithPressureBC(q, f_BNW, f_TSE, feq, omega, drho, velocityBC, c1o216);
+    }
+
+    q = (subgridD.q[DIR_PMM])[k];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = vx1 - vx2 - vx3;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o216);
+        velocityBC = VeloX - VeloY - VeloZ;
+        (dist.f[DIR_MPP])[ktnw] = getInterpolatedDistributionForVeloWithPressureBC(q, f_BSE, f_TNW, feq, omega, drho, velocityBC, c1o216);
+    }
+
+    q = (subgridD.q[DIR_MPP])[k];
+    if (q>=c0o1 && q<=c1o1)
+    {
+        velocityLB = -vx1 + vx2 + vx3;
+        feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o216);
+        velocityBC = -VeloX + VeloY + VeloZ;
+        (dist.f[DIR_PMM])[kbse] = getInterpolatedDistributionForVeloWithPressureBC(q, f_TNW, f_BSE, feq, omega, drho, velocityBC, c1o216);
+    }
+}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+__global__ void PrecursorDeviceEQ27( 	int* subgridDistanceIndices,
+                                        int numberOfBCnodes,
+                                        int numberOfPrecursorNodes,
+                                        real omega,
+                                        real* distributions,
+                                        uint* neighborX, 
+                                        uint* neighborY, 
+                                        uint* neighborZ,
+                                        uint* neighborsNT, 
+                                        uint* neighborsNB,
+                                        uint* neighborsST,
+                                        uint* neighborsSB,
+                                        real* weightsNT, 
+                                        real* weightsNB,
+                                        real* weightsST,
+                                        real* weightsSB,
+                                        real* vLast, 
+                                        real* vCurrent,
+                                        real velocityX,
+                                        real velocityY,
+                                        real velocityZ,
+                                        real tRatio,
+                                        real velocityRatio,
+                                        unsigned long long numberOfLBnodes,
+                                        bool isEvenTimestep)
+{
+    const unsigned k = vf::gpu::getNodeIndex();
+
+    if(k>=numberOfBCnodes) return;
+
+    ////////////////////////////////////////////////////////////////////////////////
+    // interpolation of velocity
+    real vxLastInterpd, vyLastInterpd, vzLastInterpd; 
+    real vxNextInterpd, vyNextInterpd, vzNextInterpd; 
+
+    uint kNT = neighborsNT[k];
+    real dNT = weightsNT[k];
+
+    real* vxLast = vLast;
+    real* vyLast = &vLast[numberOfPrecursorNodes];
+    real* vzLast = &vLast[2*numberOfPrecursorNodes];
+
+    real* vxCurrent = vCurrent;
+    real* vyCurrent = &vCurrent[numberOfPrecursorNodes];
+    real* vzCurrent = &vCurrent[2*numberOfPrecursorNodes];
+
+    if(dNT < 1e6)
+    {
+        uint kNB = neighborsNB[k];
+        uint kST = neighborsST[k];
+        uint kSB = neighborsSB[k];
+
+        real dNB = weightsNB[k];
+        real dST = weightsST[k];
+        real dSB = weightsSB[k];
+
+        real invWeightSum = 1.f/(dNT+dNB+dST+dSB);
+
+        vxLastInterpd = (vxLast[kNT]*dNT + vxLast[kNB]*dNB + vxLast[kST]*dST + vxLast[kSB]*dSB)*invWeightSum;
+        vyLastInterpd = (vyLast[kNT]*dNT + vyLast[kNB]*dNB + vyLast[kST]*dST + vyLast[kSB]*dSB)*invWeightSum;
+        vzLastInterpd = (vzLast[kNT]*dNT + vzLast[kNB]*dNB + vzLast[kST]*dST + vzLast[kSB]*dSB)*invWeightSum;
+
+        vxNextInterpd = (vxCurrent[kNT]*dNT + vxCurrent[kNB]*dNB + vxCurrent[kST]*dST + vxCurrent[kSB]*dSB)*invWeightSum;
+        vyNextInterpd = (vyCurrent[kNT]*dNT + vyCurrent[kNB]*dNB + vyCurrent[kST]*dST + vyCurrent[kSB]*dSB)*invWeightSum;
+        vzNextInterpd = (vzCurrent[kNT]*dNT + vzCurrent[kNB]*dNB + vzCurrent[kST]*dST + vzCurrent[kSB]*dSB)*invWeightSum;
+    }
+    else
+    {
+        vxLastInterpd = vxLast[kNT];
+        vyLastInterpd = vyLast[kNT];
+        vzLastInterpd = vzLast[kNT];
+
+        vxNextInterpd = vxCurrent[kNT];
+        vyNextInterpd = vyCurrent[kNT];
+        vzNextInterpd = vzCurrent[kNT];
+    }
+
+    // if(k==16300) printf("%f %f %f\n", vxLastInterpd, vyLastInterpd, vzLastInterpd);
+    real VeloX = (velocityX + (1.f-tRatio)*vxLastInterpd + tRatio*vxNextInterpd)/velocityRatio;
+    real VeloY = (velocityY + (1.f-tRatio)*vyLastInterpd + tRatio*vyNextInterpd)/velocityRatio; 
+    real VeloZ = (velocityZ + (1.f-tRatio)*vzLastInterpd + tRatio*vzNextInterpd)/velocityRatio;
+    // From here on just a copy of QVelDeviceCompZeroPress
+    ////////////////////////////////////////////////////////////////////////////////
+
+    Distributions27 dist;
+    getPointersToDistributions(dist, distributions, numberOfLBnodes, isEvenTimestep);
+
+    unsigned int KQK  = subgridDistanceIndices[k];
+    unsigned int kzero= KQK;
+    unsigned int ke   = KQK;
+    unsigned int kw   = neighborX[KQK];
+    unsigned int kn   = KQK;
+    unsigned int ks   = neighborY[KQK];
+    unsigned int kt   = KQK;
+    unsigned int kb   = neighborZ[KQK];
+    unsigned int ksw  = neighborY[kw];
+    unsigned int kne  = KQK;
+    unsigned int kse  = ks;
+    unsigned int knw  = kw;
+    unsigned int kbw  = neighborZ[kw];
+    unsigned int kte  = KQK;
+    unsigned int kbe  = kb;
+    unsigned int ktw  = kw;
+    unsigned int kbs  = neighborZ[ks];
+    unsigned int ktn  = KQK;
+    unsigned int kbn  = kb;
+    unsigned int kts  = ks;
+    unsigned int ktse = ks;
+    unsigned int kbnw = kbw;
+    unsigned int ktnw = kw;
+    unsigned int kbse = kbs;
+    unsigned int ktsw = ksw;
+    unsigned int kbne = kb;
+    unsigned int ktne = KQK;
+    unsigned int kbsw = neighborZ[ksw];
+
+    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    // based on BGK Plus Comp
+    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    real f_W    = (dist.f[DIR_P00])[ke   ];
+    real f_E    = (dist.f[DIR_M00])[kw   ];
+    real f_S    = (dist.f[DIR_0P0])[kn   ];
+    real f_N    = (dist.f[DIR_0M0])[ks   ];
+    real f_B    = (dist.f[DIR_00P])[kt   ];
+    real f_T    = (dist.f[DIR_00M])[kb   ];
+    real f_SW   = (dist.f[DIR_PP0])[kne  ];
+    real f_NE   = (dist.f[DIR_MM0])[ksw  ];
+    real f_NW   = (dist.f[DIR_PM0])[kse  ];
+    real f_SE   = (dist.f[DIR_MP0])[knw  ];
+    real f_BW   = (dist.f[DIR_P0P])[kte  ];
+    real f_TE   = (dist.f[DIR_M0M])[kbw  ];
+    real f_TW   = (dist.f[DIR_P0M])[kbe  ];
+    real f_BE   = (dist.f[DIR_M0P])[ktw  ];
+    real f_BS   = (dist.f[DIR_0PP])[ktn  ];
+    real f_TN   = (dist.f[DIR_0MM])[kbs  ];
+    real f_TS   = (dist.f[DIR_0PM])[kbn  ];
+    real f_BN   = (dist.f[DIR_0MP])[kts  ];
+    real f_ZERO = (dist.f[DIR_000])[kzero];
+    real f_BSW  = (dist.f[DIR_PPP])[ktne ];
+    real f_BNE  = (dist.f[DIR_MMP])[ktsw ];
+    real f_BNW  = (dist.f[DIR_PMP])[ktse ];
+    real f_BSE  = (dist.f[DIR_MPP])[ktnw ];
+    real f_TSW  = (dist.f[DIR_PPM])[kbne ];
+    real f_TNE  = (dist.f[DIR_MMM])[kbsw ];
+    real f_TNW  = (dist.f[DIR_PMM])[kbse ];
+    real f_TSE  = (dist.f[DIR_MPM])[kbnw ];
+
+      ////////////////////////////////////////////////////////////////////////////////
+      //! - Set macroscopic quantities
+      //!
+      real drho = c0o1;
+
+      real vx1  = VeloX;          
+
+      real vx2  = VeloY; 
+
+      real vx3  = VeloZ; 
+
+      real cusq = c3o2 * (vx1 * vx1 + vx2 * vx2 + vx3 * vx3);
+
+      ////////////////////////////////////////////////////////////////////////////////
+      f_ZERO  = c8o27*  (drho-(drho+c1o1)*cusq);
+      f_E     = c2o27*  (drho+(drho+c1o1)*(c3o1*( vx1        )+c9o2*( vx1        )*( vx1        )-cusq));
+      f_W     = c2o27*  (drho+(drho+c1o1)*(c3o1*(-vx1        )+c9o2*(-vx1        )*(-vx1        )-cusq));
+      f_N     = c2o27*  (drho+(drho+c1o1)*(c3o1*(    vx2     )+c9o2*(     vx2    )*(     vx2    )-cusq));
+      f_S     = c2o27*  (drho+(drho+c1o1)*(c3o1*(   -vx2     )+c9o2*(    -vx2    )*(    -vx2    )-cusq));
+      f_T     = c2o27*  (drho+(drho+c1o1)*(c3o1*(         vx3)+c9o2*(         vx3)*(         vx3)-cusq));
+      f_B     = c2o27*  (drho+(drho+c1o1)*(c3o1*(        -vx3)+c9o2*(        -vx3)*(        -vx3)-cusq));
+      f_NE    = c1o54*  (drho+(drho+c1o1)*(c3o1*( vx1+vx2    )+c9o2*( vx1+vx2    )*( vx1+vx2    )-cusq));
+      f_SW    = c1o54*  (drho+(drho+c1o1)*(c3o1*(-vx1-vx2    )+c9o2*(-vx1-vx2    )*(-vx1-vx2    )-cusq));
+      f_SE    =  c1o54* (drho+(drho+c1o1)*(c3o1*( vx1-vx2    )+c9o2*( vx1-vx2    )*( vx1-vx2    )-cusq));
+      f_NW    =  c1o54* (drho+(drho+c1o1)*(c3o1*(-vx1+vx2    )+c9o2*(-vx1+vx2    )*(-vx1+vx2    )-cusq));
+      f_TE    =  c1o54* (drho+(drho+c1o1)*(c3o1*( vx1    +vx3)+c9o2*( vx1    +vx3)*( vx1    +vx3)-cusq));
+      f_BW    =  c1o54* (drho+(drho+c1o1)*(c3o1*(-vx1    -vx3)+c9o2*(-vx1    -vx3)*(-vx1    -vx3)-cusq));
+      f_BE    =  c1o54* (drho+(drho+c1o1)*(c3o1*( vx1    -vx3)+c9o2*( vx1    -vx3)*( vx1    -vx3)-cusq));
+      f_TW    =  c1o54* (drho+(drho+c1o1)*(c3o1*(-vx1    +vx3)+c9o2*(-vx1    +vx3)*(-vx1    +vx3)-cusq));
+      f_TN    =  c1o54* (drho+(drho+c1o1)*(c3o1*(     vx2+vx3)+c9o2*(     vx2+vx3)*(     vx2+vx3)-cusq));
+      f_BS    =  c1o54* (drho+(drho+c1o1)*(c3o1*(    -vx2-vx3)+c9o2*(    -vx2-vx3)*(    -vx2-vx3)-cusq));
+      f_BN    =  c1o54* (drho+(drho+c1o1)*(c3o1*(     vx2-vx3)+c9o2*(     vx2-vx3)*(     vx2-vx3)-cusq));
+      f_TS    =  c1o54* (drho+(drho+c1o1)*(c3o1*(    -vx2+vx3)+c9o2*(    -vx2+vx3)*(    -vx2+vx3)-cusq));
+      f_TNE   =  c1o216*(drho+(drho+c1o1)*(c3o1*( vx1+vx2+vx3)+c9o2*( vx1+vx2+vx3)*( vx1+vx2+vx3)-cusq));
+      f_BSW   =  c1o216*(drho+(drho+c1o1)*(c3o1*(-vx1-vx2-vx3)+c9o2*(-vx1-vx2-vx3)*(-vx1-vx2-vx3)-cusq));
+      f_BNE   =  c1o216*(drho+(drho+c1o1)*(c3o1*( vx1+vx2-vx3)+c9o2*( vx1+vx2-vx3)*( vx1+vx2-vx3)-cusq));
+      f_TSW   =  c1o216*(drho+(drho+c1o1)*(c3o1*(-vx1-vx2+vx3)+c9o2*(-vx1-vx2+vx3)*(-vx1-vx2+vx3)-cusq));
+      f_TSE   =  c1o216*(drho+(drho+c1o1)*(c3o1*( vx1-vx2+vx3)+c9o2*( vx1-vx2+vx3)*( vx1-vx2+vx3)-cusq));
+      f_BNW   =  c1o216*(drho+(drho+c1o1)*(c3o1*(-vx1+vx2-vx3)+c9o2*(-vx1+vx2-vx3)*(-vx1+vx2-vx3)-cusq));
+      f_BSE   =  c1o216*(drho+(drho+c1o1)*(c3o1*( vx1-vx2-vx3)+c9o2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cusq));
+      f_TNW   =  c1o216*(drho+(drho+c1o1)*(c3o1*(-vx1+vx2+vx3)+c9o2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cusq));
+
+      ////////////////////////////////////////////////////////////////////////////////
+      //! write the new distributions to the bc nodes
+      //!
+      (dist.f[DIR_P00   ])[ke  ] = f_W   ;
+      (dist.f[DIR_M00   ])[kw  ] = f_E   ;
+      (dist.f[DIR_0P0   ])[kn  ] = f_S   ;
+      (dist.f[DIR_0M0   ])[ks  ] = f_N   ;
+      (dist.f[DIR_00P   ])[kt  ] = f_B   ;
+      (dist.f[DIR_00M   ])[kb  ] = f_T   ;
+      (dist.f[DIR_PP0  ])[kne  ] = f_SW  ;
+      (dist.f[DIR_MM0  ])[ksw  ] = f_NE  ;
+      (dist.f[DIR_PM0  ])[kse  ] = f_NW  ;
+      (dist.f[DIR_MP0  ])[knw  ] = f_SE  ;
+      (dist.f[DIR_P0P  ])[kte  ] = f_BW  ;
+      (dist.f[DIR_M0M  ])[kbw  ] = f_TE  ;
+      (dist.f[DIR_P0M  ])[kbe  ] = f_TW  ;
+      (dist.f[DIR_M0P  ])[ktw  ] = f_BE  ;
+      (dist.f[DIR_0PP  ])[ktn  ] = f_BS  ;
+      (dist.f[DIR_0MM  ])[kbs  ] = f_TN  ;
+      (dist.f[DIR_0PM  ])[kbn  ] = f_TS  ;
+      (dist.f[DIR_0MP  ])[kts  ] = f_BN  ;
+      (dist.f[DIR_000])[kzero] = f_ZERO;
+      (dist.f[DIR_PPP ])[ktne ] = f_BSW ;
+      (dist.f[DIR_MMP ])[ktsw ] = f_BNE ;
+      (dist.f[DIR_PMP ])[ktse ] = f_BNW ;
+      (dist.f[DIR_MPP ])[ktnw ] = f_BSE ;
+      (dist.f[DIR_PPM ])[kbne ] = f_TSW ;
+      (dist.f[DIR_MMM ])[kbsw ] = f_TNE ;
+      (dist.f[DIR_PMM ])[kbse ] = f_TNW ;
+      (dist.f[DIR_MPM ])[kbnw ] = f_TSE ;
+}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//! Overwrites nine distributions per boundary node with values taken from fsLast/fsNext:
+//! interpolated over up to four stencil entries in space, then blended linearly in time by tRatio.
+__global__ void PrecursorDeviceDistributions( 	int* subgridDistanceIndices,
+												int numberOfBCnodes,
+                                                int numberOfPrecursorNodes,
+												real* distributions,
+												uint* neighborX, 
+												uint* neighborY, 
+												uint* neighborZ,
+												uint* neighborsNT, 
+												uint* neighborsNB,
+												uint* neighborsST,
+												uint* neighborsSB,
+												real* weightsNT, 
+												real* weightsNB,
+												real* weightsST,
+												real* weightsSB,
+												real* fsLast, 
+												real* fsNext,
+												real tRatio,
+												unsigned long long numberOfLBnodes,
+												bool isEvenTimestep)
+{
+    const unsigned k = vf::gpu::getNodeIndex();
+    // k indexes the per-boundary-node arrays; indices at or past numberOfBCnodes have no work.
+    if(k>=numberOfBCnodes) return;
+    // NT stencil entry; its weight dNT also selects the interpolation branch further down.
+    uint kNT = neighborsNT[k];
+    real dNT = weightsNT[k];
+    // Spatially interpolated values of the nine stored distributions at both time levels.
+    real f0LastInterp, f1LastInterp, f2LastInterp, f3LastInterp, f4LastInterp, f5LastInterp, f6LastInterp, f7LastInterp, f8LastInterp;
+    real f0NextInterp, f1NextInterp, f2NextInterp, f3NextInterp, f4NextInterp, f5NextInterp, f6NextInterp, f7NextInterp, f8NextInterp;
+    // fsLast/fsNext are addressed as nine consecutive blocks of numberOfPrecursorNodes reals.
+    real* f0Last = fsLast;
+    real* f1Last = &fsLast[  numberOfPrecursorNodes];
+    real* f2Last = &fsLast[2*numberOfPrecursorNodes];
+    real* f3Last = &fsLast[3*numberOfPrecursorNodes];
+    real* f4Last = &fsLast[4*numberOfPrecursorNodes];
+    real* f5Last = &fsLast[5*numberOfPrecursorNodes];
+    real* f6Last = &fsLast[6*numberOfPrecursorNodes];
+    real* f7Last = &fsLast[7*numberOfPrecursorNodes];
+    real* f8Last = &fsLast[8*numberOfPrecursorNodes];
+
+    real* f0Next = fsNext;
+    real* f1Next = &fsNext[  numberOfPrecursorNodes];
+    real* f2Next = &fsNext[2*numberOfPrecursorNodes];
+    real* f3Next = &fsNext[3*numberOfPrecursorNodes];
+    real* f4Next = &fsNext[4*numberOfPrecursorNodes];
+    real* f5Next = &fsNext[5*numberOfPrecursorNodes];
+    real* f6Next = &fsNext[6*numberOfPrecursorNodes];
+    real* f7Next = &fsNext[7*numberOfPrecursorNodes];
+    real* f8Next = &fsNext[8*numberOfPrecursorNodes];
+
+    // NOTE(review): dNT >= 1e6 presumably flags a node coinciding with its NT precursor point - confirm.
+    if(dNT<1e6)
+    {
+        uint kNB = neighborsNB[k];
+        uint kST = neighborsST[k];
+        uint kSB = neighborsSB[k];
+
+        real dNB = weightsNB[k];
+        real dST = weightsST[k];
+        real dSB = weightsSB[k];
+
+        real invWeightSum = 1.f/(dNT+dNB+dST+dSB);
+        // Weighted average over the four stencil entries, normalised by the sum of the weights.
+        f0LastInterp = (f0Last[kNT]*dNT + f0Last[kNB]*dNB + f0Last[kST]*dST + f0Last[kSB]*dSB)*invWeightSum;
+        f0NextInterp = (f0Next[kNT]*dNT + f0Next[kNB]*dNB + f0Next[kST]*dST + f0Next[kSB]*dSB)*invWeightSum;
+        
+        f1LastInterp = (f1Last[kNT]*dNT + f1Last[kNB]*dNB + f1Last[kST]*dST + f1Last[kSB]*dSB)*invWeightSum;
+        f1NextInterp = (f1Next[kNT]*dNT + f1Next[kNB]*dNB + f1Next[kST]*dST + f1Next[kSB]*dSB)*invWeightSum;
+        
+        f2LastInterp = (f2Last[kNT]*dNT + f2Last[kNB]*dNB + f2Last[kST]*dST + f2Last[kSB]*dSB)*invWeightSum;
+        f2NextInterp = (f2Next[kNT]*dNT + f2Next[kNB]*dNB + f2Next[kST]*dST + f2Next[kSB]*dSB)*invWeightSum;
+        
+        f3LastInterp = (f3Last[kNT]*dNT + f3Last[kNB]*dNB + f3Last[kST]*dST + f3Last[kSB]*dSB)*invWeightSum;
+        f3NextInterp = (f3Next[kNT]*dNT + f3Next[kNB]*dNB + f3Next[kST]*dST + f3Next[kSB]*dSB)*invWeightSum;
+        
+        f4LastInterp = (f4Last[kNT]*dNT + f4Last[kNB]*dNB + f4Last[kST]*dST + f4Last[kSB]*dSB)*invWeightSum;
+        f4NextInterp = (f4Next[kNT]*dNT + f4Next[kNB]*dNB + f4Next[kST]*dST + f4Next[kSB]*dSB)*invWeightSum;
+        
+        f5LastInterp = (f5Last[kNT]*dNT + f5Last[kNB]*dNB + f5Last[kST]*dST + f5Last[kSB]*dSB)*invWeightSum;
+        f5NextInterp = (f5Next[kNT]*dNT + f5Next[kNB]*dNB + f5Next[kST]*dST + f5Next[kSB]*dSB)*invWeightSum;
+        
+        f6LastInterp = (f6Last[kNT]*dNT + f6Last[kNB]*dNB + f6Last[kST]*dST + f6Last[kSB]*dSB)*invWeightSum;
+        f6NextInterp = (f6Next[kNT]*dNT + f6Next[kNB]*dNB + f6Next[kST]*dST + f6Next[kSB]*dSB)*invWeightSum;
+        
+        f7LastInterp = (f7Last[kNT]*dNT + f7Last[kNB]*dNB + f7Last[kST]*dST + f7Last[kSB]*dSB)*invWeightSum;
+        f7NextInterp = (f7Next[kNT]*dNT + f7Next[kNB]*dNB + f7Next[kST]*dST + f7Next[kSB]*dSB)*invWeightSum;
+        
+        f8LastInterp = (f8Last[kNT]*dNT + f8Last[kNB]*dNB + f8Last[kST]*dST + f8Last[kSB]*dSB)*invWeightSum;
+        f8NextInterp = (f8Next[kNT]*dNT + f8Next[kNB]*dNB + f8Next[kST]*dST + f8Next[kSB]*dSB)*invWeightSum;
+    
+    } else { // no spatial averaging: the NT entry is used unmodified
+        f0LastInterp = f0Last[kNT];
+        f1LastInterp = f1Last[kNT];
+        f2LastInterp = f2Last[kNT];
+        f3LastInterp = f3Last[kNT];
+        f4LastInterp = f4Last[kNT];
+        f5LastInterp = f5Last[kNT];
+        f6LastInterp = f6Last[kNT];
+        f7LastInterp = f7Last[kNT];
+        f8LastInterp = f8Last[kNT];
+
+        f0NextInterp = f0Next[kNT];
+        f1NextInterp = f1Next[kNT];
+        f2NextInterp = f2Next[kNT];
+        f3NextInterp = f3Next[kNT];
+        f4NextInterp = f4Next[kNT];
+        f5NextInterp = f5Next[kNT];
+        f6NextInterp = f6Next[kNT];
+        f7NextInterp = f7Next[kNT];
+        f8NextInterp = f8Next[kNT];
+    }
+    Distributions27 dist;
+    getPointersToDistributions(dist, distributions, numberOfLBnodes, isEvenTimestep);
+    // Node indices used by the writes below: the node itself (KQK) and its neighborY/neighborZ links.
+    unsigned int KQK  = subgridDistanceIndices[k];
+    // unsigned int kzero= KQK;
+    unsigned int ke   = KQK;
+    // unsigned int kw   = neighborX[KQK];
+    // unsigned int kn   = KQK;
+    unsigned int ks   = neighborY[KQK];
+    // unsigned int kt   = KQK;
+    unsigned int kb   = neighborZ[KQK];
+    // unsigned int ksw  = neighborY[kw];
+    unsigned int kne  = KQK;
+    unsigned int kse  = ks;
+    // unsigned int knw  = kw;
+    // unsigned int kbw  = neighborZ[kw];
+    unsigned int kte  = KQK;
+    unsigned int kbe  = kb;
+    // unsigned int ktw  = kw;
+    unsigned int kbs  = neighborZ[ks];
+    // unsigned int ktn  = KQK;
+    // unsigned int kbn  = kb;
+    // unsigned int kts  = ks;
+    unsigned int ktse = ks;
+    // unsigned int kbnw = kbw;
+    // unsigned int ktnw = kw;
+    unsigned int kbse = kbs;
+    // unsigned int ktsw = ksw;
+    unsigned int kbne = kb;
+    unsigned int ktne = KQK;
+    // unsigned int kbsw = neighborZ[ksw];
+    // Blend both time levels with tRatio; only the nine DIR_P** slots are written, neighborX stays unused.
+    dist.f[DIR_P00][ke]   = f0LastInterp*(1.f-tRatio) + f0NextInterp*tRatio;
+    dist.f[DIR_PP0][kne]  = f1LastInterp*(1.f-tRatio) + f1NextInterp*tRatio;
+    dist.f[DIR_PM0][kse]  = f2LastInterp*(1.f-tRatio) + f2NextInterp*tRatio;
+    dist.f[DIR_P0P][kte]  = f3LastInterp*(1.f-tRatio) + f3NextInterp*tRatio;
+    dist.f[DIR_P0M][kbe]  = f4LastInterp*(1.f-tRatio) + f4NextInterp*tRatio;
+    dist.f[DIR_PPP][ktne] = f5LastInterp*(1.f-tRatio) + f5NextInterp*tRatio;
+    dist.f[DIR_PMP][ktse] = f6LastInterp*(1.f-tRatio) + f6NextInterp*tRatio;
+    dist.f[DIR_PPM][kbne] = f7LastInterp*(1.f-tRatio) + f7NextInterp*tRatio;
+    dist.f[DIR_PMM][kbse] = f8LastInterp*(1.f-tRatio) + f8NextInterp*tRatio;
+}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//! Like PrecursorDeviceDistributions, but a direction is written only if its subgrid distance q is in [c0o1, c1o1].
+__global__ void QPrecursorDeviceDistributions( 	int* subgridDistanceIndices,
+                                                real* subgridDistances,
+                                                int sizeQ,
+												int numberOfBCnodes,
+                                                int numberOfPrecursorNodes,
+												real* distributions,
+												uint* neighborX, 
+												uint* neighborY, 
+												uint* neighborZ,
+												uint* neighborsNT, 
+												uint* neighborsNB,
+												uint* neighborsST,
+												uint* neighborsSB,
+												real* weightsNT, 
+												real* weightsNB,
+												real* weightsST,
+												real* weightsSB,
+												real* fsLast, 
+												real* fsNext,
+												real tRatio,
+												unsigned long long numberOfLBnodes,
+												bool isEvenTimestep)
+{
+    const unsigned k = vf::gpu::getNodeIndex();
+    // k indexes the per-boundary-node arrays; indices at or past numberOfBCnodes have no work.
+    if(k>=numberOfBCnodes) return;
+    // NT stencil entry; its weight dNT also selects the interpolation branch further down.
+    uint kNT = neighborsNT[k];
+    real dNT = weightsNT[k];
+    // Spatially interpolated values of the nine stored distributions at both time levels.
+    real f0LastInterp, f1LastInterp, f2LastInterp, f3LastInterp, f4LastInterp, f5LastInterp, f6LastInterp, f7LastInterp, f8LastInterp;
+    real f0NextInterp, f1NextInterp, f2NextInterp, f3NextInterp, f4NextInterp, f5NextInterp, f6NextInterp, f7NextInterp, f8NextInterp;
+    // fsLast/fsNext are addressed as nine consecutive blocks of numberOfPrecursorNodes reals.
+    real* f0Last = fsLast;
+    real* f1Last = &fsLast[  numberOfPrecursorNodes];
+    real* f2Last = &fsLast[2*numberOfPrecursorNodes];
+    real* f3Last = &fsLast[3*numberOfPrecursorNodes];
+    real* f4Last = &fsLast[4*numberOfPrecursorNodes];
+    real* f5Last = &fsLast[5*numberOfPrecursorNodes];
+    real* f6Last = &fsLast[6*numberOfPrecursorNodes];
+    real* f7Last = &fsLast[7*numberOfPrecursorNodes];
+    real* f8Last = &fsLast[8*numberOfPrecursorNodes];
+
+    real* f0Next = fsNext;
+    real* f1Next = &fsNext[  numberOfPrecursorNodes];
+    real* f2Next = &fsNext[2*numberOfPrecursorNodes];
+    real* f3Next = &fsNext[3*numberOfPrecursorNodes];
+    real* f4Next = &fsNext[4*numberOfPrecursorNodes];
+    real* f5Next = &fsNext[5*numberOfPrecursorNodes];
+    real* f6Next = &fsNext[6*numberOfPrecursorNodes];
+    real* f7Next = &fsNext[7*numberOfPrecursorNodes];
+    real* f8Next = &fsNext[8*numberOfPrecursorNodes];
+
+    // NOTE(review): dNT >= 1e6 presumably flags a node coinciding with its NT precursor point - confirm.
+    if(dNT<1e6)
+    {
+        uint kNB = neighborsNB[k];
+        uint kST = neighborsST[k];
+        uint kSB = neighborsSB[k];
+
+        real dNB = weightsNB[k];
+        real dST = weightsST[k];
+        real dSB = weightsSB[k];
+
+        real invWeightSum = 1.f/(dNT+dNB+dST+dSB);
+        // Weighted average over the four stencil entries, normalised by the sum of the weights.
+        f0LastInterp = (f0Last[kNT]*dNT + f0Last[kNB]*dNB + f0Last[kST]*dST + f0Last[kSB]*dSB)*invWeightSum;
+        f0NextInterp = (f0Next[kNT]*dNT + f0Next[kNB]*dNB + f0Next[kST]*dST + f0Next[kSB]*dSB)*invWeightSum;
+        
+        f1LastInterp = (f1Last[kNT]*dNT + f1Last[kNB]*dNB + f1Last[kST]*dST + f1Last[kSB]*dSB)*invWeightSum;
+        f1NextInterp = (f1Next[kNT]*dNT + f1Next[kNB]*dNB + f1Next[kST]*dST + f1Next[kSB]*dSB)*invWeightSum;
+        
+        f2LastInterp = (f2Last[kNT]*dNT + f2Last[kNB]*dNB + f2Last[kST]*dST + f2Last[kSB]*dSB)*invWeightSum;
+        f2NextInterp = (f2Next[kNT]*dNT + f2Next[kNB]*dNB + f2Next[kST]*dST + f2Next[kSB]*dSB)*invWeightSum;
+        
+        f3LastInterp = (f3Last[kNT]*dNT + f3Last[kNB]*dNB + f3Last[kST]*dST + f3Last[kSB]*dSB)*invWeightSum;
+        f3NextInterp = (f3Next[kNT]*dNT + f3Next[kNB]*dNB + f3Next[kST]*dST + f3Next[kSB]*dSB)*invWeightSum;
+        
+        f4LastInterp = (f4Last[kNT]*dNT + f4Last[kNB]*dNB + f4Last[kST]*dST + f4Last[kSB]*dSB)*invWeightSum;
+        f4NextInterp = (f4Next[kNT]*dNT + f4Next[kNB]*dNB + f4Next[kST]*dST + f4Next[kSB]*dSB)*invWeightSum;
+        
+        f5LastInterp = (f5Last[kNT]*dNT + f5Last[kNB]*dNB + f5Last[kST]*dST + f5Last[kSB]*dSB)*invWeightSum;
+        f5NextInterp = (f5Next[kNT]*dNT + f5Next[kNB]*dNB + f5Next[kST]*dST + f5Next[kSB]*dSB)*invWeightSum;
+        
+        f6LastInterp = (f6Last[kNT]*dNT + f6Last[kNB]*dNB + f6Last[kST]*dST + f6Last[kSB]*dSB)*invWeightSum;
+        f6NextInterp = (f6Next[kNT]*dNT + f6Next[kNB]*dNB + f6Next[kST]*dST + f6Next[kSB]*dSB)*invWeightSum;
+        
+        f7LastInterp = (f7Last[kNT]*dNT + f7Last[kNB]*dNB + f7Last[kST]*dST + f7Last[kSB]*dSB)*invWeightSum;
+        f7NextInterp = (f7Next[kNT]*dNT + f7Next[kNB]*dNB + f7Next[kST]*dST + f7Next[kSB]*dSB)*invWeightSum;
+        
+        f8LastInterp = (f8Last[kNT]*dNT + f8Last[kNB]*dNB + f8Last[kST]*dST + f8Last[kSB]*dSB)*invWeightSum;
+        f8NextInterp = (f8Next[kNT]*dNT + f8Next[kNB]*dNB + f8Next[kST]*dST + f8Next[kSB]*dSB)*invWeightSum;
+    
+    } else { // no spatial averaging: the NT entry is used unmodified
+        f0LastInterp = f0Last[kNT];
+        f1LastInterp = f1Last[kNT];
+        f2LastInterp = f2Last[kNT];
+        f3LastInterp = f3Last[kNT];
+        f4LastInterp = f4Last[kNT];
+        f5LastInterp = f5Last[kNT];
+        f6LastInterp = f6Last[kNT];
+        f7LastInterp = f7Last[kNT];
+        f8LastInterp = f8Last[kNT];
+
+        f0NextInterp = f0Next[kNT];
+        f1NextInterp = f1Next[kNT];
+        f2NextInterp = f2Next[kNT];
+        f3NextInterp = f3Next[kNT];
+        f4NextInterp = f4Next[kNT];
+        f5NextInterp = f5Next[kNT];
+        f6NextInterp = f6Next[kNT];
+        f7NextInterp = f7Next[kNT];
+        f8NextInterp = f8Next[kNT];
+    }
+    Distributions27 dist;
+    getPointersToDistributions(dist, distributions, numberOfLBnodes, isEvenTimestep);
+    // Node indices used by the writes below: the node itself (KQK) and its neighborY/neighborZ links.
+    unsigned int KQK  = subgridDistanceIndices[k];
+    // unsigned int kzero= KQK;
+    unsigned int ke   = KQK;
+    // unsigned int kw   = neighborX[KQK];
+    // unsigned int kn   = KQK;
+    unsigned int ks   = neighborY[KQK];
+    // unsigned int kt   = KQK;
+    unsigned int kb   = neighborZ[KQK];
+    // unsigned int ksw  = neighborY[kw];
+    unsigned int kne  = KQK;
+    unsigned int kse  = ks;
+    // unsigned int knw  = kw;
+    // unsigned int kbw  = neighborZ[kw];
+    unsigned int kte  = KQK;
+    unsigned int kbe  = kb;
+    // unsigned int ktw  = kw;
+    unsigned int kbs  = neighborZ[ks];
+    // unsigned int ktn  = KQK;
+    // unsigned int kbn  = kb;
+    // unsigned int kts  = ks;
+    unsigned int ktse = ks;
+    // unsigned int kbnw = kbw;
+    // unsigned int ktnw = kw;
+    unsigned int kbse = kbs;
+    // unsigned int ktsw = ksw;
+    unsigned int kbne = kb;
+    unsigned int ktne = KQK;
+    // unsigned int kbsw = neighborZ[ksw];
+    SubgridDistances27 qs;
+    getPointersToSubgridDistances(qs, subgridDistances, sizeQ);
+    // Per direction: read q for boundary node k and write the tRatio blend only if c0o1 <= q <= c1o1.
+    real q;
+    q = qs.q[DIR_P00][k]; if(q>= c0o1 && q <= c1o1) dist.f[DIR_P00][ke]   = f0LastInterp*(1.f-tRatio) + f0NextInterp*tRatio;
+    q = qs.q[DIR_PP0][k]; if(q>= c0o1 && q <= c1o1) dist.f[DIR_PP0][kne]  = f1LastInterp*(1.f-tRatio) + f1NextInterp*tRatio;
+    q = qs.q[DIR_PM0][k]; if(q>= c0o1 && q <= c1o1) dist.f[DIR_PM0][kse]  = f2LastInterp*(1.f-tRatio) + f2NextInterp*tRatio;
+    q = qs.q[DIR_P0P][k]; if(q>= c0o1 && q <= c1o1) dist.f[DIR_P0P][kte]  = f3LastInterp*(1.f-tRatio) + f3NextInterp*tRatio;
+    q = qs.q[DIR_P0M][k]; if(q>= c0o1 && q <= c1o1) dist.f[DIR_P0M][kbe]  = f4LastInterp*(1.f-tRatio) + f4NextInterp*tRatio;
+    q = qs.q[DIR_PPP][k]; if(q>= c0o1 && q <= c1o1) dist.f[DIR_PPP][ktne] = f5LastInterp*(1.f-tRatio) + f5NextInterp*tRatio;
+    q = qs.q[DIR_PMP][k]; if(q>= c0o1 && q <= c1o1) dist.f[DIR_PMP][ktse] = f6LastInterp*(1.f-tRatio) + f6NextInterp*tRatio;
+    q = qs.q[DIR_PPM][k]; if(q>= c0o1 && q <= c1o1) dist.f[DIR_PPM][kbne] = f7LastInterp*(1.f-tRatio) + f7NextInterp*tRatio;
+    q = qs.q[DIR_PMM][k]; if(q>= c0o1 && q <= c1o1) dist.f[DIR_PMM][kbse] = f8LastInterp*(1.f-tRatio) + f8NextInterp*tRatio;
+
+}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/src/gpu/VirtualFluids_GPU/GPU/PressBCs27.cu b/src/gpu/VirtualFluids_GPU/GPU/PressBCs27.cu
index ccb2ce79c63515e59e4f9ae75016f44ced71a170..29e82196bdc2a22f03306b97a1ffd1bb6d5bc8a4 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/PressBCs27.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/PressBCs27.cu
@@ -2,6 +2,9 @@
 #include "LBM/LB.h" 
 #include "lbm/constants/D3Q27.h"
 #include "lbm/constants/NumericConstants.h"
+#include "lbm/MacroscopicQuantities.h"
+#include "Kernel/Utilities/DistributionHelper.cuh"
+
 #include "KernelUtilities.h"
 
 using namespace vf::lbm::constant;
@@ -2793,12 +2796,14 @@ __global__ void QPressDeviceDirDepBot27(  real* rhoBC,
 
 
 
-
-
+__host__ __device__ real computeOutflowDistribution(const real* const &f, const real* const &f1, const int dir, const real cs)
+{  // Returns the blend of entry dir from both arrays: cs weights f1[dir], (c1o1 - cs) weights f[dir].
+   return f1[dir] * cs + (c1o1 - cs) * f[dir];
+}
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-__global__ void QPressNoRhoDevice27(  real* rhoBC,
-												 real* DD, 
+__global__ void QPressNoRhoDevice27( real* rhoBC,
+												 real* distributions, 
 												 int* k_Q, 
 												 int* k_N, 
 												 int numberOfBCnodes, 
@@ -2806,238 +2811,176 @@ __global__ void QPressNoRhoDevice27(  real* rhoBC,
 												 unsigned int* neighborX,
 												 unsigned int* neighborY,
 												 unsigned int* neighborZ,
-												 unsigned int size_Mat, 
-												 bool isEvenTimestep)
+												 unsigned int numberOfLBnodes, 
+												 bool isEvenTimestep,
+                                     int direction)
 {
    ////////////////////////////////////////////////////////////////////////////////
-   const unsigned  x = threadIdx.x;  // Globaler x-Index 
-   const unsigned  y = blockIdx.x;   // Globaler y-Index 
-   const unsigned  z = blockIdx.y;   // Globaler z-Index 
 
-   const unsigned nx = blockDim.x;
-   const unsigned ny = gridDim.x;
 
-   const unsigned k = nx*(ny*z + y) + x;
+   const unsigned k = vf::gpu::getNodeIndex();
    //////////////////////////////////////////////////////////////////////////
 
-   if(k<numberOfBCnodes)
-   {
-      ////////////////////////////////////////////////////////////////////////////////
-      //index
-      unsigned int KQK  = k_Q[k];
-      //unsigned int kzero= KQK;
-      unsigned int ke   = KQK;
-      unsigned int kw   = neighborX[KQK];
-      unsigned int kn   = KQK;
-      unsigned int ks   = neighborY[KQK];
-      unsigned int kt   = KQK;
-      unsigned int kb   = neighborZ[KQK];
-      unsigned int ksw  = neighborY[kw];
-      unsigned int kne  = KQK;
-      unsigned int kse  = ks;
-      unsigned int knw  = kw;
-      unsigned int kbw  = neighborZ[kw];
-      unsigned int kte  = KQK;
-      unsigned int kbe  = kb;
-      unsigned int ktw  = kw;
-      unsigned int kbs  = neighborZ[ks];
-      unsigned int ktn  = KQK;
-      unsigned int kbn  = kb;
-      unsigned int kts  = ks;
-      unsigned int ktse = ks;
-      unsigned int kbnw = kbw;
-      unsigned int ktnw = kw;
-      unsigned int kbse = kbs;
-      unsigned int ktsw = ksw;
-      unsigned int kbne = kb;
-      unsigned int ktne = KQK;
-      unsigned int kbsw = neighborZ[ksw];
-      ////////////////////////////////////////////////////////////////////////////////
-      //index1
-      unsigned int K1QK  = k_N[k];
-      //unsigned int k1zero= K1QK;
-      unsigned int k1e   = K1QK;
-      unsigned int k1w   = neighborX[K1QK];
-      unsigned int k1n   = K1QK;
-      unsigned int k1s   = neighborY[K1QK];
-      unsigned int k1t   = K1QK;
-      unsigned int k1b   = neighborZ[K1QK];
-      unsigned int k1sw  = neighborY[k1w];
-      unsigned int k1ne  = K1QK;
-      unsigned int k1se  = k1s;
-      unsigned int k1nw  = k1w;
-      unsigned int k1bw  = neighborZ[k1w];
-      unsigned int k1te  = K1QK;
-      unsigned int k1be  = k1b;
-      unsigned int k1tw  = k1w;
-      unsigned int k1bs  = neighborZ[k1s];
-      unsigned int k1tn  = K1QK;
-      unsigned int k1bn  = k1b;
-      unsigned int k1ts  = k1s;
-      unsigned int k1tse = k1s;
-      unsigned int k1bnw = k1bw;
-      unsigned int k1tnw = k1w;
-      unsigned int k1bse = k1bs;
-      unsigned int k1tsw = k1sw;
-      unsigned int k1bne = k1b;
-      unsigned int k1tne = K1QK;
-      unsigned int k1bsw = neighborZ[k1sw];
-      ////////////////////////////////////////////////////////////////////////////////
-      Distributions27 D;
-      if (isEvenTimestep==true)
-      {
-         D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
-      } 
-      else
-      {
-         D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
-      }
-      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-      real f1_E    = (D.f[DIR_P00   ])[k1e   ];
-      real f1_W    = (D.f[DIR_M00   ])[k1w   ];
-      real f1_N    = (D.f[DIR_0P0   ])[k1n   ];
-      real f1_S    = (D.f[DIR_0M0   ])[k1s   ];
-      real f1_T    = (D.f[DIR_00P   ])[k1t   ];
-      real f1_B    = (D.f[DIR_00M   ])[k1b   ];
-      real f1_NE   = (D.f[DIR_PP0  ])[k1ne  ];
-      real f1_SW   = (D.f[DIR_MM0  ])[k1sw  ];
-      real f1_SE   = (D.f[DIR_PM0  ])[k1se  ];
-      real f1_NW   = (D.f[DIR_MP0  ])[k1nw  ];
-      real f1_TE   = (D.f[DIR_P0P  ])[k1te  ];
-      real f1_BW   = (D.f[DIR_M0M  ])[k1bw  ];
-      real f1_BE   = (D.f[DIR_P0M  ])[k1be  ];
-      real f1_TW   = (D.f[DIR_M0P  ])[k1tw  ];
-      real f1_TN   = (D.f[DIR_0PP  ])[k1tn  ];
-      real f1_BS   = (D.f[DIR_0MM  ])[k1bs  ];
-      real f1_BN   = (D.f[DIR_0PM  ])[k1bn  ];
-      real f1_TS   = (D.f[DIR_0MP  ])[k1ts  ];
-      //real f1_ZERO = (D.f[DIR_000])[k1zero];
-      real f1_TNE  = (D.f[DIR_PPP ])[k1tne ];
-      real f1_TSW  = (D.f[DIR_MMP ])[k1tsw ];
-      real f1_TSE  = (D.f[DIR_PMP ])[k1tse ];
-      real f1_TNW  = (D.f[DIR_MPP ])[k1tnw ];
-      real f1_BNE  = (D.f[DIR_PPM ])[k1bne ];
-      real f1_BSW  = (D.f[DIR_MMM ])[k1bsw ];
-      real f1_BSE  = (D.f[DIR_PMM ])[k1bse ];
-      real f1_BNW  = (D.f[DIR_MPM ])[k1bnw ];
-      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-      real f_E    = (D.f[DIR_P00   ])[ke   ];
-      real f_W    = (D.f[DIR_M00   ])[kw   ];
-      real f_N    = (D.f[DIR_0P0   ])[kn   ];
-      real f_S    = (D.f[DIR_0M0   ])[ks   ];
-      real f_T    = (D.f[DIR_00P   ])[kt   ];
-      real f_B    = (D.f[DIR_00M   ])[kb   ];
-      real f_NE   = (D.f[DIR_PP0  ])[kne  ];
-      real f_SW   = (D.f[DIR_MM0  ])[ksw  ];
-      real f_SE   = (D.f[DIR_PM0  ])[kse  ];
-      real f_NW   = (D.f[DIR_MP0  ])[knw  ];
-      real f_TE   = (D.f[DIR_P0P  ])[kte  ];
-      real f_BW   = (D.f[DIR_M0M  ])[kbw  ];
-      real f_BE   = (D.f[DIR_P0M  ])[kbe  ];
-      real f_TW   = (D.f[DIR_M0P  ])[ktw  ];
-      real f_TN   = (D.f[DIR_0PP  ])[ktn  ];
-      real f_BS   = (D.f[DIR_0MM  ])[kbs  ];
-      real f_BN   = (D.f[DIR_0PM  ])[kbn  ];
-      real f_TS   = (D.f[DIR_0MP  ])[kts  ];
-      //real f_ZERO = (D.f[DIR_000])[kzero];
-      real f_TNE  = (D.f[DIR_PPP ])[ktne ];
-      real f_TSW  = (D.f[DIR_MMP ])[ktsw ];
-      real f_TSE  = (D.f[DIR_PMP ])[ktse ];
-      real f_TNW  = (D.f[DIR_MPP ])[ktnw ];
-      real f_BNE  = (D.f[DIR_PPM ])[kbne ];
-      real f_BSW  = (D.f[DIR_MMM ])[kbsw ];
-      real f_BSE  = (D.f[DIR_PMM ])[kbse ];
-      real f_BNW  = (D.f[DIR_MPM ])[kbnw ];
-      //////////////////////////////////////////////////////////////////////////
+   if(k>=numberOfBCnodes) return;
 
-      //real vx1, vx2, vx3, drho;
-      //real vx1, vx2, vx3, drho, drho1;
-      //////////////////////////////////////////////////////////////////////////
-	  //Dichte
-    //   drho1  =  f1_TSE + f1_TNW + f1_TNE + f1_TSW + f1_BSE + f1_BNW + f1_BNE + f1_BSW +
-    //             f1_BN + f1_TS + f1_TN + f1_BS + f1_BE + f1_TW + f1_TE + f1_BW + f1_SE + f1_NW + f1_NE + f1_SW + 
-    //             f1_T + f1_B + f1_N + f1_S + f1_E + f1_W + ((D.f[DIR_000])[k1zero]); 
-    //   drho   =  f_TSE + f_TNW + f_TNE + f_TSW + f_BSE + f_BNW + f_BNE + f_BSW +
-    //             f_BN + f_TS + f_TN + f_BS + f_BE + f_TW + f_TE + f_BW + f_SE + f_NW + f_NE + f_SW + 
-    //             f_T + f_B + f_N + f_S + f_E + f_W + ((D.f[DIR_000])[kzero]); 
-      
-      //////////////////////////////////////////////////////////////////////////
-	  //Ux
+   ////////////////////////////////////////////////////////////////////////////////
+   //index
+   unsigned int KQK  = k_Q[k];
+   // unsigned int kzero= KQK;
+   unsigned int ke   = KQK;
+   unsigned int kw   = neighborX[KQK];
+   unsigned int kn   = KQK;
+   unsigned int ks   = neighborY[KQK];
+   unsigned int kt   = KQK;
+   unsigned int kb   = neighborZ[KQK];
+   unsigned int ksw  = neighborY[kw];
+   unsigned int kne  = KQK;
+   unsigned int kse  = ks;
+   unsigned int knw  = kw;
+   unsigned int kbw  = neighborZ[kw];
+   unsigned int kte  = KQK;
+   unsigned int kbe  = kb;
+   unsigned int ktw  = kw;
+   unsigned int kbs  = neighborZ[ks];
+   unsigned int ktn  = KQK;
+   unsigned int kbn  = kb;
+   unsigned int kts  = ks;
+   unsigned int ktse = ks;
+   unsigned int kbnw = kbw;
+   unsigned int ktnw = kw;
+   unsigned int kbse = kbs;
+   unsigned int ktsw = ksw;
+   unsigned int kbne = kb;
+   unsigned int ktne = KQK;
+   unsigned int kbsw = neighborZ[ksw];
+   ////////////////////////////////////////////////////////////////////////////////
+   //index1
+   unsigned int K1QK  = k_N[k];
+   //unsigned int k1zero= K1QK;
+   unsigned int k1e   = K1QK;
+   unsigned int k1w   = neighborX[K1QK];
+   unsigned int k1n   = K1QK;
+   unsigned int k1s   = neighborY[K1QK];
+   unsigned int k1t   = K1QK;
+   unsigned int k1b   = neighborZ[K1QK];
+   unsigned int k1sw  = neighborY[k1w];
+   unsigned int k1ne  = K1QK;
+   unsigned int k1se  = k1s;
+   unsigned int k1nw  = k1w;
+   unsigned int k1bw  = neighborZ[k1w];
+   unsigned int k1te  = K1QK;
+   unsigned int k1be  = k1b;
+   unsigned int k1tw  = k1w;
+   unsigned int k1bs  = neighborZ[k1s];
+   unsigned int k1tn  = K1QK;
+   unsigned int k1bn  = k1b;
+   unsigned int k1ts  = k1s;
+   unsigned int k1tse = k1s;
+   unsigned int k1bnw = k1bw;
+   unsigned int k1tnw = k1w;
+   unsigned int k1bse = k1bs;
+   unsigned int k1tsw = k1sw;
+   unsigned int k1bne = k1b;
+   unsigned int k1tne = K1QK;
+   unsigned int k1bsw = neighborZ[k1sw];
+   ////////////////////////////////////////////////////////////////////////////////
+   Distributions27 dist;
+   getPointersToDistributions(dist, distributions, numberOfLBnodes, isEvenTimestep);      
+   real f[27], f1[27]; 
+   //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+   f1[DIR_P00] = (dist.f[DIR_P00])[k1e   ];
+   f1[DIR_M00] = (dist.f[DIR_M00])[k1w   ];
+   f1[DIR_0P0] = (dist.f[DIR_0P0])[k1n   ];
+   f1[DIR_0M0] = (dist.f[DIR_0M0])[k1s   ];
+   f1[DIR_00P] = (dist.f[DIR_00P])[k1t   ];
+   f1[DIR_00M] = (dist.f[DIR_00M])[k1b   ];
+   f1[DIR_PP0] = (dist.f[DIR_PP0])[k1ne  ];
+   f1[DIR_MM0] = (dist.f[DIR_MM0])[k1sw  ];
+   f1[DIR_PM0] = (dist.f[DIR_PM0])[k1se  ];
+   f1[DIR_MP0] = (dist.f[DIR_MP0])[k1nw  ];
+   f1[DIR_P0P] = (dist.f[DIR_P0P])[k1te  ];
+   f1[DIR_M0M] = (dist.f[DIR_M0M])[k1bw  ];
+   f1[DIR_P0M] = (dist.f[DIR_P0M])[k1be  ];
+   f1[DIR_M0P] = (dist.f[DIR_M0P])[k1tw  ];
+   f1[DIR_0PP] = (dist.f[DIR_0PP])[k1tn  ];
+   f1[DIR_0MM] = (dist.f[DIR_0MM])[k1bs  ];
+   f1[DIR_0PM] = (dist.f[DIR_0PM])[k1bn  ];
+   f1[DIR_0MP] = (dist.f[DIR_0MP])[k1ts  ];
+   // f1[DIR_000] = (dist.f[DIR_000])[k1zero];
+   f1[DIR_PPP] = (dist.f[DIR_PPP])[k1tne ];
+   f1[DIR_MMP] = (dist.f[DIR_MMP])[k1tsw ];
+   f1[DIR_PMP] = (dist.f[DIR_PMP])[k1tse ];
+   f1[DIR_MPP] = (dist.f[DIR_MPP])[k1tnw ];
+   f1[DIR_PPM] = (dist.f[DIR_PPM])[k1bne ];
+   f1[DIR_MMM] = (dist.f[DIR_MMM])[k1bsw ];
+   f1[DIR_PMM] = (dist.f[DIR_PMM])[k1bse ];
+   f1[DIR_MPM] = (dist.f[DIR_MPM])[k1bnw ];
+   //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+   f[DIR_P00] = (dist.f[DIR_P00])[ke   ];
+   f[DIR_M00] = (dist.f[DIR_M00])[kw   ];
+   f[DIR_0P0] = (dist.f[DIR_0P0])[kn   ];
+   f[DIR_0M0] = (dist.f[DIR_0M0])[ks   ];
+   f[DIR_00P] = (dist.f[DIR_00P])[kt   ];
+   f[DIR_00M] = (dist.f[DIR_00M])[kb   ];
+   f[DIR_PP0] = (dist.f[DIR_PP0])[kne  ];
+   f[DIR_MM0] = (dist.f[DIR_MM0])[ksw  ];
+   f[DIR_PM0] = (dist.f[DIR_PM0])[kse  ];
+   f[DIR_MP0] = (dist.f[DIR_MP0])[knw  ];
+   f[DIR_P0P] = (dist.f[DIR_P0P])[kte  ];
+   f[DIR_M0M] = (dist.f[DIR_M0M])[kbw  ];
+   f[DIR_P0M] = (dist.f[DIR_P0M])[kbe  ];
+   f[DIR_M0P] = (dist.f[DIR_M0P])[ktw  ];
+   f[DIR_0PP] = (dist.f[DIR_0PP])[ktn  ];
+   f[DIR_0MM] = (dist.f[DIR_0MM])[kbs  ];
+   f[DIR_0PM] = (dist.f[DIR_0PM])[kbn  ];
+   f[DIR_0MP] = (dist.f[DIR_0MP])[kts  ];
+   // f[DIR_000] = (dist.f[DIR_000])[kzero];
+   f[DIR_PPP] = (dist.f[DIR_PPP])[ktne ];
+   f[DIR_MMP] = (dist.f[DIR_MMP])[ktsw ];
+   f[DIR_PMP] = (dist.f[DIR_PMP])[ktse ];
+   f[DIR_MPP] = (dist.f[DIR_MPP])[ktnw ];
+   f[DIR_PPM] = (dist.f[DIR_PPM])[kbne ];
+   f[DIR_MMM] = (dist.f[DIR_MMM])[kbsw ];
+   f[DIR_PMM] = (dist.f[DIR_PMM])[kbse ];
+   f[DIR_MPM] = (dist.f[DIR_MPM])[kbnw ];
+   //////////////////////////////////////////////////////////////////////////
 
-	  //vx1    =  (((f_TSE - f_BNW) - (f_TNW - f_BSE)) + ((f_TNE - f_BSW) - (f_TSW - f_BNE)) +
+   //real vx1, vx2, vx3, drho;
+   //real vx1, vx2, vx3, drho, drho1;
+   //////////////////////////////////////////////////////////////////////////
+   ////Dichte
+   //   drho1  =  f1_TSE + f1_TNW + f1_TNE + f1_TSW + f1_BSE + f1_BNW + f1_BNE + f1_BSW +
+   //             f1_BN + f1_TS + f1_TN + f1_BS + f1_BE + f1_TW + f1_TE + f1_BW + f1_SE + f1_NW + f1_NE + f1_SW + 
+   //             f1_T + f1_B + f1_N + f1_S + f1_E + f1_W + ((D.f[DIR_000])[k1zero]); 
+   //   drho   =  f_TSE + f_TNW + f_TNE + f_TSW + f_BSE + f_BNW + f_BNE + f_BSW +
+   //             f_BN + f_TS + f_TN + f_BS + f_BE + f_TW + f_TE + f_BW + f_SE + f_NW + f_NE + f_SW + 
+   //             f_T + f_B + f_N + f_S + f_E + f_W + ((D.f[DIR_000])[kzero]); 
+
+   //////////////////////////////////////////////////////////////////////////
+   ////Ux
+
+   //vx1    =  (((f_TSE - f_BNW) - (f_TNW - f_BSE)) + ((f_TNE - f_BSW) - (f_TSW - f_BNE)) +
    //               ((f_BE - f_TW)   + (f_TE - f_BW))   + ((f_SE - f_NW)   + (f_NE - f_SW)) +
    //               (f_E - f_W)) /(one + drho); 
 
 
-   //   vx2    =   ((-(f_TSE - f_BNW) + (f_TNW - f_BSE)) + ((f_TNE - f_BSW) - (f_TSW - f_BNE)) +
+   //vx2    =   ((-(f_TSE - f_BNW) + (f_TNW - f_BSE)) + ((f_TNE - f_BSW) - (f_TSW - f_BNE)) +
    //               ((f_BN - f_TS)   + (f_TN - f_BS))    + (-(f_SE - f_NW)  + (f_NE - f_SW)) +
    //               (f_N - f_S)) /(one + drho); 
 
-   //   vx3    =   (((f_TSE - f_BNW) + (f_TNW - f_BSE)) + ((f_TNE - f_BSW) + (f_TSW - f_BNE)) +
+   //vx3    =   (((f_TSE - f_BNW) + (f_TNW - f_BSE)) + ((f_TNE - f_BSW) + (f_TSW - f_BNE)) +
    //               (-(f_BN - f_TS)  + (f_TN - f_BS))   + ((f_TE - f_BW)   - (f_BE - f_TW)) +
    //               (f_T - f_B)) /(one + drho); 
 
 
-      //real cu_sq=c3o2*(vx1*vx1+vx2*vx2+vx3*vx3);
+   //real cu_sq=c3o2*(vx1*vx1+vx2*vx2+vx3*vx3);
 
-   //   //////////////////////////////////////////////////////////////////////////
-	  ////real omega = om1;
+   //////////////////////////////////////////////////////////////////////////
+	////real omega = om1;
    //   real cusq  = c3o2*(vx1*vx1+vx2*vx2+vx3*vx3);
    //   //////////////////////////////////////////////////////////////////////////
-	  ////T�st MK
-	  ////if(vx1 < zero) vx1 = zero;
+   ////Test MK
+   ////if(vx1 < zero) vx1 = zero;
    //   //////////////////////////////////////////////////////////////////////////
    //   real fZERO = c8over27*  (drho1-(one + drho1)*(cusq))                                                           ;
    //   real fE    = c2over27*  (drho1+(one + drho1)*(three*( vx1        )+c9over2*( vx1        )*( vx1        )-cusq));
@@ -3050,10 +2993,75 @@ __global__ void QPressNoRhoDevice27(  real* rhoBC,
    //   real fSW   = c1over54*  (drho1+(one + drho1)*(three*(-vx1-vx2    )+c9over2*(-vx1-vx2    )*(-vx1-vx2    )-cusq));
    //   real fSE   = c1over54*  (drho1+(one + drho1)*(three*( vx1-vx2    )+c9over2*( vx1-vx2    )*( vx1-vx2    )-cusq));
    //   real fNW   = c1over54*  (drho1+(one + drho1)*(three*(-vx1+vx2    )+c9over2*(-vx1+vx2    )*(-vx1+vx2    )-cusq));
-   //   real fTE   = c1over54*  (drho1+(one + drho1)*(three*( vx1    +vx3)+c9over2*( vx1    +vx3)*( vx1    +vx3)-cusq));
-   //   real fBW   = c1over54*  (drho1+(one + drho1)*(three*(-vx1    -vx3)+c9over2*(-vx1    -vx3)*(-vx1    -vx3)-cusq));
-   //   real fBE   = c1over54*  (drho1+(one + drho1)*(three*( vx1    -vx3)+c9over2*( vx1    -vx3)*( vx1    -vx3)-cusq));
-   //   real fTW   = c1over54*  (drho1+(one + drho1)*(three*(-vx1    +vx3)+c9over2*(-vx1    +vx3)*(-vx1    +vx3)-cusq));
+   //   real fTE	  /////////////////////////////////////////////////////////////
+   //with velocity
+   //if(true){//vx1 >= zero){
+      // real csMvx = one / sqrtf(three) - vx1;
+      // //real csMvy = one / sqrtf(three) - vx2;
+      // ///////////////////////////////////////////
+      // // X
+      // f_W   = f1_W   * csMvx + (one - csMvx) * f_W   ;//- c2over27  * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx1);
+      // f_NW  = f1_NW  * csMvx + (one - csMvx) * f_NW  ;//- c1over54  * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx1);
+      // f_SW  = f1_SW  * csMvx + (one - csMvx) * f_SW  ;//- c1over54  * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx1);
+      // f_TW  = f1_TW  * csMvx + (one - csMvx) * f_TW  ;//- c1over54  * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx1);
+      // f_BW  = f1_BW  * csMvx + (one - csMvx) * f_BW  ;//- c1over54  * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx1);
+      // f_TNW = f1_TNW * csMvx + (one - csMvx) * f_TNW ;//- c1over216 * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx1);
+      // f_TSW = f1_TSW * csMvx + (one - csMvx) * f_TSW ;//- c1over216 * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx1);
+      // f_BNW = f1_BNW * csMvx + (one - csMvx) * f_BNW ;//- c1over216 * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx1);
+      // f_BSW = f1_BSW * csMvx + (one - csMvx) * f_BSW ;//- c1over216 * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx1);
+      // ///////////////////////////////////////////
+      // // Y
+      // //f_S   = f1_S   * csMvy + (one - csMvy) * f_S   ;//- c2over27  * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx2);
+      // //f_SE  = f1_SE  * csMvy + (one - csMvy) * f_SE  ;//- c1over54  * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx2);
+      // //f_SW  = f1_SW  * csMvy + (one - csMvy) * f_SW  ;//- c1over54  * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx2);
+      // //f_TS  = f1_TS  * csMvy + (one - csMvy) * f_TS  ;//- c1over54  * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx2);
+      // //f_BS  = f1_BS  * csMvy + (one - csMvy) * f_BS  ;//- c1over54  * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx2);
+      // //f_TSE = f1_TSE * csMvy + (one - csMvy) * f_TSE ;//- c1over216 * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx2);
+      // //f_TSW = f1_TSW * csMvy + (one - csMvy) * f_TSW ;//- c1over216 * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx2);
+      // //f_BSE = f1_BSE * csMvy + (one - csMvy) * f_BSE ;//- c1over216 * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx2);
+      // //f_BSW = f1_BSW * csMvy + (one - csMvy) * f_BSW ;//- c1over216 * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx2);
+      // //f_S   = f1_S   * csMvy + (one - csMvy) * f_S;
+      // //f_SE  = f1_SE  * csMvy + (one - csMvy) * f_SE;
+      // //f_SW  = f1_SW  * csMvy + (one - csMvy) * f_SW;
+      // //f_TS  = f1_TS  * csMvy + (one - csMvy) * f_TS;
+      // //f_BS  = f1_BS  * csMvy + (one - csMvy) * f_BS;
+      // //f_TSE = f1_TSE * csMvy + (one - csMvy) * f_TSE;
+      // //f_TSW = f1_TSW * csMvy + (one - csMvy) * f_TSW;
+      // //f_BSE = f1_BSE * csMvy + (one - csMvy) * f_BSE;
+      // //f_BSW = f1_BSW * csMvy + (one - csMvy) * f_BSW;
+      // //////////////////////////////////////////////////////////////////////////
+   //}
+   //else
+   //{
+      // ///////////////////////////////////////////
+      // // X
+      // vx1   = vx1 * 0.9;
+      // f_W   = f_E   - six * c2over27  * ( vx1        );
+      // f_NW  = f_SE  - six * c1over54  * ( vx1-vx2    );
+      // f_SW  = f_NE  - six * c1over54  * ( vx1+vx2    );
+      // f_TW  = f_BE  - six * c1over54  * ( vx1    -vx3);
+      // f_BW  = f_TE  - six * c1over54  * ( vx1    +vx3);
+      // f_TNW = f_BSE - six * c1over216 * ( vx1-vx2-vx3);
+      // f_TSW = f_BNE - six * c1over216 * ( vx1+vx2-vx3);
+      // f_BNW = f_TSE - six * c1over216 * ( vx1-vx2+vx3);
+      // f_BSW = f_TNE - six * c1over216 * ( vx1+vx2+vx3);
+      // ///////////////////////////////////////////
+      // // Y
+      // //vx2   = vx2 * 0.9;
+      // //f_S   = f_N   - six * c2over27  * (     vx2    );
+      // //f_SE  = f_NW  - six * c1over54  * (-vx1+vx2    );
+      // //f_SW  = f_NE  - six * c1over54  * ( vx1+vx2    );
+      // //f_TS  = f_BN  - six * c1over54  * (     vx2-vx3);
+      // //f_BS  = f_TN  - six * c1over54  * (     vx2+vx3);
+      // //f_TSE = f_BNW - six * c1over216 * (-vx1+vx2-vx3);
+      // //f_TSW = f_BNE - six * c1over216 * ( vx1+vx2-vx3);
+      // //f_BSE = f_TNW - six * c1over216 * (-vx1+vx2+vx3);
+      // //f_BSW = f_TNE - six * c1over216 * ( vx1+vx2+vx3);
+      // ///////////////////////////////////////////
+   //}
+   /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+   //   = c1over54*  (drho1+(one + drho1)*(three*(-vx1    +vx3)+c9over2*(-vx1    +vx3)*(-vx1    +vx3)-cusq));
    //   real fTN   = c1over54*  (drho1+(one + drho1)*(three*(     vx2+vx3)+c9over2*(     vx2+vx3)*(     vx2+vx3)-cusq));
    //   real fBS   = c1over54*  (drho1+(one + drho1)*(three*(    -vx2-vx3)+c9over2*(    -vx2-vx3)*(    -vx2-vx3)-cusq));
    //   real fBN   = c1over54*  (drho1+(one + drho1)*(three*(     vx2-vx3)+c9over2*(     vx2-vx3)*(     vx2-vx3)-cusq));
@@ -3067,222 +3075,322 @@ __global__ void QPressNoRhoDevice27(  real* rhoBC,
    //   real fBSE  = c1over216* (drho1+(one + drho1)*(three*( vx1-vx2-vx3)+c9over2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cusq));
    //   real fTNW  = c1over216* (drho1+(one + drho1)*(three*(-vx1+vx2+vx3)+c9over2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cusq));
 
-	  real cs = c1o1 / sqrtf(c3o1);
-	  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	  //no velocity
-	  //////////////////////////////////////////
-      f_E    = f1_E   * cs + (c1o1 - cs) * f_E   ;
-      f_W    = f1_W   * cs + (c1o1 - cs) * f_W   ;
-      f_N    = f1_N   * cs + (c1o1 - cs) * f_N   ;
-      f_S    = f1_S   * cs + (c1o1 - cs) * f_S   ;
-      f_T    = f1_T   * cs + (c1o1 - cs) * f_T   ;
-      f_B    = f1_B   * cs + (c1o1 - cs) * f_B   ;
-      f_NE   = f1_NE  * cs + (c1o1 - cs) * f_NE  ;
-      f_SW   = f1_SW  * cs + (c1o1 - cs) * f_SW  ;
-      f_SE   = f1_SE  * cs + (c1o1 - cs) * f_SE  ;
-      f_NW   = f1_NW  * cs + (c1o1 - cs) * f_NW  ;
-      f_TE   = f1_TE  * cs + (c1o1 - cs) * f_TE  ;
-      f_BW   = f1_BW  * cs + (c1o1 - cs) * f_BW  ;
-      f_BE   = f1_BE  * cs + (c1o1 - cs) * f_BE  ;
-      f_TW   = f1_TW  * cs + (c1o1 - cs) * f_TW  ;
-      f_TN   = f1_TN  * cs + (c1o1 - cs) * f_TN  ;
-      f_BS   = f1_BS  * cs + (c1o1 - cs) * f_BS  ;
-      f_BN   = f1_BN  * cs + (c1o1 - cs) * f_BN  ;
-      f_TS   = f1_TS  * cs + (c1o1 - cs) * f_TS  ;
-      f_TNE  = f1_TNE * cs + (c1o1 - cs) * f_TNE ;
-      f_TSW  = f1_TSW * cs + (c1o1 - cs) * f_TSW ;
-      f_TSE  = f1_TSE * cs + (c1o1 - cs) * f_TSE ;
-      f_TNW  = f1_TNW * cs + (c1o1 - cs) * f_TNW ;
-      f_BNE  = f1_BNE * cs + (c1o1 - cs) * f_BNE ;
-      f_BSW  = f1_BSW * cs + (c1o1 - cs) * f_BSW ;
-      f_BSE  = f1_BSE * cs + (c1o1 - cs) * f_BSE ;
-      f_BNW  = f1_BNW * cs + (c1o1 - cs) * f_BNW ;
-	  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-	  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	  //with velocity
-	  //if(true){//vx1 >= zero){
-		 // real csMvx = one / sqrtf(three) - vx1;
-		 // //real csMvy = one / sqrtf(three) - vx2;
-		 // ///////////////////////////////////////////
-		 // // X
-		 // f_W   = f1_W   * csMvx + (one - csMvx) * f_W   ;//- c2over27  * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx1);
-		 // f_NW  = f1_NW  * csMvx + (one - csMvx) * f_NW  ;//- c1over54  * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx1);
-		 // f_SW  = f1_SW  * csMvx + (one - csMvx) * f_SW  ;//- c1over54  * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx1);
-		 // f_TW  = f1_TW  * csMvx + (one - csMvx) * f_TW  ;//- c1over54  * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx1);
-		 // f_BW  = f1_BW  * csMvx + (one - csMvx) * f_BW  ;//- c1over54  * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx1);
-		 // f_TNW = f1_TNW * csMvx + (one - csMvx) * f_TNW ;//- c1over216 * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx1);
-		 // f_TSW = f1_TSW * csMvx + (one - csMvx) * f_TSW ;//- c1over216 * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx1);
-		 // f_BNW = f1_BNW * csMvx + (one - csMvx) * f_BNW ;//- c1over216 * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx1);
-		 // f_BSW = f1_BSW * csMvx + (one - csMvx) * f_BSW ;//- c1over216 * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx1);
-		 // ///////////////////////////////////////////
-		 // // Y
-		 // //f_S   = f1_S   * csMvy + (one - csMvy) * f_S   ;//- c2over27  * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx2);
-		 // //f_SE  = f1_SE  * csMvy + (one - csMvy) * f_SE  ;//- c1over54  * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx2);
-		 // //f_SW  = f1_SW  * csMvy + (one - csMvy) * f_SW  ;//- c1over54  * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx2);
-		 // //f_TS  = f1_TS  * csMvy + (one - csMvy) * f_TS  ;//- c1over54  * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx2);
-		 // //f_BS  = f1_BS  * csMvy + (one - csMvy) * f_BS  ;//- c1over54  * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx2);
-		 // //f_TSE = f1_TSE * csMvy + (one - csMvy) * f_TSE ;//- c1over216 * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx2);
-		 // //f_TSW = f1_TSW * csMvy + (one - csMvy) * f_TSW ;//- c1over216 * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx2);
-		 // //f_BSE = f1_BSE * csMvy + (one - csMvy) * f_BSE ;//- c1over216 * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx2);
-		 // //f_BSW = f1_BSW * csMvy + (one - csMvy) * f_BSW ;//- c1over216 * ((drho + drho1)*c1o2-((drho + drho1)*c1o2 )*three*vx2);
-		 // //f_S   = f1_S   * csMvy + (one - csMvy) * f_S;
-		 // //f_SE  = f1_SE  * csMvy + (one - csMvy) * f_SE;
-		 // //f_SW  = f1_SW  * csMvy + (one - csMvy) * f_SW;
-		 // //f_TS  = f1_TS  * csMvy + (one - csMvy) * f_TS;
-		 // //f_BS  = f1_BS  * csMvy + (one - csMvy) * f_BS;
-		 // //f_TSE = f1_TSE * csMvy + (one - csMvy) * f_TSE;
-		 // //f_TSW = f1_TSW * csMvy + (one - csMvy) * f_TSW;
-		 // //f_BSE = f1_BSE * csMvy + (one - csMvy) * f_BSE;
-		 // //f_BSW = f1_BSW * csMvy + (one - csMvy) * f_BSW;
-		 // //////////////////////////////////////////////////////////////////////////
-	  //}
-	  //else
-	  //{
-		 // ///////////////////////////////////////////
-		 // // X
-		 // vx1   = vx1 * 0.9;
-		 // f_W   = f_E   - six * c2over27  * ( vx1        );
-		 // f_NW  = f_SE  - six * c1over54  * ( vx1-vx2    );
-		 // f_SW  = f_NE  - six * c1over54  * ( vx1+vx2    );
-		 // f_TW  = f_BE  - six * c1over54  * ( vx1    -vx3);
-		 // f_BW  = f_TE  - six * c1over54  * ( vx1    +vx3);
-		 // f_TNW = f_BSE - six * c1over216 * ( vx1-vx2-vx3);
-		 // f_TSW = f_BNE - six * c1over216 * ( vx1+vx2-vx3);
-		 // f_BNW = f_TSE - six * c1over216 * ( vx1-vx2+vx3);
-		 // f_BSW = f_TNE - six * c1over216 * ( vx1+vx2+vx3);
-		 // ///////////////////////////////////////////
-		 // // Y
-		 // //vx2   = vx2 * 0.9;
-		 // //f_S   = f_N   - six * c2over27  * (     vx2    );
-		 // //f_SE  = f_NW  - six * c1over54  * (-vx1+vx2    );
-		 // //f_SW  = f_NE  - six * c1over54  * ( vx1+vx2    );
-		 // //f_TS  = f_BN  - six * c1over54  * (     vx2-vx3);
-		 // //f_BS  = f_TN  - six * c1over54  * (     vx2+vx3);
-		 // //f_TSE = f_BNW - six * c1over216 * (-vx1+vx2-vx3);
-		 // //f_TSW = f_BNE - six * c1over216 * ( vx1+vx2-vx3);
-		 // //f_BSE = f_TNW - six * c1over216 * (-vx1+vx2+vx3);
-		 // //f_BSW = f_TNE - six * c1over216 * ( vx1+vx2+vx3);
-		 // ///////////////////////////////////////////
-	  //}
-	  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+   real cs = c1o1 / sqrtf(c3o1);
 
-	  //////////////////////////////////////////////////////////////////////////
-      if (isEvenTimestep==false)
-      {
-         D.f[DIR_P00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_M00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_PMP *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_MPM *size_Mat];
-      } 
-      else
-      {
-         D.f[DIR_M00   ] = &DD[DIR_P00   *size_Mat];
-         D.f[DIR_P00   ] = &DD[DIR_M00   *size_Mat];
-         D.f[DIR_0M0   ] = &DD[DIR_0P0   *size_Mat];
-         D.f[DIR_0P0   ] = &DD[DIR_0M0   *size_Mat];
-         D.f[DIR_00M   ] = &DD[DIR_00P   *size_Mat];
-         D.f[DIR_00P   ] = &DD[DIR_00M   *size_Mat];
-         D.f[DIR_MM0  ] = &DD[DIR_PP0  *size_Mat];
-         D.f[DIR_PP0  ] = &DD[DIR_MM0  *size_Mat];
-         D.f[DIR_MP0  ] = &DD[DIR_PM0  *size_Mat];
-         D.f[DIR_PM0  ] = &DD[DIR_MP0  *size_Mat];
-         D.f[DIR_M0M  ] = &DD[DIR_P0P  *size_Mat];
-         D.f[DIR_P0P  ] = &DD[DIR_M0M  *size_Mat];
-         D.f[DIR_M0P  ] = &DD[DIR_P0M  *size_Mat];
-         D.f[DIR_P0M  ] = &DD[DIR_M0P  *size_Mat];
-         D.f[DIR_0MM  ] = &DD[DIR_0PP  *size_Mat];
-         D.f[DIR_0PP  ] = &DD[DIR_0MM  *size_Mat];
-         D.f[DIR_0MP  ] = &DD[DIR_0PM  *size_Mat];
-         D.f[DIR_0PM  ] = &DD[DIR_0MP  *size_Mat];
-         D.f[DIR_000] = &DD[DIR_000*size_Mat];
-         D.f[DIR_PPP ] = &DD[DIR_MMM *size_Mat];
-         D.f[DIR_MMP ] = &DD[DIR_PPM *size_Mat];
-         D.f[DIR_PMP ] = &DD[DIR_MPM *size_Mat];
-         D.f[DIR_MPP ] = &DD[DIR_PMM *size_Mat];
-         D.f[DIR_PPM ] = &DD[DIR_MMP *size_Mat];
-         D.f[DIR_MMM ] = &DD[DIR_PPP *size_Mat];
-         D.f[DIR_PMM ] = &DD[DIR_MPP *size_Mat];
-         D.f[DIR_MPM ] = &DD[DIR_PMP *size_Mat];
-      }
-      //////////////////////////////////////////////////////////////////////////
-      //__syncthreads();
-	  // -X
-	  //(D.f[DIR_P00   ])[ke   ] = f_E   ;
-	  //(D.f[DIR_PM0  ])[kse  ] = f_SE  ;
-	  //(D.f[DIR_PP0  ])[kne  ] = f_NE  ;
-	  //(D.f[DIR_P0M  ])[kbe  ] = f_BE  ;
-	  //(D.f[DIR_P0P  ])[kte  ] = f_TE  ;
-	  //(D.f[DIR_PMP ])[ktse ] = f_TSE ;
-	  //(D.f[DIR_PPP ])[ktne ] = f_TNE ;
-	  //(D.f[DIR_PMM ])[kbse ] = f_BSE ;
-	  //(D.f[DIR_PPM ])[kbne ] = f_BNE ;     
-	  // X
-	  (D.f[DIR_M00   ])[kw   ] = f_W   ;
-	  (D.f[DIR_MM0  ])[ksw  ] = f_SW  ;
-	  (D.f[DIR_MP0  ])[knw  ] = f_NW  ;
-	  (D.f[DIR_M0M  ])[kbw  ] = f_BW  ;
-	  (D.f[DIR_M0P  ])[ktw  ] = f_TW  ;
-	  (D.f[DIR_MMP ])[ktsw ] = f_TSW ;
-	  (D.f[DIR_MPP ])[ktnw ] = f_TNW ;
-	  (D.f[DIR_MMM ])[kbsw ] = f_BSW ;
-	  (D.f[DIR_MPM ])[kbnw ] = f_BNW ;     
-	  // Y
-	  //(D.f[DIR_0M0   ])[ks   ] = f_S   ;
-	  //(D.f[DIR_PM0  ])[kse  ] = f_SE  ;
-	  //(D.f[DIR_MM0  ])[ksw  ] = f_SW  ;
-	  //(D.f[DIR_0MP  ])[kts  ] = f_TS  ;
-	  //(D.f[DIR_0MM  ])[kbs  ] = f_BS  ;
-	  //(D.f[DIR_PMP ])[ktse ] = f_TSE ;
-	  //(D.f[DIR_MMP ])[ktsw ] = f_TSW ;
-	  //(D.f[DIR_PMM ])[kbse ] = f_BSE ;
-	  //(D.f[DIR_MMM ])[kbsw ] = f_BSW ;     
-	  // Z
-	  //(D.f[DIR_00M   ])[kb   ] = f_B   ;
-	  //(D.f[DIR_P0M  ])[kbe  ] = f_BE  ;
-	  //(D.f[DIR_M0M  ])[kbw  ] = f_BW  ;
-	  //(D.f[DIR_0PM  ])[kbn  ] = f_BN  ;
-	  //(D.f[DIR_0MM  ])[kbs  ] = f_BS  ;
-	  //(D.f[DIR_PPM ])[kbne ] = f_BNE ;
-	  //(D.f[DIR_MPM ])[kbnw ] = f_BNW ;
-	  //(D.f[DIR_PMM ])[kbse ] = f_BSE ;
-	  //(D.f[DIR_MMM ])[kbsw ] = f_BSW ;     
-      //////////////////////////////////////////////////////////////////////////
+   //////////////////////////////////////////////////////////////////////////
+   getPointersToDistributions(dist, distributions, numberOfLBnodes, !isEvenTimestep);
+   switch(direction)
+   {
+      case MZZ:
+         (dist.f[DIR_P00])[ke   ] = computeOutflowDistribution(f, f1, DIR_P00, cs);
+         (dist.f[DIR_PM0])[kse  ] = computeOutflowDistribution(f, f1, DIR_PM0, cs);
+         (dist.f[DIR_PP0])[kne  ] = computeOutflowDistribution(f, f1, DIR_PP0, cs);
+         (dist.f[DIR_P0M])[kbe  ] = computeOutflowDistribution(f, f1, DIR_P0M, cs);
+         (dist.f[DIR_P0P])[kte  ] = computeOutflowDistribution(f, f1, DIR_P0P, cs);
+         (dist.f[DIR_PMP])[ktse ] = computeOutflowDistribution(f, f1, DIR_PMP, cs);
+         (dist.f[DIR_PPP])[ktne ] = computeOutflowDistribution(f, f1, DIR_PPP, cs);
+         (dist.f[DIR_PMM])[kbse ] = computeOutflowDistribution(f, f1, DIR_PMM, cs);
+         (dist.f[DIR_PPM])[kbne ] = computeOutflowDistribution(f, f1, DIR_PPM, cs);
+         break;
+
+      case PZZ:
+         (dist.f[DIR_M00])[kw   ] = computeOutflowDistribution(f, f1, DIR_M00, cs);
+         (dist.f[DIR_MM0])[ksw  ] = computeOutflowDistribution(f, f1, DIR_MM0, cs);
+         (dist.f[DIR_MP0])[knw  ] = computeOutflowDistribution(f, f1, DIR_MP0, cs);
+         (dist.f[DIR_M0M])[kbw  ] = computeOutflowDistribution(f, f1, DIR_M0M, cs);
+         (dist.f[DIR_M0P])[ktw  ] = computeOutflowDistribution(f, f1, DIR_M0P, cs);
+         (dist.f[DIR_MMP])[ktsw ] = computeOutflowDistribution(f, f1, DIR_MMP, cs);
+         (dist.f[DIR_MPP])[ktnw ] = computeOutflowDistribution(f, f1, DIR_MPP, cs);
+         (dist.f[DIR_MMM])[kbsw ] = computeOutflowDistribution(f, f1, DIR_MMM, cs);
+         (dist.f[DIR_MPM])[kbnw ] = computeOutflowDistribution(f, f1, DIR_MPM, cs);
+         break;
+
+      case ZMZ:
+         (dist.f[DIR_0P0])[kn   ] = computeOutflowDistribution(f, f1, DIR_0P0, cs);
+         (dist.f[DIR_PP0])[kne  ] = computeOutflowDistribution(f, f1, DIR_PP0, cs);
+         (dist.f[DIR_MP0])[knw  ] = computeOutflowDistribution(f, f1, DIR_MP0, cs);
+         (dist.f[DIR_0PP])[ktn  ] = computeOutflowDistribution(f, f1, DIR_0PP, cs);
+         (dist.f[DIR_0PM])[kbn  ] = computeOutflowDistribution(f, f1, DIR_0PM, cs);
+         (dist.f[DIR_PPP])[ktne ] = computeOutflowDistribution(f, f1, DIR_PPP, cs);
+         (dist.f[DIR_MPP])[ktnw ] = computeOutflowDistribution(f, f1, DIR_MPP, cs);
+         (dist.f[DIR_PPM])[kbne ] = computeOutflowDistribution(f, f1, DIR_PPM, cs);
+         (dist.f[DIR_MPM])[kbnw ] = computeOutflowDistribution(f, f1, DIR_MPM, cs);
+         break;  
+
+      case ZPZ:   
+         (dist.f[DIR_0M0])[ks   ] = computeOutflowDistribution(f, f1, DIR_0M0, cs);
+         (dist.f[DIR_PM0])[kse  ] = computeOutflowDistribution(f, f1, DIR_PM0, cs);
+         (dist.f[DIR_MM0])[ksw  ] = computeOutflowDistribution(f, f1, DIR_MM0, cs);
+         (dist.f[DIR_0MP])[kts  ] = computeOutflowDistribution(f, f1, DIR_0MP, cs);
+         (dist.f[DIR_0MM])[kbs  ] = computeOutflowDistribution(f, f1, DIR_0MM, cs);
+         (dist.f[DIR_PMP])[ktse ] = computeOutflowDistribution(f, f1, DIR_PMP, cs);
+         (dist.f[DIR_MMP])[ktsw ] = computeOutflowDistribution(f, f1, DIR_MMP, cs);
+         (dist.f[DIR_PMM])[kbse ] = computeOutflowDistribution(f, f1, DIR_PMM, cs);
+         (dist.f[DIR_MMM])[kbsw ] = computeOutflowDistribution(f, f1, DIR_MMM, cs);
+         break;
+
+      case ZZM:
+         (dist.f[DIR_00P])[kt   ] = computeOutflowDistribution(f, f1, DIR_00P, cs);
+         (dist.f[DIR_P0P])[kte  ] = computeOutflowDistribution(f, f1, DIR_P0P, cs);
+         (dist.f[DIR_M0P])[ktw  ] = computeOutflowDistribution(f, f1, DIR_M0P, cs);
+         (dist.f[DIR_0PP])[ktn  ] = computeOutflowDistribution(f, f1, DIR_0PP, cs);
+         (dist.f[DIR_0MP])[kts  ] = computeOutflowDistribution(f, f1, DIR_0MP, cs);
+         (dist.f[DIR_PPP])[ktne ] = computeOutflowDistribution(f, f1, DIR_PPP, cs);
+         (dist.f[DIR_MPP])[ktnw ] = computeOutflowDistribution(f, f1, DIR_MPP, cs);
+         (dist.f[DIR_PMP])[ktse ] = computeOutflowDistribution(f, f1, DIR_PMP, cs);
+         (dist.f[DIR_MMP])[ktsw ] = computeOutflowDistribution(f, f1, DIR_MMP, cs); 
+         break;
+
+      case ZZP:
+         (dist.f[DIR_00M])[kb   ] = computeOutflowDistribution(f, f1, DIR_00M, cs);
+         (dist.f[DIR_P0M])[kbe  ] = computeOutflowDistribution(f, f1, DIR_P0M, cs);
+         (dist.f[DIR_M0M])[kbw  ] = computeOutflowDistribution(f, f1, DIR_M0M, cs);
+         (dist.f[DIR_0PM])[kbn  ] = computeOutflowDistribution(f, f1, DIR_0PM, cs);
+         (dist.f[DIR_0MM])[kbs  ] = computeOutflowDistribution(f, f1, DIR_0MM, cs);
+         (dist.f[DIR_PPM])[kbne ] = computeOutflowDistribution(f, f1, DIR_PPM, cs);
+         (dist.f[DIR_MPM])[kbnw ] = computeOutflowDistribution(f, f1, DIR_MPM, cs);
+         (dist.f[DIR_PMM])[kbse ] = computeOutflowDistribution(f, f1, DIR_PMM, cs);
+         (dist.f[DIR_MMM])[kbsw ] = computeOutflowDistribution(f, f1, DIR_MMM, cs);     
+         break;
+      default:
+         break;
    }
 }
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-
 
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
+__host__ __device__ real computeOutflowDistribution(const real* const &f, const real* const &f1, const int dir, const real rhoCorrection, const real cs, const real weight)
+{  // Blend the dir-th entry of f1 (factor cs) with that of f (factor c1o1 - cs), then subtract the weighted density correction.
+   return cs * f1[dir] + (c1o1 - cs) * f[dir] - rhoCorrection * weight;
+}
 
+__global__ void QPressZeroRhoOutflowDevice27(  real* rhoBC,                 // not referenced in the kernel body
+												 real* distributions,         // base pointer handed to getPointersToDistributions()
+												 int* k_Q,                    // per-thread index of the boundary node
+												 int* k_N,                    // per-thread index of the second node whose distributions are read into f1
+												 int numberOfBCnodes,         // threads with k >= numberOfBCnodes return immediately
+												 real om1,                    // not referenced in the kernel body
+												 unsigned int* neighborX,     // neighbor lookup table used to derive the kw-type indices
+												 unsigned int* neighborY,     // neighbor lookup table used to derive the ks-type indices
+												 unsigned int* neighborZ,     // neighbor lookup table used to derive the kb-type indices
+												 unsigned int numberOfLBnodes, // forwarded to getPointersToDistributions()
+												 bool isEvenTimestep,         // selects the distribution layout for reading; the negated value is used for writing
+                                     int direction,               // selects which case of the switch below is executed
+                                     real densityCorrectionFactor) // scales the density of the boundary node to obtain rhoCorrection
+{
+   // Outflow boundary kernel: each thread rewrites nine distributions of one boundary node from its own (f) and a second node's (f1) distributions.
+   const unsigned k = vf::gpu::getNodeIndex();
+   
+   // k indexes k_Q / k_N; surplus threads leave immediately.
 
+   if(k>=numberOfBCnodes) return;
+   ////////////////////////////////////////////////////////////////////////////////
+   // index set of the boundary node: the node itself plus its neighborX/Y/Z lookups, aliased once per lattice direction
+   unsigned int KQK  = k_Q[k];
+   unsigned int kzero= KQK;
+   unsigned int ke   = KQK;
+   unsigned int kw   = neighborX[KQK];
+   unsigned int kn   = KQK;
+   unsigned int ks   = neighborY[KQK];
+   unsigned int kt   = KQK;
+   unsigned int kb   = neighborZ[KQK];
+   unsigned int ksw  = neighborY[kw];
+   unsigned int kne  = KQK;
+   unsigned int kse  = ks;
+   unsigned int knw  = kw;
+   unsigned int kbw  = neighborZ[kw];
+   unsigned int kte  = KQK;
+   unsigned int kbe  = kb;
+   unsigned int ktw  = kw;
+   unsigned int kbs  = neighborZ[ks];
+   unsigned int ktn  = KQK;
+   unsigned int kbn  = kb;
+   unsigned int kts  = ks;
+   unsigned int ktse = ks;
+   unsigned int kbnw = kbw;
+   unsigned int ktnw = kw;
+   unsigned int kbse = kbs;
+   unsigned int ktsw = ksw;
+   unsigned int kbne = kb;
+   unsigned int ktne = KQK;
+   unsigned int kbsw = neighborZ[ksw];
+   ////////////////////////////////////////////////////////////////////////////////
+   // same index set for the second node taken from k_N (NOTE(review): presumably the adjacent interior node - confirm)
+   unsigned int K1QK  = k_N[k];
+   // unsigned int k1zero= K1QK;
+   unsigned int k1e   = K1QK;
+   unsigned int k1w   = neighborX[K1QK];
+   unsigned int k1n   = K1QK;
+   unsigned int k1s   = neighborY[K1QK];
+   unsigned int k1t   = K1QK;
+   unsigned int k1b   = neighborZ[K1QK];
+   unsigned int k1sw  = neighborY[k1w];
+   unsigned int k1ne  = K1QK;
+   unsigned int k1se  = k1s;
+   unsigned int k1nw  = k1w;
+   unsigned int k1bw  = neighborZ[k1w];
+   unsigned int k1te  = K1QK;
+   unsigned int k1be  = k1b;
+   unsigned int k1tw  = k1w;
+   unsigned int k1bs  = neighborZ[k1s];
+   unsigned int k1tn  = K1QK;
+   unsigned int k1bn  = k1b;
+   unsigned int k1ts  = k1s;
+   unsigned int k1tse = k1s;
+   unsigned int k1bnw = k1bw;
+   unsigned int k1tnw = k1w;
+   unsigned int k1bse = k1bs;
+   unsigned int k1tsw = k1sw;
+   unsigned int k1bne = k1b;
+   unsigned int k1tne = K1QK;
+   unsigned int k1bsw = neighborZ[k1sw];
+   // set up the per-direction distribution pointers for the layout selected by isEvenTimestep
+   Distributions27 dist;
+   getPointersToDistributions(dist, distributions, numberOfLBnodes, isEvenTimestep);   
+   real f1[27], f[27];   
+   // f1: distributions of the k_N node (the rest direction DIR_000 is not read)
+   f1[DIR_P00] = (dist.f[DIR_P00])[k1e   ];
+   f1[DIR_M00] = (dist.f[DIR_M00])[k1w   ];
+   f1[DIR_0P0] = (dist.f[DIR_0P0])[k1n   ];
+   f1[DIR_0M0] = (dist.f[DIR_0M0])[k1s   ];
+   f1[DIR_00P] = (dist.f[DIR_00P])[k1t   ];
+   f1[DIR_00M] = (dist.f[DIR_00M])[k1b   ];
+   f1[DIR_PP0] = (dist.f[DIR_PP0])[k1ne  ];
+   f1[DIR_MM0] = (dist.f[DIR_MM0])[k1sw  ];
+   f1[DIR_PM0] = (dist.f[DIR_PM0])[k1se  ];
+   f1[DIR_MP0] = (dist.f[DIR_MP0])[k1nw  ];
+   f1[DIR_P0P] = (dist.f[DIR_P0P])[k1te  ];
+   f1[DIR_M0M] = (dist.f[DIR_M0M])[k1bw  ];
+   f1[DIR_P0M] = (dist.f[DIR_P0M])[k1be  ];
+   f1[DIR_M0P] = (dist.f[DIR_M0P])[k1tw  ];
+   f1[DIR_0PP] = (dist.f[DIR_0PP])[k1tn  ];
+   f1[DIR_0MM] = (dist.f[DIR_0MM])[k1bs  ];
+   f1[DIR_0PM] = (dist.f[DIR_0PM])[k1bn  ];
+   f1[DIR_0MP] = (dist.f[DIR_0MP])[k1ts  ];
+   // f1[DIR_000] = (dist.f[DIR_000])[k1zero];
+   f1[DIR_PPP] = (dist.f[DIR_PPP])[k1tne ];
+   f1[DIR_MMP] = (dist.f[DIR_MMP])[k1tsw ];
+   f1[DIR_PMP] = (dist.f[DIR_PMP])[k1tse ];
+   f1[DIR_MPP] = (dist.f[DIR_MPP])[k1tnw ];
+   f1[DIR_PPM] = (dist.f[DIR_PPM])[k1bne ];
+   f1[DIR_MMM] = (dist.f[DIR_MMM])[k1bsw ];
+   f1[DIR_PMM] = (dist.f[DIR_PMM])[k1bse ];
+   f1[DIR_MPM] = (dist.f[DIR_MPM])[k1bnw ];
+   // f: distributions of the boundary node, including DIR_000 because the full array is passed to getDensity() below
+   f[DIR_P00] = (dist.f[DIR_P00])[ke   ];
+   f[DIR_M00] = (dist.f[DIR_M00])[kw   ];
+   f[DIR_0P0] = (dist.f[DIR_0P0])[kn   ];
+   f[DIR_0M0] = (dist.f[DIR_0M0])[ks   ];
+   f[DIR_00P] = (dist.f[DIR_00P])[kt   ];
+   f[DIR_00M] = (dist.f[DIR_00M])[kb   ];
+   f[DIR_PP0] = (dist.f[DIR_PP0])[kne  ];
+   f[DIR_MM0] = (dist.f[DIR_MM0])[ksw  ];
+   f[DIR_PM0] = (dist.f[DIR_PM0])[kse  ];
+   f[DIR_MP0] = (dist.f[DIR_MP0])[knw  ];
+   f[DIR_P0P] = (dist.f[DIR_P0P])[kte  ];
+   f[DIR_M0M] = (dist.f[DIR_M0M])[kbw  ];
+   f[DIR_P0M] = (dist.f[DIR_P0M])[kbe  ];
+   f[DIR_M0P] = (dist.f[DIR_M0P])[ktw  ];
+   f[DIR_0PP] = (dist.f[DIR_0PP])[ktn  ];
+   f[DIR_0MM] = (dist.f[DIR_0MM])[kbs  ];
+   f[DIR_0PM] = (dist.f[DIR_0PM])[kbn  ];
+   f[DIR_0MP] = (dist.f[DIR_0MP])[kts  ];
+   f[DIR_000] = (dist.f[DIR_000])[kzero];
+   f[DIR_PPP] = (dist.f[DIR_PPP])[ktne ];
+   f[DIR_MMP] = (dist.f[DIR_MMP])[ktsw ];
+   f[DIR_PMP] = (dist.f[DIR_PMP])[ktse ];
+   f[DIR_MPP] = (dist.f[DIR_MPP])[ktnw ];
+   f[DIR_PPM] = (dist.f[DIR_PPM])[kbne ];
+   f[DIR_MMM] = (dist.f[DIR_MMM])[kbsw ];
+   f[DIR_PMM] = (dist.f[DIR_PMM])[kbse ];
+   f[DIR_MPM] = (dist.f[DIR_MPM])[kbnw ];
+   // density correction term: densityCorrectionFactor times the value getDensity() returns for the boundary node
+   real drho = vf::lbm::getDensity(f);
+   
+   real rhoCorrection = densityCorrectionFactor*drho;
+   
+   real cs = c1o1 / sqrtf(c3o1); // blending factor c1o1 / sqrt(c3o1), presumably 1/sqrt(3)
 
+   getPointersToDistributions(dist, distributions, numberOfLBnodes, !isEvenTimestep); // re-point dist with the negated timestep flag before writing
 
+   switch(direction) // every case overwrites nine distributions of the boundary node; any other direction value writes nothing
+   {
+      case MZZ:
+         (dist.f[DIR_P00])[ke   ] = computeOutflowDistribution(f, f1, DIR_P00  , rhoCorrection, cs, c2o27);
+         (dist.f[DIR_PM0])[kse  ] = computeOutflowDistribution(f, f1, DIR_PM0, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_PP0])[kne  ] = computeOutflowDistribution(f, f1, DIR_PP0, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_P0M])[kbe  ] = computeOutflowDistribution(f, f1, DIR_P0M, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_P0P])[kte  ] = computeOutflowDistribution(f, f1, DIR_P0P, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_PMP])[ktse ] = computeOutflowDistribution(f, f1, DIR_PMP, rhoCorrection, cs, c1o216);
+         (dist.f[DIR_PPP])[ktne ] = computeOutflowDistribution(f, f1, DIR_PPP, rhoCorrection, cs, c1o216);
+         (dist.f[DIR_PMM])[kbse ] = computeOutflowDistribution(f, f1, DIR_PMM, rhoCorrection, cs, c1o216);
+         (dist.f[DIR_PPM])[kbne ] = computeOutflowDistribution(f, f1, DIR_PPM, rhoCorrection, cs, c1o216);
+         break;
+
+      case PZZ:
+         (dist.f[DIR_M00])[kw   ] = computeOutflowDistribution(f, f1, DIR_M00, rhoCorrection, cs, c2o27);
+         (dist.f[DIR_MM0])[ksw  ] = computeOutflowDistribution(f, f1, DIR_MM0, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_MP0])[knw  ] = computeOutflowDistribution(f, f1, DIR_MP0, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_M0M])[kbw  ] = computeOutflowDistribution(f, f1, DIR_M0M, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_M0P])[ktw  ] = computeOutflowDistribution(f, f1, DIR_M0P, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_MMP])[ktsw ] = computeOutflowDistribution(f, f1, DIR_MMP, rhoCorrection, cs, c1o216);
+         (dist.f[DIR_MPP])[ktnw ] = computeOutflowDistribution(f, f1, DIR_MPP, rhoCorrection, cs, c1o216);
+         (dist.f[DIR_MMM])[kbsw ] = computeOutflowDistribution(f, f1, DIR_MMM, rhoCorrection, cs, c1o216);
+         (dist.f[DIR_MPM])[kbnw ] = computeOutflowDistribution(f, f1, DIR_MPM, rhoCorrection, cs, c1o216);
+         break;
+
+      case ZMZ:
+         (dist.f[DIR_0P0])[kn   ] = computeOutflowDistribution(f, f1, DIR_0P0, rhoCorrection, cs, c2o27);
+         (dist.f[DIR_PP0])[kne  ] = computeOutflowDistribution(f, f1, DIR_PP0, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_MP0])[knw  ] = computeOutflowDistribution(f, f1, DIR_MP0, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_0PP])[ktn  ] = computeOutflowDistribution(f, f1, DIR_0PP, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_0PM])[kbn  ] = computeOutflowDistribution(f, f1, DIR_0PM, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_PPP])[ktne ] = computeOutflowDistribution(f, f1, DIR_PPP, rhoCorrection, cs, c1o216);
+         (dist.f[DIR_MPP])[ktnw ] = computeOutflowDistribution(f, f1, DIR_MPP, rhoCorrection, cs, c1o216);
+         (dist.f[DIR_PPM])[kbne ] = computeOutflowDistribution(f, f1, DIR_PPM, rhoCorrection, cs, c1o216);
+         (dist.f[DIR_MPM])[kbnw ] = computeOutflowDistribution(f, f1, DIR_MPM, rhoCorrection, cs, c1o216);
+         break;  
+
+      case ZPZ:   
+         (dist.f[DIR_0M0])[ks   ] =computeOutflowDistribution(f, f1, DIR_0M0, rhoCorrection, cs, c2o27);
+         (dist.f[DIR_PM0])[kse  ] =computeOutflowDistribution(f, f1, DIR_PM0, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_MM0])[ksw  ] =computeOutflowDistribution(f, f1, DIR_MM0, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_0MP])[kts  ] =computeOutflowDistribution(f, f1, DIR_0MP, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_0MM])[kbs  ] =computeOutflowDistribution(f, f1, DIR_0MM, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_PMP])[ktse ] =computeOutflowDistribution(f, f1, DIR_PMP, rhoCorrection, cs, c1o216);
+         (dist.f[DIR_MMP])[ktsw ] =computeOutflowDistribution(f, f1, DIR_MMP, rhoCorrection, cs, c1o216);
+         (dist.f[DIR_PMM])[kbse ] =computeOutflowDistribution(f, f1, DIR_PMM, rhoCorrection, cs, c1o216);
+         (dist.f[DIR_MMM])[kbsw ] =computeOutflowDistribution(f, f1, DIR_MMM, rhoCorrection, cs, c1o216);
+         break;
+
+      case ZZM:
+         (dist.f[DIR_00P])[kt   ] = computeOutflowDistribution(f, f1, DIR_00P, rhoCorrection, cs, c2o27);
+         (dist.f[DIR_P0P])[kte  ] = computeOutflowDistribution(f, f1, DIR_P0P, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_M0P])[ktw  ] = computeOutflowDistribution(f, f1, DIR_M0P, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_0PP])[ktn  ] = computeOutflowDistribution(f, f1, DIR_0PP, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_0MP])[kts  ] = computeOutflowDistribution(f, f1, DIR_0MP, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_PPP])[ktne ] = computeOutflowDistribution(f, f1, DIR_PPP, rhoCorrection, cs, c1o216);
+         (dist.f[DIR_MPP])[ktnw ] = computeOutflowDistribution(f, f1, DIR_MPP, rhoCorrection, cs, c1o216);
+         (dist.f[DIR_PMP])[ktse ] = computeOutflowDistribution(f, f1, DIR_PMP, rhoCorrection, cs, c1o216);
+         (dist.f[DIR_MMP])[ktsw ] = computeOutflowDistribution(f, f1, DIR_MMP, rhoCorrection, cs, c1o216); 
+         break;
+
+      case ZZP:
+         (dist.f[DIR_00M])[kb   ] = computeOutflowDistribution(f, f1, DIR_00M, rhoCorrection, cs, c2o27);
+         (dist.f[DIR_P0M])[kbe  ] = computeOutflowDistribution(f, f1, DIR_P0M, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_M0M])[kbw  ] = computeOutflowDistribution(f, f1, DIR_M0M, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_0PM])[kbn  ] = computeOutflowDistribution(f, f1, DIR_0PM, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_0MM])[kbs  ] = computeOutflowDistribution(f, f1, DIR_0MM, rhoCorrection, cs, c1o54);
+         (dist.f[DIR_PPM])[kbne ] = computeOutflowDistribution(f, f1, DIR_PPM, rhoCorrection, cs, c1o216);
+         (dist.f[DIR_MPM])[kbnw ] = computeOutflowDistribution(f, f1, DIR_MPM, rhoCorrection, cs, c1o216);
+         (dist.f[DIR_PMM])[kbse ] = computeOutflowDistribution(f, f1, DIR_PMM, rhoCorrection, cs, c1o216);
+         (dist.f[DIR_MMM])[kbsw ] = computeOutflowDistribution(f, f1, DIR_MMM, rhoCorrection, cs, c1o216);     
+         break;
+      default:
+         break;
+   }
+}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
 
 
diff --git a/src/gpu/VirtualFluids_GPU/GPU/TurbulentViscosityKernels.cu b/src/gpu/VirtualFluids_GPU/GPU/TurbulentViscosityKernels.cu
index 3719ca3712e6f63a77f62bf314af7d19eea01f4c..a8f02fee717caf7f67624243b873fe993b5c7927 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/TurbulentViscosityKernels.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/TurbulentViscosityKernels.cu
@@ -38,6 +38,7 @@
 #include <cuda_runtime.h>
 #include <helper_cuda.h>
 #include "LBM/LB.h"
+#include "Kernel/Utilities/DistributionHelper.cuh"
 
 using namespace vf::lbm::constant;
 
@@ -64,15 +65,7 @@ __global__ void calcAMD(real* vx,
                         uint size_Mat,
                         real SGSConstant)
 {
-
-    const uint x = threadIdx.x; 
-    const uint y = blockIdx.x; 
-    const uint z = blockIdx.y; 
-
-    const uint nx = blockDim.x;
-    const uint ny = gridDim.x;
-
-    const uint k = nx*(ny*z + y) + x;
+    const uint k = vf::gpu::getNodeIndex();
     if(k >= size_Mat) return;
     if(typeOfGridNode[k] != GEO_FLUID) return;
 
@@ -102,7 +95,7 @@ __global__ void calcAMD(real* vx,
                         (dvxdx*dvzdx + dvxdy*dvzdy + dvxdz*dvzdz) * (dvxdz+dvzdx) + 
                         (dvydx*dvzdx + dvydy*dvzdy + dvydz*dvzdz) * (dvydz+dvzdy);
 
-    turbulentViscosity[k] = max(c0o1,-SGSConstant*enumerator)/denominator;
+    turbulentViscosity[k] = denominator != c0o1 ? max(c0o1,-SGSConstant*enumerator)/denominator : c0o1;
 }
 
 void calcTurbulentViscosityAMD(Parameter* para, int level)
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/TurbulentViscosityKernels/FluidFlow/Compressible/CumulantK17chim/TurbulentViscosityCumulantK17CompChim.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/TurbulentViscosityKernels/FluidFlow/Compressible/CumulantK17chim/TurbulentViscosityCumulantK17CompChim.cu
index a9d518d14a286ae3f6b565176969162994afa269..8c5ba40baba928a627c375f32d8df914eec4fdb8 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/TurbulentViscosityKernels/FluidFlow/Compressible/CumulantK17chim/TurbulentViscosityCumulantK17CompChim.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/TurbulentViscosityKernels/FluidFlow/Compressible/CumulantK17chim/TurbulentViscosityCumulantK17CompChim.cu
@@ -16,7 +16,8 @@ void TurbulentViscosityCumulantK17CompChim<turbulenceModel>::run()
 	vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(para->getParH(level)->numberofthreads, para->getParH(level)->numberOfNodes);
 
 	LB_Kernel_TurbulentViscosityCumulantK17CompChim < turbulenceModel  > <<< grid.grid, grid.threads >>>(   para->getParD(level)->omega, 	
-																											para->getParD(level)->typeOfGridNode, 										para->getParD(level)->neighborX,	
+																											para->getParD(level)->typeOfGridNode, 										
+																											para->getParD(level)->neighborX,	
 																											para->getParD(level)->neighborY,	
 																											para->getParD(level)->neighborZ,	
 																											para->getParD(level)->distributions.f[0],	
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/TurbulentViscosityKernels/FluidFlow/Compressible/CumulantK17chim/TurbulentViscosityCumulantK17CompChim_Device.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/TurbulentViscosityKernels/FluidFlow/Compressible/CumulantK17chim/TurbulentViscosityCumulantK17CompChim_Device.cu
index 32350b95107b68103af0f238fefe095882919092..63ca7d0673432ebef35e8e6deaaef6cf9f2cf0d4 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/TurbulentViscosityKernels/FluidFlow/Compressible/CumulantK17chim/TurbulentViscosityCumulantK17CompChim_Device.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/TurbulentViscosityKernels/FluidFlow/Compressible/CumulantK17chim/TurbulentViscosityCumulantK17CompChim_Device.cu
@@ -29,7 +29,7 @@
 //! \file TurbulentViscosityCumulantK17CompChim_Device.cu
 //! \author Henry Korb, Henrik Asmuth
 //! \date 16/05/2022
-//! \brief CumulantK17CompChim kernel by Martin Schönherr that inlcudes turbulent viscosity and other small mods.
+//! \brief CumulantK17CompChim kernel by Martin Schönherr that includes turbulent viscosity and other small mods.
 //!
 //! Additions to CumulantK17CompChim:
 //!     - can incorporate local body force 
@@ -43,6 +43,8 @@
 #include "lbm/constants/D3Q27.h"
 #include <lbm/constants/NumericConstants.h>
 #include "Kernel/Utilities/DistributionHelper.cuh"
+#include "VirtualFluids_GPU/GPU/KernelUtilities.h"
+#include "Kernel/ChimeraTransformation.h"
 
 #include "GPU/TurbulentViscosityInlines.cuh"
 
@@ -66,7 +68,7 @@ __global__ void LB_Kernel_TurbulentViscosityCumulantK17CompChim(
     real* vz,
     real* turbulentViscosity,
     real SGSconstant,
-	unsigned long size_Mat,
+	unsigned long numberOfLBnodes,
 	int level,
     bool bodyForce,
 	real* forces,
@@ -91,14 +93,14 @@ __global__ void LB_Kernel_TurbulentViscosityCumulantK17CompChim(
 
     //////////////////////////////////////////////////////////////////////////
     // run for all indices in size_Mat and fluid nodes
-    if ((k_000 < size_Mat) && (typeOfGridNode[k_000] == GEO_FLUID)) {
+    if ((k_000 < numberOfLBnodes) && (typeOfGridNode[k_000] == GEO_FLUID)) {
         //////////////////////////////////////////////////////////////////////////
         //! - Read distributions: style of reading and writing the distributions from/to stored arrays dependent on
         //! timestep is based on the esoteric twist algorithm \ref <a
         //! href="https://doi.org/10.3390/computation5020019"><b>[ M. Geier et al. (2017),
         //! DOI:10.3390/computation5020019 ]</b></a>
         //!
-        Distributions27 dist = vf::gpu::getDistributionReferences27(distributions, size_Mat, isEvenTimestep);
+        Distributions27 dist = vf::gpu::getDistributionReferences27(distributions, numberOfLBnodes, isEvenTimestep);
 
         ////////////////////////////////////////////////////////////////////////////////
         //! - Set neighbor indices (necessary for indirect addressing)
@@ -200,9 +202,9 @@ __global__ void LB_Kernel_TurbulentViscosityCumulantK17CompChim(
         //! DOI:10.1016/j.camwa.2015.05.001 ]</b></a>
         //!
         real factor = c1o1;
-        for (size_t i = 1; i <= level; i++) {
+        for (size_t i = 1; i <= level; i++){
             factor *= c2o1;
-        }
+        } 
         
         real fx = forces[0];
         real fy = forces[1];
@@ -680,8 +682,8 @@ __global__ void LB_Kernel_TurbulentViscosityCumulantK17CompChim(
     }
 }
 
-template __global__ void LB_Kernel_TurbulentViscosityCumulantK17CompChim < TurbulenceModel::AMD > ( real omega_in, uint* typeOfGridNode, uint* neighborX, uint* neighborY, uint* neighborZ, real* distributions, real* rho, real* vx, real* vy, real* vz, real* turbulentViscosity, real SGSconstant, unsigned long size_Mat, int level, bool bodyForce, real* forces, real* bodyForceX, real* bodyForceY, real* bodyForceZ, real* quadricLimiters, bool isEvenTimestep);
+template __global__ void LB_Kernel_TurbulentViscosityCumulantK17CompChim < TurbulenceModel::AMD > ( real omega_in, uint* typeOfGridNode, uint* neighborX, uint* neighborY, uint* neighborZ, real* distributions, real* rho, real* vx, real* vy, real* vz, real* turbulentViscosity, real SGSconstant, unsigned long numberOfLBnodes, int level, bool bodyForce, real* forces, real* bodyForceX, real* bodyForceY, real* bodyForceZ, real* quadricLimiters, bool isEvenTimestep);
 
-template __global__ void LB_Kernel_TurbulentViscosityCumulantK17CompChim < TurbulenceModel::Smagorinsky > ( real omega_in, uint* typeOfGridNode, uint* neighborX, uint* neighborY, uint* neighborZ, real* distributions, real* rho, real* vx, real* vy, real* vz, real* turbulentViscosity, real SGSconstant, unsigned long size_Mat, int level, bool bodyForce, real* forces, real* bodyForceX, real* bodyForceY, real* bodyForceZ, real* quadricLimiters, bool isEvenTimestep);
+template __global__ void LB_Kernel_TurbulentViscosityCumulantK17CompChim < TurbulenceModel::Smagorinsky > ( real omega_in, uint* typeOfGridNode, uint* neighborX, uint* neighborY, uint* neighborZ, real* distributions, real* rho, real* vx, real* vy, real* vz, real* turbulentViscosity, real SGSconstant, unsigned long numberOfLBnodes, int level, bool bodyForce, real* forces, real* bodyForceX, real* bodyForceY, real* bodyForceZ, real* quadricLimiters, bool isEvenTimestep);
 
 template __global__ void LB_Kernel_TurbulentViscosityCumulantK17CompChim < TurbulenceModel::QR > ( real omega_in, uint* typeOfGridNode, uint* neighborX, uint* neighborY, uint* neighborZ, real* distributions, real* rho, real* vx, real* vy, real* vz, real* turbulentViscosity, real SGSconstant, unsigned long size_Mat, int level, bool bodyForce, real* forces, real* bodyForceX, real* bodyForceY, real* bodyForceZ, real* quadricLimiters, bool isEvenTimestep);
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/TurbulentViscosityKernels/FluidFlow/Compressible/CumulantK17chim/TurbulentViscosityCumulantK17CompChim_Device.cuh b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/TurbulentViscosityKernels/FluidFlow/Compressible/CumulantK17chim/TurbulentViscosityCumulantK17CompChim_Device.cuh
index 5ef37557399f263d25edf03b02b00f6a03c6e1cb..6af4e0a85f1a242ff13d148a2aaecc89c5240308 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/TurbulentViscosityKernels/FluidFlow/Compressible/CumulantK17chim/TurbulentViscosityCumulantK17CompChim_Device.cuh
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/TurbulentViscosityKernels/FluidFlow/Compressible/CumulantK17chim/TurbulentViscosityCumulantK17CompChim_Device.cuh
@@ -17,7 +17,7 @@ template< TurbulenceModel turbulenceModel > __global__ void LB_Kernel_TurbulentV
     real* vz,
 	real* turbulentViscosity,
 	real SGSconstant,
-	unsigned long size_Mat,
+	unsigned long numberOfLBnodes,
 	int level,
 	bool bodyForce,
 	real* forces,
diff --git a/src/gpu/VirtualFluids_GPU/KernelManager/BCKernelManager.cpp b/src/gpu/VirtualFluids_GPU/KernelManager/BCKernelManager.cpp
index cc945ea225a28c58dca4ceefdb80fffb76228b21..eae5c5f9965323a8debb62789c931a70ae462a56 100644
--- a/src/gpu/VirtualFluids_GPU/KernelManager/BCKernelManager.cpp
+++ b/src/gpu/VirtualFluids_GPU/KernelManager/BCKernelManager.cpp
@@ -38,6 +38,7 @@
 
 #include "BCKernelManager.h"
 #include "Factories/BoundaryConditionFactory.h"
+#include "GridGenerator/VelocitySetter/VelocitySetter.h"
 #include "Calculation/Cp.h"
 #include "Calculation/DragLift.h"
 #include "GPU/GPU_Interface.h"
@@ -51,6 +52,7 @@ BCKernelManager::BCKernelManager(SPtr<Parameter> parameter, BoundaryConditionFac
     this->pressureBoundaryConditionPre  = bcFactory->getPressureBoundaryConditionPre();
     this->geometryBoundaryConditionPost = bcFactory->getGeometryBoundaryConditionPost();
     this->stressBoundaryConditionPost   = bcFactory->getStressBoundaryConditionPost();
+    this->precursorBoundaryConditionPost = bcFactory->getPrecursorBoundaryConditionPost();
 
     checkBoundaryCondition(this->velocityBoundaryConditionPost, this->para->getParD(0)->velocityBC,
                            "velocityBoundaryConditionPost");
@@ -64,6 +66,8 @@ BCKernelManager::BCKernelManager(SPtr<Parameter> parameter, BoundaryConditionFac
                            "geometryBoundaryConditionPost");
     checkBoundaryCondition(this->stressBoundaryConditionPost, this->para->getParD(0)->stressBC,
                            "stressBoundaryConditionPost");
+    checkBoundaryCondition(this->precursorBoundaryConditionPost, this->para->getParD(0)->precursorBC,
+                           "precursorBoundaryConditionPost");
 }
 
 void BCKernelManager::runVelocityBCKernelPre(const int level) const
@@ -387,3 +391,75 @@ void BCKernelManager::runNoSlipBCKernelPost(const int level) const{
         noSlipBoundaryConditionPost(para->getParD(level).get(), &(para->getParD(level)->noSlipBC));
     }
 }
+
+// void LBKernelManager::calculateMacroscopicValues(const int level) const
+// {
+//     if (para->getIsADcalculationOn()) {
+//           CalcMacADCompSP27(
+//                para->getParD()->velocityX,
+//                para->getParD()->velocityY,
+//                para->getParD()->velocityZ,
+//                para->getParD()->rho,
+//                para->getParD()->pressure,
+//                para->getParD()->typeOfGridNode,
+//                para->getParD()->neighborX,
+//                para->getParD()->neighborY,
+//                para->getParD()->neighborZ,
+//                para->getParD()->numberOfNodes,
+//                para->getParD()->numberofthreads,
+//                para->getParD()->distributions.f[0],
+//                para->getParD()->distributionsAD.f[0],
+//             para->getParD()->forcing,
+//                para->getParD()->isEvenTimestep);
+//     } else {
+//           CalcMacCompSP27(
+//                para->getParD()->velocityX,
+//                para->getParD()->velocityY,
+//                para->getParD()->velocityZ,
+//                para->getParD()->rho,
+//                para->getParD()->pressure,
+//                para->getParD()->typeOfGridNode,
+//                para->getParD()->neighborX,
+//                para->getParD()->neighborY,
+//                para->getParD()->neighborZ,
+//                para->getParD()->numberOfNodes,
+//                para->getParD()->numberofthreads,
+//                para->getParD()->distributions.f[0],
+//                para->getParD()->isEvenTimestep);
+//      }
+// }
+
+void BCKernelManager::runPrecursorBCKernelPost(int level, uint t, CudaMemoryManager* cudaMemoryManager)
+{
+    if(para->getParH(level)->precursorBC.numberOfBCnodes == 0) return; // levels without precursor boundary nodes have nothing to update
+    auto& hostBC   = para->getParH(level)->precursorBC;
+    auto& deviceBC = para->getParD(level)->precursorBC;
+
+    // Timesteps of the data held in the last / current / next buffers (unsigned arithmetic: wraps if nPrecursorReads < 2).
+    uint lastTime    = (deviceBC.nPrecursorReads-2)*deviceBC.nTRead;
+    uint currentTime = (deviceBC.nPrecursorReads-1)*deviceBC.nTRead;
+    uint nextTime    =  deviceBC.nPrecursorReads   *deviceBC.nTRead;
+
+    if(t>=currentTime)
+    {
+        // Shift the time window forward by one read interval.
+        lastTime = currentTime;
+        currentTime = nextTime;
+        nextTime += deviceBC.nTRead;
+        // Rotate the device buffers: the old 'last' buffer is recycled as the new 'next'.
+        real* recycled = deviceBC.last;
+        deviceBC.last = deviceBC.current;
+        deviceBC.current = deviceBC.next;
+        deviceBC.next = recycled;
+        // Ask every reader for the data belonging to the new nextTime; it is written to the host 'next' buffer.
+        const real loadTime = nextTime*pow(2,-level)*para->getTimeRatio();
+        for(const auto& reader : para->getParH(level)->velocityReader) // bind by reference: no per-iteration copy of the reader handle
+            reader->getNextData(hostBC.next, hostBC.numberOfPrecursorNodes, loadTime);
+        cudaMemoryManager->cudaCopyPrecursorData(level); // NOTE(review): presumably uploads the host 'next' buffer to the device - confirm
+        deviceBC.nPrecursorReads++;
+        hostBC.nPrecursorReads++;
+    }
+    // Time elapsed since lastTime, expressed in units of the read interval nTRead.
+    const real tRatio = real(t-lastTime)/deviceBC.nTRead;
+    precursorBoundaryConditionPost(para->getParD(level).get(), &deviceBC, tRatio, para->getVelocityRatio());
+}
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/KernelManager/BCKernelManager.h b/src/gpu/VirtualFluids_GPU/KernelManager/BCKernelManager.h
index 423a9cc9056281a3a2a135ae32fa26cc47f93967..a2987e9b40900d019f95dc0fa839beb775f522ef 100644
--- a/src/gpu/VirtualFluids_GPU/KernelManager/BCKernelManager.h
+++ b/src/gpu/VirtualFluids_GPU/KernelManager/BCKernelManager.h
@@ -48,6 +48,7 @@ struct LBMSimulationParameter;
 
 using boundaryCondition = std::function<void(LBMSimulationParameter *, QforBoundaryConditions *)>;
 using boundaryConditionWithParameter = std::function<void(Parameter *, QforBoundaryConditions *, const int level)>;
+using precursorBoundaryCondition = std::function<void(LBMSimulationParameter *, QforPrecursorBoundaryConditions *, real tRatio, real velocityRatio)>;
 
 //! \class BCKernelManager
 //! \brief manage the cuda kernel calls to boundary conditions
@@ -84,7 +85,10 @@ public:
     //! \brief calls the device function of the pressure boundary condition (post-collision)
     void runPressureBCKernelPost(const int level) const;
 
-    //! \brief calls the device function of the outflow boundary condition (pre-collision)
+	//! \brief calls the device function of the precursor boundary condition (post-collision)
+	void runPrecursorBCKernelPost(int level, uint t, CudaMemoryManager* cudaMemoryManager);
+
+    //! \brief calls the device function of the outflow boundary condition
     void runOutflowBCKernelPre(const int level) const;
 
     //! \brief calls the device function of the stress wall model (post-collision)
@@ -96,13 +100,16 @@ private:
     //! \param boundaryCondition: a kernel function for the boundary condition
     //! \param bcStruct: a struct containing the grid nodes which are part of the boundary condition
     //! \param bcName: the name of the checked boundary condition
-    template <typename bcFunction>
-    void checkBoundaryCondition(const bcFunction &boundaryCondition, const QforBoundaryConditions &bcStruct, const std::string &bcName)
+    template <typename bcFunction, typename QforBC>
+    void checkBoundaryCondition(const bcFunction &boundaryCondition, const QforBC &bcStruct, const std::string &bcName)
     {
         if (!boundaryCondition && bcStruct.numberOfBCnodes > 0)
             throw std::runtime_error("The boundary condition " + bcName + " was not set!");
     }
 
+    void runDistributionPrecursorBCKernelPost(int level, uint t, CudaMemoryManager* cudaMemoryManager);
+    void runVelocityPrecursorBCKernelPost(int level, uint t, CudaMemoryManager* cudaMemoryManager);
+
     SPtr<Parameter> para;
 
     boundaryCondition velocityBoundaryConditionPost = nullptr;
@@ -111,5 +118,6 @@ private:
     boundaryCondition pressureBoundaryConditionPre = nullptr;
     boundaryCondition geometryBoundaryConditionPost = nullptr;
     boundaryConditionWithParameter stressBoundaryConditionPost = nullptr;
+    precursorBoundaryCondition precursorBoundaryConditionPost = nullptr;
 };
 #endif
diff --git a/src/gpu/VirtualFluids_GPU/KernelManager/BCKernelManagerTest.cpp b/src/gpu/VirtualFluids_GPU/KernelManager/BCKernelManagerTest.cpp
index d55fa51bd8a225dd4e89e684bc81cd56f3f450c0..a0e02112e821eedcfeb013d3465529f668309529 100644
--- a/src/gpu/VirtualFluids_GPU/KernelManager/BCKernelManagerTest.cpp
+++ b/src/gpu/VirtualFluids_GPU/KernelManager/BCKernelManagerTest.cpp
@@ -53,3 +53,9 @@ TEST_F(BCKernelManagerTest_BCsNotSpecified, stressBoundaryConditionPost_NotSpeci
     para->getParD(0)->stressBC.numberOfBCnodes = 1;
     EXPECT_THROW(BCKernelManager(para, &bcFactory), std::runtime_error);
 }
+
+TEST_F(BCKernelManagerTest_BCsNotSpecified, precursorBoundaryConditionPost_NotSpecified)
+{
+    para->getParD(0)->precursorBC.numberOfBCnodes = 1; // one precursor BC node on level 0
+    EXPECT_THROW(BCKernelManager(para, &bcFactory), std::runtime_error); // constructing the manager is expected to throw
+}
diff --git a/src/gpu/VirtualFluids_GPU/LBM/LB.h b/src/gpu/VirtualFluids_GPU/LBM/LB.h
index eea4adfda3c1ef0862f39ef58fc6e065af7bab1b..e9831253923a90dc2daf3e509fc13c01de55d142 100644
--- a/src/gpu/VirtualFluids_GPU/LBM/LB.h
+++ b/src/gpu/VirtualFluids_GPU/LBM/LB.h
@@ -46,6 +46,7 @@
 
 
 #include "Core/DataTypes.h"
+#include <cuda_runtime.h>
 
 #include <string>
 #include <vector>
@@ -144,6 +145,7 @@ struct InitCondition
    bool hasWallModelMonitor {false};
    bool simulatePorousMedia {false};
    bool streetVelocityFile {false};
+   real outflowPressureCorrectionFactor {0.0};
 };
 
 //Interface Cells
@@ -214,6 +216,22 @@ typedef struct QforBC{
    real *normalX, *normalY, *normalZ;
 }QforBoundaryConditions;
 
+typedef struct QforPrecursorBC{ // data of the precursor boundary condition; every member starts zeroed/null
+   int* k=nullptr;
+   int numberOfBCnodes=0;
+   int sizeQ=0;
+   int numberOfPrecursorNodes=0;
+   uint nPrecursorReads=0;
+   uint nTRead=0;
+   size_t numberOfQuantities=0;
+   real* q27[27]={};
+   uint *planeNeighborNT=nullptr, *planeNeighborNB=nullptr, *planeNeighborST=nullptr, *planeNeighborSB=nullptr;
+   real *weightsNT=nullptr, *weightsNB=nullptr, *weightsST=nullptr, *weightsSB=nullptr;
+   real *last=nullptr, *current=nullptr, *next=nullptr;
+   real velocityX=0, velocityY=0, velocityZ=0;
+   cudaStream_t stream=nullptr;
+}QforPrecursorBoundaryConditions;
+
 //BCTemp
 typedef struct TempforBC{
    int* k;
diff --git a/src/gpu/VirtualFluids_GPU/Parameter/EdgeNodeFinderTest.cpp b/src/gpu/VirtualFluids_GPU/Parameter/EdgeNodeFinderTest.cpp
index c63c1620ae368cdb31ed582814b472b4695114bf..8e9919e3f583abe5b77163485924606646a8ec22 100644
--- a/src/gpu/VirtualFluids_GPU/Parameter/EdgeNodeFinderTest.cpp
+++ b/src/gpu/VirtualFluids_GPU/Parameter/EdgeNodeFinderTest.cpp
@@ -70,13 +70,13 @@ TEST_F(EdgeNodeFinderTest_findEdgeNodes, shouldReturnCorrectVectorForXY)
 
     vf::gpu::findEdgeNodesCommMultiGPU(*para);
 
-    const std::vector<std::pair<int, int>> expectedEdgeNodesXtoYRecv = { std::pair(numRecvNeighbor, 0),
-                                                                         std::pair(numRecvNeighbor, 4),
-                                                                         std::pair(numRecvNeighbor, 5) };
+    const std::vector<std::pair<int, int>> expectedEdgeNodesXtoYRecv = { std::pair<int, int>(numRecvNeighbor, 0),
+                                                                         std::pair<int, int>(numRecvNeighbor, 4),
+                                                                         std::pair<int, int>(numRecvNeighbor, 5) };
 
-    const std::vector<std::pair<int, int>> expectedEdgeNodesXtoYSend = { std::pair(numSendNeighbor, 1),
-                                                                         std::pair(numSendNeighbor, 6),
-                                                                         std::pair(numSendNeighbor, 4) };
+    const std::vector<std::pair<int, int>> expectedEdgeNodesXtoYSend = { std::pair<int, int>(numSendNeighbor, 1),
+                                                                         std::pair<int, int>(numSendNeighbor, 6),
+                                                                         std::pair<int, int>(numSendNeighbor, 4) };
 
     EXPECT_THAT(para->parH[level]->edgeNodesXtoY.size(), testing::Eq(expectedEdgeNodesXtoYRecv.size()));
     EXPECT_TRUE(compareEdgeNodesRecv(para->parH[level]->edgeNodesXtoY, expectedEdgeNodesXtoYRecv))
@@ -107,12 +107,12 @@ TEST_F(EdgeNodeFinderTest_findEdgeNodes, shouldReturnCorrectVectorForXZ)
 
     vf::gpu::findEdgeNodesCommMultiGPU(*para);
 
-    const std::vector<std::pair<int, int>> expectedEdgeNodesXtoZRecv = { std::pair(numRecvNeighbor, 1),
-                                                                         std::pair(numRecvNeighbor, 4),
-                                                                         std::pair(numRecvNeighbor, 6) };
-    const std::vector<std::pair<int, int>> expectedEdgeNodesXtoZSend = { std::pair(numSendNeighbor, 0),
-                                                                         std::pair(numSendNeighbor, 5),
-                                                                         std::pair(numSendNeighbor, 4) };
+    const std::vector<std::pair<int, int>> expectedEdgeNodesXtoZRecv = { std::pair<int, int>(numRecvNeighbor, 1),
+                                                                         std::pair<int, int>(numRecvNeighbor, 4),
+                                                                         std::pair<int, int>(numRecvNeighbor, 6) };
+    const std::vector<std::pair<int, int>> expectedEdgeNodesXtoZSend = { std::pair<int, int>(numSendNeighbor, 0),
+                                                                         std::pair<int, int>(numSendNeighbor, 5),
+                                                                         std::pair<int, int>(numSendNeighbor, 4) };
 
     EXPECT_THAT(para->parH[level]->edgeNodesXtoZ.size(), testing::Eq(expectedEdgeNodesXtoZRecv.size()));
     EXPECT_TRUE(compareEdgeNodesRecv(para->parH[level]->edgeNodesXtoZ, expectedEdgeNodesXtoZRecv))
diff --git a/src/gpu/VirtualFluids_GPU/Parameter/Parameter.cpp b/src/gpu/VirtualFluids_GPU/Parameter/Parameter.cpp
index dc7d5cb07e573003bfebfa7ef327dddb1f9d4aa4..4123f39f351c4bf41d536bff0d1deea3fbe6e2aa 100644
--- a/src/gpu/VirtualFluids_GPU/Parameter/Parameter.cpp
+++ b/src/gpu/VirtualFluids_GPU/Parameter/Parameter.cpp
@@ -883,6 +883,10 @@ void Parameter::setPressOutZ(unsigned int PressOutZ)
 {
     ic.PressOutZ = PressOutZ;
 }
+void Parameter::setOutflowPressureCorrectionFactor(real correctionFactor)
+{
+    ic.outflowPressureCorrectionFactor = correctionFactor; // stored in the InitCondition struct
+}
 void Parameter::setMaxDev(int maxdev)
 {
     ic.maxdev = maxdev;
@@ -1906,6 +1910,10 @@ unsigned int Parameter::getPressOutZ()
 {
     return ic.PressOutZ;
 }
+real Parameter::getOutflowPressureCorrectionFactor()
+{
+    return ic.outflowPressureCorrectionFactor; // value held in the InitCondition struct (0.0 unless set)
+}
 int Parameter::getMaxDev()
 {
     return ic.maxdev;
diff --git a/src/gpu/VirtualFluids_GPU/Parameter/Parameter.h b/src/gpu/VirtualFluids_GPU/Parameter/Parameter.h
index a397948ef8fe642df377681404e870b90aac100a..aff100584abef9797a0f72c11319be4719503d92 100644
--- a/src/gpu/VirtualFluids_GPU/Parameter/Parameter.h
+++ b/src/gpu/VirtualFluids_GPU/Parameter/Parameter.h
@@ -54,6 +54,8 @@ class ConfigurationFile;
 }
 class CudaStreamManager;
 
+class VelocityReader;
+
 //! \struct LBMSimulationParameter
 //! \brief struct holds and manages the LB-parameter of the simulation
 //! \brief For this purpose it holds structures and pointer for host and device data, respectively.
@@ -218,16 +220,16 @@ struct LBMSimulationParameter {
     OffsetFC offFCBulk;
     unsigned int mem_size_kCF_off;
     unsigned int mem_size_kFC_off;
-
-    // BC's////////////////////
+    
     //! \brief stores the boundary condition data
     QforBoundaryConditions noSlipBC, velocityBC, outflowBC, slipBC, stressBC, pressureBC;
     //! \brief number of lattice nodes for the boundary conditions
-    unsigned int numberOfNoSlipBCnodesRead, numberOfVeloBCnodesRead, numberOfOutflowBCnodesRead, numberOfSlipBCnodesRead, numberOfStressBCnodesRead, numberOfPressureBCnodesRead;
+    unsigned int numberOfNoSlipBCnodesRead, numberOfVeloBCnodesRead, numberOfOutflowBCnodesRead, numberOfSlipBCnodesRead, numberOfStressBCnodesRead, numberOfPressureBCnodesRead, numberOfPrecursorBCnodesRead;
 
     QforBoundaryConditions QpressX0, QpressX1, QpressY0, QpressY1, QpressZ0, QpressZ1; // DEPRECATED
     QforBoundaryConditions propellerBC;
     QforBoundaryConditions geometryBC;
+    QforPrecursorBoundaryConditions precursorBC;
     QforBoundaryConditions geometryBCnormalX, geometryBCnormalY, geometryBCnormalZ;
     QforBoundaryConditions inflowBCnormalX, inflowBCnormalY, inflowBCnormalZ;
     QforBoundaryConditions outflowBCnormalX, outflowBCnormalY, outflowBCnormalZ;
@@ -235,6 +237,8 @@ struct LBMSimulationParameter {
     unsigned int kInletQread, kOutletQread;  // DEPRECATED
 
     WallModelParameters wallModel;
+    std::vector<SPtr<VelocityReader>> velocityReader;
+    real outflowPressureCorrectionFactor;
 
     // testRoundoffError
     Distributions27 kDistTestRE;
@@ -468,6 +472,7 @@ public:
     void setpressBcPos(std::string pressBcPos);
     void setpressBcQs(std::string pressBcQs);
     void setpressBcValue(std::string pressBcValue);
+    void setOutflowPressureCorrectionFactor(real correctionFactor);
     void setpressBcValues(std::string pressBcValues);
     void setvelBcQs(std::string velBcQs);
     void setvelBcValues(std::string velBcValues);
@@ -524,7 +529,6 @@ public:
     void setUseWale(bool useWale);
     void setTurbulenceModel(TurbulenceModel turbulenceModel);
     void setUseTurbulentViscosity(bool useTurbulentViscosity);
-    void setUseAMD(bool useAMD);
     void setSGSConstant(real SGSConstant);
     void setHasWallModelMonitor(bool hasWallModelMonitor);
     void setUseInitNeq(bool useInitNeq);
@@ -850,6 +854,7 @@ public:
     std::string getOutflowBoundaryNormalX();
     std::string getOutflowBoundaryNormalY();
     std::string getOutflowBoundaryNormalZ();
+    real getOutflowPressureCorrectionFactor();
     // CUDA random number
     curandState *getRandomState();
     // Kernel
diff --git a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/ActuatorLine.cu b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/ActuatorLine.cu
index 71897bd21ea4fb299d3cc0ffa385506d4503f360..60dd7d3b581a102ad7b9c77f9eb6fb9a56f64bd7 100644
--- a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/ActuatorLine.cu
+++ b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/ActuatorLine.cu
@@ -188,9 +188,9 @@ __global__ void applyBodyForces(real* gridCoordsX, real* gridCoordsY, real* grid
         }
     }
 
-    atomicAdd(&gridForcesX[gridIndex], gridForceX_RF);
-    atomicAdd(&gridForcesY[gridIndex], gridForceY_RF);
-    atomicAdd(&gridForcesZ[gridIndex], gridForceZ_RF);
+    gridForcesX[gridIndex] = gridForceX_RF;
+    gridForcesY[gridIndex] = gridForceY_RF;
+    gridForcesZ[gridIndex] = gridForceZ_RF;
 }
 
 
@@ -210,7 +210,7 @@ void ActuatorLine::interact(Parameter* para, CudaMemoryManager* cudaMemoryManage
 {
     if (level != this->level) return;
 
-    cudaMemoryManager->cudaCopyBladeCoordsHtoD(this);
+    if(useHostArrays) cudaMemoryManager->cudaCopyBladeCoordsHtoD(this);
 
     vf::cuda::CudaGrid bladeGrid = vf::cuda::CudaGrid(para->getParH(level)->numberofthreads, this->nNodes);
 
@@ -225,11 +225,11 @@ void ActuatorLine::interact(Parameter* para, CudaMemoryManager* cudaMemoryManage
         this->turbinePosX, this->turbinePosY, this->turbinePosZ,
         this->bladeIndicesD, para->getVelocityRatio(), this->invDeltaX);
 
-    cudaMemoryManager->cudaCopyBladeVelocitiesDtoH(this);
+    if(useHostArrays) cudaMemoryManager->cudaCopyBladeVelocitiesDtoH(this);
 
     this->calcBladeForces();
 
-    cudaMemoryManager->cudaCopyBladeForcesHtoD(this);
+    if(useHostArrays) cudaMemoryManager->cudaCopyBladeForcesHtoD(this);
 
     vf::cuda::CudaGrid sphereGrid = vf::cuda::CudaGrid(para->getParH(level)->numberofthreads, this->nIndices);
 
@@ -369,6 +369,14 @@ void ActuatorLine::initBladeIndices(Parameter* para, CudaMemoryManager* cudaMemo
     }
     cudaMemoryManager->cudaCopyBladeIndicesHtoD(this);
 }
+void ActuatorLine::setPreInitBladeRadii(real* _bladeRadii)
+{
+    // Keep a private host copy of the nBladeNodes radii; _bladeRadii is only read during this call.
+    this->bladeRadiiPreInit = (real*) malloc(this->nBladeNodes*sizeof(real));
+    if(this->bladeRadiiPreInit == nullptr && this->nBladeNodes > 0) // malloc(0) may legitimately return null
+        throw std::runtime_error("ActuatorLine::setPreInitBladeRadii: allocating the blade radii failed");
+    for(uint node=0; node<this->nBladeNodes; node++) this->bladeRadiiPreInit[node] = _bladeRadii[node];
+}
 
 void ActuatorLine::initBoundingSphere(Parameter* para, CudaMemoryManager* cudaMemoryManager)
 {
@@ -420,4 +428,27 @@ void ActuatorLine::setBladeForces(real* _bladeForcesX, real* _bladeForcesY, real
         this->bladeForcesYH[node] = _bladeForcesY[node];
         this->bladeForcesZH[node] = _bladeForcesZ[node];
     }
+}
+void ActuatorLine::setBladeCoordsD(real* _bladeCoordsX, real* _bladeCoordsY, real* _bladeCoordsZ)
+{
+    throw std::runtime_error("not implemented"); // always throws; the assignments below are currently unreachable
+    this->bladeCoordsXD = _bladeCoordsX;
+    this->bladeCoordsYD = _bladeCoordsY;
+    this->bladeCoordsZD = _bladeCoordsZ;
+}
+
+void ActuatorLine::setBladeVelocitiesD(real* _bladeVelocitiesX, real* _bladeVelocitiesY, real* _bladeVelocitiesZ)
+{
+    throw std::runtime_error("not implemented"); // always throws; the assignments below are currently unreachable
+    this->bladeVelocitiesXD = _bladeVelocitiesX;
+    this->bladeVelocitiesYD = _bladeVelocitiesY;
+    this->bladeVelocitiesZD = _bladeVelocitiesZ;
+}
+
+void ActuatorLine::setBladeForcesD(real* _bladeForcesX, real* _bladeForcesY, real* _bladeForcesZ)
+{
+    throw std::runtime_error("not implemented"); // always throws; the assignments below are currently unreachable
+    this->bladeForcesXD = _bladeForcesX; // force pointers, not the coordinate pointers
+    this->bladeForcesYD = _bladeForcesY;
+    this->bladeForcesZD = _bladeForcesZ;
 }
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/ActuatorLine.h b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/ActuatorLine.h
index b44c89c5020eb206baa3bba1994b1e45f760c3bb..a441387512cc86e83453d9a4689d541b17dfde0f 100644
--- a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/ActuatorLine.h
+++ b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/ActuatorLine.h
@@ -22,7 +22,8 @@ public:
         const real _diameter,
         int _level,
         const real _deltaT,
-        const real _deltaX
+        const real _deltaX,
+        const bool _useHostArrays
     ) : nBlades(_nBlades),
         density(_density),
         nBladeNodes(_nBladeNodes), 
@@ -30,6 +31,7 @@ public:
         turbinePosX(_turbinePosX), turbinePosY(_turbinePosY), turbinePosZ(_turbinePosZ),
         diameter(_diameter),
         level(_level),
+        useHostArrays(_useHostArrays),
         PreCollisionInteractor()
     {
         this->deltaT = _deltaT*exp2(-this->level);
@@ -58,6 +60,8 @@ public:
     real getAzimuth(){ return this->azimuth; };
     real getYaw(){ return this->yaw; };
     real getDensity(){ return this->density; };
+    real getDeltaT(){ return this->deltaT; };
+    real getDeltaX(){ return this->deltaX; };
     real getPositionX(){ return this->turbinePosX; };
     real getPositionY(){ return this->turbinePosY; };
     real getPositionZ(){ return this->turbinePosZ; };
@@ -72,12 +76,27 @@ public:
     real* getBladeForcesY(){ return this->bladeForcesYH; };
     real* getBladeForcesZ(){ return this->bladeForcesZH; };
 
+    real* getBladeRadiiD(){ return this->bladeRadiiD; };
+    real* getBladeCoordsXD(){ return this->bladeCoordsXD; };
+    real* getBladeCoordsYD(){ return this->bladeCoordsYD; };
+    real* getBladeCoordsZD(){ return this->bladeCoordsZD; };
+    real* getBladeVelocitiesXD(){ return this->bladeVelocitiesXD; };
+    real* getBladeVelocitiesYD(){ return this->bladeVelocitiesYD; };
+    real* getBladeVelocitiesZD(){ return this->bladeVelocitiesZD; };
+    real* getBladeForcesXD(){ return this->bladeForcesXD; };
+    real* getBladeForcesYD(){ return this->bladeForcesYD; };
+    real* getBladeForcesZD(){ return this->bladeForcesZD; };
+
     void setOmega(real _omega){ this->omega = _omega; };
     void setAzimuth(real _azimuth){ this->azimuth = _azimuth; };
     void setYaw(real _yaw){ this->yaw = _yaw; };
+    void setPreInitBladeRadii(real* _bladeRadii);
     void setBladeCoords(real* _bladeCoordsX, real* _bladeCoordsY, real* _bladeCoordsZ);
     void setBladeVelocities(real* _bladeVelocitiesX, real* _bladeVelocitiesY, real* _bladeVelocitiesZ);
     void setBladeForces(real* _bladeForcesX, real* _bladeForcesY, real* _bladeForcesZ);
+    void setBladeCoordsD(real* _bladeCoordsX, real* _bladeCoordsY, real* _bladeCoordsZ);
+    void setBladeVelocitiesD(real* _bladeVelocitiesX, real* _bladeVelocitiesY, real* _bladeVelocitiesZ);
+    void setBladeForcesD(real* _bladeForcesX, real* _bladeForcesY, real* _bladeForcesZ);
     virtual void calcBladeForces();
 
 private:
@@ -92,6 +111,7 @@ private:
     void calcForcesEllipticWing();
 
 public:
+    real* bladeRadiiPreInit;
     real* bladeRadiiH;
     real* bladeRadiiD;
     real* bladeCoordsXH, * bladeCoordsYH, * bladeCoordsZH;
@@ -106,6 +126,7 @@ public:
     uint* boundingSphereIndicesD;
     
 private:
+    const bool useHostArrays;
     const real density;
     real turbinePosX, turbinePosY, turbinePosZ;
     real omega, azimuth, yaw, deltaT, deltaX, invDeltaX, forceRatio, factorGaussian, invEpsilonSqrd;
diff --git a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/PrecursorWriter.cu b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/PrecursorWriter.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f2be9567b450f42627ee9647727b321a89baf387
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/PrecursorWriter.cu
@@ -0,0 +1,300 @@
+#include "PrecursorWriter.h"
+#include "basics/writer/WbWriterVtkXmlImageBinary.h"
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <helper_cuda.h>
+#include <cuda/CudaGrid.h>
+#include "Kernel/Utilities/DistributionHelper.cuh"
+
+#include <Core/StringUtilities/StringUtil.h>
+
+#include "Parameter/Parameter.h"
+#include "DataStructureInitializer/GridProvider.h"
+#include "GPU/CudaMemoryManager.h"
+
+using namespace vf::lbm::dir;
+
+
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//TODO check everything for multiple levels
+void index1d(int& idx, int y, int z, int ny, int nz) // result is returned through idx; nz is not used
+{
+    idx = y+ny*z; // flatten (y, z) into one index with y running fastest
+}
+
+void index2d(int idx, int& y, int& z, int ny, int nz) // splits idx = y+ny*z back into y and z (for 0 <= y < ny); nz is not used
+{
+    z = idx/ny;   // integer division
+    y = idx-ny*z; // what is left after removing the z part
+}
+
+__inline__ __host__ __device__ uint lIndex(const uint component, const uint node, const uint timestep, const uint nComponents, const uint nNodes)
+{
+    return node+nNodes*(component+timestep*nComponents); // node runs fastest, then component, then timestep
+}
+
+__inline__ __host__ __device__ uint lIndex(const uint component, const uint node, const uint nNodes)
+{
+    return node+component*nNodes; // node runs fastest, then component (single timestep)
+}
+
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//! \brief gathers the three velocity components at the listed grid nodes into precursorData, scaled by velocityRatio
+//! \param indices: for each of the nNodes entries, the position in vx/vy/vz to read from
+//! \param precursorData: output, addressed through lIndex with components 0, 1, 2 for vx, vy, vz
+__global__ void fillArrayVelocities(const uint nNodes, uint* indices, real *precursorData,
+                                    real *vx, real *vy, real *vz,
+                                    real velocityRatio)
+{
+    const uint planeNode = vf::gpu::getNodeIndex();
+
+    // indices at or beyond nNodes have no entry to process
+    if(nNodes <= planeNode) return;
+
+    const uint gridNode = indices[planeNode];
+
+    precursorData[lIndex(0u, planeNode, nNodes)] = velocityRatio*vx[gridNode];
+    precursorData[lIndex(1u, planeNode, nNodes)] = velocityRatio*vy[gridNode];
+    precursorData[lIndex(2u, planeNode, nNodes)] = velocityRatio*vz[gridNode];
+}
+
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+__global__ void fillArrayDistributions( uint nNodes, uint* indices, 
+                                        real* precursorData,
+                                        real* distributions,
+                                        uint* neighborX, uint* neighborY, uint* neighborZ,
+                                        bool isEvenTimestep,
+                                        unsigned long numberOfLBnodes)
+{
+    const uint node = vf::gpu::getNodeIndex();
+    // indices at or beyond nNodes have no entry to process
+    if(node>=nNodes) return;
+
+    Distributions27 dist = vf::gpu::getDistributionReferences27(distributions, numberOfLBnodes, isEvenTimestep);
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    //! - Set neighbor indices (necessary for indirect addressing); the neighborX-based indices are not needed and stay commented out
+    uint k_000 = indices[node];
+    // uint k_M00 = neighborX[k_000];
+    uint k_0M0 = neighborY[k_000];
+    uint k_00M = neighborZ[k_000];
+    // uint k_MM0 = neighborY[k_M00];
+    // uint k_M0M = neighborZ[k_M00];
+    uint k_0MM = neighborZ[k_0M0];
+    // uint k_MMM = neighborZ[k_MM0];
+
+    ////////////////////////////////////////////////////////////////////////////////////
+    //! - Get local distributions in PX directions and store each in its PrecP** component of precursorData
+    //!   (directions with an M in y and/or z are read at the matching neighbor node k_0M0, k_00M or k_0MM)
+    precursorData[lIndex(PrecP00, node, nNodes)] = (dist.f[DIR_P00])[k_000];
+    precursorData[lIndex(PrecPP0, node, nNodes)] = (dist.f[DIR_PP0])[k_000];
+    precursorData[lIndex(PrecPM0, node, nNodes)] = (dist.f[DIR_PM0])[k_0M0];
+    precursorData[lIndex(PrecP0P, node, nNodes)] = (dist.f[DIR_P0P])[k_000];
+    precursorData[lIndex(PrecP0M, node, nNodes)] = (dist.f[DIR_P0M])[k_00M];
+    precursorData[lIndex(PrecPPP, node, nNodes)] = (dist.f[DIR_PPP])[k_000];
+    precursorData[lIndex(PrecPMP, node, nNodes)] = (dist.f[DIR_PMP])[k_0M0];
+    precursorData[lIndex(PrecPPM, node, nNodes)] = (dist.f[DIR_PPM])[k_00M];
+    precursorData[lIndex(PrecPMM, node, nNodes)] = (dist.f[DIR_PMM])[k_0MM];
+}
+
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//! \brief selects, on every grid level, the nodes of the y-z plane at xPos and sets up the buffers used to write them
+//! \param gridProvider: not used inside this function
+void PrecursorWriter::init(Parameter* para, GridProvider* gridProvider, CudaMemoryManager* cudaManager)
+{
+    precursorStructs.resize(para->getMaxLevel()+1);
+    for(int level=0; level<=para->getMaxLevel(); level++)
+    {
+        // grid spacing of this level, measured between node 1 and its x-neighbor
+        real dx = abs(para->getParH(level)->coordinateX[1]-para->getParH(level)->coordinateX[para->getParH(level)->neighborX[1]]);
+
+        // running bounds of the selected nodes, seeded from the last and the first grid node
+        real lowestY, lowestZ, highestY, highestZ;
+        lowestY = para->getParH(level)->coordinateY[para->getParH(level)->numberOfNodes-1];
+        highestY = para->getParH(level)->coordinateY[1];
+        lowestZ = para->getParH(level)->coordinateZ[para->getParH(level)->numberOfNodes-1];
+        highestZ = para->getParH(level)->coordinateZ[1];
+
+        std::vector<uint> indicesOnGrid;
+        std::vector<int> indicesOnPlane;
+        std::vector<real> coordY, coordZ;
+
+        // keep the nodes with xPos <= x < xPos+dx, yMin <= y <= yMax and zMin <= z <= zMax
+        for(uint j=1; j<para->getParH(level)->numberOfNodes; j++ )
+        {
+            real pointCoordX = para->getParH(level)->coordinateX[j];
+            real pointCoordY = para->getParH(level)->coordinateY[j];
+            real pointCoordZ = para->getParH(level)->coordinateZ[j];
+            if( pointCoordX < (dx+xPos) && pointCoordX >= xPos &&
+                pointCoordY<=yMax && pointCoordY>=yMin && 
+                pointCoordZ<=zMax && pointCoordZ>=zMin)
+            {
+                highestY = max(highestY, pointCoordY);
+                highestZ = max(highestZ, pointCoordZ);
+                lowestY = min(lowestY, pointCoordY);
+                lowestZ = min(lowestZ, pointCoordZ);
+                indicesOnGrid.push_back(j);
+                coordY.push_back(pointCoordY);
+                coordZ.push_back(pointCoordZ);
+            }
+        }
+        // the message literal is always true, so this fails exactly when no node was selected
+        assert("PrecursorWriter did not find any points on the grid" && !indicesOnGrid.empty());
+        int ny = int((highestY-lowestY)/dx)+1;
+        int nz = int((highestZ-lowestZ)/dx)+1;
+
+        // flattened (y, z) position of every selected node inside the ny x nz plane
+        for(uint i=0;i<indicesOnGrid.size(); i++)
+        {
+            int idxY = int((coordY[i]-lowestY)/dx);
+            int idxZ = int((coordZ[i]-lowestZ)/dx);
+            int idx;
+            index1d(idx, idxY, idxZ, ny, nz);
+            indicesOnPlane.push_back(idx);
+        }
+
+        // description of the plane for the writer; timestepsPerFile is capped by maxtimestepsPerFile
+        precursorStructs[level] = SPtr<PrecursorStruct>(new PrecursorStruct);
+        precursorStructs[level]->nPoints = (uint)indicesOnGrid.size();
+        precursorStructs[level]->indicesOnPlane = (int*) malloc(precursorStructs[level]->nPoints*sizeof(int));
+        precursorStructs[level]->spacing = makeUbTuple(dx, dx, tSave*para->getTimeRatio());
+        precursorStructs[level]->origin = makeUbTuple(lowestY, lowestZ);
+        precursorStructs[level]->extent = makeUbTuple(0, ny-1, 0, nz-1);
+        precursorStructs[level]->nPointsInPlane = ny*nz;
+        precursorStructs[level]->timestepsPerFile = min(para->getlimitOfNodesForVTK()/(ny*nz), maxtimestepsPerFile);
+        precursorStructs[level]->filesWritten = 0;
+        precursorStructs[level]->timestepsBuffered = 0;
+
+        switch (outputVariable)
+        {
+        case OutputVariable::Velocities:
+            precursorStructs[level]->nQuantities = 3;
+            break;
+        case OutputVariable::Distributions:
+            precursorStructs[level]->nQuantities = 9;
+            break;
+        default:
+            break;
+        }
+
+        // indicesH is filled right after this call; indicesOnPlane was malloc'ed above
+        cudaManager->cudaAllocPrecursorWriter(this, level);
+
+        std::copy(indicesOnGrid.begin(), indicesOnGrid.end(), precursorStructs[level]->indicesH);
+        std::copy(indicesOnPlane.begin(), indicesOnPlane.end(), precursorStructs[level]->indicesOnPlane);
+
+        cudaManager->cudaCopyPrecursorWriterIndicesHtoD(this, level);
+    }
+}
+
+
+void PrecursorWriter::interact(Parameter* para, CudaMemoryManager* cudaManager, int level, uint t)
+{
+    if(t>tStartOut ? ((t-tStartOut) % tSave)==0 : false) // only every tSave-th timestep after tStartOut
+    {
+        vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(para->getParH(level)->numberofthreads, precursorStructs[level]->nPoints);
+        // fill bufferD with the requested quantity at the nPoints selected nodes
+        if(this->outputVariable==OutputVariable::Velocities)
+        {
+            fillArrayVelocities<<<grid.grid, grid.threads>>>(   precursorStructs[level]->nPoints, precursorStructs[level]->indicesD, 
+                                                                precursorStructs[level]->bufferD, 
+                                                                para->getParD(level)->velocityX, para->getParD(level)->velocityY, para->getParD(level)->velocityZ,
+                                                                para->getVelocityRatio());
+            getLastCudaError("In PrecursorWriter::interact fillArrayVelocities execution failed");
+        }
+        else if(this->outputVariable==OutputVariable::Distributions)
+        {
+            fillArrayDistributions<<<grid.grid, grid.threads>>>(precursorStructs[level]->nPoints, precursorStructs[level]->indicesD, 
+                                                                precursorStructs[level]->bufferD,
+                                                                para->getParD(level)->distributions.f[0],
+                                                                para->getParD(level)->neighborX, para->getParD(level)->neighborY, para->getParD(level)->neighborZ,
+                                                                para->getEvenOrOdd(level), para->getParD(level)->numberOfNodes);
+            getLastCudaError("In PrecursorWriter::interact fillArrayDistributions execution failed");
+        }
+        cudaManager->cudaCopyPrecursorWriterOutputVariablesDtoH(this, level);
+
+        // switch device buffer and data pointer so precursor data is gathered in buffer and copied from bufferD to bufferH
+        real *tmp = precursorStructs[level]->bufferD;
+        precursorStructs[level]->bufferD = precursorStructs[level]->dataD;
+        precursorStructs[level]->dataD = tmp;
+
+        precursorStructs[level]->timestepsBuffered++;
+        // once timestepsPerFile timesteps are buffered, hand them over to a file write
+        if(precursorStructs[level]->timestepsBuffered >= precursorStructs[level]->timestepsPerFile)
+        {
+        // switch host buffer and data pointer so precursor data is copied in buffer and written from data
+
+            tmp = precursorStructs[level]->bufferH;
+            precursorStructs[level]->bufferH = precursorStructs[level]->dataH;
+            precursorStructs[level]->dataH = tmp;
+            // wait for the previously started write before launching the next one asynchronously
+            writeFuture.wait();
+            writeFuture = std::async(std::launch::async, [this](Parameter* para, uint level, uint timesteps){ this->write(para, level, timesteps); }, para, level, precursorStructs[level]->timestepsBuffered);
+            precursorStructs[level]->timestepsBuffered = 0;
+        }
+    }
+}
+
+
+void PrecursorWriter::free(Parameter* para, CudaMemoryManager* cudaManager)
+{
+    writeFuture.wait(); // let a pending asynchronous write finish first
+    for(int level=0; level<=para->getMaxLevel(); level++)
+    {
+        if(getPrecursorStruct(level)->timestepsBuffered>0) // timesteps that were buffered but not yet written to a file
+            write(para, level, getPrecursorStruct(level)->timestepsBuffered);
+        // NOTE(review): write() reads dataH, but interact() swaps bufferH into dataH only when a file is full - confirm the leftover steps are in dataH
+        cudaManager->cudaFreePrecursorWriter(this, level);
+    }
+}
+
+
+void PrecursorWriter::write(Parameter* para, int level, uint timestepsBuffered)
+{
+    std::string fname = this->makeFileName(fileName, level, para->getMyProcessID(), precursorStructs[level]->filesWritten) + getWriter()->getFileExtension();
+    std::string wholeName = outputPath + "/" + fname;
+
+    uint nPointsInPlane = precursorStructs[level]->nPointsInPlane;
+    // first time index of this file, counting timestepsPerFile steps for every file written before
+    int startTime = precursorStructs[level]->filesWritten*precursorStructs[level]->timestepsPerFile;
+
+    // extent of the image: the plane extent in the first two dimensions, the buffered timesteps in the third
+
+    UbTupleInt6 extent = makeUbTuple(   val<1>(precursorStructs[level]->extent),    val<2>(precursorStructs[level]->extent), 
+                                        val<3>(precursorStructs[level]->extent),    val<4>(precursorStructs[level]->extent), 
+                                        startTime,                          startTime+(int)timestepsBuffered-1);
+
+    UbTupleFloat3 origin = makeUbTuple( val<1>(precursorStructs[level]->origin), val<2>(precursorStructs[level]->origin), 0.f);
+
+    std::vector<std::vector<double>> nodedata;
+    // one array per quantity; plane positions without a selected node keep NAN
+    for(uint quant=0; quant<precursorStructs[level]->nQuantities; quant++)
+    {
+        std::vector<double> doubleArr(nPointsInPlane*timestepsBuffered, NAN);
+        for( uint timestep=0; timestep<timestepsBuffered; timestep++)
+        {
+            for (uint pos=0; pos < precursorStructs[level]->nPoints; pos++)
+            {
+                int indexOnPlane = precursorStructs[level]->indicesOnPlane[pos]+timestep*nPointsInPlane;
+                doubleArr[indexOnPlane] = double(precursorStructs[level]->dataH[lIndex(quant, pos, timestep, precursorStructs[level]->nQuantities, precursorStructs[level]->nPoints)]);
+            }
+        }
+        nodedata.push_back(doubleArr);
+    }
+
+    std::vector<std::vector<double>> celldata;
+    getWriter()->writeData(wholeName, nodedatanames, celldatanames, nodedata, celldata, extent, origin, precursorStructs[level]->spacing, extent);
+    precursorStructs[level]->filesWritten++;
+}
+
+std::string PrecursorWriter::makeFileName(std::string fileName, int level, int id, uint filesWritten)
+{
+    fileName += "_lev_" + StringUtil::toString<int>(level); // fileName is a by-value copy, so it is extended in place
+    fileName += "_ID_" + StringUtil::toString<int>(id);
+    return fileName + "_File_" + StringUtil::toString<int>(filesWritten);
+}
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/PrecursorWriter.h b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/PrecursorWriter.h
new file mode 100644
index 0000000000000000000000000000000000000000..68f69d8122c33f7283783cf002596e0b03d31513
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/PrecursorWriter.h
@@ -0,0 +1,119 @@
+#ifndef PRECURSORPROBE_H_
+#define PRECURSORPROBE_H_
+
+#include <future>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "PreCollisionInteractor.h"
+#include "WbWriterVtkXmlImageBinary.h"
+#include "LBM/LB.h"
+#include "PointerDefinitions.h"
+
+
+class Parameter;
+class CudaMemoryManager;
+class GridProvider;
+
+//! \brief Selects which quantity a PrecursorWriter records.
+//! The choice determines the node data names produced by PrecursorWriter::determineNodeDataNames().
+enum class OutputVariable {
+    //! - Velocities: written as the three arrays "vx", "vy", "vz"
+    Velocities,
+    //! - Distributions: written as the nine arrays "fP00" ... "fPMM"
+    Distributions    
+};
+
+//! Consecutive indices 0..8 for the nine distributions handled by the precursor.
+//! Their order matches the node data names "fP00", "fPP0", "fPM0", "fP0P", "fP0M", "fPPP", "fPMP", "fPPM", "fPMM"
+//! returned by PrecursorWriter::determineNodeDataNames() for OutputVariable::Distributions.
+//! NOTE(review): presumably used as the quantity index when storing distributions -- confirm in PrecursorWriter.cu.
+static constexpr uint PrecP00 = 0;
+static constexpr uint PrecPP0 = 1;
+static constexpr uint PrecPM0 = 2;
+static constexpr uint PrecP0P = 3;
+static constexpr uint PrecP0M = 4;
+static constexpr uint PrecPPP = 5;
+static constexpr uint PrecPMP = 6;
+static constexpr uint PrecPPM = 7;
+static constexpr uint PrecPMM = 8;
+
+//! \brief Per-level bookkeeping and data buffers of a PrecursorWriter.
+struct PrecursorStruct
+{
+    // Point counts and file/buffer counters; filesWritten is incremented by PrecursorWriter::write after each file.
+    uint nPoints, nPointsInPlane, timestepsPerFile, filesWritten, timestepsBuffered;
+    // NOTE(review): the H/D suffixes presumably denote host and device copies of the same array -- confirm in PrecursorWriter.cu.
+    uint *indicesH, *indicesD;
+    // dataH is the array PrecursorWriter::write reads from when filling the output arrays.
+    real *dataH, *dataD;
+    real *bufferH, *bufferD;
+    // Number of quantities stored per point; PrecursorWriter::write emits one node data array per quantity.
+    uint nQuantities;
+    // extent, origin and spacing describe the image grid that PrecursorWriter::write hands to the VTK image writer.
+    UbTupleInt4 extent;
+    UbTupleFloat2 origin;
+    UbTupleFloat3 spacing;
+    // For every recorded point, its index inside the output plane; used by write to place values in the image array.
+    int* indicesOnPlane;
+    // NOTE(review): presumably the CUDA stream used for this level's copies -- confirm.
+    cudaStream_t stream;
+};
+
+//! \brief PreCollisionInteractor that records precursor data (velocities or distributions) and writes it to VTK image files.
+//! \note Presumably samples the plane x = xPos within [yMin, yMax] x [zMin, zMax] -- confirm against PrecursorWriter.cu.
+class PrecursorWriter : public PreCollisionInteractor
+{
+public:
+    //! \param _fileName            prefix of the output file names (see makeFileName)
+    //! \param _outputPath          directory the files are written to
+    //! \param _xPos                x position of the recorded plane (presumably in grid coordinates -- confirm)
+    //! \param _yMin, _yMax         y bounds of the recorded region
+    //! \param _zMin, _zMax         z bounds of the recorded region
+    //! \param _tStartOut           presumably the first timestep at which data is recorded -- confirm
+    //! \param _tSave               presumably the interval between recorded timesteps -- confirm
+    //! \param _outputVariable      quantity to record; fixes the node data names
+    //! \param _maxTimestepsPerFile upper bound of timesteps stored in one file (default 1e4)
+    //! \throws std::runtime_error  if _outputVariable is not a known OutputVariable
+    PrecursorWriter(
+        const std::string _fileName,
+        const std::string _outputPath,
+        real _xPos,
+        real _yMin, real _yMax,
+        real _zMin, real _zMax,
+        uint _tStartOut,
+        uint _tSave,
+        OutputVariable _outputVariable,
+        uint _maxTimestepsPerFile=uint(1e4)
+    ): 
+    // Initializers are listed in member declaration order, which is the order in which they are executed.
+    fileName(_fileName), 
+    outputPath(_outputPath), 
+    tStartOut(_tStartOut), 
+    tSave(_tSave),
+    maxtimestepsPerFile(_maxTimestepsPerFile),
+    xPos(_xPos),
+    yMin(_yMin),
+    yMax(_yMax),
+    zMin(_zMin),
+    zMax(_zMax),
+    outputVariable(_outputVariable)
+    {
+        // Assigned in the body on purpose: nodedatanames is declared before outputVariable,
+        // so a member initializer would read outputVariable before it is initialized.
+        nodedatanames = determineNodeDataNames();
+        // Launch a no-op task so that writeFuture holds a valid shared state from the start.
+        writeFuture = std::async([](){});
+    }
+
+    void init(Parameter* para, GridProvider* gridProvider, CudaMemoryManager* cudaManager) override;
+    void interact(Parameter* para, CudaMemoryManager* cudaManager, int level, uint t) override;
+    void free(Parameter* para, CudaMemoryManager* cudaManager) override;
+
+    //! \return the quantity this writer was configured to record
+    OutputVariable getOutputVariable(){ return this->outputVariable; }
+
+    //! \return the bookkeeping struct of the given level; level is not range-checked
+    SPtr<PrecursorStruct> getPrecursorStruct(int level){return precursorStructs[level];}
+    //! \return fileName + "_lev_<level>_ID_<id>_File_<part>"
+    static std::string makeFileName(std::string fileName, int level, int id, uint part);
+    
+private:
+    //! \return the shared VTK XML image writer instance
+    WbWriterVtkXmlImageBinary* getWriter(){ return WbWriterVtkXmlImageBinary::getInstance(); }
+    void write(Parameter* para, int level, uint timestepsBuffered);
+
+    //! \brief Maps outputVariable to the names of the node data arrays in the output files.
+    //! \throws std::runtime_error for an OutputVariable value without a name list
+    std::vector<std::string> determineNodeDataNames()
+    {
+        switch (outputVariable)
+        {
+        case OutputVariable::Velocities:
+            return {"vx", "vy", "vz"};
+        case OutputVariable::Distributions:
+            return {"fP00", "fPP0", "fPM0", "fP0P", "fP0M", "fPPP", "fPMP", "fPPM", "fPMM"};
+        default:
+            throw std::runtime_error("Invalid OutputVariable for PrecursorWriter");
+        }
+    }
+
+private:
+    std::vector<SPtr<PrecursorStruct>> precursorStructs; // one entry per grid level, accessed by level index
+    std::string fileName, outputPath;
+    std::vector<std::string> nodedatanames;
+    std::vector<std::string> celldatanames;             // never filled in this header; passed to the writer as is
+    uint tStartOut, tSave, maxtimestepsPerFile;
+    real xPos, yMin, yMax, zMin, zMax;
+    OutputVariable outputVariable;
+    std::future<void> writeFuture;
+};
+
+#endif //PRECURSORPROBE_H_
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PlaneProbe.h b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PlaneProbe.h
index 3440c01020f9b3505be7148024e47373b76648ff..92b1923881526f631cdef1e7c1543d25997cb82f 100644
--- a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PlaneProbe.h
+++ b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PlaneProbe.h
@@ -54,7 +54,7 @@ public:
     ): Probe(_probeName, 
              _outputPath,
              _tStartAvg, 
-             0,
+             _tStartAvg+1,
              _tAvg,
              _tStartOut, 
              _tOut,
diff --git a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/Probe.cu b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/Probe.cu
index cc027b07bded01455437e65e08ccdcd51bcf7dc0..0d42c5030363b6c0b3b67db0ed7c75f1ba3ab729 100644
--- a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/Probe.cu
+++ b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/Probe.cu
@@ -329,6 +329,22 @@ void Probe::addStatistic(Statistic variable)
     }
 }
 
+//! \brief Composes the name of the parallel (collection) file of this probe.
+//! \param id process id, appended after "_bin_ID_"
+//! \param t  time label, appended after "_t_"
+//! \return probeName + "_bin_ID_<id>_t_<t>.vtk" (without the output path)
+std::string Probe::makeParallelFileName(int id, int t)
+{
+    std::string composedName = this->probeName;
+
+    composedName += "_bin_ID_";
+    composedName += StringUtil::toString<int>(id);
+
+    composedName += "_t_";
+    composedName += StringUtil::toString<int>(t);
+
+    composedName += ".vtk";
+    return composedName;
+}
+
+//! \brief Composes the name of one grid (piece) file of this probe.
+//! \param level grid level, appended after "_bin_lev_"
+//! \param id    process id, appended after "_ID_"
+//! \param t     time label, appended after "_t_"
+//! \param part  part number of the piece, appended after "_Part_"
+//! \return probeName + "_bin_lev_<level>_ID_<id>_Part_<part>_t_<t>.vtk" (without the output path)
+std::string Probe::makeGridFileName(int level, int id, int t, uint part)
+{
+    std::string composedName = this->probeName;
+
+    composedName += "_bin_lev_";
+    composedName += StringUtil::toString<int>(level);
+
+    composedName += "_ID_";
+    composedName += StringUtil::toString<int>(id);
+
+    composedName += "_Part_";
+    composedName += StringUtil::toString<int>(part);
+
+    composedName += "_t_";
+    composedName += StringUtil::toString<int>(t);
+
+    composedName += ".vtk";
+    return composedName;
+}
+
 void Probe::addAllAvailableStatistics()
 {
     for( int var=0; var < int(Statistic::LAST); var++)
@@ -347,119 +363,75 @@ void Probe::write(Parameter* para, int level, int t)
     std::vector<std::string> fnames;
     for (uint i = 1; i <= numberOfParts; i++)
 	{
-        std::string fname = this->probeName + "_bin_lev_" + StringUtil::toString<int>(level)
-                                         + "_ID_" + StringUtil::toString<int>(para->getMyProcessID())
-                                         + "_Part_" + StringUtil::toString<int>(i);
-        if(!this->outputTimeSeries) fname += "_t_" + StringUtil::toString<int>(t_write);
-        fname += ".vtk";
-		fnames.push_back(fname);
-        this->fileNamesForCollectionFile.push_back(fname);
+        this->writeGridFile(para, level, t_write, i);
     }
-    this->writeGridFiles(para, level, fnames, t);
-
-    if(level == 0 && !this->outputTimeSeries) this->writeCollectionFile(para, t);
+    if(level == 0&& !this->outputTimeSeries) this->writeParallelFile(para, t);
 }
 
-void Probe::writeCollectionFile(Parameter* para, int t)
+//! \brief Writes the parallel file that references all grid files collected since the last call.
+//! The file name is outputPath + "/" + makeParallelFileName(process id, t_write), where t_write is
+//! t itself if fileNameLU is set and t/tOut otherwise. The actual file is produced by
+//! getWriter()->writeParallelFile; afterwards the list of collected grid file names is cleared.
+//! \param para used only to query the process id
+//! \param t    current timestep
+//! NOTE(review): `varNames` is not declared in the visible part of this file -- confirm it is a member of Probe.
+void Probe::writeParallelFile(Parameter* para, int t)
 {
     int t_write = this->fileNameLU ? t: t/this->tOut; 
-    std::string filename = this->probeName + "_bin_ID_" + StringUtil::toString<int>(para->getMyProcessID()) 
-                                           + "_t_" + StringUtil::toString<int>(t_write) 
-                                           + ".vtk";
-
-    std::ofstream file;
-
-    file.open(this->outputPath + "/" + filename + ".pvtu" );
-
-    //////////////////////////////////////////////////////////////////////////
-    
-    file << "<VTKFile type=\"PUnstructuredGrid\" version=\"1.0\" byte_order=\"LittleEndian\" header_type=\"UInt64\">" << std::endl;
-    file << "  <PUnstructuredGrid GhostLevel=\"1\">" << std::endl;
-
-    file << "    <PPointData>" << std::endl;
-
-    for(std::string varName: this->getVarNames()) //TODO
-    {
-        file << "       <DataArray type=\"Float64\" Name=\""<< varName << "\" /> " << std::endl;
-    }
-    file << "    </PPointData>" << std::endl;
+    std::string filename = this->outputPath + "/" + this->makeParallelFileName(para->getMyProcessID(), t_write);
 
-    file << "    <PPoints>" << std::endl;
-    file << "      <PDataArray type=\"Float32\" Name=\"Points\" NumberOfComponents=\"3\"/>" << std::endl;
-    file << "    </PPoints>" << std::endl;
+    // No cell data is written by probes, so an empty name list is passed.
+    std::vector<std::string> cellNames;
 
-    for( auto& fname : this->fileNamesForCollectionFile )
-    {
-        const auto filenameWithoutPath=fname.substr( fname.find_last_of('/') + 1 );
-        file << "    <Piece Source=\"" << filenameWithoutPath << ".bin.vtu\"/>" << std::endl;
-    }
-
-    file << "  </PUnstructuredGrid>" << std::endl;
-    file << "</VTKFile>" << std::endl;
-
-    //////////////////////////////////////////////////////////////////////////
-
-    file.close();
+    getWriter()->writeParallelFile(filename, fileNamesForCollectionFile, varNames, cellNames);
 
     this->fileNamesForCollectionFile.clear();
 }
 
-void Probe::writeGridFiles(Parameter* para, int level, std::vector<std::string>& fnames, int t)
+//! \brief Writes one part of the probe data of a level to a VTK file and remembers the written file name.
+//! The node range of the part is [(part-1)*limit, (part-1)*limit + sizeOfNodes), where limit is
+//! para->getlimitOfNodesForVTK(). For every enabled statistic the values are scaled with the
+//! conversion factor of the respective post-processing variable before they are written.
+//! The name returned by the writer is appended to fileNamesForCollectionFile.
+//! \param para  provides the process id and the node limit per VTK file
+//! \param level grid level whose ProbeStruct is written
+//! \param t     time label used in the file name
+//! \param part  1-based part number
+void Probe::writeGridFile(Parameter* para, int level, int t, uint part)
 {
+    std::string fname = this->outputPath + "/" + this->makeGridFileName(level, para->getMyProcessID(), t, part);
+
     std::vector< UbTupleFloat3 > nodes;
     std::vector< std::string > nodedatanames = this->getVarNames();
 
-    uint startpos = 0;
-    uint endpos = 0;
-    uint sizeOfNodes = 0;
     std::vector< std::vector< double > > nodedata(nodedatanames.size());
 
     SPtr<ProbeStruct> probeStruct = this->getProbeStruct(level);
 
-    for (uint part = 0; part < fnames.size(); part++)
-    {        
-        startpos = part * para->getlimitOfNodesForVTK();
-        uint nDataPoints = this->outputTimeSeries? this->tProbe: probeStruct->nPoints;
-        sizeOfNodes = min(para->getlimitOfNodesForVTK(), nDataPoints - startpos);
-        endpos = startpos + sizeOfNodes;
+    // part counts from 1, hence the shift by one when computing the first node of this part.
+    // NOTE(review): nPoints - startpos is an unsigned subtraction; assumes startpos <= nPoints -- confirm against the caller.
+    uint startpos = (part-1) * para->getlimitOfNodesForVTK();
+    uint sizeOfNodes = min(para->getlimitOfNodesForVTK(), probeStruct->nPoints - startpos);
+    uint endpos = startpos + sizeOfNodes;
 
-        //////////////////////////////////////////////////////////////////////////
-        nodes.resize(sizeOfNodes);
+    //////////////////////////////////////////////////////////////////////////
+    nodes.resize(sizeOfNodes);
 
-        for (uint pos = startpos; pos < endpos; pos++)
-        {
-            nodes[pos-startpos] = makeUbTuple(  float(probeStruct->pointCoordsX[pos]),
-                                                float(probeStruct->pointCoordsY[pos]),
-                                                float(probeStruct->pointCoordsZ[pos]));
-        }
+    for (uint pos = startpos; pos < endpos; pos++)
+    {
+        nodes[pos-startpos] = makeUbTuple(  float(probeStruct->pointCoordsX[pos]),
+                                            float(probeStruct->pointCoordsY[pos]),
+                                            float(probeStruct->pointCoordsZ[pos]));
+    }
 
-        for( auto it=nodedata.begin(); it!=nodedata.end(); it++) it->resize(sizeOfNodes);
+    for( auto it=nodedata.begin(); it!=nodedata.end(); it++) it->resize(sizeOfNodes);
 
-        for( int var=0; var < int(Statistic::LAST); var++){           
-            if(this->quantities[var])
-            {
-                Statistic statistic = static_cast<Statistic>(var);
-                real coeff;
+    for( int var=0; var < int(Statistic::LAST); var++){           
+        if(this->quantities[var])
+        {
+            Statistic statistic = static_cast<Statistic>(var);
+            real coeff;
+
+            std::vector<PostProcessingVariable> postProcessingVariables = this->getPostProcessingVariables(statistic);
+            uint n_arrs = uint(postProcessingVariables.size());
 
-                std::vector<PostProcessingVariable> postProcessingVariables = this->getPostProcessingVariables(statistic);
-                uint n_arrs = uint(postProcessingVariables.size());
+            // Each array of a statistic occupies nPoints consecutive entries of quantitiesArrayH, starting at array index arrOff.
+            uint arrOff = probeStruct->arrayOffsetsH[var];
+            uint arrLen = probeStruct->nPoints;
 
-                uint arrOff = probeStruct->arrayOffsetsH[var];
-                uint arrLen = probeStruct->nPoints;
+            for(uint arr=0; arr<n_arrs; arr++)
+            {
+                coeff = postProcessingVariables[arr].conversionFactor(level);
                 
-                for(uint arr=0; arr<n_arrs; arr++)
+                for (uint pos = startpos; pos < endpos; pos++)
                 {
-                    coeff = postProcessingVariables[arr].conversionFactor(level);
-                    
-                    for (uint pos = startpos; pos < endpos; pos++)
-                    {
-                        nodedata[arrOff+arr][pos-startpos] = double(probeStruct->quantitiesArrayH[(arrOff+arr)*arrLen+pos]*coeff);
-                    }
+                    nodedata[arrOff+arr][pos-startpos] = double(probeStruct->quantitiesArrayH[(arrOff+arr)*arrLen+pos]*coeff);
                 }
             }
         }
-        WbWriterVtkXmlBinary::getInstance()->writeNodesWithNodeData(this->outputPath + "/" + fnames[part], nodes, nodedatanames, nodedata);
     }
+    
+    // The writer returns the name of the file it wrote; it is kept for the parallel file written later.
+    this->fileNamesForCollectionFile.push_back(getWriter()->writeNodesWithNodeData(fname, nodes, nodedatanames, nodedata));
 }
 
 std::vector<std::string> Probe::getVarNames()
diff --git a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/Probe.h b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/Probe.h
index 9cb0bd43e27fb7a28cae9c363ce245fbd9cc5677..4facdca87af55b57db85eeb0686e9e46c0771f47 100644
--- a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/Probe.h
+++ b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/Probe.h
@@ -49,6 +49,7 @@
 
 #include "PreCollisionInteractor/PreCollisionInteractor.h"
 #include "PointerDefinitions.h"
+#include "WbWriterVtkXmlBinary.h"
 
 //=======================================================================================
 //! \note How to add new Statistics 
@@ -152,8 +153,8 @@ public:
         outputTimeSeries(_outputTimeSeries),        
         PreCollisionInteractor()
     {
-        if (_tStartOut<_tStartAvg)      throw std::runtime_error("Probe: tStartOut must be larger than tStartAvg!");
-        if (_tStartTmpAvg<_tStartAvg)   throw std::runtime_error("Probe: tStartTmpAvg must be larger than tStartAvg!");
+        if (_tStartOut<_tStartAvg)      throw std::runtime_error(_probeName + ": tStartOut must be larger than tStartAvg!");
+        if (_tStartTmpAvg<_tStartAvg)   throw std::runtime_error(_probeName + ": tStartTmpAvg must be larger than tStartAvg!");
     }
     
     void init(Parameter* para, GridProvider* gridProvider, CudaMemoryManager* cudaMemoryManager) override;
@@ -171,6 +172,8 @@ public:
     void setFileNameToNOut(){this->fileNameLU = false;}
     void setTStartTmpAveraging(uint _tStartTmpAveraging){this->tStartTmpAveraging = _tStartTmpAveraging;}
 
+protected:
+    //! \brief Returns the VTK writer used for this probe's output files (the shared WbWriterVtkXmlBinary instance).
+    //! Declared virtual, so derived probes can return a different writer.
+    virtual WbWriterVtkXmlBinary* getWriter(){ return WbWriterVtkXmlBinary::getInstance(); };
     real getNondimensionalConversionFactor(int level);
 
 private:
@@ -188,12 +191,15 @@ private:
                         int level);
     virtual void calculateQuantities(SPtr<ProbeStruct> probeStruct, Parameter* para, uint t, int level) = 0;
 
-    void write(Parameter* para, int level, int t);
-    void writeCollectionFile(Parameter* para, int t);
-    void writeGridFiles(Parameter* para, int level, std::vector<std::string >& fnames, int t);
+    virtual void write(Parameter* para, int level, int t);
+    virtual void writeParallelFile(Parameter* para, int t);
+    virtual void writeGridFile(Parameter* para, int level, int t, uint part);
+
     std::vector<std::string> getVarNames();
-    
-private:
+    std::string makeGridFileName(int level, int id, int t, uint part);
+    std::string makeParallelFileName(int id, int t);
+
+protected:
     const std::string probeName;
     const std::string outputPath;
 
@@ -215,7 +221,6 @@ protected:
 
     uint tProbe = 0; //!> counter for number of probe evaluations. Only used when outputting timeseries
 
-
     std::function<real(int)> velocityRatio;
     std::function<real(int)> densityRatio;
     std::function<real(int)> forceRatio;
diff --git a/src/lbm/constants/NumericConstants.h b/src/lbm/constants/NumericConstants.h
index 4918d49aaa0431de639ea8ba3320c4fa45e539d4..1a1350604bf23936cfe091a0291a0f3392697315 100644
--- a/src/lbm/constants/NumericConstants.h
+++ b/src/lbm/constants/NumericConstants.h
@@ -18,6 +18,7 @@ static constexpr double c1o8 = 0.125;
 static constexpr double c1o9 = 0.111111111111111;
 static constexpr double c2o9 = 0.222222222222222;
 static constexpr double c4o9 = 0.444444444444444;
+static constexpr double c4o10 = 0.4;
 static constexpr double c1o10 = 0.1;
 static constexpr double c1o12 = 0.083333333333333;
 static constexpr double c1o16 = 0.0625;
@@ -99,15 +100,15 @@ static constexpr double c72o1 = 72.;
 static constexpr double c84o1 = 84.;
 static constexpr double c88o1 = 88.;
 static constexpr double c96o1 = 96.;
-static constexpr double c100o1 = 10.;
-static constexpr double c130o1 = 13.;
-static constexpr double c152o1 = 15.;
-static constexpr double c166o1 = 16.;
-static constexpr double c195o1 = 19.;
-static constexpr double c216o1 = 21.;
-static constexpr double c264o1 = 26.;
-static constexpr double c290o1 = 29.;
-static constexpr double c367o1 = 36.;
+static constexpr double c100o1 = 100.;
+static constexpr double c130o1 = 130.;
+static constexpr double c152o1 = 152.;
+static constexpr double c166o1 = 166.;
+static constexpr double c195o1 = 195.;
+static constexpr double c216o1 = 216.;
+static constexpr double c264o1 = 264.;
+static constexpr double c290o1 = 290.;
+static constexpr double c367o1 = 367.;
 
 static constexpr double Op0000002 = 0.0000002;
 static constexpr double c10eM30 = 1e-30;
@@ -132,6 +133,7 @@ static constexpr float c1o8 = 0.125f;
 static constexpr float c1o9 = (1.0f / 9.0f);
 static constexpr float c2o9 = (2.0f / 9.0f);
 static constexpr float c4o9 = (4.0f / 9.0f);
+static constexpr float c4o10 = 0.4f;
 static constexpr float c1o10 = 0.1f;
 static constexpr float c1o12 = (1.0f / 12.0f);
 static constexpr float c1o16 = 0.0625f;
diff --git a/utilities/setup_builder.py b/utilities/setup_builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..821d72ede650937a5fa2873505fd2898164a239e
--- /dev/null
+++ b/utilities/setup_builder.py
@@ -0,0 +1,35 @@
+from setuptools import build_meta
+
+class builder(build_meta._BuildMetaBackend):
+
+    def run_setup(self, setup_script='setup.py'):
+        # Note that we can reuse our build directory between calls
+        # Correctness comes first, then optimization later
+        __file__ = setup_script
+        __name__ = '__main__'
+
+        with build_meta._open_setup_script(__file__) as f:
+            code = f.read().replace(r'\r\n', r'\n')
+        args = locals()
+        args["cmake_args"] = self.extra_args
+        exec(code, args)
+
+
+    def add_settings(self, config_settings):
+        self.extra_args = dict()
+        print(config_settings)
+        if config_settings:
+            self.extra_args = {k:v for k,v in config_settings.items() if k[:2] == "-D"}
+
+    def build_wheel(self, wheel_directory, config_settings=None,
+                    metadata_directory=None):
+        self.add_settings(config_settings)
+        return super().build_wheel(wheel_directory, config_settings, metadata_directory)
+
+    def build_sdist(self, sdist_directory, config_settings=None):
+        self.add_settings(config_settings)
+        return super().build_wheel(sdist_directory, config_settings)
+
+# One shared backend instance; its bound methods are exported at module level
+# under the hook names build_wheel and build_sdist.
+build = builder()
+build_wheel = build.build_wheel
+build_sdist = build.build_sdist
\ No newline at end of file