diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 7534033e923c90154cbc872c71e9ffd0057398a1..b84d22038b3159fd5aedb951fb48c35b99babe94 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,4 +1,4 @@
-image: irmb/virtualfluids-python-deps
+image: irmb/virtualfluids-python-deps:latest
 
 stages:
   - build
@@ -11,7 +11,6 @@ stages:
 ###############################################################################
 ##                                Builds                                     ##
 ###############################################################################
-
 .gnu_build_template:
   stage: build
 
@@ -42,8 +41,8 @@ stages:
     - mkdir -p $CI_PROJECT_DIR/$BUILD_FOLDER
     - cd $CI_PROJECT_DIR/$BUILD_FOLDER
     - cmake ..
-      -DBUILD_VF_CPU=ON
-      -DBUILD_VF_GPU=ON
+      --preset=all_make
+      -DCMAKE_CUDA_ARCHITECTURES=60
     - cmake . -LAH
     - make -j4
 
@@ -55,6 +54,7 @@ gcc_9:
   extends: .gnu_build_template
 
 ###############################################################################
+
 clang_10:
   extends: .gnu_build_template
 
@@ -83,11 +83,8 @@ gcc_9_rebuild:
     - cd $CI_PROJECT_DIR/build
     - rm -r -f ./*
     - cmake ..
-      -DBUILD_VF_CPU=ON
-      -DBUILD_VF_GPU=ON
-      -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
-      -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache
-      -DCMAKE_C_COMPILER_LAUNCHER=ccache
+      --preset=all_make_ccache
+      -DCMAKE_CUDA_ARCHITECTURES=60
     - make -j4  2>&1 | tee gcc_warnings.txt
     - ccache -s
 
@@ -100,6 +97,37 @@ gcc_9_rebuild:
     paths:
       - $CI_PROJECT_DIR/cache
 
+
+###############################################################################
+gcc_9_cpu_warning_like_errors:
+  stage: build
+
+  image: irmb/virtualfluids-deps-ubuntu20.04
+
+  tags:
+    - gpu
+    - linux
+
+  before_script:
+    - export CCACHE_BASEDIR=$CI_PROJECT_DIR
+    - export CCACHE_DIR=$CI_PROJECT_DIR/cache
+    - ccache -s
+
+  script:
+    - mkdir -p $CI_PROJECT_DIR/build
+    - cd $CI_PROJECT_DIR/build
+    - rm -r -f ./*
+    - cmake ..
+      --preset=cpu_make_ccache
+      -DBUILD_WARNINGS_AS_ERRORS=ON
+    - make -j4
+    - ccache -s
+
+  cache:
+    key: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG"
+    paths:
+      - $CI_PROJECT_DIR/cache
+
 ###############################################################################
 msvc_16:
   stage: build
@@ -109,12 +137,8 @@ msvc_16:
     - gpu
 
   variables:
-    CMAKE_GENERATOR: "Visual Studio 16 2019"
     BUILD_CONFIGURATION: "Release"
     BUILD_FOLDER: "build"
-    BUILD_VF_CPU: "ON"
-    BUILD_VF_GPU: "ON"
-    BUILD_VF_UNIT_TESTS: "ON"
 
   # add cmake and MSBuild.exe to the path.
   # This Needs to be adapted when moved to a new build machine.
@@ -129,7 +153,7 @@ msvc_16:
     - cd $CI_PROJECT_DIR
     - md -force $env:BUILD_FOLDER
     - cd $env:BUILD_FOLDER
-    - cmake .. -DBUILD_VF_CPU=$env:BUILD_VF_CPU -DBUILD_VF_GPU=$env:BUILD_VF_GPU -DBUILD_VF_UNIT_TESTS=$env:BUILD_VF_UNIT_TESTS -G "$env:CMAKE_GENERATOR" ..
+    - cmake .. --preset=all_msvc
     - MSBuild.exe VirtualFluids.sln /property:Configuration=$env:BUILD_CONFIGURATION /verbosity:minimal /maxcpucount:4
 
   cache:
@@ -144,6 +168,11 @@ msvc_16:
 ###############################################################################
 build_singularity_image:
   stage: build
+
+  tags:
+    - priviliged
+    - linux
+
   rules:
     - if: $CI_COMMIT_TAG
 
@@ -152,13 +181,24 @@ build_singularity_image:
       - Containers/VirtualFluidsOpenMPI.sif
 
   script:
-    - singularity build Containers/VirtualFluidsOpenMPI.sif Containers/VirtualFluidsOpenMPI.def
+    - singularity build --fakeroot Containers/VirtualFluidsOpenMPI.sif Containers/VirtualFluidsOpenMPI.def
     - ls -sh Containers/VirtualFluidsOpenMPI.sif
 
 ###############################################################################
 ##                                Tests                                      ##
 ###############################################################################
+gcc_9_unit_tests:
+  stage: test
+
+  needs: ["gcc_9"]
 
+  before_script:
+    - cd $CI_PROJECT_DIR/build
+
+  script:
+    - ./bin/basicsTests
+
+###############################################################################
 msvc_16_unit_tests:
   stage: test
 
@@ -203,7 +243,7 @@ clang_build_analyzer_clang_10:
     - cmake ..
       -DBUILD_VF_CPU=ON
       -DBUILD_VF_GPU=ON
-      -DUSE_OPENMP=OFF
+      -DCMAKE_CUDA_ARCHITECTURES=60
       -DCMAKE_CXX_FLAGS=-ftime-trace
     - ClangBuildAnalyzer --start .
     - make
@@ -238,7 +278,7 @@ include_what_you_use_clang_10:
     - cmake ..
       -DBUILD_VF_CPU=ON
       -DBUILD_VF_GPU=ON
-      -DUSE_OPENMP=OFF
+      -DCMAKE_CUDA_ARCHITECTURES=60
       -DBUILD_VF_INCLUDE_WHAT_YOU_USE=ON
     - make
 
@@ -246,6 +286,9 @@ include_what_you_use_clang_10:
 cppcheck:
   stage: analyze
 
+  only:
+    - open_source@irmb/VirtualFluids_dev
+
   image: irmb/virtualfluids-deps-ubuntu20.04
 
   needs: []
@@ -272,6 +315,9 @@ cppcheck:
 lizard:
   stage: analyze
 
+  only:
+    - open_source@irmb/VirtualFluids_dev
+
   image: irmb/virtualfluids-python-deps-ubuntu20.04
 
   needs: []
@@ -285,7 +331,7 @@ lizard:
 
   script:
     - cd $CI_PROJECT_DIR
-    - lizard -l cpp src/ > lizard.txt --ignore_warnings 191
+    - lizard -l cpp src/ > lizard.txt --warnings_only --ignore_warnings 400
 
   artifacts:
     paths:
@@ -296,6 +342,9 @@ lizard:
 gcov_gcc_9:
   stage: analyze
 
+  only:
+    - open_source@irmb/VirtualFluids_dev
+
   image: irmb/virtualfluids-python-deps-ubuntu20.04
 
   needs: []
@@ -310,10 +359,9 @@ gcov_gcc_9:
     - mkdir -p $CI_PROJECT_DIR/build
     - cd $CI_PROJECT_DIR/build
     - cmake ..
-      -DBUILD_VF_CPU=ON
-      -DBUILD_VF_GPU=ON
+      --preset=all_make
+      -DCMAKE_CUDA_ARCHITECTURES=60
       -DBUILD_VF_COVERAGE=ON
-      -DBUILD_VF_UNIT_TESTS=ON
     - make -j4
     - ./bin/basicsTests
     - cd ..
@@ -338,6 +386,9 @@ gcov_gcc_9:
 clang-tidy:
   stage: analyze
 
+  only:
+    - open_source@irmb/VirtualFluids_dev
+
   image: irmb/virtualfluids-python-deps-ubuntu20.04
 
   needs: []
@@ -369,6 +420,8 @@ clang-tidy:
 # the reports in this file needs to match the artifacts.
 sonar-scanner:
   stage: deploy
+  tags:
+    - linux
 
   only:
     - open_source@irmb/VirtualFluids_dev
@@ -412,4 +465,4 @@ create_release:
       --ref "$CI_COMMIT_SHA" \
       --job-token "$CI_JOB_TOKEN" \
       --assets-link="{'name':'VirtualFluidsSingularityImage_OpenMPI','url':'','type':'other','filepath':'Containers/VirtualFluidsOpenMPI.sif'}"
-    - build/bin/basicsTests
\ No newline at end of file
+    - build/bin/basicsTests
diff --git a/3rdParty/MuParser/CMakeLists.txt b/3rdParty/MuParser/CMakeLists.txt
index 49aa3ac25552f6050de4eb5ae08928fddb8a6cf4..1634a2f15a71aeec53bd0ffb5f14c22aec7893aa 100644
--- a/3rdParty/MuParser/CMakeLists.txt
+++ b/3rdParty/MuParser/CMakeLists.txt
@@ -14,15 +14,9 @@ set(MUPARSER_VERSION ${MUPARSER_VERSION_MAJOR}.${MUPARSER_VERSION_MINOR}.${MUPAR
 
 # Build options
 option(ENABLE_SAMPLES "Build the samples" OFF)
-option(ENABLE_OPENMP "Enable OpenMP for multithreading" ON)
+#option(ENABLE_OPENMP "Enable OpenMP for multithreading" ON)
 #option(BUILD_SHARED_LIBS "Build shared/static libs" ON)
 
-if(ENABLE_OPENMP)
-    find_package(OpenMP REQUIRED)
-    set(CMAKE_CXX_FLAGS "${OpenMP_CXX_FLAGS} ${CMAKE_CXX_FLAGS}")
-    set(CMAKE_SHARED_LIBRARY_CXX_FLAGS "${OpenMP_CXX_FLAGS} ${CMAKE_SHARED_LIBRARY_CXX_FLAGS}")
-endif()
-
 
 # Credit: https://stackoverflow.com/questions/2368811/how-to-set-warning-level-in-cmake/3818084
 if(MSVC)
@@ -50,7 +44,9 @@ add_library(muparser
         src/muParserTokenReader.cpp
         )
 
-target_link_libraries(muparser PUBLIC OpenMP::OpenMP_CXX)
+if(BUILD_USE_OPENMP)
+   target_link_libraries(muparser PUBLIC OpenMP::OpenMP_CXX)
+endif()
 target_include_directories(muparser PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
 
 # this compiles the "DLL" interface (C API)
@@ -66,7 +62,7 @@ if (CMAKE_BUILD_TYPE STREQUAL Debug)
     target_compile_definitions(muparser PRIVATE _DEBUG)
 endif ()
 
-if(ENABLE_OPENMP)
+if(BUILD_USE_OPENMP)
     target_compile_definitions(muparser PRIVATE MUP_USE_OPENMP)
 endif()
 set_target_properties(muparser PROPERTIES
diff --git a/3rdParty/MuParser/include/muParser.h b/3rdParty/MuParser/include/muParser.h
index ab1e21e5b2a7506593de90d4845b6c2e5e39904a..3a2bedc39180c678a63cfa81b893a339b85324f7 100644
--- a/3rdParty/MuParser/include/muParser.h
+++ b/3rdParty/MuParser/include/muParser.h
@@ -29,7 +29,9 @@
 #ifndef MU_PARSER_H
 #define MU_PARSER_H
 
+#ifdef __clang__
 #pragma clang system_header
+#endif
 
 //--- Standard includes ------------------------------------------------------------------------
 #include <vector>
diff --git a/3rdParty/MuParser/include/muParserBase.h b/3rdParty/MuParser/include/muParserBase.h
index e42aca8b121c95b3d61e4ef2414f2f0bea76cab1..5ccadd99d1d33b6e69a123480a31b82079234945 100644
--- a/3rdParty/MuParser/include/muParserBase.h
+++ b/3rdParty/MuParser/include/muParserBase.h
@@ -29,7 +29,9 @@
 #ifndef MU_PARSER_BASE_H
 #define MU_PARSER_BASE_H
 
+#ifdef __clang__
 #pragma clang system_header
+#endif
 
 //--- Standard includes ------------------------------------------------------------------------
 #include <cmath>
diff --git a/3rdParty/MuParser/include/muParserDLL.h b/3rdParty/MuParser/include/muParserDLL.h
index 18051c4d7835b266a463bd36f974d66d5729b13d..14c65b48a463c6bfb52fb92cca2daf62988df937 100644
--- a/3rdParty/MuParser/include/muParserDLL.h
+++ b/3rdParty/MuParser/include/muParserDLL.h
@@ -29,6 +29,10 @@
 #ifndef MU_PARSER_DLL_H
 #define MU_PARSER_DLL_H
 
+#ifdef __clang__
+#pragma clang system_header
+#endif
+
 #include "muParserFixes.h"
 
 #ifdef __cplusplus
diff --git a/3rdParty/cuda_samples/README b/3rdParty/cuda_samples/README
new file mode 100644
index 0000000000000000000000000000000000000000..5db13e7bda8365d792e3e68840d02c442348596e
--- /dev/null
+++ b/3rdParty/cuda_samples/README
@@ -0,0 +1,2 @@
+# 3rd party cuda
+The files in this folder were taken from the NVIDIA CUDA samples (v11.2): https://github.com/NVIDIA/cuda-samples/blob/v11.2/Common/
\ No newline at end of file
diff --git a/3rdParty/cuda_samples/exception.h b/3rdParty/cuda_samples/exception.h
new file mode 100644
index 0000000000000000000000000000000000000000..84e348b59fb6892439e8057b03093bac48cadcd4
--- /dev/null
+++ b/3rdParty/cuda_samples/exception.h
@@ -0,0 +1,151 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* CUda UTility Library */
+#ifndef COMMON_EXCEPTION_H_
+#define COMMON_EXCEPTION_H_
+
+// includes, system
+#include <stdlib.h>
+#include <exception>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+
+//! Exception wrapper whose messages carry the throwing file and line.
+//! @tparam Std_Exception exception type used as the public base class.
+template <class Std_Exception>
+class Exception : public Std_Exception {
+public:
+    //! @brief Static construction interface
+    //! @return Never returns normally; always throws Exception<Std_Exception>
+    //! @param file file in which the Exception occurs
+    //! @param line line in which the Exception occurs
+    //! @param detailed details on the code fragment causing the Exception
+    static void throw_it(const char *file, const int line,
+                         const char *detailed = "-");
+
+    //! @brief Static construction interface (std::string overload)
+    //! @return Never returns normally; always throws Exception<Std_Exception>
+    //! @param file file in which the Exception occurs
+    //! @param line line in which the Exception occurs
+    //! @param detailed details on the code fragment causing the Exception
+    static void throw_it(const char *file, const int line,
+                         const std::string &detailed);
+
+    //! Destructor
+    virtual ~Exception() throw();
+
+private:
+    //! Constructor, default (private)
+    Exception();
+
+    //! Constructor, standard (private)
+    //! @param str string returned by what()
+    explicit Exception(const std::string &str);
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//! Exception handler: prints ex.what() to std::cerr, then exit(EXIT_FAILURE)
+//! @param ex exception to handle (any type providing what())
+////////////////////////////////////////////////////////////////////////////////
+template <class Exception_Typ>
+inline void handleException(const Exception_Typ &ex) {
+    std::cerr << ex.what() << std::endl;
+
+    exit(EXIT_FAILURE);
+}
+
+//! Convenience macros
+
+//! Exception caused by dynamic program behavior, e.g. file does not exist
+#define RUNTIME_EXCEPTION(msg) \
+  Exception<std::runtime_error>::throw_it(__FILE__, __LINE__, msg)
+
+//! Logic exception in program, e.g. an assert failed
+#define LOGIC_EXCEPTION(msg) \
+  Exception<std::logic_error>::throw_it(__FILE__, __LINE__, msg)
+
+//! Out of range exception
+#define RANGE_EXCEPTION(msg) \
+  Exception<std::range_error>::throw_it(__FILE__, __LINE__, msg)
+
+////////////////////////////////////////////////////////////////////////////////
+//! Implementation
+
+// includes, system
+#include <sstream>
+
+////////////////////////////////////////////////////////////////////////////////
+//! Static construction interface: builds the message and throws Exception.
+//! @param file, line, detailed  location of the failing code and a description.
+////////////////////////////////////////////////////////////////////////////////
+/*static*/ template <class Std_Exception>
+void Exception<Std_Exception>::throw_it(const char *file, const int line,
+                                        const char *detailed) {
+    std::stringstream s;
+
+    // Quite heavy-weight but exceptions are not for
+    // performance / release versions
+    s << "Exception in file '" << file << "' in line " << line << "\n"
+      << "Detailed description: " << detailed << "\n";
+
+    throw Exception(s.str());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Static construction interface (std::string overload).
+//! Forwards to the const char* overload using msg.c_str().
+////////////////////////////////////////////////////////////////////////////////
+/*static*/ template <class Std_Exception>
+void Exception<Std_Exception>::throw_it(const char *file, const int line,
+                                        const std::string &msg) {
+    throw_it(file, line, msg.c_str());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Constructor, default (private).
+////////////////////////////////////////////////////////////////////////////////
+template <class Std_Exception>
+Exception<Std_Exception>::Exception() : Std_Exception("Unknown Exception.\n") {}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Constructor, standard (private).
+//! String returned by what().
+////////////////////////////////////////////////////////////////////////////////
+template <class Std_Exception>
+Exception<Std_Exception>::Exception(const std::string &s) : Std_Exception(s) {}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Destructor
+////////////////////////////////////////////////////////////////////////////////
+template <class Std_Exception>
+Exception<Std_Exception>::~Exception() throw() {}
+
+// functions, exported
+
+#endif  // COMMON_EXCEPTION_H_
\ No newline at end of file
diff --git a/3rdParty/cuda_samples/helper_cuda.h b/3rdParty/cuda_samples/helper_cuda.h
new file mode 100644
index 0000000000000000000000000000000000000000..81d3f9e76983a32daff4c2649ab0880e29f9881a
--- /dev/null
+++ b/3rdParty/cuda_samples/helper_cuda.h
@@ -0,0 +1,967 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+////////////////////////////////////////////////////////////////////////////////
+// These are CUDA Helper functions for initialization and error checking
+
+#ifndef COMMON_HELPER_CUDA_H_
+#define COMMON_HELPER_CUDA_H_
+
+#pragma once
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <helper_string.h>
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// Note, it is required that your SDK sample to include the proper header
+// files, please refer the CUDA examples for examples of the needed CUDA
+// headers, which may change depending on which CUDA functions are used.
+
+// CUDA Runtime error messages
+#ifdef __DRIVER_TYPES_H__
+static const char *_cudaGetErrorEnum(cudaError_t error) {
+  return cudaGetErrorName(error);  // name lookup is delegated to cudaGetErrorName
+}
+#endif
+
+#ifdef CUDA_DRIVER_API
+// CUDA Driver API errors: name from cuGetErrorName, or "<unknown>" if none is set
+static const char *_cudaGetErrorEnum(CUresult error) {
+  static char unknown[] = "<unknown>";
+  const char *ret = NULL;
+  cuGetErrorName(error, &ret);
+  return ret ? ret : unknown;  // ret stays NULL when no name was written to it
+}
+#endif
+
+#ifdef CUBLAS_API_H_
+// cuBLAS API errors
+static const char *_cudaGetErrorEnum(cublasStatus_t error) {
+  switch (error) {
+    case CUBLAS_STATUS_SUCCESS:
+      return "CUBLAS_STATUS_SUCCESS";
+
+    case CUBLAS_STATUS_NOT_INITIALIZED:
+      return "CUBLAS_STATUS_NOT_INITIALIZED";
+
+    case CUBLAS_STATUS_ALLOC_FAILED:
+      return "CUBLAS_STATUS_ALLOC_FAILED";
+
+    case CUBLAS_STATUS_INVALID_VALUE:
+      return "CUBLAS_STATUS_INVALID_VALUE";
+
+    case CUBLAS_STATUS_ARCH_MISMATCH:
+      return "CUBLAS_STATUS_ARCH_MISMATCH";
+
+    case CUBLAS_STATUS_MAPPING_ERROR:
+      return "CUBLAS_STATUS_MAPPING_ERROR";
+
+    case CUBLAS_STATUS_EXECUTION_FAILED:
+      return "CUBLAS_STATUS_EXECUTION_FAILED";
+
+    case CUBLAS_STATUS_INTERNAL_ERROR:
+      return "CUBLAS_STATUS_INTERNAL_ERROR";
+
+    case CUBLAS_STATUS_NOT_SUPPORTED:
+      return "CUBLAS_STATUS_NOT_SUPPORTED";
+
+    case CUBLAS_STATUS_LICENSE_ERROR:
+      return "CUBLAS_STATUS_LICENSE_ERROR";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef _CUFFT_H_
+// cuFFT API errors
+static const char *_cudaGetErrorEnum(cufftResult error) {
+  switch (error) {
+    case CUFFT_SUCCESS:
+      return "CUFFT_SUCCESS";
+
+    case CUFFT_INVALID_PLAN:
+      return "CUFFT_INVALID_PLAN";
+
+    case CUFFT_ALLOC_FAILED:
+      return "CUFFT_ALLOC_FAILED";
+
+    case CUFFT_INVALID_TYPE:
+      return "CUFFT_INVALID_TYPE";
+
+    case CUFFT_INVALID_VALUE:
+      return "CUFFT_INVALID_VALUE";
+
+    case CUFFT_INTERNAL_ERROR:
+      return "CUFFT_INTERNAL_ERROR";
+
+    case CUFFT_EXEC_FAILED:
+      return "CUFFT_EXEC_FAILED";
+
+    case CUFFT_SETUP_FAILED:
+      return "CUFFT_SETUP_FAILED";
+
+    case CUFFT_INVALID_SIZE:
+      return "CUFFT_INVALID_SIZE";
+
+    case CUFFT_UNALIGNED_DATA:
+      return "CUFFT_UNALIGNED_DATA";
+
+    case CUFFT_INCOMPLETE_PARAMETER_LIST:
+      return "CUFFT_INCOMPLETE_PARAMETER_LIST";
+
+    case CUFFT_INVALID_DEVICE:
+      return "CUFFT_INVALID_DEVICE";
+
+    case CUFFT_PARSE_ERROR:
+      return "CUFFT_PARSE_ERROR";
+
+    case CUFFT_NO_WORKSPACE:
+      return "CUFFT_NO_WORKSPACE";
+
+    case CUFFT_NOT_IMPLEMENTED:
+      return "CUFFT_NOT_IMPLEMENTED";
+
+    case CUFFT_LICENSE_ERROR:
+      return "CUFFT_LICENSE_ERROR";
+
+    case CUFFT_NOT_SUPPORTED:
+      return "CUFFT_NOT_SUPPORTED";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef CUSPARSEAPI
+// cuSPARSE API errors
+static const char *_cudaGetErrorEnum(cusparseStatus_t error) {
+  switch (error) {
+    case CUSPARSE_STATUS_SUCCESS:
+      return "CUSPARSE_STATUS_SUCCESS";
+
+    case CUSPARSE_STATUS_NOT_INITIALIZED:
+      return "CUSPARSE_STATUS_NOT_INITIALIZED";
+
+    case CUSPARSE_STATUS_ALLOC_FAILED:
+      return "CUSPARSE_STATUS_ALLOC_FAILED";
+
+    case CUSPARSE_STATUS_INVALID_VALUE:
+      return "CUSPARSE_STATUS_INVALID_VALUE";
+
+    case CUSPARSE_STATUS_ARCH_MISMATCH:
+      return "CUSPARSE_STATUS_ARCH_MISMATCH";
+
+    case CUSPARSE_STATUS_MAPPING_ERROR:
+      return "CUSPARSE_STATUS_MAPPING_ERROR";
+
+    case CUSPARSE_STATUS_EXECUTION_FAILED:
+      return "CUSPARSE_STATUS_EXECUTION_FAILED";
+
+    case CUSPARSE_STATUS_INTERNAL_ERROR:
+      return "CUSPARSE_STATUS_INTERNAL_ERROR";
+
+    case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+      return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef CUSOLVER_COMMON_H_
+// cuSOLVER API errors: maps a status to its enumerator name, else "<unknown>"
+static const char *_cudaGetErrorEnum(cusolverStatus_t error) {
+  switch (error) {
+    case CUSOLVER_STATUS_SUCCESS:
+      return "CUSOLVER_STATUS_SUCCESS";
+    case CUSOLVER_STATUS_NOT_INITIALIZED:
+      return "CUSOLVER_STATUS_NOT_INITIALIZED";
+    case CUSOLVER_STATUS_ALLOC_FAILED:
+      return "CUSOLVER_STATUS_ALLOC_FAILED";
+    case CUSOLVER_STATUS_INVALID_VALUE:
+      return "CUSOLVER_STATUS_INVALID_VALUE";
+    case CUSOLVER_STATUS_ARCH_MISMATCH:
+      return "CUSOLVER_STATUS_ARCH_MISMATCH";
+    case CUSOLVER_STATUS_MAPPING_ERROR:
+      return "CUSOLVER_STATUS_MAPPING_ERROR";
+    case CUSOLVER_STATUS_EXECUTION_FAILED:
+      return "CUSOLVER_STATUS_EXECUTION_FAILED";
+    case CUSOLVER_STATUS_INTERNAL_ERROR:
+      return "CUSOLVER_STATUS_INTERNAL_ERROR";
+    case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+      return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+    case CUSOLVER_STATUS_NOT_SUPPORTED:
+      return "CUSOLVER_STATUS_NOT_SUPPORTED";
+    case CUSOLVER_STATUS_ZERO_PIVOT:
+      return "CUSOLVER_STATUS_ZERO_PIVOT";
+    case CUSOLVER_STATUS_INVALID_LICENSE:
+      return "CUSOLVER_STATUS_INVALID_LICENSE";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef CURAND_H_
+// cuRAND API errors
+static const char *_cudaGetErrorEnum(curandStatus_t error) {
+  switch (error) {
+    case CURAND_STATUS_SUCCESS:
+      return "CURAND_STATUS_SUCCESS";
+
+    case CURAND_STATUS_VERSION_MISMATCH:
+      return "CURAND_STATUS_VERSION_MISMATCH";
+
+    case CURAND_STATUS_NOT_INITIALIZED:
+      return "CURAND_STATUS_NOT_INITIALIZED";
+
+    case CURAND_STATUS_ALLOCATION_FAILED:
+      return "CURAND_STATUS_ALLOCATION_FAILED";
+
+    case CURAND_STATUS_TYPE_ERROR:
+      return "CURAND_STATUS_TYPE_ERROR";
+
+    case CURAND_STATUS_OUT_OF_RANGE:
+      return "CURAND_STATUS_OUT_OF_RANGE";
+
+    case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
+      return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
+
+    case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
+      return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
+
+    case CURAND_STATUS_LAUNCH_FAILURE:
+      return "CURAND_STATUS_LAUNCH_FAILURE";
+
+    case CURAND_STATUS_PREEXISTING_FAILURE:
+      return "CURAND_STATUS_PREEXISTING_FAILURE";
+
+    case CURAND_STATUS_INITIALIZATION_FAILED:
+      return "CURAND_STATUS_INITIALIZATION_FAILED";
+
+    case CURAND_STATUS_ARCH_MISMATCH:
+      return "CURAND_STATUS_ARCH_MISMATCH";
+
+    case CURAND_STATUS_INTERNAL_ERROR:
+      return "CURAND_STATUS_INTERNAL_ERROR";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef NVJPEGAPI
+// nvJPEG API errors
+static const char *_cudaGetErrorEnum(nvjpegStatus_t error) {
+  switch (error) {
+    case NVJPEG_STATUS_SUCCESS:
+      return "NVJPEG_STATUS_SUCCESS";
+
+    case NVJPEG_STATUS_NOT_INITIALIZED:
+      return "NVJPEG_STATUS_NOT_INITIALIZED";
+
+    case NVJPEG_STATUS_INVALID_PARAMETER:
+      return "NVJPEG_STATUS_INVALID_PARAMETER";
+
+    case NVJPEG_STATUS_BAD_JPEG:
+      return "NVJPEG_STATUS_BAD_JPEG";
+
+    case NVJPEG_STATUS_JPEG_NOT_SUPPORTED:
+      return "NVJPEG_STATUS_JPEG_NOT_SUPPORTED";
+
+    case NVJPEG_STATUS_ALLOCATOR_FAILURE:
+      return "NVJPEG_STATUS_ALLOCATOR_FAILURE";
+
+    case NVJPEG_STATUS_EXECUTION_FAILED:
+      return "NVJPEG_STATUS_EXECUTION_FAILED";
+
+    case NVJPEG_STATUS_ARCH_MISMATCH:
+      return "NVJPEG_STATUS_ARCH_MISMATCH";
+
+    case NVJPEG_STATUS_INTERNAL_ERROR:
+      return "NVJPEG_STATUS_INTERNAL_ERROR";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef NV_NPPIDEFS_H
+// NPP API errors
+static const char *_cudaGetErrorEnum(NppStatus error) {
+  switch (error) {
+    case NPP_NOT_SUPPORTED_MODE_ERROR:
+      return "NPP_NOT_SUPPORTED_MODE_ERROR";
+
+    case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR:
+      return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR";
+
+    case NPP_RESIZE_NO_OPERATION_ERROR:
+      return "NPP_RESIZE_NO_OPERATION_ERROR";
+
+    case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY:
+      return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY";
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
+
+    case NPP_BAD_ARG_ERROR:
+      return "NPP_BAD_ARGUMENT_ERROR";
+
+    case NPP_COEFF_ERROR:
+      return "NPP_COEFFICIENT_ERROR";
+
+    case NPP_RECT_ERROR:
+      return "NPP_RECTANGLE_ERROR";
+
+    case NPP_QUAD_ERROR:
+      return "NPP_QUADRANGLE_ERROR";
+
+    case NPP_MEM_ALLOC_ERR:
+      return "NPP_MEMORY_ALLOCATION_ERROR";
+
+    case NPP_HISTO_NUMBER_OF_LEVELS_ERROR:
+      return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
+
+    case NPP_INVALID_INPUT:
+      return "NPP_INVALID_INPUT";
+
+    case NPP_POINTER_ERROR:
+      return "NPP_POINTER_ERROR";
+
+    case NPP_WARNING:
+      return "NPP_WARNING";
+
+    case NPP_ODD_ROI_WARNING:
+      return "NPP_ODD_ROI_WARNING";
+#else
+
+    // These are for CUDA 5.5 or higher
+    case NPP_BAD_ARGUMENT_ERROR:
+      return "NPP_BAD_ARGUMENT_ERROR";
+
+    case NPP_COEFFICIENT_ERROR:
+      return "NPP_COEFFICIENT_ERROR";
+
+    case NPP_RECTANGLE_ERROR:
+      return "NPP_RECTANGLE_ERROR";
+
+    case NPP_QUADRANGLE_ERROR:
+      return "NPP_QUADRANGLE_ERROR";
+
+    case NPP_MEMORY_ALLOCATION_ERR:
+      return "NPP_MEMORY_ALLOCATION_ERROR";
+
+    case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR:
+      return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
+
+    case NPP_INVALID_HOST_POINTER_ERROR:
+      return "NPP_INVALID_HOST_POINTER_ERROR";
+
+    case NPP_INVALID_DEVICE_POINTER_ERROR:
+      return "NPP_INVALID_DEVICE_POINTER_ERROR";
+#endif
+
+    case NPP_LUT_NUMBER_OF_LEVELS_ERROR:
+      return "NPP_LUT_NUMBER_OF_LEVELS_ERROR";
+
+    case NPP_TEXTURE_BIND_ERROR:
+      return "NPP_TEXTURE_BIND_ERROR";
+
+    case NPP_WRONG_INTERSECTION_ROI_ERROR:
+      return "NPP_WRONG_INTERSECTION_ROI_ERROR";
+
+    case NPP_NOT_EVEN_STEP_ERROR:
+      return "NPP_NOT_EVEN_STEP_ERROR";
+
+    case NPP_INTERPOLATION_ERROR:
+      return "NPP_INTERPOLATION_ERROR";
+
+    case NPP_RESIZE_FACTOR_ERROR:
+      return "NPP_RESIZE_FACTOR_ERROR";
+
+    case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR:
+      return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR";
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
+
+    case NPP_MEMFREE_ERR:
+      return "NPP_MEMFREE_ERR";
+
+    case NPP_MEMSET_ERR:
+      return "NPP_MEMSET_ERR";
+
+    case NPP_MEMCPY_ERR:
+      return "NPP_MEMCPY_ERROR";
+
+    case NPP_MIRROR_FLIP_ERR:
+      return "NPP_MIRROR_FLIP_ERR";
+#else
+
+    case NPP_MEMFREE_ERROR:
+      return "NPP_MEMFREE_ERROR";
+
+    case NPP_MEMSET_ERROR:
+      return "NPP_MEMSET_ERROR";
+
+    case NPP_MEMCPY_ERROR:
+      return "NPP_MEMCPY_ERROR";
+
+    case NPP_MIRROR_FLIP_ERROR:
+      return "NPP_MIRROR_FLIP_ERROR";
+#endif
+
+    case NPP_ALIGNMENT_ERROR:
+      return "NPP_ALIGNMENT_ERROR";
+
+    case NPP_STEP_ERROR:
+      return "NPP_STEP_ERROR";
+
+    case NPP_SIZE_ERROR:
+      return "NPP_SIZE_ERROR";
+
+    case NPP_NULL_POINTER_ERROR:
+      return "NPP_NULL_POINTER_ERROR";
+
+    case NPP_CUDA_KERNEL_EXECUTION_ERROR:
+      return "NPP_CUDA_KERNEL_EXECUTION_ERROR";
+
+    case NPP_NOT_IMPLEMENTED_ERROR:
+      return "NPP_NOT_IMPLEMENTED_ERROR";
+
+    case NPP_ERROR:
+      return "NPP_ERROR";
+
+    case NPP_SUCCESS:
+      return "NPP_SUCCESS";
+
+    case NPP_WRONG_INTERSECTION_QUAD_WARNING:
+      return "NPP_WRONG_INTERSECTION_QUAD_WARNING";
+
+    case NPP_MISALIGNED_DST_ROI_WARNING:
+      return "NPP_MISALIGNED_DST_ROI_WARNING";
+
+    case NPP_AFFINE_QUAD_INCORRECT_WARNING:
+      return "NPP_AFFINE_QUAD_INCORRECT_WARNING";
+
+    case NPP_DOUBLE_SIZE_WARNING:
+      return "NPP_DOUBLE_SIZE_WARNING";
+
+    case NPP_WRONG_INTERSECTION_ROI_WARNING:
+      return "NPP_WRONG_INTERSECTION_ROI_WARNING";
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000
+    /* These are 6.0 or higher */
+    case NPP_LUT_PALETTE_BITSIZE_ERROR:
+      return "NPP_LUT_PALETTE_BITSIZE_ERROR";
+
+    case NPP_ZC_MODE_NOT_SUPPORTED_ERROR:
+      return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR";
+
+    case NPP_QUALITY_INDEX_ERROR:
+      return "NPP_QUALITY_INDEX_ERROR";
+
+    case NPP_CHANNEL_ORDER_ERROR:
+      return "NPP_CHANNEL_ORDER_ERROR";
+
+    case NPP_ZERO_MASK_VALUE_ERROR:
+      return "NPP_ZERO_MASK_VALUE_ERROR";
+
+    case NPP_NUMBER_OF_CHANNELS_ERROR:
+      return "NPP_NUMBER_OF_CHANNELS_ERROR";
+
+    case NPP_COI_ERROR:
+      return "NPP_COI_ERROR";
+
+    case NPP_DIVISOR_ERROR:
+      return "NPP_DIVISOR_ERROR";
+
+    case NPP_CHANNEL_ERROR:
+      return "NPP_CHANNEL_ERROR";
+
+    case NPP_STRIDE_ERROR:
+      return "NPP_STRIDE_ERROR";
+
+    case NPP_ANCHOR_ERROR:
+      return "NPP_ANCHOR_ERROR";
+
+    case NPP_MASK_SIZE_ERROR:
+      return "NPP_MASK_SIZE_ERROR";
+
+    case NPP_MOMENT_00_ZERO_ERROR:
+      return "NPP_MOMENT_00_ZERO_ERROR";
+
+    case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR:
+      return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR";
+
+    case NPP_THRESHOLD_ERROR:
+      return "NPP_THRESHOLD_ERROR";
+
+    case NPP_CONTEXT_MATCH_ERROR:
+      return "NPP_CONTEXT_MATCH_ERROR";
+
+    case NPP_FFT_FLAG_ERROR:
+      return "NPP_FFT_FLAG_ERROR";
+
+    case NPP_FFT_ORDER_ERROR:
+      return "NPP_FFT_ORDER_ERROR";
+
+    case NPP_SCALE_RANGE_ERROR:
+      return "NPP_SCALE_RANGE_ERROR";
+
+    case NPP_DATA_TYPE_ERROR:
+      return "NPP_DATA_TYPE_ERROR";
+
+    case NPP_OUT_OFF_RANGE_ERROR:
+      return "NPP_OUT_OFF_RANGE_ERROR";
+
+    case NPP_DIVIDE_BY_ZERO_ERROR:
+      return "NPP_DIVIDE_BY_ZERO_ERROR";
+
+    case NPP_RANGE_ERROR:
+      return "NPP_RANGE_ERROR";
+
+    case NPP_NO_MEMORY_ERROR:
+      return "NPP_NO_MEMORY_ERROR";
+
+    case NPP_ERROR_RESERVED:
+      return "NPP_ERROR_RESERVED";
+
+    case NPP_NO_OPERATION_WARNING:
+      return "NPP_NO_OPERATION_WARNING";
+
+    case NPP_DIVIDE_BY_ZERO_WARNING:
+      return "NPP_DIVIDE_BY_ZERO_WARNING";
+#endif
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x7000
+    /* These are 7.0 or higher */
+    case NPP_OVERFLOW_ERROR:
+      return "NPP_OVERFLOW_ERROR";
+
+    case NPP_CORRUPTED_DATA_ERROR:
+      return "NPP_CORRUPTED_DATA_ERROR";
+#endif
+  }
+
+  return "<unknown>";
+}
+#endif
+
+// Exits with a diagnostic on stderr when `result` is a non-zero status code.
+template <typename T>
+void check(T result, char const *const func, const char *const file,
+           int const line) {
+    if (!result) return;  // a zero status means success
+    fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
+            static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
+    exit(EXIT_FAILURE);
+}
+
+#ifdef __DRIVER_TYPES_H__
+// This will output the proper CUDA error strings in the event
+// that a CUDA host call returns an error
+#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
+
+// This will output the proper error string when calling cudaGetLastError
+#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
+
+// Queries cudaGetLastError(); on an error prints a diagnostic and exits.
+inline void __getLastCudaError(const char *errorMessage, const char *file,
+                               const int line) {
+  const cudaError_t status = cudaGetLastError();
+  if (status == cudaSuccess) return;  // nothing to report
+
+  fprintf(stderr,
+          "%s(%i) : getLastCudaError() CUDA error :"
+          " %s : (%d) %s.\n",
+          file, line, errorMessage, static_cast<int>(status),
+          cudaGetErrorString(status));
+  exit(EXIT_FAILURE);
+}
+
+// This will only print the proper error string when calling cudaGetLastError
+// but will not exit the program in case an error is detected.
+#define printLastCudaError(msg) __printLastCudaError(msg, __FILE__, __LINE__)
+
+// Queries cudaGetLastError(); on an error prints a diagnostic and returns.
+inline void __printLastCudaError(const char *errorMessage, const char *file,
+                                 const int line) {
+  const cudaError_t status = cudaGetLastError();
+  if (status == cudaSuccess) return;  // nothing to report
+
+  fprintf(stderr,
+          "%s(%i) : getLastCudaError() CUDA error :"
+          " %s : (%d) %s.\n",
+          file, line, errorMessage, static_cast<int>(status),
+          cudaGetErrorString(status));
+}
+#endif
+
+#ifndef MAX  // larger of two values; note that each argument is evaluated twice
+#define MAX(a, b) ((a) > (b) ? (a) : (b))  // arguments parenthesized so operator expressions expand correctly
+#endif
+
+// Float To Int conversion: shifts the value by 0.5 away from zero, then truncates
+inline int ftoi(float value) {
+    const double offset = (value >= 0) ? 0.5 : -0.5;
+    return static_cast<int>(value + offset);
+}
+
+// Beginning of GPU Architecture definitions
+// Maps an SM version (major.minor) to the number of cores per SM.
+inline int _ConvertSMVer2Cores(int major, int minor) {
+    // Lookup table: SM version -> cores per SM.
+    struct SMCoreEntry {
+        int SM;  // 0xMm (hexadecimal notation), M = SM major version,
+                 // and m = SM minor version
+        int Cores;
+    };
+
+    const SMCoreEntry coreTable[] = {
+        {0x30, 192},
+        {0x32, 192},
+        {0x35, 192},
+        {0x37, 192},
+        {0x50, 128},
+        {0x52, 128},
+        {0x53, 128},
+        {0x60,  64},
+        {0x61, 128},
+        {0x62, 128},
+        {0x70,  64},
+        {0x72,  64},
+        {0x75,  64},
+        {0x80,  64},
+        {0x86, 128},
+        {-1, -1}};  // sentinel terminating the table
+
+    const int smVersion = (major << 4) + minor;
+
+    int pos = 0;
+    for (; coreTable[pos].SM != -1; ++pos) {
+        if (coreTable[pos].SM == smVersion) {
+            return coreTable[pos].Cores;
+        }
+    }
+
+    // Unknown SM version: pos now addresses the sentinel, so pos - 1 is the
+    // last real table entry; report it and use its core count as the default.
+    const int fallbackCores = coreTable[pos - 1].Cores;
+    printf(
+        "MapSMtoCores for SM %d.%d is undefined."
+        "  Default to use %d Cores/SM\n",
+        major, minor, fallbackCores);
+    return fallbackCores;
+}
+
+// Maps an SM version (major.minor) to the GPU architecture name.
+inline const char* _ConvertSMVer2ArchName(int major, int minor) {
+    // Lookup table: SM version -> architecture name.
+    struct SMArchEntry {
+        int SM;  // 0xMm (hexadecimal notation), M = SM major version,
+                 // and m = SM minor version
+        const char* name;
+    };
+
+    const SMArchEntry archTable[] = {
+        {0x30, "Kepler"},
+        {0x32, "Kepler"},
+        {0x35, "Kepler"},
+        {0x37, "Kepler"},
+        {0x50, "Maxwell"},
+        {0x52, "Maxwell"},
+        {0x53, "Maxwell"},
+        {0x60, "Pascal"},
+        {0x61, "Pascal"},
+        {0x62, "Pascal"},
+        {0x70, "Volta"},
+        {0x72, "Xavier"},
+        {0x75, "Turing"},
+        {0x80, "Ampere"},
+        {0x86, "Ampere"},
+        {-1, "Graphics Device"}};  // sentinel; its name is never returned
+
+    const int smVersion = (major << 4) + minor;
+
+    const SMArchEntry* entry = archTable;
+    for (; entry->SM != -1; ++entry) {
+        if (entry->SM == smVersion) {
+            return entry->name;
+        }
+    }
+
+    // Unknown SM version: entry now addresses the sentinel, so entry - 1 is
+    // the last real table entry; report it and use its name as the default.
+    const char* fallbackName = (entry - 1)->name;
+    printf(
+        "MapSMtoArchName for SM %d.%d is undefined."
+        "  Default to use %s\n",
+        major, minor, fallbackName);
+    return fallbackName;
+}
+// end of GPU Architecture definitions
+
+#ifdef __CUDA_RUNTIME_H__
+// General GPU Device CUDA Initialization.
+// Clamps a negative devID to 0, makes the device current and returns its id.
+// Returns a negative value if devID is out of range or the device is in
+// prohibited compute mode; exits if no CUDA-capable device is available.
+inline int gpuDeviceInit(int devID) {
+  int device_count;
+  checkCudaErrors(cudaGetDeviceCount(&device_count));
+
+  if (device_count == 0) {
+    fprintf(stderr,
+            "gpuDeviceInit() CUDA error: "
+            "no devices supporting CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  if (devID < 0) devID = 0;
+
+  if (devID > device_count - 1) {
+    fprintf(stderr, "\n");
+    fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n",
+            device_count);
+    fprintf(stderr,
+            ">> gpuDeviceInit (-device=%d) is not a valid"
+            " GPU device. <<\n",
+            devID);
+    fprintf(stderr, "\n");
+    return -devID;  // devID exceeds a non-negative bound here, so this is negative
+  }
+
+  int computeMode = -1, major = 0, minor = 0;
+  checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, devID));
+  checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID));
+  checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID));
+  if (computeMode == cudaComputeModeProhibited) {
+    fprintf(stderr,
+            "Error: device is running in <Compute Mode "
+            "Prohibited>, no threads can use cudaSetDevice().\n");
+    return -1;
+  }
+
+  if (major < 1) {
+    fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  checkCudaErrors(cudaSetDevice(devID));
+  printf("gpuDeviceInit() CUDA Device [%d]: \"%s\"\n", devID, _ConvertSMVer2ArchName(major, minor));
+  return devID;
+}
+
+// This function returns the best GPU (with maximum GFLOPS): the device that
+// is not in prohibited compute mode and has the largest product of
+// SM count, cores per SM and clock rate. Exits if no device can be used.
+inline int gpuGetMaxGflopsDeviceId() {
+  int device_count = 0;
+  checkCudaErrors(cudaGetDeviceCount(&device_count));
+
+  if (device_count == 0) {
+    fprintf(stderr,
+            "gpuGetMaxGflopsDeviceId() CUDA error:"
+            " no devices supporting CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  int best_device = 0;
+  uint64_t best_perf = 0;
+  int prohibited_count = 0;
+
+  // Find the best CUDA capable GPU device
+  for (int dev = 0; dev < device_count; ++dev) {
+    int computeMode = -1, major = 0, minor = 0;
+    checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, dev));
+    checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));
+    checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, dev));
+
+    // A GPU running in Compute Mode prohibited is only counted, not scored
+    if (computeMode == cudaComputeModeProhibited) {
+      prohibited_count++;
+      continue;
+    }
+
+    // A reported capability of 9999.9999 is scored with 1 core per SM
+    int cores_per_sm = 1;
+    if (!(major == 9999 && minor == 9999)) {
+      cores_per_sm = _ConvertSMVer2Cores(major, minor);
+    }
+
+    int multiProcessorCount = 0, clockRate = 0;
+    checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, dev));
+    const cudaError_t result = cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, dev);
+    if (result == cudaErrorInvalidValue) {
+      // If cudaDevAttrClockRate attribute is not supported we
+      // set clockRate as 1, to consider GPU with most SMs and CUDA Cores.
+      clockRate = 1;
+    } else if (result != cudaSuccess) {
+      fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \n", __FILE__, __LINE__,
+        static_cast<unsigned int>(result), _cudaGetErrorEnum(result));
+      exit(EXIT_FAILURE);
+    }
+
+    // Score = SM count * cores per SM * clock rate; the first factor is
+    // widened to 64 bit before multiplying.
+    const uint64_t perf = (uint64_t)multiProcessorCount * cores_per_sm * clockRate;
+
+    // Strictly greater: on a tie the device with the lower id is kept
+    if (perf > best_perf) {
+      best_perf = perf;
+      best_device = dev;
+    }
+  }
+
+  // Every device was skipped above, so there is nothing to return
+  if (prohibited_count == device_count) {
+    fprintf(stderr,
+            "gpuGetMaxGflopsDeviceId() CUDA error:"
+            " all devices have compute mode prohibited.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  // Id of the best-scoring device (stays 0 if no score exceeded 0)
+  return best_device;
+}
+
+// Initialization code to find the best CUDA Device.
+// Honors a "device" command-line flag if present; otherwise selects the
+// device reported by gpuGetMaxGflopsDeviceId(). Returns the chosen device id.
+inline int findCudaDevice(int argc, const char **argv) {
+  if (!checkCmdLineFlag(argc, argv, "device")) {
+    // No device requested: pick the device with highest Gflops/s
+    const int bestID = gpuGetMaxGflopsDeviceId();
+    checkCudaErrors(cudaSetDevice(bestID));
+    int major = 0, minor = 0;
+    checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, bestID));
+    checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, bestID));
+
+    printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
+           bestID, _ConvertSMVer2ArchName(major, minor), major, minor);
+
+    return bestID;
+  }
+
+  // The command-line has a device number specified: validate and use it
+  const int requestedID = getCmdLineArgumentInt(argc, argv, "device=");
+  if (requestedID < 0) {
+    printf("Invalid command line parameter\n ");
+    exit(EXIT_FAILURE);
+  }
+
+  const int devID = gpuDeviceInit(requestedID);
+  if (devID < 0) {
+    printf("exiting...\n");
+    exit(EXIT_FAILURE);
+  }
+
+  return devID;
+}
+
+// Selects the first integrated GPU that is not in prohibited compute mode,
+// makes it the current device and returns its id; exits if there is none.
+inline int findIntegratedGPU() {
+  int device_count = 0;
+  checkCudaErrors(cudaGetDeviceCount(&device_count));
+
+  if (device_count == 0) {
+    fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  int skipped = 0;  // devices that are not integrated or are prohibited
+  // Find the integrated GPU which is compute capable
+  for (int dev = 0; dev < device_count; ++dev) {
+    int computeMode = -1, integrated = -1;
+    checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, dev));
+    checkCudaErrors(cudaDeviceGetAttribute(&integrated, cudaDevAttrIntegrated, dev));
+
+    // Skip GPUs that are not integrated or run in Compute Mode prohibited
+    if (!integrated || (computeMode == cudaComputeModeProhibited)) {
+      skipped++;
+      continue;
+    }
+
+    // Integrated and not prohibited: cuda can map to GLES resource
+    checkCudaErrors(cudaSetDevice(dev));
+
+    int major = 0, minor = 0;
+    checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));
+    checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, dev));
+    printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
+           dev, _ConvertSMVer2ArchName(major, minor), major, minor);
+
+    return dev;
+  }
+
+  if (skipped == device_count) {
+    fprintf(stderr,
+            "CUDA error:"
+            " No GLES-CUDA Interop capable GPU found.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  return -1;
+}
+
+// General check for CUDA GPU SM Capabilities: reports whether the current
+// device's compute capability is at least major_version.minor_version.
+inline bool checkCudaCapabilities(int major_version, int minor_version) {
+  int dev;
+  int major = 0, minor = 0;
+  checkCudaErrors(cudaGetDevice(&dev));
+  checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));
+  checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, dev));
+
+  const bool newerMajor = major > major_version;
+  const bool sameMajorEnoughMinor = (major == major_version) && (minor >= minor_version);
+  if (!(newerMajor || sameMajorEnoughMinor)) {
+    printf(
+        "  No GPU device was found that can support "
+        "CUDA compute capability %d.%d.\n",
+        major_version, minor_version);
+    return false;
+  }
+
+  printf("  Device %d: <%16s >, Compute SM %d.%d detected\n", dev,
+         _ConvertSMVer2ArchName(major, minor), major, minor);
+  return true;
+}
+#endif
+
+// end of CUDA Helper Functions
+
+#endif  // COMMON_HELPER_CUDA_H_
\ No newline at end of file
diff --git a/3rdParty/cuda_samples/helper_functions.h b/3rdParty/cuda_samples/helper_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..3fc2ea47ba7d39a4bf6a882f65b16779de6de0ac
--- /dev/null
+++ b/3rdParty/cuda_samples/helper_functions.h
@@ -0,0 +1,59 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// These are helper functions for the SDK samples (string parsing,
+// timers, image helpers, etc)
+#ifndef COMMON_HELPER_FUNCTIONS_H_
+#define COMMON_HELPER_FUNCTIONS_H_
+
+#ifdef WIN32
+#pragma warning(disable : 4996)
+#endif
+
+// includes, project
+#include <assert.h>
+#include <exception.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+// includes, timer, string parsing, image helpers
+#include <helper_image.h>  // helper functions for image compare, dump, data comparisons
+#include <helper_string.h>  // helper functions for string parsing
+#include <helper_timer.h>   // helper functions for timers
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+#endif  // COMMON_HELPER_FUNCTIONS_H_
\ No newline at end of file
diff --git a/3rdParty/cuda_samples/helper_image.h b/3rdParty/cuda_samples/helper_image.h
new file mode 100644
index 0000000000000000000000000000000000000000..eb7190c21b39463222beb579337d14c1a54b0000
--- /dev/null
+++ b/3rdParty/cuda_samples/helper_image.h
@@ -0,0 +1,1009 @@
+//
+// Created by Soeren Peters on 05.02.21.
+//
+
+#ifndef VIRTUALFLUIDS_HELPER_IMAGE_H
+#define VIRTUALFLUIDS_HELPER_IMAGE_H
+
+#endif // VIRTUALFLUIDS_HELPER_IMAGE_H
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// These are helper functions for the SDK samples (image,bitmap)
+#ifndef COMMON_HELPER_IMAGE_H_
+#define COMMON_HELPER_IMAGE_H_
+
+#include <assert.h>
+#include <exception.h>
+#include <math.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#ifndef MIN  // smaller of two values; each argument is evaluated twice
+#define MIN(a, b) (((a) < (b)) ? (a) : (b))
+#endif
+#ifndef MAX  // larger of two values; each argument is evaluated twice
+#define MAX(a, b) (((a) > (b)) ? (a) : (b))
+#endif
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+#include <helper_string.h>
+
+// helpers internal to this header (a named namespace, not an anonymous one)
+namespace helper_image_internal {
+//! buffer size used when reading one line of a PGM / PPM file header
+const unsigned int PGMHeaderSize = 0x40;
+
+// types
+
+//! Data converter from unsigned char / unsigned byte to type T
+template <class T>
+struct ConverterFromUByte;
+
+//! Data converter from unsigned char / unsigned byte to unsigned char
+template <>
+struct ConverterFromUByte<unsigned char> {
+    //! Conversion operator (a passthrough: the byte is returned unchanged)
+    //! @return converted value
+    //! @param  val  value to convert
+    unsigned char operator()(const unsigned char &val) {
+        return val;
+    }
+};
+
+//! Data converter from unsigned char / unsigned byte to float
+template <>
+struct ConverterFromUByte<float> {
+    //! Conversion operator (divides by 255, so 255 maps to 1.0f)
+    //! @return converted value
+    //! @param  val  value to convert
+    float operator()(const unsigned char &val) {
+        return static_cast<float>(val) / 255.0f;
+    }
+};
+
+//! Data converter from type T to unsigned char / unsigned byte
+template <class T>
+struct ConverterToUByte;
+
+//! Data converter from unsigned char to unsigned char / unsigned byte
+template <>
+struct ConverterToUByte<unsigned char> {
+    //! Conversion operator (essentially a passthrough)
+    //! @return converted value
+    //! @param  val  value to convert
+    unsigned char operator()(const unsigned char &val) { return val; }
+};
+
+//! Data converter from float to unsigned char / unsigned byte
+template <>
+struct ConverterToUByte<float> {
+    //! Conversion operator (multiplies by 255 and casts; no clamping is done)
+    //! @return converted value
+    //! @param  val  value to convert
+    unsigned char operator()(const float &val) {
+        return static_cast<unsigned char>(val * 255.0f);
+    }
+};
+}  // namespace helper_image_internal
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+#ifndef FOPEN  // Windows: open through fopen_s, which fills fHandle
+#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode)
+#endif
+#ifndef FOPEN_FAIL  // a non-zero FOPEN result signals failure
+#define FOPEN_FAIL(result) (result != 0)
+#endif
+#ifndef SSCANF  // Windows: sscanf_s
+#define SSCANF sscanf_s
+#endif
+#else  // every other platform
+#ifndef FOPEN  // assigns the handle; the expression value is the handle itself
+#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode))
+#endif
+#ifndef FOPEN_FAIL  // a NULL handle signals failure
+#define FOPEN_FAIL(result) (result == NULL)
+#endif
+#ifndef SSCANF  // plain sscanf
+#define SSCANF sscanf
+#endif
+#endif  // Windows / other platforms
+
+// Loads a binary PGM ("P5", 1 channel) or PPM ("P6", 3 channels) image.
+// If *data is NULL a buffer of width*height*channels bytes is malloc'ed and
+// *w / *h are set; otherwise the caller's buffer is filled and *w / *h must
+// match the image. *channels receives 1, 3, or 0 for an unknown format.
+// Returns false on any open / parse / size / allocation / read failure.
+inline bool __loadPPM(const char *file, unsigned char **data, unsigned int *w,
+                      unsigned int *h, unsigned int *channels) {
+    FILE *fp = NULL;
+    if (FOPEN_FAIL(FOPEN(fp, file, "rb"))) {
+        std::cerr << "__LoadPPM() : Failed to open file: " << file << std::endl;
+        return false;
+    }
+
+    char header[helper_image_internal::PGMHeaderSize];  // one header line
+    if (fgets(header, helper_image_internal::PGMHeaderSize, fp) == NULL) {
+        std::cerr << "__LoadPPM() : reading PGM header returned NULL" << std::endl;
+        fclose(fp);
+        return false;
+    }
+
+    if (strncmp(header, "P5", 2) == 0) {
+        *channels = 1;
+    } else if (strncmp(header, "P6", 2) == 0) {
+        *channels = 3;
+    } else {
+        std::cerr << "__LoadPPM() : File is not a PPM or PGM image" << std::endl;
+        *channels = 0;
+        fclose(fp);
+        return false;
+    }
+
+    // parse header: width, height and maxval may be spread over several lines
+    unsigned int width = 0, height = 0, maxval = 0;
+    unsigned int i = 0;  // number of header values parsed so far
+    while (i < 3) {
+        if (fgets(header, helper_image_internal::PGMHeaderSize, fp) == NULL) {
+            std::cerr << "__LoadPPM() : reading PGM header returned NULL"
+                      << std::endl;
+            fclose(fp);
+            return false;
+        }
+        if (header[0] == '#') continue;  // comment line
+        int parsed = 0;
+        if (i == 0) {
+            parsed = SSCANF(header, "%u %u %u", &width, &height, &maxval);
+        } else if (i == 1) {
+            parsed = SSCANF(header, "%u %u", &height, &maxval);
+        } else {
+            parsed = SSCANF(header, "%u", &maxval);
+        }
+        if (parsed > 0) i += parsed;  // EOF (-1) must not wrap the unsigned counter
+    }
+
+    const size_t count = static_cast<size_t>(width) * height * *channels;
+    const bool ownsBuffer = (NULL == *data);
+    if (!ownsBuffer && (*w != width || *h != height)) {  // would overrun the caller's buffer
+        std::cerr << "__LoadPPM() : Invalid image dimensions." << std::endl;
+        fclose(fp);
+        return false;
+    }
+    if (ownsBuffer) {
+        *data = (unsigned char *)malloc(count > 0 ? count : 1);
+        if (NULL == *data) { fclose(fp); return false; }
+        *w = width; *h = height;
+    }
+
+    const size_t got = fread(*data, sizeof(unsigned char), count, fp);
+    fclose(fp);
+    if (got != count) {
+        std::cerr << "__LoadPPM() read data returned error." << std::endl;
+        if (ownsBuffer) { free(*data); *data = NULL; }  // do not hand out a partial image
+        return false;
+    }
+    return true;
+}
+
+// Loads an image through __loadPPM() and converts every byte to type T.
+// If *data is NULL the destination buffer is malloc'ed here; otherwise the
+// caller's buffer is filled. Returns false if __loadPPM() fails.
+template <class T>
+inline bool sdkLoadPGM(const char *file, T **data, unsigned int *w,
+                       unsigned int *h) {
+    unsigned char *rawBytes = NULL;
+    unsigned int channels;
+
+    const bool loaded = __loadPPM(file, &rawBytes, w, h, &channels);
+    if (!loaded) {
+        return false;
+    }
+
+    const unsigned int size = *w * *h * channels;
+    // allocate the destination only when the caller did not supply one
+    if (*data == NULL) {
+        *data = reinterpret_cast<T *>(malloc(sizeof(T) * size));
+    }
+
+    // copy and cast data
+    helper_image_internal::ConverterFromUByte<T> convert;
+    std::transform(rawBytes, rawBytes + size, *data, convert);
+    free(rawBytes);
+    return true;
+}
+
+// Loads a 3-channel PPM into a newly malloc'ed buffer of 4 bytes per pixel
+// (4th byte = 0) that the caller frees. Only compiles for T = unsigned char.
+// Returns false if loading fails, the image is not 3-channel, or malloc fails.
+template <class T>
+inline bool sdkLoadPPM4(const char *file, T **data, unsigned int *w,
+                        unsigned int *h) {
+    unsigned char *idata = 0;
+    unsigned int channels;
+    // a 1-channel image would be read 3 bytes per pixel, past the end of idata
+    if (!__loadPPM(file, &idata, w, h, &channels) || channels != 3) {
+        free(idata);
+        return false;
+    }
+    const size_t size = static_cast<size_t>(*w) * *h;
+    *data = reinterpret_cast<T *>(malloc(sizeof(T) * size * 4));
+    if (*data == NULL && size > 0) { free(idata); return false; }
+    // pad 4th component
+    const unsigned char *src = idata;
+    unsigned char *ptr = *data;
+    for (size_t i = 0; i < size; i++) {
+        *ptr++ = *src++;
+        *ptr++ = *src++;
+        *ptr++ = *src++;
+        *ptr++ = 0;
+    }
+    free(idata);
+    return true;
+}
+
+// Writes `data` as a binary PGM (channels == 1, "P5") or PPM (channels == 3,
+// "P6") file with maxval 255. Returns false if the file cannot be opened,
+// the channel count is unsupported, or writing fails.
+inline bool __savePPM(const char *file, unsigned char *data, unsigned int w,
+                      unsigned int h, unsigned int channels) {
+    assert(NULL != data);
+    assert(w > 0);
+    assert(h > 0);
+
+    std::fstream fh(file, std::fstream::out | std::fstream::binary);
+    // a failed open sets failbit, not badbit, so bad() would not notice it
+    if (!fh.is_open()) {
+        std::cerr << "__savePPM() : Opening file failed." << std::endl;
+        return false;
+    }
+
+    if (channels == 1) {
+        fh << "P5\n";
+    } else if (channels == 3) {
+        fh << "P6\n";
+    } else {
+        std::cerr << "__savePPM() : Invalid number of channels." << std::endl;
+        return false;
+    }
+
+    fh << w << "\n" << h << "\n" << 0xff << std::endl;
+    // write the pixel block in one call instead of byte by byte
+    const std::streamsize count =
+        static_cast<std::streamsize>(w) * h * channels;
+    fh.write(reinterpret_cast<const char *>(data), count);
+    fh.flush();
+
+    if (fh.fail()) {
+        std::cerr << "__savePPM() : Writing data failed." << std::endl;
+        return false;
+    }
+    fh.close();
+    return true;
+}
+
+// Converts w*h elements of type T to bytes and writes them as a 1-channel
+// image through __savePPM(), whose result is returned.
+template <class T>
+inline bool sdkSavePGM(const char *file, T *data, unsigned int w,
+                       unsigned int h) {
+    const unsigned int pixelCount = w * h;
+    unsigned char *bytes = static_cast<unsigned char *>(
+        malloc(sizeof(unsigned char) * pixelCount));
+
+    // convert every element to an unsigned byte
+    helper_image_internal::ConverterToUByte<T> toByte;
+    std::transform(data, data + pixelCount, bytes, toByte);
+    // write file, then release the temporary buffer
+    const bool saved = __savePPM(file, bytes, w, h, 1);
+    free(bytes);
+    return saved;
+}
+
+// Copies the first 3 of every 4 bytes of `data` (w*h pixels) into a temporary
+// buffer and writes it as a 3-channel image; returns __savePPM()'s result.
+inline bool sdkSavePPM4ub(const char *file, unsigned char *data, unsigned int w,
+                          unsigned int h) {
+    const int pixelCount = w * h;
+    unsigned char *rgb = static_cast<unsigned char *>(
+        malloc(sizeof(unsigned char) * pixelCount * 3));
+    // strip 4th component
+    const unsigned char *src = data;
+    unsigned char *dst = rgb;
+    for (int px = 0; px < pixelCount; px++, src += 4, dst += 3) {
+        dst[0] = src[0];
+        dst[1] = src[1];
+        dst[2] = src[2];
+    }
+    const bool saved = __savePPM(file, rgb, w, h, 3);
+    free(rgb);
+    return saved;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Read whitespace-separated values from the text file \filename
+//! @return true if reading the file succeeded, otherwise false
+//! @param filename name of the source file
+//! @param data  if *data is NULL it is set to a malloc'ed array holding the
+//!        values read; otherwise the values are copied into *data
+//! @param len  set to the number of values read when *data was NULL; must
+//!        equal that number when *data is a caller-provided buffer
+//! @param verbose  currently unused
+//////////////////////////////////////////////////////////////////////////////
+template <class T>
+inline bool sdkReadFile(const char *filename, T **data, unsigned int *len,
+                        bool verbose) {
+    // check input arguments
+    assert(NULL != filename);
+    assert(NULL != len);
+
+    // open file for reading
+    FILE *fh = NULL;
+    if (FOPEN_FAIL(FOPEN(fh, filename, "r"))) {
+        printf("Unable to open input file: %s\n", filename);
+        return false;
+    }
+
+    // intermediate storage for the data read
+    std::vector<T> data_read;
+
+    // Read until the first failed conversion. Testing the fscanf() result
+    // (rather than feof()) keeps a final value that is not followed by
+    // whitespace and terminates on malformed or empty input.
+    // NOTE: the "%f" conversion matches T = float only.
+    T token;
+    while (fscanf(fh, "%f", &token) == 1) {
+        data_read.push_back(token);
+    }
+    fclose(fh);
+
+    // check if the given handle is already initialized
+    if (NULL != *data) {
+        if (*len != data_read.size()) {
+            std::cerr << "sdkReadFile() : Initialized memory given but "
+                      << "size  mismatch with signal read "
+                      << "(data read / data init = " << (unsigned int)data_read.size()
+                      << " / " << *len << ")" << std::endl;
+
+            return false;
+        }
+    } else {
+        // allocate storage for the data read
+        *data = reinterpret_cast<T *>(malloc(sizeof(T) * data_read.size()));
+        // store signal size
+        *len = static_cast<unsigned int>(data_read.size());
+    }
+
+    // copy data (front() must not be called on an empty vector)
+    if (!data_read.empty()) {
+        memcpy(*data, &data_read.front(), sizeof(T) * data_read.size());
+    }
+
+    return true;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Read block number \a block_num of the binary file \filename
+//! @return true on success, false if the file cannot be opened or malloc fails
+//! @param filename name of the source file
+//! @param data  data[block_num] receives a malloc'ed buffer of block_size bytes
+//! @param len  set to the number of elements of type T actually read
+//! @param block_num  index of the block; it starts at byte block_num * block_size
+//! @param block_size  size of one block in bytes
+//! @param verbose  if true, an open failure is reported on std::cerr
+//////////////////////////////////////////////////////////////////////////////
+template <class T>
+inline bool sdkReadFileBlocks(const char *filename, T **data, unsigned int *len,
+                              unsigned int block_num, unsigned int block_size,
+                              bool verbose) {
+    // check input arguments
+    assert(NULL != filename);
+    assert(NULL != len);
+
+    // open file for reading
+    FILE *fh = fopen(filename, "rb");
+    // fail on a NULL handle regardless of verbose; verbose only gates the message
+    if (fh == NULL) {
+        if (verbose) {
+            std::cerr << "sdkReadFile() : Opening file failed." << std::endl;
+        }
+        return false;
+    }
+    // allocate storage for the data read
+    data[block_num] = reinterpret_cast<T *>(malloc(block_size));
+    if (data[block_num] == NULL && block_size > 0) { fclose(fh); return false; }
+    // read all data elements
+    fseek(fh, block_num * block_size, SEEK_SET);
+    *len = fread(data[block_num], sizeof(T), block_size / sizeof(T), fh);
+    fclose(fh);
+    return true;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename: a "# <epsilon>" line followed by the values
+//! @return true if writing the file succeeded, otherwise false
+//! @param filename name of the destination file
+//! @param data  data to write
+//! @param len  number of data elements in data
+//! @param epsilon  epsilon recorded in the first line of the file
+//! @param verbose  if true, progress and errors are reported on std::cerr
+//! @param append  if true, add to the end of the file instead of truncating
+//////////////////////////////////////////////////////////////////////////////
+template <class T, class S>
+inline bool sdkWriteFile(const char *filename, const T *data, unsigned int len,
+                         const S epsilon, bool verbose, bool append = false) {
+    assert(NULL != filename);
+    assert(NULL != data);
+
+    // open file for writing
+    // app preserves existing content and writes at the end; plain out
+    // truncates. (out | ate is not an append mode: without in or app the
+    // file is still truncated on open.)
+    const std::ios_base::openmode mode =
+        append ? (std::fstream::out | std::fstream::app)
+               : std::fstream::out;
+    std::fstream fh(filename, mode);
+
+    // verbose: announce the open attempt
+    if (verbose) {
+        std::cerr << "sdkWriteFile() : Open file " << filename
+                  << " for write/append." << std::endl;
+    }
+
+    // check if filestream is valid
+    if (!fh.good()) {
+        if (verbose) {
+            std::cerr << "sdkWriteFile() : Opening file failed." << std::endl;
+        }
+
+        return false;
+    }
+
+    // first write epsilon
+    fh << "# " << epsilon << "\n";
+
+    // write data, each value followed by a single space;
+    // stops at the first stream error
+    for (unsigned int i = 0; (i < len) && (fh.good()); ++i) {
+        fh << data[i] << ' ';
+    }
+
+    // Check if writing succeeded
+    if (!fh.good()) {
+        if (verbose) {
+            std::cerr << "sdkWriteFile() : Writing file failed." << std::endl;
+        }
+
+        return false;
+    }
+
+    // file ends with nl
+    fh << std::endl;
+
+    return true;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Compare two arrays of arbitrary type
+//! Elements are converted to float; a pair matches when the difference
+//! reference[i] - data[i] lies within [-epsilon, epsilon].
+//! @return  if threshold == 0: true if every element matches, otherwise
+//!          false; if threshold != 0: true if the number of mismatching
+//!          elements is strictly below len * threshold
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+//! @param threshold  tolerated fraction of mismatching elements
+//////////////////////////////////////////////////////////////////////////////
+template <class T, class S>
+inline bool compareData(const T *reference, const T *data,
+                        const unsigned int len, const S epsilon,
+                        const float threshold) {
+    assert(epsilon >= 0);
+
+    unsigned int error_count = 0;
+
+    for (unsigned int i = 0; i < len; ++i) {
+        const float diff =
+            static_cast<float>(reference[i]) - static_cast<float>(data[i]);
+        const bool comp = (diff <= epsilon) && (diff >= -epsilon);
+
+        if (!comp) {
+            ++error_count;
+        }
+    }
+
+    if (threshold == 0.0f) {
+        // strict mode: a single mismatch fails the comparison
+        return error_count == 0;
+    }
+
+    if (error_count) {
+        // %u matches the unsigned counter (the former %d did not)
+        printf("%4.2f(%%) of bytes mismatched (count=%u)\n",
+               static_cast<float>(error_count) * 100 / static_cast<float>(len),
+               error_count);
+    }
+
+    return len * threshold > error_count;
+}
+
+#ifndef __MIN_EPSILON_ERROR
+#define __MIN_EPSILON_ERROR 1e-3f
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+//! Compare two arrays of arbitrary type
+//! Elements are converted to float; a pair matches when the absolute
+//! difference is strictly below MAX(epsilon, __MIN_EPSILON_ERROR).
+//! @return  if threshold == 0: true if every element matches, otherwise
+//!          false; if threshold != 0: true if the number of mismatching
+//!          elements is strictly below len * threshold
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+//! @param threshold  threshold % of (# of bytes) for pass/fail
+//////////////////////////////////////////////////////////////////////////////
+template <class T, class S>
+inline bool compareDataAsFloatThreshold(const T *reference, const T *data,
+                                        const unsigned int len, const S epsilon,
+                                        const float threshold) {
+    assert(epsilon >= 0);
+
+    // If we set epsilon to be 0, let's set a minimum threshold
+    float max_error = MAX((float)epsilon, __MIN_EPSILON_ERROR);
+    int error_count = 0;
+
+    for (unsigned int i = 0; i < len; ++i) {
+        float diff =
+            fabs(static_cast<float>(reference[i]) - static_cast<float>(data[i]));
+
+        // written as a negated "<" so that a NaN difference counts as an error
+        if (!(diff < max_error)) {
+            error_count++;
+        }
+    }
+
+    if (threshold == 0.0f) {
+        if (error_count) {
+            printf("total # of errors = %d\n", error_count);
+        }
+
+        return error_count == 0;
+    }
+
+    if (error_count) {
+        printf("%4.2f(%%) of bytes mismatched (count=%d)\n",
+               static_cast<float>(error_count) * 100 / static_cast<float>(len),
+               error_count);
+    }
+
+    return len * threshold > error_count;
+}
+
+// Write `bytes` bytes starting at `data` to the binary file `filename`,
+// replacing any previous content ("wb" mode). Progress and failures are
+// reported on stdout; there is no return value.
+inline void sdkDumpBin(void *data, unsigned int bytes, const char *filename) {
+    printf("sdkDumpBin: <%s>\n", filename);
+    FILE *fp = NULL;
+
+    // fwrite/fclose on a NULL stream is undefined behaviour, so bail out early
+    if (FOPEN_FAIL(FOPEN(fp, filename, "wb"))) {
+        printf("sdkDumpBin: unable to open <%s> for writing\n", filename);
+        return;
+    }
+
+    // a zero-byte dump legitimately writes zero items, so only check otherwise
+    if (bytes != 0 && fwrite(data, bytes, 1, fp) != 1) {
+        printf("sdkDumpBin: writing <%s> failed\n", filename);
+    }
+
+    fflush(fp);
+    fclose(fp);
+}
+
+// Compare the first `nelements` unsigned ints stored in the binary file
+// `src_file` with those stored in the reference file `ref_file`.
+// The reference file is located with sdkFindFilePath(ref_file, exec_path);
+// the two buffers are compared with compareData<unsigned int, float> using
+// `epsilon` and `threshold`. Progress and failures are printed to stdout.
+// Returns true only if both files could be opened and the comparison passed.
+inline bool sdkCompareBin2BinUint(const char *src_file, const char *ref_file,
+                                  unsigned int nelements, const float epsilon,
+                                  const float threshold, char *exec_path) {
+    unsigned int *src_buffer = NULL, *ref_buffer = NULL;
+    FILE *src_fp = NULL, *ref_fp = NULL;
+
+    uint64_t error_count = 0;
+    size_t src_count = 0, ref_count = 0;
+
+    if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb"))) {
+        printf("compareBin2Bin <unsigned int> unable to open src_file: %s\n",
+               src_file);
+        error_count++;
+    }
+
+    // sdkFindFilePath hands back a malloc'ed string (or NULL); freed below
+    char *ref_file_path = sdkFindFilePath(ref_file, exec_path);
+
+    if (ref_file_path == NULL) {
+        printf("compareBin2Bin <unsigned int>  unable to find <%s> in <%s>\n",
+               ref_file, exec_path);
+        printf(">>> Check info.xml and [project//data] folder <%s> <<<\n",
+               ref_file);
+        printf("Aborting comparison!\n");
+        printf("  FAILED\n");
+        error_count++;
+    } else {
+        if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb"))) {
+            printf(
+                "compareBin2Bin <unsigned int>"
+                " unable to open ref_file: %s\n",
+                ref_file_path);
+            error_count++;
+        }
+
+        if (src_fp && ref_fp) {
+            // calloc: elements a short file does not supply compare as zero
+            // instead of as uninitialised memory
+            src_buffer = (unsigned int *)calloc(nelements, sizeof(unsigned int));
+            ref_buffer = (unsigned int *)calloc(nelements, sizeof(unsigned int));
+
+            if (nelements != 0 && (src_buffer == NULL || ref_buffer == NULL)) {
+                printf("compareBin2Bin <unsigned int> unable to allocate buffers\n");
+                error_count++;
+            } else {
+                // fread(ptr, element size, element count, stream): with the
+                // arguments in this order the return value is the number of
+                // elements read, as in sdkCompareBin2BinFloat
+                src_count =
+                    fread(src_buffer, sizeof(unsigned int), nelements, src_fp);
+                ref_count =
+                    fread(ref_buffer, sizeof(unsigned int), nelements, ref_fp);
+
+                printf(
+                    "> compareBin2Bin <unsigned int> nelements=%d,"
+                    " epsilon=%4.2f, threshold=%4.2f\n",
+                    nelements, epsilon, threshold);
+                printf("   src_file <%s>, size=%d bytes\n", src_file,
+                       static_cast<int>(src_count * sizeof(unsigned int)));
+                printf("   ref_file <%s>, size=%d bytes\n", ref_file_path,
+                       static_cast<int>(ref_count * sizeof(unsigned int)));
+
+                if (!compareData<unsigned int, float>(ref_buffer, src_buffer,
+                                                      nelements, epsilon,
+                                                      threshold)) {
+                    error_count++;
+                }
+            }
+
+            free(src_buffer);
+            free(ref_buffer);
+        }
+    }
+
+    // common cleanup for every path above
+    if (src_fp) {
+        fclose(src_fp);
+    }
+
+    if (ref_fp) {
+        fclose(ref_fp);
+    }
+
+    free(ref_file_path);
+
+    if (error_count == 0) {
+        printf("  OK\n");
+    } else {
+        printf("  FAILURE: %d errors...\n", (unsigned int)error_count);
+    }
+
+    return (error_count == 0);  // returns true if all pixels pass
+}
+
+// Compare the first `nelements` floats stored in the binary file `src_file`
+// with those stored in the reference file `ref_file`.
+// The reference file is located with sdkFindFilePath(ref_file, exec_path);
+// the two buffers are compared with compareDataAsFloatThreshold<float, float>
+// using `epsilon` and `threshold`. Progress and failures are printed to
+// stdout. Returns true only if both files could be opened and the comparison
+// passed.
+inline bool sdkCompareBin2BinFloat(const char *src_file, const char *ref_file,
+                                   unsigned int nelements, const float epsilon,
+                                   const float threshold, char *exec_path) {
+    float *src_buffer = NULL, *ref_buffer = NULL;
+    FILE *src_fp = NULL, *ref_fp = NULL;
+    size_t fsize = 0;
+
+    uint64_t error_count = 0;
+
+    if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb"))) {
+        printf("compareBin2Bin <float> unable to open src_file: %s\n", src_file);
+        error_count = 1;
+    }
+
+    // sdkFindFilePath hands back a malloc'ed string (or NULL); freed below
+    char *ref_file_path = sdkFindFilePath(ref_file, exec_path);
+
+    if (ref_file_path == NULL) {
+        printf("compareBin2Bin <float> unable to find <%s> in <%s>\n", ref_file,
+               exec_path);
+        printf(">>> Check info.xml and [project//data] folder <%s> <<<\n",
+               exec_path);
+        printf("Aborting comparison!\n");
+        printf("  FAILED\n");
+        error_count++;
+    } else {
+        if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb"))) {
+            printf("compareBin2Bin <float> unable to open ref_file: %s\n",
+                   ref_file_path);
+            error_count = 1;
+        }
+
+        if (src_fp && ref_fp) {
+            // calloc: elements a short file does not supply compare as zero
+            // instead of as uninitialised memory
+            src_buffer = reinterpret_cast<float *>(calloc(nelements, sizeof(float)));
+            ref_buffer = reinterpret_cast<float *>(calloc(nelements, sizeof(float)));
+
+            if (nelements != 0 && (src_buffer == NULL || ref_buffer == NULL)) {
+                printf("compareBin2Bin <float> unable to allocate buffers\n");
+                error_count++;
+            } else {
+                printf(
+                    "> compareBin2Bin <float> nelements=%d, epsilon=%4.2f,"
+                    " threshold=%4.2f\n",
+                    nelements, epsilon, threshold);
+                fsize = fread(src_buffer, sizeof(float), nelements, src_fp);
+                printf("   src_file <%s>, size=%d bytes\n", src_file,
+                       static_cast<int>(fsize * sizeof(float)));
+                fsize = fread(ref_buffer, sizeof(float), nelements, ref_fp);
+                printf("   ref_file <%s>, size=%d bytes\n", ref_file_path,
+                       static_cast<int>(fsize * sizeof(float)));
+
+                if (!compareDataAsFloatThreshold<float, float>(
+                        ref_buffer, src_buffer, nelements, epsilon, threshold)) {
+                    error_count++;
+                }
+            }
+
+            free(src_buffer);
+            free(ref_buffer);
+        }
+    }
+
+    // common cleanup for every path above
+    if (src_fp) {
+        fclose(src_fp);
+    }
+
+    if (ref_fp) {
+        fclose(ref_fp);
+    }
+
+    free(ref_file_path);
+
+    if (error_count == 0) {
+        printf("  OK\n");
+    } else {
+        printf("  FAILURE: %d errors...\n", (unsigned int)error_count);
+    }
+
+    return (error_count == 0);  // returns true if all pixels pass
+}
+
+// Relative L2-norm comparison of two float arrays of `len` elements.
+// Computes ||reference - data|| / ||reference|| and returns true when that
+// ratio is strictly smaller than `epsilon`. Returns false when the sum of
+// squared reference values is below 1e-7, because the ratio would then be
+// meaningless.
+inline bool sdkCompareL2fe(const float *reference, const float *data,
+                           const unsigned int len, const float epsilon) {
+    assert(epsilon >= 0);
+
+    float sumSqDiff = 0;
+    float sumSqRef = 0;
+
+    for (unsigned int idx = 0; idx < len; ++idx) {
+        const float delta = reference[idx] - data[idx];
+        sumSqDiff += delta * delta;
+        sumSqRef += reference[idx] * reference[idx];
+    }
+
+    // guard against dividing by a (near) zero reference norm
+    if (fabs(sumSqRef) < 1e-7) {
+#ifdef _DEBUG
+        std::cerr << "ERROR, reference l2-norm is 0\n";
+#endif
+        return false;
+    }
+
+    const float relativeError = sqrtf(sumSqDiff) / sqrtf(sumSqRef);
+    const bool withinTolerance = relativeError < epsilon;
+
+#ifdef _DEBUG
+    if (!withinTolerance) {
+        std::cerr << "ERROR, l2-norm error " << relativeError
+                  << " is greater than epsilon " << epsilon << "\n";
+    }
+#endif
+
+    return withinTolerance;
+}
+
+// Load an image as unsigned bytes by forwarding to __loadPPM; the channel
+// count that __loadPPM reports is discarded. Returns __loadPPM's result.
+// NOTE(review): presumably *data receives a buffer the caller must free() -
+// confirm against __loadPPM.
+inline bool sdkLoadPPMub(const char *file, unsigned char **data,
+                         unsigned int *w, unsigned int *h) {
+    unsigned int ignoredChannels;
+    const bool loaded = __loadPPM(file, data, w, h, &ignoredChannels);
+    return loaded;
+}
+
+// Load an image through __loadPPM and expand it to 4 bytes per pixel: three
+// loaded bytes are copied per pixel and the 4th byte is set to 0.
+// On success returns true and *data points to a malloc'ed buffer of
+// (*w) * (*h) * 4 bytes that the caller must free(). On failure returns
+// false and leaves *data untouched.
+// NOTE(review): three source bytes are consumed per pixel while the channel
+// count reported by __loadPPM is not checked - confirm that __loadPPM only
+// succeeds here for 3-channel images.
+inline bool sdkLoadPPM4ub(const char *file, unsigned char **data,
+                          unsigned int *w, unsigned int *h) {
+    unsigned char *idata = 0;
+    unsigned int channels;
+
+    if (!__loadPPM(file, &idata, w, h, &channels)) {
+        free(idata);
+        return false;
+    }
+
+    // size_t keeps the pixel count from overflowing a signed int
+    const size_t pixels = static_cast<size_t>(*w) * static_cast<size_t>(*h);
+    unsigned char *padded =
+        (unsigned char *)malloc(sizeof(unsigned char) * pixels * 4);
+
+    // malloc(0) may legitimately return NULL, so only fail for real requests
+    if (padded == NULL && pixels != 0) {
+        free(idata);
+        return false;
+    }
+
+    // pad 4th component
+    const unsigned char *src = idata;
+    unsigned char *dst = padded;
+
+    for (size_t i = 0; i < pixels; i++) {
+        *dst++ = *src++;
+        *dst++ = *src++;
+        *dst++ = *src++;
+        *dst++ = 0;
+    }
+
+    free(idata);
+    *data = padded;
+    return true;
+}
+
+// Compare the image file `src_file` against the reference image `ref_file`.
+// Both files are loaded with sdkLoadPPM4ub (4 bytes per pixel) and compared
+// element-wise with compareData using `epsilon` and `threshold`.
+// Returns false if a file name is NULL, a file cannot be loaded, the image
+// sizes differ, or the comparison fails; true otherwise. Diagnostics go to
+// stderr when `verboseErrors` is set.
+inline bool sdkComparePPM(const char *src_file, const char *ref_file,
+                          const float epsilon, const float threshold,
+                          bool verboseErrors) {
+    unsigned char *src_data = NULL, *ref_data = NULL;
+    uint64_t error_count = 0;
+    unsigned int ref_width, ref_height;
+    unsigned int src_width, src_height;
+
+    if (src_file == NULL || ref_file == NULL) {
+        if (verboseErrors) {
+            std::cerr << "PPMvsPPM: src_file or ref_file is NULL."
+                         "  Aborting comparison\n";
+        }
+
+        return false;
+    }
+
+    if (verboseErrors) {
+        std::cerr << "> Compare (a)rendered:  <" << src_file << ">\n";
+        std::cerr << ">         (b)reference: <" << ref_file << ">\n";
+    }
+
+    if (sdkLoadPPM4ub(ref_file, &ref_data, &ref_width, &ref_height) != true) {
+        if (verboseErrors) {
+            std::cerr << "PPMvsPPM: unable to load ref image file: " << ref_file
+                      << "\n";
+        }
+
+        return false;
+    }
+
+    if (sdkLoadPPM4ub(src_file, &src_data, &src_width, &src_height) != true) {
+        // reported regardless of verboseErrors (existing behaviour)
+        std::cerr << "PPMvsPPM: unable to load src image file: " << src_file
+                  << "\n";
+        free(ref_data);  // the reference image was already loaded
+        return false;
+    }
+
+    if (src_height != ref_height || src_width != ref_width) {
+        if (verboseErrors) {
+            std::cerr << "PPMvsPPM: source and ref size mismatch (" << src_width
+                      << "," << src_height << ")vs(" << ref_width << "," << ref_height
+                      << ")\n";
+        }
+
+        // Comparing src-sized data against a differently sized reference
+        // buffer would read out of bounds; images of different size differ.
+        free(src_data);
+        free(ref_data);
+        return false;
+    }
+
+    if (verboseErrors) {
+        std::cerr << "PPMvsPPM: comparing images size (" << src_width << ","
+                  << src_height << ") epsilon(" << epsilon << "), threshold("
+                  << threshold * 100 << "%)\n";
+    }
+
+    if (compareData(ref_data, src_data, src_width * src_height * 4, epsilon,
+                    threshold) == false) {
+        error_count = 1;
+    }
+
+    // both buffers were malloc'ed by sdkLoadPPM4ub and are owned here
+    free(src_data);
+    free(ref_data);
+
+    if (verboseErrors) {
+        if (error_count == 0) {
+            std::cerr << "    OK\n\n";
+        } else {
+            std::cerr << "    FAILURE!  " << error_count << " errors...\n\n";
+        }
+    }
+
+    // returns true if all pixels pass
+    return error_count == 0;
+}
+
+// Compare the image file `src_file` against the reference image `ref_file`.
+// Both files are loaded with sdkLoadPPMub and compared element-wise
+// (width * height elements) with compareData using `epsilon` and `threshold`.
+// Returns false if a file name is NULL, a file cannot be loaded, the image
+// sizes differ, or the comparison fails; true otherwise. Diagnostics go to
+// stderr when `verboseErrors` is set.
+inline bool sdkComparePGM(const char *src_file, const char *ref_file,
+                          const float epsilon, const float threshold,
+                          bool verboseErrors) {
+    unsigned char *src_data = 0, *ref_data = 0;
+    uint64_t error_count = 0;
+    unsigned int ref_width, ref_height;
+    unsigned int src_width, src_height;
+
+    if (src_file == NULL || ref_file == NULL) {
+        if (verboseErrors) {
+            std::cerr << "PGMvsPGM: src_file or ref_file is NULL."
+                         "  Aborting comparison\n";
+        }
+
+        return false;
+    }
+
+    if (verboseErrors) {
+        std::cerr << "> Compare (a)rendered:  <" << src_file << ">\n";
+        std::cerr << ">         (b)reference: <" << ref_file << ">\n";
+    }
+
+    // The buffers below are released with free(), the same way sdkLoadPPM4ub
+    // releases a buffer obtained from the loader - also after a failed load.
+    if (sdkLoadPPMub(ref_file, &ref_data, &ref_width, &ref_height) != true) {
+        if (verboseErrors) {
+            std::cerr << "PGMvsPGM: unable to load ref image file: " << ref_file
+                      << "\n";
+        }
+
+        free(ref_data);
+        return false;
+    }
+
+    if (sdkLoadPPMub(src_file, &src_data, &src_width, &src_height) != true) {
+        // reported regardless of verboseErrors (existing behaviour)
+        std::cerr << "PGMvsPGM: unable to load src image file: " << src_file
+                  << "\n";
+        free(src_data);
+        free(ref_data);
+        return false;
+    }
+
+    if (src_height != ref_height || src_width != ref_width) {
+        if (verboseErrors) {
+            std::cerr << "PGMvsPGM: source and ref size mismatch (" << src_width
+                      << "," << src_height << ")vs(" << ref_width << "," << ref_height
+                      << ")\n";
+        }
+
+        // Comparing src-sized data against a differently sized reference
+        // buffer would read out of bounds; images of different size differ.
+        free(src_data);
+        free(ref_data);
+        return false;
+    }
+
+    if (verboseErrors) {
+        std::cerr << "PGMvsPGM: comparing images size (" << src_width << ","
+                  << src_height << ") epsilon(" << epsilon << "), threshold("
+                  << threshold * 100 << "%)\n";
+    }
+
+    if (compareData(ref_data, src_data, src_width * src_height, epsilon,
+                    threshold) == false) {
+        error_count = 1;
+    }
+
+    free(src_data);
+    free(ref_data);
+
+    if (verboseErrors) {
+        if (error_count == 0) {
+            std::cerr << "    OK\n\n";
+        } else {
+            std::cerr << "    FAILURE!  " << error_count << " errors...\n\n";
+        }
+    }
+
+    // returns true if all pixels pass
+    return error_count == 0;
+}
+
+#endif  // COMMON_HELPER_IMAGE_H_
\ No newline at end of file
diff --git a/3rdParty/cuda_samples/helper_string.h b/3rdParty/cuda_samples/helper_string.h
new file mode 100644
index 0000000000000000000000000000000000000000..c09935174ec057757c20263df080cb8d77b53f52
--- /dev/null
+++ b/3rdParty/cuda_samples/helper_string.h
@@ -0,0 +1,368 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// These are helper functions for the SDK samples (string parsing, timers, etc)
+#ifndef COMMON_HELPER_STRING_H_
+#define COMMON_HELPER_STRING_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fstream>
+#include <string>
+
+// Portability layer: every macro below is only defined when the including
+// code has not defined it already, so a project may supply its own variant.
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+#ifndef _CRT_SECURE_NO_DEPRECATE
+#define _CRT_SECURE_NO_DEPRECATE
+#endif
+// case-insensitive string comparison
+#ifndef STRCASECMP
+#define STRCASECMP _stricmp
+#endif
+#ifndef STRNCASECMP
+#define STRNCASECMP _strnicmp
+#endif
+// string copy; nLength is the size of the destination buffer
+#ifndef STRCPY
+#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath)
+#endif
+
+// FOPEN stores the stream in fHandle; here the expression evaluates to the
+// fopen_s result, which FOPEN_FAIL compares against 0
+#ifndef FOPEN
+#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode)
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result != 0)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf_s
+#endif
+#ifndef SPRINTF
+#define SPRINTF sprintf_s
+#endif
+#else  // Linux Includes
+#include <string.h>
+#include <strings.h>
+
+// case-insensitive string comparison
+#ifndef STRCASECMP
+#define STRCASECMP strcasecmp
+#endif
+#ifndef STRNCASECMP
+#define STRNCASECMP strncasecmp
+#endif
+// string copy; nLength is ignored in this variant, so the caller must make
+// sure the destination is large enough
+#ifndef STRCPY
+#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath)
+#endif
+
+// FOPEN stores the stream in fHandle; here the expression evaluates to the
+// FILE pointer itself, which FOPEN_FAIL compares against NULL
+#ifndef FOPEN
+#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode))
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result == NULL)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf
+#endif
+#ifndef SPRINTF
+#define SPRINTF sprintf
+#endif
+#endif
+
+// Exit status value 2; defined only if the including code has not set it.
+// NOTE(review): presumably returned by samples that are skipped ("waived")
+// rather than failed - confirm against the callers.
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// CUDA Utility Helper Functions
+
+// Return the number of leading `delimiter` characters in `string`, i.e. the
+// index of the first character that is not the delimiter. Returns 0 when the
+// string is empty or consists of delimiters only.
+inline int stringRemoveDelimiter(char delimiter, const char *string) {
+    int string_start = 0;
+
+    while (string[string_start] == delimiter) {
+        string_start++;
+    }
+
+    // Nothing is left after the delimiters. The former test
+    // `string_start >= strlen(string) - 1` also fired when exactly one
+    // character remained (so "-h" was never stripped) and relied on unsigned
+    // wrap-around for the empty string.
+    if (string[string_start] == '\0') {
+        return 0;
+    }
+
+    return string_start;
+}
+
+// Locate the extension of `filename`, i.e. the text after its last '.'.
+// On success *extension points into `filename` just behind that dot and the
+// index of that position is returned. When there is no dot, or the only dot
+// is the very first character, *extension is set to NULL and 0 is returned.
+inline int getFileExtension(char *filename, char **extension) {
+    // strrchr replaces the hand-written backwards scan, which missed a dot at
+    // index 1 ("a.txt") and read filename[-1] for an empty string
+    char *last_dot = strrchr(filename, '.');
+
+    if (last_dot == NULL || last_dot == filename) {
+        *extension = NULL;
+        return 0;
+    }
+
+    *extension = last_dot + 1;
+    return static_cast<int>(*extension - filename);
+}
+
+// Report whether one of argv[1] .. argv[argc - 1] names the flag
+// `string_ref`. For each argument the leading '-' characters are skipped via
+// stringRemoveDelimiter; the text up to an optional '=' must then have the
+// same length as `string_ref` and equal it ignoring case.
+inline bool checkCmdLineFlag(const int argc, const char **argv,
+                             const char *string_ref) {
+    for (int arg = 1; arg < argc; ++arg) {
+        const char *name = argv[arg] + stringRemoveDelimiter('-', argv[arg]);
+
+        // the flag name ends at '=' if present, otherwise at the end
+        const char *assign = strchr(name, '=');
+        const int name_length =
+            static_cast<int>(assign == 0 ? strlen(name) : assign - name);
+        const int ref_length = static_cast<int>(strlen(string_ref));
+
+        if (name_length == ref_length &&
+            !STRNCASECMP(name, string_ref, ref_length)) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+// Look for the first argument in argv[1] .. argv[argc - 1] whose name (after
+// skipping leading '-' characters) starts with `string_ref`, ignoring case.
+// If at least one character follows the name, an optional '=' is skipped and
+// the rest is parsed with atoi and stored, converted to T, in *value;
+// otherwise *value is left unchanged.
+// Returns true if such an argument was found, false otherwise.
+template <class T>
+inline bool getCmdLineArgumentValue(const int argc, const char **argv,
+                                    const char *string_ref, T *value) {
+    for (int arg = 1; arg < argc; ++arg) {
+        const char *name = &argv[arg][stringRemoveDelimiter('-', argv[arg])];
+        const int ref_length = static_cast<int>(strlen(string_ref));
+
+        if (STRNCASECMP(name, string_ref, ref_length) != 0) {
+            continue;
+        }
+
+        // only parse when something follows the argument name
+        if (ref_length + 1 <= static_cast<int>(strlen(name))) {
+            const int skip = (name[ref_length] == '=') ? 1 : 0;
+            *value = (T)atoi(&name[ref_length + skip]);
+        }
+
+        // the first matching argument ends the search
+        return true;
+    }
+
+    return false;
+}
+
+// Return the integer value of the command line argument `string_ref`.
+// Every argument in argv[1] .. argv[argc - 1] whose name (after skipping
+// leading '-' characters) starts with `string_ref`, ignoring case, is
+// considered; the last such argument wins. Its value is the atoi result of
+// the text behind the name (an optional '=' is skipped), or 0 when nothing
+// follows the name. Returns 0 when no argument matches.
+inline int getCmdLineArgumentInt(const int argc, const char **argv,
+                                 const char *string_ref) {
+    int result = 0;
+
+    for (int arg = 1; arg < argc; ++arg) {
+        const char *name = &argv[arg][stringRemoveDelimiter('-', argv[arg])];
+        const int ref_length = static_cast<int>(strlen(string_ref));
+
+        if (STRNCASECMP(name, string_ref, ref_length) != 0) {
+            continue;
+        }
+
+        if (ref_length + 1 <= static_cast<int>(strlen(name))) {
+            const int skip = (name[ref_length] == '=') ? 1 : 0;
+            result = atoi(&name[ref_length + skip]);
+        } else {
+            // bare argument without a value
+            result = 0;
+        }
+    }
+
+    return result;
+}
+
+// Return the floating point value of the command line argument `string_ref`.
+// Every argument in argv[1] .. argv[argc - 1] whose name (after skipping
+// leading '-' characters) starts with `string_ref`, ignoring case, is
+// considered; the last such argument wins. Its value is the atof result of
+// the text behind the name (an optional '=' is skipped), or 0 when nothing
+// follows the name. Returns 0 when no argument matches.
+inline float getCmdLineArgumentFloat(const int argc, const char **argv,
+                                     const char *string_ref) {
+    float result = 0;
+
+    for (int arg = 1; arg < argc; ++arg) {
+        const char *name = &argv[arg][stringRemoveDelimiter('-', argv[arg])];
+        const int ref_length = static_cast<int>(strlen(string_ref));
+
+        if (STRNCASECMP(name, string_ref, ref_length) != 0) {
+            continue;
+        }
+
+        if (ref_length + 1 <= static_cast<int>(strlen(name))) {
+            const int skip = (name[ref_length] == '=') ? 1 : 0;
+            result = static_cast<float>(atof(&name[ref_length + skip]));
+        } else {
+            // bare argument without a value
+            result = 0.f;
+        }
+    }
+
+    return result;
+}
+
+// Fetch the string value of the command line argument `string_ref`.
+// Every argument in argv[1] .. argv[argc - 1] whose name (after skipping
+// leading '-' characters) starts with `string_ref`, ignoring case, is
+// considered; the last such argument wins. *string_retval then points into
+// that argv entry, one character behind the name (i.e. behind the '='), or
+// at the terminating NUL when the argument carries no value.
+// Returns true if an argument matched; otherwise sets *string_retval to NULL
+// and returns false.
+inline bool getCmdLineArgumentString(const int argc, const char **argv,
+                                     const char *string_ref,
+                                     char **string_retval) {
+    bool bFound = false;
+
+    if (argc >= 1) {
+        for (int i = 1; i < argc; i++) {
+            int string_start = stringRemoveDelimiter('-', argv[i]);
+            char *string_argv = const_cast<char *>(&argv[i][string_start]);
+            int length = static_cast<int>(strlen(string_ref));
+
+            if (!STRNCASECMP(string_argv, string_ref, length)) {
+                // Skip the separator only if there is one: for a bare
+                // argument, `length + 1` would point past the terminating
+                // NUL, outside the string.
+                *string_retval = (string_argv[length] == '\0')
+                                     ? &string_argv[length]
+                                     : &string_argv[length + 1];
+                bFound = true;
+            }
+        }
+    }
+
+    if (!bFound) {
+        *string_retval = NULL;
+    }
+
+    return bFound;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Find the path for a file assuming that
+//! files are found in the searchPath.
+//! Each search path entry is tried in order by opening the candidate file
+//! for reading; the first entry that can be opened wins.
+//!
+//! @return the path if succeeded (a malloc'ed string the caller must
+//!         free()), otherwise 0
+//! @param filename         name of the file
+//! @param executable_path  optional absolute path of the executable; entries
+//!                         containing <executable_name> are skipped when 0
+//////////////////////////////////////////////////////////////////////////////
+inline char *sdkFindFilePath(const char *filename,
+                             const char *executable_path) {
+    // <executable_name> defines a variable that is replaced with the name of the
+    // executable
+
+    // Typical relative search paths to locate needed companion files (e.g. sample
+    // input data, or JIT source files) The origin for the relative search may be
+    // the .exe file, a .bat file launching an .exe, a browser .exe launching the
+    // .exe or .bat, etc
+    const char *searchPath[] = {
+        "./",                                           // same dir
+        "./data/",                                      // data subdir
+        "../../../../Samples/<executable_name>/",       // up 4 in tree
+        "../../../Samples/<executable_name>/",          // up 3 in tree
+        "../../Samples/<executable_name>/",             // up 2 in tree
+        "../../../../Samples/<executable_name>/data/",  // up 4 in tree
+        "../../../Samples/<executable_name>/data/",     // up 3 in tree
+        "../../Samples/<executable_name>/data/",        // up 2 in tree
+        "../../../../Common/data/",                     // up 4 in tree
+        "../../../Common/data/",                        // up 3 in tree
+        "../../Common/data/"                            // up 2 in tree
+    };
+
+    // Extract the executable name
+    std::string executable_name;
+
+    if (executable_path != 0) {
+        executable_name = std::string(executable_path);
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+        // Windows path delimiter
+        size_t delimiter_pos = executable_name.find_last_of('\\');
+        executable_name.erase(0, delimiter_pos + 1);
+
+        // Strip ".exe" only when it really is the suffix; rfind() alone also
+        // matches ".exe" in the middle of the name.
+        const std::string exe_suffix(".exe");
+
+        if (executable_name.size() >= exe_suffix.size() &&
+            executable_name.compare(executable_name.size() - exe_suffix.size(),
+                                    exe_suffix.size(), exe_suffix) == 0) {
+            executable_name.resize(executable_name.size() - exe_suffix.size());
+        }
+
+#else
+        // Linux & OSX path delimiter
+        size_t delimiter_pos = executable_name.find_last_of('/');
+        executable_name.erase(0, delimiter_pos + 1);
+#endif
+    }
+
+    // Loop over all search paths and return the first hit
+    for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char *); ++i) {
+        std::string path(searchPath[i]);
+        size_t executable_name_pos = path.find("<executable_name>");
+
+        // If there is executable_name variable in the searchPath
+        // replace it with the value
+        if (executable_name_pos != std::string::npos) {
+            if (executable_path != 0) {
+                path.replace(executable_name_pos, strlen("<executable_name>"),
+                             executable_name);
+            } else {
+                // Skip this path entry if no executable argument is given
+                continue;
+            }
+        }
+
+#ifdef _DEBUG
+        printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str());
+#endif
+
+        // Test if the file exists
+        path.append(filename);
+        FILE *fp = NULL;
+        FOPEN(fp, path.c_str(), "rb");
+
+        if (fp != NULL) {
+            fclose(fp);
+            // File found
+            // returning an allocated array here for backwards compatibility reasons
+            char *file_path = reinterpret_cast<char *>(malloc(path.length() + 1));
+
+            // out of memory: report "not found" rather than copying into NULL
+            if (file_path == NULL) {
+                return 0;
+            }
+
+            STRCPY(file_path, path.length() + 1, path.c_str());
+            return file_path;
+        }
+    }
+
+    // File not found
+    return 0;
+}
+
+#endif  // COMMON_HELPER_STRING_H_
\ No newline at end of file
diff --git a/3rdParty/cuda_samples/helper_timer.h b/3rdParty/cuda_samples/helper_timer.h
new file mode 100644
index 0000000000000000000000000000000000000000..51efd720993057092323ea224377e87c95d70ff2
--- /dev/null
+++ b/3rdParty/cuda_samples/helper_timer.h
@@ -0,0 +1,465 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// Helper Timing Functions
+#ifndef COMMON_HELPER_TIMER_H_
+#define COMMON_HELPER_TIMER_H_
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// includes, system
+#include <vector>
+
+// includes, project
+#include <exception.h>
+
+// Abstract StopWatch interface: a self-contained class interface for callers
+// that do not want to use the CUT functions. The timers below derive from it.
+class StopWatchInterface {
+public:
+    StopWatchInterface() {}
+    virtual ~StopWatchInterface() {}
+
+public:
+    //! Start time measurement
+    virtual void start() = 0;
+
+    //! Stop time measurement
+    virtual void stop() = 0;
+
+    //! Reset time counters to zero
+    virtual void reset() = 0;
+
+    //! Time in msec. The implementations in this header return the total
+    //! accumulated over all completed start()/stop() sessions since the last
+    //! reset(), plus the time elapsed in the current session if still running
+    virtual float getTime() = 0;
+
+    //! Mean time in msec. per completed (stopped) session; the implementations
+    //! in this header return 0 when no session has been completed yet
+    virtual float getAverageTime() = 0;
+};
+
+//////////////////////////////////////////////////////////////////
+// Begin Stopwatch timer class definitions for all OS platforms //
+//////////////////////////////////////////////////////////////////
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+#ifndef WIN32_LEAN_AND_MEAN  // system includes; the macro <windows.h> checks
+#define WIN32_LEAN_AND_MEAN
+#endif
+#include <windows.h>
+#undef min
+#undef max
+//! Windows specific implementation of StopWatch (QueryPerformanceCounter based)
+class StopWatchWin : public StopWatchInterface {
+ public:
+  //! Default constructor: zeroes all counters and queries the tick frequency
+  StopWatchWin()
+      : start_time(),
+        end_time(),
+        diff_time(0.0f),
+        total_time(0.0f),
+        running(false),
+        clock_sessions(0),
+        freq(0),
+        freq_set(false) {
+    if (!freq_set) {
+      // always taken here: freq_set was initialised to false just above
+      LARGE_INTEGER temp;
+
+      // get the tick frequency (counts per second) from the OS
+      QueryPerformanceFrequency(reinterpret_cast<LARGE_INTEGER *>(&temp));
+
+      // convert to ticks per millisecond, so tick deltas / freq yield msec
+      freq = (static_cast<double>(temp.QuadPart)) / 1000.0;
+
+      // remember that the frequency has been queried
+      freq_set = true;
+    }
+  }
+
+  // Destructor (nothing to release)
+  ~StopWatchWin() {}
+
+ public:
+  //! Start time measurement
+  inline void start();
+
+  //! Stop time measurement
+  inline void stop();
+
+  //! Reset time counters to zero
+  inline void reset();
+
+  //! Time in msec. accumulated over all completed sessions since the last
+  //! reset. If the stop watch is still running (i.e. there was no call to
+  //! stop()) the time elapsed in the current session is added
+  inline float getTime();
+
+  //! Mean time to date based on the number of times the stopwatch has been
+  //! _stopped_ (ie finished sessions) and the current total time
+  inline float getAverageTime();
+
+ private:
+  // member variables
+
+  //! Counter value captured at the start of the current measurement
+  LARGE_INTEGER start_time;
+  //! Counter value captured at the end of the last measurement
+  LARGE_INTEGER end_time;
+
+  //! Time difference in msec. between the last start and stop
+  float diff_time;
+
+  //! TOTAL time difference in msec. between starts and stops
+  float total_time;
+
+  //! flag if the stop watch is running
+  bool running;
+
+  //! Number of times clock has been started
+  //! and stopped to allow averaging
+  int clock_sessions;
+
+  //! tick frequency, stored in ticks per millisecond
+  double freq;
+
+  //! flag if the frequency has been set
+  bool freq_set;
+};
+
+// functions, inlined
+
+////////////////////////////////////////////////////////////////////////////////
+//! Begin a timing session: record the current counter value and mark running
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchWin::start() {
+  QueryPerformanceCounter(&start_time);
+  running = true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! End the current session: store its duration (msec) in diff_time, add it to
+//! total_time, count one more completed session and clear the running flag.
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchWin::stop() {
+  QueryPerformanceCounter(&end_time);
+  const double first_tick = static_cast<double>(start_time.QuadPart);
+  const double last_tick = static_cast<double>(end_time.QuadPart);
+  diff_time = static_cast<float>((last_tick - first_tick) / freq);
+
+  total_time += diff_time;
+  ++clock_sessions;
+  running = false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Zero all counters. The running state is left untouched; a running timer
+//! restarts its current session from the moment of this call.
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchWin::reset() {
+  clock_sessions = 0;
+  total_time = 0.0f;
+  diff_time = 0.0f;
+  if (!running) {
+    return;
+  }
+  QueryPerformanceCounter(&start_time);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. accumulated over all completed sessions since the last
+//! reset. While the stop watch is running (i.e. there was no call to stop()),
+//! the time elapsed in the current session is added; diff_time and total_time
+//! themselves are not modified.
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchWin::getTime() {
+  if (!running) {
+    // No open session: the accumulated total is the whole answer
+    return total_time;
+  }
+
+  LARGE_INTEGER now;
+  QueryPerformanceCounter(&now);
+  const double first_tick = static_cast<double>(start_time.QuadPart);
+  const double open_session =
+      (static_cast<double>(now.QuadPart) - first_tick) / freq;
+
+  return total_time + static_cast<float>(open_session);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Mean time in msec. per COMPLETED session; 0 if none has completed yet.
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchWin::getAverageTime() {
+  if (clock_sessions <= 0) return 0.0f;
+  return total_time / clock_sessions;
+}
+#else
+// Declarations for Stopwatch on Linux and Mac OSX
+// includes, system
+#include <sys/time.h>
+#include <ctime>
+
+//! Linux and Mac OSX specific implementation of StopWatch (gettimeofday based)
+class StopWatchLinux : public StopWatchInterface {
+public:
+    //! Default constructor: zeroes all counters, the timer is not running
+    StopWatchLinux()
+        : start_time(),
+          diff_time(0.0),
+          total_time(0.0),
+          running(false),
+          clock_sessions(0) {}
+
+    // Destructor (nothing to release)
+    virtual ~StopWatchLinux() {}
+
+public:
+    //! Start time measurement
+    inline void start();
+
+    //! Stop time measurement
+    inline void stop();
+
+    //! Reset time counters to zero
+    inline void reset();
+
+    //! Time in msec. accumulated over all completed sessions since the last
+    //! reset. If the stop watch is still running (i.e. there was no call to
+    //! stop()) the time elapsed in the current session is added
+    inline float getTime();
+
+    //! Mean time to date based on the number of times the stopwatch has been
+    //! _stopped_ (ie finished sessions) and the current total time
+    inline float getAverageTime();
+
+private:
+    // helper functions
+
+    //! Get difference in msec. between start time and current time
+    inline float getDiffTime();
+
+private:
+    // member variables
+
+    //! Time of day captured at the start of the current measurement
+    struct timeval start_time;
+
+    //! Time difference in msec. between the last start and stop
+    float diff_time;
+
+    //! TOTAL time difference in msec. between starts and stops
+    float total_time;
+
+    //! flag if the stop watch is running
+    bool running;
+
+    //! Number of times clock has been started
+    //! and stopped to allow averaging
+    int clock_sessions;
+};
+
+// functions, inlined
+
+////////////////////////////////////////////////////////////////////////////////
+//! Begin a timing session: record the current time of day and mark running
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchLinux::start() {
+    running = true;
+    gettimeofday(&start_time, NULL);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! End the current session and fold its duration (msec) into the totals.
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchLinux::stop() {
+    const float session_msec = getDiffTime();
+    diff_time = session_msec;
+    total_time += session_msec;
+    ++clock_sessions;
+    running = false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Zero all counters. The running state is left untouched; a running timer
+//! restarts its current session from the moment of this call.
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchLinux::reset() {
+    clock_sessions = 0;
+    total_time = 0.0f;
+    diff_time = 0.0f;
+    if (!running) {
+        return;
+    }
+    gettimeofday(&start_time, NULL);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. accumulated over all completed sessions since the last
+//! reset. While the stop watch is running (i.e. there was no call to stop()),
+//! the time elapsed in the current session is added; diff_time and total_time
+//! themselves are not modified.
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchLinux::getTime() {
+    if (!running) {
+        // No open session: the accumulated total is the whole answer
+        return total_time;
+    }
+
+    // Add the time elapsed in the still-open session
+    const float open_session = getDiffTime();
+    return total_time + open_session;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Mean time in msec. per COMPLETED session; 0 if none has completed yet.
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchLinux::getAverageTime() {
+    if (clock_sessions <= 0) return 0.0f;
+    return total_time / clock_sessions;
+}
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. elapsed between start_time and the moment of this call
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchLinux::getDiffTime() {
+    struct timeval now;
+    gettimeofday(&now, 0);
+    // whole seconds and the microsecond remainder, each converted to msec
+    const double sec_part = 1000.0 * (now.tv_sec - start_time.tv_sec);
+    const double usec_part = 0.001 * (now.tv_usec - start_time.tv_usec);
+    return static_cast<float>(sec_part + usec_part);
+}
+#endif  // WIN32
+
+////////////////////////////////////////////////////////////////////////////////
+//! Timer functionality exported
+
+////////////////////////////////////////////////////////////////////////////////
+//! Create a new timer
+//! @return true if a timer has been created, otherwise false
+//! @param  timer_interface  receives the new timer; free with sdkDeleteTimer
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkCreateTimer(StopWatchInterface **timer_interface) {
+// Implicit derived-to-base conversion; reinterpret_cast adjusts no pointers.
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+    *timer_interface = new StopWatchWin();
+#else
+    // Linux and Mac OSX
+    *timer_interface = new StopWatchLinux();
+#endif
+    return (*timer_interface != NULL) ? true : false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Delete a timer and null the caller's handle
+//! @return always true (a handle that is already null is simply left alone)
+//! @param  timer_interface  address of the handle of the timer to delete
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkDeleteTimer(StopWatchInterface **timer_interface) {
+    StopWatchInterface *const timer = *timer_interface;
+    if (timer == NULL) {
+        return true;
+    }
+    delete timer;
+    *timer_interface = NULL;
+    return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Start the given timer; a null handle is ignored
+//! @param timer_interface  address of the handle of the timer to start
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkStartTimer(StopWatchInterface **timer_interface) {
+    StopWatchInterface *const timer = *timer_interface;
+    if (timer) {
+        timer->start();
+    }
+    // Always reports success, also for a null handle
+    return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Stop the given timer without resetting it; a null handle is ignored
+//! @param timer_interface  address of the handle of the timer to stop
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkStopTimer(StopWatchInterface **timer_interface) {
+    StopWatchInterface *const timer = *timer_interface;
+    if (timer) {
+        timer->stop();
+    }
+    // Always reports success, also for a null handle
+    return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Reset the given timer's counters; a null handle is ignored
+//! @param timer_interface  address of the handle of the timer to reset
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkResetTimer(StopWatchInterface **timer_interface) {
+    StopWatchInterface *const timer = *timer_interface;
+    if (timer) {
+        timer->reset();
+    }
+    // Always reports success, also for a null handle
+    return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Return the average time per run of the given timer: its total time
+//! divided by the number of completed (stopped) runs the timer has made.
+//! Excludes the current running time if the timer is currently running.
+//! Returns 0 for a null handle.
+//! @param timer_interface  address of the handle of the timer to query
+////////////////////////////////////////////////////////////////////////////////
+inline float sdkGetAverageTimerValue(StopWatchInterface **timer_interface) {
+    StopWatchInterface *const timer = *timer_interface;
+    if (!timer) {
+        // A null handle yields 0 instead of being dereferenced
+        return 0.0f;
+    }
+
+    return timer->getAverageTime();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Total execution time for the timer over all runs since the last reset
+//! or timer creation. Returns 0 for a null handle.
+//! @param timer_interface  address of the handle of the timer to query
+////////////////////////////////////////////////////////////////////////////////
+inline float sdkGetTimerValue(StopWatchInterface **timer_interface) {
+    StopWatchInterface *const timer = *timer_interface;
+    if (!timer) {
+        return 0.0f;
+    }
+
+    return timer->getTime();
+}
+
+#endif  // COMMON_HELPER_TIMER_H_
\ No newline at end of file
diff --git a/3rdParty/googletest/CMakeLists.txt b/3rdParty/googletest/CMakeLists.txt
index acc5fb1ed8d37bcc89e9e60aceb28c7400e7cfca..781b60be44fc76ec640fea0444a527c10125e141 100644
--- a/3rdParty/googletest/CMakeLists.txt
+++ b/3rdParty/googletest/CMakeLists.txt
@@ -1,7 +1,7 @@
 # Note: CMake support is community-based. The maintainers do not use CMake
 # internally.
 
-cmake_minimum_required(VERSION 2.8.8)
+cmake_minimum_required(VERSION 2.8.12)
 
 if (POLICY CMP0048)
   cmake_policy(SET CMP0048 NEW)
diff --git a/3rdParty/googletest/googlemock/CMakeLists.txt b/3rdParty/googletest/googlemock/CMakeLists.txt
index d32b70b5be0e0ae74f5376fb03a2226065ad599a..63cd3c61441ee2ba9177b6b366c069013b0ff4ad 100644
--- a/3rdParty/googletest/googlemock/CMakeLists.txt
+++ b/3rdParty/googletest/googlemock/CMakeLists.txt
@@ -42,7 +42,7 @@ else()
   cmake_policy(SET CMP0048 NEW)
   project(gmock VERSION ${GOOGLETEST_VERSION} LANGUAGES CXX C)
 endif()
-cmake_minimum_required(VERSION 2.6.4)
+cmake_minimum_required(VERSION 2.8.12)
 
 if (COMMAND set_up_hermetic_build)
   set_up_hermetic_build()
diff --git a/3rdParty/googletest/googletest/CMakeLists.txt b/3rdParty/googletest/googletest/CMakeLists.txt
index db292946a59453e09929229c1fbdb3701f2bd6ab..0ef01d22e7216988d86f728a27309d6494e36b1c 100644
--- a/3rdParty/googletest/googletest/CMakeLists.txt
+++ b/3rdParty/googletest/googletest/CMakeLists.txt
@@ -53,7 +53,7 @@ else()
   cmake_policy(SET CMP0048 NEW)
   project(gtest VERSION ${GOOGLETEST_VERSION} LANGUAGES CXX C)
 endif()
-cmake_minimum_required(VERSION 2.6.4)
+cmake_minimum_required(VERSION 2.8.12)
 
 if (POLICY CMP0063) # Visibility
   cmake_policy(SET CMP0063 NEW)
diff --git a/3rdParty/metis/metis-5.1.1/GKlib/GKlibSystem.cmake b/3rdParty/metis/metis-5.1.1/GKlib/GKlibSystem.cmake
index d83b2083c176a3addb3ddb951fd0e44923b18aa6..b8478c5d06fd5f7f1347ef8da75073a90fa5faa1 100644
--- a/3rdParty/metis/metis-5.1.1/GKlib/GKlibSystem.cmake
+++ b/3rdParty/metis/metis-5.1.1/GKlib/GKlibSystem.cmake
@@ -61,7 +61,9 @@ if(GDB)
   set(GKlib_COPTS "${GKlib_COPTS} -g")
   set(GKlib_COPTIONS "${GKlib_COPTIONS} -Werror")
 else()
-  set(GKlib_COPTS "-O3")
+  if(NOT MSVC)
+    set(GKlib_COPTS "-O3")
+  endif()
 endif(GDB)
 
 
diff --git a/3rdParty/metis/metis-5.1.1/libmetis/CMakeLists.txt b/3rdParty/metis/metis-5.1.1/libmetis/CMakeLists.txt
index 4732b645ea354ada4a61540e12f73bff90540cb5..802241df51949274173b3f647cf6a54615947eb5 100644
--- a/3rdParty/metis/metis-5.1.1/libmetis/CMakeLists.txt
+++ b/3rdParty/metis/metis-5.1.1/libmetis/CMakeLists.txt
@@ -6,10 +6,12 @@ file(GLOB metis_sources *.c)
 add_library(metis ${METIS_LIBRARY_TYPE} ${GKlib_sources} ${metis_sources})
 if(UNIX)
   target_link_libraries(metis m)
+
+  target_compile_options(metis PRIVATE "-Wno-format")
 endif()
 
 if(MSVC)
-   target_compile_options(metis PRIVATE "/w")
+   target_compile_options(metis PRIVATE "/W0")
 endif()
 
 if(METIS_INSTALL)
diff --git a/CMake/3rd.cmake b/CMake/3rd.cmake
index 6cc488f94716f7cec973874b0930ed8f9b719d08..781146111d48739671b35c98bb96ebff358809b4 100644
--- a/CMake/3rd.cmake
+++ b/CMake/3rd.cmake
@@ -1,5 +1,2 @@
 include(${VF_CMAKE_DIR}/3rd/boost.cmake)
-include(${VF_CMAKE_DIR}/3rd/cuda.cmake)
 include(${VF_CMAKE_DIR}/3rd/gmock.cmake)
-include(${VF_CMAKE_DIR}/3rd/mpi.cmake)
-include(${VF_CMAKE_DIR}/3rd/OpenMP.cmake)
\ No newline at end of file
diff --git a/CMake/3rd/OpenMP.cmake b/CMake/3rd/OpenMP.cmake
deleted file mode 100644
index 45465ba4bf43dd5cf7687b83ac1d368332614582..0000000000000000000000000000000000000000
--- a/CMake/3rd/OpenMP.cmake
+++ /dev/null
@@ -1,13 +0,0 @@
-function (linkOpenMP targetName)
-
-	if(NOT USE_OPENMP)
-		return()
-	endif()
-
-	find_package(OpenMP REQUIRED)
-
-	if(OpenMP_CXX_FOUND)
-		target_link_libraries(${targetName} PUBLIC OpenMP::OpenMP_CXX)
-	endif()
-
-endfunction()
\ No newline at end of file
diff --git a/CMake/3rd/boost.cmake b/CMake/3rd/boost.cmake
index 544ae2d97b3b8ef0277445cf252149986c8dfb3a..74f6f165cda1ef93f64dd326c961b2663bc6ea67 100644
--- a/CMake/3rd/boost.cmake
+++ b/CMake/3rd/boost.cmake
@@ -1,4 +1,10 @@
-function(linkBoost components)
+function(linkBoost)
+
+    set( options )
+    set( oneValueArgs )
+    set( multiValueArgs COMPONENTS)
+    cmake_parse_arguments( ARG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN} )
+
   if(BUILD_SHARED_LIBS)
      if (WIN32)
          set(Boost_USE_STATIC_LIBS ON)
@@ -8,7 +14,9 @@ function(linkBoost components)
 	 set(Boost_USE_STATIC_RUNTIME OFF)
   else()
 	 set(Boost_USE_STATIC_LIBS ON)
-	 set(Boost_USE_STATIC_RUNTIME ON)
+   if(WIN32)
+	  set(Boost_USE_STATIC_RUNTIME ON)
+   endif()
   endif()
 	  
   set(Boost_USE_MULTITHREADED ON)
@@ -18,8 +26,16 @@ function(linkBoost components)
 #	add_definitions( -DBOOST_ALL_DYN_LINK )
   endif()
 
-  vf_get_library_name(library_name)
-  find_package( Boost REQUIRED COMPONENTS ${components})
+    vf_get_library_name(library_name)
+    if(DEFINED ARG_COMPONENTS)
+        find_package( Boost REQUIRED COMPONENTS ${ARG_COMPONENTS})
+        target_link_libraries(${library_name} PRIVATE ${Boost_LIBRARIES})
+        message(STATUS "linkBoost: ${library_name} links Boost components: ${ARG_COMPONENTS}")
+    else()
+        find_package( Boost REQUIRED)
+        message(STATUS "linkBoost: ${library_name} uses the Boost headers only")
+    endif()
+
+
   target_include_directories(${library_name} PRIVATE ${Boost_INCLUDE_DIR})
-  target_link_libraries(${library_name} PRIVATE ${Boost_LIBRARIES})
 endfunction()
diff --git a/CMake/3rd/cuda.cmake b/CMake/3rd/cuda.cmake
deleted file mode 100644
index 83acbdcc858e87e04d46d5a33e9b64b11c0e20e4..0000000000000000000000000000000000000000
--- a/CMake/3rd/cuda.cmake
+++ /dev/null
@@ -1,21 +0,0 @@
-
-function(linkCUDA)
-
-    find_path(CUDA_CUT_INCLUDE_DIR
-      helper_cuda.h
-      PATHS "$ENV{NVCUDASAMPLES_ROOT}" "${NVCUDASAMPLES_ROOT}"
-      PATH_SUFFIXES "common/inc" "Common"
-      DOC "Location of helper_cuda.h"
-      NO_DEFAULT_PATH
-    )
-
-    vf_get_library_name(library_name)
-    target_include_directories(${library_name} PRIVATE ${CUDA_CUT_INCLUDE_DIR})
-    target_include_directories(${library_name} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
-
-    message(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
-
-    # set the following properties only for specific targets
-    # set_property(TARGET ${targetName} PROPERTY CUDA_SEPARABLE_COMPILATION ON)
-    # set_property(TARGET ${targetName} PROPERTY CUDA_64_BIT_DEVICE_CODE ON)
-endfunction()
\ No newline at end of file
diff --git a/CMake/3rd/mpi.cmake b/CMake/3rd/mpi.cmake
deleted file mode 100644
index 93b3ee386cc46623ed02bbbffb88996328f03e52..0000000000000000000000000000000000000000
--- a/CMake/3rd/mpi.cmake
+++ /dev/null
@@ -1,9 +0,0 @@
-function (linkMPI)
-
-    find_package(MPI REQUIRED)
-
-    vf_get_library_name(library_name)
-    target_include_directories(${library_name} PUBLIC ${MPI_CXX_INCLUDE_PATH})
-    target_link_libraries(${library_name} PRIVATE MPI::MPI_CXX)
-
-endfunction()
\ No newline at end of file
diff --git a/CMake/CMakeSetCompilerFlags.cmake b/CMake/CMakeSetCompilerFlags.cmake
index 4165eeff8b8cce1e89d19beffefe7507496fee49..784f3f24a7cde518e113363aa92ce90fec0e9c2d 100644
--- a/CMake/CMakeSetCompilerFlags.cmake
+++ b/CMake/CMakeSetCompilerFlags.cmake
@@ -1,29 +1,21 @@
 
-###############################################################
-# set hostname -> CAB_MACHINE and load an optional config file
-###############################################################
+#########################################################################################
+## Reads the machine name via site_name() and loads an optional machine file <MACHINE_NAME>.config.cmake
+#########################################################################################
 macro(loadMachineFile)
 
-    IF(NOT CAB_MACHINE)
-        SET(CAB_MACHINE $ENV{CAB_MACHINE})
+    site_name(MACHINE_NAME)
+    string(TOUPPER  "${MACHINE_NAME}" MACHINE_NAME)
 
-        IF( CAB_MACHINE )
-            STRING(TOUPPER  "${CAB_MACHINE}" CAB_MACHINE)
-        ELSE()
-            EXECUTE_PROCESS( COMMAND hostname OUTPUT_VARIABLE CAB_MACHINE)
-            STRING(REGEX REPLACE "[ ]*([A-Za-z0-9]+).*[\\\\n]*" "\\1" CAB_MACHINE "${CAB_MACHINE}" )
-            STRING(TOUPPER  "${CAB_MACHINE}" CAB_MACHINE)
-        ENDIF()
-    ENDIF()
+    set(BUILD_MACHINE_FILE_PATH "${VF_CMAKE_DIR}/cmake_config_files")
 
-    LIST(APPEND VF_COMPILER_DEFINITION CAB_MACHINE=${CAB_MACHINE})
-    SET(CMAKE_CONFIG_FILE "${VF_CMAKE_DIR}/cmake_config_files/${CAB_MACHINE}.config.cmake")
+    set(MACHINE_FILE "${BUILD_MACHINE_FILE_PATH}/${MACHINE_NAME}.config.cmake")
 
-    IF(NOT EXISTS ${CMAKE_CONFIG_FILE})
-        status("No configuration file found for machine: ${CAB_MACHINE}.")
+    IF(NOT EXISTS ${MACHINE_FILE})
+        status("No configuration file found: ${MACHINE_FILE}.")
     ELSE()
-        status("Load configuration file ${CAB_MACHINE}.config.cmake")
-        include(${CMAKE_CONFIG_FILE})
+        status("Load configuration file: ${MACHINE_FILE}")
+        include(${MACHINE_FILE})
     ENDIF()
 
 endmacro()
@@ -35,9 +27,9 @@ endmacro()
 ################################################################
 macro(loadCompilerFlags)
 
-  SET(CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS "")
-  SET(CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS_DEBUG "")
-  SET(CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS_RELEASE "")
+  SET(CS_COMPILER_FLAGS_CXX "")
+  SET(CS_COMPILER_FLAGS_CXX_DEBUG "")
+  SET(CS_COMPILER_FLAGS_CXX_RELEASE "")
 
    # https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html#variable:CMAKE_<LANG>_COMPILER_ID
 
@@ -57,9 +49,9 @@ endmacro()
 ################################################################
 function(addAdditionalFlags project_name)
 
-    status_lib("additional compiler flags CXX: ${CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS}")
-    status_lib("additional compiler flags CXX debug: ${CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS_DEBUG}")
-    status_lib("additional compiler flags CXX release: ${CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS_RELEASE}")
+    status_lib("additional compiler flags CXX: ${CS_COMPILER_FLAGS_CXX}")
+    status_lib("additional compiler flags CXX debug: ${CS_COMPILER_FLAGS_CXX_DEBUG}")
+    status_lib("additional compiler flags CXX release: ${CS_COMPILER_FLAGS_CXX_RELEASE}")
     status_lib("additional compiler definitions: ${VF_COMPILER_DEFINITION}")
     status_lib("additional linker flags: ${VF_LINK_OPTIONS}")
 
@@ -74,15 +66,15 @@ function(addAdditionalFlags project_name)
     endforeach()
 
     # compile options
-    foreach(flag IN LISTS CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS)
+    foreach(flag IN LISTS CS_COMPILER_FLAGS_CXX)
         target_compile_options(${project_name} PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${flag}>")
     endforeach()
 
-    foreach(flag IN LISTS CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS_DEBUG)
+    foreach(flag IN LISTS CS_COMPILER_FLAGS_CXX_DEBUG)
         target_compile_options(${project_name} PRIVATE "$<$<AND:$<COMPILE_LANGUAGE:CXX>,$<CONFIG:DEBUG>>:${flag}>")
     endforeach()
 
-    foreach(flag IN LISTS CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS_RELEASE)
+    foreach(flag IN LISTS CS_COMPILER_FLAGS_CXX_RELEASE)
         target_compile_options(${project_name} PRIVATE "$<$<AND:$<COMPILE_LANGUAGE:CXX>,$<CONFIG:RELEASE>>:${flag}>")
     endforeach()
 
diff --git a/CMake/VirtualFluidsMacros.cmake b/CMake/VirtualFluidsMacros.cmake
index 69e56fa63d4f25adb84e539678055e7f46b62d3d..debb5ee7826d7dc3d6499b3813070dcab4b94bd4 100644
--- a/CMake/VirtualFluidsMacros.cmake
+++ b/CMake/VirtualFluidsMacros.cmake
@@ -112,6 +112,15 @@ function(vf_add_library)
     else()
         vf_get_library_name (library_name)
     endif()
+
+    if(NOT DEFINED ARG_BUILDTYPE)
+        if(BUILD_SHARED_LIBS)
+            set(ARG_BUILDTYPE "shared")
+        else()
+            set(ARG_BUILDTYPE "static")
+        endif()
+    endif()
+
     status("Configuring the target: ${library_name} (type=${ARG_BUILDTYPE})...")
 
 
@@ -216,6 +225,10 @@ function(vf_add_library)
     target_include_directories(${library_name} PRIVATE ${VF_SRC_DIR}/gpu)
     target_include_directories(${library_name} PRIVATE ${VF_SRC_DIR}/cpu)
 
+    if(BUILD_VF_GPU)
+        target_include_directories(${library_name} PRIVATE "${VF_THIRD_DIR}/cuda_samples/")
+        target_include_directories(${library_name} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+    endif()
 
     status("... configuring target: ${library_name} (type=${ARG_BUILDTYPE}) done")
 
diff --git a/CMake/cmake_config_files/PHOENIX.config.cmake b/CMake/cmake_config_files/PHOENIX.config.cmake
index 1d69df88bc174c206fb3639b7d67e703dd41d5eb..2f576538c106a6a4d83509a49a1408a8d63efbdb 100644
--- a/CMake/cmake_config_files/PHOENIX.config.cmake
+++ b/CMake/cmake_config_files/PHOENIX.config.cmake
@@ -43,4 +43,5 @@ SET(BOOST_LIBRARYDIR  "/cluster/lib/boost/1.63.0/gcc/lib"  CACHE PATH "BOOST_LIB
 #SET(VTK_DIR "/home/irmb/tools/VTK/build/VTK-8.2.0" CACHE PATH "VTK directory override" FORCE)
 #SET(VTK_DIR "/home/stelenz/software/vtk/VTK-8.1.0/build" CACHE PATH "VTK directory override" FORCE)
 
-set(NVCUDASAMPLES_ROOT "/cluster/cuda/11.0/samples")
+## nvidia
+set(CMAKE_CUDA_ARCHITECTURES 60) # NVIDIA Tesla P100
\ No newline at end of file
diff --git a/CMake/compilerflags/AppleClang.cmake b/CMake/compilerflags/AppleClang.cmake
index aecc48fa5398c6418c2d47f5af243659128050bc..6f52ad35956a967b3892aed26257fa1c41cb0c46 100644
--- a/CMake/compilerflags/AppleClang.cmake
+++ b/CMake/compilerflags/AppleClang.cmake
@@ -3,20 +3,26 @@
 #############################################################################################################
 
 # debug
-list(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS_DEBUG "-g")  # generates debug information. Works best with -O0.
-list(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS_DEBUG "-O0")
+list(APPEND CS_COMPILER_FLAGS_CXX_DEBUG "-g")  # generates debug information. Works best with -O0.
+list(APPEND CS_COMPILER_FLAGS_CXX_DEBUG "-O0")
 
 # release
-list(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS_RELEASE "-O3") # optimization level (-O3: most optimization which also could result in larger binaries)
+list(APPEND CS_COMPILER_FLAGS_CXX_RELEASE "-O3") # optimization level (-O3: most optimization which also could result in larger binaries)
 
 # all
-list(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS "-fPIC") # position independent code for shared libraries
+list(APPEND CS_COMPILER_FLAGS_CXX "-fPIC") # position independent code for shared libraries
 
 #############################################################################################################
 # warnings
 #############################################################################################################
-list(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS "-Wall")
-list(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS "-Wunreachable-code")
+list(APPEND CS_COMPILER_FLAGS_CXX "-Wall")
+list(APPEND CS_COMPILER_FLAGS_CXX "-Wextra")
+list(APPEND CS_COMPILER_FLAGS_CXX "-pedantic")
 
-list(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS "-Wno-unused-function")
-list(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS "-Wno-reorder")
+if(BUILD_WARNINGS_AS_ERRORS)
+    list(APPEND CS_COMPILER_FLAGS_CXX -Werror)
+endif()
+
+list(APPEND CS_COMPILER_FLAGS_CXX "-Wno-unused-function")
+list(APPEND CS_COMPILER_FLAGS_CXX "-Wno-unused-parameter")
+list(APPEND CS_COMPILER_FLAGS_CXX "-Wno-reorder")
diff --git a/CMake/compilerflags/Clang.cmake b/CMake/compilerflags/Clang.cmake
index 434be42697165ea347b2c2728e9cf836b66650af..2eb4eec5ee89715a1668ef078c84f4e720bd04e6 100644
--- a/CMake/compilerflags/Clang.cmake
+++ b/CMake/compilerflags/Clang.cmake
@@ -3,22 +3,31 @@
 #############################################################################################################
 
 # debug
-list(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS_DEBUG "-g")  # generates debug information. Works best with -O0.
-list(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS_DEBUG "-O0")
+list(APPEND CS_COMPILER_FLAGS_CXX_DEBUG "-g")  # generates debug information. Works best with -O0.
+list(APPEND CS_COMPILER_FLAGS_CXX_DEBUG "-O0")
 
 # release
-list(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS_RELEASE "-O3") # optimization level (-O3: most optimization which also could result in larger binaries)
+list(APPEND CS_COMPILER_FLAGS_CXX_RELEASE "-O3") # optimization level (-O3: most optimization which also could result in larger binaries)
 
 # all
-list(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS "-fPIC") # position independent code for shared libraries
+list(APPEND CS_COMPILER_FLAGS_CXX "-fPIC") # position independent code for shared libraries
 
 
 #############################################################################################################
 # warnings
 #############################################################################################################
-list(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS "-Wall")
-list(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS "-Wno-unused-function")
-list(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS "-Wno-reorder-ctor")
+list(APPEND CS_COMPILER_FLAGS_CXX "-Wall")
+list(APPEND CS_COMPILER_FLAGS_CXX "-Wextra")
+list(APPEND CS_COMPILER_FLAGS_CXX "-pedantic")
+
+if(BUILD_WARNINGS_AS_ERRORS)
+    list(APPEND CS_COMPILER_FLAGS_CXX -Werror)
+endif()
+
+list(APPEND CS_COMPILER_FLAGS_CXX "-Wno-unused-function")
+list(APPEND CS_COMPILER_FLAGS_CXX "-Wno-unused-parameter")
+list(APPEND CS_COMPILER_FLAGS_CXX "-Wno-reorder-ctor")
+
 
 
 #############################################################################################################
diff --git a/CMake/compilerflags/GNU.cmake b/CMake/compilerflags/GNU.cmake
index 211f2a040b9c6c71d632fbee8621377d1df63350..3e67b79e9c84d29b51b2881b17e2b74f5510bbd2 100644
--- a/CMake/compilerflags/GNU.cmake
+++ b/CMake/compilerflags/GNU.cmake
@@ -3,33 +3,41 @@
 #############################################################################################################
 
 # debug
-list(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS_DEBUG "-g")  # generates debug information. Works best with -O0.
-list(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS_DEBUG "-O0") # no optimization
+list(APPEND CS_COMPILER_FLAGS_CXX_DEBUG "-g")  # generates debug information. Works best with -O0.
+list(APPEND CS_COMPILER_FLAGS_CXX_DEBUG "-O0") # no optimization
 
 # release
-list(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS_RELEASE "-O3") # optimization level (-O3: most optimization which also could result in larger binaries)
+list(APPEND CS_COMPILER_FLAGS_CXX_RELEASE "-O3") # optimization level (-O3: most optimization which also could result in larger binaries)
 
 # all
-list(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS "-fPIC") # position independent code for shared libraries
+list(APPEND CS_COMPILER_FLAGS_CXX "-fPIC") # position independent code for shared libraries
 
 if(NOT BUILD_VF_INCLUDE_WHAT_YOU_USE) # optimization flag '-funroll-all-loops' is not supported for IWYU
-    LIST(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS "-funroll-all-loops")
+    LIST(APPEND CS_COMPILER_FLAGS_CXX "-funroll-all-loops")
 endif()
 
 # gcov
 if (BUILD_VF_COVERAGE)
-    list(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS "--coverage")
+    list(APPEND CS_COMPILER_FLAGS_CXX "--coverage")
     set(CMAKE_EXE_LINKER_FLAGS ${CMAKE_EXE_LINKER_FLAGS} " --coverage")
 endif()
 
 #############################################################################################################
 # warnings
 #############################################################################################################
-list(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS "-Wall")
-list(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS "-Wno-unused-function")
-list(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS "-Wno-reorder")
-list(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS "-Wno-sign-compare")
-list(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS "-Wno-unknown-pragmas")
+list(APPEND CS_COMPILER_FLAGS_CXX "-Wall")
+list(APPEND CS_COMPILER_FLAGS_CXX "-Wextra")
+list(APPEND CS_COMPILER_FLAGS_CXX "-pedantic")
+
+if(BUILD_WARNINGS_AS_ERRORS)
+    list(APPEND CS_COMPILER_FLAGS_CXX -Werror)
+endif()
+
+list(APPEND CS_COMPILER_FLAGS_CXX "-Wno-unused-function")
+list(APPEND CS_COMPILER_FLAGS_CXX "-Wno-unused-parameter")
+list(APPEND CS_COMPILER_FLAGS_CXX "-Wno-reorder")
+list(APPEND CS_COMPILER_FLAGS_CXX "-Wno-unknown-pragmas")
+list(APPEND CS_COMPILER_FLAGS_CXX "-Wno-cast-function-type")
 
 #############################################################################################################
 # linker options
diff --git a/CMake/compilerflags/Intel.cmake b/CMake/compilerflags/Intel.cmake
index a53998d93eec146de6158d80bad2302536b2c252..c7177d4c246402d8b865e7059273cb18d0938e44 100644
--- a/CMake/compilerflags/Intel.cmake
+++ b/CMake/compilerflags/Intel.cmake
@@ -2,22 +2,22 @@
 # compiler flags
 #############################################################################################################
 
-#LIST(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS "-O")
-#~ LIST(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS "-wd654")
-#~ LIST(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS "-wd1125") #virtual function override intended
-#~ LIST(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS "-wd1224") #warning directive: This file includes at least one deprecated or antiquated header
-#~ LIST(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS "-wd377")  #class "std::auto_ptr<RCF::I_ClientTransport>" has no suitable copy constructor
-#~ LIST(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS "-wd327")  #class "std::auto_ptr<RCF::I_ClientTransport>" has no suitable copy constructor
-#~ LIST(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS "-wd327")  #class "std::auto_ptr<RCF::I_ClientTransport>" has no suitable copy constructor
-#~
-#~ LIST(APPEND CAB_COMPILER_ADDTIONAL_C_COMPILER_FLAGS "-wd266")  #function "__GKfree" declared implicitly
-#LIST(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS "-xHOST -O3 -ip -ipo -fno-alias -mcmodel=medium -qopt-streaming-stores=always")
+#LIST(APPEND CS_COMPILER_FLAGS_CXX "-O")
+#LIST(APPEND CS_COMPILER_FLAGS_CXX "-wd654")
+#LIST(APPEND CS_COMPILER_FLAGS_CXX "-wd1125") #virtual function override intended
+#LIST(APPEND CS_COMPILER_FLAGS_CXX "-wd1224") #warning directive: This file includes at least one deprecated or antiquated header
+#LIST(APPEND CS_COMPILER_FLAGS_CXX "-wd377")  #class "std::auto_ptr<RCF::I_ClientTransport>" has no suitable copy constructor
+#LIST(APPEND CS_COMPILER_FLAGS_CXX "-wd327")  #class "std::auto_ptr<RCF::I_ClientTransport>" has no suitable copy constructor
+#LIST(APPEND CS_COMPILER_FLAGS_CXX "-wd327")  #class "std::auto_ptr<RCF::I_ClientTransport>" has no suitable copy constructor
+#
+#LIST(APPEND CAB_COMPILER_ADDTIONAL_C_COMPILER_FLAGS "-wd266")  #function "__GKfree" declared implicitly
+#LIST(APPEND CS_COMPILER_FLAGS_CXX "-xHOST -O3 -ip -ipo -fno-alias -mcmodel=medium -qopt-streaming-stores=always")
 
 # all
-list(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS "-xHOST;-O3;-ip;-fno-alias;-mcmodel=medium;-qopt-streaming-stores=always;-xCORE-AVX512;-qopt-zmm-usage=high")
+list(APPEND CS_COMPILER_FLAGS_CXX "-xHOST;-O3;-ip;-fno-alias;-mcmodel=medium;-qopt-streaming-stores=always;-xCORE-AVX512;-qopt-zmm-usage=high")
 
 # debug
-list(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS_DEBUG "-g -traceback")
+list(APPEND CS_COMPILER_FLAGS_CXX_DEBUG "-g" "-traceback") # one list element per flag, like the ';'-separated flags above
 
 
 #############################################################################################################
diff --git a/CMake/compilerflags/MSVC.cmake b/CMake/compilerflags/MSVC.cmake
index 937d9f4d5d1fead78f35e2442845b27f13bc9a54..2af38d98e63cf04c4da476fb02754ce47510e4f6 100644
--- a/CMake/compilerflags/MSVC.cmake
+++ b/CMake/compilerflags/MSVC.cmake
@@ -1,29 +1,29 @@
 #############################################################################################################
 # compiler flags
 #############################################################################################################
-list(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS "/bigobj") # increases that address capacity to 4,294,967,296 (2^32).
-list(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS "-MP")     # enable multi-threaded compiling
+list(APPEND CS_COMPILER_FLAGS_CXX "/bigobj") # increases that address capacity to 4,294,967,296 (2^32).
+list(APPEND CS_COMPILER_FLAGS_CXX "-MP")     # enable multi-threaded compiling
 
 
 #############################################################################################################
 # warnings
 #############################################################################################################
-list(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS "/W4") # highest warning level
+list(APPEND CS_COMPILER_FLAGS_CXX "/W4") # highest warning level
 
 # With W4 the following warnings appear many times. As long they are not eliminated they are suppressed:
-list(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS "/wd4458") # C4458: declaration of 'XXX' hides class member
-list(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS "/wd4100") # C4100: 'XXX': unreferenced formal parameter
-list(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS "/wd4505") # C4505: 'XXX': unreferenced local function has been removed
-list(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS "/wd4244") # C4244: '=': conversion from 'int' to 'char', possible loss of data, triggered by algorithm(2216,24)
-list(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS "/wd4310") # C4310: cast truncates constant value, triggerd by muParserbase.h
-list(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS "/wd4127") # C4127: conditional expression is constant: e.g. sizeof(int)
+list(APPEND CS_COMPILER_FLAGS_CXX "/wd4458") # C4458: declaration of 'XXX' hides class member
+list(APPEND CS_COMPILER_FLAGS_CXX "/wd4100") # C4100: 'XXX': unreferenced formal parameter
+list(APPEND CS_COMPILER_FLAGS_CXX "/wd4505") # C4505: 'XXX': unreferenced local function has been removed
+list(APPEND CS_COMPILER_FLAGS_CXX "/wd4244") # C4244: '=': conversion from 'int' to 'char', possible loss of data, triggered by algorithm(2216,24)
+list(APPEND CS_COMPILER_FLAGS_CXX "/wd4310") # C4310: cast truncates constant value, triggerd by muParserbase.h
+list(APPEND CS_COMPILER_FLAGS_CXX "/wd4127") # C4127: conditional expression is constant: e.g. sizeof(int)
 
 # Urgent FIXME: This warning should be activated and fixed:
-list(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS "/wd4701") # C4701: potentially uninitialized local variable 'lMaxX3' used
+list(APPEND CS_COMPILER_FLAGS_CXX "/wd4701") # C4701: potentially uninitialized local variable 'lMaxX3' used
 
 
-list(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS "/wd4251") # disable needs to have dll interface
-list(APPEND CAB_COMPILER_ADDTIONAL_CXX_COMPILER_FLAGS "/wd4005") # disable macro redefinition (triggered by metis.h)
+list(APPEND CS_COMPILER_FLAGS_CXX "/wd4251") # disable needs to have dll interface
+list(APPEND CS_COMPILER_FLAGS_CXX "/wd4005") # disable macro redefinition (triggered by metis.h)
 
 #############################################################################################################
 # preprocessor definitions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 50073d8a0016c0345e9ae7ae3cfc5507afb6ad14..efce16caaf8d40ae3f5a9ca9bcedd5c5a6767bf5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,17 +8,15 @@
 #################################################################################
 #  required cmake versions
 #  CMAKE 3.13: target_link_options
+#  CMAKE 3.15: CMAKE_MSVC_RUNTIME_LIBRARY
 #################################################################################
-cmake_minimum_required(VERSION 3.13..3.18 FATAL_ERROR)
+cmake_minimum_required(VERSION 3.15..3.19 FATAL_ERROR)
 
 project(VirtualFluids CXX)
 
 set(CMAKE_CXX_STANDARD 14)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 
-set(CMAKE_CUDA_STANDARD 14)
-set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
-
 set_property(GLOBAL PROPERTY USE_FOLDERS ON)
 set_property(GLOBAL PROPERTY PREDEFINED_TARGETS_FOLDER ".cmake")
 set(libraryFolder "libs")
@@ -37,30 +35,36 @@ set (VF_ROOT_DIR  ${CMAKE_CURRENT_SOURCE_DIR})
 option(BUILD_VF_CPU "Build VirtualFluids cpu variant" OFF)
 option(BUILD_VF_GPU "Build VirtualFluids gpu variant" OFF)
 
+option(BUILD_USE_OPENMP "Build VirtualFluids with openmp" ON)
+
+
+# vf gpu: optional components. BUILD_VF_GPU itself is declared once above (default OFF);
+# CMake ignores a repeated option() for an existing variable, so it is not redeclared here.
+option(BUILD_VF_GKS          "Build VirtualFluids GKS"     OFF )
+option(BUILD_VF_TRAFFIC      "Build VirtualFluids Traffic" OFF)
+option(BUILD_JSONCPP         "Builds json cpp "            OFF)
+option(BUILD_NUMERIC_TESTS   "Build numeric tests"         OFF)
+
 option(BUILD_VF_UNIT_TESTS "Build VirtualFluids unit tests" OFF)
 option(BUILD_VF_CLANG_TIDY "Add the clang tidy checks to the targets" OFF)
 option(BUILD_VF_INCLUDE_WHAT_YOU_USE "Add IWYU to the targets" OFF)
 option(BUILD_VF_CPPCHECK "Add cppcheck to the targets" OFF)
 option(BUILD_VF_COVERAGE "Add the -coverage compiler flag." OFF)
 
-option(BUILD_SHARED_LIBS "" ON)
+option(BUILD_SHARED_LIBS "" OFF)
+option(BUILD_WARNINGS_AS_ERRORS "" OFF)
 
-option(USE_OPENMP "Include OpenMP support" ON)
+# windows: use the multi-threaded runtime library - DLL variant for shared builds, static variant otherwise
+if(BUILD_SHARED_LIBS)
+    set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>DLL")
+else()
+    set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")
+endif()
 
 option(BUILD_VF_PYTHON_BINDINGS "" OFF)
 
 option(BUILD_VF_DOUBLE_ACCURACY "Use double accuracy" OFF)
 
-#################################################################################
-#  CMAKE POLICIES
-#################################################################################
-# CMAKE_CUDA_ARCHITECTURES
-# https://cmake.org/cmake/help/git-stage/policy/CMP0104.htmls
-if(POLICY CMP0104)
-    cmake_policy(SET CMP0104 NEW)
-    set(CMAKE_CUDA_ARCHITECTURES 30)
-    # with cuda 11 the minimum architecture is 52
-endif()
 
 #################################################################################
 #  MACROS
@@ -75,6 +79,13 @@ ENDIF()
 #################################################################################
 #  COMMON LIBRARIES
 #################################################################################
+if(BUILD_USE_OPENMP)
+    find_package(OpenMP REQUIRED)
+endif()
+
+find_package(MPI REQUIRED)
+
+
 add_subdirectory(src/basics)
 
 #################################################################################
@@ -84,6 +95,24 @@ if (BUILD_VF_CPU)
     include (cpu.cmake)
 endif()
 if(BUILD_VF_GPU)
+
+    include(CheckLanguage)
+    check_language(CUDA)
+
+    if(NOT CMAKE_CUDA_COMPILER)
+        message(FATAL_ERROR "CUDA Compiler was requested but is not found on the system.")
+    endif()
+
+    set(CMAKE_CUDA_STANDARD 11)
+    set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
+
+    if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+        message(WARNING "CMAKE_CUDA_ARCHITECTURES was not defined and is set to 30 (CUDA support until 10.1 only).")
+        set(CMAKE_CUDA_ARCHITECTURES 30)
+    endif()
+
+    message("CUDA Architecture: ${CMAKE_CUDA_ARCHITECTURES}")
+
     include (gpu.cmake)
 endif()
 
@@ -91,5 +120,7 @@ endif()
 #  3rd Party Libraries
 #################################################################################
 if(BUILD_VF_UNIT_TESTS)
-    add_subdirectory(${VF_THIRD_DIR}/googletest)
+    if(NOT BUILD_NUMERIC_TESTS) # in this case googletest is already included.
+        add_subdirectory(${VF_THIRD_DIR}/googletest)
+    endif()
 endif()
diff --git a/CMakePresets.json b/CMakePresets.json
new file mode 100644
index 0000000000000000000000000000000000000000..6863446af85b177bb4dc99eed475aa52f4d50269
--- /dev/null
+++ b/CMakePresets.json
@@ -0,0 +1,133 @@
+{
+  "version": 1,
+  "cmakeMinimumRequired": {
+    "major": 3,
+    "minor": 19,
+    "patch": 0
+  },
+  "configurePresets": [
+    {
+      "name": "default",
+      "hidden": true,
+      "binaryDir": "${sourceDir}/build/",
+      "cacheVariables": {
+        "BUILD_VF_UNIT_TESTS": "ON"
+      }
+    },
+    {
+      "name": "default_make",
+      "inherits": "default",
+      "hidden": true,
+      "generator": "Unix Makefiles"
+    },
+    {
+      "name": "default_ccache_make",
+      "inherits": "default_make",
+      "hidden": true,
+      "cacheVariables": {
+        "CMAKE_CXX_COMPILER_LAUNCHER": "ccache",
+        "CMAKE_CUDA_COMPILER_LAUNCHER": "ccache",
+        "CMAKE_C_COMPILER_LAUNCHER": "ccache"
+      }
+    },
+    {
+      "name": "default_msvc",
+      "inherits": "default",
+      "hidden": true,
+      "generator": "Visual Studio 16 2019",
+      "architecture": "x64"
+    },
+    {
+      "name": "default_cpu",
+      "hidden": true,
+      "description": "CPU build of VirtualFluids",
+      "cacheVariables": {
+        "BUILD_VF_CPU": "ON"
+      }
+    },
+    {
+      "name": "default_gpu",
+      "hidden": true,
+      "description": "GPU build of VirtualFluids",
+      "cacheVariables": {
+        "BUILD_VF_GPU": "ON"
+      }
+    },
+    {
+      "name": "default_gpu_numerical_tests",
+      "inherits": ["default_gpu"],
+      "hidden": true,
+      "description": "GPU numerical tests of VirtualFluids",
+      "cacheVariables": {
+        "BUILD_VF_DOUBLE_ACCURACY": "ON",
+        "BUILD_NUMERIC_TESTS": "ON"
+      }
+    },
+    {
+      "name": "default_all",
+      "hidden": true,
+      "description": "All build of VirtualFluids",
+      "inherits": ["default_cpu", "default_gpu"]
+    },
+    {
+      "name": "cpu_make",
+      "inherits": ["default_make", "default_cpu"],
+      "displayName": "cpu make configuration"
+    },
+    {
+      "name": "cpu_make_ccache",
+      "inherits": ["default_ccache_make", "default_cpu"],
+      "displayName": "cpu ccache make configuration"
+    },
+    {
+      "name": "cpu_msvc",
+      "inherits": ["default_msvc", "default_cpu"],
+      "displayName": "cpu msvc configuration"
+    },
+    {
+      "name": "gpu_make",
+      "inherits": ["default_make", "default_gpu"],
+      "displayName": "gpu make configuration"
+    },
+    {
+      "name": "gpu_make_ccache",
+      "inherits": ["default_ccache_make", "default_gpu"],
+      "displayName": "gpu ccache make configuration"
+    },
+    {
+      "name": "gpu_msvc",
+      "inherits": ["default_msvc", "default_gpu"],
+      "displayName": "gpu msvc configuration"
+    },
+    {
+      "name": "all_make",
+      "inherits": ["default_make", "default_all"],
+      "displayName": "all make configuration"
+    },
+    {
+      "name": "all_make_ccache",
+      "inherits": ["default_ccache_make", "default_all"],
+      "displayName": "all ccache make configuration"
+    },
+    {
+      "name": "all_msvc",
+      "inherits": ["default_msvc", "default_all"],
+      "displayName": "all msvc configuration"
+    },
+    {
+      "name": "gpu_numerical_tests_make",
+      "inherits": ["default_make", "default_gpu_numerical_tests"],
+      "displayName": "gpu numerical tests make configuration"
+    },
+    {
+      "name": "gpu_numerical_tests_ccache_make",
+      "inherits": ["default_ccache_make", "default_gpu_numerical_tests"],
+      "displayName": "gpu numerical tests ccache make configuration"
+    },
+    {
+      "name": "gpu_numerical_tests_msvc",
+      "inherits": ["default_msvc", "default_gpu_numerical_tests"],
+      "displayName": "gpu numerical tests msvc configuration"
+    }
+  ]
+}
diff --git a/apps/gpu/LidDrivenCavityGPU/CMakeLists.txt b/apps/gpu/LidDrivenCavityGPU/CMakeLists.txt
index f4e91979beaf3a63f1691803abe0feaf09a2d6cb..4a8f54ceea63c6efee33f38814d4b8db48695a2e 100644
--- a/apps/gpu/LidDrivenCavityGPU/CMakeLists.txt
+++ b/apps/gpu/LidDrivenCavityGPU/CMakeLists.txt
@@ -1,8 +1,3 @@
-
-
-PROJECT(LidDrivenCavityGPU)
-
+project(LidDrivenCavityGPU LANGUAGES CUDA CXX)
 
 vf_add_library(BUILDTYPE binary PRIVATE_LINK basics GridGenerator VirtualFluids_GPU GksMeshAdapter GksGpu FILES LidDrivenCavity.cpp)
-
-linkCUDA()
diff --git a/gpu.cmake b/gpu.cmake
index 8ccd015747d94b4724317da10f0dce6eba00750e..ff11b0a60ef949089130f8a8b2885d267ce91050 100644
--- a/gpu.cmake
+++ b/gpu.cmake
@@ -1,31 +1,7 @@
 
-#############################################################
-###                     CUDAPATH                          ###
-#############################################################
-
-# if CMake cannot find CUDA by itself, set the correct paths manually:
-#SET(CUDA_CUT_INCLUDE_DIR    "/cluster/cuda/9.0/include;/cluster/cuda/9.0/samples/common/inc" CACHE PATH "CUDA_CUT_INCLUDE_DIR")
-#SET(CUDA_SAMPLE_INCLUDE_DIR "/cluster/cuda/9.0/samples/common/inc" CACHE PATH "CUDA_CUT_INCLUDE_DIR")
-
-#############################################################
-###                         OPTIONS                       ###
-#############################################################
-
-option(VF_DOUBLE_ACCURACY       "Use double accuracy"     ON )
-
-
-#############################################################
-
-enable_language(CUDA)
-
-#############################################################
-
-
 # only use this with device of CC larger than 6.0
 IF(VF_DOUBLE_ACCURACY)
-    set(CMAKE_CUDA_FLAGS " -arch=sm_60" CACHE STRING "" FORCE)
-ELSE(VF_DOUBLE_ACCURACY)
-    set(CMAKE_CUDA_FLAGS "" CACHE STRING "" FORCE)
+    set(CMAKE_CUDA_ARCHITECTURES 60)
 ENDIF(VF_DOUBLE_ACCURACY)
 set(CMAKE_CUDA_FLAGS_DEBUG " -G" CACHE STRING "" FORCE)
 
diff --git a/sonar-project.properties b/sonar-project.properties
index 51f5908ff5e93027d8d0a7a02224edb7a08f7142..79510bce96750061e92d6d0bf720eb1c6427790a 100644
--- a/sonar-project.properties
+++ b/sonar-project.properties
@@ -1,12 +1,9 @@
 # must be unique in a given SonarQube instance
 sonar.projectKey=vf:project:open_source
 
-# --- optional properties ---
-
-# defaults to project key
 sonar.projectName=VirtualFluids
-# defaults to 'not provided'
-#sonar.projectVersion=1.0
+
+sonar.projectVersion=1.0
 
 sonar.language=cxx
 
@@ -36,4 +33,4 @@ sonar.cxx.gcc.reportPath=build/gcc_warnings.txt
 
 sonar.cxx.funccomplexity.threshold=10
 
-sonar.cxx.funcsize.threshold=20
\ No newline at end of file
+sonar.cxx.funcsize.threshold=20
diff --git a/src/basics/CMakeLists.txt b/src/basics/CMakeLists.txt
index 0b643212ad9c5da74e711d72da085c1fe670704e..1703d6269c1cebe36005226373da85b14c5515f6 100644
--- a/src/basics/CMakeLists.txt
+++ b/src/basics/CMakeLists.txt
@@ -1,7 +1,7 @@
 
 include(Core/buildInfo.cmake)
 
-vf_add_library(BUILDTYPE static EXCLUDE buildInfo.in.cpp)
+vf_add_library(PUBLIC_LINK MPI::MPI_CXX EXCLUDE buildInfo.in.cpp)
 
 vf_get_library_name (library_name)
 target_include_directories(${library_name} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/Core)
@@ -21,7 +21,4 @@ IF(MSVC)
     target_compile_definitions(${library_name} PUBLIC NOMINMAX) # Disable Min/Max-Macros
 ENDIF(MSVC)
 
-
-linkMPI()
-
-vf_add_tests()
\ No newline at end of file
+vf_add_tests()
diff --git a/src/basics/Core/Input/ConfigFileReader/ConfigFileReader.cpp b/src/basics/Core/Input/ConfigFileReader/ConfigFileReader.cpp
index dce44b5551d2aedcf0c0d70a21dbcd9e8b071c8d..8fdb9fe1bc7f5dfe10dbfb522a87a4a3187dd60b 100644
--- a/src/basics/Core/Input/ConfigFileReader/ConfigFileReader.cpp
+++ b/src/basics/Core/Input/ConfigFileReader/ConfigFileReader.cpp
@@ -6,20 +6,17 @@
 #include <fstream>
 #include <iostream>
 
-BASICS_EXPORT std::shared_ptr<ConfigFileReader> ConfigFileReader::getNewInstance()
+std::shared_ptr<ConfigFileReader> ConfigFileReader::getNewInstance()
 {
     return std::shared_ptr<ConfigFileReader>(new ConfigFileReader());
 }
 
-ConfigFileReader::ConfigFileReader() = default;
-
-BASICS_EXPORT ConfigFileReader::~ConfigFileReader() = default;
-
-BASICS_EXPORT std::shared_ptr<ConfigData> ConfigFileReader::readConfigFile(const std::string &filePath) const
+std::shared_ptr<ConfigData> ConfigFileReader::readConfigFile(const char* filePath) const
 {
+    if (!filePath) throw std::runtime_error("can not open config file!"); // guard: open(nullptr) is undefined behaviour
     std::shared_ptr<ConfigDataImp> data = ConfigDataImp::getNewInstance();
     std::ifstream stream;
-    stream.open(filePath.c_str(), std::ios::in);
+    stream.open(filePath, std::ios::in);
     if (stream.fail())
         throw std::runtime_error("can not open config file!");
     std::unique_ptr<input::Input> input = input::Input::makeInput(stream, "config");
diff --git a/src/basics/Core/Input/ConfigFileReader/ConfigFileReader.h b/src/basics/Core/Input/ConfigFileReader/ConfigFileReader.h
index 9d88dfce6a7337d8ffaa7c4ffe43a4fd63949914..77c93ebfa4ba8564188d8e4a5442963382cf91e3 100644
--- a/src/basics/Core/Input/ConfigFileReader/ConfigFileReader.h
+++ b/src/basics/Core/Input/ConfigFileReader/ConfigFileReader.h
@@ -1,9 +1,11 @@
 #ifndef CONFIGFILEREADER_H
 #define CONFIGFILEREADER_H
 
-#include "../Input.h"
 
 #include <memory>
+#include <string>
+
+#include "basics_export.h"
 
 class ConfigData;
 
@@ -11,11 +13,10 @@ class ConfigFileReader
 {
 public:
     BASICS_EXPORT static std::shared_ptr<ConfigFileReader> getNewInstance();
-    BASICS_EXPORT virtual ~ConfigFileReader();
 
-    BASICS_EXPORT std::shared_ptr<ConfigData> readConfigFile(const std::string &filePath) const;
+    BASICS_EXPORT std::shared_ptr<ConfigData> readConfigFile(const char* filePath) const;
 
 private:
-    ConfigFileReader();
+    ConfigFileReader() = default;
 };
 #endif
diff --git a/src/basics/Core/StringUtilities/StringUtil.cpp b/src/basics/Core/StringUtilities/StringUtil.cpp
index 31496beae5953159c8d9a08a25fb5e808d9efea3..327a62346a91073834c8b710e90968a524ee2d28 100644
--- a/src/basics/Core/StringUtilities/StringUtil.cpp
+++ b/src/basics/Core/StringUtilities/StringUtil.cpp
@@ -1,5 +1,6 @@
 #include "StringUtil.h"
 
+#include <string.h>
 #include <regex>
 #include <sstream>
 
diff --git a/src/basics/Core/StringUtilities/StringUtil.h b/src/basics/Core/StringUtilities/StringUtil.h
index cdf8dce290110848d5e6c50eb2fac35822dddf76..1927a69bc60bf2467fb09893463b3c9363191890 100644
--- a/src/basics/Core/StringUtilities/StringUtil.h
+++ b/src/basics/Core/StringUtilities/StringUtil.h
@@ -36,13 +36,23 @@ public:
 
     static BASICS_EXPORT bool endsWith(const std::string &input, const std::string &end);
 
+   // Parses s with operator>> into a T; returns a value-initialized T when nothing can be extracted (e.g. empty input).
+   template<class T>
+   static T fromString(const std::string& s)
+   {
+      std::istringstream stream (s);
+      T t{}; // value-initialize: if the stream is already at EOF, operator>> leaves t untouched
+      stream >> t;
+      return t;
+   }
+
 private:
     StringUtil() = default;
-    ;
+
     StringUtil(const StringUtil &) = default;
-    ;
+ 
     virtual ~StringUtil() = default;
-    ;
+
 
     static bool toBool(bool &t, const std::string &input, std::ios_base &(*f)(std::ios_base &));
 };
diff --git a/src/basics/Core/buildInfo.in.cpp b/src/basics/Core/buildInfo.in.cpp
index 56f302208256ba86d7f814bfa6711f704aea3479..482f4592a41bfed0615858869ec4bb297764b0b1 100644
--- a/src/basics/Core/buildInfo.in.cpp
+++ b/src/basics/Core/buildInfo.in.cpp
@@ -1,5 +1,3 @@
-#include "buildInfo.h"
-
 #include "basics_export.h"
 
 namespace buildInfo
diff --git a/src/basics/basics/utilities/UbComparators.h b/src/basics/basics/utilities/UbComparators.h
index bc507e456b9603d85d42f8597fe4748a135e51d2..d461e27cb8d5f6026f13b2aea363d563ebed5c5c 100644
--- a/src/basics/basics/utilities/UbComparators.h
+++ b/src/basics/basics/utilities/UbComparators.h
@@ -170,7 +170,7 @@ struct compareMember {
 //   l.sort( compareConstMethods<Klasse, double,  &Klasse::getVal1 >() );
 //}
 
-}; // namespace UbComparators
+} // namespace UbComparators
 
 #endif // UBCOMPARATOR_H
 
diff --git a/src/basics/basics/utilities/UbEqual.h b/src/basics/basics/utilities/UbEqual.h
index b3ca9102d585faeac0b2a4e413434d4d0d759282..87955b181ea3efaad4b7f3d2ebc746271ec95bb7 100644
--- a/src/basics/basics/utilities/UbEqual.h
+++ b/src/basics/basics/utilities/UbEqual.h
@@ -259,7 +259,7 @@ inline bool isUbEqual(const T1 &a, const T2 &b)
 {
     using Low = typename UbEqualTrait<T1, T2>::Low;
     return specific_equal<Low, Low>(static_cast<Low>(a), static_cast<Low>(b));
-};
+}
 
 //////////////////////////////////////////////////////////////////////////
 // UbEqual-Functor
diff --git a/src/basics/basics/utilities/UbLogger.h b/src/basics/basics/utilities/UbLogger.h
index d350a763721b4b5fc1c2736fa56e2fff66a3eee0..fc2b118715a0f4afc0251b8ed8e2373a7d488153 100644
--- a/src/basics/basics/utilities/UbLogger.h
+++ b/src/basics/basics/utilities/UbLogger.h
@@ -228,7 +228,7 @@ inline std::string UbLogger<OutputPolicy>::logTimeString()
     char buffer[11];
     time_t t;
     time(&t);
-    tm r = { 0 };
+    tm r; // = { 0 };
     strftime(buffer, sizeof(buffer), "%X", localtime_r(&t, &r));
     struct timeval tv;
     gettimeofday(&tv, 0);
diff --git a/src/basics/basics/utilities/UbSystem.h b/src/basics/basics/utilities/UbSystem.h
index 8e676811a152b176efd617a944f4e22ea538721e..0436a360c2b595115824e1b7906214621bd76314 100644
--- a/src/basics/basics/utilities/UbSystem.h
+++ b/src/basics/basics/utilities/UbSystem.h
@@ -530,7 +530,7 @@ struct select2nd {
     const result_type &operator()(const argument_type &p) const { return p.second; }
 };
 
-}; // namespace UbSystem
+} // namespace UbSystem
 
 #define UB_STATIC_ASSERT(expr) static_cast<void>(sizeof(UbSystem::ub_static_assert<expr>));
 // zum ueberpruefen von STATISCHEN ausdruecken waehrend der compile-zeit
diff --git a/src/basics/basics/writer/WbWriterVtkXmlBinary.h b/src/basics/basics/writer/WbWriterVtkXmlBinary.h
index 393c6bb13b268805a3fc2ca0850dd75d3fdd5616..421148d90497e3628ed274439c0b2fd7636b7fd2 100644
--- a/src/basics/basics/writer/WbWriterVtkXmlBinary.h
+++ b/src/basics/basics/writer/WbWriterVtkXmlBinary.h
@@ -37,7 +37,9 @@
 
 #include <basics/writer/WbWriter.h>
 
-class WbWriterVtkXmlBinary : public WbWriter
+#include "basics_export.h"
+
+class BASICS_EXPORT WbWriterVtkXmlBinary : public WbWriter
 {
 public:
     static WbWriterVtkXmlBinary *getInstance()
diff --git a/src/basics/geometry3d/GbCuboid3D.h b/src/basics/geometry3d/GbCuboid3D.h
index f0a0b0f9884999050495156f3c547b26d6398e61..762a08c2696dbf58ea4b726528c89571fc21ab62 100644
--- a/src/basics/geometry3d/GbCuboid3D.h
+++ b/src/basics/geometry3d/GbCuboid3D.h
@@ -80,7 +80,7 @@ public:
     void setCenterCoordinates(const double &x1, const double &x2, const double &x3) override;
 
     void translate(const double &x1, const double &x2, const double &x3) override;
-    void rotate(const double &rx1, const double &rx2, const double &rx3) override {}
+    void rotate(const double &rx1, const double &rx2, const double &rx3) override { (void)rx1; (void)rx2; (void)rx3; }
     void scale(const double &sx1, const double &sx2, const double &sx3) override;
 
     double getLengthX1();
diff --git a/src/basics/geometry3d/GbObject3D.h b/src/basics/geometry3d/GbObject3D.h
index f60e64c8f95dfde84a389c422bbc6b9377e68187..c562f42412c8ab75b0ebcd303a5785ab4939cdfb 100644
--- a/src/basics/geometry3d/GbObject3D.h
+++ b/src/basics/geometry3d/GbObject3D.h
@@ -49,6 +49,8 @@ class GbObject3DCreator;
 
 #include <PointerDefinitions.h>
 
+#include "basics_export.h"
+
 //////////////////////////////////////////////////////////////////////////
 //!
 //! \class GbObject3D
@@ -57,7 +59,7 @@ class GbObject3DCreator;
 //!
 //////////////////////////////////////////////////////////////////////////
 
-class GbObject3D : public ObObject
+class BASICS_EXPORT GbObject3D : public ObObject
 {
 public:
     // abstract Methods
diff --git a/src/cpu/VirtualFluidsCore/BoundaryConditions/BCAlgorithm.h b/src/cpu/VirtualFluidsCore/BoundaryConditions/BCAlgorithm.h
index 7cef8205ce13d89802c9bbd7eebb5d6eb3759b3f..734aa9342df011846ce345ba7c37a181383e8828 100644
--- a/src/cpu/VirtualFluidsCore/BoundaryConditions/BCAlgorithm.h
+++ b/src/cpu/VirtualFluidsCore/BoundaryConditions/BCAlgorithm.h
@@ -47,18 +47,30 @@ class BoundaryConditions;
 class BCAlgorithm
 {
 public:
-    static const char VelocityBCAlgorithm             = 0;
-    static const char EqDensityBCAlgorithm            = 1;
-    static const char NonEqDensityBCAlgorithm         = 2;
-    static const char NoSlipBCAlgorithm               = 3;
-    static const char SlipBCAlgorithm                 = 4;
-    static const char HighViscosityNoSlipBCAlgorithm  = 5;
-    static const char ThinWallNoSlipBCAlgorithm       = 6;
-    static const char VelocityWithDensityBCAlgorithm  = 7;
+    static const char VelocityBCAlgorithm = 0;
+    static const char EqDensityBCAlgorithm = 1;
+    static const char NonEqDensityBCAlgorithm = 2;
+    static const char NoSlipBCAlgorithm = 3;
+    static const char SlipBCAlgorithm = 4;
+    static const char HighViscosityNoSlipBCAlgorithm = 5;
+    static const char ThinWallNoSlipBCAlgorithm = 6;
+    static const char VelocityWithDensityBCAlgorithm = 7;
     static const char NonReflectingOutflowBCAlgorithm = 8;
+    static const char VelocityAndThixotropyBCAlgorithm = 9;
+    static const char DensityAndThixotropyBCAlgorithm = 10;
+    static const char NoSlipAndThixotropyBCAlgorithm = 11;
+    static const char NonReflectingOutflowAndThixotropyBCAlgorithm = 12;
+    static const char VelocityWithDensityAndThixotropyBCAlgorithm = 13;
+    static const char BinghamModelNoSlipBCAlgorithm = 14;
+    static const char HerschelBulkleyModelNoSlipBCAlgorithm = 15;
+    static const char SimpleVelocityBCAlgorithm = 16;
+    static const char SimpleSlipBCAlgorithm = 17;
+    static const char PowellEyringModelNoSlipBCAlgorithm = 18;
+    static const char BinghamModelVelocityBCAlgorithm = 19;
+
 
 public:
-    BCAlgorithm()          = default;
+    BCAlgorithm() = default;
     virtual ~BCAlgorithm() = default;
 
     virtual void addDistributions(SPtr<DistributionArray3D> distributions) = 0;
@@ -72,11 +84,13 @@ public:
     SPtr<BCArray3D> getBcArray();
     void setBcArray(SPtr<BCArray3D> bcarray);
     virtual void applyBC() = 0;
+    bool getThixotropy() const { return thixotropy; } //!< read-only access to the thixotropy flag (false by default)
 
 protected:
-    bool compressible{ false };
+    bool compressible { false };
     char type;
     bool preCollision;
+    bool thixotropy { false };
 
     SPtr<BoundaryConditions> bcPtr;
     SPtr<DistributionArray3D> distributions;
diff --git a/src/cpu/VirtualFluidsCore/BoundaryConditions/BCArray3D.cpp b/src/cpu/VirtualFluidsCore/BoundaryConditions/BCArray3D.cpp
index 87606eecf03943259dfec89a805336d2a3190bfa..88f4a52b2ff0445838af5aade25d9c78ce6809a8 100644
--- a/src/cpu/VirtualFluidsCore/BoundaryConditions/BCArray3D.cpp
+++ b/src/cpu/VirtualFluidsCore/BoundaryConditions/BCArray3D.cpp
@@ -63,11 +63,11 @@ void BCArray3D::resize(std::size_t nx1, std::size_t nx2, std::size_t nx3, int va
 //////////////////////////////////////////////////////////////////////////
 bool BCArray3D::validIndices(std::size_t x1, std::size_t x2, std::size_t x3) const
 {
-    if (x1 < 0 || x1 >= this->bcindexmatrix.getNX1())
+    if (x1 >= this->bcindexmatrix.getNX1()) // std::size_t is unsigned, so only the upper bound can be violated
         return false;
-    if (x2 < 0 || x2 >= this->bcindexmatrix.getNX2())
+    if (x2 >= this->bcindexmatrix.getNX2()) // upper-bound check for the second index
         return false;
-    if (x3 < 0 || x3 >= this->bcindexmatrix.getNX3())
+    if (x3 >= this->bcindexmatrix.getNX3()) // upper-bound check for the third index
         return false;
     return true;
 }
diff --git a/src/cpu/VirtualFluidsCore/BoundaryConditions/BCArray3D.h b/src/cpu/VirtualFluidsCore/BoundaryConditions/BCArray3D.h
index 835e5b1c95454a9fbe8186d6942c3936a5e0e2cc..b9d08f7117d9dc41c008c9d92a5780aceedad21c 100644
--- a/src/cpu/VirtualFluidsCore/BoundaryConditions/BCArray3D.h
+++ b/src/cpu/VirtualFluidsCore/BoundaryConditions/BCArray3D.h
@@ -35,7 +35,7 @@
 #define BCArray_H
 
 #include "BoundaryConditions.h"
-#include "basics/container/CbArray3D.h"
+#include "CbArray3D.h"
 
 #include <typeinfo>
 
diff --git a/src/cpu/VirtualFluidsCore/BoundaryConditions/BoundaryConditions.h b/src/cpu/VirtualFluidsCore/BoundaryConditions/BoundaryConditions.h
index 1ab3b4e284de49fd1b80bad13d65fee98d221e57..84ba7a6041d38546da03323f35a23f7084df9809 100644
--- a/src/cpu/VirtualFluidsCore/BoundaryConditions/BoundaryConditions.h
+++ b/src/cpu/VirtualFluidsCore/BoundaryConditions/BoundaryConditions.h
@@ -48,7 +48,6 @@ class BoundaryConditions
 {
 public:
     BoundaryConditions()
-
     {
         UB_STATIC_ASSERT(sizeof(long long) >= 8);
         UB_STATIC_ASSERT((sizeof(long long) * 8) >= (D3Q27System::FENDDIR + 1) * BoundaryConditions::optionDigits);
@@ -150,13 +149,13 @@ public:
     {
         return (short)(((slipBoundaryFlags >> (optionDigits * direction)) & maxOptionVal) - 1);
     }
-    void setNormalVector(const LBMReal &nx1, const LBMReal &nx2, const LBMReal &nx3)
+    void setNormalVector(const float &nx1, const float &nx2, const float &nx3)
     {
         this->nx1 = nx1;
         this->nx2 = nx2;
         this->nx3 = nx3;
     }
-    UbTupleDouble3 getNormalVector() { return makeUbTuple(nx1, nx2, nx3); }
+    UbTupleFloat3 getNormalVector() { return makeUbTuple(nx1, nx2, nx3); }
 
     /*============== Velocity Boundary ========================*/
     void setVelocityBoundaryFlag(const int &direction, const short &secOpt = 0)
@@ -181,72 +180,73 @@ public:
 
     void setBoundaryVelocity(const Vector3D &vx)
     {
-        setBoundaryVelocityX1((LBMReal)vx[0]);
-        setBoundaryVelocityX2((LBMReal)vx[1]);
-        setBoundaryVelocityX3((LBMReal)vx[2]);
+        setBoundaryVelocityX1((float)vx[0]);
+        setBoundaryVelocityX2((float)vx[1]);
+        setBoundaryVelocityX3((float)vx[2]);
     }
-    void setBoundaryVelocityX1(const LBMReal &vx1) { this->bcVelocityX1 = vx1; }
-    void setBoundaryVelocityX2(const LBMReal &vx2) { this->bcVelocityX2 = vx2; }
-    void setBoundaryVelocityX3(const LBMReal &vx3) { this->bcVelocityX3 = vx3; }
-    LBMReal getBoundaryVelocityX1() { return this->bcVelocityX1; }
-    LBMReal getBoundaryVelocityX2() { return this->bcVelocityX2; }
-    LBMReal getBoundaryVelocityX3() { return this->bcVelocityX3; }
-    LBMReal getBoundaryVelocity(const int &direction)
+    void setBoundaryVelocityX1(const float &vx1) { this->bcVelocityX1 = vx1; }
+    void setBoundaryVelocityX2(const float &vx2) { this->bcVelocityX2 = vx2; }
+    void setBoundaryVelocityX3(const float &vx3) { this->bcVelocityX3 = vx3; }
+    float getBoundaryVelocityX1() { return this->bcVelocityX1; }
+    float getBoundaryVelocityX2() { return this->bcVelocityX2; }
+    float getBoundaryVelocityX3() { return this->bcVelocityX3; }
+    float getBoundaryVelocity(const int &direction)
     {
         switch (direction) {
             case D3Q27System::E:
-                return (LBMReal)(UbMath::c4o9 *
-                                 (+bcVelocityX1)); //(2/cs^2)(=6)*rho_0(=1 for incompressible)*wi*u*ei with cs=1/sqrt(3)
+                return (float)(UbMath::c4o9 *
+                               (+bcVelocityX1)); //(2/cs^2)(=6)*rho_0(=1 bei inkompr)*wi*u*ei mit cs=1/sqrt(3)
             case D3Q27System::W:
-                return (LBMReal)(UbMath::c4o9 * (-bcVelocityX1));
+                return (float)(UbMath::c4o9 *
+                               (-bcVelocityX1)); // z.B. aus paper manfred MRT LB models in three dimensions (2002)
             case D3Q27System::N:
-                return (LBMReal)(UbMath::c4o9 * (+bcVelocityX2));
+                return (float)(UbMath::c4o9 * (+bcVelocityX2));
             case D3Q27System::S:
-                return (LBMReal)(UbMath::c4o9 * (-bcVelocityX2));
+                return (float)(UbMath::c4o9 * (-bcVelocityX2));
             case D3Q27System::T:
-                return (LBMReal)(UbMath::c4o9 * (+bcVelocityX3));
+                return (float)(UbMath::c4o9 * (+bcVelocityX3));
             case D3Q27System::B:
-                return (LBMReal)(UbMath::c4o9 * (-bcVelocityX3));
+                return (float)(UbMath::c4o9 * (-bcVelocityX3));
             case D3Q27System::NE:
-                return (LBMReal)(UbMath::c1o9 * (+bcVelocityX1 + bcVelocityX2));
+                return (float)(UbMath::c1o9 * (+bcVelocityX1 + bcVelocityX2));
             case D3Q27System::SW:
-                return (LBMReal)(UbMath::c1o9 * (-bcVelocityX1 - bcVelocityX2));
+                return (float)(UbMath::c1o9 * (-bcVelocityX1 - bcVelocityX2));
             case D3Q27System::SE:
-                return (LBMReal)(UbMath::c1o9 * (+bcVelocityX1 - bcVelocityX2));
+                return (float)(UbMath::c1o9 * (+bcVelocityX1 - bcVelocityX2));
             case D3Q27System::NW:
-                return (LBMReal)(UbMath::c1o9 * (-bcVelocityX1 + bcVelocityX2));
+                return (float)(UbMath::c1o9 * (-bcVelocityX1 + bcVelocityX2));
             case D3Q27System::TE:
-                return (LBMReal)(UbMath::c1o9 * (+bcVelocityX1 + bcVelocityX3));
+                return (float)(UbMath::c1o9 * (+bcVelocityX1 + bcVelocityX3));
             case D3Q27System::BW:
-                return (LBMReal)(UbMath::c1o9 * (-bcVelocityX1 - bcVelocityX3));
+                return (float)(UbMath::c1o9 * (-bcVelocityX1 - bcVelocityX3));
             case D3Q27System::BE:
-                return (LBMReal)(UbMath::c1o9 * (+bcVelocityX1 - bcVelocityX3));
+                return (float)(UbMath::c1o9 * (+bcVelocityX1 - bcVelocityX3));
             case D3Q27System::TW:
-                return (LBMReal)(UbMath::c1o9 * (-bcVelocityX1 + bcVelocityX3));
+                return (float)(UbMath::c1o9 * (-bcVelocityX1 + bcVelocityX3));
             case D3Q27System::TN:
-                return (LBMReal)(UbMath::c1o9 * (+bcVelocityX2 + bcVelocityX3));
+                return (float)(UbMath::c1o9 * (+bcVelocityX2 + bcVelocityX3));
             case D3Q27System::BS:
-                return (LBMReal)(UbMath::c1o9 * (-bcVelocityX2 - bcVelocityX3));
+                return (float)(UbMath::c1o9 * (-bcVelocityX2 - bcVelocityX3));
             case D3Q27System::BN:
-                return (LBMReal)(UbMath::c1o9 * (+bcVelocityX2 - bcVelocityX3));
+                return (float)(UbMath::c1o9 * (+bcVelocityX2 - bcVelocityX3));
             case D3Q27System::TS:
-                return (LBMReal)(UbMath::c1o9 * (-bcVelocityX2 + bcVelocityX3));
+                return (float)(UbMath::c1o9 * (-bcVelocityX2 + bcVelocityX3));
             case D3Q27System::TNE:
-                return (LBMReal)(UbMath::c1o36 * (+bcVelocityX1 + bcVelocityX2 + bcVelocityX3));
+                return (float)(UbMath::c1o36 * (+bcVelocityX1 + bcVelocityX2 + bcVelocityX3));
             case D3Q27System::BSW:
-                return (LBMReal)(UbMath::c1o36 * (-bcVelocityX1 - bcVelocityX2 - bcVelocityX3));
+                return (float)(UbMath::c1o36 * (-bcVelocityX1 - bcVelocityX2 - bcVelocityX3));
             case D3Q27System::BNE:
-                return (LBMReal)(UbMath::c1o36 * (+bcVelocityX1 + bcVelocityX2 - bcVelocityX3));
+                return (float)(UbMath::c1o36 * (+bcVelocityX1 + bcVelocityX2 - bcVelocityX3));
             case D3Q27System::TSW:
-                return (LBMReal)(UbMath::c1o36 * (-bcVelocityX1 - bcVelocityX2 + bcVelocityX3));
+                return (float)(UbMath::c1o36 * (-bcVelocityX1 - bcVelocityX2 + bcVelocityX3));
             case D3Q27System::TSE:
-                return (LBMReal)(UbMath::c1o36 * (+bcVelocityX1 - bcVelocityX2 + bcVelocityX3));
+                return (float)(UbMath::c1o36 * (+bcVelocityX1 - bcVelocityX2 + bcVelocityX3));
             case D3Q27System::BNW:
-                return (LBMReal)(UbMath::c1o36 * (-bcVelocityX1 + bcVelocityX2 - bcVelocityX3));
+                return (float)(UbMath::c1o36 * (-bcVelocityX1 + bcVelocityX2 - bcVelocityX3));
             case D3Q27System::BSE:
-                return (LBMReal)(UbMath::c1o36 * (+bcVelocityX1 - bcVelocityX2 - bcVelocityX3));
+                return (float)(UbMath::c1o36 * (+bcVelocityX1 - bcVelocityX2 - bcVelocityX3));
             case D3Q27System::TNW:
-                return (LBMReal)(UbMath::c1o36 * (-bcVelocityX1 + bcVelocityX2 + bcVelocityX3));
+                return (float)(UbMath::c1o36 * (-bcVelocityX1 + bcVelocityX2 + bcVelocityX3));
             default:
                 throw UbException(UB_EXARGS, "unknown error");
         }
@@ -273,44 +273,44 @@ public:
         return (short)(((densityBoundaryFlags >> (optionDigits * direction)) & maxOptionVal) - 1);
     }
 
-    void setBoundaryDensity(LBMReal density) { this->bcDensity = density; }
-    LBMReal getBoundaryDensity() { return this->bcDensity; }
+    void setBoundaryDensity(float density) { this->bcDensity = density; }
+    float getBoundaryDensity() { return this->bcDensity; }
 
-    // Lodi extension
-    void setDensityLodiDensity(const LBMReal &bcLodiDensity) { this->bcLodiDensity = bcLodiDensity; }
-    void setDensityLodiVelocityX1(const LBMReal &bcLodiVelocityX1) { this->bcLodiVelocityX1 = bcLodiVelocityX1; }
-    void setDensityLodiVelocityX2(const LBMReal &bcLodiVelocityX2) { this->bcLodiVelocityX2 = bcLodiVelocityX2; }
-    void setDensityLodiVelocityX3(const LBMReal &bcLodiVelocityX3) { this->bcLodiVelocityX3 = bcLodiVelocityX3; }
-    void setDensityLodiLength(const LBMReal &bcLodiLentgh) { this->bcLodiLentgh = bcLodiLentgh; }
-    LBMReal getDensityLodiDensity() const { return this->bcLodiDensity; }
-    LBMReal getDensityLodiVelocityX1() const { return this->bcLodiVelocityX1; }
-    LBMReal getDensityLodiVelocityX2() const { return this->bcLodiVelocityX2; }
-    LBMReal getDensityLodiVelocityX3() const { return this->bcLodiVelocityX3; }
-    LBMReal getDensityLodiLength() const { return this->bcLodiLentgh; }
+    // Lodi extension
+    void setDensityLodiDensity(const float &bcLodiDensity) { this->bcLodiDensity = bcLodiDensity; }
+    void setDensityLodiVelocityX1(const float &bcLodiVelocityX1) { this->bcLodiVelocityX1 = bcLodiVelocityX1; }
+    void setDensityLodiVelocityX2(const float &bcLodiVelocityX2) { this->bcLodiVelocityX2 = bcLodiVelocityX2; }
+    void setDensityLodiVelocityX3(const float &bcLodiVelocityX3) { this->bcLodiVelocityX3 = bcLodiVelocityX3; }
+    void setDensityLodiLength(const float &bcLodiLentgh) { this->bcLodiLentgh = bcLodiLentgh; }
+    float getDensityLodiDensity() const { return this->bcLodiDensity; }
+    float getDensityLodiVelocityX1() const { return this->bcLodiVelocityX1; }
+    float getDensityLodiVelocityX2() const { return this->bcLodiVelocityX2; }
+    float getDensityLodiVelocityX3() const { return this->bcLodiVelocityX3; }
+    float getDensityLodiLength() const { return this->bcLodiLentgh; }
 
-    LBMReal &densityLodiDensity() { return this->bcLodiDensity; }
-    LBMReal &densityLodiVelocityX1() { return this->bcLodiVelocityX1; }
-    LBMReal &densityLodiVelocityX2() { return this->bcLodiVelocityX2; }
-    LBMReal &densityLodiVelocityX3() { return this->bcLodiVelocityX3; }
-    LBMReal &densityLodiLentgh() { return this->bcLodiLentgh; }
+    float &densityLodiDensity() { return this->bcLodiDensity; }
+    float &densityLodiVelocityX1() { return this->bcLodiVelocityX1; }
+    float &densityLodiVelocityX2() { return this->bcLodiVelocityX2; }
+    float &densityLodiVelocityX3() { return this->bcLodiVelocityX3; }
+    float &densityLodiLentgh() { return this->bcLodiLentgh; }
 
-    const LBMReal &densityLodiDensity() const { return this->bcLodiDensity; }
-    const LBMReal &densityLodiVelocityX1() const { return this->bcLodiVelocityX1; }
-    const LBMReal &densityLodiVelocityX2() const { return this->bcLodiVelocityX2; }
-    const LBMReal &densityLodiVelocityX3() const { return this->bcLodiVelocityX3; }
-    const LBMReal &densityLodiLentgh() const { return this->bcLodiLentgh; }
+    const float &densityLodiDensity() const { return this->bcLodiDensity; }
+    const float &densityLodiVelocityX1() const { return this->bcLodiVelocityX1; }
+    const float &densityLodiVelocityX2() const { return this->bcLodiVelocityX2; }
+    const float &densityLodiVelocityX3() const { return this->bcLodiVelocityX3; }
+    const float &densityLodiLentgh() const { return this->bcLodiLentgh; }
 
     /*======================= Qs =============================*/
-    void setQ(const LBMReal &val, const int &direction) { q[direction] = val; }
-    LBMReal getQ(const int &direction) { return q[direction]; }
+    void setQ(const float &val, const int &direction) { q[direction] = val; }
+    float getQ(const int &direction) { return q[direction]; }
 
     virtual std::vector<std::string> getBCNames()
     {
         std::vector<std::string> tmp;
-        tmp.emplace_back("NoSlipBC");
-        tmp.emplace_back("SlipBC");
-        tmp.emplace_back("VelocityBC");
-        tmp.emplace_back("DensityBC");
+        tmp.push_back("NoSlipBC");
+        tmp.push_back("SlipBC");
+        tmp.push_back("VelocityBC");
+        tmp.push_back("DensityBC");
         return tmp;
     }
     virtual std::vector<long long> getBCFlags()
@@ -336,7 +336,7 @@ public:
     static const long long maxOptionVal; // = ( 1<<optionDigits ) - 1; //2^3-1 -> 7
 
 protected:
-    LBMReal q[D3Q27System::FENDDIR + 1];
+    float q[D3Q27System::FENDDIR + 1];
 
     long long noslipBoundaryFlags{ 0 };
     long long slipBoundaryFlags{ 0 };
@@ -344,20 +344,26 @@ protected:
     long long densityBoundaryFlags{ 0 };
     long long wallModelBoundaryFlags{ 0 };
 
-    LBMReal bcVelocityX1{ 0.0f };
-    LBMReal bcVelocityX2{ 0.0f };
-    LBMReal bcVelocityX3{ 0.0f };
-    LBMReal bcDensity{ 0.0f };
+    float bcVelocityX1{ 0.0f };
+    float bcVelocityX2{ 0.0f };
+    float bcVelocityX3{ 0.0f };
+    float bcDensity{ 0.0f };
+    // float  bcThixotropy{ 0.0f };
+
+    float bcLodiDensity{ 0.0f };
+    float bcLodiVelocityX1{ 0.0f };
+    float bcLodiVelocityX2{ 0.0f };
+    float bcLodiVelocityX3{ 0.0f };
+    float bcLodiLentgh{ 0.0f };
 
-    LBMReal bcLodiDensity{ 0.0f };
-    LBMReal bcLodiVelocityX1{ 0.0f };
-    LBMReal bcLodiVelocityX2{ 0.0f };
-    LBMReal bcLodiVelocityX3{ 0.0f };
-    LBMReal bcLodiLentgh{ 0.0f };
+    float nx1{ 0.0f }, nx2{ 0.0f }, nx3{ 0.0f };
 
-    LBMReal nx1{ 0.0f }, nx2{ 0.0f }, nx3{ 0.0f };
+    char algorithmType { -1 };
 
-    char algorithmType{ -1 };
+private:
+    friend class MPIIORestartCoProcessor;
+    friend class MPIIOMigrationCoProcessor;
+    friend class MPIIOMigrationBECoProcessor;
 };
 
 #endif
diff --git a/src/cpu/VirtualFluidsCore/CMakeLists.txt b/src/cpu/VirtualFluidsCore/CMakeLists.txt
index 871aa123b3922d109d521eedcf83adfc87493e16..ac9da38b2c28d52a9be8d8489abf42247f65506e 100644
--- a/src/cpu/VirtualFluidsCore/CMakeLists.txt
+++ b/src/cpu/VirtualFluidsCore/CMakeLists.txt
@@ -1,6 +1,11 @@
 
 
-vf_add_library(BUILDTYPE static PUBLIC_LINK basics muparser)
+set(CAB_ADDITIONAL_LINK_LIBRARIES "") # extra link targets; empty unless an option below appends to it
+if(BUILD_USE_OPENMP)
+list(APPEND CAB_ADDITIONAL_LINK_LIBRARIES OpenMP::OpenMP_CXX) # link OpenMP only when BUILD_USE_OPENMP is set
+endif()
+
+vf_add_library(BUILDTYPE static PUBLIC_LINK basics muparser MPI::MPI_CXX ${CAB_ADDITIONAL_LINK_LIBRARIES})
 
 vf_get_library_name(library_name)
 
diff --git a/src/cpu/VirtualFluidsCore/CoProcessors/WriteBoundaryConditionsCoProcessor.cpp b/src/cpu/VirtualFluidsCore/CoProcessors/WriteBoundaryConditionsCoProcessor.cpp
index a7ba3a84505f1695a0c59175d2d2d8adf3d4dbdb..e85f2806df40e11c7f30cca1c86bcb5dc639ee73 100644
--- a/src/cpu/VirtualFluidsCore/CoProcessors/WriteBoundaryConditionsCoProcessor.cpp
+++ b/src/cpu/VirtualFluidsCore/CoProcessors/WriteBoundaryConditionsCoProcessor.cpp
@@ -151,8 +151,6 @@ void WriteBoundaryConditionsCoProcessor::addDataGeo(SPtr<Block3D> block)
     SPtr<ILBMKernel> kernel = block->getKernel();
     SPtr<BCArray3D> bcArray = kernel->getBCProcessor()->getBCArray();
 
-    // knotennummerierung faengt immer bei 0 an!
-    unsigned int SWB, SEB, NEB, NWB, SWT, SET, NET, NWT;
 
     int minX1 = 0;
     int minX2 = 0;
@@ -171,9 +169,9 @@ void WriteBoundaryConditionsCoProcessor::addDataGeo(SPtr<Block3D> block)
     maxX2 -= 1;
     maxX3 -= 1;
 
-    for (size_t ix3 = minX3; ix3 <= maxX3; ix3++) {
-        for (size_t ix2 = minX2; ix2 <= maxX2; ix2++) {
-            for (size_t ix1 = minX1; ix1 <= maxX1; ix1++) {
+    for (int ix3 = minX3; ix3 <= maxX3; ix3++) {
+        for (int ix2 = minX2; ix2 <= maxX2; ix2++) {
+            for (int ix1 = minX1; ix1 <= maxX1; ix1++) {
                 if (!bcArray->isUndefined(ix1, ix2, ix3)) {
                     int index                  = 0;
                     nodeNumbers(ix1, ix2, ix3) = nr++;
@@ -239,6 +237,9 @@ void WriteBoundaryConditionsCoProcessor::addDataGeo(SPtr<Block3D> block)
     maxX2 -= 1;
     maxX3 -= 1;
 
+    // node numbering always starts at 0!
+    int SWB = 0, SEB = 0, NEB = 0, NWB = 0, SWT = 0, SET = 0, NET = 0, NWT = 0;
+
     // cell vector erstellen
     for (int ix3 = minX3; ix3 <= maxX3; ix3++) {
         for (int ix2 = minX2; ix2 <= maxX2; ix2++) {
@@ -248,7 +249,9 @@ void WriteBoundaryConditionsCoProcessor::addDataGeo(SPtr<Block3D> block)
                     (SWT = nodeNumbers(ix1, ix2, ix3 + 1)) >= 0 && (SET = nodeNumbers(ix1 + 1, ix2, ix3 + 1)) >= 0 &&
                     (NET = nodeNumbers(ix1 + 1, ix2 + 1, ix3 + 1)) >= 0 &&
                     (NWT = nodeNumbers(ix1, ix2 + 1, ix3 + 1)) >= 0) {
-                    cells.push_back(makeUbTuple(SWB, SEB, NEB, NWB, SWT, SET, NET, NWT));
+                    cells.push_back(makeUbTuple((unsigned int)SWB, (unsigned int)SEB, (unsigned int)NEB,
+                                                (unsigned int)NWB, (unsigned int)SWT, (unsigned int)SET,
+                                                (unsigned int)NET, (unsigned int)NWT));
                 }
             }
         }
diff --git a/src/cpu/VirtualFluidsCore/CoProcessors/WriteMacroscopicQuantitiesCoProcessor.cpp b/src/cpu/VirtualFluidsCore/CoProcessors/WriteMacroscopicQuantitiesCoProcessor.cpp
index 27cf92056c88f1481f256ced4080003ed511f241..eef7d3bf6a0d0ad80aa1bdd7d83dfb469b044584 100644
--- a/src/cpu/VirtualFluidsCore/CoProcessors/WriteMacroscopicQuantitiesCoProcessor.cpp
+++ b/src/cpu/VirtualFluidsCore/CoProcessors/WriteMacroscopicQuantitiesCoProcessor.cpp
@@ -53,20 +53,24 @@ WriteMacroscopicQuantitiesCoProcessor::WriteMacroscopicQuantitiesCoProcessor(SPt
                                                                              WbWriter *const writer,
                                                                              SPtr<LBMUnitConverter> conv,
                                                                              SPtr<Communicator> comm)
-    : CoProcessor(grid, s), path(path), writer(writer), conv(conv), comm(comm)
+        : CoProcessor(grid, s), path(path), writer(writer), conv(conv), comm(comm)
 {
-    gridRank     = comm->getProcessID();
+    gridRank = comm->getProcessID();
     minInitLevel = this->grid->getCoarsestInitializedLevel();
     maxInitLevel = this->grid->getFinestInitializedLevel();
 
     blockVector.resize(maxInitLevel + 1);
 
-    for (int level = minInitLevel; level <= maxInitLevel; level++) {
+    for (int level = minInitLevel; level <= maxInitLevel; level++)
+    {
         grid->getBlocks(level, gridRank, true, blockVector[level]);
     }
 }
+
 //////////////////////////////////////////////////////////////////////////
-void WriteMacroscopicQuantitiesCoProcessor::init() {}
+void WriteMacroscopicQuantitiesCoProcessor::init()
+{}
+
 //////////////////////////////////////////////////////////////////////////
 void WriteMacroscopicQuantitiesCoProcessor::process(double step)
 {
@@ -75,14 +79,18 @@ void WriteMacroscopicQuantitiesCoProcessor::process(double step)
 
     UBLOG(logDEBUG3, "WriteMacroscopicQuantitiesCoProcessor::update:" << step);
 }
+
 //////////////////////////////////////////////////////////////////////////
 void WriteMacroscopicQuantitiesCoProcessor::collectData(double step)
 {
     int istep = static_cast<int>(step);
 
-    for (int level = minInitLevel; level <= maxInitLevel; level++) {
-        for (SPtr<Block3D> block : blockVector[level]) {
-            if (block) {
+    for (int level = minInitLevel; level <= maxInitLevel; level++)
+    {
+        for (SPtr<Block3D> block : blockVector[level])
+        {
+            if (block)
+            {
                 addDataMQ(block);
             }
         }
@@ -93,27 +101,29 @@ void WriteMacroscopicQuantitiesCoProcessor::collectData(double step)
     subfolder = "mq" + UbSystem::toString(istep);
     pfilePath = path + "/mq/" + subfolder;
     cfilePath = path + "/mq/mq_collection";
-    partPath  = pfilePath + "/mq" + UbSystem::toString(gridRank) + "_" + UbSystem::toString(istep);
+    partPath = pfilePath + "/mq" + UbSystem::toString(gridRank) + "_" + UbSystem::toString(istep);
 
     std::string partName = writer->writeOctsWithNodeData(partPath, nodes, cells, datanames, data);
-    size_t found         = partName.find_last_of("/");
-    std::string piece    = partName.substr(found + 1);
-    piece                = subfolder + "/" + piece;
+    size_t found = partName.find_last_of("/");
+    std::string piece = partName.substr(found + 1);
+    piece = subfolder + "/" + piece;
 
     std::vector<std::string> cellDataNames;
     std::vector<std::string> pieces;
     pieces.push_back(piece);
     if (comm->getProcessID() == comm->getRoot()) {
         std::string pname =
-            WbWriterVtkXmlASCII::getInstance()->writeParallelFile(pfilePath, pieces, datanames, cellDataNames);
+                WbWriterVtkXmlASCII::getInstance()->writeParallelFile(pfilePath, pieces, datanames, cellDataNames);
         found = pname.find_last_of("/");
         piece = pname.substr(found + 1);
 
         std::vector<std::string> filenames;
         filenames.push_back(piece);
-        if (step == CoProcessor::scheduler->getMinBegin()) {
+        if (step == CoProcessor::scheduler->getMinBegin())
+        {
             WbWriterVtkXmlASCII::getInstance()->writeCollection(cfilePath, filenames, istep, false);
-        } else {
+        } else
+        {
             WbWriterVtkXmlASCII::getInstance()->addFilesToCollection(cfilePath, filenames, istep, false);
         }
         UBLOG(logINFO, "WriteMacroscopicQuantitiesCoProcessor step: " << istep);
@@ -121,6 +131,7 @@ void WriteMacroscopicQuantitiesCoProcessor::collectData(double step)
 
     clearData();
 }
+
 //////////////////////////////////////////////////////////////////////////
 void WriteMacroscopicQuantitiesCoProcessor::clearData()
 {
@@ -129,16 +140,23 @@ void WriteMacroscopicQuantitiesCoProcessor::clearData()
     datanames.clear();
     data.clear();
 }
+
 //////////////////////////////////////////////////////////////////////////
 void WriteMacroscopicQuantitiesCoProcessor::addDataMQ(SPtr<Block3D> block)
 {
-    // This data is written:
+    double level   = (double)block->getLevel();
+
+    // This data is written:
     datanames.resize(0);
-    datanames.emplace_back("DRho");
-    datanames.emplace_back("Press");
-    datanames.emplace_back("Vx");
-    datanames.emplace_back("Vy");
-    datanames.emplace_back("Vz");
+    datanames.push_back("Rho");
+    datanames.push_back("Vx");
+    datanames.push_back("Vy");
+    datanames.push_back("Vz");
+    // datanames.push_back("Press");
+    datanames.push_back("Level");
+    // datanames.push_back("BlockID");
+    // datanames.push_back("gamma");
+    // datanames.push_back("collFactor");
 
     data.resize(datanames.size());
 
@@ -146,10 +164,10 @@ void WriteMacroscopicQuantitiesCoProcessor::addDataMQ(SPtr<Block3D> block)
     SPtr<BCArray3D> bcArray                 = kernel->getBCProcessor()->getBCArray();
     SPtr<DistributionArray3D> distributions = kernel->getDataSet()->getFdistributions();
     LBMReal f[D3Q27System::ENDF + 1];
-    LBMReal vx1, vx2, vx3, drho, press;
+    LBMReal vx1, vx2, vx3, rho;
 
-    // node numbering always starts at 0!
-    unsigned int SWB, SEB, NEB, NWB, SWT, SET, NET, NWT;
+    // node numbering always starts at 0!
+    int SWB, SEB, NEB, NWB, SWT, SET, NET, NWT;
 
     if (block->getKernel()->getCompressible()) {
         calcMacros = &D3Q27System::calcCompMacroscopicValues;
@@ -165,12 +183,21 @@ void WriteMacroscopicQuantitiesCoProcessor::addDataMQ(SPtr<Block3D> block)
     int maxX2 = (int)(distributions->getNX2());
     int maxX3 = (int)(distributions->getNX3());
 
-    // assign numbers and create node vector + collect data
+    // int minX1 = 1;
+    // int minX2 = 1;
+    // int minX3 = 1;
+
+    // int maxX1 = (int)(distributions->getNX1());
+    // int maxX2 = (int)(distributions->getNX2());
+    // int maxX3 = (int)(distributions->getNX3());
+
+    // assign numbers and create node vector + collect data
     CbArray3D<int> nodeNumbers((int)maxX1, (int)maxX2, (int)maxX3, -1);
     maxX1 -= 2;
     maxX2 -= 2;
     maxX3 -= 2;
 
+    // D3Q27BoundaryConditionPtr bcPtr;
     int nr = (int)nodes.size();
 
     for (int ix3 = minX3; ix3 <= maxX3; ix3++) {
@@ -180,44 +207,60 @@ void WriteMacroscopicQuantitiesCoProcessor::addDataMQ(SPtr<Block3D> block)
                     int index                  = 0;
                     nodeNumbers(ix1, ix2, ix3) = nr++;
                     Vector3D worldCoordinates  = grid->getNodeCoordinates(block, ix1, ix2, ix3);
-                    nodes.emplace_back(float(worldCoordinates[0]), float(worldCoordinates[1]),
-                                       float(worldCoordinates[2]));
+                    nodes.push_back(UbTupleFloat3(float(worldCoordinates[0]), float(worldCoordinates[1]),
+                                                  float(worldCoordinates[2])));
 
                     distributions->getDistribution(f, ix1, ix2, ix3);
-                    calcMacros(f, drho, vx1, vx2, vx3);
-                    press = D3Q27System::calcPress(f, drho, vx1, vx2, vx3);
-
-                    if (UbMath::isNaN(drho) || UbMath::isInfinity(drho))
-                        UB_THROW(UbException(
-                            UB_EXARGS, "drho is not a number (nan or -1.#IND) or infinity number -1.#INF in block=" +
-                                           block->toString() + ", node=" + UbSystem::toString(ix1) + "," +
-                                           UbSystem::toString(ix2) + "," + UbSystem::toString(ix3)));
+                    calcMacros(f, rho, vx1, vx2, vx3);
+                    double press = D3Q27System::getPressure(f); // D3Q27System::calcPress(f,rho,vx1,vx2,vx3);
+
+                    if (UbMath::isNaN(rho) || UbMath::isInfinity(rho))
+                        // UB_THROW( UbException(UB_EXARGS,"rho is not a number (nan or -1.#IND) or infinity number
+                        // -1.#INF in block="+block->toString()+",
+                        // node="+UbSystem::toString(ix1)+","+UbSystem::toString(ix2)+","+UbSystem::toString(ix3)));
+                        rho = 999.0;
                     if (UbMath::isNaN(press) || UbMath::isInfinity(press))
-                        UB_THROW(UbException(
-                            UB_EXARGS, "press is not a number (nan or -1.#IND) or infinity number -1.#INF in block=" +
-                                           block->toString() + ", node=" + UbSystem::toString(ix1) + "," +
-                                           UbSystem::toString(ix2) + "," + UbSystem::toString(ix3)));
+                        // UB_THROW( UbException(UB_EXARGS,"press is not a number (nan or -1.#IND) or infinity number
+                        // -1.#INF in block="+block->toString()+",
+                        // node="+UbSystem::toString(ix1)+","+UbSystem::toString(ix2)+","+UbSystem::toString(ix3)));
+                        press = 999.0;
                     if (UbMath::isNaN(vx1) || UbMath::isInfinity(vx1))
-                        UB_THROW(UbException(
-                            UB_EXARGS, "vx1 is not a number (nan or -1.#IND) or infinity number -1.#INF in block=" +
-                                           block->toString() + ", node=" + UbSystem::toString(ix1) + "," +
-                                           UbSystem::toString(ix2) + "," + UbSystem::toString(ix3)));
+                        // UB_THROW( UbException(UB_EXARGS,"vx1 is not a number (nan or -1.#IND) or infinity number
+                        // -1.#INF in block="+block->toString()+",
+                        // node="+UbSystem::toString(ix1)+","+UbSystem::toString(ix2)+","+UbSystem::toString(ix3)));
+                        vx1 = 999.0;
                     if (UbMath::isNaN(vx2) || UbMath::isInfinity(vx2))
-                        UB_THROW(UbException(
-                            UB_EXARGS, "vx2 is not a number (nan or -1.#IND) or infinity number -1.#INF in block=" +
-                                           block->toString() + ", node=" + UbSystem::toString(ix1) + "," +
-                                           UbSystem::toString(ix2) + "," + UbSystem::toString(ix3)));
+                        // UB_THROW( UbException(UB_EXARGS,"vx2 is not a number (nan or -1.#IND) or infinity number
+                        // -1.#INF in block="+block->toString()+",
+                        // node="+UbSystem::toString(ix1)+","+UbSystem::toString(ix2)+","+UbSystem::toString(ix3)));
+                        vx2 = 999.0;
                     if (UbMath::isNaN(vx3) || UbMath::isInfinity(vx3))
-                        UB_THROW(UbException(
-                            UB_EXARGS, "vx3 is not a number (nan or -1.#IND) or infinity number -1.#INF in block=" +
-                                           block->toString() + ", node=" + UbSystem::toString(ix1) + "," +
-                                           UbSystem::toString(ix2) + "," + UbSystem::toString(ix3)));
-
-                    data[index++].push_back(drho * conv->getFactorDensityLbToW());
-                    data[index++].push_back(press * conv->getFactorPressureLbToW());
-                    data[index++].push_back(vx1 * conv->getFactorVelocityLbToW());
-                    data[index++].push_back(vx2 * conv->getFactorVelocityLbToW());
-                    data[index++].push_back(vx3 * conv->getFactorVelocityLbToW());
+                        // UB_THROW( UbException(UB_EXARGS,"vx3 is not a number (nan or -1.#IND) or infinity number
+                        // -1.#INF in block="+block->toString()+",
+                        // node="+UbSystem::toString(ix1)+","+UbSystem::toString(ix2)+","+UbSystem::toString(ix3)));
+                        vx3 = 999.0;
+
+                    data[index++].push_back(rho);
+                    data[index++].push_back(vx1);
+                    data[index++].push_back(vx2);
+                    data[index++].push_back(vx3);
+
+                    // shearRate = D3Q27System::getShearRate(f, collFactor);
+
+                    // LBMReal collFactorF = BinghamModelLBMKernel::getBinghamCollFactor(collFactor, yieldStress,
+                    // shearRate, rho);
+
+                    // data[index++].push_back(shearRate);
+                    // data[index++].push_back(collFactorF);
+
+                    // data[index++].push_back((rho+1.0) * conv->getFactorDensityLbToW() );
+                    // data[index++].push_back(vx1 * conv->getFactorVelocityLbToW());
+                    // data[index++].push_back(vx2 * conv->getFactorVelocityLbToW());
+                    // data[index++].push_back(vx3 * conv->getFactorVelocityLbToW());
+                    // data[index++].push_back((press * conv->getFactorPressureLbToW()) / ((rho+1.0) *
+                    // conv->getFactorDensityLbToW()));
+                    data[index++].push_back(level);
+                    // data[index++].push_back(blockID);
                 }
             }
         }
@@ -234,7 +277,9 @@ void WriteMacroscopicQuantitiesCoProcessor::addDataMQ(SPtr<Block3D> block)
                     (SWT = nodeNumbers(ix1, ix2, ix3 + 1)) >= 0 && (SET = nodeNumbers(ix1 + 1, ix2, ix3 + 1)) >= 0 &&
                     (NET = nodeNumbers(ix1 + 1, ix2 + 1, ix3 + 1)) >= 0 &&
                     (NWT = nodeNumbers(ix1, ix2 + 1, ix3 + 1)) >= 0) {
-                    cells.push_back(makeUbTuple(SWB, SEB, NEB, NWB, SWT, SET, NET, NWT));
+                    cells.push_back(makeUbTuple((unsigned int)SWB, (unsigned int)SEB, (unsigned int)NEB,
+                                                (unsigned int)NWB, (unsigned int)SWT, (unsigned int)SET,
+                                                (unsigned int)NET, (unsigned int)NWT));
                 }
             }
         }
diff --git a/src/cpu/VirtualFluidsCore/Data/D3Q27EsoTwist3DSplittedVector.cpp b/src/cpu/VirtualFluidsCore/Data/D3Q27EsoTwist3DSplittedVector.cpp
index d7e2286d4599aabf4e7b6e4c9d6a824b38e6d873..6f8a6e74664cf82a550b9000071d4f6beb9ebac2 100644
--- a/src/cpu/VirtualFluidsCore/Data/D3Q27EsoTwist3DSplittedVector.cpp
+++ b/src/cpu/VirtualFluidsCore/Data/D3Q27EsoTwist3DSplittedVector.cpp
@@ -47,7 +47,7 @@ D3Q27EsoTwist3DSplittedVector::D3Q27EsoTwist3DSplittedVector(size_t nx1, size_t
     this->nonLocalDistributions =
         std::make_shared<CbArray4D<LBMReal, IndexerX4X3X2X1>>(13, nx1 + 1, nx2 + 1, nx3 + 1, value);
 
-    this->restDistributions = std::make_shared<CbArray3D<LBMReal, IndexerX3X2X1>>(nx1, nx2, nx3, value);
+    this->zeroDistributions = std::make_shared<CbArray3D<LBMReal, IndexerX3X2X1>>(nx1, nx2, nx3, value);
 }
 //////////////////////////////////////////////////////////////////////////
 D3Q27EsoTwist3DSplittedVector::~D3Q27EsoTwist3DSplittedVector() = default;
@@ -84,7 +84,7 @@ void D3Q27EsoTwist3DSplittedVector::getDistribution(LBMReal *const f, size_t x1,
     f[D3Q27System::BNW] = (*this->nonLocalDistributions)(D3Q27System::ET_BNW, x1 + 1, x2, x3 + 1);
     f[D3Q27System::BNE] = (*this->nonLocalDistributions)(D3Q27System::ET_BNE, x1, x2, x3 + 1);
 
-    f[D3Q27System::REST] = (*this->restDistributions)(x1, x2, x3);
+    f[D3Q27System::ZERO] = (*this->zeroDistributions)(x1, x2, x3);
 }
 //////////////////////////////////////////////////////////////////////////
 void D3Q27EsoTwist3DSplittedVector::setDistribution(const LBMReal *const f, size_t x1, size_t x2, size_t x3)
@@ -117,7 +117,7 @@ void D3Q27EsoTwist3DSplittedVector::setDistribution(const LBMReal *const f, size
     (*this->nonLocalDistributions)(D3Q27System::ET_BNW, x1 + 1, x2, x3 + 1)     = f[D3Q27System::INV_BNW];
     (*this->nonLocalDistributions)(D3Q27System::ET_BNE, x1, x2, x3 + 1)         = f[D3Q27System::INV_BNE];
 
-    (*this->restDistributions)(x1, x2, x3) = f[D3Q27System::REST];
+    (*this->zeroDistributions)(x1, x2, x3) = f[D3Q27System::ZERO];
 }
 //////////////////////////////////////////////////////////////////////////
 void D3Q27EsoTwist3DSplittedVector::getDistributionInv(LBMReal *const f, size_t x1, size_t x2, size_t x3)
@@ -150,7 +150,7 @@ void D3Q27EsoTwist3DSplittedVector::getDistributionInv(LBMReal *const f, size_t
     f[D3Q27System::INV_BNW] = (*this->nonLocalDistributions)(D3Q27System::ET_BNW, x1 + 1, x2, x3 + 1);
     f[D3Q27System::INV_BNE] = (*this->nonLocalDistributions)(D3Q27System::ET_BNE, x1, x2, x3 + 1);
 
-    f[D3Q27System::REST] = (*this->restDistributions)(x1, x2, x3);
+    f[D3Q27System::ZERO] = (*this->zeroDistributions)(x1, x2, x3);
 }
 //////////////////////////////////////////////////////////////////////////
 void D3Q27EsoTwist3DSplittedVector::setDistributionInv(const LBMReal *const f, size_t x1, size_t x2, size_t x3)
@@ -183,7 +183,7 @@ void D3Q27EsoTwist3DSplittedVector::setDistributionInv(const LBMReal *const f, s
     (*this->nonLocalDistributions)(D3Q27System::ET_BNW, x1 + 1, x2, x3 + 1)     = f[D3Q27System::BNW];
     (*this->nonLocalDistributions)(D3Q27System::ET_BNE, x1, x2, x3 + 1)         = f[D3Q27System::BNE];
 
-    (*this->restDistributions)(x1, x2, x3) = f[D3Q27System::REST];
+    (*this->zeroDistributions)(x1, x2, x3) = f[D3Q27System::ZERO];
 }
 //////////////////////////////////////////////////////////////////////////
 void D3Q27EsoTwist3DSplittedVector::setDistributionForDirection(const LBMReal *const f, size_t x1, size_t x2, size_t x3,
@@ -241,8 +241,8 @@ void D3Q27EsoTwist3DSplittedVector::setDistributionForDirection(const LBMReal *c
         (*this->localDistributions)(D3Q27System::ET_TSW, x1 + 1, x2 + 1, x3) = f[D3Q27System::BNE];
     if ((direction & EsoTwistD3Q27System::etTSW) == EsoTwistD3Q27System::etTSW)
         (*this->nonLocalDistributions)(D3Q27System::ET_BNE, x1, x2, x3 + 1) = f[D3Q27System::TSW];
-    if ((direction & EsoTwistD3Q27System::REST) == EsoTwistD3Q27System::REST)
-        (*this->restDistributions)(x1, x2, x3) = f[D3Q27System::REST];
+    if ((direction & EsoTwistD3Q27System::ZERO) == EsoTwistD3Q27System::ZERO)
+        (*this->zeroDistributions)(x1, x2, x3) = f[D3Q27System::ZERO];
 }
 //////////////////////////////////////////////////////////////////////////
 void D3Q27EsoTwist3DSplittedVector::setDistributionForDirection(LBMReal f, size_t x1, size_t x2, size_t x3,
@@ -327,8 +327,8 @@ void D3Q27EsoTwist3DSplittedVector::setDistributionForDirection(LBMReal f, size_
         case D3Q27System::TSW:
             (*this->nonLocalDistributions)(D3Q27System::ET_BNE, x1, x2, x3 + 1) = f;
             break;
-        case D3Q27System::REST:
-            (*this->restDistributions)(x1, x2, x3) = f;
+        case D3Q27System::ZERO:
+            (*this->zeroDistributions)(x1, x2, x3) = f;
             break;
         default:
             UB_THROW(UbException(UB_EXARGS, "Direction didn't find"));
@@ -390,8 +390,8 @@ void D3Q27EsoTwist3DSplittedVector::setDistributionInvForDirection(const LBMReal
         (*this->nonLocalDistributions)(D3Q27System::ET_BNE, x1, x2, x3 + 1) = f[D3Q27System::BNE];
     if ((direction & EsoTwistD3Q27System::etTSW) == EsoTwistD3Q27System::etTSW)
         (*this->localDistributions)(D3Q27System::ET_TSW, x1 + 1, x2 + 1, x3) = f[D3Q27System::TSW];
-    if ((direction & EsoTwistD3Q27System::REST) == EsoTwistD3Q27System::REST)
-        (*this->restDistributions)(x1, x2, x3) = f[D3Q27System::REST];
+    if ((direction & EsoTwistD3Q27System::ZERO) == EsoTwistD3Q27System::ZERO)
+        (*this->zeroDistributions)(x1, x2, x3) = f[D3Q27System::ZERO];
 }
 //////////////////////////////////////////////////////////////////////////
 void D3Q27EsoTwist3DSplittedVector::setDistributionInvForDirection(LBMReal f, size_t x1, size_t x2, size_t x3,
@@ -476,8 +476,8 @@ void D3Q27EsoTwist3DSplittedVector::setDistributionInvForDirection(LBMReal f, si
         case D3Q27System::TSW:
             (*this->localDistributions)(D3Q27System::ET_TSW, x1 + 1, x2 + 1, x3) = f;
             break;
-        case D3Q27System::REST:
-            (*this->restDistributions)(x1, x2, x3) = f;
+        case D3Q27System::ZERO:
+            (*this->zeroDistributions)(x1, x2, x3) = f;
             break;
         default:
             UB_THROW(UbException(UB_EXARGS, "Direction didn't find"));
@@ -539,8 +539,8 @@ LBMReal D3Q27EsoTwist3DSplittedVector::getDistributionForDirection(size_t x1, si
             return (*this->localDistributions)(D3Q27System::ET_TSW, x1 + 1, x2 + 1, x3);
         case D3Q27System::BNE:
             return (*this->nonLocalDistributions)(D3Q27System::ET_BNE, x1, x2, x3 + 1);
-        case D3Q27System::REST:
-            return (*this->restDistributions)(x1, x2, x3);
+        case D3Q27System::ZERO:
+            return (*this->zeroDistributions)(x1, x2, x3);
         default:
             UB_THROW(UbException(UB_EXARGS, "Direction didn't find"));
     }
@@ -601,8 +601,8 @@ LBMReal D3Q27EsoTwist3DSplittedVector::getDistributionInvForDirection(size_t x1,
             return (*this->localDistributions)(D3Q27System::ET_TSW, x1 + 1, x2 + 1, x3);
         case D3Q27System::TSW:
             return (*this->nonLocalDistributions)(D3Q27System::ET_BNE, x1, x2, x3 + 1);
-        case D3Q27System::REST:
-            return (*this->restDistributions)(x1, x2, x3);
+        case D3Q27System::ZERO:
+            return (*this->zeroDistributions)(x1, x2, x3);
         default:
             UB_THROW(UbException(UB_EXARGS, "Direction didn't find"));
     }
@@ -626,7 +626,7 @@ CbArray4D<LBMReal, IndexerX4X3X2X1>::CbArray4DPtr D3Q27EsoTwist3DSplittedVector:
 //////////////////////////////////////////////////////////////////////////
 CbArray3D<LBMReal, IndexerX3X2X1>::CbArray3DPtr D3Q27EsoTwist3DSplittedVector::getZeroDistributions()
 {
-    return this->restDistributions;
+    return this->zeroDistributions;
 }
 //////////////////////////////////////////////////////////////////////////
 void D3Q27EsoTwist3DSplittedVector::setNX1(size_t newNX1) { NX1 = newNX1; }
@@ -647,7 +647,7 @@ void D3Q27EsoTwist3DSplittedVector::setNonLocalDistributions(CbArray4D<LBMReal,
 //////////////////////////////////////////////////////////////////////////
 void D3Q27EsoTwist3DSplittedVector::setZeroDistributions(CbArray3D<LBMReal, IndexerX3X2X1>::CbArray3DPtr array)
 {
-    restDistributions = array;
+    zeroDistributions = array;
 }
 
 //////////////////////////////////////////////////////////////////////////
diff --git a/src/cpu/VirtualFluidsCore/Data/D3Q27EsoTwist3DSplittedVector.h b/src/cpu/VirtualFluidsCore/Data/D3Q27EsoTwist3DSplittedVector.h
index df44457b04a36918643400d60ec5f514e32982ea..1c0d7d05f1392c8c116863e9e0b41000c90ed15e 100644
--- a/src/cpu/VirtualFluidsCore/Data/D3Q27EsoTwist3DSplittedVector.h
+++ b/src/cpu/VirtualFluidsCore/Data/D3Q27EsoTwist3DSplittedVector.h
@@ -100,7 +100,7 @@ public:
 protected:
     CbArray4D<LBMReal, IndexerX4X3X2X1>::CbArray4DPtr localDistributions;
     CbArray4D<LBMReal, IndexerX4X3X2X1>::CbArray4DPtr nonLocalDistributions;
-    CbArray3D<LBMReal, IndexerX3X2X1>::CbArray3DPtr restDistributions;
+    CbArray3D<LBMReal, IndexerX3X2X1>::CbArray3DPtr zeroDistributions;
     size_t NX1, NX2, NX3;
 
     friend class MPIIORestartCoProcessor;
diff --git a/src/cpu/VirtualFluidsCore/Data/DataSet3D.h b/src/cpu/VirtualFluidsCore/Data/DataSet3D.h
index c0171588bcd4a74680326c44db062dba63d4c41c..4930beeab491caf541c436a2e60b326b7cd54c64 100644
--- a/src/cpu/VirtualFluidsCore/Data/DataSet3D.h
+++ b/src/cpu/VirtualFluidsCore/Data/DataSet3D.h
@@ -51,6 +51,9 @@ public:
     SPtr<DistributionArray3D> getFdistributions() const;
     void setFdistributions(SPtr<DistributionArray3D> distributions);
 
+    SPtr<DistributionArray3D> getHdistributions() const;
+    void setHdistributions(SPtr<DistributionArray3D> distributions);
+
     SPtr<AverageValuesArray3D> getAverageDensity() const;
     void setAverageDensity(SPtr<AverageValuesArray3D> values);
 
@@ -71,10 +74,12 @@ public:
 
     SPtr<RelaxationFactorArray3D> getRelaxationFactor() const;
     void setRelaxationFactor(SPtr<RelaxationFactorArray3D> values);
-
 protected:
 private:
     SPtr<DistributionArray3D> fdistributions;
+
+    SPtr<DistributionArray3D> hdistributions;
+
     SPtr<AverageValuesArray3D> averageValues;
 
     SPtr<AverageValuesArray3D> averageDensity;
@@ -85,40 +90,96 @@ private:
     SPtr<ShearStressValuesArray3D> shearStressValues;
 
     SPtr<RelaxationFactorArray3D> relaxationFactor;
+
 };
 
-inline SPtr<DistributionArray3D> DataSet3D::getFdistributions() const { return fdistributions; }
+inline SPtr<DistributionArray3D> DataSet3D::getFdistributions() const
+{
+    return fdistributions;
+}
 
-inline void DataSet3D::setFdistributions(SPtr<DistributionArray3D> distributions) { fdistributions = distributions; }
+inline void DataSet3D::setFdistributions(SPtr<DistributionArray3D> distributions)
+{
+    fdistributions = distributions;
+}
 
-inline SPtr<AverageValuesArray3D> DataSet3D::getAverageValues() const { return averageValues; }
+inline SPtr<DistributionArray3D> DataSet3D::getHdistributions() const
+{
+    return hdistributions;
+}
 
-inline void DataSet3D::setAverageValues(SPtr<AverageValuesArray3D> values) { averageValues = values; }
+inline void DataSet3D::setHdistributions(SPtr<DistributionArray3D> distributions)
+{
+    hdistributions = distributions;
+}
 
-inline SPtr<AverageValuesArray3D> DataSet3D::getAverageDensity() const { return averageDensity; }
+inline SPtr<AverageValuesArray3D> DataSet3D::getAverageValues() const
+{
+    return averageValues;
+}
 
-inline void DataSet3D::setAverageDensity(SPtr<AverageValuesArray3D> values) { averageDensity = values; }
+inline void DataSet3D::setAverageValues(SPtr<AverageValuesArray3D> values)
+{
+    averageValues = values;
+}
 
-inline SPtr<AverageValuesArray3D> DataSet3D::getAverageVelocity() const { return averageVelocity; }
+inline SPtr<AverageValuesArray3D> DataSet3D::getAverageDensity() const
+{
+    return averageDensity;
+}
 
-inline void DataSet3D::setAverageVelocity(SPtr<AverageValuesArray3D> values) { averageVelocity = values; }
+inline void DataSet3D::setAverageDensity(SPtr<AverageValuesArray3D> values)
+{
+    averageDensity = values;
+}
 
-inline SPtr<AverageValuesArray3D> DataSet3D::getAverageFluctuations() const { return averageFluktuations; }
+inline SPtr<AverageValuesArray3D> DataSet3D::getAverageVelocity() const
+{
+    return averageVelocity;
+}
+
+inline void DataSet3D::setAverageVelocity(SPtr<AverageValuesArray3D> values)
+{
+    averageVelocity = values;
+}
 
-inline void DataSet3D::setAverageFluctuations(SPtr<AverageValuesArray3D> values) { averageFluktuations = values; }
+inline SPtr<AverageValuesArray3D> DataSet3D::getAverageFluctuations() const
+{
+    return averageFluktuations;
+}
+
+inline void DataSet3D::setAverageFluctuations(SPtr<AverageValuesArray3D> values)
+{
+    averageFluktuations = values;
+}
 
-inline SPtr<AverageValuesArray3D> DataSet3D::getAverageTriplecorrelations() const { return averageTriplecorrelations; }
+inline SPtr<AverageValuesArray3D> DataSet3D::getAverageTriplecorrelations() const
+{
+    return averageTriplecorrelations;
+}
 
 inline void DataSet3D::setAverageTriplecorrelations(SPtr<AverageValuesArray3D> values)
 {
     averageTriplecorrelations = values;
 }
 
-inline SPtr<ShearStressValuesArray3D> DataSet3D::getShearStressValues() const { return shearStressValues; }
+inline SPtr<ShearStressValuesArray3D> DataSet3D::getShearStressValues() const
+{
+    return shearStressValues;
+}
 
-inline void DataSet3D::setShearStressValues(SPtr<ShearStressValuesArray3D> values) { shearStressValues = values; }
+inline void DataSet3D::setShearStressValues(SPtr<ShearStressValuesArray3D> values)
+{
+    shearStressValues = values;
+}
 
-inline SPtr<RelaxationFactorArray3D> DataSet3D::getRelaxationFactor() const { return relaxationFactor; }
+inline SPtr<RelaxationFactorArray3D> DataSet3D::getRelaxationFactor() const
+{
+    return relaxationFactor;
+}
 
-inline void DataSet3D::setRelaxationFactor(SPtr<RelaxationFactorArray3D> values) { relaxationFactor = values; }
+inline void DataSet3D::setRelaxationFactor(SPtr<RelaxationFactorArray3D> values)
+{
+    relaxationFactor = values;
+}
 #endif
diff --git a/src/cpu/VirtualFluidsCore/Data/DistributionArray3D.h b/src/cpu/VirtualFluidsCore/Data/DistributionArray3D.h
index 242c6b7a1c156216a88d4c06c4945040383e39af..8fe4dccea1b53da0513a093e8a741cd0071caf48 100644
--- a/src/cpu/VirtualFluidsCore/Data/DistributionArray3D.h
+++ b/src/cpu/VirtualFluidsCore/Data/DistributionArray3D.h
@@ -41,9 +41,9 @@ class DistributionArray3D
 {
 public:
     DistributionArray3D() = default;
-    ;
+
     virtual ~DistributionArray3D() = default;
-    ;
+
     //! get number of nodes for x1 direction
     virtual size_t getNX1() const = 0;
     //! get number of nodes for x2 direction
diff --git a/src/cpu/VirtualFluidsCore/Data/EsoTwist3D.h b/src/cpu/VirtualFluidsCore/Data/EsoTwist3D.h
index 689589222634edfc52dcfbf358ffc8d32dd1186b..319a9200cc204b0f9b869b2e52353e717a89d783 100644
--- a/src/cpu/VirtualFluidsCore/Data/EsoTwist3D.h
+++ b/src/cpu/VirtualFluidsCore/Data/EsoTwist3D.h
@@ -43,13 +43,23 @@
 // Geier, M., & Schönherr, M. (2017). Esoteric twist: an efficient in-place streaming algorithmus for the lattice
 // Boltzmann method on massively parallel hardware. Computation, 5(2), 19.
 
+class EsoTwistD3Q27UnrollArray
+{
+};
+class EsoTwistPlusD3Q27UnrollArray
+{
+};
+class EsoTwistPlusD3Q19UnrollArray
+{
+};
+
 class EsoTwist3D : public DistributionArray3D
 {
 public:
     EsoTwist3D() = default;
-    ;
+
     ~EsoTwist3D() override = default;
-    ;
+
     //////////////////////////////////////////////////////////////////////////
     void swap() override = 0;
     //////////////////////////////////////////////////////////////////////////
diff --git a/src/cpu/VirtualFluidsCore/Data/EsoTwistD3Q27System.cpp b/src/cpu/VirtualFluidsCore/Data/EsoTwistD3Q27System.cpp
index c456be678449744475a0ac6932850dceb0ee6f1c..1a13aa008ab49a48f1d16c7a2a71ea39dfb191ab 100644
--- a/src/cpu/VirtualFluidsCore/Data/EsoTwistD3Q27System.cpp
+++ b/src/cpu/VirtualFluidsCore/Data/EsoTwistD3Q27System.cpp
@@ -35,7 +35,7 @@
 
 // index                                                              0   1   2   3   4   5  6   7   8    9  10  11  12
 // 13  14  15  16  17  18  19  20  21  22  23  24  25  26 f: E,  W,  N,  S,  T,  B, NE, SW, SE, NW, TE, BW, BE, TW, TN,
-// BS, BN, TS, TNE TNW TSE TSW BNE BNW BSE BSW REST
+// BS, BN, TS, TNE TNW TSE TSW BNE BNW BSE BSW ZERO
 const int EsoTwistD3Q27System::ETX1[EsoTwistD3Q27System::ENDF + 1] = { 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
                                                                        0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0 };
 const int EsoTwistD3Q27System::ETX2[EsoTwistD3Q27System::ENDF + 1] = { 0, 0, 0,  1, 0, 0,  0, 1, 0, -1, 0, 0, 0, 0,
@@ -49,7 +49,7 @@ const int EsoTwistD3Q27System::etINVDIR[EsoTwistD3Q27System::ENDF + 1] = {
     D3Q27System::INV_TE,  D3Q27System::INV_BW,  D3Q27System::INV_BE,  D3Q27System::INV_TW,  D3Q27System::INV_TN,
     D3Q27System::INV_BS,  D3Q27System::INV_BN,  D3Q27System::INV_TS,  D3Q27System::INV_TNE, D3Q27System::INV_TNW,
     D3Q27System::INV_TSE, D3Q27System::INV_TSW, D3Q27System::INV_BNE, D3Q27System::INV_BNW, D3Q27System::INV_BSE,
-    D3Q27System::INV_BSW, D3Q27System::REST
+    D3Q27System::INV_BSW, D3Q27System::ZERO
 };
 
 const unsigned long int EsoTwistD3Q27System::etDIR[EsoTwistD3Q27System::ENDF + 1] = {
diff --git a/src/cpu/VirtualFluidsCore/Data/EsoTwistD3Q27System.h b/src/cpu/VirtualFluidsCore/Data/EsoTwistD3Q27System.h
index 21752cc48a84b02bc24cb7efe9e3c5912f476dfd..a9214673ec4b4a66a52fa53f9b625ead0180768b 100644
--- a/src/cpu/VirtualFluidsCore/Data/EsoTwistD3Q27System.h
+++ b/src/cpu/VirtualFluidsCore/Data/EsoTwistD3Q27System.h
@@ -47,7 +47,7 @@ struct EsoTwistD3Q27System {
     const static int STARTDIR = D3Q27System::STARTDIR;
     const static int ENDDIR   = D3Q27System::ENDDIR;
 
-    static const int REST = D3Q27System::REST; /*f0 */
+    static const int ZERO = D3Q27System::ZERO; /*f0 */
     static const int E    = D3Q27System::E;    /*f1 */
     static const int W    = D3Q27System::W;    /*f2 */
     static const int N    = D3Q27System::N;    /*f3 */
diff --git a/src/cpu/VirtualFluidsCore/Grid/BasicCalculator.cpp b/src/cpu/VirtualFluidsCore/Grid/BasicCalculator.cpp
index 724d855e3dbfecbe3388ad4071a2a1b1666c1010..f1b2e5ad8c62babaea92cea50a646434f9757cd9 100644
--- a/src/cpu/VirtualFluidsCore/Grid/BasicCalculator.cpp
+++ b/src/cpu/VirtualFluidsCore/Grid/BasicCalculator.cpp
@@ -127,7 +127,7 @@ void BasicCalculator::calculate()
                 if (refinement) {
                     if (straightStartLevel < maxInitLevel)
                         exchangeBlockData(straightStartLevel, maxInitLevel);
-                        //////////////////////////////////////////////////////////////////////////
+                    //////////////////////////////////////////////////////////////////////////
 #ifdef TIMING
                     time[4] = timer.stop();
                     UBLOG(logINFO, "refinement exchangeBlockData time = " << time[4]);
@@ -155,14 +155,16 @@ void BasicCalculator::calculate()
     } catch (std::exception &e) {
         UBLOG(logERROR, e.what());
         UBLOG(logERROR, " step = " << calcStep);
-        // throw;
-        exit(EXIT_FAILURE);
+        // throw e;
+        // exit(EXIT_FAILURE);
     } catch (std::string &s) {
         UBLOG(logERROR, s);
-        exit(EXIT_FAILURE);
+        // exit(EXIT_FAILURE);
+        // throw s;
     } catch (...) {
         UBLOG(logERROR, "unknown exception");
-        exit(EXIT_FAILURE);
+        // exit(EXIT_FAILURE);
+        // throw;
     }
 }
 //////////////////////////////////////////////////////////////////////////
@@ -173,28 +175,27 @@ void BasicCalculator::calculateBlocks(int startLevel, int maxInitLevel, int calc
 #endif
     {
         SPtr<Block3D> blockTemp;
-        try {
-            // startLevel bis maxInitLevel
-            for (int level = startLevel; level <= maxInitLevel; level++) {
-                // timer.resetAndStart();
-                // call LBM kernel
-                int size = (int)blocks[level].size();
+        // from startLevel to maxInitLevel
+        for (int level = startLevel; level <= maxInitLevel; level++) {
+            // timer.resetAndStart();
+            // call LBM kernel
+            int size = (int)blocks[level].size();
 #ifdef _OPENMP
 #pragma omp for schedule(OMP_SCHEDULE)
 #endif
-                for (int i = 0; i < size; i++) {
+            for (int i = 0; i < size; i++) {
+                try {
                     blockTemp = blocks[level][i];
                     blockTemp->getKernel()->calculate(calcStep);
+                } catch (std::exception &e) {
+                    UBLOG(logERROR, e.what());
+                    UBLOG(logERROR, blockTemp->toString() << " step = " << calcStep);
+                    std::exit(EXIT_FAILURE);
                 }
-                // timer.stop();
-                // UBLOG(logINFO, "level = " << level << " blocks = " << blocks[level].size() << " collision time = " <<
-                // timer.getTotalTime());
             }
-        } catch (std::exception &e) {
-            UBLOG(logERROR, e.what());
-            // UBLOG(logERROR, blockTemp->toString()<<" step = "<<calcStep);
-            // throw;
-            exit(EXIT_FAILURE);
+            // timer.stop();
+            // UBLOG(logINFO, "level = " << level << " blocks = " << blocks[level].size() << " collision time = " <<
+            // timer.getTotalTime());
         }
     }
 }
@@ -239,8 +240,13 @@ void BasicCalculator::connectorsPrepareLocal(std::vector<SPtr<Block3DConnector>>
 #pragma omp parallel for schedule(OMP_SCHEDULE)
 #endif
     for (int i = 0; i < size; i++) {
-        connectors[i]->prepareForReceive();
-        connectors[i]->prepareForSend();
+        try {
+            connectors[i]->prepareForReceive();
+            connectors[i]->prepareForSend();
+        } catch (std::exception &e) {
+            UBLOG(logERROR, e.what());
+            std::exit(EXIT_FAILURE);
+        }
     }
 }
 //////////////////////////////////////////////////////////////////////////
@@ -251,8 +257,13 @@ void BasicCalculator::connectorsSendLocal(std::vector<SPtr<Block3DConnector>> &c
 #pragma omp parallel for schedule(OMP_SCHEDULE)
 #endif
     for (int i = 0; i < size; i++) {
-        connectors[i]->fillSendVectors();
-        connectors[i]->sendVectors();
+        try {
+            connectors[i]->fillSendVectors();
+            connectors[i]->sendVectors();
+        } catch (std::exception &e) {
+            UBLOG(logERROR, e.what());
+            std::exit(EXIT_FAILURE);
+        }
     }
 }
 //////////////////////////////////////////////////////////////////////////
@@ -321,36 +332,43 @@ void BasicCalculator::applyPreCollisionBC(int startLevel, int maxInitLevel)
 #pragma omp parallel for schedule(OMP_SCHEDULE)
 #endif
         for (int i = 0; i < size; i++) {
-            blocks[level][i]->getKernel()->getBCProcessor()->applyPreCollisionBC();
+            try {
+                blocks[level][i]->getKernel()->getBCProcessor()->applyPreCollisionBC();
+            } catch (std::exception &e) {
+                UBLOG(logERROR, e.what());
+                exit(EXIT_FAILURE);
+            } catch (std::string &s) {
+                UBLOG(logERROR, s);
+                exit(EXIT_FAILURE);
+            } catch (...) {
+                UBLOG(logERROR, "unknown exception");
+                exit(EXIT_FAILURE);
+            }
         }
     }
 }
 //////////////////////////////////////////////////////////////////////////
 void BasicCalculator::applyPostCollisionBC(int startLevel, int maxInitLevel)
 {
-    try {
-        // from startLevel to maxInitLevel
-        for (int level = startLevel; level <= maxInitLevel; level++) {
-            int size = (int)blocks[level].size();
+    //  from startLevel to maxInitLevel
+    for (int level = startLevel; level <= maxInitLevel; level++) {
+        int size = (int)blocks[level].size();
 #ifdef _OPENMP
 #pragma omp parallel for schedule(OMP_SCHEDULE)
 #endif
-            for (int i = 0; i < size; i++) {
+        for (int i = 0; i < size; i++) {
+            try {
                 blocks[level][i]->getKernel()->getBCProcessor()->applyPostCollisionBC();
+            } catch (std::exception &e) {
+                UBLOG(logERROR, e.what());
+                exit(EXIT_FAILURE);
+            } catch (std::string &s) {
+                UBLOG(logERROR, s);
+                exit(EXIT_FAILURE);
+            } catch (...) {
+                UBLOG(logERROR, "unknown exception");
+                exit(EXIT_FAILURE);
             }
         }
-    } catch (std::exception &e) {
-        UBLOG(logERROR, e.what());
-        // UBLOG(logERROR, " step = "<<calcStep);
-        // throw;
-        exit(EXIT_FAILURE);
-    } catch (std::string &s) {
-        UBLOG(logERROR, s);
-        // throw;
-        exit(EXIT_FAILURE);
-    } catch (...) {
-        UBLOG(logERROR, "unknown exception");
-        // throw;
-        exit(EXIT_FAILURE);
     }
 }
diff --git a/src/cpu/VirtualFluidsCore/Grid/BasicCalculator.h b/src/cpu/VirtualFluidsCore/Grid/BasicCalculator.h
index fb6e68ce0a56ed7e407ab23605a1facd83eace52..3ef1f4c712e552ea5d5b5e82306e2bd94d74d7ab 100644
--- a/src/cpu/VirtualFluidsCore/Grid/BasicCalculator.h
+++ b/src/cpu/VirtualFluidsCore/Grid/BasicCalculator.h
@@ -39,7 +39,8 @@
 class Block3DConnector;
 
 //! \class BasicCalculator
-//! \brief Class implements basic functionality with OpenMP parallelization for main calculation LBM loop
+//! \brief Class implements basic functionality with MPI + OpenMP parallelization for main calculation loop
+//! \author  Konstantin Kutscher
 
 class BasicCalculator : public Calculator
 {
diff --git a/src/cpu/VirtualFluidsCore/Grid/Block3D.cpp b/src/cpu/VirtualFluidsCore/Grid/Block3D.cpp
index 5e921c4f20390da51724f8e53caaab23e0270549..79753c144f5cfff831f1d0415e9434c50b11bcea 100644
--- a/src/cpu/VirtualFluidsCore/Grid/Block3D.cpp
+++ b/src/cpu/VirtualFluidsCore/Grid/Block3D.cpp
@@ -318,6 +318,13 @@ void Block3D::deleteInterpolationFlag()
     interpolationFlagCF = 0;
 }
 //////////////////////////////////////////////////////////////////////////
+double Block3D::getWorkLoad()
+{
+    // Work load = kernel calculation time multiplied by 2^level (1 << level).
+    const double levelFactor = static_cast<double>(1 << level);
+    return kernel->getCalculationTime() * levelFactor;
+}
+//////////////////////////////////////////////////////////////////////////
 std::string Block3D::toString()
 {
     std::stringstream ss;
diff --git a/src/cpu/VirtualFluidsCore/Grid/Block3D.h b/src/cpu/VirtualFluidsCore/Grid/Block3D.h
index 7a4a2aad75825d6e2aabd252f9c480ca3a96c5a5..b2279b069e6ee322023d30419f8eed5c587f95e8 100644
--- a/src/cpu/VirtualFluidsCore/Grid/Block3D.h
+++ b/src/cpu/VirtualFluidsCore/Grid/Block3D.h
@@ -134,6 +134,8 @@ public:
     bool hasInterpolationFlagFC(int dir);
     bool hasInterpolationFlagFC();
 
+    double getWorkLoad();
+
     std::string toString();
 
     static int getMaxGlobalID() { return counter; }
diff --git a/src/cpu/VirtualFluidsCore/Grid/Calculator.cpp b/src/cpu/VirtualFluidsCore/Grid/Calculator.cpp
index 083b30b9a4080183b2543dfe14f33d3551a093ec..fbeb2de979bb31dfb87441b5cfcfdf3393f0043c 100644
--- a/src/cpu/VirtualFluidsCore/Grid/Calculator.cpp
+++ b/src/cpu/VirtualFluidsCore/Grid/Calculator.cpp
@@ -124,11 +124,11 @@ void Calculator::initRemoteConnectors()
         // grid->getBlocks(level, gridRank, true, blockVector);
         grid->getBlocks(l, blockVector);
         for (SPtr<Block3D> block : blockVector) {
-            int l = block->getLevel();
-            block->pushBackRemoteSameLevelConnectors(remoteConns[l]);
+            int block_level = block->getLevel();
+            block->pushBackRemoteSameLevelConnectors(remoteConns[block_level]);
 
-            block->pushBackRemoteInterpolationConnectorsCF(remoteInterConnsCF[l]);
-            block->pushBackRemoteInterpolationConnectorsFC(remoteInterConnsFC[l]);
+            block->pushBackRemoteInterpolationConnectorsCF(remoteInterConnsCF[block_level]);
+            block->pushBackRemoteInterpolationConnectorsFC(remoteInterConnsFC[block_level]);
         }
     }
 
diff --git a/src/cpu/VirtualFluidsCore/Grid/Grid3D.cpp b/src/cpu/VirtualFluidsCore/Grid/Grid3D.cpp
index c78fda22d5f1b2fe95fcac5a94435f72cea1c6a5..92be5ed5a06e1909a34144cdd0d1b31000309281 100644
--- a/src/cpu/VirtualFluidsCore/Grid/Grid3D.cpp
+++ b/src/cpu/VirtualFluidsCore/Grid/Grid3D.cpp
@@ -153,6 +153,17 @@ bool Grid3D::deleteBlock(int ix1, int ix2, int ix3, int level)
         return false;
     }
 }
+void Grid3D::deleteBlocks()
+{
+    // Delete the blocks of levels MINLEVEL .. MAXLEVEL-1, one level at a time: the blocks of a
+    // level are collected into a local vector via getBlocks() and then handed to deleteBlock().
+    for (int level = Grid3DSystem::MINLEVEL; level < Grid3DSystem::MAXLEVEL; level++) {
+        std::vector<SPtr<Block3D>> blocksOfLevel;
+        getBlocks(level, blocksOfLevel);
+        for (const SPtr<Block3D> &block : blocksOfLevel)
+            deleteBlock(block);
+    }
+}
 //////////////////////////////////////////////////////////////////////////
 void Grid3D::replaceBlock(SPtr<Block3D> block)
 {
@@ -1328,7 +1339,7 @@ void Grid3D::getNeighborBlocksForDirectionWithDirZero(int dir, int ix1, int ix2,
         case Grid3DSystem::BSW:
             this->getNeighborsBottomSouthWest(ix1, ix2, ix3, level, levelDepth, blocks);
             break;
-        case Grid3DSystem::REST:
+        case Grid3DSystem::ZERO:
             this->getNeighborsZero(ix1, ix2, ix3, level, levelDepth, blocks);
             break;
         default:
diff --git a/src/cpu/VirtualFluidsCore/Grid/Grid3D.h b/src/cpu/VirtualFluidsCore/Grid/Grid3D.h
index 69c5106847bba69b651ee2c9ed84c0616798b1c3..84c821e84b8c98f17e39814a211de3262e75f804 100644
--- a/src/cpu/VirtualFluidsCore/Grid/Grid3D.h
+++ b/src/cpu/VirtualFluidsCore/Grid/Grid3D.h
@@ -74,6 +74,7 @@ public:
     void addBlock(SPtr<Block3D> block);
     bool deleteBlock(SPtr<Block3D> block);
     bool deleteBlock(int ix1, int ix2, int ix3, int level);
+    void deleteBlocks();
     void deleteBlocks(const std::vector<int> &ids);
     void replaceBlock(SPtr<Block3D> block);
     SPtr<Block3D> getBlock(int ix1, int ix2, int ix3, int level) const;
diff --git a/src/cpu/VirtualFluidsCore/Grid/Grid3DSystem.h b/src/cpu/VirtualFluidsCore/Grid/Grid3DSystem.h
index 008e38b88aec6be8695411d51c263084816fd2e1..ee61b8f7327e76a9393d4d3caa13c3a796470c08 100644
--- a/src/cpu/VirtualFluidsCore/Grid/Grid3DSystem.h
+++ b/src/cpu/VirtualFluidsCore/Grid/Grid3DSystem.h
@@ -71,7 +71,7 @@ static const int BNE          = 22;
 static const int BNW          = 23;
 static const int BSE          = 24;
 static const int BSW          = 25;
-static const int REST /*f0 */ = 26;
+static const int ZERO /*f0 */ = 26;
 
 static const int ENDDIR = 25;
 
@@ -104,6 +104,7 @@ static const int INV_BSW = TNE;
 
 extern const int INVDIR[ENDDIR + 1];
 
+static const int MINLEVEL = 0;
 static const int MAXLEVEL = 25;
 
 extern const int EX1[ENDDIR + 1];
diff --git a/src/cpu/VirtualFluidsCore/Interactors/D3Q27Interactor.cpp b/src/cpu/VirtualFluidsCore/Interactors/D3Q27Interactor.cpp
index 55ce8d1dd332c51d1acd8e1285ea0e81ec9ab00f..bf1895b930f1c61d36d537319b53fe4b0abcd960 100644
--- a/src/cpu/VirtualFluidsCore/Interactors/D3Q27Interactor.cpp
+++ b/src/cpu/VirtualFluidsCore/Interactors/D3Q27Interactor.cpp
@@ -219,7 +219,7 @@ void D3Q27Interactor::initInteractor(const double &timeStep)
     else
         this->unsetTimeDependent();
 
-    Interactor3D::initInteractor(timeStep);
+    updateBlocks();
 }
 //////////////////////////////////////////////////////////////////////////
 void D3Q27Interactor::updateInteractor(const double &timestep)
diff --git a/src/cpu/VirtualFluidsCore/Interactors/Interactor3D.cpp b/src/cpu/VirtualFluidsCore/Interactors/Interactor3D.cpp
index 0127c9c880f03d574f657ebc43e53ccaa4b67c7e..84526c62598b1d718b1f179228ae2a3f51839856 100644
--- a/src/cpu/VirtualFluidsCore/Interactors/Interactor3D.cpp
+++ b/src/cpu/VirtualFluidsCore/Interactors/Interactor3D.cpp
@@ -290,11 +290,10 @@ void Interactor3D::setInactive() { active = false; }
 //////////////////////////////////////////////////////////////////////////
 bool Interactor3D::isActive() { return active; }
 //////////////////////////////////////////////////////////////////////////
-void Interactor3D::initInteractor(const double & /*timeStep*/)
+void Interactor3D::updateBlocks()
 {
-    // UBLOG(logINFO, "transBlocks.size = "<<transBlocks.size());
-
-    for (SPtr<Block3D> block : bcBlocks) {
+    // Invoke setDifferencesToGbObject3D() on each block stored in bcBlocks.
+    for (const SPtr<Block3D> &block : bcBlocks) {
         this->setDifferencesToGbObject3D(block);
     }
 }
diff --git a/src/cpu/VirtualFluidsCore/Interactors/Interactor3D.h b/src/cpu/VirtualFluidsCore/Interactors/Interactor3D.h
index 9bf3a03ba179a1da7fa932a209d78e39b6622bcf..74627b76addaf6badaea678d1c4a20b274234b3a 100644
--- a/src/cpu/VirtualFluidsCore/Interactors/Interactor3D.h
+++ b/src/cpu/VirtualFluidsCore/Interactors/Interactor3D.h
@@ -57,7 +57,7 @@ public:
     Interactor3D(SPtr<GbObject3D> geoObject3D, SPtr<Grid3D> grid, int type, Interactor3D::Accuracy a);
 
     virtual ~Interactor3D();
-    virtual void initInteractor(const double &timestep = 0);
+    virtual void initInteractor(const double &timestep = 0) = 0;
     virtual void updateInteractor(const double &timestep = 0) = 0;
 
     void setSolidBlock(SPtr<Block3D> block);
@@ -76,7 +76,7 @@ public:
     SPtr<Grid3D> getGrid3D() const { return grid.lock(); }
     void setGrid3D(SPtr<Grid3D> grid) { this->grid = grid; }
     virtual SPtr<GbObject3D> getGbObject3D() const { return geoObject3D; }
-    virtual bool setDifferencesToGbObject3D(const SPtr<Block3D>  /*block*//*, const double& x1, const double& x2, const double& x3, const double& blockLengthX1, const double& blockLengthX2, const double& blockLengthX3, const double& timestep=0*/)
+    virtual bool setDifferencesToGbObject3D(const SPtr<Block3D>)
     {
         // UBLOG(logINFO, "Interactor3D::setDifferencesToGbObject3D()");
         return false;
@@ -123,6 +123,8 @@ protected:
     bool isBlockCuttingGeoObject(double minX1, double minX2, double minX3, double maxX1, double maxX2, double maxX3,
                                  double delta);
 
+    void updateBlocks();
+
     SPtr<GbObject3D> geoObject3D;
     WPtr<Grid3D> grid;
     int type;
diff --git a/src/cpu/VirtualFluidsCore/LBM/CumulantK17LBMKernel.cpp b/src/cpu/VirtualFluidsCore/LBM/CumulantK17LBMKernel.cpp
index 9cecabe4653b78df759db47fe3f48a43a090dad5..daed493b9cc1afddbd92acabcd551da0f463ea26 100644
--- a/src/cpu/VirtualFluidsCore/LBM/CumulantK17LBMKernel.cpp
+++ b/src/cpu/VirtualFluidsCore/LBM/CumulantK17LBMKernel.cpp
@@ -30,24 +30,24 @@
 //! \ingroup LBM
 //! \author Konstantin Kutscher, Martin Geier
 //=======================================================================================
-
 #include "CumulantK17LBMKernel.h"
-#include "BCArray3D.h"
-#include "Block3D.h"
-#include "D3Q27EsoTwist3DSplittedVector.h"
 #include "D3Q27System.h"
+#include "D3Q27EsoTwist3DSplittedVector.h"
+#include <cmath>
 #include "DataSet3D.h"
 #include "LBMKernel.h"
-#include <cmath>
+#include "Block3D.h"
+#include "BCArray3D.h"
 
 #define PROOF_CORRECTNESS
 
 using namespace UbMath;
 
 //////////////////////////////////////////////////////////////////////////
-CumulantK17LBMKernel::CumulantK17LBMKernel() { this->compressible = true; }
-//////////////////////////////////////////////////////////////////////////
-CumulantK17LBMKernel::~CumulantK17LBMKernel(void) = default;
+CumulantK17LBMKernel::CumulantK17LBMKernel()
+{
+    this->compressible = true; // the compressible flag is switched on at construction
+}
 //////////////////////////////////////////////////////////////////////////
 void CumulantK17LBMKernel::initDataSet()
 {
@@ -75,561 +75,561 @@ SPtr<LBMKernel> CumulantK17LBMKernel::clone()
 //////////////////////////////////////////////////////////////////////////
 void CumulantK17LBMKernel::calculate(int step)
 {
-   //////////////////////////////////////////////////////////////////////////
-	//! Cumulant K17 Kernel is based on
-   //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
-	//! and
-	//! <a href="https://doi.org/10.1016/j.jcp.2017.07.004"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.07.004 ]</b></a>
-	//!
-	//! The cumulant kernel is executed in the following steps
-	//!
-	////////////////////////////////////////////////////////////////////////////////
-	//! - Get node index coordinates from thredIdx, blockIdx, blockDim and gridDim.
-	//!
-   using namespace D3Q27System;
-   using namespace std;
-
-   //initializing of forcing stuff
-   if (withForcing)
-   {
-      muForcingX1.DefineVar("x1", &muX1); muForcingX1.DefineVar("x2", &muX2); muForcingX1.DefineVar("x3", &muX3);
-      muForcingX2.DefineVar("x1", &muX1); muForcingX2.DefineVar("x2", &muX2); muForcingX2.DefineVar("x3", &muX3);
-      muForcingX3.DefineVar("x1", &muX1); muForcingX3.DefineVar("x2", &muX2); muForcingX3.DefineVar("x3", &muX3);
-
-      muDeltaT = deltaT;
-
-      muForcingX1.DefineVar("dt", &muDeltaT);
-      muForcingX2.DefineVar("dt", &muDeltaT);
-      muForcingX3.DefineVar("dt", &muDeltaT);
-
-      muNu = (1.0 / 3.0) * (1.0 / collFactor - 1.0 / 2.0);
-
-      muForcingX1.DefineVar("nu", &muNu);
-      muForcingX2.DefineVar("nu", &muNu);
-      muForcingX3.DefineVar("nu", &muNu);
-   }
-   /////////////////////////////////////
-
-   localDistributions = dynamic_pointer_cast<D3Q27EsoTwist3DSplittedVector>(dataSet->getFdistributions())->getLocalDistributions();
-   nonLocalDistributions = dynamic_pointer_cast<D3Q27EsoTwist3DSplittedVector>(dataSet->getFdistributions())->getNonLocalDistributions();
-   restDistributions = dynamic_pointer_cast<D3Q27EsoTwist3DSplittedVector>(dataSet->getFdistributions())->getZeroDistributions();
-
-   SPtr<BCArray3D> bcArray = this->getBCProcessor()->getBCArray();
-
-   const int bcArrayMaxX1 = (int)bcArray->getNX1();
-   const int bcArrayMaxX2 = (int)bcArray->getNX2();
-   const int bcArrayMaxX3 = (int)bcArray->getNX3();
-
-   int minX1 = ghostLayerWidth;
-   int minX2 = ghostLayerWidth;
-   int minX3 = ghostLayerWidth;
-   int maxX1 = bcArrayMaxX1 - ghostLayerWidth;
-   int maxX2 = bcArrayMaxX2 - ghostLayerWidth;
-   int maxX3 = bcArrayMaxX3 - ghostLayerWidth;
-
-   LBMReal omega = collFactor;
-
-   for (int x3 = minX3; x3 < maxX3; x3++)
-   {
-      for (int x2 = minX2; x2 < maxX2; x2++)
-      {
-         for (int x1 = minX1; x1 < maxX1; x1++)
-         {
-            if (!bcArray->isSolid(x1, x2, x3) && !bcArray->isUndefined(x1, x2, x3))
+    //////////////////////////////////////////////////////////////////////////
+    //! Cumulant K17 Kernel is based on
+    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
+    //! and
+    //! <a href="https://doi.org/10.1016/j.jcp.2017.07.004"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.07.004 ]</b></a>
+    //!
+    //! The cumulant kernel is executed in the following steps
+    //!
+    ////////////////////////////////////////////////////////////////////////////////
+    //! - Get node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
+    //!
+
+    using namespace std;
+
+    //initializing of forcing stuff
+    if (withForcing)
+    {
+        muForcingX1.DefineVar("x1", &muX1); muForcingX1.DefineVar("x2", &muX2); muForcingX1.DefineVar("x3", &muX3);
+        muForcingX2.DefineVar("x1", &muX1); muForcingX2.DefineVar("x2", &muX2); muForcingX2.DefineVar("x3", &muX3);
+        muForcingX3.DefineVar("x1", &muX1); muForcingX3.DefineVar("x2", &muX2); muForcingX3.DefineVar("x3", &muX3);
+
+        muDeltaT = deltaT;
+
+        muForcingX1.DefineVar("dt", &muDeltaT);
+        muForcingX2.DefineVar("dt", &muDeltaT);
+        muForcingX3.DefineVar("dt", &muDeltaT);
+
+        muNu = (1.0 / 3.0) * (1.0 / collFactor - 1.0 / 2.0);
+
+        muForcingX1.DefineVar("nu", &muNu);
+        muForcingX2.DefineVar("nu", &muNu);
+        muForcingX3.DefineVar("nu", &muNu);
+    }
+    /////////////////////////////////////
+
+    localDistributions = dynamic_pointer_cast<D3Q27EsoTwist3DSplittedVector>(dataSet->getFdistributions())->getLocalDistributions();
+    nonLocalDistributions = dynamic_pointer_cast<D3Q27EsoTwist3DSplittedVector>(dataSet->getFdistributions())->getNonLocalDistributions();
+    restDistributions = dynamic_pointer_cast<D3Q27EsoTwist3DSplittedVector>(dataSet->getFdistributions())->getZeroDistributions();
+
+    SPtr<BCArray3D> bcArray = this->getBCProcessor()->getBCArray();
+
+    const int bcArrayMaxX1 = (int)bcArray->getNX1();
+    const int bcArrayMaxX2 = (int)bcArray->getNX2();
+    const int bcArrayMaxX3 = (int)bcArray->getNX3();
+
+    int minX1 = ghostLayerWidth;
+    int minX2 = ghostLayerWidth;
+    int minX3 = ghostLayerWidth;
+    int maxX1 = bcArrayMaxX1 - ghostLayerWidth;
+    int maxX2 = bcArrayMaxX2 - ghostLayerWidth;
+    int maxX3 = bcArrayMaxX3 - ghostLayerWidth;
+
+    LBMReal omega = collFactor;
+
+    for (int x3 = minX3; x3 < maxX3; x3++)
+    {
+        for (int x2 = minX2; x2 < maxX2; x2++)
+        {
+            for (int x1 = minX1; x1 < maxX1; x1++)
             {
-               int x1p = x1 + 1;
-               int x2p = x2 + 1;
-               int x3p = x3 + 1;
-               //////////////////////////////////////////////////////////////////////////
-               //////////////////////////////////////////////////////////////////////////
-			      //! - Read distributions: style of reading and writing the distributions from/to stored arrays dependent on timestep is based on the esoteric twist algorithm
-			      //! <a href="https://doi.org/10.3390/computation5020019"><b>[ M. Geier et al. (2017), DOI:10.3390/computation5020019 ]</b></a>
-			      //!
-               ////////////////////////////////////////////////////////////////////////////
-               //////////////////////////////////////////////////////////////////////////
-
-               //E   N  T
-               //c   c  c
-               //////////
-               //W   S  B
-               //a   a  a
-
-               //Rest is b
-
-               //mfxyz
-               //a - negative
-               //b - null
-               //c - positive
-
-               // a b c
-               //-1 0 1
-
-               LBMReal mfcbb = (*this->localDistributions)(D3Q27System::ET_E, x1, x2, x3);
-               LBMReal mfbcb = (*this->localDistributions)(D3Q27System::ET_N, x1, x2, x3);
-               LBMReal mfbbc = (*this->localDistributions)(D3Q27System::ET_T, x1, x2, x3);
-               LBMReal mfccb = (*this->localDistributions)(D3Q27System::ET_NE, x1, x2, x3);
-               LBMReal mfacb = (*this->localDistributions)(D3Q27System::ET_NW, x1p, x2, x3);
-               LBMReal mfcbc = (*this->localDistributions)(D3Q27System::ET_TE, x1, x2, x3);
-               LBMReal mfabc = (*this->localDistributions)(D3Q27System::ET_TW, x1p, x2, x3);
-               LBMReal mfbcc = (*this->localDistributions)(D3Q27System::ET_TN, x1, x2, x3);
-               LBMReal mfbac = (*this->localDistributions)(D3Q27System::ET_TS, x1, x2p, x3);
-               LBMReal mfccc = (*this->localDistributions)(D3Q27System::ET_TNE, x1, x2, x3);
-               LBMReal mfacc = (*this->localDistributions)(D3Q27System::ET_TNW, x1p, x2, x3);
-               LBMReal mfcac = (*this->localDistributions)(D3Q27System::ET_TSE, x1, x2p, x3);
-               LBMReal mfaac = (*this->localDistributions)(D3Q27System::ET_TSW, x1p, x2p, x3);
-
-               LBMReal mfabb = (*this->nonLocalDistributions)(D3Q27System::ET_W, x1p, x2, x3);
-               LBMReal mfbab = (*this->nonLocalDistributions)(D3Q27System::ET_S, x1, x2p, x3);
-               LBMReal mfbba = (*this->nonLocalDistributions)(D3Q27System::ET_B, x1, x2, x3p);
-               LBMReal mfaab = (*this->nonLocalDistributions)(D3Q27System::ET_SW, x1p, x2p, x3);
-               LBMReal mfcab = (*this->nonLocalDistributions)(D3Q27System::ET_SE, x1, x2p, x3);
-               LBMReal mfaba = (*this->nonLocalDistributions)(D3Q27System::ET_BW, x1p, x2, x3p);
-               LBMReal mfcba = (*this->nonLocalDistributions)(D3Q27System::ET_BE, x1, x2, x3p);
-               LBMReal mfbaa = (*this->nonLocalDistributions)(D3Q27System::ET_BS, x1, x2p, x3p);
-               LBMReal mfbca = (*this->nonLocalDistributions)(D3Q27System::ET_BN, x1, x2, x3p);
-               LBMReal mfaaa = (*this->nonLocalDistributions)(D3Q27System::ET_BSW, x1p, x2p, x3p);
-               LBMReal mfcaa = (*this->nonLocalDistributions)(D3Q27System::ET_BSE, x1, x2p, x3p);
-               LBMReal mfaca = (*this->nonLocalDistributions)(D3Q27System::ET_BNW, x1p, x2, x3p);
-               LBMReal mfcca = (*this->nonLocalDistributions)(D3Q27System::ET_BNE, x1, x2, x3p);
-
-               LBMReal mfbbb = (*this->restDistributions)(x1, x2, x3);
-
-               ////////////////////////////////////////////////////////////////////////////////////
-               //! - Calculate density and velocity using pyramid summation for low round-off errors as in Eq. (J1)-(J3)
-			      //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015), DOI:10.1016/j.camwa.2015.05.001 ]</b></a>
-			      //!
-               LBMReal drho = ((((mfccc + mfaaa) + (mfaca + mfcac)) + ((mfacc + mfcaa) + (mfaac + mfcca))) +
-                  (((mfbac + mfbca) + (mfbaa + mfbcc)) + ((mfabc + mfcba) + (mfaba + mfcbc)) + ((mfacb + mfcab) + (mfaab + mfccb))) +
-                  ((mfabb + mfcbb) + (mfbab + mfbcb)) + (mfbba + mfbbc)) + mfbbb;
-
-               LBMReal rho = c1 + drho;
-               LBMReal OOrho = c1 / rho;
-               ////////////////////////////////////////////////////////////////////////////////////
-               LBMReal vvx = ((((mfccc - mfaaa) + (mfcac - mfaca)) + ((mfcaa - mfacc) + (mfcca - mfaac))) +
-                  (((mfcba - mfabc) + (mfcbc - mfaba)) + ((mfcab - mfacb) + (mfccb - mfaab))) +
-                  (mfcbb - mfabb)) / rho;
-               LBMReal vvy = ((((mfccc - mfaaa) + (mfaca - mfcac)) + ((mfacc - mfcaa) + (mfcca - mfaac))) +
-                  (((mfbca - mfbac) + (mfbcc - mfbaa)) + ((mfacb - mfcab) + (mfccb - mfaab))) +
-                  (mfbcb - mfbab)) / rho;
-               LBMReal vvz = ((((mfccc - mfaaa) + (mfcac - mfaca)) + ((mfacc - mfcaa) + (mfaac - mfcca))) +
-                  (((mfbac - mfbca) + (mfbcc - mfbaa)) + ((mfabc - mfcba) + (mfcbc - mfaba))) +
-                  (mfbbc - mfbba)) / rho;
-               ////////////////////////////////////////////////////////////////////////////////////
-               //forcing
-               ///////////////////////////////////////////////////////////////////////////////////////////
-               if (withForcing)
-               {
-                  muX1 = static_cast<double>(x1 - 1 + ix1 * maxX1);
-                  muX2 = static_cast<double>(x2 - 1 + ix2 * maxX2);
-                  muX3 = static_cast<double>(x3 - 1 + ix3 * maxX3);
-
-                  forcingX1 = muForcingX1.Eval();
-                  forcingX2 = muForcingX2.Eval();
-                  forcingX3 = muForcingX3.Eval();
-
-                  ////////////////////////////////////////////////////////////////////////////////////
-			         //! - Add half of the acceleration (body force) to the velocity as in Eq. (42)
-			         //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015), DOI:10.1016/j.camwa.2015.05.001 ]</b></a>
-			         //!
-                  vvx += forcingX1 * deltaT * c1o2; // X
-                  vvy += forcingX2 * deltaT * c1o2; // Y
-                  vvz += forcingX3 * deltaT * c1o2; // Z
-               }
-			      ////////////////////////////////////////////////////////////////////////////////////
-			      // calculate the square of velocities for this lattice node
-               LBMReal vx2 = vvx * vvx;
-               LBMReal vy2 = vvy * vvy;
-               LBMReal vz2 = vvz * vvz;
-			      ////////////////////////////////////////////////////////////////////////////////////
-			      //! - Set relaxation limiters for third order cumulants to default value \f$ \lambda=0.001 \f$ according to section 6 in
-			      //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
-			      //!
-               LBMReal wadjust;
-               LBMReal qudricLimitP = c1o100;
-               LBMReal qudricLimitM = c1o100;
-               LBMReal qudricLimitD = c1o100;
-			      ////////////////////////////////////////////////////////////////////////////////////
-			      //! - Chimera transform from well conditioned distributions to central moments as defined in Appendix J in
-			      //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015), DOI:10.1016/j.camwa.2015.05.001 ]</b></a>
-			      //! see also Eq. (6)-(14) in
-			      //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
-			      //!
-               ////////////////////////////////////////////////////////////////////////////////////
-               // Z - Dir
-               forwardInverseChimeraWithK(mfaaa, mfaab, mfaac, vvz, vz2, c36, c1o36);
-               forwardInverseChimeraWithK(mfaba, mfabb, mfabc, vvz, vz2, c9, c1o9);
-               forwardInverseChimeraWithK(mfaca, mfacb, mfacc, vvz, vz2, c36, c1o36);
-               forwardInverseChimeraWithK(mfbaa, mfbab, mfbac, vvz, vz2, c9, c1o9);
-               forwardInverseChimeraWithK(mfbba, mfbbb, mfbbc, vvz, vz2, c9o4, c4o9);
-               forwardInverseChimeraWithK(mfbca, mfbcb, mfbcc, vvz, vz2, c9, c1o9);
-               forwardInverseChimeraWithK(mfcaa, mfcab, mfcac, vvz, vz2, c36, c1o36);
-               forwardInverseChimeraWithK(mfcba, mfcbb, mfcbc, vvz, vz2, c9, c1o9);
-               forwardInverseChimeraWithK(mfcca, mfccb, mfccc, vvz, vz2, c36, c1o36);
-
-               ////////////////////////////////////////////////////////////////////////////////////
-               // Y - Dir
-               forwardInverseChimeraWithK(mfaaa, mfaba, mfaca, vvy, vy2, c6, c1o6);
-               forwardChimera(mfaab, mfabb, mfacb, vvy, vy2);
-               forwardInverseChimeraWithK(mfaac, mfabc, mfacc, vvy, vy2, c18, c1o18);
-               forwardInverseChimeraWithK(mfbaa, mfbba, mfbca, vvy, vy2, c3o2, c2o3);
-               forwardChimera(mfbab, mfbbb, mfbcb, vvy, vy2);
-               forwardInverseChimeraWithK(mfbac, mfbbc, mfbcc, vvy, vy2, c9o2, c2o9);
-               forwardInverseChimeraWithK(mfcaa, mfcba, mfcca, vvy, vy2, c6, c1o6);
-               forwardChimera(mfcab, mfcbb, mfccb, vvy, vy2);
-               forwardInverseChimeraWithK(mfcac, mfcbc, mfccc, vvy, vy2, c18, c1o18);
-
-               ////////////////////////////////////////////////////////////////////////////////////
-               // X - Dir
-               forwardInverseChimeraWithK(mfaaa, mfbaa, mfcaa, vvx, vx2, c1, c1);
-               forwardChimera(mfaba, mfbba, mfcba, vvx, vx2);
-               forwardInverseChimeraWithK(mfaca, mfbca, mfcca, vvx, vx2, c3, c1o3);
-               forwardChimera(mfaab, mfbab, mfcab, vvx, vx2);
-               forwardChimera(mfabb, mfbbb, mfcbb, vvx, vx2);
-               forwardChimera(mfacb, mfbcb, mfccb, vvx, vx2);
-               forwardInverseChimeraWithK(mfaac, mfbac, mfcac, vvx, vx2, c3, c1o3);
-               forwardChimera(mfabc, mfbbc, mfcbc, vvx, vx2);
-               forwardInverseChimeraWithK(mfacc, mfbcc, mfccc, vvx, vx2, c9, c1o9);
-
-			      ////////////////////////////////////////////////////////////////////////////////////
-			      //! - Setting relaxation rates for non-hydrodynamic cumulants (default values). Variable names and equations according to
-			      //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
-			      //!  => [NAME IN PAPER]=[NAME IN CODE]=[DEFAULT VALUE].
-			      //!  - Trace of second order cumulants \f$ C_{200}+C_{020}+C_{002} \f$ used to adjust bulk viscosity:\f$\omega_2=OxxPyyPzz=1.0 \f$.
-			      //!  - Third order cumulants \f$ C_{120}+C_{102} \f$, \f$ C_{210}+C_{012} \f$, \f$ C_{201}+C_{021} \f$: \f$\omega_3=OxyyPxzz\f$ set according to Eq. (111) with simplifications assuming \f$\omega_2=1.0\f$.
-			      //!  - Third order cumulants \f$ C_{120}-C_{102} \f$, \f$ C_{210}-C_{012} \f$, \f$ C_{201}-C_{021} \f$: \f$\omega_4 = OxyyMxzz\f$ set according to Eq. (112) with simplifications assuming \f$\omega_2 = 1.0\f$.
-			      //!  - Third order cumulants \f$ C_{111} \f$: \f$\omega_5 = Oxyz\f$ set according to Eq. (113) with simplifications assuming \f$\omega_2 = 1.0\f$  (modify for different bulk viscosity).
-			      //!  - Fourth order cumulants \f$ C_{220} \f$, \f$ C_{202} \f$, \f$ C_{022} \f$, \f$ C_{211} \f$, \f$ C_{121} \f$, \f$ C_{112} \f$: for simplification all set to the same default value \f$ \omega_6=\omega_7=\omega_8=O4=1.0 \f$.
-			      //!  - Fifth order cumulants \f$ C_{221}\f$, \f$C_{212}\f$, \f$C_{122}\f$: \f$\omega_9=O5=1.0\f$.
-			      //!  - Sixth order cumulant \f$ C_{222}\f$: \f$\omega_{10}=O6=1.0\f$.
-			      //!
-			      ////////////////////////////////////////////////////////////
-			      //2.
-			      LBMReal OxxPyyPzz = c1;
-			      ////////////////////////////////////////////////////////////
-			      //3.
-			      LBMReal OxyyPxzz = c8  * (-c2 + omega) * ( c1 + c2*omega) / (-c8 - c14*omega + c7*omega*omega);
-			      LBMReal OxyyMxzz = c8  * (-c2 + omega) * (-c7 + c4*omega) / (c56 - c50*omega + c9*omega*omega);
-			      LBMReal Oxyz     = c24 * (-c2 + omega) * (-c2 - c7*omega + c3*omega*omega) / (c48 + c152*omega - c130*omega*omega + c29*omega*omega*omega);
-			      ////////////////////////////////////////////////////////////
-			      //4.
-			      LBMReal O4 = c1;
-			      ////////////////////////////////////////////////////////////
-			      //5.
-			      LBMReal O5 = c1;
-			      ////////////////////////////////////////////////////////////
-			      //6.
-			      LBMReal O6 = c1;
-
-			      ////////////////////////////////////////////////////////////////////////////////////
-			      //! - A and B: parameters for fourth order convergence of the diffusion term according to Eq. (114) and (115)
-			      //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
-			      //! with simplifications assuming \f$\omega_2 = 1.0\f$ (modify for different bulk viscosity).
-			      //!
-			      LBMReal A = (c4 + c2*omega - c3*omega*omega) / (c2 - c7*omega + c5*omega*omega);
-			      LBMReal B = (c4 + c28*omega - c14*omega*omega) / (c6 - c21*omega + c15*omega*omega);
-
-			      ////////////////////////////////////////////////////////////////////////////////////
-			      //! - Compute cumulants from central moments according to Eq. (20)-(23) in
-			      //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
-			      //!
-			      ////////////////////////////////////////////////////////////
-               //4.
-               LBMReal CUMcbb = mfcbb - ((mfcaa + c1o3) * mfabb + c2 * mfbba * mfbab) * OOrho;
-               LBMReal CUMbcb = mfbcb - ((mfaca + c1o3) * mfbab + c2 * mfbba * mfabb) * OOrho;
-               LBMReal CUMbbc = mfbbc - ((mfaac + c1o3) * mfbba + c2 * mfbab * mfabb) * OOrho;
-
-               LBMReal CUMcca = mfcca - (((mfcaa * mfaca + c2 * mfbba * mfbba) + c1o3 * (mfcaa + mfaca)) * OOrho - c1o9 * (drho * OOrho));
-               LBMReal CUMcac = mfcac - (((mfcaa * mfaac + c2 * mfbab * mfbab) + c1o3 * (mfcaa + mfaac)) * OOrho - c1o9 * (drho * OOrho));
-               LBMReal CUMacc = mfacc - (((mfaac * mfaca + c2 * mfabb * mfabb) + c1o3 * (mfaac + mfaca)) * OOrho - c1o9 * (drho * OOrho));
-               ////////////////////////////////////////////////////////////
-               //5.
-               LBMReal CUMbcc = mfbcc - ((mfaac * mfbca + mfaca * mfbac + c4 * mfabb * mfbbb + c2 * (mfbab * mfacb + mfbba * mfabc)) + c1o3 * (mfbca + mfbac)) * OOrho;
-               LBMReal CUMcbc = mfcbc - ((mfaac * mfcba + mfcaa * mfabc + c4 * mfbab * mfbbb + c2 * (mfabb * mfcab + mfbba * mfbac)) + c1o3 * (mfcba + mfabc)) * OOrho;
-               LBMReal CUMccb = mfccb - ((mfcaa * mfacb + mfaca * mfcab + c4 * mfbba * mfbbb + c2 * (mfbab * mfbca + mfabb * mfcba)) + c1o3 * (mfacb + mfcab)) * OOrho;
-               ////////////////////////////////////////////////////////////
-               //6.
-               LBMReal CUMccc = mfccc + ((-c4 * mfbbb * mfbbb
-                     - (mfcaa * mfacc + mfaca * mfcac + mfaac * mfcca)
-                     - c4 * (mfabb * mfcbb + mfbab * mfbcb + mfbba * mfbbc)
-                     - c2 * (mfbca * mfbac + mfcba * mfabc + mfcab * mfacb)) * OOrho
-                     + (c4 * (mfbab * mfbab * mfaca + mfabb * mfabb * mfcaa + mfbba * mfbba * mfaac)
-                     + c2 * (mfcaa * mfaca * mfaac)
-                     + c16 * mfbba * mfbab * mfabb) * OOrho * OOrho
-                     - c1o3 * (mfacc + mfcac + mfcca) * OOrho
-                     - c1o9 * (mfcaa + mfaca + mfaac) * OOrho
-                     + (c2 * (mfbab * mfbab + mfabb * mfabb + mfbba * mfbba)
-                     + (mfaac * mfaca + mfaac * mfcaa + mfaca * mfcaa) + c1o3 * (mfaac + mfaca + mfcaa)) * OOrho * OOrho * c2o3
-                     + c1o27 * ((drho * drho - drho) * OOrho * OOrho));
-
-			      ////////////////////////////////////////////////////////////////////////////////////
-			      //! - Compute linear combinations of second and third order cumulants
-			      //!
-			      ////////////////////////////////////////////////////////////
-               //2.
-               LBMReal mxxPyyPzz = mfcaa + mfaca + mfaac;
-               LBMReal mxxMyy = mfcaa - mfaca;
-               LBMReal mxxMzz = mfcaa - mfaac;
-			      ////////////////////////////////////////////////////////////
-			      //3.
-               LBMReal mxxyPyzz = mfcba + mfabc;
-               LBMReal mxxyMyzz = mfcba - mfabc;
-
-               LBMReal mxxzPyyz = mfcab + mfacb;
-               LBMReal mxxzMyyz = mfcab - mfacb;
-
-               LBMReal mxyyPxzz = mfbca + mfbac;
-               LBMReal mxyyMxzz = mfbca - mfbac;
-
-			      ////////////////////////////////////////////////////////////////////////////////////
-			      //incl. correction
-			      ////////////////////////////////////////////////////////////
-			      //! - Compute velocity  gradients from second order cumulants according to Eq. (27)-(32)
-			      //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
-			      //! Further explanations of the correction in viscosity in Appendix H of
-			      //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015), DOI:10.1016/j.camwa.2015.05.001 ]</b></a>
-			      //! Note that the division by rho is omitted here as we need rho times the gradients later.
-			      //!
-               LBMReal Dxy = -c3 * omega * mfbba;
-               LBMReal Dxz = -c3 * omega * mfbab;
-               LBMReal Dyz = -c3 * omega * mfabb;
-               LBMReal dxux = c1o2 * (-omega) * (mxxMyy + mxxMzz) + c1o2 * OxxPyyPzz * (mfaaa - mxxPyyPzz);
-               LBMReal dyuy = dxux + omega * c3o2 * mxxMyy;
-               LBMReal dzuz = dxux + omega * c3o2 * mxxMzz;
-			      ////////////////////////////////////////////////////////////
-			      //! - Relaxation of second order cumulants with correction terms according to Eq. (33)-(35) in
-			      //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
-			      //!
-               mxxPyyPzz += OxxPyyPzz * (mfaaa - mxxPyyPzz) - c3 * (c1 - c1o2 * OxxPyyPzz) * (vx2 * dxux + vy2 * dyuy + vz2 * dzuz);
-               mxxMyy += omega * (-mxxMyy) - c3 * (c1 + c1o2 * (-omega)) * (vx2 * dxux - vy2 * dyuy);
-               mxxMzz += omega * (-mxxMzz) - c3 * (c1 + c1o2 * (-omega)) * (vx2 * dxux - vz2 * dzuz);
-
-               /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-               ////no correction
-               //mxxPyyPzz += OxxPyyPzz*(mfaaa - mxxPyyPzz);
-               //mxxMyy += -(-omega) * (-mxxMyy);
-               //mxxMzz += -(-omega) * (-mxxMzz);
-               /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-               mfabb += omega * (-mfabb);
-               mfbab += omega * (-mfbab);
-               mfbba += omega * (-mfbba);
-
-			      ////////////////////////////////////////////////////////////////////////////////////
-			      //relax
-			      //////////////////////////////////////////////////////////////////////////
-			      // incl. limiter
-			      //! - Relaxation of third order cumulants including limiter according to Eq. (116)-(123)
-			      //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
-			      //!
-               wadjust = Oxyz + (c1 - Oxyz) * abs(mfbbb) / (abs(mfbbb) + qudricLimitD);
-               mfbbb += wadjust * (-mfbbb);
-               wadjust = OxyyPxzz + (c1 - OxyyPxzz) * abs(mxxyPyzz) / (abs(mxxyPyzz) + qudricLimitP);
-               mxxyPyzz += wadjust * (-mxxyPyzz);
-               wadjust = OxyyMxzz + (c1 - OxyyMxzz) * abs(mxxyMyzz) / (abs(mxxyMyzz) + qudricLimitM);
-               mxxyMyzz += wadjust * (-mxxyMyzz);
-               wadjust = OxyyPxzz + (c1 - OxyyPxzz) * abs(mxxzPyyz) / (abs(mxxzPyyz) + qudricLimitP);
-               mxxzPyyz += wadjust * (-mxxzPyyz);
-               wadjust = OxyyMxzz + (c1 - OxyyMxzz) * abs(mxxzMyyz) / (abs(mxxzMyyz) + qudricLimitM);
-               mxxzMyyz += wadjust * (-mxxzMyyz);
-               wadjust = OxyyPxzz + (c1 - OxyyPxzz) * abs(mxyyPxzz) / (abs(mxyyPxzz) + qudricLimitP);
-               mxyyPxzz += wadjust * (-mxyyPxzz);
-               wadjust = OxyyMxzz + (c1 - OxyyMxzz) * abs(mxyyMxzz) / (abs(mxyyMxzz) + qudricLimitM);
-               mxyyMxzz += wadjust * (-mxyyMxzz);
-               //////////////////////////////////////////////////////////////////////////
-               // no limiter
-               //mfbbb += OxyyMxzz * (-mfbbb);
-               //mxxyPyzz += OxyyPxzz * (-mxxyPyzz);
-               //mxxyMyzz += OxyyMxzz * (-mxxyMyzz);
-               //mxxzPyyz += OxyyPxzz * (-mxxzPyyz);
-               //mxxzMyyz += OxyyMxzz * (-mxxzMyyz);
-               //mxyyPxzz += OxyyPxzz * (-mxyyPxzz);
-               //mxyyMxzz += OxyyMxzz * (-mxyyMxzz);
-
-			      ////////////////////////////////////////////////////////////////////////////////////
-			      //! - Compute inverse linear combinations of second and third order cumulants
-			      //!
-               mfcaa = c1o3 * (mxxMyy + mxxMzz + mxxPyyPzz);
-               mfaca = c1o3 * (-c2 * mxxMyy + mxxMzz + mxxPyyPzz);
-               mfaac = c1o3 * (mxxMyy - c2 * mxxMzz + mxxPyyPzz);
-
-               mfcba = (mxxyMyzz + mxxyPyzz) * c1o2;
-               mfabc = (-mxxyMyzz + mxxyPyzz) * c1o2;
-               mfcab = (mxxzMyyz + mxxzPyyz) * c1o2;
-               mfacb = (-mxxzMyyz + mxxzPyyz) * c1o2;
-               mfbca = (mxyyMxzz + mxyyPxzz) * c1o2;
-               mfbac = (-mxyyMxzz + mxyyPxzz) * c1o2;
-               //////////////////////////////////////////////////////////////////////////
-
-			      //////////////////////////////////////////////////////////////////////////
-			      //4.
-			      // no limiter
-			      //! - Relax fourth order cumulants to modified equilibrium for fourth order convergence of diffusion according to Eq. (43)-(48)
-			      //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
-			      //!
-               CUMacc = -O4 * (c1 / omega - c1o2) * (dyuy + dzuz) * c2o3 * A + (c1 - O4) * (CUMacc);
-               CUMcac = -O4 * (c1 / omega - c1o2) * (dxux + dzuz) * c2o3 * A + (c1 - O4) * (CUMcac);
-               CUMcca = -O4 * (c1 / omega - c1o2) * (dyuy + dxux) * c2o3 * A + (c1 - O4) * (CUMcca);
-               CUMbbc = -O4 * (c1 / omega - c1o2) * Dxy * c1o3 * B + (c1 - O4) * (CUMbbc);
-               CUMbcb = -O4 * (c1 / omega - c1o2) * Dxz * c1o3 * B + (c1 - O4) * (CUMbcb);
-               CUMcbb = -O4 * (c1 / omega - c1o2) * Dyz * c1o3 * B + (c1 - O4) * (CUMcbb);
-
-               //////////////////////////////////////////////////////////////////////////
-               //5.
-               CUMbcc += O5 * (-CUMbcc);
-               CUMcbc += O5 * (-CUMcbc);
-               CUMccb += O5 * (-CUMccb);
-
-               //////////////////////////////////////////////////////////////////////////
-               //6.
-               CUMccc += O6 * (-CUMccc);
-
-			      ////////////////////////////////////////////////////////////////////////////////////
-			      //! - Compute central moments from post collision cumulants according to Eq. (53)-(56) in
-			      //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
-			      //!
-
-			      //////////////////////////////////////////////////////////////////////////
-               //4.
-               mfcbb = CUMcbb + c1o3 * ((c3 * mfcaa + c1) * mfabb + c6 * mfbba * mfbab) * OOrho;
-               mfbcb = CUMbcb + c1o3 * ((c3 * mfaca + c1) * mfbab + c6 * mfbba * mfabb) * OOrho;
-               mfbbc = CUMbbc + c1o3 * ((c3 * mfaac + c1) * mfbba + c6 * mfbab * mfabb) * OOrho;
-
-               mfcca = CUMcca + (((mfcaa * mfaca + c2 * mfbba * mfbba) * c9 + c3 * (mfcaa + mfaca)) * OOrho - (drho * OOrho)) * c1o9;
-               mfcac = CUMcac + (((mfcaa * mfaac + c2 * mfbab * mfbab) * c9 + c3 * (mfcaa + mfaac)) * OOrho - (drho * OOrho)) * c1o9;
-               mfacc = CUMacc + (((mfaac * mfaca + c2 * mfabb * mfabb) * c9 + c3 * (mfaac + mfaca)) * OOrho - (drho * OOrho)) * c1o9;
-
-               //////////////////////////////////////////////////////////////////////////
-               //5.
-               mfbcc = CUMbcc + c1o3 * (c3 * (mfaac * mfbca + mfaca * mfbac + c4 * mfabb * mfbbb + c2 * (mfbab * mfacb + mfbba * mfabc)) + (mfbca + mfbac)) * OOrho;
-               mfcbc = CUMcbc + c1o3 * (c3 * (mfaac * mfcba + mfcaa * mfabc + c4 * mfbab * mfbbb + c2 * (mfabb * mfcab + mfbba * mfbac)) + (mfcba + mfabc)) * OOrho;
-               mfccb = CUMccb + c1o3 * (c3 * (mfcaa * mfacb + mfaca * mfcab + c4 * mfbba * mfbbb + c2 * (mfbab * mfbca + mfabb * mfcba)) + (mfacb + mfcab)) * OOrho;
-
-               //////////////////////////////////////////////////////////////////////////
-               //6.
-               mfccc = CUMccc - ((-c4 * mfbbb * mfbbb
-                     - (mfcaa * mfacc + mfaca * mfcac + mfaac * mfcca)
-                     - c4 * (mfabb * mfcbb + mfbab * mfbcb + mfbba * mfbbc)
-                     - c2 * (mfbca * mfbac + mfcba * mfabc + mfcab * mfacb)) * OOrho
-                     + (c4 * (mfbab * mfbab * mfaca + mfabb * mfabb * mfcaa + mfbba * mfbba * mfaac)
-                     + c2 * (mfcaa * mfaca * mfaac)
-                     + c16 * mfbba * mfbab * mfabb) * OOrho * OOrho
-                     - c1o3 * (mfacc + mfcac + mfcca) * OOrho
-                     - c1o9 * (mfcaa + mfaca + mfaac) * OOrho
-                     + (c2 * (mfbab * mfbab + mfabb * mfabb + mfbba * mfbba)
-                     + (mfaac * mfaca + mfaac * mfcaa + mfaca * mfcaa) + c1o3 * (mfaac + mfaca + mfcaa)) * OOrho * OOrho * c2o3
-                     + c1o27 * ((drho * drho - drho) * OOrho * OOrho));
-
-
-			      ////////////////////////////////////////////////////////////////////////////////////
-			      //! -  Add acceleration (body force) to first order cumulants according to Eq. (85)-(87) in
-			      //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015), DOI:10.1016/j.camwa.2015.05.001 ]</b></a>
-			      //!
-               mfbaa = -mfbaa;
-               mfaba = -mfaba;
-               mfaab = -mfaab;
-               ////////////////////////////////////////////////////////////////////////////////////
-
-			      ////////////////////////////////////////////////////////////////////////////////////
-			      //! - Chimera transform from central moments to well conditioned distributions as defined in Appendix J in
-			      //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015), DOI:10.1016/j.camwa.2015.05.001 ]</b></a>
-			      //! see also Eq. (88)-(96) in
-			      //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
-			      //!
-               ////////////////////////////////////////////////////////////////////////////////////
-               // X - Dir
-               backwardInverseChimeraWithK(mfaaa, mfbaa, mfcaa, vvx, vx2, c1, c1);
-               backwardChimera(mfaba, mfbba, mfcba, vvx, vx2);
-               backwardInverseChimeraWithK(mfaca, mfbca, mfcca, vvx, vx2, c3, c1o3);
-               backwardChimera(mfaab, mfbab, mfcab, vvx, vx2);
-               backwardChimera(mfabb, mfbbb, mfcbb, vvx, vx2);
-               backwardChimera(mfacb, mfbcb, mfccb, vvx, vx2);
-               backwardInverseChimeraWithK(mfaac, mfbac, mfcac, vvx, vx2, c3, c1o3);
-               backwardChimera(mfabc, mfbbc, mfcbc, vvx, vx2);
-               backwardInverseChimeraWithK(mfacc, mfbcc, mfccc, vvx, vx2, c9, c1o9);
-
-               ////////////////////////////////////////////////////////////////////////////////////
-               // Y - Dir
-               backwardInverseChimeraWithK(mfaaa, mfaba, mfaca, vvy, vy2, c6, c1o6);
-               backwardChimera(mfaab, mfabb, mfacb, vvy, vy2);
-               backwardInverseChimeraWithK(mfaac, mfabc, mfacc, vvy, vy2, c18, c1o18);
-               backwardInverseChimeraWithK(mfbaa, mfbba, mfbca, vvy, vy2, c3o2, c2o3);
-               backwardChimera(mfbab, mfbbb, mfbcb, vvy, vy2);
-               backwardInverseChimeraWithK(mfbac, mfbbc, mfbcc, vvy, vy2, c9o2, c2o9);
-               backwardInverseChimeraWithK(mfcaa, mfcba, mfcca, vvy, vy2, c6, c1o6);
-               backwardChimera(mfcab, mfcbb, mfccb, vvy, vy2);
-               backwardInverseChimeraWithK(mfcac, mfcbc, mfccc, vvy, vy2, c18, c1o18);
-
-               ////////////////////////////////////////////////////////////////////////////////////
-               // Z - Dir
-               backwardInverseChimeraWithK(mfaaa, mfaab, mfaac, vvz, vz2, c36, c1o36);
-               backwardInverseChimeraWithK(mfaba, mfabb, mfabc, vvz, vz2, c9, c1o9);
-               backwardInverseChimeraWithK(mfaca, mfacb, mfacc, vvz, vz2, c36, c1o36);
-               backwardInverseChimeraWithK(mfbaa, mfbab, mfbac, vvz, vz2, c9, c1o9);
-               backwardInverseChimeraWithK(mfbba, mfbbb, mfbbc, vvz, vz2, c9o4, c4o9);
-               backwardInverseChimeraWithK(mfbca, mfbcb, mfbcc, vvz, vz2, c9, c1o9);
-               backwardInverseChimeraWithK(mfcaa, mfcab, mfcac, vvz, vz2, c36, c1o36);
-               backwardInverseChimeraWithK(mfcba, mfcbb, mfcbc, vvz, vz2, c9, c1o9);
-               backwardInverseChimeraWithK(mfcca, mfccb, mfccc, vvz, vz2, c36, c1o36);
-               ////////////////////////////////////////////////////////////////////////////////////
-
-               //////////////////////////////////////////////////////////////////////////
-               //proof correctness
-               //////////////////////////////////////////////////////////////////////////
+                if (!bcArray->isSolid(x1, x2, x3) && !bcArray->isUndefined(x1, x2, x3))
+                {
+                    int x1p = x1 + 1;
+                    int x2p = x2 + 1;
+                    int x3p = x3 + 1;
+                    //////////////////////////////////////////////////////////////////////////
+                    //////////////////////////////////////////////////////////////////////////
+                    //! - Read distributions: style of reading and writing the distributions from/to stored arrays dependent on timestep is based on the esoteric twist algorithm
+                    //! <a href="https://doi.org/10.3390/computation5020019"><b>[ M. Geier et al. (2017), DOI:10.3390/computation5020019 ]</b></a>
+                    //!
+                    ////////////////////////////////////////////////////////////////////////////
+                    //////////////////////////////////////////////////////////////////////////
+
+                    //E   N  T
+                    //c   c  c
+                    //////////
+                    //W   S  B
+                    //a   a  a
+
+                    //Rest is b
+
+                    //mfxyz
+                    //a - negative
+                    //b - null
+                    //c - positive
+
+                    // a b c
+                    //-1 0 1
+
+                    LBMReal mfcbb = (*this->localDistributions)(D3Q27System::ET_E, x1, x2, x3);
+                    LBMReal mfbcb = (*this->localDistributions)(D3Q27System::ET_N, x1, x2, x3);
+                    LBMReal mfbbc = (*this->localDistributions)(D3Q27System::ET_T, x1, x2, x3);
+                    LBMReal mfccb = (*this->localDistributions)(D3Q27System::ET_NE, x1, x2, x3);
+                    LBMReal mfacb = (*this->localDistributions)(D3Q27System::ET_NW, x1p, x2, x3);
+                    LBMReal mfcbc = (*this->localDistributions)(D3Q27System::ET_TE, x1, x2, x3);
+                    LBMReal mfabc = (*this->localDistributions)(D3Q27System::ET_TW, x1p, x2, x3);
+                    LBMReal mfbcc = (*this->localDistributions)(D3Q27System::ET_TN, x1, x2, x3);
+                    LBMReal mfbac = (*this->localDistributions)(D3Q27System::ET_TS, x1, x2p, x3);
+                    LBMReal mfccc = (*this->localDistributions)(D3Q27System::ET_TNE, x1, x2, x3);
+                    LBMReal mfacc = (*this->localDistributions)(D3Q27System::ET_TNW, x1p, x2, x3);
+                    LBMReal mfcac = (*this->localDistributions)(D3Q27System::ET_TSE, x1, x2p, x3);
+                    LBMReal mfaac = (*this->localDistributions)(D3Q27System::ET_TSW, x1p, x2p, x3);
+
+                    LBMReal mfabb = (*this->nonLocalDistributions)(D3Q27System::ET_W, x1p, x2, x3);
+                    LBMReal mfbab = (*this->nonLocalDistributions)(D3Q27System::ET_S, x1, x2p, x3);
+                    LBMReal mfbba = (*this->nonLocalDistributions)(D3Q27System::ET_B, x1, x2, x3p);
+                    LBMReal mfaab = (*this->nonLocalDistributions)(D3Q27System::ET_SW, x1p, x2p, x3);
+                    LBMReal mfcab = (*this->nonLocalDistributions)(D3Q27System::ET_SE, x1, x2p, x3);
+                    LBMReal mfaba = (*this->nonLocalDistributions)(D3Q27System::ET_BW, x1p, x2, x3p);
+                    LBMReal mfcba = (*this->nonLocalDistributions)(D3Q27System::ET_BE, x1, x2, x3p);
+                    LBMReal mfbaa = (*this->nonLocalDistributions)(D3Q27System::ET_BS, x1, x2p, x3p);
+                    LBMReal mfbca = (*this->nonLocalDistributions)(D3Q27System::ET_BN, x1, x2, x3p);
+                    LBMReal mfaaa = (*this->nonLocalDistributions)(D3Q27System::ET_BSW, x1p, x2p, x3p);
+                    LBMReal mfcaa = (*this->nonLocalDistributions)(D3Q27System::ET_BSE, x1, x2p, x3p);
+                    LBMReal mfaca = (*this->nonLocalDistributions)(D3Q27System::ET_BNW, x1p, x2, x3p);
+                    LBMReal mfcca = (*this->nonLocalDistributions)(D3Q27System::ET_BNE, x1, x2, x3p);
+
+                    LBMReal mfbbb = (*this->restDistributions)(x1, x2, x3);
+
+                    ////////////////////////////////////////////////////////////////////////////////////
+                    //! - Calculate density and velocity using pyramid summation for low round-off errors as in Eq. (J1)-(J3)
+                    //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015), DOI:10.1016/j.camwa.2015.05.001 ]</b></a>
+                    //!
+                    LBMReal drho = ((((mfccc + mfaaa) + (mfaca + mfcac)) + ((mfacc + mfcaa) + (mfaac + mfcca))) +
+                                    (((mfbac + mfbca) + (mfbaa + mfbcc)) + ((mfabc + mfcba) + (mfaba + mfcbc)) + ((mfacb + mfcab) + (mfaab + mfccb))) +
+                                    ((mfabb + mfcbb) + (mfbab + mfbcb)) + (mfbba + mfbbc)) + mfbbb;
+
+                    LBMReal rho = c1 + drho;
+                    LBMReal OOrho = c1 / rho;
+                    ////////////////////////////////////////////////////////////////////////////////////
+                    LBMReal vvx = ((((mfccc - mfaaa) + (mfcac - mfaca)) + ((mfcaa - mfacc) + (mfcca - mfaac))) +
+                                   (((mfcba - mfabc) + (mfcbc - mfaba)) + ((mfcab - mfacb) + (mfccb - mfaab))) +
+                                   (mfcbb - mfabb)) / rho;
+                    LBMReal vvy = ((((mfccc - mfaaa) + (mfaca - mfcac)) + ((mfacc - mfcaa) + (mfcca - mfaac))) +
+                                   (((mfbca - mfbac) + (mfbcc - mfbaa)) + ((mfacb - mfcab) + (mfccb - mfaab))) +
+                                   (mfbcb - mfbab)) / rho;
+                    LBMReal vvz = ((((mfccc - mfaaa) + (mfcac - mfaca)) + ((mfacc - mfcaa) + (mfaac - mfcca))) +
+                                   (((mfbac - mfbca) + (mfbcc - mfbaa)) + ((mfabc - mfcba) + (mfcbc - mfaba))) +
+                                   (mfbbc - mfbba)) / rho;
+                    ////////////////////////////////////////////////////////////////////////////////////
+                    //forcing
+                    ///////////////////////////////////////////////////////////////////////////////////////////
+                    if (withForcing)
+                    {
+                        muX1 = static_cast<double>(x1 - 1 + ix1 * maxX1);
+                        muX2 = static_cast<double>(x2 - 1 + ix2 * maxX2);
+                        muX3 = static_cast<double>(x3 - 1 + ix3 * maxX3);
+
+                        forcingX1 = muForcingX1.Eval();
+                        forcingX2 = muForcingX2.Eval();
+                        forcingX3 = muForcingX3.Eval();
+
+                        ////////////////////////////////////////////////////////////////////////////////////
+                        //! - Add half of the acceleration (body force) to the velocity as in Eq. (42)
+                        //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015), DOI:10.1016/j.camwa.2015.05.001 ]</b></a>
+                        //!
+                        vvx += forcingX1 * deltaT * c1o2; // X
+                        vvy += forcingX2 * deltaT * c1o2; // Y
+                        vvz += forcingX3 * deltaT * c1o2; // Z
+                    }
+                    ////////////////////////////////////////////////////////////////////////////////////
+                    // calculate the square of velocities for this lattice node
+                    LBMReal vx2 = vvx * vvx;
+                    LBMReal vy2 = vvy * vvy;
+                    LBMReal vz2 = vvz * vvz;
+                    ////////////////////////////////////////////////////////////////////////////////////
+                    //! - Set relaxation limiters for third order cumulants to c1o100 (\f$ \lambda=0.01 \f$); for the limiter see section 6 in
+                    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
+                    //!
+                    LBMReal wadjust;
+                    LBMReal qudricLimitP = c1o100;
+                    LBMReal qudricLimitM = c1o100;
+                    LBMReal qudricLimitD = c1o100;
+                    ////////////////////////////////////////////////////////////////////////////////////
+                    //! - Chimera transform from well conditioned distributions to central moments as defined in Appendix J in
+                    //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015), DOI:10.1016/j.camwa.2015.05.001 ]</b></a>
+                    //! see also Eq. (6)-(14) in
+                    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
+                    //!
+                    ////////////////////////////////////////////////////////////////////////////////////
+                    // Z - Dir
+                    forwardInverseChimeraWithK(mfaaa, mfaab, mfaac, vvz, vz2, c36, c1o36);
+                    forwardInverseChimeraWithK(mfaba, mfabb, mfabc, vvz, vz2, c9, c1o9);
+                    forwardInverseChimeraWithK(mfaca, mfacb, mfacc, vvz, vz2, c36, c1o36);
+                    forwardInverseChimeraWithK(mfbaa, mfbab, mfbac, vvz, vz2, c9, c1o9);
+                    forwardInverseChimeraWithK(mfbba, mfbbb, mfbbc, vvz, vz2, c9o4, c4o9);
+                    forwardInverseChimeraWithK(mfbca, mfbcb, mfbcc, vvz, vz2, c9, c1o9);
+                    forwardInverseChimeraWithK(mfcaa, mfcab, mfcac, vvz, vz2, c36, c1o36);
+                    forwardInverseChimeraWithK(mfcba, mfcbb, mfcbc, vvz, vz2, c9, c1o9);
+                    forwardInverseChimeraWithK(mfcca, mfccb, mfccc, vvz, vz2, c36, c1o36);
+
+                    ////////////////////////////////////////////////////////////////////////////////////
+                    // Y - Dir
+                    forwardInverseChimeraWithK(mfaaa, mfaba, mfaca, vvy, vy2, c6, c1o6);
+                    forwardChimera(mfaab, mfabb, mfacb, vvy, vy2);
+                    forwardInverseChimeraWithK(mfaac, mfabc, mfacc, vvy, vy2, c18, c1o18);
+                    forwardInverseChimeraWithK(mfbaa, mfbba, mfbca, vvy, vy2, c3o2, c2o3);
+                    forwardChimera(mfbab, mfbbb, mfbcb, vvy, vy2);
+                    forwardInverseChimeraWithK(mfbac, mfbbc, mfbcc, vvy, vy2, c9o2, c2o9);
+                    forwardInverseChimeraWithK(mfcaa, mfcba, mfcca, vvy, vy2, c6, c1o6);
+                    forwardChimera(mfcab, mfcbb, mfccb, vvy, vy2);
+                    forwardInverseChimeraWithK(mfcac, mfcbc, mfccc, vvy, vy2, c18, c1o18);
+
+                    ////////////////////////////////////////////////////////////////////////////////////
+                    // X - Dir
+                    forwardInverseChimeraWithK(mfaaa, mfbaa, mfcaa, vvx, vx2, c1, c1);
+                    forwardChimera(mfaba, mfbba, mfcba, vvx, vx2);
+                    forwardInverseChimeraWithK(mfaca, mfbca, mfcca, vvx, vx2, c3, c1o3);
+                    forwardChimera(mfaab, mfbab, mfcab, vvx, vx2);
+                    forwardChimera(mfabb, mfbbb, mfcbb, vvx, vx2);
+                    forwardChimera(mfacb, mfbcb, mfccb, vvx, vx2);
+                    forwardInverseChimeraWithK(mfaac, mfbac, mfcac, vvx, vx2, c3, c1o3);
+                    forwardChimera(mfabc, mfbbc, mfcbc, vvx, vx2);
+                    forwardInverseChimeraWithK(mfacc, mfbcc, mfccc, vvx, vx2, c9, c1o9);
+
+                    ////////////////////////////////////////////////////////////////////////////////////
+                    //! - Setting relaxation rates for non-hydrodynamic cumulants (default values). Variable names and equations according to
+                    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
+                    //!  => [NAME IN PAPER]=[NAME IN CODE]=[DEFAULT VALUE].
+                    //!  - Trace of second order cumulants \f$ C_{200}+C_{020}+C_{002} \f$ used to adjust bulk viscosity:\f$\omega_2=OxxPyyPzz=1.0 \f$.
+                    //!  - Third order cumulants \f$ C_{120}+C_{102} \f$, \f$ C_{210}+C_{012} \f$, \f$ C_{201}+C_{021} \f$: \f$\omega_3=OxyyPxzz\f$ set according to Eq. (111) with simplifications assuming \f$\omega_2=1.0\f$.
+                    //!  - Third order cumulants \f$ C_{120}-C_{102} \f$, \f$ C_{210}-C_{012} \f$, \f$ C_{201}-C_{021} \f$: \f$\omega_4 = OxyyMxzz\f$ set according to Eq. (112) with simplifications assuming \f$\omega_2 = 1.0\f$.
+                    //!  - Third order cumulants \f$ C_{111} \f$: \f$\omega_5 = Oxyz\f$ set according to Eq. (113) with simplifications assuming \f$\omega_2 = 1.0\f$  (modify for different bulk viscosity).
+                    //!  - Fourth order cumulants \f$ C_{220} \f$, \f$ C_{202} \f$, \f$ C_{022} \f$, \f$ C_{211} \f$, \f$ C_{121} \f$, \f$ C_{112} \f$: for simplification all set to the same default value \f$ \omega_6=\omega_7=\omega_8=O4=1.0 \f$.
+                    //!  - Fifth order cumulants \f$ C_{221}\f$, \f$C_{212}\f$, \f$C_{122}\f$: \f$\omega_9=O5=1.0\f$.
+                    //!  - Sixth order cumulant \f$ C_{222}\f$: \f$\omega_{10}=O6=1.0\f$.
+                    //!
+                    ////////////////////////////////////////////////////////////
+                    //2.
+                    LBMReal OxxPyyPzz = c1;
+                    ////////////////////////////////////////////////////////////
+                    //3.
+                    LBMReal OxyyPxzz = c8  * (-c2 + omega) * ( c1 + c2*omega) / (-c8 - c14*omega + c7*omega*omega);
+                    LBMReal OxyyMxzz = c8  * (-c2 + omega) * (-c7 + c4*omega) / (c56 - c50*omega + c9*omega*omega);
+                    LBMReal Oxyz     = c24 * (-c2 + omega) * (-c2 - c7*omega + c3*omega*omega) / (c48 + c152*omega - c130*omega*omega + c29*omega*omega*omega);
+                    ////////////////////////////////////////////////////////////
+                    //4.
+                    LBMReal O4 = c1;
+                    ////////////////////////////////////////////////////////////
+                    //5.
+                    LBMReal O5 = c1;
+                    ////////////////////////////////////////////////////////////
+                    //6.
+                    LBMReal O6 = c1;
+
+                    ////////////////////////////////////////////////////////////////////////////////////
+                    //! - A and B: parameters for fourth order convergence of the diffusion term according to Eq. (114) and (115)
+                    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
+                    //! with simplifications assuming \f$\omega_2 = 1.0\f$ (modify for different bulk viscosity).
+                    //!
+                    LBMReal A = (c4 + c2*omega - c3*omega*omega) / (c2 - c7*omega + c5*omega*omega);
+                    LBMReal B = (c4 + c28*omega - c14*omega*omega) / (c6 - c21*omega + c15*omega*omega);
+
+                    ////////////////////////////////////////////////////////////////////////////////////
+                    //! - Compute cumulants from central moments according to Eq. (20)-(23) in
+                    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
+                    //!
+                    ////////////////////////////////////////////////////////////
+                    //4.
+                    LBMReal CUMcbb = mfcbb - ((mfcaa + c1o3) * mfabb + c2 * mfbba * mfbab) * OOrho;
+                    LBMReal CUMbcb = mfbcb - ((mfaca + c1o3) * mfbab + c2 * mfbba * mfabb) * OOrho;
+                    LBMReal CUMbbc = mfbbc - ((mfaac + c1o3) * mfbba + c2 * mfbab * mfabb) * OOrho;
+
+                    LBMReal CUMcca = mfcca - (((mfcaa * mfaca + c2 * mfbba * mfbba) + c1o3 * (mfcaa + mfaca)) * OOrho - c1o9 * (drho * OOrho));
+                    LBMReal CUMcac = mfcac - (((mfcaa * mfaac + c2 * mfbab * mfbab) + c1o3 * (mfcaa + mfaac)) * OOrho - c1o9 * (drho * OOrho));
+                    LBMReal CUMacc = mfacc - (((mfaac * mfaca + c2 * mfabb * mfabb) + c1o3 * (mfaac + mfaca)) * OOrho - c1o9 * (drho * OOrho));
+                    ////////////////////////////////////////////////////////////
+                    //5.
+                    LBMReal CUMbcc = mfbcc - ((mfaac * mfbca + mfaca * mfbac + c4 * mfabb * mfbbb + c2 * (mfbab * mfacb + mfbba * mfabc)) + c1o3 * (mfbca + mfbac)) * OOrho;
+                    LBMReal CUMcbc = mfcbc - ((mfaac * mfcba + mfcaa * mfabc + c4 * mfbab * mfbbb + c2 * (mfabb * mfcab + mfbba * mfbac)) + c1o3 * (mfcba + mfabc)) * OOrho;
+                    LBMReal CUMccb = mfccb - ((mfcaa * mfacb + mfaca * mfcab + c4 * mfbba * mfbbb + c2 * (mfbab * mfbca + mfabb * mfcba)) + c1o3 * (mfacb + mfcab)) * OOrho;
+                    ////////////////////////////////////////////////////////////
+                    //6.
+                    LBMReal CUMccc = mfccc + ((-c4 * mfbbb * mfbbb
+                                               - (mfcaa * mfacc + mfaca * mfcac + mfaac * mfcca)
+                                               - c4 * (mfabb * mfcbb + mfbab * mfbcb + mfbba * mfbbc)
+                                               - c2 * (mfbca * mfbac + mfcba * mfabc + mfcab * mfacb)) * OOrho
+                                              + (c4 * (mfbab * mfbab * mfaca + mfabb * mfabb * mfcaa + mfbba * mfbba * mfaac)
+                                                 + c2 * (mfcaa * mfaca * mfaac)
+                                                 + c16 * mfbba * mfbab * mfabb) * OOrho * OOrho
+                                              - c1o3 * (mfacc + mfcac + mfcca) * OOrho
+                                              - c1o9 * (mfcaa + mfaca + mfaac) * OOrho
+                                              + (c2 * (mfbab * mfbab + mfabb * mfabb + mfbba * mfbba)
+                                                 + (mfaac * mfaca + mfaac * mfcaa + mfaca * mfcaa) + c1o3 * (mfaac + mfaca + mfcaa)) * OOrho * OOrho * c2o3
+                                              + c1o27 * ((drho * drho - drho) * OOrho * OOrho));
+
+                    ////////////////////////////////////////////////////////////////////////////////////
+                    //! - Compute linear combinations of second and third order cumulants
+                    //!
+                    ////////////////////////////////////////////////////////////
+                    //2.
+                    LBMReal mxxPyyPzz = mfcaa + mfaca + mfaac;
+                    LBMReal mxxMyy = mfcaa - mfaca;
+                    LBMReal mxxMzz = mfcaa - mfaac;
+                    ////////////////////////////////////////////////////////////
+                    //3.
+                    LBMReal mxxyPyzz = mfcba + mfabc;
+                    LBMReal mxxyMyzz = mfcba - mfabc;
+
+                    LBMReal mxxzPyyz = mfcab + mfacb;
+                    LBMReal mxxzMyyz = mfcab - mfacb;
+
+                    LBMReal mxyyPxzz = mfbca + mfbac;
+                    LBMReal mxyyMxzz = mfbca - mfbac;
+
+                    ////////////////////////////////////////////////////////////////////////////////////
+                    //incl. correction
+                    ////////////////////////////////////////////////////////////
+                    //! - Compute velocity  gradients from second order cumulants according to Eq. (27)-(32)
+                    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
+                    //! Further explanations of the correction in viscosity in Appendix H of
+                    //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015), DOI:10.1016/j.camwa.2015.05.001 ]</b></a>
+                    //! Note that the division by rho is omitted here as we need rho times the gradients later.
+                    //!
+                    LBMReal Dxy = -c3 * omega * mfbba;
+                    LBMReal Dxz = -c3 * omega * mfbab;
+                    LBMReal Dyz = -c3 * omega * mfabb;
+                    LBMReal dxux = c1o2 * (-omega) * (mxxMyy + mxxMzz) + c1o2 * OxxPyyPzz * (mfaaa - mxxPyyPzz);
+                    LBMReal dyuy = dxux + omega * c3o2 * mxxMyy;
+                    LBMReal dzuz = dxux + omega * c3o2 * mxxMzz;
+                    ////////////////////////////////////////////////////////////
+                    //! - Relaxation of second order cumulants with correction terms according to Eq. (33)-(35) in
+                    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
+                    //!
+                    mxxPyyPzz += OxxPyyPzz * (mfaaa - mxxPyyPzz) - c3 * (c1 - c1o2 * OxxPyyPzz) * (vx2 * dxux + vy2 * dyuy + vz2 * dzuz);
+                    mxxMyy += omega * (-mxxMyy) - c3 * (c1 + c1o2 * (-omega)) * (vx2 * dxux - vy2 * dyuy);
+                    mxxMzz += omega * (-mxxMzz) - c3 * (c1 + c1o2 * (-omega)) * (vx2 * dxux - vz2 * dzuz);
+
+                    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+                    ////no correction
+                    //mxxPyyPzz += OxxPyyPzz*(mfaaa - mxxPyyPzz);
+                    //mxxMyy += -(-omega) * (-mxxMyy);
+                    //mxxMzz += -(-omega) * (-mxxMzz);
+                    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+                    mfabb += omega * (-mfabb);
+                    mfbab += omega * (-mfbab);
+                    mfbba += omega * (-mfbba);
+
+                    ////////////////////////////////////////////////////////////////////////////////////
+                    //relax
+                    //////////////////////////////////////////////////////////////////////////
+                    // incl. limiter
+                    //! - Relaxation of third order cumulants including limiter according to Eq. (116)-(123)
+                    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
+                    //!
+                    wadjust = Oxyz + (c1 - Oxyz) * abs(mfbbb) / (abs(mfbbb) + qudricLimitD);
+                    mfbbb += wadjust * (-mfbbb);
+                    wadjust = OxyyPxzz + (c1 - OxyyPxzz) * abs(mxxyPyzz) / (abs(mxxyPyzz) + qudricLimitP);
+                    mxxyPyzz += wadjust * (-mxxyPyzz);
+                    wadjust = OxyyMxzz + (c1 - OxyyMxzz) * abs(mxxyMyzz) / (abs(mxxyMyzz) + qudricLimitM);
+                    mxxyMyzz += wadjust * (-mxxyMyzz);
+                    wadjust = OxyyPxzz + (c1 - OxyyPxzz) * abs(mxxzPyyz) / (abs(mxxzPyyz) + qudricLimitP);
+                    mxxzPyyz += wadjust * (-mxxzPyyz);
+                    wadjust = OxyyMxzz + (c1 - OxyyMxzz) * abs(mxxzMyyz) / (abs(mxxzMyyz) + qudricLimitM);
+                    mxxzMyyz += wadjust * (-mxxzMyyz);
+                    wadjust = OxyyPxzz + (c1 - OxyyPxzz) * abs(mxyyPxzz) / (abs(mxyyPxzz) + qudricLimitP);
+                    mxyyPxzz += wadjust * (-mxyyPxzz);
+                    wadjust = OxyyMxzz + (c1 - OxyyMxzz) * abs(mxyyMxzz) / (abs(mxyyMxzz) + qudricLimitM);
+                    mxyyMxzz += wadjust * (-mxyyMxzz);
+                    //////////////////////////////////////////////////////////////////////////
+                    // no limiter
+                    //mfbbb += OxyyMxzz * (-mfbbb);
+                    //mxxyPyzz += OxyyPxzz * (-mxxyPyzz);
+                    //mxxyMyzz += OxyyMxzz * (-mxxyMyzz);
+                    //mxxzPyyz += OxyyPxzz * (-mxxzPyyz);
+                    //mxxzMyyz += OxyyMxzz * (-mxxzMyyz);
+                    //mxyyPxzz += OxyyPxzz * (-mxyyPxzz);
+                    //mxyyMxzz += OxyyMxzz * (-mxyyMxzz);
+
+                    ////////////////////////////////////////////////////////////////////////////////////
+                    //! - Compute inverse linear combinations of second and third order cumulants
+                    //!
+                    mfcaa = c1o3 * (mxxMyy + mxxMzz + mxxPyyPzz);
+                    mfaca = c1o3 * (-c2 * mxxMyy + mxxMzz + mxxPyyPzz);
+                    mfaac = c1o3 * (mxxMyy - c2 * mxxMzz + mxxPyyPzz);
+
+                    mfcba = (mxxyMyzz + mxxyPyzz) * c1o2;
+                    mfabc = (-mxxyMyzz + mxxyPyzz) * c1o2;
+                    mfcab = (mxxzMyyz + mxxzPyyz) * c1o2;
+                    mfacb = (-mxxzMyyz + mxxzPyyz) * c1o2;
+                    mfbca = (mxyyMxzz + mxyyPxzz) * c1o2;
+                    mfbac = (-mxyyMxzz + mxyyPxzz) * c1o2;
+                    //////////////////////////////////////////////////////////////////////////
+
+                    //////////////////////////////////////////////////////////////////////////
+                    //4.
+                    // no limiter
+                    //! - Relax fourth order cumulants to modified equilibrium for fourth order convergence of diffusion according to Eq. (43)-(48)
+                    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
+                    //!
+                    CUMacc = -O4 * (c1 / omega - c1o2) * (dyuy + dzuz) * c2o3 * A + (c1 - O4) * (CUMacc);
+                    CUMcac = -O4 * (c1 / omega - c1o2) * (dxux + dzuz) * c2o3 * A + (c1 - O4) * (CUMcac);
+                    CUMcca = -O4 * (c1 / omega - c1o2) * (dyuy + dxux) * c2o3 * A + (c1 - O4) * (CUMcca);
+                    CUMbbc = -O4 * (c1 / omega - c1o2) * Dxy * c1o3 * B + (c1 - O4) * (CUMbbc);
+                    CUMbcb = -O4 * (c1 / omega - c1o2) * Dxz * c1o3 * B + (c1 - O4) * (CUMbcb);
+                    CUMcbb = -O4 * (c1 / omega - c1o2) * Dyz * c1o3 * B + (c1 - O4) * (CUMcbb);
+
+                    //////////////////////////////////////////////////////////////////////////
+                    //5.
+                    CUMbcc += O5 * (-CUMbcc);
+                    CUMcbc += O5 * (-CUMcbc);
+                    CUMccb += O5 * (-CUMccb);
+
+                    //////////////////////////////////////////////////////////////////////////
+                    //6.
+                    CUMccc += O6 * (-CUMccc);
+
+                    ////////////////////////////////////////////////////////////////////////////////////
+                    //! - Compute central moments from post collision cumulants according to Eq. (53)-(56) in
+                    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
+                    //!
+
+                    //////////////////////////////////////////////////////////////////////////
+                    //4.
+                    mfcbb = CUMcbb + c1o3 * ((c3 * mfcaa + c1) * mfabb + c6 * mfbba * mfbab) * OOrho;
+                    mfbcb = CUMbcb + c1o3 * ((c3 * mfaca + c1) * mfbab + c6 * mfbba * mfabb) * OOrho;
+                    mfbbc = CUMbbc + c1o3 * ((c3 * mfaac + c1) * mfbba + c6 * mfbab * mfabb) * OOrho;
+
+                    mfcca = CUMcca + (((mfcaa * mfaca + c2 * mfbba * mfbba) * c9 + c3 * (mfcaa + mfaca)) * OOrho - (drho * OOrho)) * c1o9;
+                    mfcac = CUMcac + (((mfcaa * mfaac + c2 * mfbab * mfbab) * c9 + c3 * (mfcaa + mfaac)) * OOrho - (drho * OOrho)) * c1o9;
+                    mfacc = CUMacc + (((mfaac * mfaca + c2 * mfabb * mfabb) * c9 + c3 * (mfaac + mfaca)) * OOrho - (drho * OOrho)) * c1o9;
+
+                    //////////////////////////////////////////////////////////////////////////
+                    //5.
+                    mfbcc = CUMbcc + c1o3 * (c3 * (mfaac * mfbca + mfaca * mfbac + c4 * mfabb * mfbbb + c2 * (mfbab * mfacb + mfbba * mfabc)) + (mfbca + mfbac)) * OOrho;
+                    mfcbc = CUMcbc + c1o3 * (c3 * (mfaac * mfcba + mfcaa * mfabc + c4 * mfbab * mfbbb + c2 * (mfabb * mfcab + mfbba * mfbac)) + (mfcba + mfabc)) * OOrho;
+                    mfccb = CUMccb + c1o3 * (c3 * (mfcaa * mfacb + mfaca * mfcab + c4 * mfbba * mfbbb + c2 * (mfbab * mfbca + mfabb * mfcba)) + (mfacb + mfcab)) * OOrho;
+
+                    //////////////////////////////////////////////////////////////////////////
+                    //6.
+                    mfccc = CUMccc - ((-c4 * mfbbb * mfbbb
+                                       - (mfcaa * mfacc + mfaca * mfcac + mfaac * mfcca)
+                                       - c4 * (mfabb * mfcbb + mfbab * mfbcb + mfbba * mfbbc)
+                                       - c2 * (mfbca * mfbac + mfcba * mfabc + mfcab * mfacb)) * OOrho
+                                      + (c4 * (mfbab * mfbab * mfaca + mfabb * mfabb * mfcaa + mfbba * mfbba * mfaac)
+                                         + c2 * (mfcaa * mfaca * mfaac)
+                                         + c16 * mfbba * mfbab * mfabb) * OOrho * OOrho
+                                      - c1o3 * (mfacc + mfcac + mfcca) * OOrho
+                                      - c1o9 * (mfcaa + mfaca + mfaac) * OOrho
+                                      + (c2 * (mfbab * mfbab + mfabb * mfabb + mfbba * mfbba)
+                                         + (mfaac * mfaca + mfaac * mfcaa + mfaca * mfcaa) + c1o3 * (mfaac + mfaca + mfcaa)) * OOrho * OOrho * c2o3
+                                      + c1o27 * ((drho * drho - drho) * OOrho * OOrho));
+
+
+                    ////////////////////////////////////////////////////////////////////////////////////
+                    //! - Add acceleration (body force) to first order cumulants according to Eq. (85)-(87) in
+                    //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015), DOI:10.1016/j.camwa.2015.05.001 ]</b></a>
+                    //!
+                    mfbaa = -mfbaa;
+                    mfaba = -mfaba;
+                    mfaab = -mfaab;
+                    ////////////////////////////////////////////////////////////////////////////////////
+
+                    ////////////////////////////////////////////////////////////////////////////////////
+                    //! - Chimera transform from central moments to well conditioned distributions as defined in Appendix J in
+                    //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015), DOI:10.1016/j.camwa.2015.05.001 ]</b></a>
+                    //! see also Eq. (88)-(96) in
+                    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
+                    //!
+                    ////////////////////////////////////////////////////////////////////////////////////
+                    // X - Dir
+                    backwardInverseChimeraWithK(mfaaa, mfbaa, mfcaa, vvx, vx2, c1, c1);
+                    backwardChimera(mfaba, mfbba, mfcba, vvx, vx2);
+                    backwardInverseChimeraWithK(mfaca, mfbca, mfcca, vvx, vx2, c3, c1o3);
+                    backwardChimera(mfaab, mfbab, mfcab, vvx, vx2);
+                    backwardChimera(mfabb, mfbbb, mfcbb, vvx, vx2);
+                    backwardChimera(mfacb, mfbcb, mfccb, vvx, vx2);
+                    backwardInverseChimeraWithK(mfaac, mfbac, mfcac, vvx, vx2, c3, c1o3);
+                    backwardChimera(mfabc, mfbbc, mfcbc, vvx, vx2);
+                    backwardInverseChimeraWithK(mfacc, mfbcc, mfccc, vvx, vx2, c9, c1o9);
+
+                    ////////////////////////////////////////////////////////////////////////////////////
+                    // Y - Dir
+                    backwardInverseChimeraWithK(mfaaa, mfaba, mfaca, vvy, vy2, c6, c1o6);
+                    backwardChimera(mfaab, mfabb, mfacb, vvy, vy2);
+                    backwardInverseChimeraWithK(mfaac, mfabc, mfacc, vvy, vy2, c18, c1o18);
+                    backwardInverseChimeraWithK(mfbaa, mfbba, mfbca, vvy, vy2, c3o2, c2o3);
+                    backwardChimera(mfbab, mfbbb, mfbcb, vvy, vy2);
+                    backwardInverseChimeraWithK(mfbac, mfbbc, mfbcc, vvy, vy2, c9o2, c2o9);
+                    backwardInverseChimeraWithK(mfcaa, mfcba, mfcca, vvy, vy2, c6, c1o6);
+                    backwardChimera(mfcab, mfcbb, mfccb, vvy, vy2);
+                    backwardInverseChimeraWithK(mfcac, mfcbc, mfccc, vvy, vy2, c18, c1o18);
+
+                    ////////////////////////////////////////////////////////////////////////////////////
+                    // Z - Dir
+                    backwardInverseChimeraWithK(mfaaa, mfaab, mfaac, vvz, vz2, c36, c1o36);
+                    backwardInverseChimeraWithK(mfaba, mfabb, mfabc, vvz, vz2, c9, c1o9);
+                    backwardInverseChimeraWithK(mfaca, mfacb, mfacc, vvz, vz2, c36, c1o36);
+                    backwardInverseChimeraWithK(mfbaa, mfbab, mfbac, vvz, vz2, c9, c1o9);
+                    backwardInverseChimeraWithK(mfbba, mfbbb, mfbbc, vvz, vz2, c9o4, c4o9);
+                    backwardInverseChimeraWithK(mfbca, mfbcb, mfbcc, vvz, vz2, c9, c1o9);
+                    backwardInverseChimeraWithK(mfcaa, mfcab, mfcac, vvz, vz2, c36, c1o36);
+                    backwardInverseChimeraWithK(mfcba, mfcbb, mfcbc, vvz, vz2, c9, c1o9);
+                    backwardInverseChimeraWithK(mfcca, mfccb, mfccc, vvz, vz2, c36, c1o36);
+                    ////////////////////////////////////////////////////////////////////////////////////
+
+                    //////////////////////////////////////////////////////////////////////////
+                    //proof correctness
+                    //////////////////////////////////////////////////////////////////////////
 #ifdef  PROOF_CORRECTNESS
-               LBMReal drho_post = (mfaaa + mfaac + mfaca + mfcaa + mfacc + mfcac + mfccc + mfcca)
-                  + (mfaab + mfacb + mfcab + mfccb) + (mfaba + mfabc + mfcba + mfcbc) + (mfbaa + mfbac + mfbca + mfbcc)
-                  + (mfabb + mfcbb) + (mfbab + mfbcb) + (mfbba + mfbbc) + mfbbb;
-               LBMReal dif = drho - drho_post;
+                    LBMReal drho_post = (mfaaa + mfaac + mfaca + mfcaa + mfacc + mfcac + mfccc + mfcca)
+                                        + (mfaab + mfacb + mfcab + mfccb) + (mfaba + mfabc + mfcba + mfcbc) + (mfbaa + mfbac + mfbca + mfbcc)
+                                        + (mfabb + mfcbb) + (mfbab + mfbcb) + (mfbba + mfbbc) + mfbbb;
+                    LBMReal dif = drho - drho_post;
 #ifdef SINGLEPRECISION
-               if (dif > 10.0E-7 || dif < -10.0E-7)
+                    if (dif > 10.0E-7 || dif < -10.0E-7)
 #else
-               if (dif > 10.0E-15 || dif < -10.0E-15)
+                    if (dif > 10.0E-15 || dif < -10.0E-15)
 #endif
-               {
-                  UB_THROW(UbException(UB_EXARGS, "rho=" + UbSystem::toString(drho) + ", rho_post=" + UbSystem::toString(drho_post)
-                     + " dif=" + UbSystem::toString(dif)
-                     + " rho is not correct for node " + UbSystem::toString(x1) + "," + UbSystem::toString(x2) + "," + UbSystem::toString(x3)
-                     + " in " + block.lock()->toString() + " step = " + UbSystem::toString(step)));
-               }
+                    {
+                        UB_THROW(UbException(UB_EXARGS, "rho=" + UbSystem::toString(drho) + ", rho_post=" + UbSystem::toString(drho_post)
+                                                        + " dif=" + UbSystem::toString(dif)
+                                                        + " rho is not correct for node " + UbSystem::toString(x1) + "," + UbSystem::toString(x2) + "," + UbSystem::toString(x3)
+                                                        + " in " + block.lock()->toString() + " step = " + UbSystem::toString(step)));
+                    }
 #endif
-			      ////////////////////////////////////////////////////////////////////////////////////
-			      //! - Write distributions: style of reading and writing the distributions from/to stored arrays dependent on timestep is based on the esoteric twist algorithm
-			      //! <a href="https://doi.org/10.3390/computation5020019"><b>[ M. Geier et al. (2017), DOI:10.3390/computation5020019 ]</b></a>
-			      //!
-               (*this->localDistributions)(D3Q27System::ET_E, x1, x2, x3) = mfabb;
-               (*this->localDistributions)(D3Q27System::ET_N, x1, x2, x3) = mfbab;
-               (*this->localDistributions)(D3Q27System::ET_T, x1, x2, x3) = mfbba;
-               (*this->localDistributions)(D3Q27System::ET_NE, x1, x2, x3) = mfaab;
-               (*this->localDistributions)(D3Q27System::ET_NW, x1p, x2, x3) = mfcab;
-               (*this->localDistributions)(D3Q27System::ET_TE, x1, x2, x3) = mfaba;
-               (*this->localDistributions)(D3Q27System::ET_TW, x1p, x2, x3) = mfcba;
-               (*this->localDistributions)(D3Q27System::ET_TN, x1, x2, x3) = mfbaa;
-               (*this->localDistributions)(D3Q27System::ET_TS, x1, x2p, x3) = mfbca;
-               (*this->localDistributions)(D3Q27System::ET_TNE, x1, x2, x3) = mfaaa;
-               (*this->localDistributions)(D3Q27System::ET_TNW, x1p, x2, x3) = mfcaa;
-               (*this->localDistributions)(D3Q27System::ET_TSE, x1, x2p, x3) = mfaca;
-               (*this->localDistributions)(D3Q27System::ET_TSW, x1p, x2p, x3) = mfcca;
-
-               (*this->nonLocalDistributions)(D3Q27System::ET_W, x1p, x2, x3) = mfcbb;
-               (*this->nonLocalDistributions)(D3Q27System::ET_S, x1, x2p, x3) = mfbcb;
-               (*this->nonLocalDistributions)(D3Q27System::ET_B, x1, x2, x3p) = mfbbc;
-               (*this->nonLocalDistributions)(D3Q27System::ET_SW, x1p, x2p, x3) = mfccb;
-               (*this->nonLocalDistributions)(D3Q27System::ET_SE, x1, x2p, x3) = mfacb;
-               (*this->nonLocalDistributions)(D3Q27System::ET_BW, x1p, x2, x3p) = mfcbc;
-               (*this->nonLocalDistributions)(D3Q27System::ET_BE, x1, x2, x3p) = mfabc;
-               (*this->nonLocalDistributions)(D3Q27System::ET_BS, x1, x2p, x3p) = mfbcc;
-               (*this->nonLocalDistributions)(D3Q27System::ET_BN, x1, x2, x3p) = mfbac;
-               (*this->nonLocalDistributions)(D3Q27System::ET_BSW, x1p, x2p, x3p) = mfccc;
-               (*this->nonLocalDistributions)(D3Q27System::ET_BSE, x1, x2p, x3p) = mfacc;
-               (*this->nonLocalDistributions)(D3Q27System::ET_BNW, x1p, x2, x3p) = mfcac;
-               (*this->nonLocalDistributions)(D3Q27System::ET_BNE, x1, x2, x3p) = mfaac;
-
-               (*this->restDistributions)(x1, x2, x3) = mfbbb;
-               //////////////////////////////////////////////////////////////////////////
-
+                    ////////////////////////////////////////////////////////////////////////////////////
+                    //! - Write distributions: style of reading and writing the distributions from/to stored arrays dependent on timestep is based on the esoteric twist algorithm
+                    //! <a href="https://doi.org/10.3390/computation5020019"><b>[ M. Geier et al. (2017), DOI:10.3390/computation5020019 ]</b></a>
+                    //!
+                    (*this->localDistributions)(D3Q27System::ET_E, x1, x2, x3) = mfabb;
+                    (*this->localDistributions)(D3Q27System::ET_N, x1, x2, x3) = mfbab;
+                    (*this->localDistributions)(D3Q27System::ET_T, x1, x2, x3) = mfbba;
+                    (*this->localDistributions)(D3Q27System::ET_NE, x1, x2, x3) = mfaab;
+                    (*this->localDistributions)(D3Q27System::ET_NW, x1p, x2, x3) = mfcab;
+                    (*this->localDistributions)(D3Q27System::ET_TE, x1, x2, x3) = mfaba;
+                    (*this->localDistributions)(D3Q27System::ET_TW, x1p, x2, x3) = mfcba;
+                    (*this->localDistributions)(D3Q27System::ET_TN, x1, x2, x3) = mfbaa;
+                    (*this->localDistributions)(D3Q27System::ET_TS, x1, x2p, x3) = mfbca;
+                    (*this->localDistributions)(D3Q27System::ET_TNE, x1, x2, x3) = mfaaa;
+                    (*this->localDistributions)(D3Q27System::ET_TNW, x1p, x2, x3) = mfcaa;
+                    (*this->localDistributions)(D3Q27System::ET_TSE, x1, x2p, x3) = mfaca;
+                    (*this->localDistributions)(D3Q27System::ET_TSW, x1p, x2p, x3) = mfcca;
+
+                    (*this->nonLocalDistributions)(D3Q27System::ET_W, x1p, x2, x3) = mfcbb;
+                    (*this->nonLocalDistributions)(D3Q27System::ET_S, x1, x2p, x3) = mfbcb;
+                    (*this->nonLocalDistributions)(D3Q27System::ET_B, x1, x2, x3p) = mfbbc;
+                    (*this->nonLocalDistributions)(D3Q27System::ET_SW, x1p, x2p, x3) = mfccb;
+                    (*this->nonLocalDistributions)(D3Q27System::ET_SE, x1, x2p, x3) = mfacb;
+                    (*this->nonLocalDistributions)(D3Q27System::ET_BW, x1p, x2, x3p) = mfcbc;
+                    (*this->nonLocalDistributions)(D3Q27System::ET_BE, x1, x2, x3p) = mfabc;
+                    (*this->nonLocalDistributions)(D3Q27System::ET_BS, x1, x2p, x3p) = mfbcc;
+                    (*this->nonLocalDistributions)(D3Q27System::ET_BN, x1, x2, x3p) = mfbac;
+                    (*this->nonLocalDistributions)(D3Q27System::ET_BSW, x1p, x2p, x3p) = mfccc;
+                    (*this->nonLocalDistributions)(D3Q27System::ET_BSE, x1, x2p, x3p) = mfacc;
+                    (*this->nonLocalDistributions)(D3Q27System::ET_BNW, x1p, x2, x3p) = mfcac;
+                    (*this->nonLocalDistributions)(D3Q27System::ET_BNE, x1, x2, x3p) = mfaac;
+
+                    (*this->restDistributions)(x1, x2, x3) = mfbbb;
+                    //////////////////////////////////////////////////////////////////////////
+
+                }
             }
-         }
-      }
-   }
+        }
+    }
 }
 //////////////////////////////////////////////////////////////////////////
 
diff --git a/src/cpu/VirtualFluidsCore/LBM/CumulantK17LBMKernel.h b/src/cpu/VirtualFluidsCore/LBM/CumulantK17LBMKernel.h
index ca652cb37953f530d8a49a6917026652e4382a87..10cfd49264bb829eac1fc6b9bedeee3b6eace265 100644
--- a/src/cpu/VirtualFluidsCore/LBM/CumulantK17LBMKernel.h
+++ b/src/cpu/VirtualFluidsCore/LBM/CumulantK17LBMKernel.h
@@ -34,12 +34,12 @@
 #ifndef CumulantK17LBMKernel_h__
 #define CumulantK17LBMKernel_h__
 
+#include "LBMKernel.h"
 #include "BCProcessor.h"
 #include "D3Q27System.h"
-#include "LBMKernel.h"
-#include "basics/container/CbArray3D.h"
-#include "basics/container/CbArray4D.h"
 #include "basics/utilities/UbTiming.h"
+#include "basics/container/CbArray4D.h"
+#include "basics/container/CbArray3D.h"
 
 //! \brief   Compressible cumulant LBM kernel.
 //! \details  LBM implementation that use Cascaded Cumulant Lattice Boltzmann method for D3Q27 model
@@ -52,17 +52,16 @@ class CumulantK17LBMKernel : public LBMKernel
 {
 public:
     CumulantK17LBMKernel();
-    ~CumulantK17LBMKernel() override;
+    ~CumulantK17LBMKernel() = default;
     void calculate(int step) override;
     SPtr<LBMKernel> clone() override;
+    double getCalculationTime() override { return .0; }
 
 protected:
-    inline void forwardInverseChimeraWithK(LBMReal &mfa, LBMReal &mfb, LBMReal &mfc, LBMReal vv, LBMReal v2,
-                                           LBMReal Kinverse, LBMReal K);
-    inline void backwardInverseChimeraWithK(LBMReal &mfa, LBMReal &mfb, LBMReal &mfc, LBMReal vv, LBMReal v2,
-                                            LBMReal Kinverse, LBMReal K);
-    inline void forwardChimera(LBMReal &mfa, LBMReal &mfb, LBMReal &mfc, LBMReal vv, LBMReal v2);
-    inline void backwardChimera(LBMReal &mfa, LBMReal &mfb, LBMReal &mfc, LBMReal vv, LBMReal v2);
+    inline void forwardInverseChimeraWithK(LBMReal& mfa, LBMReal& mfb, LBMReal& mfc, LBMReal vv, LBMReal v2, LBMReal Kinverse, LBMReal K);
+    inline void backwardInverseChimeraWithK(LBMReal& mfa, LBMReal& mfb, LBMReal& mfc, LBMReal vv, LBMReal v2, LBMReal Kinverse, LBMReal K);
+    inline void forwardChimera(LBMReal& mfa, LBMReal& mfb, LBMReal& mfc, LBMReal vv, LBMReal v2);
+    inline void backwardChimera(LBMReal& mfa, LBMReal& mfb, LBMReal& mfc, LBMReal vv, LBMReal v2);
 
     virtual void initDataSet();
     LBMReal f[D3Q27System::ENDF + 1];
@@ -80,19 +79,18 @@ protected:
 };
 
 ////////////////////////////////////////////////////////////////////////////////
-//! \brief forward chimera transformation \ref forwardInverseChimeraWithK
+//! \brief forward chimera transformation \ref forwardInverseChimeraWithK 
 //! Transformation from distributions to central moments according to Eq. (6)-(14) in
-//! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040
-//! ]</b></a> Modified for lower round-off errors.
+//! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
+//! Modified for lower round-off errors.
 ////////////////////////////////////////////////////////////////////////////////
-inline void CumulantK17LBMKernel::forwardInverseChimeraWithK(LBMReal &mfa, LBMReal &mfb, LBMReal &mfc, LBMReal vv,
-                                                             LBMReal v2, LBMReal Kinverse, LBMReal K)
+inline void CumulantK17LBMKernel::forwardInverseChimeraWithK(LBMReal& mfa, LBMReal& mfb, LBMReal& mfc, LBMReal vv, LBMReal v2, LBMReal Kinverse, LBMReal K)
 {
     using namespace UbMath;
     LBMReal m2 = mfa + mfc;
     LBMReal m1 = mfc - mfa;
     LBMReal m0 = m2 + mfb;
-    mfa        = m0;
+    mfa = m0;
     m0 *= Kinverse;
     m0 += c1;
     mfb = (m1 * Kinverse - m0 * vv) * K;
@@ -101,50 +99,49 @@ inline void CumulantK17LBMKernel::forwardInverseChimeraWithK(LBMReal &mfa, LBMRe
 ////////////////////////////////////////////////////////////////////////////////
 //! \brief backward chimera transformation \ref backwardInverseChimeraWithK
 //! Transformation from central moments to distributions according to Eq. (57)-(65) in
-//! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040
-//! ]</b></a> Modified for lower round-off errors.
+//! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
+//! Modified for lower round-off errors.
 ////////////////////////////////////////////////////////////////////////////////
-inline void CumulantK17LBMKernel::backwardInverseChimeraWithK(LBMReal &mfa, LBMReal &mfb, LBMReal &mfc, LBMReal vv,
-                                                              LBMReal v2, LBMReal Kinverse, LBMReal K)
+inline void CumulantK17LBMKernel::backwardInverseChimeraWithK(LBMReal& mfa, LBMReal& mfb, LBMReal& mfc, LBMReal vv, LBMReal v2, LBMReal Kinverse, LBMReal K)
 {
     using namespace UbMath;
     LBMReal m0 = (((mfc - mfb) * c1o2 + mfb * vv) * Kinverse + (mfa * Kinverse + c1) * (v2 - vv) * c1o2) * K;
     LBMReal m1 = (((mfa - mfc) - c2 * mfb * vv) * Kinverse + (mfa * Kinverse + c1) * (-v2)) * K;
-    mfc        = (((mfc + mfb) * c1o2 + mfb * vv) * Kinverse + (mfa * Kinverse + c1) * (v2 + vv) * c1o2) * K;
-    mfa        = m0;
-    mfb        = m1;
+    mfc = (((mfc + mfb) * c1o2 + mfb * vv) * Kinverse + (mfa * Kinverse + c1) * (v2 + vv) * c1o2) * K;
+    mfa = m0;
+    mfb = m1;
 }
 ////////////////////////////////////////////////////////////////////////////////
-//! \brief forward chimera transformation \ref forwardChimera
+//! \brief forward chimera transformation \ref forwardChimera 
 //! Transformation from distributions to central moments according to Eq. (6)-(14) in
-//! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040
-//! ]</b></a> for \f$ K_{abc}=0 \f$. This is to avoid unnessary floating point operations. Modified for lower round-off
-//! errors.
+//! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
+//! for \f$ K_{abc}=0 \f$. This is to avoid unnecessary floating point operations.
+//! Modified for lower round-off errors.
 ////////////////////////////////////////////////////////////////////////////////
-inline void CumulantK17LBMKernel::forwardChimera(LBMReal &mfa, LBMReal &mfb, LBMReal &mfc, LBMReal vv, LBMReal v2)
+inline void CumulantK17LBMKernel::forwardChimera(LBMReal& mfa, LBMReal& mfb, LBMReal& mfc, LBMReal vv, LBMReal v2)
 {
     using namespace UbMath;
     LBMReal m1 = (mfa + mfc) + mfb;
     LBMReal m2 = mfc - mfa;
-    mfc        = (mfc + mfa) + (v2 * m1 - c2 * vv * m2);
-    mfb        = m2 - vv * m1;
-    mfa        = m1;
+    mfc = (mfc + mfa) + (v2 * m1 - c2 * vv * m2);
+    mfb = m2 - vv * m1;
+    mfa = m1;
 }
 ////////////////////////////////////////////////////////////////////////////////
-//! \brief backward chimera transformation \ref backwardChimera
+//! \brief backward chimera transformation \ref backwardChimera 
 //! Transformation from central moments to distributions according to Eq. (57)-(65) in
-//! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040
-//! ]</b></a> for \f$ K_{abc}=0 \f$. This is to avoid unnessary floating point operations. Modified for lower round-off
-//! errors.
+//! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
+//! for \f$ K_{abc}=0 \f$. This is to avoid unnecessary floating point operations.
+//! Modified for lower round-off errors.
 ////////////////////////////////////////////////////////////////////////////////
-inline void CumulantK17LBMKernel::backwardChimera(LBMReal &mfa, LBMReal &mfb, LBMReal &mfc, LBMReal vv, LBMReal v2)
+inline void CumulantK17LBMKernel::backwardChimera(LBMReal& mfa, LBMReal& mfb, LBMReal& mfc, LBMReal vv, LBMReal v2)
 {
     using namespace UbMath;
     LBMReal ma = (mfc + mfa * (v2 - vv)) * c1o2 + mfb * (vv - c1o2);
     LBMReal mb = ((mfa - mfc) - mfa * v2) - c2 * mfb * vv;
-    mfc        = (mfc + mfa * (v2 + vv)) * c1o2 + mfb * (vv + c1o2);
-    mfb        = mb;
-    mfa        = ma;
+    mfc = (mfc + mfa * (v2 + vv)) * c1o2 + mfb * (vv + c1o2);
+    mfb = mb;
+    mfa = ma;
 }
 
 #endif // CumulantK17LBMKernel_h__
\ No newline at end of file
diff --git a/src/cpu/VirtualFluidsCore/LBM/D3Q27System.cpp b/src/cpu/VirtualFluidsCore/LBM/D3Q27System.cpp
index e6362563e31ad994ccc7b42c26c5e81a56a6e101..e4bea8735887c3ee9237e4fc368554e4e0002b55 100644
--- a/src/cpu/VirtualFluidsCore/LBM/D3Q27System.cpp
+++ b/src/cpu/VirtualFluidsCore/LBM/D3Q27System.cpp
@@ -1,9 +1,11 @@
 #include "D3Q27System.h"
+
 namespace D3Q27System
 {
 using namespace UbMath;
+
 // index             0   1   2   3   4   5  6   7   8    9  10  11  12  13  14  15  16  17  18//falsch
-// f:              REST, E,  W,  N,  S,  T,  B, NE, SW, SE, NW, TE, BW, BE, TW, TN, BS, BN, TS, TNE TNW TSE TSW BNE BNW
+// f:              ZERO, E,  W,  N,  S,  T,  B, NE, SW, SE, NW, TE, BW, BE, TW, TN, BS, BN, TS, TNE TNW TSE TSW BNE BNW
 // BSE BSW const int EX1[] = { 0,  1, -1,  0,  0,  0,  0,  1, -1,  1, -1,  1, -1,  1, -1,  0,  0,  0,  0,  1, -1,  1, -1,
 // 1, -1,  1, -1 }; const int EX2[] = { 0,  0,  0,  1, -1,  0,  0,  1, -1, -1,  1,  0,  0,  0,  0,  1, -1,  1, -1,  1, 1,
 // -1, -1,  1,  1, -1, -1 }; const int EX3[] = { 0,  0,  0,  0,  0,  1, -1,  0,  0,  0,  0,  1, -1, -1,  1,  1, -1, -1,
@@ -152,7 +154,7 @@ const double cNorm[3][ENDDIR] = { { double(DX1[0]),
 // const int BNW          = 23;
 // const int BSE          = 24;
 // const int BSW          = 25;
-// const int REST /*f0 */ = 26;
+// const int ZERO /*f0 */ = 26;
 
 // const int INV_E   = W;
 // const int INV_W   = E;
diff --git a/src/cpu/VirtualFluidsCore/LBM/D3Q27System.h b/src/cpu/VirtualFluidsCore/LBM/D3Q27System.h
index b3da79948c60e986b1c96f6f4266012dd7bffd23..b5d88d6c3791d716cd0dca567d7aaa803e863536 100644
--- a/src/cpu/VirtualFluidsCore/LBM/D3Q27System.h
+++ b/src/cpu/VirtualFluidsCore/LBM/D3Q27System.h
@@ -35,8 +35,8 @@
 #define D3Q27SYSTEM_H
 
 #include <cmath>
-#include <iostream>
 #include <string>
+#include <iostream>
 
 #include "LBMSystem.h"
 #include "UbException.h"
@@ -89,7 +89,7 @@ static const int BNE  = 22;
 static const int BNW  = 23;
 static const int BSE  = 24;
 static const int BSW  = 25;
-static const int REST = 26;
+static const int ZERO = 26;
 
 static const int INV_E   = W;
 static const int INV_W   = E;
@@ -155,7 +155,7 @@ static LBMReal getDensity(const LBMReal *const &f /*[27]*/)
     return ((f[TNE] + f[BSW]) + (f[TSE] + f[BNW])) + ((f[BSE] + f[TNW]) + (f[TSW] + f[BNE])) +
            (((f[NE] + f[SW]) + (f[SE] + f[NW])) + ((f[TE] + f[BW]) + (f[BE] + f[TW])) +
             ((f[BN] + f[TS]) + (f[TN] + f[BS]))) +
-           ((f[E] + f[W]) + (f[N] + f[S]) + (f[T] + f[B])) + f[REST];
+           ((f[E] + f[W]) + (f[N] + f[S]) + (f[T] + f[B])) + f[ZERO];
 }
 /*=====================================================================*/
 // ATTENTION: does not apply to all models -> use certificate instead of static! to do
@@ -184,7 +184,7 @@ static void calcDensity(const LBMReal *const &f /*[27]*/, LBMReal &rho)
     rho = ((f[TNE] + f[BSW]) + (f[TSE] + f[BNW])) + ((f[BSE] + f[TNW]) + (f[TSW] + f[BNE])) +
           (((f[NE] + f[SW]) + (f[SE] + f[NW])) + ((f[TE] + f[BW]) + (f[BE] + f[TW])) +
            ((f[BN] + f[TS]) + (f[TN] + f[BS]))) +
-          ((f[E] + f[W]) + (f[N] + f[S]) + (f[T] + f[B])) + f[REST];
+          ((f[E] + f[W]) + (f[N] + f[S]) + (f[T] + f[B])) + f[ZERO];
 }
 /*=====================================================================*/
 static void calcIncompVelocityX1(const LBMReal *const &f /*[27]*/, LBMReal &vx1)
@@ -279,7 +279,7 @@ static LBMReal getCompFeqForDirection(const int &direction, const LBMReal &drho,
     ////-----
     LBMReal rho = drho + c1;
     switch (direction) {
-        case REST:
+        case ZERO:
             return REAL_CAST(c8o27 * (drho + rho * (-cu_sq)));
         case E:
             return REAL_CAST(c2o27 * (drho + rho * (3.0 * (vx1) + c9o2 * (vx1) * (vx1)-cu_sq)));
@@ -354,7 +354,7 @@ static void calcCompFeq(LBMReal *const &feq /*[27]*/, const LBMReal &drho, const
     LBMReal cu_sq = 1.5 * (vx1 * vx1 + vx2 * vx2 + vx3 * vx3);
     LBMReal rho   = drho + c1;
 
-    feq[REST] = c8o27 * (drho + rho * (-cu_sq));
+    feq[ZERO] = c8o27 * (drho + rho * (-cu_sq));
     feq[E]    = c2o27 * (drho + rho * (3.0 * (vx1) + c9o2 * (vx1) * (vx1)-cu_sq));
     feq[W]    = c2o27 * (drho + rho * (3.0 * (-vx1) + c9o2 * (-vx1) * (-vx1) - cu_sq));
     feq[N]    = c2o27 * (drho + rho * (3.0 * (vx2) + c9o2 * (vx2) * (vx2)-cu_sq));
@@ -395,7 +395,7 @@ static LBMReal getIncompFeqForDirection(const int &direction, const LBMReal &drh
     LBMReal cu_sq = 1.5f * (vx1 * vx1 + vx2 * vx2 + vx3 * vx3);
 
     switch (direction) {
-        case REST:
+        case ZERO:
             return REAL_CAST(c8o27 * (drho - cu_sq));
         case E:
             return REAL_CAST(c2o27 * (drho + 3.0 * (vx1) + c9o2 * (vx1) * (vx1)-cu_sq));
@@ -435,25 +435,25 @@ static LBMReal getIncompFeqForDirection(const int &direction, const LBMReal &drh
             return REAL_CAST(c1o54 * (drho + 3.0 * (-vx2 + vx3) + c9o2 * (-vx2 + vx3) * (-vx2 + vx3) - cu_sq));
         case TNE:
             return REAL_CAST(c1o216 *
-                             (drho + 3.0 * (vx1 + vx2 + vx3) + c9o2 * (vx1 + vx2 + vx3) * (vx1 + vx2 + vx3) - cu_sq));
+                                 (drho + 3.0 * (vx1 + vx2 + vx3) + c9o2 * (vx1 + vx2 + vx3) * (vx1 + vx2 + vx3) - cu_sq));
         case BSW:
             return REAL_CAST(
                 c1o216 * (drho + 3.0 * (-vx1 - vx2 - vx3) + c9o2 * (-vx1 - vx2 - vx3) * (-vx1 - vx2 - vx3) - cu_sq));
         case BNE:
             return REAL_CAST(c1o216 *
-                             (drho + 3.0 * (vx1 + vx2 - vx3) + c9o2 * (vx1 + vx2 - vx3) * (vx1 + vx2 - vx3) - cu_sq));
+                                 (drho + 3.0 * (vx1 + vx2 - vx3) + c9o2 * (vx1 + vx2 - vx3) * (vx1 + vx2 - vx3) - cu_sq));
         case TSW:
             return REAL_CAST(
                 c1o216 * (drho + 3.0 * (-vx1 - vx2 + vx3) + c9o2 * (-vx1 - vx2 + vx3) * (-vx1 - vx2 + vx3) - cu_sq));
         case TSE:
             return REAL_CAST(c1o216 *
-                             (drho + 3.0 * (vx1 - vx2 + vx3) + c9o2 * (vx1 - vx2 + vx3) * (vx1 - vx2 + vx3) - cu_sq));
+                                 (drho + 3.0 * (vx1 - vx2 + vx3) + c9o2 * (vx1 - vx2 + vx3) * (vx1 - vx2 + vx3) - cu_sq));
         case BNW:
             return REAL_CAST(
                 c1o216 * (drho + 3.0 * (-vx1 + vx2 - vx3) + c9o2 * (-vx1 + vx2 - vx3) * (-vx1 + vx2 - vx3) - cu_sq));
         case BSE:
             return REAL_CAST(c1o216 *
-                             (drho + 3.0 * (vx1 - vx2 - vx3) + c9o2 * (vx1 - vx2 - vx3) * (vx1 - vx2 - vx3) - cu_sq));
+                                 (drho + 3.0 * (vx1 - vx2 - vx3) + c9o2 * (vx1 - vx2 - vx3) * (vx1 - vx2 - vx3) - cu_sq));
         case TNW:
             return REAL_CAST(
                 c1o216 * (drho + 3.0 * (-vx1 + vx2 + vx3) + c9o2 * (-vx1 + vx2 + vx3) * (-vx1 + vx2 + vx3) - cu_sq));
@@ -469,7 +469,7 @@ static void calcIncompFeq(LBMReal *const &feq /*[27]*/, const LBMReal &drho, con
 
     LBMReal cu_sq = 1.5 * (vx1 * vx1 + vx2 * vx2 + vx3 * vx3);
 
-    feq[REST] = c8o27 * (drho - cu_sq);
+    feq[ZERO] = c8o27 * (drho - cu_sq);
     feq[E]    = c2o27 * (drho + 3.0 * (vx1) + c9o2 * (vx1) * (vx1)-cu_sq);
     feq[W]    = c2o27 * (drho + 3.0 * (-vx1) + c9o2 * (-vx1) * (-vx1) - cu_sq);
     feq[N]    = c2o27 * (drho + 3.0 * (vx2) + c9o2 * (vx2) * (vx2)-cu_sq);
@@ -758,7 +758,7 @@ static inline LBMReal calcPress(const LBMReal *const f, LBMReal rho, LBMReal vx1
              c2 * (f[NE] + f[SW] + f[SE] + f[NW] + f[TE] + f[BW] + f[BE] + f[TW] + f[TN] + f[BS] + f[BN] + f[TS]) +
              c3 * (f[TNE] + f[TSW] + f[TSE] + f[TNW] + f[BNE] + f[BSW] + f[BSE] + f[BNW]) -
              (vx1 * vx1 + vx2 * vx2 + vx3 * vx3)) *
-                (c1 - c1o2 * OxxPyyPzz) +
+            (c1 - c1o2 * OxxPyyPzz) +
             OxxPyyPzz * c1o2 * (rho)) *
            c1o3;
 }
diff --git a/src/cpu/VirtualFluidsCore/LBM/ILBMKernel.h b/src/cpu/VirtualFluidsCore/LBM/ILBMKernel.h
index 44d8d3273d6bfb555acb2c994b97dfbeb676f1c3..4dbe8eee09a37c0c220f47619b72bade2e6ec527 100644
--- a/src/cpu/VirtualFluidsCore/LBM/ILBMKernel.h
+++ b/src/cpu/VirtualFluidsCore/LBM/ILBMKernel.h
@@ -45,8 +45,9 @@ class ILBMKernel
 public:
     virtual ~ILBMKernel() = default;
 
-    virtual void calculate(int step) = 0;
-    virtual void swapDistributions() = 0;
+    virtual void calculate(int step)    = 0;
+    virtual double getCalculationTime() = 0;
+    virtual void swapDistributions()    = 0;
 
     virtual bool getCompressible() const                                             = 0;
     virtual SPtr<BCProcessor> getBCProcessor() const                                 = 0;
diff --git a/src/cpu/VirtualFluidsCore/LBM/LBMKernel.h b/src/cpu/VirtualFluidsCore/LBM/LBMKernel.h
index bfaf9d31275d12ac2d4795c46787af864329980d..be29589b9b7ab239bece700126ae906795c83977 100644
--- a/src/cpu/VirtualFluidsCore/LBM/LBMKernel.h
+++ b/src/cpu/VirtualFluidsCore/LBM/LBMKernel.h
@@ -56,6 +56,9 @@ public:
 
     virtual SPtr<LBMKernel> clone() = 0;
 
+    void calculate(int step) override    = 0;
+    double getCalculationTime() override = 0;
+
     void setBCProcessor(SPtr<BCProcessor> bcp) override;
     SPtr<BCProcessor> getBCProcessor() const override;
 
diff --git a/src/cpu/VirtualFluidsCore/LBM/LBMUnitConverter.h b/src/cpu/VirtualFluidsCore/LBM/LBMUnitConverter.h
index 376330314497224f2ac92e61de7ae4bf81589c93..40570cc3847f71a1942791afa7e95145daafb53b 100644
--- a/src/cpu/VirtualFluidsCore/LBM/LBMUnitConverter.h
+++ b/src/cpu/VirtualFluidsCore/LBM/LBMUnitConverter.h
@@ -97,6 +97,14 @@ public:
         this->init(refLengthWorld, csWorld, rhoWorld, csWorld, refLengthLb, rhoLb, csLb);
     }
 
+    LBMUnitConverter(int /*dummy*/, double uReal, double uLB, double nuReal, double nuLB)
+    {
+        factorVelocityLbToW  = uReal / uLB;
+        factorViscosityLbToW = nuReal / nuLB;
+        factorDensityLbToW   = factorViscosityLbToW * factorVelocityLbToW * factorVelocityLbToW;
+        factorPressureLbToW  = factorDensityLbToW;
+    }
+
     virtual ~LBMUnitConverter() = default;
 
     double getRefRhoLb() { return refRhoLb; }
@@ -132,6 +140,10 @@ public:
     double getFactorAccWToLb() { return 1.0 / this->getFactorAccLbToW(); }
 
     double getFactorTimeLbToW(double deltaX) const { return factorTimeWithoutDx * deltaX; }
+    //////////////////////////////////////////////////////////////////////////
+    double getFactorVelocityLbToW2() { return factorVelocityLbToW; }
+    double getFactorDensityLbToW2() { return factorDensityLbToW; }
+    double getFactorPressureLbToW2() { return factorPressureLbToW; }
 
     /*==========================================================*/
     friend inline std::ostream &operator<<(std::ostream &os, LBMUnitConverter c)
@@ -199,7 +211,12 @@ protected:
     double factorTimeLbToW{ 1.0 };
     double factorMassLbToW{ 1.0 };
     double refRhoLb{ 1.0 };
-    double factorTimeWithoutDx;
+    double factorTimeWithoutDx{ 0.0 };
+
+    double factorVelocityLbToW{ 1.0 };
+    double factorViscosityLbToW{ 1.0 };
+    double factorDensityLbToW{ 1.0 };
+    double factorPressureLbToW{ 1.0 };
 };
 
 #endif // LBMUNITCONVERTER_H
diff --git a/src/cpu/VirtualFluidsCore/Utilities/MemoryUtil.h b/src/cpu/VirtualFluidsCore/Utilities/MemoryUtil.h
index b476fd7750fb5ef950d13b6157349607b6d4d7c9..670a597cb84bd4e98450dad2743a8100f04497ea 100644
--- a/src/cpu/VirtualFluidsCore/Utilities/MemoryUtil.h
+++ b/src/cpu/VirtualFluidsCore/Utilities/MemoryUtil.h
@@ -36,8 +36,8 @@
 
 #if defined(_WIN32) || defined(_WIN64)
 #define MEMORYUTIL_WINDOWS
-#include "psapi.h"
 #include "windows.h"
+#include "psapi.h"
 #pragma comment(lib, "psapi.lib")
 #elif defined __APPLE__
 #define MEMORYUTIL_APPLE
diff --git a/src/cpu/VirtualFluidsCore/Visitors/InitDistributionsBlockVisitor.cpp b/src/cpu/VirtualFluidsCore/Visitors/InitDistributionsBlockVisitor.cpp
index 8cbf1801f94111f26f5874753271e24873420a1e..c424a6376a62159da2e4b9f73ccf01858fbde521 100644
--- a/src/cpu/VirtualFluidsCore/Visitors/InitDistributionsBlockVisitor.cpp
+++ b/src/cpu/VirtualFluidsCore/Visitors/InitDistributionsBlockVisitor.cpp
@@ -294,7 +294,7 @@ void InitDistributionsBlockVisitor::visit(const SPtr<Grid3D> grid, SPtr<Block3D>
                     f[BNW]  = f_TSE + feq[BNW];
                     f[BSE]  = f_TNW + feq[BSE];
                     f[BSW]  = f_TNE + feq[BSW];
-                    f[REST] = f_ZERO + feq[REST];
+                    f[ZERO] = f_ZERO + feq[ZERO];
 
                     // calcFeqsFct(f,rho,vx1,vx2,vx3);
                     // distributions->setDistribution(f, ix1, ix2, ix3);
diff --git a/src/cpu/VirtualFluidsCore/Visitors/SetConnectorsBlockVisitor.cpp b/src/cpu/VirtualFluidsCore/Visitors/SetConnectorsBlockVisitor.cpp
index 9814ab036621d94833ad9b4baff61866b22eac4c..c0efdcc6135b1e06f766621dd4528f96fa32247d 100644
--- a/src/cpu/VirtualFluidsCore/Visitors/SetConnectorsBlockVisitor.cpp
+++ b/src/cpu/VirtualFluidsCore/Visitors/SetConnectorsBlockVisitor.cpp
@@ -37,13 +37,11 @@
 #include "Grid3D.h"
 #include "Grid3DSystem.h"
 
-SetConnectorsBlockVisitor::SetConnectorsBlockVisitor(SPtr<Communicator> comm, bool fullConnector, int dirs, LBMReal nu)
-    : Block3DVisitor(0, Grid3DSystem::MAXLEVEL), comm(comm), fullConnector(fullConnector), dirs(dirs), nu(nu)
+SetConnectorsBlockVisitor::SetConnectorsBlockVisitor(SPtr<Communicator> comm, bool fullConnector, int dirs, LBMReal nue)
+    : Block3DVisitor(0, Grid3DSystem::MAXLEVEL), comm(comm), fullConnector(fullConnector), dirs(dirs), nue(nue)
 {
 }
 //////////////////////////////////////////////////////////////////////////
-SetConnectorsBlockVisitor::~SetConnectorsBlockVisitor(void) = default;
-//////////////////////////////////////////////////////////////////////////
 void SetConnectorsBlockVisitor::visit(SPtr<Grid3D> grid, SPtr<Block3D> block)
 {
     if (!block)
diff --git a/src/cpu/VirtualFluidsCore/Visitors/SetConnectorsBlockVisitor.h b/src/cpu/VirtualFluidsCore/Visitors/SetConnectorsBlockVisitor.h
index 7d209e0524a13ab89a48dfc2179203eb17b95162..f6eb15206371af2ff6106a5c82c6c71eba26fb34 100644
--- a/src/cpu/VirtualFluidsCore/Visitors/SetConnectorsBlockVisitor.h
+++ b/src/cpu/VirtualFluidsCore/Visitors/SetConnectorsBlockVisitor.h
@@ -39,6 +39,7 @@
 #include "Block3DVisitor.h"
 #include "D3Q27System.h"
 
+
 class Grid3D;
 class Block3D;
 class Communicator;
@@ -48,8 +49,8 @@ class InterpolationProcessor;
 class SetConnectorsBlockVisitor : public Block3DVisitor
 {
 public:
-    SetConnectorsBlockVisitor(SPtr<Communicator> comm, bool fullConnector, int dirs, LBMReal nu);
-    ~SetConnectorsBlockVisitor() override;
+    SetConnectorsBlockVisitor(SPtr<Communicator> comm, bool fullConnector, int dirs, LBMReal nue);
+    ~SetConnectorsBlockVisitor() = default;
     void visit(SPtr<Grid3D> grid, SPtr<Block3D> block) override;
     //////////////////////////////////////////////////////////////////////////
 protected:
@@ -58,7 +59,7 @@ protected:
     bool fullConnector;
     int dirs;
     int gridRank;
-    LBMReal nu;
+    LBMReal nue;
     SPtr<InterpolationProcessor> iProcessor;
 };
 
diff --git a/src/cpu/VirtualFluidsCore/Visitors/SetKernelBlockVisitor.cpp b/src/cpu/VirtualFluidsCore/Visitors/SetKernelBlockVisitor.cpp
index bc4b3d25701789cfb62b107880680a3287486f8b..7dde0c34edf5c16dff22361a6bbc36394a9783ed 100644
--- a/src/cpu/VirtualFluidsCore/Visitors/SetKernelBlockVisitor.cpp
+++ b/src/cpu/VirtualFluidsCore/Visitors/SetKernelBlockVisitor.cpp
@@ -1,3 +1,4 @@
+#include "MemoryUtil.h"
 //=======================================================================================
 // ____          ____    __    ______     __________   __      __       __        __
 // \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
@@ -43,13 +44,24 @@
 #include <utility>
 
 //////////////////////////////////////////////////////////////////////////
-SetKernelBlockVisitor::SetKernelBlockVisitor(SPtr<LBMKernel> kernel, LBMReal nue, SetKernelBlockVisitor::Action action)
+SetKernelBlockVisitor::SetKernelBlockVisitor(SPtr<LBMKernel> kernel, LBMReal nue,
+                                             SetKernelBlockVisitor::Action action)
     : Block3DVisitor(0, Grid3DSystem::MAXLEVEL), kernel(std::move(kernel)), nue(nue), action(action), dataSetFlag(true)
 {
 }
+
+SetKernelBlockVisitor::SetKernelBlockVisitor(SPtr<LBMKernel> kernel, LBMReal nue, int &numberOfProcesses,
+                                             SetKernelBlockVisitor::Action action)
+    : Block3DVisitor(0, Grid3DSystem::MAXLEVEL), kernel(std::move(kernel)), nue(nue), action(action), dataSetFlag(true),
+      numberOfProcesses(numberOfProcesses)
+{
+}
+
 //////////////////////////////////////////////////////////////////////////
 void SetKernelBlockVisitor::visit(SPtr<Grid3D> grid, SPtr<Block3D> block)
 {
+    throwExceptionIfNotEnoughMemory(grid);
+
     if (kernel && (block->getRank() == grid->getRank())) {
         LBMReal collFactor = LBMSystem::calcCollisionFactor(nue, block->getLevel());
         kernel->setCollisionFactor(collFactor);
@@ -98,3 +110,27 @@ void SetKernelBlockVisitor::visit(SPtr<Grid3D> grid, SPtr<Block3D> block)
 }
 
 void SetKernelBlockVisitor::setNoDataSetFlag(bool flag) { dataSetFlag = flag; }
+
+void SetKernelBlockVisitor::throwExceptionIfNotEnoughMemory(const SPtr<Grid3D> &grid)
+{
+    auto availableMemory = Utilities::getTotalPhysMem();
+    auto requiredMemory  = getRequiredPhysicalMemory(grid);
+    if (requiredMemory > availableMemory)
+        throw UbException(UB_EXARGS, "SetKernelBlockVisitor: Not enough memory!!!");
+}
+
+double SetKernelBlockVisitor::getRequiredPhysicalMemory(const SPtr<Grid3D> &grid) const
+{
+    unsigned long long numberOfNodesPerBlockWithGhostLayer;
+    auto numberOfBlocks = (unsigned long long)grid->getNumberOfBlocks();
+    auto blockNx        = grid->getBlockNX();
+    int ghostLayer      = 3;
+
+    numberOfNodesPerBlockWithGhostLayer = numberOfBlocks * (val<1>(blockNx) + ghostLayer) *
+                                          (val<2>(blockNx) + ghostLayer) * (val<3>(blockNx) + ghostLayer);
+
+    auto needMemAll =
+        double(numberOfNodesPerBlockWithGhostLayer * (27 * sizeof(double) + sizeof(int) + sizeof(float) * 4));
+
+    return needMemAll / double(numberOfProcesses);
+}
diff --git a/src/cpu/VirtualFluidsCore/Visitors/SetKernelBlockVisitor.h b/src/cpu/VirtualFluidsCore/Visitors/SetKernelBlockVisitor.h
index c2863555c0490b24e23ae2d99933e62ce94ac467..7ce7c852e2a815bdf0a37f3ef2960e1b5b76e4b4 100644
--- a/src/cpu/VirtualFluidsCore/Visitors/SetKernelBlockVisitor.h
+++ b/src/cpu/VirtualFluidsCore/Visitors/SetKernelBlockVisitor.h
@@ -51,6 +51,10 @@ public:
 
     SetKernelBlockVisitor(SPtr<LBMKernel> kernel, LBMReal nue,
                           SetKernelBlockVisitor::Action action = SetKernelBlockVisitor::NewKernel);
+
+    SetKernelBlockVisitor(SPtr<LBMKernel> kernel, LBMReal nue, int &numberOfProcesses,
+                          SetKernelBlockVisitor::Action action = SetKernelBlockVisitor::NewKernel);
+
     ~SetKernelBlockVisitor() override = default;
 
     void visit(SPtr<Grid3D> grid, SPtr<Block3D> block) override;
@@ -62,6 +66,12 @@ private:
     LBMReal nue;
     Action action;
     bool dataSetFlag;
+
+    int numberOfProcesses{ 1 };
+
+    double getRequiredPhysicalMemory(const SPtr<Grid3D> &grid) const;
+
+    void throwExceptionIfNotEnoughMemory(const SPtr<Grid3D> &grid);
 };
 
 #endif
diff --git a/src/gpu/GksGpu/CMakeLists.txt b/src/gpu/GksGpu/CMakeLists.txt
index 88df905d3eb847c1ed5e9b5bc5a1ec68c549e313..da404e0209ed2c9f36ae323d2e6bd234fb6dfb96 100644
--- a/src/gpu/GksGpu/CMakeLists.txt
+++ b/src/gpu/GksGpu/CMakeLists.txt
@@ -1,8 +1,3 @@
+project(GksGpu LANGUAGES CUDA CXX)
 
-vf_add_library(BUILDTYPE shared PRIVATE_LINK basics GksMeshAdapter)
-
-linkCUDA()
-linkMPI()
-
-vf_get_library_name(library_name)
-linkOpenMP(${library_name})
+vf_add_library(PRIVATE_LINK basics GksMeshAdapter OpenMP::OpenMP_CXX MPI::MPI_CXX)
diff --git a/src/gpu/GksMeshAdapter/CMakeLists.txt b/src/gpu/GksMeshAdapter/CMakeLists.txt
index 4a16d6fbc0b99d1ec400944178844123eb50ffc0..cb00b3c016786c41ef5640eb362322bb0a3768f8 100644
--- a/src/gpu/GksMeshAdapter/CMakeLists.txt
+++ b/src/gpu/GksMeshAdapter/CMakeLists.txt
@@ -1,5 +1,3 @@
+project(GksMeshAdapter LANGUAGES CUDA CXX)
 
-
-vf_add_library(BUILDTYPE shared PRIVATE_LINK basics GridGenerator)
-
-linkCUDA()
+vf_add_library(PRIVATE_LINK basics GridGenerator)
diff --git a/src/gpu/GridGenerator/CMakeLists.txt b/src/gpu/GridGenerator/CMakeLists.txt
index aaabd6d5de07940194e8428f0d3c9a18a741ae33..1ce294bf420f657f35397c427929f9f310d04556 100644
--- a/src/gpu/GridGenerator/CMakeLists.txt
+++ b/src/gpu/GridGenerator/CMakeLists.txt
@@ -1,9 +1,13 @@
+project(GridGenerator LANGUAGES CUDA CXX)
 
 
-vf_add_library(BUILDTYPE shared PRIVATE_LINK basics)
-vf_get_library_name(library_name)
+vf_add_library(PRIVATE_LINK basics OpenMP::OpenMP_CXX)
 
-linkCUDA()
-linkOpenMP(${library_name})
+vf_get_library_name(library_name)
+set_target_properties(${library_name} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
 
-set_target_properties(${library_name} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
\ No newline at end of file
+# Resolve device symbols in this target to avoid a linker error when building static libraries, see:
+# https://stackoverflow.com/questions/50033435/cmake-cuda-separate-compilation-static-lib-link-error-on-windows-but-not-on-ubun
+if (NOT BUILD_SHARED_LIBRARY)
+    set_target_properties(${library_name} PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+endif()
\ No newline at end of file
diff --git a/src/gpu/GridGenerator/grid/BoundaryConditions/BoundaryCondition.cpp b/src/gpu/GridGenerator/grid/BoundaryConditions/BoundaryCondition.cpp
index 750550d7e79b5aba978d8052b83668958cebb837..4081aeffd9165e838959c78e39c9b51d6082c4a7 100644
--- a/src/gpu/GridGenerator/grid/BoundaryConditions/BoundaryCondition.cpp
+++ b/src/gpu/GridGenerator/grid/BoundaryConditions/BoundaryCondition.cpp
@@ -37,7 +37,7 @@
 #include "grid/BoundaryConditions/Side.h"
 #include "grid/Grid.h"
 
-bool BoundaryCondition::isSide( SideType side ) const
+bool gg::BoundaryCondition::isSide( SideType side ) const
 {
     return this->side->whoAmI() == side;
 }
diff --git a/src/gpu/GridGenerator/grid/BoundaryConditions/BoundaryCondition.h b/src/gpu/GridGenerator/grid/BoundaryConditions/BoundaryCondition.h
index e4e67af092841b426e0d7774048af78abedc79ac..b082e6a7402a606b72d08bc28e9b612fa6661974 100644
--- a/src/gpu/GridGenerator/grid/BoundaryConditions/BoundaryCondition.h
+++ b/src/gpu/GridGenerator/grid/BoundaryConditions/BoundaryCondition.h
@@ -44,25 +44,30 @@ class Grid;
 class Side;
 enum class SideType;
 
+namespace gg
+{
+
 class BoundaryCondition
 {
 public:
     std::vector<uint> indices;
     SPtr<Side> side;
-    std::vector<std::vector<real> > qs;
+    std::vector<std::vector<real>> qs;
 
     std::vector<uint> patches;
 
     virtual char getType() const = 0;
 
-    bool isSide( SideType side ) const;
+    bool isSide(SideType side) const;
 
-    real getQ( uint index, uint dir ){ return this->qs[index][dir]; }
+    real getQ(uint index, uint dir) { return this->qs[index][dir]; }
 };
 
+}
+
 //////////////////////////////////////////////////////////////////////////
 
-class PressureBoundaryCondition : public BoundaryCondition
+class PressureBoundaryCondition : public gg::BoundaryCondition
 {
 public:
     static SPtr<PressureBoundaryCondition> make(real rho)
@@ -91,7 +96,7 @@ public:
 
 //////////////////////////////////////////////////////////////////////////
 
-class VelocityBoundaryCondition : public BoundaryCondition
+class VelocityBoundaryCondition : public gg ::BoundaryCondition
 {
 public:
     static SPtr<VelocityBoundaryCondition> make(real vx, real vy, real vz)
diff --git a/src/gpu/GridGenerator/grid/BoundaryConditions/Side.cpp b/src/gpu/GridGenerator/grid/BoundaryConditions/Side.cpp
index 6937ac88c35e75e95c9e6a3d2973d170857dc028..99097a393735a31abad0dd717a2dcfc2b1d35326 100644
--- a/src/gpu/GridGenerator/grid/BoundaryConditions/Side.cpp
+++ b/src/gpu/GridGenerator/grid/BoundaryConditions/Side.cpp
@@ -38,6 +38,8 @@
 
 #include "utilities/math/Math.h"
 
+using namespace gg;
+
 void Side::addIndices(SPtr<Grid> grid, SPtr<BoundaryCondition> boundaryCondition, std::string coord, real constant,
                       real startInner, real endInner, real startOuter, real endOuter)
 {
diff --git a/src/gpu/GridGenerator/grid/BoundaryConditions/Side.h b/src/gpu/GridGenerator/grid/BoundaryConditions/Side.h
index 7208ece4835e7476cdb17d236bf9bfabbba0493a..d8dc9a0e4ac8c2825d49dd19148d53538d027a8e 100644
--- a/src/gpu/GridGenerator/grid/BoundaryConditions/Side.h
+++ b/src/gpu/GridGenerator/grid/BoundaryConditions/Side.h
@@ -46,7 +46,11 @@
 #define NEGATIVE_DIR -1
 
 class Grid;
+
+namespace gg
+{
 class BoundaryCondition;
+}
 
 class Side;
 
@@ -60,7 +64,7 @@ enum class SideType
 class Side
 {
 public:
-    virtual void addIndices(std::vector<SPtr<Grid> > grid, uint level, SPtr<BoundaryCondition> boundaryCondition) = 0;
+    virtual void addIndices(std::vector<SPtr<Grid> > grid, uint level, SPtr<gg::BoundaryCondition> boundaryCondition) = 0;
 
     virtual int getCoordinate() const = 0;
     virtual int getDirection() const = 0;
@@ -68,12 +72,12 @@ public:
     virtual SideType whoAmI() const = 0;
 
 protected:
-    static void addIndices(SPtr<Grid> grid, SPtr<BoundaryCondition> boundaryCondition, std::string coord, real constant,
+    static void addIndices(SPtr<Grid> grid, SPtr<gg::BoundaryCondition> boundaryCondition, std::string coord, real constant,
                            real startInner, real endInner, real startOuter, real endOuter);
 
-    static void setPressureNeighborIndices(SPtr<BoundaryCondition> boundaryCondition, SPtr<Grid> grid, const uint index);
+    static void setPressureNeighborIndices(SPtr<gg::BoundaryCondition> boundaryCondition, SPtr<Grid> grid, const uint index);
 
-    static void setQs(SPtr<Grid> grid, SPtr<BoundaryCondition> boundaryCondition, uint index);
+    static void setQs(SPtr<Grid> grid, SPtr<gg::BoundaryCondition> boundaryCondition, uint index);
 
 private:
     static uint getIndex(SPtr<Grid> grid, std::string coord, real constant, real v1, real v2);
@@ -82,7 +86,7 @@ private:
 class MX : public Side
 {
 public:
-    void addIndices(std::vector<SPtr<Grid> > grid, uint level, SPtr<BoundaryCondition> boundaryCondition) override;
+    void addIndices(std::vector<SPtr<Grid> > grid, uint level, SPtr<gg::BoundaryCondition> boundaryCondition) override;
 
     int getCoordinate() const override
     {
@@ -103,7 +107,7 @@ public:
 class PX : public Side
 {
 public:
-    void addIndices(std::vector<SPtr<Grid> > grid, uint level, SPtr<BoundaryCondition> boundaryCondition) override;
+    void addIndices(std::vector<SPtr<Grid> > grid, uint level, SPtr<gg::BoundaryCondition> boundaryCondition) override;
 
     int getCoordinate() const override
     {
@@ -125,7 +129,7 @@ public:
 class MY : public Side
 {
 public:
-    void addIndices(std::vector<SPtr<Grid> > grid, uint level, SPtr<BoundaryCondition> boundaryCondition) override;
+    void addIndices(std::vector<SPtr<Grid> > grid, uint level, SPtr<gg::BoundaryCondition> boundaryCondition) override;
 
     int getCoordinate() const override
     {
@@ -146,7 +150,7 @@ public:
 class PY : public Side
 {
 public:
-    void addIndices(std::vector<SPtr<Grid> > grid, uint level, SPtr<BoundaryCondition> boundaryCondition) override;
+    void addIndices(std::vector<SPtr<Grid> > grid, uint level, SPtr<gg::BoundaryCondition> boundaryCondition) override;
 
     int getCoordinate() const override
     {
@@ -168,7 +172,7 @@ public:
 class MZ : public Side
 {
 public:
-    void addIndices(std::vector<SPtr<Grid> > grid, uint level, SPtr<BoundaryCondition> boundaryCondition) override;
+    void addIndices(std::vector<SPtr<Grid> > grid, uint level, SPtr<gg::BoundaryCondition> boundaryCondition) override;
 
     int getCoordinate() const override
     {
@@ -189,7 +193,7 @@ public:
 class PZ : public Side
 {
 public:
-    void addIndices(std::vector<SPtr<Grid> > grid, uint level, SPtr<BoundaryCondition> boundaryCondition) override;
+    void addIndices(std::vector<SPtr<Grid> > grid, uint level, SPtr<gg::BoundaryCondition> boundaryCondition) override;
 
     int getCoordinate() const override
     {
diff --git a/src/gpu/GridGenerator/grid/GridBuilder/GridBuilder.h b/src/gpu/GridGenerator/grid/GridBuilder/GridBuilder.h
index 84b6c8062d5d4cbe77a8ac13919ba9b7ca4b976e..f398e89a0936e0e03261c7b75f22e7311d96e161 100644
--- a/src/gpu/GridGenerator/grid/GridBuilder/GridBuilder.h
+++ b/src/gpu/GridGenerator/grid/GridBuilder/GridBuilder.h
@@ -60,7 +60,10 @@ class Grid;
 
 enum class SideType;
 
+namespace gg
+{
 class BoundaryCondition;
+}
 class GeometryBoundaryCondition;
 
 class GridBuilder
@@ -93,7 +96,7 @@ public:
     virtual void getVelocityValues(real* vx, real* vy, real* vz, int* indices, int level) const = 0;
     virtual void getVelocityQs(real* qs[27], int level) const = 0;
 
-    virtual SPtr<BoundaryCondition> getBoundaryCondition( SideType side, uint level ) const = 0;
+    virtual SPtr<gg::BoundaryCondition> getBoundaryCondition( SideType side, uint level ) const = 0;
 
 };
 
diff --git a/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.cpp b/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.cpp
index 09debba7d011ec20c389ae9657b8a46185863b58..7cf735197b9fd69c7f85351e659fb31c4818efd6 100644
--- a/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.cpp
+++ b/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.cpp
@@ -227,7 +227,7 @@ void LevelGridBuilder::getVelocityQs(real* qs[27], int level) const
     }
 }
 
-GRIDGENERATOR_EXPORT SPtr<BoundaryCondition> LevelGridBuilder::getBoundaryCondition(SideType side, uint level) const
+GRIDGENERATOR_EXPORT SPtr<gg::BoundaryCondition> LevelGridBuilder::getBoundaryCondition(SideType side, uint level) const
 {
     for( auto bc : this->boundaryConditions[level]->velocityBoundaryConditions )
         if( bc->isSide(side) )
diff --git a/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.h b/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.h
index dcf491070de02ed4d700e778d0d883ce2b9b959e..100dedede48d962174b852b88ede465777bc25c2 100644
--- a/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.h
+++ b/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.h
@@ -87,7 +87,7 @@ public:
     GRIDGENERATOR_EXPORT virtual void getVelocityValues(real* vx, real* vy, real* vz, int* indices, int level) const;
     GRIDGENERATOR_EXPORT virtual void getVelocityQs(real* qs[27], int level) const;
 
-    GRIDGENERATOR_EXPORT SPtr<BoundaryCondition> getBoundaryCondition( SideType side, uint level ) const override;
+    GRIDGENERATOR_EXPORT SPtr<gg::BoundaryCondition> getBoundaryCondition( SideType side, uint level ) const override;
 
 protected:
     
diff --git a/src/gpu/VirtualFluids_GPU/CMakeLists.txt b/src/gpu/VirtualFluids_GPU/CMakeLists.txt
index 477a8cc73b17873dfbfc0e11173db0ff5ad5593d..95405b25864f506e5580fa6711a4389d39d8d7a7 100644
--- a/src/gpu/VirtualFluids_GPU/CMakeLists.txt
+++ b/src/gpu/VirtualFluids_GPU/CMakeLists.txt
@@ -1,21 +1,14 @@
+project(VirtualFluids_GPU LANGUAGES CUDA CXX)
 
+set(additional_libraries "")
+if(MSVC)
+    set(additional_libraries ws2_32 Traffic) # ws2_32 throws an error on Phoenix
+endif()
 
-IF(MSVC)
-    set(libsToLink ws2_32 GridGenerator basics) # ws_32 throws an error on Phoenix
-ELSE(MSVC)
-    set(libsToLink GridGenerator basics)
-ENDIF(MSVC)
+vf_add_library(PRIVATE_LINK ${additional_libraries} GridGenerator basics MPI::MPI_CXX)
 
-
-vf_add_library(BUILDTYPE shared PRIVATE_LINK ${libsToLink})
-
-linkCUDA()
+linkBoost(COMPONENTS "serialization")
 
 #SET(TPN_WIN32 "/EHsc")
 #https://stackoverflow.com/questions/6832666/lnk2019-when-including-asio-headers-solution-generated-with-cmake
 #https://stackoverflow.com/questions/27442885/syntax-error-with-stdnumeric-limitsmax
-
-IF(MSVC)
-    vf_get_library_name(library_name)
-    set_target_properties(${library_name} PROPERTIES LINK_FLAGS "/ignore:4251")
-ENDIF(MSVC)