From da92c5b46be7818540075b0ff6d1a3fbc40581d7 Mon Sep 17 00:00:00 2001
From: Soeren Peters <peters@irmb.tu-bs.de>
Date: Mon, 29 Mar 2021 14:51:15 +0000
Subject: [PATCH] Add CUDA LTO.

---
 CMakeLists.txt                                |   19 +-
 apps/gpu/LBM/DrivenCavity/CMakeLists.txt      |    6 +-
 src/gpu/GridGenerator/CMakeLists.txt          |    2 +-
 src/gpu/VirtualFluids_GPU/CMakeLists.txt      |    6 +
 .../VirtualFluids_GPU/GPU/Cumulant27chim.cu   | 1338 -----------------
 .../GPU/CumulantK17ChimeraKernel.cu           |  742 +++++++++
 src/lbm/CMakeLists.txt                        |    4 +-
 src/lbm/CumulantChimeraPreCompiled.cu         |  433 ++++++
 src/lbm/CumulantChimeraPreCompiled.h          |   59 +
 9 files changed, 1263 insertions(+), 1346 deletions(-)
 create mode 100644 src/gpu/VirtualFluids_GPU/GPU/CumulantK17ChimeraKernel.cu
 create mode 100644 src/lbm/CumulantChimeraPreCompiled.cu
 create mode 100644 src/lbm/CumulantChimeraPreCompiled.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2c85fcf7e..e1d5b0736 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -94,12 +94,21 @@ if(BUILD_VF_GPU)
 
     set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
 
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcudafe --display_error_number")
 
-    if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-        message(WARNING "CMAKE_CUDA_ARCHITECTURES was not defined and is set to 30 (CUDA support until 10.1 only).")
-        set(CMAKE_CUDA_ARCHITECTURES 30)
-    endif()
+    #set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -dlto")
+    enable_language(CUDA)
+    #set(CMAKE_CUDA_FLAGS "-gencode arch=compute_70,code=lto_70")
+
+    set(CMAKE_CUDA_FLAGS "-dlto -arch=sm_75")
+    #set(CMAKE_CUDA_ARCHITECTURES 75)
+    #set(CMAKE_CUDA_FLAGS "-arch=sm_75")
+
+    #set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcudafe --display_error_number")
+
+    #if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+    #    message(WARNING "CMAKE_CUDA_ARCHITECTURES was not defined and is set to 30 (CUDA support until 10.1 only).")
+    #    set(CMAKE_CUDA_ARCHITECTURES 30)
+    #endif()
 
     message("CUDA Architecture: ${CMAKE_CUDA_ARCHITECTURES}")
 endif()
diff --git a/apps/gpu/LBM/DrivenCavity/CMakeLists.txt b/apps/gpu/LBM/DrivenCavity/CMakeLists.txt
index c72393791..10873fcd4 100644
--- a/apps/gpu/LBM/DrivenCavity/CMakeLists.txt
+++ b/apps/gpu/LBM/DrivenCavity/CMakeLists.txt
@@ -6,4 +6,8 @@ vf_add_library(BUILDTYPE binary PRIVATE_LINK basics VirtualFluids_GPU GridGenera
 
 set_source_files_properties(DrivenCavity.cpp PROPERTIES LANGUAGE CUDA)
 
-set_target_properties(DrivenCavity PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
\ No newline at end of file
+set_target_properties(DrivenCavity PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+
+#target_compile_options(DrivenCavity PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-dlto>)
+
+#target_link_options(DrivenCavity PRIVATE $<DEVICE_LINK:-dlto>)
\ No newline at end of file
diff --git a/src/gpu/GridGenerator/CMakeLists.txt b/src/gpu/GridGenerator/CMakeLists.txt
index 29d77897a..84dd965d7 100644
--- a/src/gpu/GridGenerator/CMakeLists.txt
+++ b/src/gpu/GridGenerator/CMakeLists.txt
@@ -9,7 +9,7 @@ set_target_properties(${library_name} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
 # according to linker error when building static libraries.
 # https://stackoverflow.com/questions/50033435/cmake-cuda-separate-compilation-static-lib-link-error-on-windows-but-not-on-ubun
 if (NOT BUILD_SHARED_LIBRARY)
-    set_target_properties(${library_name} PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+    #set_target_properties(${library_name} PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 endif()
 
 # we want to suppress all cuda warnings so far for this library.
diff --git a/src/gpu/VirtualFluids_GPU/CMakeLists.txt b/src/gpu/VirtualFluids_GPU/CMakeLists.txt
index c2d897563..bda286187 100644
--- a/src/gpu/VirtualFluids_GPU/CMakeLists.txt
+++ b/src/gpu/VirtualFluids_GPU/CMakeLists.txt
@@ -13,3 +13,9 @@ linkBoost(COMPONENTS "serialization")
 #SET(TPN_WIN32 "/EHsc")
 #https://stackoverflow.com/questions/6832666/lnk2019-when-including-asio-headers-solution-generated-with-cmake
 #https://stackoverflow.com/questions/27442885/syntax-error-with-stdnumeric-limitsmax
+
+set_target_properties(VirtualFluids_GPU PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+
+#target_compile_options(VirtualFluids_GPU PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-dlto>)
+
+#target_link_options(VirtualFluids_GPU PRIVATE $<DEVICE_LINK:-dlto>)
diff --git a/src/gpu/VirtualFluids_GPU/GPU/Cumulant27chim.cu b/src/gpu/VirtualFluids_GPU/GPU/Cumulant27chim.cu
index 49e2c10a2..d34d545dc 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/Cumulant27chim.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/Cumulant27chim.cu
@@ -37,1344 +37,6 @@
 #include "math.h"
 
 #include <lbm/Chimera.h>
-#include <lbm/CumulantChimera.h>
-
-
-
-__device__ Distributions27 getDistributions27(real* distributions, unsigned int size_Mat, bool isEvenTimestep)
-{
-    Distributions27 dist;
-    if (isEvenTimestep)
-    {
-        dist.f[dirE   ] = &distributions[dirE   *size_Mat];
-        dist.f[dirW   ] = &distributions[dirW   *size_Mat];
-        dist.f[dirN   ] = &distributions[dirN   *size_Mat];
-        dist.f[dirS   ] = &distributions[dirS   *size_Mat];
-        dist.f[dirT   ] = &distributions[dirT   *size_Mat];
-        dist.f[dirB   ] = &distributions[dirB   *size_Mat];
-        dist.f[dirNE  ] = &distributions[dirNE  *size_Mat];
-        dist.f[dirSW  ] = &distributions[dirSW  *size_Mat];
-        dist.f[dirSE  ] = &distributions[dirSE  *size_Mat];
-        dist.f[dirNW  ] = &distributions[dirNW  *size_Mat];
-        dist.f[dirTE  ] = &distributions[dirTE  *size_Mat];
-        dist.f[dirBW  ] = &distributions[dirBW  *size_Mat];
-        dist.f[dirBE  ] = &distributions[dirBE  *size_Mat];
-        dist.f[dirTW  ] = &distributions[dirTW  *size_Mat];
-        dist.f[dirTN  ] = &distributions[dirTN  *size_Mat];
-        dist.f[dirBS  ] = &distributions[dirBS  *size_Mat];
-        dist.f[dirBN  ] = &distributions[dirBN  *size_Mat];
-        dist.f[dirTS  ] = &distributions[dirTS  *size_Mat];
-        dist.f[dirREST] = &distributions[dirREST*size_Mat];
-        dist.f[dirTNE ] = &distributions[dirTNE *size_Mat];
-        dist.f[dirTSW ] = &distributions[dirTSW *size_Mat];
-        dist.f[dirTSE ] = &distributions[dirTSE *size_Mat];
-        dist.f[dirTNW ] = &distributions[dirTNW *size_Mat];
-        dist.f[dirBNE ] = &distributions[dirBNE *size_Mat];
-        dist.f[dirBSW ] = &distributions[dirBSW *size_Mat];
-        dist.f[dirBSE ] = &distributions[dirBSE *size_Mat];
-        dist.f[dirBNW ] = &distributions[dirBNW *size_Mat];
-    }
-    else
-    {
-        dist.f[dirW   ] = &distributions[dirE   *size_Mat];
-        dist.f[dirE   ] = &distributions[dirW   *size_Mat];
-        dist.f[dirS   ] = &distributions[dirN   *size_Mat];
-        dist.f[dirN   ] = &distributions[dirS   *size_Mat];
-        dist.f[dirB   ] = &distributions[dirT   *size_Mat];
-        dist.f[dirT   ] = &distributions[dirB   *size_Mat];
-        dist.f[dirSW  ] = &distributions[dirNE  *size_Mat];
-        dist.f[dirNE  ] = &distributions[dirSW  *size_Mat];
-        dist.f[dirNW  ] = &distributions[dirSE  *size_Mat];
-        dist.f[dirSE  ] = &distributions[dirNW  *size_Mat];
-        dist.f[dirBW  ] = &distributions[dirTE  *size_Mat];
-        dist.f[dirTE  ] = &distributions[dirBW  *size_Mat];
-        dist.f[dirTW  ] = &distributions[dirBE  *size_Mat];
-        dist.f[dirBE  ] = &distributions[dirTW  *size_Mat];
-        dist.f[dirBS  ] = &distributions[dirTN  *size_Mat];
-        dist.f[dirTN  ] = &distributions[dirBS  *size_Mat];
-        dist.f[dirTS  ] = &distributions[dirBN  *size_Mat];
-        dist.f[dirBN  ] = &distributions[dirTS  *size_Mat];
-        dist.f[dirREST] = &distributions[dirREST*size_Mat];
-        dist.f[dirBSW ] = &distributions[dirTNE *size_Mat];
-        dist.f[dirBNE ] = &distributions[dirTSW *size_Mat];
-        dist.f[dirBNW ] = &distributions[dirTSE *size_Mat];
-        dist.f[dirBSE ] = &distributions[dirTNW *size_Mat];
-        dist.f[dirTSW ] = &distributions[dirBNE *size_Mat];
-        dist.f[dirTNE ] = &distributions[dirBSW *size_Mat];
-        dist.f[dirTNW ] = &distributions[dirBSE *size_Mat];
-        dist.f[dirTSE ] = &distributions[dirBNW *size_Mat];
-    }
-    return dist;
-}
-
-struct DistributionWrapper
-{
-    __device__ DistributionWrapper(
-        real* distributions,
-        unsigned int size_Mat,
-        bool isEvenTimestep,
-        uint k,
-        uint* neighborX,
-        uint* neighborY,
-        uint* neighborZ) :
-        dist(getDistributions27(distributions, size_Mat, isEvenTimestep)),
-        k(k),
-        kw  (neighborX[k]),
-        ks  (neighborY[k]),
-        kb  (neighborZ[k]),
-        ksw (neighborY[kw]),
-        kbw (neighborZ[kw]),
-        kbs (neighborZ[ks]),
-        kbsw(neighborZ[ksw])
-    { 
-        read();
-    }
-
-    __device__ void read()
-    {
-        distribution.f[VF::LBM::DIR::PZZ] = (dist.f[dirE   ])[k];
-        distribution.f[VF::LBM::DIR::MZZ] = (dist.f[dirW   ])[kw];
-        distribution.f[VF::LBM::DIR::ZPZ] = (dist.f[dirN   ])[k];
-        distribution.f[VF::LBM::DIR::ZMZ] = (dist.f[dirS   ])[ks];
-        distribution.f[VF::LBM::DIR::ZZP] = (dist.f[dirT   ])[k];
-        distribution.f[VF::LBM::DIR::ZZM] = (dist.f[dirB   ])[kb];
-        distribution.f[VF::LBM::DIR::PPZ] = (dist.f[dirNE  ])[k];
-        distribution.f[VF::LBM::DIR::MMZ] = (dist.f[dirSW  ])[ksw];
-        distribution.f[VF::LBM::DIR::PMZ] = (dist.f[dirSE  ])[ks];
-        distribution.f[VF::LBM::DIR::MPZ] = (dist.f[dirNW  ])[kw];
-        distribution.f[VF::LBM::DIR::PZP] = (dist.f[dirTE  ])[k];
-        distribution.f[VF::LBM::DIR::MZM] = (dist.f[dirBW  ])[kbw];
-        distribution.f[VF::LBM::DIR::PZM] = (dist.f[dirBE  ])[kb];
-        distribution.f[VF::LBM::DIR::MZP] = (dist.f[dirTW  ])[kw];
-        distribution.f[VF::LBM::DIR::ZPP] = (dist.f[dirTN  ])[k];
-        distribution.f[VF::LBM::DIR::ZMM] = (dist.f[dirBS  ])[kbs];
-        distribution.f[VF::LBM::DIR::ZPM] = (dist.f[dirBN  ])[kb];
-        distribution.f[VF::LBM::DIR::ZMP] = (dist.f[dirTS  ])[ks];
-        distribution.f[VF::LBM::DIR::PPP] = (dist.f[dirTNE ])[k];
-        distribution.f[VF::LBM::DIR::MPP] = (dist.f[dirTNW ])[kw];
-        distribution.f[VF::LBM::DIR::PMP] = (dist.f[dirTSE ])[ks];
-        distribution.f[VF::LBM::DIR::MMP] = (dist.f[dirTSW ])[ksw];
-        distribution.f[VF::LBM::DIR::PPM] = (dist.f[dirBNE ])[kb];
-        distribution.f[VF::LBM::DIR::MPM] = (dist.f[dirBNW ])[kbw];
-        distribution.f[VF::LBM::DIR::PMM] = (dist.f[dirBSE ])[kbs];
-        distribution.f[VF::LBM::DIR::MMM] = (dist.f[dirBSW ])[kbsw];
-        distribution.f[VF::LBM::DIR::ZZZ] = (dist.f[dirREST])[k];
-    }
-
-    __device__ void write()
-    {
-        (dist.f[dirE   ])[k]    = distribution.f[VF::LBM::DIR::PZZ];
-        (dist.f[dirW   ])[kw]   = distribution.f[VF::LBM::DIR::MZZ];
-        (dist.f[dirN   ])[k]    = distribution.f[VF::LBM::DIR::ZPZ];
-        (dist.f[dirS   ])[ks]   = distribution.f[VF::LBM::DIR::ZMZ];
-        (dist.f[dirT   ])[k]    = distribution.f[VF::LBM::DIR::ZZP];
-        (dist.f[dirB   ])[kb]   = distribution.f[VF::LBM::DIR::ZZM];
-        (dist.f[dirNE  ])[k]    = distribution.f[VF::LBM::DIR::PPZ];
-        (dist.f[dirSW  ])[ksw]  = distribution.f[VF::LBM::DIR::MMZ];
-        (dist.f[dirSE  ])[ks]   = distribution.f[VF::LBM::DIR::PMZ];
-        (dist.f[dirNW  ])[kw]   = distribution.f[VF::LBM::DIR::MPZ];
-        (dist.f[dirTE  ])[k]    = distribution.f[VF::LBM::DIR::PZP];
-        (dist.f[dirBW  ])[kbw]  = distribution.f[VF::LBM::DIR::MZM];
-        (dist.f[dirBE  ])[kb]   = distribution.f[VF::LBM::DIR::PZM];
-        (dist.f[dirTW  ])[kw]   = distribution.f[VF::LBM::DIR::MZP];
-        (dist.f[dirTN  ])[k]    = distribution.f[VF::LBM::DIR::ZPP];
-        (dist.f[dirBS  ])[kbs]  = distribution.f[VF::LBM::DIR::ZMM];
-        (dist.f[dirBN  ])[kb]   = distribution.f[VF::LBM::DIR::ZPM];
-        (dist.f[dirTS  ])[ks]   = distribution.f[VF::LBM::DIR::ZMP];
-        (dist.f[dirTNE ])[k]    = distribution.f[VF::LBM::DIR::PPP];
-        (dist.f[dirTNW ])[kw]   = distribution.f[VF::LBM::DIR::MPP];
-        (dist.f[dirTSE ])[ks]   = distribution.f[VF::LBM::DIR::PMP];
-        (dist.f[dirTSW ])[ksw]  = distribution.f[VF::LBM::DIR::MMP];
-        (dist.f[dirBNE ])[kb]   = distribution.f[VF::LBM::DIR::PPM];
-        (dist.f[dirBNW ])[kbw]  = distribution.f[VF::LBM::DIR::MPM];
-        (dist.f[dirBSE ])[kbs]  = distribution.f[VF::LBM::DIR::PMM];
-        (dist.f[dirBSW ])[kbsw] = distribution.f[VF::LBM::DIR::MMM];
-        (dist.f[dirREST])[k]    = distribution.f[VF::LBM::DIR::ZZZ];
-    }
-
-    Distributions27 dist;
-
-    VF::LBM::Distribution27 distribution;
-
-    const uint k;
-    const uint kw;
-    const uint ks;
-    const uint kb;
-    const uint ksw;
-    const uint kbw;
-    const uint kbs;
-    const uint kbsw;
-};
-
-__device__ unsigned int getNodeIndex()
-{
-    const unsigned  x = threadIdx.x; 
-    const unsigned  y = blockIdx.x;  
-    const unsigned  z = blockIdx.y;  
-
-    const unsigned nx = blockDim.x;
-    const unsigned ny = gridDim.x;
-
-    return nx*(ny*z + y) + x;
-}
-
-__device__ bool isValidFluidNode(uint k, int size_Mat, uint nodeType)
-{
-    return (k < size_Mat) && (nodeType == GEO_FLUID);
-}
-
-__device__ void getLevelForce(real fx, real fy, real fz, int level, real* forces)
-{
-    real fx_t {1.}, fy_t {1.}, fz_t {1.};
-    for (int i = 0; i < level; i++)
-    {
-        fx_t *= c2o1;
-        fy_t *= c2o1;
-        fz_t *= c2o1;
-    }
-
-    forces[0] = fx / fx_t;
-    forces[1] = fy / fy_t;
-    forces[2] = fz / fz_t;
-}
-
-
-extern "C" __global__ void Cumulant_K17_LBM_Device_Kernel(
-    real omega,
-    uint* typeOfGridNode,
-    uint* neighborX,
-    uint* neighborY,
-    uint* neighborZ,
-    real* distributions,
-    int size_Mat,
-    int level,
-    real* forces,
-    bool isEvenTimestep)
-{
-    const uint k = getNodeIndex();
-    const uint nodeType = typeOfGridNode[k];
-
-    if (isValidFluidNode(k, size_Mat, nodeType))
-    {
-        DistributionWrapper distributionWrapper {
-            distributions, size_Mat, isEvenTimestep, k, neighborX, neighborY, neighborZ
-        };
-
-        real level_forces[3];
-        getLevelForce(forces[0], forces[1], forces[2], level, level_forces);
-
-        VF::LBM::cumulantChimera(distributionWrapper.distribution, omega, level_forces);
-
-        distributionWrapper.write();
-    }
-}
-
-
-
-////////////////////////////////////////////////////////////////////////////////
-extern "C" __global__ void Cumulant_K17_LBM_Device_Kernel_old(
-    real omega,
-    uint* typeOfGridNode,
-    uint* neighborX,
-    uint* neighborY,
-    uint* neighborZ,
-    real* distributions,
-    int size_Mat,
-    int level,
-    real* forces,
-    bool isEvenTimestep)
-{
-    //////////////////////////////////////////////////////////////////////////
-    //! Cumulant K17 Kernel is based on \ref
-    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
-    //! and \ref
-    //! <a href="https://doi.org/10.1016/j.jcp.2017.07.004"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.07.004 ]</b></a>
-    //!
-    //! The cumulant kernel is executed in the following steps
-    //!
-    ////////////////////////////////////////////////////////////////////////////////
-    //! - Get node index coordinates from thredIdx, blockIdx, blockDim and gridDim.
-    //!
-    const unsigned  x = threadIdx.x; 
-    const unsigned  y = blockIdx.x;  
-    const unsigned  z = blockIdx.y;  
-
-    const unsigned nx = blockDim.x;
-    const unsigned ny = gridDim.x;
-
-    const unsigned k = nx*(ny*z + y) + x;
-
-    //////////////////////////////////////////////////////////////////////////
-    // run for all indices in size_Mat and fluid nodes
-    if ((k < size_Mat) && (typeOfGridNode[k] == GEO_FLUID))
-    {
-        //////////////////////////////////////////////////////////////////////////
-        //! - Read distributions: style of reading and writing the distributions from/to stored arrays dependent on timestep is based on the esoteric twist algorithm \ref
-        //! <a href="https://doi.org/10.3390/computation5020019"><b>[ M. Geier et al. (2017), DOI:10.3390/computation5020019 ]</b></a>
-        //!
-        Distributions27 dist = getDistributions27(distributions, size_Mat, isEvenTimestep);
-
-        ////////////////////////////////////////////////////////////////////////////////
-        //! - Set neighbor indices (necessary for indirect addressing) 
-        uint kw   = neighborX[k];
-        uint ks   = neighborY[k];
-        uint kb   = neighborZ[k];
-        uint ksw  = neighborY[kw];
-        uint kbw  = neighborZ[kw];
-        uint kbs  = neighborZ[ks];
-        uint kbsw = neighborZ[ksw];
-        ////////////////////////////////////////////////////////////////////////////////////
-        //! - Set local distributions
-        //!
-        // real mfcbb = (dist.f[dirE   ])[k];
-        // real mfabb = (dist.f[dirW   ])[kw];
-        // real mfbcb = (dist.f[dirN   ])[k];
-        // real mfbab = (dist.f[dirS   ])[ks];
-        // real mfbbc = (dist.f[dirT   ])[k];
-        // real mfbba = (dist.f[dirB   ])[kb];
-        // real mfccb = (dist.f[dirNE  ])[k];
-        // real mfaab = (dist.f[dirSW  ])[ksw];
-        // real mfcab = (dist.f[dirSE  ])[ks];
-        // real mfacb = (dist.f[dirNW  ])[kw];
-        // real mfcbc = (dist.f[dirTE  ])[k];
-        // real mfaba = (dist.f[dirBW  ])[kbw];
-        // real mfcba = (dist.f[dirBE  ])[kb];
-        // real mfabc = (dist.f[dirTW  ])[kw];
-        // real mfbcc = (dist.f[dirTN  ])[k];
-        // real mfbaa = (dist.f[dirBS  ])[kbs];
-        // real mfbca = (dist.f[dirBN  ])[kb];
-        // real mfbac = (dist.f[dirTS  ])[ks];
-        // real mfbbb = (dist.f[dirREST])[k];
-        // real mfccc = (dist.f[dirTNE ])[k];
-        // real mfaac = (dist.f[dirTSW ])[ksw];
-        // real mfcac = (dist.f[dirTSE ])[ks];
-        // real mfacc = (dist.f[dirTNW ])[kw];
-        // real mfcca = (dist.f[dirBNE ])[kb];
-        // real mfaaa = (dist.f[dirBSW ])[kbsw];
-        // real mfcaa = (dist.f[dirBSE ])[kbs];
-        // real mfaca = (dist.f[dirBNW ])[kbw];
-
-        VF::LBM::Distribution27 distribution_in;
-
-        distribution_in.f[VF::LBM::DIR::PZZ] = (dist.f[dirE   ])[k];
-        distribution_in.f[VF::LBM::DIR::MZZ] = (dist.f[dirW   ])[kw];
-        distribution_in.f[VF::LBM::DIR::ZPZ] = (dist.f[dirN   ])[k];
-        distribution_in.f[VF::LBM::DIR::ZMZ] = (dist.f[dirS   ])[ks];
-        distribution_in.f[VF::LBM::DIR::ZZP] = (dist.f[dirT   ])[k];
-        distribution_in.f[VF::LBM::DIR::ZZM] = (dist.f[dirB   ])[kb];
-        distribution_in.f[VF::LBM::DIR::PPZ] = (dist.f[dirNE  ])[k];
-        distribution_in.f[VF::LBM::DIR::MMZ] = (dist.f[dirSW  ])[ksw];
-        distribution_in.f[VF::LBM::DIR::PMZ] = (dist.f[dirSE  ])[ks];
-        distribution_in.f[VF::LBM::DIR::MPZ] = (dist.f[dirNW  ])[kw];
-        distribution_in.f[VF::LBM::DIR::PZP] = (dist.f[dirTE  ])[k];
-        distribution_in.f[VF::LBM::DIR::MZM] = (dist.f[dirBW  ])[kbw];
-        distribution_in.f[VF::LBM::DIR::PZM] = (dist.f[dirBE  ])[kb];
-        distribution_in.f[VF::LBM::DIR::MZP] = (dist.f[dirTW  ])[kw];
-        distribution_in.f[VF::LBM::DIR::ZPP] = (dist.f[dirTN  ])[k];
-        distribution_in.f[VF::LBM::DIR::ZMM] = (dist.f[dirBS  ])[kbs];
-        distribution_in.f[VF::LBM::DIR::ZPM] = (dist.f[dirBN  ])[kb];
-        distribution_in.f[VF::LBM::DIR::ZMP] = (dist.f[dirTS  ])[ks];
-        distribution_in.f[VF::LBM::DIR::PPP] = (dist.f[dirTNE ])[k];
-        distribution_in.f[VF::LBM::DIR::MPP] = (dist.f[dirTNW ])[kw];
-        distribution_in.f[VF::LBM::DIR::PMP] = (dist.f[dirTSE ])[ks];
-        distribution_in.f[VF::LBM::DIR::MMP] = (dist.f[dirTSW ])[ksw];
-        distribution_in.f[VF::LBM::DIR::PPM] = (dist.f[dirBNE ])[kb];
-        distribution_in.f[VF::LBM::DIR::MPM] = (dist.f[dirBNW ])[kbw];
-        distribution_in.f[VF::LBM::DIR::PMM] = (dist.f[dirBSE ])[kbs];
-        distribution_in.f[VF::LBM::DIR::MMM] = (dist.f[dirBSW ])[kbsw];
-        distribution_in.f[VF::LBM::DIR::ZZZ] = (dist.f[dirREST])[k];
-
-        real mfcbb = distribution_in.f[VF::LBM::DIR::PZZ];
-        real mfabb = distribution_in.f[VF::LBM::DIR::MZZ];
-        real mfbcb = distribution_in.f[VF::LBM::DIR::ZPZ];
-        real mfbab = distribution_in.f[VF::LBM::DIR::ZMZ];
-        real mfbbc = distribution_in.f[VF::LBM::DIR::ZZP];
-        real mfbba = distribution_in.f[VF::LBM::DIR::ZZM];
-        real mfccb = distribution_in.f[VF::LBM::DIR::PPZ];
-        real mfaab = distribution_in.f[VF::LBM::DIR::MMZ];
-        real mfcab = distribution_in.f[VF::LBM::DIR::PMZ];
-        real mfacb = distribution_in.f[VF::LBM::DIR::MPZ];
-        real mfcbc = distribution_in.f[VF::LBM::DIR::PZP];
-        real mfaba = distribution_in.f[VF::LBM::DIR::MZM];
-        real mfcba = distribution_in.f[VF::LBM::DIR::PZM];
-        real mfabc = distribution_in.f[VF::LBM::DIR::MZP];
-        real mfbcc = distribution_in.f[VF::LBM::DIR::ZPP];
-        real mfbaa = distribution_in.f[VF::LBM::DIR::ZMM];
-        real mfbca = distribution_in.f[VF::LBM::DIR::ZPM];
-        real mfbac = distribution_in.f[VF::LBM::DIR::ZMP];
-        real mfccc = distribution_in.f[VF::LBM::DIR::PPP];
-        real mfacc = distribution_in.f[VF::LBM::DIR::MPP];
-        real mfcac = distribution_in.f[VF::LBM::DIR::PMP];
-        real mfaac = distribution_in.f[VF::LBM::DIR::MMP];
-        real mfcca = distribution_in.f[VF::LBM::DIR::PPM];
-        real mfaca = distribution_in.f[VF::LBM::DIR::MPM];
-        real mfcaa = distribution_in.f[VF::LBM::DIR::PMM];
-        real mfaaa = distribution_in.f[VF::LBM::DIR::MMM];
-        real mfbbb = distribution_in.f[VF::LBM::DIR::ZZZ];
-
-
-    ////////////////////////////////////////////////////////////////////////////////////
-    //! - Calculate density and velocity using pyramid summation for low round-off errors as in Eq. (J1)-(J3) \ref
-    //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015), DOI:10.1016/j.camwa  2015.05.001 ]</b></a>
-    //!
-    real drho =
-        ((((mfccc + mfaaa) + (mfaca + mfcac)) + ((mfacc + mfcaa) + (mfaac + mfcca))) +
-        (((mfbac + mfbca) + (mfbaa + mfbcc)) + ((mfabc + mfcba) + (mfaba + mfcbc)) + ((mfacb + mfcab) + (mfaab + mfccb))) +
-        ((mfabb + mfcbb) + (mfbab + mfbcb) + (mfbba + mfbbc))) + mfbbb; 
-    real rho = c1o1 + drho;
-    real OOrho = c1o1 / rho;    
-    real vvx = 
-        ((((mfccc - mfaaa) + (mfcac - mfaca)) + ((mfcaa - mfacc) + (mfcca - mfaac))) +
-        (((mfcba - mfabc) + (mfcbc - mfaba)) + ((mfcab - mfacb) + (mfccb - mfaab))) +
-        (mfcbb - mfabb)) * OOrho;
-    real vvy = 
-        ((((mfccc - mfaaa) + (mfaca - mfcac)) + ((mfacc - mfcaa) + (mfcca - mfaac))) +
-        (((mfbca - mfbac) + (mfbcc - mfbaa)) + ((mfacb - mfcab) + (mfccb - mfaab))) +
-        (mfbcb - mfbab)) * OOrho;
-    real vvz = 
-        ((((mfccc - mfaaa) + (mfcac - mfaca)) + ((mfacc - mfcaa) + (mfaac - mfcca))) +
-        (((mfbac - mfbca) + (mfbcc - mfbaa)) + ((mfabc - mfcba) + (mfcbc - mfaba))) +
-        (mfbbc - mfbba)) * OOrho;
-    ////////////////////////////////////////////////////////////////////////////////////
-    //! - Add half of the acceleration (body force) to the velocity as in Eq. (42) \ref
-    //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015), DOI:10.1016/j.camwa  2015.05.001 ]</b></a>
-    //!
-    real fx = forces[0];
-    real fy = forces[1];
-    real fz = forces[2];
-
-    real fx_t {1.}, fy_t {1.}, fz_t {1.};
-    for (int i = 0; i < level; i++)
-    {
-        fx_t *= c2o1;
-        fy_t *= c2o1;
-        fz_t *= c2o1;
-    }
-
-    fx /= fx_t;
-    fy /= fy_t;
-    fz /= fz_t;
-    //real forces[3] {fx, fy, fz};
-
-    vvx += fx * c1o2;
-    vvy += fy * c1o2;
-    vvz += fz * c1o2;
-    ////////////////////////////////////////////////////////////////////////////////////
-    // calculate the square of velocities for this lattice node
-    real vx2 = vvx*vvx;
-    real vy2 = vvy*vvy;
-    real vz2 = vvz*vvz;
-    ////////////////////////////////////////////////////////////////////////////////////
-    //! - Set relaxation limiters for third order cumulants to default value \f$ \lambda=0.001 \f$ according to section 6 in \ref
-    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
-    //!
-    real wadjust;
-    real qudricLimitP = c1o100;
-    real qudricLimitM = c1o100;
-    real qudricLimitD = c1o100;
-    ////////////////////////////////////////////////////////////////////////////////////
-    //! - Chimera transform from well conditioned distributions to central moments as defined in Appendix J in \ref
-    //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015), DOI:10.1016/j.camwa  2015.05.001 ]</b></a>
-    //! see also Eq. (6)-(14) in \ref
-    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
-    //!
-    ////////////////////////////////////////////////////////////////////////////////////
-    // Z - Dir
-    VF::LBM::forwardInverseChimeraWithK(mfaaa, mfaab, mfaac, vvz, vz2, c36o1, c1o36);
-    VF::LBM::forwardInverseChimeraWithK(mfaba, mfabb, mfabc, vvz, vz2,  c9o1,  c1o9);
-    VF::LBM::forwardInverseChimeraWithK(mfaca, mfacb, mfacc, vvz, vz2, c36o1, c1o36);
-    VF::LBM::forwardInverseChimeraWithK(mfbaa, mfbab, mfbac, vvz, vz2,  c9o1,  c1o9);
-    VF::LBM::forwardInverseChimeraWithK(mfbba, mfbbb, mfbbc, vvz, vz2,  c9o4,  c4o9);
-    VF::LBM::forwardInverseChimeraWithK(mfbca, mfbcb, mfbcc, vvz, vz2,  c9o1,  c1o9);
-    VF::LBM::forwardInverseChimeraWithK(mfcaa, mfcab, mfcac, vvz, vz2, c36o1, c1o36);
-    VF::LBM::forwardInverseChimeraWithK(mfcba, mfcbb, mfcbc, vvz, vz2,  c9o1,  c1o9);
-    VF::LBM::forwardInverseChimeraWithK(mfcca, mfccb, mfccc, vvz, vz2, c36o1, c1o36);   
-    ////////////////////////////////////////////////////////////////////////////////////
-    // Y - Dir
-    VF::LBM::forwardInverseChimeraWithK(mfaaa, mfaba, mfaca, vvy, vy2,  c6o1,  c1o6);
-    VF::LBM::forwardChimera(            mfaab, mfabb, mfacb, vvy, vy2);
-    VF::LBM::forwardInverseChimeraWithK(mfaac, mfabc, mfacc, vvy, vy2, c18o1, c1o18);
-    VF::LBM::forwardInverseChimeraWithK(mfbaa, mfbba, mfbca, vvy, vy2,  c3o2,  c2o3);
-    VF::LBM::forwardChimera(            mfbab, mfbbb, mfbcb, vvy, vy2);
-    VF::LBM::forwardInverseChimeraWithK(mfbac, mfbbc, mfbcc, vvy, vy2,  c9o2,  c2o9);
-    VF::LBM::forwardInverseChimeraWithK(mfcaa, mfcba, mfcca, vvy, vy2,  c6o1,  c1o6);
-    VF::LBM::forwardChimera(            mfcab, mfcbb, mfccb, vvy, vy2);
-    VF::LBM::forwardInverseChimeraWithK(mfcac, mfcbc, mfccc, vvy, vy2, c18o1, c1o18);   
-    ////////////////////////////////////////////////////////////////////////////////////
-    // X - Dir
-    VF::LBM::forwardInverseChimeraWithK(mfaaa, mfbaa, mfcaa, vvx, vx2, c1o1, c1o1);
-    VF::LBM::forwardChimera(            mfaba, mfbba, mfcba, vvx, vx2);
-    VF::LBM::forwardInverseChimeraWithK(mfaca, mfbca, mfcca, vvx, vx2, c3o1, c1o3);
-    VF::LBM::forwardChimera(            mfaab, mfbab, mfcab, vvx, vx2);
-    VF::LBM::forwardChimera(            mfabb, mfbbb, mfcbb, vvx, vx2);
-    VF::LBM::forwardChimera(            mfacb, mfbcb, mfccb, vvx, vx2);
-    VF::LBM::forwardInverseChimeraWithK(mfaac, mfbac, mfcac, vvx, vx2, c3o1, c1o3);
-    VF::LBM::forwardChimera(            mfabc, mfbbc, mfcbc, vvx, vx2);
-    VF::LBM::forwardInverseChimeraWithK(mfacc, mfbcc, mfccc, vvx, vx2, c3o1, c1o9); 
-    ////////////////////////////////////////////////////////////////////////////////////
-    //! - Setting relaxation rates for non-hydrodynamic cumulants (default values). Variable names and equations    according to
-    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
-    //!  => [NAME IN PAPER]=[NAME IN CODE]=[DEFAULT VALUE].
-    //!  - Trace of second order cumulants \f$ C_{200}+C_{020}+C_{002} \f$ used to adjust bulk  viscosity:\f$\omega_2=OxxPyyPzz=1.0 \f$.
-    //!  - Third order cumulants \f$ C_{120}+C_{102}, C_{210}+C_{012}, C_{201}+C_{021} \f$: \f$ \omega_3=OxyyPxzz   \f$ set according to Eq. (111) with simplifications assuming \f$ \omega_2=1.0\f$.
-    //!  - Third order cumulants \f$ C_{120}-C_{102}, C_{210}-C_{012}, C_{201}-C_{021} \f$: \f$ \omega_4 =  OxyyMxzz \f$ set according to Eq. (112) with simplifications assuming \f$ \omega_2 = 1.0\f$.
-    //!  - Third order cumulants \f$ C_{111} \f$: \f$ \omega_5 = Oxyz \f$ set according to Eq. (113) with   simplifications assuming \f$ \omega_2 = 1.0\f$  (modify for different bulk viscosity).
-    //!  - Fourth order cumulants \f$ C_{220}, C_{202}, C_{022}, C_{211}, C_{121}, C_{112} \f$: for simplification  all set to the same default value \f$ \omega_6=\omega_7=\omega_8=O4=1.0 \f$.
-    //!  - Fifth order cumulants \f$ C_{221}, C_{212}, C_{122}\f$: \f$\omega_9=O5=1.0\f$.
-    //!  - Sixth order cumulant \f$ C_{222}\f$: \f$\omega_{10}=O6=1.0\f$.
-    //!
-    ////////////////////////////////////////////////////////////
-    //2.
-    real OxxPyyPzz = c1o1;
-    ////////////////////////////////////////////////////////////
-    //3.
-    real OxyyPxzz = c8o1  * (-c2o1 + omega) * ( c1o1 + c2o1*omega) / (-c8o1 - c14o1*omega + c7o1*omega*omega);
-    real OxyyMxzz = c8o1  * (-c2o1 + omega) * (-c7o1 + c4o1*omega) / (c56o1 - c50o1*omega + c9o1*omega*omega);
-    real Oxyz     = c24o1 * (-c2o1 + omega) * (-c2o1 - c7o1*omega + c3o1*omega*omega) / (c48o1 + c152o1*omega - c130o1*omega*omega + c29o1*omega*omega*omega);
-    ////////////////////////////////////////////////////////////
-    //4.
-    real O4 = c1o1;
-    ////////////////////////////////////////////////////////////
-    //5.
-    real O5 = c1o1;
-    ////////////////////////////////////////////////////////////
-    //6.
-    real O6 = c1o1; 
-    ////////////////////////////////////////////////////////////////////////////////////
-    //! - A and B: parameters for fourth order convergence of the diffusion term according to Eq. (114) and (115) 
-    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
-    //! with simplifications assuming \f$ \omega_2 = 1.0 \f$ (modify for different bulk viscosity).
-    //!
-    real A = (c4o1 + c2o1*omega - c3o1*omega*omega) / (c2o1 - c7o1*omega + c5o1*omega*omega);
-    real B = (c4o1 + c28o1*omega - c14o1*omega*omega) / (c6o1 - c21o1*omega + c15o1*omega*omega);   
-    ////////////////////////////////////////////////////////////////////////////////////
-    //! - Compute cumulants from central moments according to Eq. (20)-(23) in
-    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
-    //!
-    ////////////////////////////////////////////////////////////
-    //4.
-    real CUMcbb = mfcbb - ((mfcaa + c1o3) * mfabb + c2o1 * mfbba * mfbab) * OOrho;
-    real CUMbcb = mfbcb - ((mfaca + c1o3) * mfbab + c2o1 * mfbba * mfabb) * OOrho;
-    real CUMbbc = mfbbc - ((mfaac + c1o3) * mfbba + c2o1 * mfbab * mfabb) * OOrho;  
-    real CUMcca = mfcca - (((mfcaa * mfaca + c2o1 * mfbba * mfbba) + c1o3 * (mfcaa + mfaca)) * OOrho - c1o9*(drho   * OOrho));
-    real CUMcac = mfcac - (((mfcaa * mfaac + c2o1 * mfbab * mfbab) + c1o3 * (mfcaa + mfaac)) * OOrho - c1o9*(drho   * OOrho));
-    real CUMacc = mfacc - (((mfaac * mfaca + c2o1 * mfabb * mfabb) + c1o3 * (mfaac + mfaca)) * OOrho - c1o9*(drho   * OOrho));
-    ////////////////////////////////////////////////////////////
-    //5.
-    real CUMbcc = mfbcc - ((mfaac * mfbca + mfaca * mfbac + c4o1 * mfabb * mfbbb + c2o1 * (mfbab * mfacb + mfbba *  mfabc)) + c1o3 * (mfbca + mfbac)) * OOrho;
-    real CUMcbc = mfcbc - ((mfaac * mfcba + mfcaa * mfabc + c4o1 * mfbab * mfbbb + c2o1 * (mfabb * mfcab + mfbba *  mfbac)) + c1o3 * (mfcba + mfabc)) * OOrho;
-    real CUMccb = mfccb - ((mfcaa * mfacb + mfaca * mfcab + c4o1 * mfbba * mfbbb + c2o1 * (mfbab * mfbca + mfabb *  mfcba)) + c1o3 * (mfacb + mfcab)) * OOrho;
-    ////////////////////////////////////////////////////////////
-    //6.
-    real CUMccc = mfccc + ((-c4o1 *  mfbbb * mfbbb
-        - (mfcaa * mfacc + mfaca * mfcac + mfaac * mfcca)
-        - c4o1 * (mfabb * mfcbb + mfbab * mfbcb + mfbba * mfbbc)
-        - c2o1 * (mfbca * mfbac + mfcba * mfabc + mfcab * mfacb)) * OOrho
-        + (c4o1 * (mfbab * mfbab * mfaca + mfabb * mfabb * mfcaa + mfbba * mfbba * mfaac)
-        + c2o1 * (mfcaa * mfaca * mfaac)
-        + c16o1 *  mfbba * mfbab * mfabb) * OOrho * OOrho
-        - c1o3 * (mfacc + mfcac + mfcca) * OOrho
-        - c1o9 * (mfcaa + mfaca + mfaac) * OOrho
-        + (c2o1 * (mfbab * mfbab + mfabb * mfabb + mfbba * mfbba)
-        + (mfaac * mfaca + mfaac * mfcaa + mfaca * mfcaa) + c1o3 *(mfaac + mfaca + mfcaa)) * OOrho * OOrho  * c2o3
-        + c1o27*((drho * drho - drho) * OOrho * OOrho));    
-    ////////////////////////////////////////////////////////////////////////////////////
-    //! - Compute linear combinations of second and third order cumulants
-    //!
-    ////////////////////////////////////////////////////////////
-    //2.
-    real mxxPyyPzz = mfcaa + mfaca + mfaac;
-    real mxxMyy = mfcaa - mfaca;
-    real mxxMzz = mfcaa - mfaac;
-    ////////////////////////////////////////////////////////////
-    //3.
-    real mxxyPyzz = mfcba + mfabc;
-    real mxxyMyzz = mfcba - mfabc;  
-    real mxxzPyyz = mfcab + mfacb;
-    real mxxzMyyz = mfcab - mfacb;  
-    real mxyyPxzz = mfbca + mfbac;
-    real mxyyMxzz = mfbca - mfbac;  
-    ////////////////////////////////////////////////////////////////////////////////////
-    //incl. correction
-    ////////////////////////////////////////////////////////////
-    //! - Compute velocity  gradients from second order cumulants according to Eq. (27)-(32)
-    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
-    //! Further explanations of the correction in viscosity in Appendix H of
-    //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015), DOI:10.1016/j.camwa  2015.05.001 ]</b></a>
-    //! Note that the division by rho is omitted here as we need rho times the gradients later.
-    //!
-    real Dxy = -c3o1*omega*mfbba;
-    real Dxz = -c3o1*omega*mfbab;
-    real Dyz = -c3o1*omega*mfabb;
-    real dxux = c1o2 * (-omega) *(mxxMyy + mxxMzz) + c1o2 *  OxxPyyPzz * (mfaaa - mxxPyyPzz);
-    real dyuy = dxux + omega * c3o2 * mxxMyy;
-    real dzuz = dxux + omega * c3o2 * mxxMzz;
-    ////////////////////////////////////////////////////////////
-    //! - Relaxation of second order cumulants with correction terms according to Eq. (33)-(35) in
-    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
-    //!
-    mxxPyyPzz += OxxPyyPzz*(mfaaa - mxxPyyPzz) - c3o1 * (c1o1 - c1o2 * OxxPyyPzz) * (vx2 * dxux + vy2 * dyuy + vz2  * dzuz);
-    mxxMyy    += omega * (-mxxMyy) - c3o1 * (c1o1 + c1o2 * (-omega)) * (vx2 * dxux - vy2 * dyuy);
-    mxxMzz    += omega * (-mxxMzz) - c3o1 * (c1o1 + c1o2 * (-omega)) * (vx2 * dxux - vz2 * dzuz);   
-    ////////////////////////////////////////////////////////////////////////////////////
-    ////no correction
-    //mxxPyyPzz += OxxPyyPzz*(mfaaa - mxxPyyPzz);
-    //mxxMyy += -(-omega) * (-mxxMyy);
-    //mxxMzz += -(-omega) * (-mxxMzz);
-    //////////////////////////////////////////////////////////////////////////
-    mfabb += omega * (-mfabb);
-    mfbab += omega * (-mfbab);
-    mfbba += omega * (-mfbba);  
-    ////////////////////////////////////////////////////////////////////////////////////
-    //relax
-    //////////////////////////////////////////////////////////////////////////
-    // incl. limiter
-    //! - Relaxation of third order cumulants including limiter according to Eq. (116)-(123)
-    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
-    //!
-    wadjust   = Oxyz + (c1o1 - Oxyz)*abs(mfbbb) / (abs(mfbbb) + qudricLimitD);
-    mfbbb    += wadjust * (-mfbbb);
-    wadjust   = OxyyPxzz + (c1o1 - OxyyPxzz)*abs(mxxyPyzz) / (abs(mxxyPyzz) + qudricLimitP);
-    mxxyPyzz += wadjust * (-mxxyPyzz);
-    wadjust   = OxyyMxzz + (c1o1 - OxyyMxzz)*abs(mxxyMyzz) / (abs(mxxyMyzz) + qudricLimitM);
-    mxxyMyzz += wadjust * (-mxxyMyzz);
-    wadjust   = OxyyPxzz + (c1o1 - OxyyPxzz)*abs(mxxzPyyz) / (abs(mxxzPyyz) + qudricLimitP);
-    mxxzPyyz += wadjust * (-mxxzPyyz);
-    wadjust   = OxyyMxzz + (c1o1 - OxyyMxzz)*abs(mxxzMyyz) / (abs(mxxzMyyz) + qudricLimitM);
-    mxxzMyyz += wadjust * (-mxxzMyyz);
-    wadjust   = OxyyPxzz + (c1o1 - OxyyPxzz)*abs(mxyyPxzz) / (abs(mxyyPxzz) + qudricLimitP);
-    mxyyPxzz += wadjust * (-mxyyPxzz);
-    wadjust   = OxyyMxzz + (c1o1 - OxyyMxzz)*abs(mxyyMxzz) / (abs(mxyyMxzz) + qudricLimitM);
-    mxyyMxzz += wadjust * (-mxyyMxzz);
-    //////////////////////////////////////////////////////////////////////////
-    // no limiter
-    //mfbbb += OxyyMxzz * (-mfbbb);
-    //mxxyPyzz += OxyyPxzz * (-mxxyPyzz);
-    //mxxyMyzz += OxyyMxzz * (-mxxyMyzz);
-    //mxxzPyyz += OxyyPxzz * (-mxxzPyyz);
-    //mxxzMyyz += OxyyMxzz * (-mxxzMyyz);
-    //mxyyPxzz += OxyyPxzz * (-mxyyPxzz);
-    //mxyyMxzz += OxyyMxzz * (-mxyyMxzz);   
-    ////////////////////////////////////////////////////////////////////////////////////
-    //! - Compute inverse linear combinations of second and third order cumulants
-    //!
-    mfcaa = c1o3 * (mxxMyy + mxxMzz + mxxPyyPzz);
-    mfaca = c1o3 * (-c2o1*  mxxMyy + mxxMzz + mxxPyyPzz);
-    mfaac = c1o3 * (mxxMyy - c2o1* mxxMzz + mxxPyyPzz); 
-    mfcba = ( mxxyMyzz + mxxyPyzz) * c1o2;
-    mfabc = (-mxxyMyzz + mxxyPyzz) * c1o2;
-    mfcab = ( mxxzMyyz + mxxzPyyz) * c1o2;
-    mfacb = (-mxxzMyyz + mxxzPyyz) * c1o2;
-    mfbca = ( mxyyMxzz + mxyyPxzz) * c1o2;
-    mfbac = (-mxyyMxzz + mxyyPxzz) * c1o2;
-    //////////////////////////////////////////////////////////////////////////  
-    //////////////////////////////////////////////////////////////////////////
-    //4.
-    // no limiter
-    //! - Relax fourth order cumulants to modified equilibrium for fourth order convergence of diffusion according  to Eq. (43)-(48)
-    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
-    //!
-    CUMacc = -O4*(c1o1 / omega - c1o2) * (dyuy + dzuz) * c2o3 * A + (c1o1 - O4) * (CUMacc);
-    CUMcac = -O4*(c1o1 / omega - c1o2) * (dxux + dzuz) * c2o3 * A + (c1o1 - O4) * (CUMcac);
-    CUMcca = -O4*(c1o1 / omega - c1o2) * (dyuy + dxux) * c2o3 * A + (c1o1 - O4) * (CUMcca);
-    CUMbbc = -O4*(c1o1 / omega - c1o2) * Dxy           * c1o3 * B + (c1o1 - O4) * (CUMbbc);
-    CUMbcb = -O4*(c1o1 / omega - c1o2) * Dxz           * c1o3 * B + (c1o1 - O4) * (CUMbcb);
-    CUMcbb = -O4*(c1o1 / omega - c1o2) * Dyz           * c1o3 * B + (c1o1 - O4) * (CUMcbb); 
-    //////////////////////////////////////////////////////////////////////////
-    //5.
-    CUMbcc += O5 * (-CUMbcc);
-    CUMcbc += O5 * (-CUMcbc);
-    CUMccb += O5 * (-CUMccb);   
-    //////////////////////////////////////////////////////////////////////////
-    //6.
-    CUMccc += O6 * (-CUMccc);   
-    ////////////////////////////////////////////////////////////////////////////////////
-    //! - Compute central moments from post collision cumulants according to Eq. (53)-(56) in
-    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
-    //! 
-    //////////////////////////////////////////////////////////////////////////
-    //4.
-    mfcbb = CUMcbb + c1o3*((c3o1*mfcaa + c1o1) * mfabb + c6o1 * mfbba * mfbab) * OOrho;
-    mfbcb = CUMbcb + c1o3*((c3o1*mfaca + c1o1) * mfbab + c6o1 * mfbba * mfabb) * OOrho;
-    mfbbc = CUMbbc + c1o3*((c3o1*mfaac + c1o1) * mfbba + c6o1 * mfbab * mfabb) * OOrho; 
-    mfcca = CUMcca + (((mfcaa * mfaca + c2o1 * mfbba * mfbba)*c9o1 + c3o1 * (mfcaa + mfaca)) * OOrho - (drho *  OOrho))*c1o9;
-    mfcac = CUMcac + (((mfcaa * mfaac + c2o1 * mfbab * mfbab)*c9o1 + c3o1 * (mfcaa + mfaac)) * OOrho - (drho *  OOrho))*c1o9;
-    mfacc = CUMacc + (((mfaac * mfaca + c2o1 * mfabb * mfabb)*c9o1 + c3o1 * (mfaac + mfaca)) * OOrho - (drho *  OOrho))*c1o9; 
-    //////////////////////////////////////////////////////////////////////////
-    //5.
-    mfbcc = CUMbcc + c1o3 *(c3o1*(mfaac * mfbca + mfaca * mfbac + c4o1 * mfabb * mfbbb + c2o1 * (mfbab * mfacb +    mfbba * mfabc)) + (mfbca + mfbac)) * OOrho;
-    mfcbc = CUMcbc + c1o3 *(c3o1*(mfaac * mfcba + mfcaa * mfabc + c4o1 * mfbab * mfbbb + c2o1 * (mfabb * mfcab +    mfbba * mfbac)) + (mfcba + mfabc)) * OOrho;
-    mfccb = CUMccb + c1o3 *(c3o1*(mfcaa * mfacb + mfaca * mfcab + c4o1 * mfbba * mfbbb + c2o1 * (mfbab * mfbca +    mfabb * mfcba)) + (mfacb + mfcab)) * OOrho; 
-    //////////////////////////////////////////////////////////////////////////
-    //6.
-    mfccc =	CUMccc - ((-c4o1 *  mfbbb * mfbbb
-            - (mfcaa * mfacc + mfaca * mfcac + mfaac * mfcca)
-            - c4o1 * (mfabb * mfcbb + mfbab * mfbcb + mfbba * mfbbc)
-            - c2o1 * (mfbca * mfbac + mfcba * mfabc + mfcab * mfacb)) * OOrho
-            + (c4o1 * (mfbab * mfbab * mfaca + mfabb * mfabb * mfcaa + mfbba * mfbba * mfaac)
-                + c2o1 * (mfcaa * mfaca * mfaac)
-                + c16o1 *  mfbba * mfbab * mfabb) * OOrho * OOrho
-            - c1o3 * (mfacc + mfcac + mfcca) * OOrho
-            - c1o9 * (mfcaa + mfaca + mfaac) * OOrho
-            + (c2o1 * (mfbab * mfbab + mfabb * mfabb + mfbba * mfbba)
-                + (mfaac * mfaca + mfaac * mfcaa + mfaca * mfcaa) + c1o3 *(mfaac + mfaca + mfcaa)) * OOrho * OOrho * c2o3
-            + c1o27*((drho * drho - drho) * OOrho * OOrho));    
-    ////////////////////////////////////////////////////////////////////////////////////
-    //! -  Add acceleration (body force) to first order cumulants according to Eq. (85)-(87) in
-    //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015), DOI:10.1016/j.camwa  2015.05.001 ]</b></a>
-    //!
-    mfbaa = -mfbaa;
-    mfaba = -mfaba;
-    mfaab = -mfaab; 
-    ////////////////////////////////////////////////////////////////////////////////////
-    //! - Chimera transform from central moments to well conditioned distributions as defined in Appendix J in
-    //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015), DOI:10.1016/j.camwa  2015.05.001 ]</b></a>
-    //! see also Eq. (88)-(96) in
-    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
-    //!
-    ////////////////////////////////////////////////////////////////////////////////////
-    // X - Dir
-    VF::LBM::backwardInverseChimeraWithK(mfaaa, mfbaa, mfcaa, vvx, vx2, c1o1, c1o1);
-    VF::LBM::backwardChimera(            mfaba, mfbba, mfcba, vvx, vx2);
-    VF::LBM::backwardInverseChimeraWithK(mfaca, mfbca, mfcca, vvx, vx2, c3o1, c1o3);
-    VF::LBM::backwardChimera(            mfaab, mfbab, mfcab, vvx, vx2);
-    VF::LBM::backwardChimera(            mfabb, mfbbb, mfcbb, vvx, vx2);
-    VF::LBM::backwardChimera(            mfacb, mfbcb, mfccb, vvx, vx2);
-    VF::LBM::backwardInverseChimeraWithK(mfaac, mfbac, mfcac, vvx, vx2, c3o1, c1o3);
-    VF::LBM::backwardChimera(            mfabc, mfbbc, mfcbc, vvx, vx2);
-    VF::LBM::backwardInverseChimeraWithK(mfacc, mfbcc, mfccc, vvx, vx2, c9o1, c1o9);    
-    ////////////////////////////////////////////////////////////////////////////////////
-    // Y - Dir
-    VF::LBM::backwardInverseChimeraWithK(mfaaa, mfaba, mfaca, vvy, vy2,  c6o1,  c1o6);
-    VF::LBM::backwardChimera(            mfaab, mfabb, mfacb, vvy, vy2);
-    VF::LBM::backwardInverseChimeraWithK(mfaac, mfabc, mfacc, vvy, vy2, c18o1, c1o18);
-    VF::LBM::backwardInverseChimeraWithK(mfbaa, mfbba, mfbca, vvy, vy2,  c3o2,  c2o3);
-    VF::LBM::backwardChimera(            mfbab, mfbbb, mfbcb, vvy, vy2);
-    VF::LBM::backwardInverseChimeraWithK(mfbac, mfbbc, mfbcc, vvy, vy2,  c9o2,  c2o9);
-    VF::LBM::backwardInverseChimeraWithK(mfcaa, mfcba, mfcca, vvy, vy2,  c6o1,  c1o6);
-    VF::LBM::backwardChimera(            mfcab, mfcbb, mfccb, vvy, vy2);
-    VF::LBM::backwardInverseChimeraWithK(mfcac, mfcbc, mfccc, vvy, vy2, c18o1, c1o18);  
-    ////////////////////////////////////////////////////////////////////////////////////
-    // Z - Dir
-    VF::LBM::backwardInverseChimeraWithK(mfaaa, mfaab, mfaac, vvz, vz2, c36o1, c1o36);
-    VF::LBM::backwardInverseChimeraWithK(mfaba, mfabb, mfabc, vvz, vz2,  c9o1,  c1o9);
-    VF::LBM::backwardInverseChimeraWithK(mfaca, mfacb, mfacc, vvz, vz2, c36o1, c1o36);
-    VF::LBM::backwardInverseChimeraWithK(mfbaa, mfbab, mfbac, vvz, vz2,  c9o1,  c1o9);
-    VF::LBM::backwardInverseChimeraWithK(mfbba, mfbbb, mfbbc, vvz, vz2,  c9o4,  c4o9);
-    VF::LBM::backwardInverseChimeraWithK(mfbca, mfbcb, mfbcc, vvz, vz2,  c9o1,  c1o9);
-    VF::LBM::backwardInverseChimeraWithK(mfcaa, mfcab, mfcac, vvz, vz2, c36o1, c1o36);
-    VF::LBM::backwardInverseChimeraWithK(mfcba, mfcbb, mfcbc, vvz, vz2,  c9o1,  c1o9);
-    VF::LBM::backwardInverseChimeraWithK(mfcca, mfccb, mfccc, vvz, vz2, c36o1, c1o36);
-
-        ////////////////////////////////////////////////////////////////////////////////////
-        //! - Write distributions: style of reading and writing the distributions from/to 
-        //! stored arrays dependent on timestep is based on the esoteric twist algorithm
-        //! <a href="https://doi.org/10.3390/computation5020019"><b>[ M. Geier et al. (2017), DOI:10.3390/computation5020019 ]</b></a>
-        //!
-        // (dist.f[dirE   ])[k   ] = mfabb;
-        // (dist.f[dirW   ])[kw  ] = mfcbb;
-        // (dist.f[dirN   ])[k   ] = mfbab;
-        // (dist.f[dirS   ])[ks  ] = mfbcb;
-        // (dist.f[dirT   ])[k   ] = mfbba;
-        // (dist.f[dirB   ])[kb  ] = mfbbc;
-        // (dist.f[dirNE  ])[k   ] = mfaab;
-        // (dist.f[dirSW  ])[ksw ] = mfccb;
-        // (dist.f[dirSE  ])[ks  ] = mfacb;
-        // (dist.f[dirNW  ])[kw  ] = mfcab;
-        // (dist.f[dirTE  ])[k   ] = mfaba;
-        // (dist.f[dirBW  ])[kbw ] = mfcbc;
-        // (dist.f[dirBE  ])[kb  ] = mfabc;
-        // (dist.f[dirTW  ])[kw  ] = mfcba;
-        // (dist.f[dirTN  ])[k   ] = mfbaa;
-        // (dist.f[dirBS  ])[kbs ] = mfbcc;
-        // (dist.f[dirBN  ])[kb  ] = mfbac;
-        // (dist.f[dirTS  ])[ks  ] = mfbca;
-        // (dist.f[dirREST])[k   ] = mfbbb;
-        // (dist.f[dirTNE ])[k   ] = mfaaa;
-        // (dist.f[dirTSE ])[ks  ] = mfaca;
-        // (dist.f[dirBNE ])[kb  ] = mfaac;
-        // (dist.f[dirBSE ])[kbs ] = mfacc;
-        // (dist.f[dirTNW ])[kw  ] = mfcaa;
-        // (dist.f[dirTSW ])[ksw ] = mfcca;
-        // (dist.f[dirBNW ])[kbw ] = mfcac;
-        // (dist.f[dirBSW ])[kbsw] = mfccc;
-
-        VF::LBM::Distribution27 distribution_out;
-    
-        distribution_out.f[VF::LBM::DIR::MZZ] = mfcbb;
-        distribution_out.f[VF::LBM::DIR::PZZ] = mfabb;
-        distribution_out.f[VF::LBM::DIR::ZMZ] = mfbcb;
-        distribution_out.f[VF::LBM::DIR::ZPZ] = mfbab;
-        distribution_out.f[VF::LBM::DIR::ZZM] = mfbbc;
-        distribution_out.f[VF::LBM::DIR::ZZP] = mfbba;
-        distribution_out.f[VF::LBM::DIR::MMZ] = mfccb;
-        distribution_out.f[VF::LBM::DIR::PPZ] = mfaab;
-        distribution_out.f[VF::LBM::DIR::MPZ] = mfcab;
-        distribution_out.f[VF::LBM::DIR::PMZ] = mfacb;
-        distribution_out.f[VF::LBM::DIR::MZM] = mfcbc;
-        distribution_out.f[VF::LBM::DIR::PZP] = mfaba;
-        distribution_out.f[VF::LBM::DIR::MZP] = mfcba;
-        distribution_out.f[VF::LBM::DIR::PZM] = mfabc;
-        distribution_out.f[VF::LBM::DIR::ZMM] = mfbcc;
-        distribution_out.f[VF::LBM::DIR::ZPP] = mfbaa;
-        distribution_out.f[VF::LBM::DIR::ZMP] = mfbca;
-        distribution_out.f[VF::LBM::DIR::ZPM] = mfbac;
-        distribution_out.f[VF::LBM::DIR::MMM] = mfccc;
-        distribution_out.f[VF::LBM::DIR::PMM] = mfacc;
-        distribution_out.f[VF::LBM::DIR::MPM] = mfcac;
-        distribution_out.f[VF::LBM::DIR::PPM] = mfaac;
-        distribution_out.f[VF::LBM::DIR::MMP] = mfcca;
-        distribution_out.f[VF::LBM::DIR::PMP] = mfaca;
-        distribution_out.f[VF::LBM::DIR::MPP] = mfcaa;
-        distribution_out.f[VF::LBM::DIR::PPP] = mfaaa;
-        distribution_out.f[VF::LBM::DIR::ZZZ] = mfbbb;
-
-
-        (dist.f[dirE   ])[k]    = distribution_out.f[VF::LBM::DIR::PZZ];
-        (dist.f[dirW   ])[kw]   = distribution_out.f[VF::LBM::DIR::MZZ];
-        (dist.f[dirN   ])[k]    = distribution_out.f[VF::LBM::DIR::ZPZ];
-        (dist.f[dirS   ])[ks]   = distribution_out.f[VF::LBM::DIR::ZMZ];
-        (dist.f[dirT   ])[k]    = distribution_out.f[VF::LBM::DIR::ZZP];
-        (dist.f[dirB   ])[kb]   = distribution_out.f[VF::LBM::DIR::ZZM];
-        (dist.f[dirNE  ])[k]    = distribution_out.f[VF::LBM::DIR::PPZ];
-        (dist.f[dirSW  ])[ksw]  = distribution_out.f[VF::LBM::DIR::MMZ];
-        (dist.f[dirSE  ])[ks]   = distribution_out.f[VF::LBM::DIR::PMZ];
-        (dist.f[dirNW  ])[kw]   = distribution_out.f[VF::LBM::DIR::MPZ];
-        (dist.f[dirTE  ])[k]    = distribution_out.f[VF::LBM::DIR::PZP];
-        (dist.f[dirBW  ])[kbw]  = distribution_out.f[VF::LBM::DIR::MZM];
-        (dist.f[dirBE  ])[kb]   = distribution_out.f[VF::LBM::DIR::PZM];
-        (dist.f[dirTW  ])[kw]   = distribution_out.f[VF::LBM::DIR::MZP];
-        (dist.f[dirTN  ])[k]    = distribution_out.f[VF::LBM::DIR::ZPP];
-        (dist.f[dirBS  ])[kbs]  = distribution_out.f[VF::LBM::DIR::ZMM];
-        (dist.f[dirBN  ])[kb]   = distribution_out.f[VF::LBM::DIR::ZPM];
-        (dist.f[dirTS  ])[ks]   = distribution_out.f[VF::LBM::DIR::ZMP];
-        (dist.f[dirTNE ])[k]    = distribution_out.f[VF::LBM::DIR::PPP];
-        (dist.f[dirTNW ])[kw]   = distribution_out.f[VF::LBM::DIR::MPP];
-        (dist.f[dirTSE ])[ks]   = distribution_out.f[VF::LBM::DIR::PMP];
-        (dist.f[dirTSW ])[ksw]  = distribution_out.f[VF::LBM::DIR::MMP];
-        (dist.f[dirBNE ])[kb]   = distribution_out.f[VF::LBM::DIR::PPM];
-        (dist.f[dirBNW ])[kbw]  = distribution_out.f[VF::LBM::DIR::MPM];
-        (dist.f[dirBSE ])[kbs]  = distribution_out.f[VF::LBM::DIR::PMM];
-        (dist.f[dirBSW ])[kbsw] = distribution_out.f[VF::LBM::DIR::MMM];
-        (dist.f[dirREST])[k]    = distribution_out.f[VF::LBM::DIR::ZZZ];
-
-    }
-}
-////////////////////////////////////////////////////////////////////////////////
-
-extern "C" __global__ void Cumulant_K17_LBM_Device_Kernel_old_origin(
-    real omega,
-    uint* typeOfGridNode,
-    uint* neighborX,
-    uint* neighborY,
-    uint* neighborZ,
-    real* distributions,
-    int size_Mat,
-    int level,
-    real* forces,
-    bool isEvenTimestep)
-{
-    //////////////////////////////////////////////////////////////////////////
-    //! Cumulant K17 Kernel is based on \ref
-    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
-    //! and \ref
-    //! <a href="https://doi.org/10.1016/j.jcp.2017.07.004"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.07.004 ]</b></a>
-    //!
-    //! The cumulant kernel is executed in the following steps
-    //!
-    ////////////////////////////////////////////////////////////////////////////////
-    //! - Get node index coordinates from thredIdx, blockIdx, blockDim and gridDim.
-    //!
-    const unsigned  x = threadIdx.x; 
-    const unsigned  y = blockIdx.x;  
-    const unsigned  z = blockIdx.y;  
-
-    const unsigned nx = blockDim.x;
-    const unsigned ny = gridDim.x;
-
-    const unsigned k = nx*(ny*z + y) + x;
-
-    //////////////////////////////////////////////////////////////////////////
-    // run for all indices in size_Mat and fluid nodes
-    if ((k < size_Mat) && (typeOfGridNode[k] == GEO_FLUID))
-    {
-        //////////////////////////////////////////////////////////////////////////
-        //! - Read distributions: style of reading and writing the distributions from/to stored arrays dependent on timestep is based on the esoteric twist algorithm \ref
-        //! <a href="https://doi.org/10.3390/computation5020019"><b>[ M. Geier et al. (2017), DOI:10.3390/computation5020019 ]</b></a>
-        //!
-        Distributions27 dist;
-        if (isEvenTimestep)
-        {
-            dist.f[dirE   ] = &distributions[dirE   *size_Mat];
-            dist.f[dirW   ] = &distributions[dirW   *size_Mat];
-            dist.f[dirN   ] = &distributions[dirN   *size_Mat];
-            dist.f[dirS   ] = &distributions[dirS   *size_Mat];
-            dist.f[dirT   ] = &distributions[dirT   *size_Mat];
-            dist.f[dirB   ] = &distributions[dirB   *size_Mat];
-            dist.f[dirNE  ] = &distributions[dirNE  *size_Mat];
-            dist.f[dirSW  ] = &distributions[dirSW  *size_Mat];
-            dist.f[dirSE  ] = &distributions[dirSE  *size_Mat];
-            dist.f[dirNW  ] = &distributions[dirNW  *size_Mat];
-            dist.f[dirTE  ] = &distributions[dirTE  *size_Mat];
-            dist.f[dirBW  ] = &distributions[dirBW  *size_Mat];
-            dist.f[dirBE  ] = &distributions[dirBE  *size_Mat];
-            dist.f[dirTW  ] = &distributions[dirTW  *size_Mat];
-            dist.f[dirTN  ] = &distributions[dirTN  *size_Mat];
-            dist.f[dirBS  ] = &distributions[dirBS  *size_Mat];
-            dist.f[dirBN  ] = &distributions[dirBN  *size_Mat];
-            dist.f[dirTS  ] = &distributions[dirTS  *size_Mat];
-            dist.f[dirREST] = &distributions[dirREST*size_Mat];
-            dist.f[dirTNE ] = &distributions[dirTNE *size_Mat];
-            dist.f[dirTSW ] = &distributions[dirTSW *size_Mat];
-            dist.f[dirTSE ] = &distributions[dirTSE *size_Mat];
-            dist.f[dirTNW ] = &distributions[dirTNW *size_Mat];
-            dist.f[dirBNE ] = &distributions[dirBNE *size_Mat];
-            dist.f[dirBSW ] = &distributions[dirBSW *size_Mat];
-            dist.f[dirBSE ] = &distributions[dirBSE *size_Mat];
-            dist.f[dirBNW ] = &distributions[dirBNW *size_Mat];
-        }
-        else
-        {
-            dist.f[dirW   ] = &distributions[dirE   *size_Mat];
-            dist.f[dirE   ] = &distributions[dirW   *size_Mat];
-            dist.f[dirS   ] = &distributions[dirN   *size_Mat];
-            dist.f[dirN   ] = &distributions[dirS   *size_Mat];
-            dist.f[dirB   ] = &distributions[dirT   *size_Mat];
-            dist.f[dirT   ] = &distributions[dirB   *size_Mat];
-            dist.f[dirSW  ] = &distributions[dirNE  *size_Mat];
-            dist.f[dirNE  ] = &distributions[dirSW  *size_Mat];
-            dist.f[dirNW  ] = &distributions[dirSE  *size_Mat];
-            dist.f[dirSE  ] = &distributions[dirNW  *size_Mat];
-            dist.f[dirBW  ] = &distributions[dirTE  *size_Mat];
-            dist.f[dirTE  ] = &distributions[dirBW  *size_Mat];
-            dist.f[dirTW  ] = &distributions[dirBE  *size_Mat];
-            dist.f[dirBE  ] = &distributions[dirTW  *size_Mat];
-            dist.f[dirBS  ] = &distributions[dirTN  *size_Mat];
-            dist.f[dirTN  ] = &distributions[dirBS  *size_Mat];
-            dist.f[dirTS  ] = &distributions[dirBN  *size_Mat];
-            dist.f[dirBN  ] = &distributions[dirTS  *size_Mat];
-            dist.f[dirREST] = &distributions[dirREST*size_Mat];
-            dist.f[dirBSW ] = &distributions[dirTNE *size_Mat];
-            dist.f[dirBNE ] = &distributions[dirTSW *size_Mat];
-            dist.f[dirBNW ] = &distributions[dirTSE *size_Mat];
-            dist.f[dirBSE ] = &distributions[dirTNW *size_Mat];
-            dist.f[dirTSW ] = &distributions[dirBNE *size_Mat];
-            dist.f[dirTNE ] = &distributions[dirBSW *size_Mat];
-            dist.f[dirTNW ] = &distributions[dirBSE *size_Mat];
-            dist.f[dirTSE ] = &distributions[dirBNW *size_Mat];
-        }
-        ////////////////////////////////////////////////////////////////////////////////
-        //! - Set neighbor indices (necessary for indirect addressing) 
-        uint kw   = neighborX[k];
-        uint ks   = neighborY[k];
-        uint kb   = neighborZ[k];
-        uint ksw  = neighborY[kw];
-        uint kbw  = neighborZ[kw];
-        uint kbs  = neighborZ[ks];
-        uint kbsw = neighborZ[ksw];
-        ////////////////////////////////////////////////////////////////////////////////////
-        //! - Set local distributions
-        //!
-        real mfcbb = (dist.f[dirE   ])[k];
-        real mfabb = (dist.f[dirW   ])[kw];
-        real mfbcb = (dist.f[dirN   ])[k];
-        real mfbab = (dist.f[dirS   ])[ks];
-        real mfbbc = (dist.f[dirT   ])[k];
-        real mfbba = (dist.f[dirB   ])[kb];
-        real mfccb = (dist.f[dirNE  ])[k];
-        real mfaab = (dist.f[dirSW  ])[ksw];
-        real mfcab = (dist.f[dirSE  ])[ks];
-        real mfacb = (dist.f[dirNW  ])[kw];
-        real mfcbc = (dist.f[dirTE  ])[k];
-        real mfaba = (dist.f[dirBW  ])[kbw];
-        real mfcba = (dist.f[dirBE  ])[kb];
-        real mfabc = (dist.f[dirTW  ])[kw];
-        real mfbcc = (dist.f[dirTN  ])[k];
-        real mfbaa = (dist.f[dirBS  ])[kbs];
-        real mfbca = (dist.f[dirBN  ])[kb];
-        real mfbac = (dist.f[dirTS  ])[ks];
-        real mfbbb = (dist.f[dirREST])[k];
-        real mfccc = (dist.f[dirTNE ])[k];
-        real mfaac = (dist.f[dirTSW ])[ksw];
-        real mfcac = (dist.f[dirTSE ])[ks];
-        real mfacc = (dist.f[dirTNW ])[kw];
-        real mfcca = (dist.f[dirBNE ])[kb];
-        real mfaaa = (dist.f[dirBSW ])[kbsw];
-        real mfcaa = (dist.f[dirBSE ])[kbs];
-        real mfaca = (dist.f[dirBNW ])[kbw];
-
-    ////////////////////////////////////////////////////////////////////////////////////
-    //! - Calculate density and velocity using pyramid summation for low round-off errors as in Eq. (J1)-(J3) \ref
-    //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015), DOI:10.1016/j.camwa  2015.05.001 ]</b></a>
-    //!
-    real drho =
-        ((((mfccc + mfaaa) + (mfaca + mfcac)) + ((mfacc + mfcaa) + (mfaac + mfcca))) +
-        (((mfbac + mfbca) + (mfbaa + mfbcc)) + ((mfabc + mfcba) + (mfaba + mfcbc)) + ((mfacb + mfcab) + (mfaab + mfccb))) +
-        ((mfabb + mfcbb) + (mfbab + mfbcb) + (mfbba + mfbbc))) + mfbbb; 
-    real rho = c1o1 + drho;
-    real OOrho = c1o1 / rho;    
-    real vvx = 
-        ((((mfccc - mfaaa) + (mfcac - mfaca)) + ((mfcaa - mfacc) + (mfcca - mfaac))) +
-        (((mfcba - mfabc) + (mfcbc - mfaba)) + ((mfcab - mfacb) + (mfccb - mfaab))) +
-        (mfcbb - mfabb)) * OOrho;
-    real vvy = 
-        ((((mfccc - mfaaa) + (mfaca - mfcac)) + ((mfacc - mfcaa) + (mfcca - mfaac))) +
-        (((mfbca - mfbac) + (mfbcc - mfbaa)) + ((mfacb - mfcab) + (mfccb - mfaab))) +
-        (mfbcb - mfbab)) * OOrho;
-    real vvz = 
-        ((((mfccc - mfaaa) + (mfcac - mfaca)) + ((mfacc - mfcaa) + (mfaac - mfcca))) +
-        (((mfbac - mfbca) + (mfbcc - mfbaa)) + ((mfabc - mfcba) + (mfcbc - mfaba))) +
-        (mfbbc - mfbba)) * OOrho;
-    ////////////////////////////////////////////////////////////////////////////////////
-    //! - Add half of the acceleration (body force) to the velocity as in Eq. (42) \ref
-    //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015), DOI:10.1016/j.camwa  2015.05.001 ]</b></a>
-    //!
-    vvx += forces[0] * c1o2;
-    vvy += forces[1] * c1o2;
-    vvz += forces[2] * c1o2;
-    ////////////////////////////////////////////////////////////////////////////////////
-    // calculate the square of velocities for this lattice node
-    real vx2 = vvx*vvx;
-    real vy2 = vvy*vvy;
-    real vz2 = vvz*vvz;
-    ////////////////////////////////////////////////////////////////////////////////////
-    //! - Set relaxation limiters for third order cumulants to default value \f$ \lambda=0.001 \f$ according to section 6 in \ref
-    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
-    //!
-    real wadjust;
-    real qudricLimitP = c1o100;
-    real qudricLimitM = c1o100;
-    real qudricLimitD = c1o100;
-    ////////////////////////////////////////////////////////////////////////////////////
-    //! - Chimera transform from well conditioned distributions to central moments as defined in Appendix J in \ref
-    //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015), DOI:10.1016/j.camwa  2015.05.001 ]</b></a>
-    //! see also Eq. (6)-(14) in \ref
-    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
-    //!
-    ////////////////////////////////////////////////////////////////////////////////////
-    // Z - Dir
-    VF::LBM::forwardInverseChimeraWithK(mfaaa, mfaab, mfaac, vvz, vz2, c36o1, c1o36);
-    VF::LBM::forwardInverseChimeraWithK(mfaba, mfabb, mfabc, vvz, vz2,  c9o1,  c1o9);
-    VF::LBM::forwardInverseChimeraWithK(mfaca, mfacb, mfacc, vvz, vz2, c36o1, c1o36);
-    VF::LBM::forwardInverseChimeraWithK(mfbaa, mfbab, mfbac, vvz, vz2,  c9o1,  c1o9);
-    VF::LBM::forwardInverseChimeraWithK(mfbba, mfbbb, mfbbc, vvz, vz2,  c9o4,  c4o9);
-    VF::LBM::forwardInverseChimeraWithK(mfbca, mfbcb, mfbcc, vvz, vz2,  c9o1,  c1o9);
-    VF::LBM::forwardInverseChimeraWithK(mfcaa, mfcab, mfcac, vvz, vz2, c36o1, c1o36);
-    VF::LBM::forwardInverseChimeraWithK(mfcba, mfcbb, mfcbc, vvz, vz2,  c9o1,  c1o9);
-    VF::LBM::forwardInverseChimeraWithK(mfcca, mfccb, mfccc, vvz, vz2, c36o1, c1o36);   
-    ////////////////////////////////////////////////////////////////////////////////////
-    // Y - Dir
-    VF::LBM::forwardInverseChimeraWithK(mfaaa, mfaba, mfaca, vvy, vy2,  c6o1,  c1o6);
-    VF::LBM::forwardChimera(            mfaab, mfabb, mfacb, vvy, vy2);
-    VF::LBM::forwardInverseChimeraWithK(mfaac, mfabc, mfacc, vvy, vy2, c18o1, c1o18);
-    VF::LBM::forwardInverseChimeraWithK(mfbaa, mfbba, mfbca, vvy, vy2,  c3o2,  c2o3);
-    VF::LBM::forwardChimera(            mfbab, mfbbb, mfbcb, vvy, vy2);
-    VF::LBM::forwardInverseChimeraWithK(mfbac, mfbbc, mfbcc, vvy, vy2,  c9o2,  c2o9);
-    VF::LBM::forwardInverseChimeraWithK(mfcaa, mfcba, mfcca, vvy, vy2,  c6o1,  c1o6);
-    VF::LBM::forwardChimera(            mfcab, mfcbb, mfccb, vvy, vy2);
-    VF::LBM::forwardInverseChimeraWithK(mfcac, mfcbc, mfccc, vvy, vy2, c18o1, c1o18);   
-    ////////////////////////////////////////////////////////////////////////////////////
-    // X - Dir
-    VF::LBM::forwardInverseChimeraWithK(mfaaa, mfbaa, mfcaa, vvx, vx2, c1o1, c1o1);
-    VF::LBM::forwardChimera(            mfaba, mfbba, mfcba, vvx, vx2);
-    VF::LBM::forwardInverseChimeraWithK(mfaca, mfbca, mfcca, vvx, vx2, c3o1, c1o3);
-    VF::LBM::forwardChimera(            mfaab, mfbab, mfcab, vvx, vx2);
-    VF::LBM::forwardChimera(            mfabb, mfbbb, mfcbb, vvx, vx2);
-    VF::LBM::forwardChimera(            mfacb, mfbcb, mfccb, vvx, vx2);
-    VF::LBM::forwardInverseChimeraWithK(mfaac, mfbac, mfcac, vvx, vx2, c3o1, c1o3);
-    VF::LBM::forwardChimera(            mfabc, mfbbc, mfcbc, vvx, vx2);
-    VF::LBM::forwardInverseChimeraWithK(mfacc, mfbcc, mfccc, vvx, vx2, c3o1, c1o9); 
-    ////////////////////////////////////////////////////////////////////////////////////
-    //! - Setting relaxation rates for non-hydrodynamic cumulants (default values). Variable names and equations    according to
-    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
-    //!  => [NAME IN PAPER]=[NAME IN CODE]=[DEFAULT VALUE].
-    //!  - Trace of second order cumulants \f$ C_{200}+C_{020}+C_{002} \f$ used to adjust bulk  viscosity:\f$\omega_2=OxxPyyPzz=1.0 \f$.
-    //!  - Third order cumulants \f$ C_{120}+C_{102}, C_{210}+C_{012}, C_{201}+C_{021} \f$: \f$ \omega_3=OxyyPxzz   \f$ set according to Eq. (111) with simplifications assuming \f$ \omega_2=1.0\f$.
-    //!  - Third order cumulants \f$ C_{120}-C_{102}, C_{210}-C_{012}, C_{201}-C_{021} \f$: \f$ \omega_4 =  OxyyMxzz \f$ set according to Eq. (112) with simplifications assuming \f$ \omega_2 = 1.0\f$.
-    //!  - Third order cumulants \f$ C_{111} \f$: \f$ \omega_5 = Oxyz \f$ set according to Eq. (113) with   simplifications assuming \f$ \omega_2 = 1.0\f$  (modify for different bulk viscosity).
-    //!  - Fourth order cumulants \f$ C_{220}, C_{202}, C_{022}, C_{211}, C_{121}, C_{112} \f$: for simplification  all set to the same default value \f$ \omega_6=\omega_7=\omega_8=O4=1.0 \f$.
-    //!  - Fifth order cumulants \f$ C_{221}, C_{212}, C_{122}\f$: \f$\omega_9=O5=1.0\f$.
-    //!  - Sixth order cumulant \f$ C_{222}\f$: \f$\omega_{10}=O6=1.0\f$.
-    //!
-    ////////////////////////////////////////////////////////////
-    //2.
-    real OxxPyyPzz = c1o1;
-    ////////////////////////////////////////////////////////////
-    //3.
-    real OxyyPxzz = c8o1  * (-c2o1 + omega) * ( c1o1 + c2o1*omega) / (-c8o1 - c14o1*omega + c7o1*omega*omega);
-    real OxyyMxzz = c8o1  * (-c2o1 + omega) * (-c7o1 + c4o1*omega) / (c56o1 - c50o1*omega + c9o1*omega*omega);
-    real Oxyz     = c24o1 * (-c2o1 + omega) * (-c2o1 - c7o1*omega + c3o1*omega*omega) / (c48o1 + c152o1*omega - c130o1*omega*omega + c29o1*omega*omega*omega);
-    ////////////////////////////////////////////////////////////
-    //4.
-    real O4 = c1o1;
-    ////////////////////////////////////////////////////////////
-    //5.
-    real O5 = c1o1;
-    ////////////////////////////////////////////////////////////
-    //6.
-    real O6 = c1o1; 
-    ////////////////////////////////////////////////////////////////////////////////////
-    //! - A and B: parameters for fourth order convergence of the diffusion term according to Eq. (114) and (115) 
-    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
-    //! with simplifications assuming \f$ \omega_2 = 1.0 \f$ (modify for different bulk viscosity).
-    //!
-    real A = (c4o1 + c2o1*omega - c3o1*omega*omega) / (c2o1 - c7o1*omega + c5o1*omega*omega);
-    real B = (c4o1 + c28o1*omega - c14o1*omega*omega) / (c6o1 - c21o1*omega + c15o1*omega*omega);   
-    ////////////////////////////////////////////////////////////////////////////////////
-    //! - Compute cumulants from central moments according to Eq. (20)-(23) in
-    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
-    //!
-    ////////////////////////////////////////////////////////////
-    //4.
-    real CUMcbb = mfcbb - ((mfcaa + c1o3) * mfabb + c2o1 * mfbba * mfbab) * OOrho;
-    real CUMbcb = mfbcb - ((mfaca + c1o3) * mfbab + c2o1 * mfbba * mfabb) * OOrho;
-    real CUMbbc = mfbbc - ((mfaac + c1o3) * mfbba + c2o1 * mfbab * mfabb) * OOrho;  
-    real CUMcca = mfcca - (((mfcaa * mfaca + c2o1 * mfbba * mfbba) + c1o3 * (mfcaa + mfaca)) * OOrho - c1o9*(drho   * OOrho));
-    real CUMcac = mfcac - (((mfcaa * mfaac + c2o1 * mfbab * mfbab) + c1o3 * (mfcaa + mfaac)) * OOrho - c1o9*(drho   * OOrho));
-    real CUMacc = mfacc - (((mfaac * mfaca + c2o1 * mfabb * mfabb) + c1o3 * (mfaac + mfaca)) * OOrho - c1o9*(drho   * OOrho));
-    ////////////////////////////////////////////////////////////
-    //5.
-    real CUMbcc = mfbcc - ((mfaac * mfbca + mfaca * mfbac + c4o1 * mfabb * mfbbb + c2o1 * (mfbab * mfacb + mfbba *  mfabc)) + c1o3 * (mfbca + mfbac)) * OOrho;
-    real CUMcbc = mfcbc - ((mfaac * mfcba + mfcaa * mfabc + c4o1 * mfbab * mfbbb + c2o1 * (mfabb * mfcab + mfbba *  mfbac)) + c1o3 * (mfcba + mfabc)) * OOrho;
-    real CUMccb = mfccb - ((mfcaa * mfacb + mfaca * mfcab + c4o1 * mfbba * mfbbb + c2o1 * (mfbab * mfbca + mfabb *  mfcba)) + c1o3 * (mfacb + mfcab)) * OOrho;
-    ////////////////////////////////////////////////////////////
-    //6.
-    real CUMccc = mfccc + ((-c4o1 *  mfbbb * mfbbb
-        - (mfcaa * mfacc + mfaca * mfcac + mfaac * mfcca)
-        - c4o1 * (mfabb * mfcbb + mfbab * mfbcb + mfbba * mfbbc)
-        - c2o1 * (mfbca * mfbac + mfcba * mfabc + mfcab * mfacb)) * OOrho
-        + (c4o1 * (mfbab * mfbab * mfaca + mfabb * mfabb * mfcaa + mfbba * mfbba * mfaac)
-        + c2o1 * (mfcaa * mfaca * mfaac)
-        + c16o1 *  mfbba * mfbab * mfabb) * OOrho * OOrho
-        - c1o3 * (mfacc + mfcac + mfcca) * OOrho
-        - c1o9 * (mfcaa + mfaca + mfaac) * OOrho
-        + (c2o1 * (mfbab * mfbab + mfabb * mfabb + mfbba * mfbba)
-        + (mfaac * mfaca + mfaac * mfcaa + mfaca * mfcaa) + c1o3 *(mfaac + mfaca + mfcaa)) * OOrho * OOrho  * c2o3
-        + c1o27*((drho * drho - drho) * OOrho * OOrho));    
-    ////////////////////////////////////////////////////////////////////////////////////
-    //! - Compute linear combinations of second and third order cumulants
-    //!
-    ////////////////////////////////////////////////////////////
-    //2.
-    real mxxPyyPzz = mfcaa + mfaca + mfaac;
-    real mxxMyy = mfcaa - mfaca;
-    real mxxMzz = mfcaa - mfaac;
-    ////////////////////////////////////////////////////////////
-    //3.
-    real mxxyPyzz = mfcba + mfabc;
-    real mxxyMyzz = mfcba - mfabc;  
-    real mxxzPyyz = mfcab + mfacb;
-    real mxxzMyyz = mfcab - mfacb;  
-    real mxyyPxzz = mfbca + mfbac;
-    real mxyyMxzz = mfbca - mfbac;  
-    ////////////////////////////////////////////////////////////////////////////////////
-    //incl. correction
-    ////////////////////////////////////////////////////////////
-    //! - Compute velocity  gradients from second order cumulants according to Eq. (27)-(32)
-    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
-    //! Further explanations of the correction in viscosity in Appendix H of
-    //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015), DOI:10.1016/j.camwa  2015.05.001 ]</b></a>
-    //! Note that the division by rho is omitted here as we need rho times the gradients later.
-    //!
-    real Dxy = -c3o1*omega*mfbba;
-    real Dxz = -c3o1*omega*mfbab;
-    real Dyz = -c3o1*omega*mfabb;
-    real dxux = c1o2 * (-omega) *(mxxMyy + mxxMzz) + c1o2 *  OxxPyyPzz * (mfaaa - mxxPyyPzz);
-    real dyuy = dxux + omega * c3o2 * mxxMyy;
-    real dzuz = dxux + omega * c3o2 * mxxMzz;
-    ////////////////////////////////////////////////////////////
-    //! - Relaxation of second order cumulants with correction terms according to Eq. (33)-(35) in
-    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
-    //!
-    mxxPyyPzz += OxxPyyPzz*(mfaaa - mxxPyyPzz) - c3o1 * (c1o1 - c1o2 * OxxPyyPzz) * (vx2 * dxux + vy2 * dyuy + vz2  * dzuz);
-    mxxMyy    += omega * (-mxxMyy) - c3o1 * (c1o1 + c1o2 * (-omega)) * (vx2 * dxux - vy2 * dyuy);
-    mxxMzz    += omega * (-mxxMzz) - c3o1 * (c1o1 + c1o2 * (-omega)) * (vx2 * dxux - vz2 * dzuz);   
-    ////////////////////////////////////////////////////////////////////////////////////
-    ////no correction
-    //mxxPyyPzz += OxxPyyPzz*(mfaaa - mxxPyyPzz);
-    //mxxMyy += -(-omega) * (-mxxMyy);
-    //mxxMzz += -(-omega) * (-mxxMzz);
-    //////////////////////////////////////////////////////////////////////////
-    mfabb += omega * (-mfabb);
-    mfbab += omega * (-mfbab);
-    mfbba += omega * (-mfbba);  
-    ////////////////////////////////////////////////////////////////////////////////////
-    //relax
-    //////////////////////////////////////////////////////////////////////////
-    // incl. limiter
-    //! - Relaxation of third order cumulants including limiter according to Eq. (116)-(123)
-    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
-    //!
-    wadjust   = Oxyz + (c1o1 - Oxyz)*abs(mfbbb) / (abs(mfbbb) + qudricLimitD);
-    mfbbb    += wadjust * (-mfbbb);
-    wadjust   = OxyyPxzz + (c1o1 - OxyyPxzz)*abs(mxxyPyzz) / (abs(mxxyPyzz) + qudricLimitP);
-    mxxyPyzz += wadjust * (-mxxyPyzz);
-    wadjust   = OxyyMxzz + (c1o1 - OxyyMxzz)*abs(mxxyMyzz) / (abs(mxxyMyzz) + qudricLimitM);
-    mxxyMyzz += wadjust * (-mxxyMyzz);
-    wadjust   = OxyyPxzz + (c1o1 - OxyyPxzz)*abs(mxxzPyyz) / (abs(mxxzPyyz) + qudricLimitP);
-    mxxzPyyz += wadjust * (-mxxzPyyz);
-    wadjust   = OxyyMxzz + (c1o1 - OxyyMxzz)*abs(mxxzMyyz) / (abs(mxxzMyyz) + qudricLimitM);
-    mxxzMyyz += wadjust * (-mxxzMyyz);
-    wadjust   = OxyyPxzz + (c1o1 - OxyyPxzz)*abs(mxyyPxzz) / (abs(mxyyPxzz) + qudricLimitP);
-    mxyyPxzz += wadjust * (-mxyyPxzz);
-    wadjust   = OxyyMxzz + (c1o1 - OxyyMxzz)*abs(mxyyMxzz) / (abs(mxyyMxzz) + qudricLimitM);
-    mxyyMxzz += wadjust * (-mxyyMxzz);
-    //////////////////////////////////////////////////////////////////////////
-    // no limiter
-    //mfbbb += OxyyMxzz * (-mfbbb);
-    //mxxyPyzz += OxyyPxzz * (-mxxyPyzz);
-    //mxxyMyzz += OxyyMxzz * (-mxxyMyzz);
-    //mxxzPyyz += OxyyPxzz * (-mxxzPyyz);
-    //mxxzMyyz += OxyyMxzz * (-mxxzMyyz);
-    //mxyyPxzz += OxyyPxzz * (-mxyyPxzz);
-    //mxyyMxzz += OxyyMxzz * (-mxyyMxzz);   
-    ////////////////////////////////////////////////////////////////////////////////////
-    //! - Compute inverse linear combinations of second and third order cumulants
-    //!
-    mfcaa = c1o3 * (mxxMyy + mxxMzz + mxxPyyPzz);
-    mfaca = c1o3 * (-c2o1*  mxxMyy + mxxMzz + mxxPyyPzz);
-    mfaac = c1o3 * (mxxMyy - c2o1* mxxMzz + mxxPyyPzz); 
-    mfcba = ( mxxyMyzz + mxxyPyzz) * c1o2;
-    mfabc = (-mxxyMyzz + mxxyPyzz) * c1o2;
-    mfcab = ( mxxzMyyz + mxxzPyyz) * c1o2;
-    mfacb = (-mxxzMyyz + mxxzPyyz) * c1o2;
-    mfbca = ( mxyyMxzz + mxyyPxzz) * c1o2;
-    mfbac = (-mxyyMxzz + mxyyPxzz) * c1o2;
-    //////////////////////////////////////////////////////////////////////////  
-    //////////////////////////////////////////////////////////////////////////
-    //4.
-    // no limiter
-    //! - Relax fourth order cumulants to modified equilibrium for fourth order convergence of diffusion according  to Eq. (43)-(48)
-    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
-    //!
-    CUMacc = -O4*(c1o1 / omega - c1o2) * (dyuy + dzuz) * c2o3 * A + (c1o1 - O4) * (CUMacc);
-    CUMcac = -O4*(c1o1 / omega - c1o2) * (dxux + dzuz) * c2o3 * A + (c1o1 - O4) * (CUMcac);
-    CUMcca = -O4*(c1o1 / omega - c1o2) * (dyuy + dxux) * c2o3 * A + (c1o1 - O4) * (CUMcca);
-    CUMbbc = -O4*(c1o1 / omega - c1o2) * Dxy           * c1o3 * B + (c1o1 - O4) * (CUMbbc);
-    CUMbcb = -O4*(c1o1 / omega - c1o2) * Dxz           * c1o3 * B + (c1o1 - O4) * (CUMbcb);
-    CUMcbb = -O4*(c1o1 / omega - c1o2) * Dyz           * c1o3 * B + (c1o1 - O4) * (CUMcbb); 
-    //////////////////////////////////////////////////////////////////////////
-    //5.
-    CUMbcc += O5 * (-CUMbcc);
-    CUMcbc += O5 * (-CUMcbc);
-    CUMccb += O5 * (-CUMccb);   
-    //////////////////////////////////////////////////////////////////////////
-    //6.
-    CUMccc += O6 * (-CUMccc);   
-    ////////////////////////////////////////////////////////////////////////////////////
-    //! - Compute central moments from post collision cumulants according to Eq. (53)-(56) in
-    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
-    //! 
-    //////////////////////////////////////////////////////////////////////////
-    //4.
-    mfcbb = CUMcbb + c1o3*((c3o1*mfcaa + c1o1) * mfabb + c6o1 * mfbba * mfbab) * OOrho;
-    mfbcb = CUMbcb + c1o3*((c3o1*mfaca + c1o1) * mfbab + c6o1 * mfbba * mfabb) * OOrho;
-    mfbbc = CUMbbc + c1o3*((c3o1*mfaac + c1o1) * mfbba + c6o1 * mfbab * mfabb) * OOrho; 
-    mfcca = CUMcca + (((mfcaa * mfaca + c2o1 * mfbba * mfbba)*c9o1 + c3o1 * (mfcaa + mfaca)) * OOrho - (drho *  OOrho))*c1o9;
-    mfcac = CUMcac + (((mfcaa * mfaac + c2o1 * mfbab * mfbab)*c9o1 + c3o1 * (mfcaa + mfaac)) * OOrho - (drho *  OOrho))*c1o9;
-    mfacc = CUMacc + (((mfaac * mfaca + c2o1 * mfabb * mfabb)*c9o1 + c3o1 * (mfaac + mfaca)) * OOrho - (drho *  OOrho))*c1o9; 
-    //////////////////////////////////////////////////////////////////////////
-    //5.
-    mfbcc = CUMbcc + c1o3 *(c3o1*(mfaac * mfbca + mfaca * mfbac + c4o1 * mfabb * mfbbb + c2o1 * (mfbab * mfacb +    mfbba * mfabc)) + (mfbca + mfbac)) * OOrho;
-    mfcbc = CUMcbc + c1o3 *(c3o1*(mfaac * mfcba + mfcaa * mfabc + c4o1 * mfbab * mfbbb + c2o1 * (mfabb * mfcab +    mfbba * mfbac)) + (mfcba + mfabc)) * OOrho;
-    mfccb = CUMccb + c1o3 *(c3o1*(mfcaa * mfacb + mfaca * mfcab + c4o1 * mfbba * mfbbb + c2o1 * (mfbab * mfbca +    mfabb * mfcba)) + (mfacb + mfcab)) * OOrho; 
-    //////////////////////////////////////////////////////////////////////////
-    //6.
-    mfccc =	CUMccc - ((-c4o1 *  mfbbb * mfbbb
-            - (mfcaa * mfacc + mfaca * mfcac + mfaac * mfcca)
-            - c4o1 * (mfabb * mfcbb + mfbab * mfbcb + mfbba * mfbbc)
-            - c2o1 * (mfbca * mfbac + mfcba * mfabc + mfcab * mfacb)) * OOrho
-            + (c4o1 * (mfbab * mfbab * mfaca + mfabb * mfabb * mfcaa + mfbba * mfbba * mfaac)
-                + c2o1 * (mfcaa * mfaca * mfaac)
-                + c16o1 *  mfbba * mfbab * mfabb) * OOrho * OOrho
-            - c1o3 * (mfacc + mfcac + mfcca) * OOrho
-            - c1o9 * (mfcaa + mfaca + mfaac) * OOrho
-            + (c2o1 * (mfbab * mfbab + mfabb * mfabb + mfbba * mfbba)
-                + (mfaac * mfaca + mfaac * mfcaa + mfaca * mfcaa) + c1o3 *(mfaac + mfaca + mfcaa)) * OOrho * OOrho * c2o3
-            + c1o27*((drho * drho - drho) * OOrho * OOrho));    
-    ////////////////////////////////////////////////////////////////////////////////////
-    //! -  Add acceleration (body force) to first order cumulants according to Eq. (85)-(87) in
-    //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015), DOI:10.1016/j.camwa  2015.05.001 ]</b></a>
-    //!
-    mfbaa = -mfbaa;
-    mfaba = -mfaba;
-    mfaab = -mfaab; 
-    ////////////////////////////////////////////////////////////////////////////////////
-    //! - Chimera transform from central moments to well conditioned distributions as defined in Appendix J in
-    //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015), DOI:10.1016/j.camwa  2015.05.001 ]</b></a>
-    //! see also Eq. (88)-(96) in
-    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
-    //!
-    ////////////////////////////////////////////////////////////////////////////////////
-    // X - Dir
-    VF::LBM::backwardInverseChimeraWithK(mfaaa, mfbaa, mfcaa, vvx, vx2, c1o1, c1o1);
-    VF::LBM::backwardChimera(            mfaba, mfbba, mfcba, vvx, vx2);
-    VF::LBM::backwardInverseChimeraWithK(mfaca, mfbca, mfcca, vvx, vx2, c3o1, c1o3);
-    VF::LBM::backwardChimera(            mfaab, mfbab, mfcab, vvx, vx2);
-    VF::LBM::backwardChimera(            mfabb, mfbbb, mfcbb, vvx, vx2);
-    VF::LBM::backwardChimera(            mfacb, mfbcb, mfccb, vvx, vx2);
-    VF::LBM::backwardInverseChimeraWithK(mfaac, mfbac, mfcac, vvx, vx2, c3o1, c1o3);
-    VF::LBM::backwardChimera(            mfabc, mfbbc, mfcbc, vvx, vx2);
-    VF::LBM::backwardInverseChimeraWithK(mfacc, mfbcc, mfccc, vvx, vx2, c9o1, c1o9);    
-    ////////////////////////////////////////////////////////////////////////////////////
-    // Y - Dir
-    VF::LBM::backwardInverseChimeraWithK(mfaaa, mfaba, mfaca, vvy, vy2,  c6o1,  c1o6);
-    VF::LBM::backwardChimera(            mfaab, mfabb, mfacb, vvy, vy2);
-    VF::LBM::backwardInverseChimeraWithK(mfaac, mfabc, mfacc, vvy, vy2, c18o1, c1o18);
-    VF::LBM::backwardInverseChimeraWithK(mfbaa, mfbba, mfbca, vvy, vy2,  c3o2,  c2o3);
-    VF::LBM::backwardChimera(            mfbab, mfbbb, mfbcb, vvy, vy2);
-    VF::LBM::backwardInverseChimeraWithK(mfbac, mfbbc, mfbcc, vvy, vy2,  c9o2,  c2o9);
-    VF::LBM::backwardInverseChimeraWithK(mfcaa, mfcba, mfcca, vvy, vy2,  c6o1,  c1o6);
-    VF::LBM::backwardChimera(            mfcab, mfcbb, mfccb, vvy, vy2);
-    VF::LBM::backwardInverseChimeraWithK(mfcac, mfcbc, mfccc, vvy, vy2, c18o1, c1o18);  
-    ////////////////////////////////////////////////////////////////////////////////////
-    // Z - Dir
-    VF::LBM::backwardInverseChimeraWithK(mfaaa, mfaab, mfaac, vvz, vz2, c36o1, c1o36);
-    VF::LBM::backwardInverseChimeraWithK(mfaba, mfabb, mfabc, vvz, vz2,  c9o1,  c1o9);
-    VF::LBM::backwardInverseChimeraWithK(mfaca, mfacb, mfacc, vvz, vz2, c36o1, c1o36);
-    VF::LBM::backwardInverseChimeraWithK(mfbaa, mfbab, mfbac, vvz, vz2,  c9o1,  c1o9);
-    VF::LBM::backwardInverseChimeraWithK(mfbba, mfbbb, mfbbc, vvz, vz2,  c9o4,  c4o9);
-    VF::LBM::backwardInverseChimeraWithK(mfbca, mfbcb, mfbcc, vvz, vz2,  c9o1,  c1o9);
-    VF::LBM::backwardInverseChimeraWithK(mfcaa, mfcab, mfcac, vvz, vz2, c36o1, c1o36);
-    VF::LBM::backwardInverseChimeraWithK(mfcba, mfcbb, mfcbc, vvz, vz2,  c9o1,  c1o9);
-    VF::LBM::backwardInverseChimeraWithK(mfcca, mfccb, mfccc, vvz, vz2, c36o1, c1o36);
-
-        ////////////////////////////////////////////////////////////////////////////////////
-        //! - Write distributions: style of reading and writing the distributions from/to 
-        //! stored arrays dependent on timestep is based on the esoteric twist algorithm
-        //! <a href="https://doi.org/10.3390/computation5020019"><b>[ M. Geier et al. (2017), DOI:10.3390/computation5020019 ]</b></a>
-        //!
-        (dist.f[dirE   ])[k   ] = mfabb;
-        (dist.f[dirW   ])[kw  ] = mfcbb;
-        (dist.f[dirN   ])[k   ] = mfbab;
-        (dist.f[dirS   ])[ks  ] = mfbcb;
-        (dist.f[dirT   ])[k   ] = mfbba;
-        (dist.f[dirB   ])[kb  ] = mfbbc;
-        (dist.f[dirNE  ])[k   ] = mfaab;
-        (dist.f[dirSW  ])[ksw ] = mfccb;
-        (dist.f[dirSE  ])[ks  ] = mfacb;
-        (dist.f[dirNW  ])[kw  ] = mfcab;
-        (dist.f[dirTE  ])[k   ] = mfaba;
-        (dist.f[dirBW  ])[kbw ] = mfcbc;
-        (dist.f[dirBE  ])[kb  ] = mfabc;
-        (dist.f[dirTW  ])[kw  ] = mfcba;
-        (dist.f[dirTN  ])[k   ] = mfbaa;
-        (dist.f[dirBS  ])[kbs ] = mfbcc;
-        (dist.f[dirBN  ])[kb  ] = mfbac;
-        (dist.f[dirTS  ])[ks  ] = mfbca;
-        (dist.f[dirREST])[k   ] = mfbbb;
-        (dist.f[dirTNE ])[k   ] = mfaaa;
-        (dist.f[dirTSE ])[ks  ] = mfaca;
-        (dist.f[dirBNE ])[kb  ] = mfaac;
-        (dist.f[dirBSE ])[kbs ] = mfacc;
-        (dist.f[dirTNW ])[kw  ] = mfcaa;
-        (dist.f[dirTSW ])[ksw ] = mfcca;
-        (dist.f[dirBNW ])[kbw ] = mfcac;
-        (dist.f[dirBSW ])[kbsw] = mfccc;
-    }
-}
-////////////////////////////////////////////////////////////////////////////////
-
 
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/src/gpu/VirtualFluids_GPU/GPU/CumulantK17ChimeraKernel.cu b/src/gpu/VirtualFluids_GPU/GPU/CumulantK17ChimeraKernel.cu
new file mode 100644
index 000000000..be940fd37
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/GPU/CumulantK17ChimeraKernel.cu
@@ -0,0 +1,742 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __         
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |        
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |        
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |        
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____    
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|   
+//      \    \  |    |   ________________________________________________________________    
+//       \    \ |    |  |  ______________________________________________________________|   
+//        \    \|    |  |  |         __          __     __     __     ______      _______    
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)   
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______    
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/   
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can 
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of 
+//  the License, or (at your option) any later version.
+//  
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT 
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
+//  for more details.
+//  
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file Cumulant27chim.cu
+//! \ingroup GPU
+//! \author Martin Schoenherr
+//=======================================================================================
+/* Device code */
+#include "LBM/LB.h" 
+#include "LBM/D3Q27.h"
+#include "Core/RealConstants.h"
+#include "math.h"
+
+
+#include <lbm/CumulantChimeraPreCompiled.h>
+
+
+__device__ Distributions27 getDistributions27(real* distributions, unsigned int size_Mat, bool isEvenTimestep)
+{
+    Distributions27 dist;
+    if (isEvenTimestep)
+    {
+        dist.f[dirE   ] = &distributions[dirE   *size_Mat];
+        dist.f[dirW   ] = &distributions[dirW   *size_Mat];
+        dist.f[dirN   ] = &distributions[dirN   *size_Mat];
+        dist.f[dirS   ] = &distributions[dirS   *size_Mat];
+        dist.f[dirT   ] = &distributions[dirT   *size_Mat];
+        dist.f[dirB   ] = &distributions[dirB   *size_Mat];
+        dist.f[dirNE  ] = &distributions[dirNE  *size_Mat];
+        dist.f[dirSW  ] = &distributions[dirSW  *size_Mat];
+        dist.f[dirSE  ] = &distributions[dirSE  *size_Mat];
+        dist.f[dirNW  ] = &distributions[dirNW  *size_Mat];
+        dist.f[dirTE  ] = &distributions[dirTE  *size_Mat];
+        dist.f[dirBW  ] = &distributions[dirBW  *size_Mat];
+        dist.f[dirBE  ] = &distributions[dirBE  *size_Mat];
+        dist.f[dirTW  ] = &distributions[dirTW  *size_Mat];
+        dist.f[dirTN  ] = &distributions[dirTN  *size_Mat];
+        dist.f[dirBS  ] = &distributions[dirBS  *size_Mat];
+        dist.f[dirBN  ] = &distributions[dirBN  *size_Mat];
+        dist.f[dirTS  ] = &distributions[dirTS  *size_Mat];
+        dist.f[dirREST] = &distributions[dirREST*size_Mat];
+        dist.f[dirTNE ] = &distributions[dirTNE *size_Mat];
+        dist.f[dirTSW ] = &distributions[dirTSW *size_Mat];
+        dist.f[dirTSE ] = &distributions[dirTSE *size_Mat];
+        dist.f[dirTNW ] = &distributions[dirTNW *size_Mat];
+        dist.f[dirBNE ] = &distributions[dirBNE *size_Mat];
+        dist.f[dirBSW ] = &distributions[dirBSW *size_Mat];
+        dist.f[dirBSE ] = &distributions[dirBSE *size_Mat];
+        dist.f[dirBNW ] = &distributions[dirBNW *size_Mat];
+    }
+    else
+    {
+        dist.f[dirW   ] = &distributions[dirE   *size_Mat];
+        dist.f[dirE   ] = &distributions[dirW   *size_Mat];
+        dist.f[dirS   ] = &distributions[dirN   *size_Mat];
+        dist.f[dirN   ] = &distributions[dirS   *size_Mat];
+        dist.f[dirB   ] = &distributions[dirT   *size_Mat];
+        dist.f[dirT   ] = &distributions[dirB   *size_Mat];
+        dist.f[dirSW  ] = &distributions[dirNE  *size_Mat];
+        dist.f[dirNE  ] = &distributions[dirSW  *size_Mat];
+        dist.f[dirNW  ] = &distributions[dirSE  *size_Mat];
+        dist.f[dirSE  ] = &distributions[dirNW  *size_Mat];
+        dist.f[dirBW  ] = &distributions[dirTE  *size_Mat];
+        dist.f[dirTE  ] = &distributions[dirBW  *size_Mat];
+        dist.f[dirTW  ] = &distributions[dirBE  *size_Mat];
+        dist.f[dirBE  ] = &distributions[dirTW  *size_Mat];
+        dist.f[dirBS  ] = &distributions[dirTN  *size_Mat];
+        dist.f[dirTN  ] = &distributions[dirBS  *size_Mat];
+        dist.f[dirTS  ] = &distributions[dirBN  *size_Mat];
+        dist.f[dirBN  ] = &distributions[dirTS  *size_Mat];
+        dist.f[dirREST] = &distributions[dirREST*size_Mat];
+        dist.f[dirBSW ] = &distributions[dirTNE *size_Mat];
+        dist.f[dirBNE ] = &distributions[dirTSW *size_Mat];
+        dist.f[dirBNW ] = &distributions[dirTSE *size_Mat];
+        dist.f[dirBSE ] = &distributions[dirTNW *size_Mat];
+        dist.f[dirTSW ] = &distributions[dirBNE *size_Mat];
+        dist.f[dirTNE ] = &distributions[dirBSW *size_Mat];
+        dist.f[dirTNW ] = &distributions[dirBSE *size_Mat];
+        dist.f[dirTSE ] = &distributions[dirBNW *size_Mat];
+    }
+    return dist;
+}
+
+struct DistributionWrapper
+{
+    __device__ DistributionWrapper(
+        real* distributions,
+        unsigned int size_Mat,
+        bool isEvenTimestep,
+        uint k,
+        uint* neighborX,
+        uint* neighborY,
+        uint* neighborZ) :
+        dist(getDistributions27(distributions, size_Mat, isEvenTimestep)),
+        k(k),
+        kw  (neighborX[k]),
+        ks  (neighborY[k]),
+        kb  (neighborZ[k]),
+        ksw (neighborY[kw]),
+        kbw (neighborZ[kw]),
+        kbs (neighborZ[ks]),
+        kbsw(neighborZ[ksw])
+    { 
+        read();
+    }
+
+    __device__ void read()
+    {
+        distribution.f[VF::LBM::DIR::PZZ] = (dist.f[dirE   ])[k];
+        distribution.f[VF::LBM::DIR::MZZ] = (dist.f[dirW   ])[kw];
+        distribution.f[VF::LBM::DIR::ZPZ] = (dist.f[dirN   ])[k];
+        distribution.f[VF::LBM::DIR::ZMZ] = (dist.f[dirS   ])[ks];
+        distribution.f[VF::LBM::DIR::ZZP] = (dist.f[dirT   ])[k];
+        distribution.f[VF::LBM::DIR::ZZM] = (dist.f[dirB   ])[kb];
+        distribution.f[VF::LBM::DIR::PPZ] = (dist.f[dirNE  ])[k];
+        distribution.f[VF::LBM::DIR::MMZ] = (dist.f[dirSW  ])[ksw];
+        distribution.f[VF::LBM::DIR::PMZ] = (dist.f[dirSE  ])[ks];
+        distribution.f[VF::LBM::DIR::MPZ] = (dist.f[dirNW  ])[kw];
+        distribution.f[VF::LBM::DIR::PZP] = (dist.f[dirTE  ])[k];
+        distribution.f[VF::LBM::DIR::MZM] = (dist.f[dirBW  ])[kbw];
+        distribution.f[VF::LBM::DIR::PZM] = (dist.f[dirBE  ])[kb];
+        distribution.f[VF::LBM::DIR::MZP] = (dist.f[dirTW  ])[kw];
+        distribution.f[VF::LBM::DIR::ZPP] = (dist.f[dirTN  ])[k];
+        distribution.f[VF::LBM::DIR::ZMM] = (dist.f[dirBS  ])[kbs];
+        distribution.f[VF::LBM::DIR::ZPM] = (dist.f[dirBN  ])[kb];
+        distribution.f[VF::LBM::DIR::ZMP] = (dist.f[dirTS  ])[ks];
+        distribution.f[VF::LBM::DIR::PPP] = (dist.f[dirTNE ])[k];
+        distribution.f[VF::LBM::DIR::MPP] = (dist.f[dirTNW ])[kw];
+        distribution.f[VF::LBM::DIR::PMP] = (dist.f[dirTSE ])[ks];
+        distribution.f[VF::LBM::DIR::MMP] = (dist.f[dirTSW ])[ksw];
+        distribution.f[VF::LBM::DIR::PPM] = (dist.f[dirBNE ])[kb];
+        distribution.f[VF::LBM::DIR::MPM] = (dist.f[dirBNW ])[kbw];
+        distribution.f[VF::LBM::DIR::PMM] = (dist.f[dirBSE ])[kbs];
+        distribution.f[VF::LBM::DIR::MMM] = (dist.f[dirBSW ])[kbsw];
+        distribution.f[VF::LBM::DIR::ZZZ] = (dist.f[dirREST])[k];
+    }
+
+    __device__ void write()
+    {
+        (dist.f[dirE   ])[k]    = distribution.f[VF::LBM::DIR::PZZ];
+        (dist.f[dirW   ])[kw]   = distribution.f[VF::LBM::DIR::MZZ];
+        (dist.f[dirN   ])[k]    = distribution.f[VF::LBM::DIR::ZPZ];
+        (dist.f[dirS   ])[ks]   = distribution.f[VF::LBM::DIR::ZMZ];
+        (dist.f[dirT   ])[k]    = distribution.f[VF::LBM::DIR::ZZP];
+        (dist.f[dirB   ])[kb]   = distribution.f[VF::LBM::DIR::ZZM];
+        (dist.f[dirNE  ])[k]    = distribution.f[VF::LBM::DIR::PPZ];
+        (dist.f[dirSW  ])[ksw]  = distribution.f[VF::LBM::DIR::MMZ];
+        (dist.f[dirSE  ])[ks]   = distribution.f[VF::LBM::DIR::PMZ];
+        (dist.f[dirNW  ])[kw]   = distribution.f[VF::LBM::DIR::MPZ];
+        (dist.f[dirTE  ])[k]    = distribution.f[VF::LBM::DIR::PZP];
+        (dist.f[dirBW  ])[kbw]  = distribution.f[VF::LBM::DIR::MZM];
+        (dist.f[dirBE  ])[kb]   = distribution.f[VF::LBM::DIR::PZM];
+        (dist.f[dirTW  ])[kw]   = distribution.f[VF::LBM::DIR::MZP];
+        (dist.f[dirTN  ])[k]    = distribution.f[VF::LBM::DIR::ZPP];
+        (dist.f[dirBS  ])[kbs]  = distribution.f[VF::LBM::DIR::ZMM];
+        (dist.f[dirBN  ])[kb]   = distribution.f[VF::LBM::DIR::ZPM];
+        (dist.f[dirTS  ])[ks]   = distribution.f[VF::LBM::DIR::ZMP];
+        (dist.f[dirTNE ])[k]    = distribution.f[VF::LBM::DIR::PPP];
+        (dist.f[dirTNW ])[kw]   = distribution.f[VF::LBM::DIR::MPP];
+        (dist.f[dirTSE ])[ks]   = distribution.f[VF::LBM::DIR::PMP];
+        (dist.f[dirTSW ])[ksw]  = distribution.f[VF::LBM::DIR::MMP];
+        (dist.f[dirBNE ])[kb]   = distribution.f[VF::LBM::DIR::PPM];
+        (dist.f[dirBNW ])[kbw]  = distribution.f[VF::LBM::DIR::MPM];
+        (dist.f[dirBSE ])[kbs]  = distribution.f[VF::LBM::DIR::PMM];
+        (dist.f[dirBSW ])[kbsw] = distribution.f[VF::LBM::DIR::MMM];
+        (dist.f[dirREST])[k]    = distribution.f[VF::LBM::DIR::ZZZ];
+    }
+
+    Distributions27 dist;
+
+    VF::LBM::Distribution27 distribution;
+
+    const uint k;
+    const uint kw;
+    const uint ks;
+    const uint kb;
+    const uint ksw;
+    const uint kbw;
+    const uint kbs;
+    const uint kbsw;
+};
+
+__device__ unsigned int getNodeIndex()
+{
+    const unsigned  x = threadIdx.x; 
+    const unsigned  y = blockIdx.x;  
+    const unsigned  z = blockIdx.y;  
+
+    const unsigned nx = blockDim.x;
+    const unsigned ny = gridDim.x;
+
+    return nx*(ny*z + y) + x;
+}
+
+__device__ bool isValidFluidNode(uint k, int size_Mat, uint nodeType)
+{
+    return (k < size_Mat) && (nodeType == GEO_FLUID);
+}
+
+__device__ void getLevelForce(real fx, real fy, real fz, int level, real* forces)
+{
+    real fx_t {1.}, fy_t {1.}, fz_t {1.};
+    for (int i = 0; i < level; i++)
+    {
+        fx_t *= c2o1;
+        fy_t *= c2o1;
+        fz_t *= c2o1;
+    }
+
+    forces[0] = fx / fx_t;
+    forces[1] = fy / fy_t;
+    forces[2] = fz / fz_t;
+}
+
+
+extern "C" __global__ void Cumulant_K17_LBM_Device_Kernel(
+    real omega,
+    uint* typeOfGridNode,
+    uint* neighborX,
+    uint* neighborY,
+    uint* neighborZ,
+    real* distributions,
+    int size_Mat,
+    int level,
+    real* forces,
+    bool isEvenTimestep)
+{
+    const uint k = getNodeIndex();
+    const uint nodeType = typeOfGridNode[k];
+
+    if (isValidFluidNode(k, size_Mat, nodeType))
+    {
+        DistributionWrapper distributionWrapper {
+            distributions, size_Mat, isEvenTimestep, k, neighborX, neighborY, neighborZ
+        };
+
+        real level_forces[3];
+        getLevelForce(forces[0], forces[1], forces[2], level, level_forces);
+
+        VF::LBM::cumulantChimera(distributionWrapper.distribution, omega, level_forces);
+
+        distributionWrapper.write();
+    }
+}
+
+
+
+////////////////////////////////////////////////////////////////////////////////
+extern "C" __global__ void Cumulant_K17_LBM_Device_Kernel_old(
+    real omega,
+    uint* typeOfGridNode,
+    uint* neighborX,
+    uint* neighborY,
+    uint* neighborZ,
+    real* distributions,
+    int size_Mat,
+    int level,
+    real* forces,
+    bool isEvenTimestep)
+{
+    //////////////////////////////////////////////////////////////////////////
+    //! Cumulant K17 Kernel is based on \ref
+    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
+    //! and \ref
+    //! <a href="https://doi.org/10.1016/j.jcp.2017.07.004"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.07.004 ]</b></a>
+    //!
+    //! The cumulant kernel is executed in the following steps
+    //!
+    ////////////////////////////////////////////////////////////////////////////////
+    //! - Get node index coordinates from thredIdx, blockIdx, blockDim and gridDim.
+    //!
+    const unsigned  x = threadIdx.x; 
+    const unsigned  y = blockIdx.x;  
+    const unsigned  z = blockIdx.y;  
+
+    const unsigned nx = blockDim.x;
+    const unsigned ny = gridDim.x;
+
+    const unsigned k = nx*(ny*z + y) + x;
+
+    //////////////////////////////////////////////////////////////////////////
+    // run for all indices in size_Mat and fluid nodes
+    if ((k < size_Mat) && (typeOfGridNode[k] == GEO_FLUID))
+    {
+        //////////////////////////////////////////////////////////////////////////
+        //! - Read distributions: style of reading and writing the distributions from/to stored arrays dependent on timestep is based on the esoteric twist algorithm \ref
+        //! <a href="https://doi.org/10.3390/computation5020019"><b>[ M. Geier et al. (2017), DOI:10.3390/computation5020019 ]</b></a>
+        //!
+        Distributions27 dist = getDistributions27(distributions, size_Mat, isEvenTimestep);
+
+        ////////////////////////////////////////////////////////////////////////////////
+        //! - Set neighbor indices (necessary for indirect addressing) 
+        uint kw   = neighborX[k];
+        uint ks   = neighborY[k];
+        uint kb   = neighborZ[k];
+        uint ksw  = neighborY[kw];
+        uint kbw  = neighborZ[kw];
+        uint kbs  = neighborZ[ks];
+        uint kbsw = neighborZ[ksw];
+        ////////////////////////////////////////////////////////////////////////////////////
+        //! - Set local distributions
+        //!
+        real mfcbb = (dist.f[dirE   ])[k];
+        real mfabb = (dist.f[dirW   ])[kw];
+        real mfbcb = (dist.f[dirN   ])[k];
+        real mfbab = (dist.f[dirS   ])[ks];
+        real mfbbc = (dist.f[dirT   ])[k];
+        real mfbba = (dist.f[dirB   ])[kb];
+        real mfccb = (dist.f[dirNE  ])[k];
+        real mfaab = (dist.f[dirSW  ])[ksw];
+        real mfcab = (dist.f[dirSE  ])[ks];
+        real mfacb = (dist.f[dirNW  ])[kw];
+        real mfcbc = (dist.f[dirTE  ])[k];
+        real mfaba = (dist.f[dirBW  ])[kbw];
+        real mfcba = (dist.f[dirBE  ])[kb];
+        real mfabc = (dist.f[dirTW  ])[kw];
+        real mfbcc = (dist.f[dirTN  ])[k];
+        real mfbaa = (dist.f[dirBS  ])[kbs];
+        real mfbca = (dist.f[dirBN  ])[kb];
+        real mfbac = (dist.f[dirTS  ])[ks];
+        real mfbbb = (dist.f[dirREST])[k];
+        real mfccc = (dist.f[dirTNE ])[k];
+        real mfaac = (dist.f[dirTSW ])[ksw];
+        real mfcac = (dist.f[dirTSE ])[ks];
+        real mfacc = (dist.f[dirTNW ])[kw];
+        real mfcca = (dist.f[dirBNE ])[kb];
+        real mfaaa = (dist.f[dirBSW ])[kbsw];
+        real mfcaa = (dist.f[dirBSE ])[kbs];
+        real mfaca = (dist.f[dirBNW ])[kbw];
+
+        ////////////////////////////////////////////////////////////////////////////////////
+        //! - Calculate density and velocity using pyramid summation for low round-off errors as in Eq. (J1)-(J3) \ref
+        //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015), DOI:10.1016/j.camwa  2015.05.001 ]</b></a>
+        //!
+        real drho =
+            ((((mfccc + mfaaa) + (mfaca + mfcac)) + ((mfacc + mfcaa) + (mfaac + mfcca))) +
+            (((mfbac + mfbca) + (mfbaa + mfbcc)) + ((mfabc + mfcba) + (mfaba + mfcbc)) + ((mfacb + mfcab) + (mfaab + mfccb))) +
+            ((mfabb + mfcbb) + (mfbab + mfbcb) + (mfbba + mfbbc))) + mfbbb; 
+        real rho = c1o1 + drho;
+        real OOrho = c1o1 / rho;    
+        real vvx = 
+            ((((mfccc - mfaaa) + (mfcac - mfaca)) + ((mfcaa - mfacc) + (mfcca - mfaac))) +
+            (((mfcba - mfabc) + (mfcbc - mfaba)) + ((mfcab - mfacb) + (mfccb - mfaab))) +
+            (mfcbb - mfabb)) * OOrho;
+        real vvy = 
+            ((((mfccc - mfaaa) + (mfaca - mfcac)) + ((mfacc - mfcaa) + (mfcca - mfaac))) +
+            (((mfbca - mfbac) + (mfbcc - mfbaa)) + ((mfacb - mfcab) + (mfccb - mfaab))) +
+            (mfbcb - mfbab)) * OOrho;
+        real vvz = 
+            ((((mfccc - mfaaa) + (mfcac - mfaca)) + ((mfacc - mfcaa) + (mfaac - mfcca))) +
+            (((mfbac - mfbca) + (mfbcc - mfbaa)) + ((mfabc - mfcba) + (mfcbc - mfaba))) +
+            (mfbbc - mfbba)) * OOrho;
+        ////////////////////////////////////////////////////////////////////////////////////
+        //! - Add half of the acceleration (body force) to the velocity as in Eq. (42) \ref
+        //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015), DOI:10.1016/j.camwa  2015.05.001 ]</b></a>
+        //!
+        real fx = forces[0];
+        real fy = forces[1];
+        real fz = forces[2];
+
+        real fx_t {1.}, fy_t {1.}, fz_t {1.};
+        for (int i = 0; i < level; i++)
+        {
+            fx_t *= c2o1;
+            fy_t *= c2o1;
+            fz_t *= c2o1;
+        }
+
+        fx /= fx_t;
+        fy /= fy_t;
+        fz /= fz_t;
+        //real forces[3] {fx, fy, fz};
+
+        vvx += fx * c1o2;
+        vvy += fy * c1o2;
+        vvz += fz * c1o2;
+        ////////////////////////////////////////////////////////////////////////////////////
+        // calculate the square of velocities for this lattice node
+        real vx2 = vvx*vvx;
+        real vy2 = vvy*vvy;
+        real vz2 = vvz*vvz;
+        ////////////////////////////////////////////////////////////////////////////////////
+        //! - Set relaxation limiters for third order cumulants to default value \f$ \lambda=0.001 \f$ according to section 6 in \ref
+        //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
+        //!
+        real wadjust;
+        real qudricLimitP = c1o100;
+        real qudricLimitM = c1o100;
+        real qudricLimitD = c1o100;
+        ////////////////////////////////////////////////////////////////////////////////////
+        //! - Chimera transform from well conditioned distributions to central moments as defined in Appendix J in \ref
+        //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015), DOI:10.1016/j.camwa  2015.05.001 ]</b></a>
+        //! see also Eq. (6)-(14) in \ref
+        //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
+        //!
+        ////////////////////////////////////////////////////////////////////////////////////
+        // Z - Dir
+        VF::LBM::forwardInverseChimeraWithK(mfaaa, mfaab, mfaac, vvz, vz2, c36o1, c1o36);
+        VF::LBM::forwardInverseChimeraWithK(mfaba, mfabb, mfabc, vvz, vz2,  c9o1,  c1o9);
+        VF::LBM::forwardInverseChimeraWithK(mfaca, mfacb, mfacc, vvz, vz2, c36o1, c1o36);
+        VF::LBM::forwardInverseChimeraWithK(mfbaa, mfbab, mfbac, vvz, vz2,  c9o1,  c1o9);
+        VF::LBM::forwardInverseChimeraWithK(mfbba, mfbbb, mfbbc, vvz, vz2,  c9o4,  c4o9);
+        VF::LBM::forwardInverseChimeraWithK(mfbca, mfbcb, mfbcc, vvz, vz2,  c9o1,  c1o9);
+        VF::LBM::forwardInverseChimeraWithK(mfcaa, mfcab, mfcac, vvz, vz2, c36o1, c1o36);
+        VF::LBM::forwardInverseChimeraWithK(mfcba, mfcbb, mfcbc, vvz, vz2,  c9o1,  c1o9);
+        VF::LBM::forwardInverseChimeraWithK(mfcca, mfccb, mfccc, vvz, vz2, c36o1, c1o36);   
+        ////////////////////////////////////////////////////////////////////////////////////
+        // Y - Dir
+        VF::LBM::forwardInverseChimeraWithK(mfaaa, mfaba, mfaca, vvy, vy2,  c6o1,  c1o6);
+        VF::LBM::forwardChimera(            mfaab, mfabb, mfacb, vvy, vy2);
+        VF::LBM::forwardInverseChimeraWithK(mfaac, mfabc, mfacc, vvy, vy2, c18o1, c1o18);
+        VF::LBM::forwardInverseChimeraWithK(mfbaa, mfbba, mfbca, vvy, vy2,  c3o2,  c2o3);
+        VF::LBM::forwardChimera(            mfbab, mfbbb, mfbcb, vvy, vy2);
+        VF::LBM::forwardInverseChimeraWithK(mfbac, mfbbc, mfbcc, vvy, vy2,  c9o2,  c2o9);
+        VF::LBM::forwardInverseChimeraWithK(mfcaa, mfcba, mfcca, vvy, vy2,  c6o1,  c1o6);
+        VF::LBM::forwardChimera(            mfcab, mfcbb, mfccb, vvy, vy2);
+        VF::LBM::forwardInverseChimeraWithK(mfcac, mfcbc, mfccc, vvy, vy2, c18o1, c1o18);   
+        ////////////////////////////////////////////////////////////////////////////////////
+        // X - Dir
+        VF::LBM::forwardInverseChimeraWithK(mfaaa, mfbaa, mfcaa, vvx, vx2, c1o1, c1o1);
+        VF::LBM::forwardChimera(            mfaba, mfbba, mfcba, vvx, vx2);
+        VF::LBM::forwardInverseChimeraWithK(mfaca, mfbca, mfcca, vvx, vx2, c3o1, c1o3);
+        VF::LBM::forwardChimera(            mfaab, mfbab, mfcab, vvx, vx2);
+        VF::LBM::forwardChimera(            mfabb, mfbbb, mfcbb, vvx, vx2);
+        VF::LBM::forwardChimera(            mfacb, mfbcb, mfccb, vvx, vx2);
+        VF::LBM::forwardInverseChimeraWithK(mfaac, mfbac, mfcac, vvx, vx2, c3o1, c1o3);
+        VF::LBM::forwardChimera(            mfabc, mfbbc, mfcbc, vvx, vx2);
+        VF::LBM::forwardInverseChimeraWithK(mfacc, mfbcc, mfccc, vvx, vx2, c3o1, c1o9); 
+        ////////////////////////////////////////////////////////////////////////////////////
+        //! - Setting relaxation rates for non-hydrodynamic cumulants (default values). Variable names and equations    according to
+        //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
+        //!  => [NAME IN PAPER]=[NAME IN CODE]=[DEFAULT VALUE].
+        //!  - Trace of second order cumulants \f$ C_{200}+C_{020}+C_{002} \f$ used to adjust bulk  viscosity:\f$\omega_2=OxxPyyPzz=1.0 \f$.
+        //!  - Third order cumulants \f$ C_{120}+C_{102}, C_{210}+C_{012}, C_{201}+C_{021} \f$: \f$ \omega_3=OxyyPxzz   \f$ set according to Eq. (111) with simplifications assuming \f$ \omega_2=1.0\f$.
+        //!  - Third order cumulants \f$ C_{120}-C_{102}, C_{210}-C_{012}, C_{201}-C_{021} \f$: \f$ \omega_4 =  OxyyMxzz \f$ set according to Eq. (112) with simplifications assuming \f$ \omega_2 = 1.0\f$.
+        //!  - Third order cumulants \f$ C_{111} \f$: \f$ \omega_5 = Oxyz \f$ set according to Eq. (113) with   simplifications assuming \f$ \omega_2 = 1.0\f$  (modify for different bulk viscosity).
+        //!  - Fourth order cumulants \f$ C_{220}, C_{202}, C_{022}, C_{211}, C_{121}, C_{112} \f$: for simplification  all set to the same default value \f$ \omega_6=\omega_7=\omega_8=O4=1.0 \f$.
+        //!  - Fifth order cumulants \f$ C_{221}, C_{212}, C_{122}\f$: \f$\omega_9=O5=1.0\f$.
+        //!  - Sixth order cumulant \f$ C_{222}\f$: \f$\omega_{10}=O6=1.0\f$.
+        //!
+        ////////////////////////////////////////////////////////////
+        //2.
+        real OxxPyyPzz = c1o1;
+        ////////////////////////////////////////////////////////////
+        //3.
+        real OxyyPxzz = c8o1  * (-c2o1 + omega) * ( c1o1 + c2o1*omega) / (-c8o1 - c14o1*omega + c7o1*omega*omega);
+        real OxyyMxzz = c8o1  * (-c2o1 + omega) * (-c7o1 + c4o1*omega) / (c56o1 - c50o1*omega + c9o1*omega*omega);
+        real Oxyz     = c24o1 * (-c2o1 + omega) * (-c2o1 - c7o1*omega + c3o1*omega*omega) / (c48o1 + c152o1*omega - c130o1*omega*omega + c29o1*omega*omega*omega);
+        ////////////////////////////////////////////////////////////
+        //4.
+        real O4 = c1o1;
+        ////////////////////////////////////////////////////////////
+        //5.
+        real O5 = c1o1;
+        ////////////////////////////////////////////////////////////
+        //6.
+        real O6 = c1o1; 
+        ////////////////////////////////////////////////////////////////////////////////////
+        //! - A and B: parameters for fourth order convergence of the diffusion term according to Eq. (114) and (115) 
+        //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
+        //! with simplifications assuming \f$ \omega_2 = 1.0 \f$ (modify for different bulk viscosity).
+        //!
+        real A = (c4o1 + c2o1*omega - c3o1*omega*omega) / (c2o1 - c7o1*omega + c5o1*omega*omega);
+        real B = (c4o1 + c28o1*omega - c14o1*omega*omega) / (c6o1 - c21o1*omega + c15o1*omega*omega);   
+        ////////////////////////////////////////////////////////////////////////////////////
+        //! - Compute cumulants from central moments according to Eq. (20)-(23) in
+        //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
+        //!
+        ////////////////////////////////////////////////////////////
+        //4.
+        real CUMcbb = mfcbb - ((mfcaa + c1o3) * mfabb + c2o1 * mfbba * mfbab) * OOrho;
+        real CUMbcb = mfbcb - ((mfaca + c1o3) * mfbab + c2o1 * mfbba * mfabb) * OOrho;
+        real CUMbbc = mfbbc - ((mfaac + c1o3) * mfbba + c2o1 * mfbab * mfabb) * OOrho;  
+        real CUMcca = mfcca - (((mfcaa * mfaca + c2o1 * mfbba * mfbba) + c1o3 * (mfcaa + mfaca)) * OOrho - c1o9*(drho   * OOrho));
+        real CUMcac = mfcac - (((mfcaa * mfaac + c2o1 * mfbab * mfbab) + c1o3 * (mfcaa + mfaac)) * OOrho - c1o9*(drho   * OOrho));
+        real CUMacc = mfacc - (((mfaac * mfaca + c2o1 * mfabb * mfabb) + c1o3 * (mfaac + mfaca)) * OOrho - c1o9*(drho   * OOrho));
+        ////////////////////////////////////////////////////////////
+        //5.
+        real CUMbcc = mfbcc - ((mfaac * mfbca + mfaca * mfbac + c4o1 * mfabb * mfbbb + c2o1 * (mfbab * mfacb + mfbba *  mfabc)) + c1o3 * (mfbca + mfbac)) * OOrho;
+        real CUMcbc = mfcbc - ((mfaac * mfcba + mfcaa * mfabc + c4o1 * mfbab * mfbbb + c2o1 * (mfabb * mfcab + mfbba *  mfbac)) + c1o3 * (mfcba + mfabc)) * OOrho;
+        real CUMccb = mfccb - ((mfcaa * mfacb + mfaca * mfcab + c4o1 * mfbba * mfbbb + c2o1 * (mfbab * mfbca + mfabb *  mfcba)) + c1o3 * (mfacb + mfcab)) * OOrho;
+        ////////////////////////////////////////////////////////////
+        //6.
+        real CUMccc = mfccc + ((-c4o1 *  mfbbb * mfbbb
+            - (mfcaa * mfacc + mfaca * mfcac + mfaac * mfcca)
+            - c4o1 * (mfabb * mfcbb + mfbab * mfbcb + mfbba * mfbbc)
+            - c2o1 * (mfbca * mfbac + mfcba * mfabc + mfcab * mfacb)) * OOrho
+            + (c4o1 * (mfbab * mfbab * mfaca + mfabb * mfabb * mfcaa + mfbba * mfbba * mfaac)
+            + c2o1 * (mfcaa * mfaca * mfaac)
+            + c16o1 *  mfbba * mfbab * mfabb) * OOrho * OOrho
+            - c1o3 * (mfacc + mfcac + mfcca) * OOrho
+            - c1o9 * (mfcaa + mfaca + mfaac) * OOrho
+            + (c2o1 * (mfbab * mfbab + mfabb * mfabb + mfbba * mfbba)
+            + (mfaac * mfaca + mfaac * mfcaa + mfaca * mfcaa) + c1o3 *(mfaac + mfaca + mfcaa)) * OOrho * OOrho  * c2o3
+            + c1o27*((drho * drho - drho) * OOrho * OOrho));    
+        ////////////////////////////////////////////////////////////////////////////////////
+        //! - Compute linear combinations of second and third order cumulants
+        //!
+        ////////////////////////////////////////////////////////////
+        //2.
+        real mxxPyyPzz = mfcaa + mfaca + mfaac;
+        real mxxMyy = mfcaa - mfaca;
+        real mxxMzz = mfcaa - mfaac;
+        ////////////////////////////////////////////////////////////
+        //3.
+        real mxxyPyzz = mfcba + mfabc;
+        real mxxyMyzz = mfcba - mfabc;  
+        real mxxzPyyz = mfcab + mfacb;
+        real mxxzMyyz = mfcab - mfacb;  
+        real mxyyPxzz = mfbca + mfbac;
+        real mxyyMxzz = mfbca - mfbac;  
+        ////////////////////////////////////////////////////////////////////////////////////
+        //incl. correction
+        ////////////////////////////////////////////////////////////
+        //! - Compute velocity  gradients from second order cumulants according to Eq. (27)-(32)
+        //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
+        //! Further explanations of the correction in viscosity in Appendix H of
+        //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015), DOI:10.1016/j.camwa  2015.05.001 ]</b></a>
+        //! Note that the division by rho is omitted here as we need rho times the gradients later.
+        //!
+        real Dxy = -c3o1*omega*mfbba;
+        real Dxz = -c3o1*omega*mfbab;
+        real Dyz = -c3o1*omega*mfabb;
+        real dxux = c1o2 * (-omega) *(mxxMyy + mxxMzz) + c1o2 *  OxxPyyPzz * (mfaaa - mxxPyyPzz);
+        real dyuy = dxux + omega * c3o2 * mxxMyy;
+        real dzuz = dxux + omega * c3o2 * mxxMzz;
+        ////////////////////////////////////////////////////////////
+        //! - Relaxation of second order cumulants with correction terms according to Eq. (33)-(35) in
+        //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
+        //!
+        mxxPyyPzz += OxxPyyPzz*(mfaaa - mxxPyyPzz) - c3o1 * (c1o1 - c1o2 * OxxPyyPzz) * (vx2 * dxux + vy2 * dyuy + vz2  * dzuz);
+        mxxMyy    += omega * (-mxxMyy) - c3o1 * (c1o1 + c1o2 * (-omega)) * (vx2 * dxux - vy2 * dyuy);
+        mxxMzz    += omega * (-mxxMzz) - c3o1 * (c1o1 + c1o2 * (-omega)) * (vx2 * dxux - vz2 * dzuz);   
+        ////////////////////////////////////////////////////////////////////////////////////
+        ////no correction
+        //mxxPyyPzz += OxxPyyPzz*(mfaaa - mxxPyyPzz);
+        //mxxMyy += -(-omega) * (-mxxMyy);
+        //mxxMzz += -(-omega) * (-mxxMzz);
+        //////////////////////////////////////////////////////////////////////////
+        mfabb += omega * (-mfabb);
+        mfbab += omega * (-mfbab);
+        mfbba += omega * (-mfbba);  
+        ////////////////////////////////////////////////////////////////////////////////////
+        //relax
+        //////////////////////////////////////////////////////////////////////////
+        // incl. limiter
+        //! - Relaxation of third order cumulants including limiter according to Eq. (116)-(123)
+        //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
+        //!
+        wadjust   = Oxyz + (c1o1 - Oxyz)*abs(mfbbb) / (abs(mfbbb) + qudricLimitD);
+        mfbbb    += wadjust * (-mfbbb);
+        wadjust   = OxyyPxzz + (c1o1 - OxyyPxzz)*abs(mxxyPyzz) / (abs(mxxyPyzz) + qudricLimitP);
+        mxxyPyzz += wadjust * (-mxxyPyzz);
+        wadjust   = OxyyMxzz + (c1o1 - OxyyMxzz)*abs(mxxyMyzz) / (abs(mxxyMyzz) + qudricLimitM);
+        mxxyMyzz += wadjust * (-mxxyMyzz);
+        wadjust   = OxyyPxzz + (c1o1 - OxyyPxzz)*abs(mxxzPyyz) / (abs(mxxzPyyz) + qudricLimitP);
+        mxxzPyyz += wadjust * (-mxxzPyyz);
+        wadjust   = OxyyMxzz + (c1o1 - OxyyMxzz)*abs(mxxzMyyz) / (abs(mxxzMyyz) + qudricLimitM);
+        mxxzMyyz += wadjust * (-mxxzMyyz);
+        wadjust   = OxyyPxzz + (c1o1 - OxyyPxzz)*abs(mxyyPxzz) / (abs(mxyyPxzz) + qudricLimitP);
+        mxyyPxzz += wadjust * (-mxyyPxzz);
+        wadjust   = OxyyMxzz + (c1o1 - OxyyMxzz)*abs(mxyyMxzz) / (abs(mxyyMxzz) + qudricLimitM);
+        mxyyMxzz += wadjust * (-mxyyMxzz);
+        //////////////////////////////////////////////////////////////////////////
+        // no limiter
+        //mfbbb += OxyyMxzz * (-mfbbb);
+        //mxxyPyzz += OxyyPxzz * (-mxxyPyzz);
+        //mxxyMyzz += OxyyMxzz * (-mxxyMyzz);
+        //mxxzPyyz += OxyyPxzz * (-mxxzPyyz);
+        //mxxzMyyz += OxyyMxzz * (-mxxzMyyz);
+        //mxyyPxzz += OxyyPxzz * (-mxyyPxzz);
+        //mxyyMxzz += OxyyMxzz * (-mxyyMxzz);   
+        ////////////////////////////////////////////////////////////////////////////////////
+        //! - Compute inverse linear combinations of second and third order cumulants
+        //!
+        mfcaa = c1o3 * (mxxMyy + mxxMzz + mxxPyyPzz);
+        mfaca = c1o3 * (-c2o1*  mxxMyy + mxxMzz + mxxPyyPzz);
+        mfaac = c1o3 * (mxxMyy - c2o1* mxxMzz + mxxPyyPzz); 
+        mfcba = ( mxxyMyzz + mxxyPyzz) * c1o2;
+        mfabc = (-mxxyMyzz + mxxyPyzz) * c1o2;
+        mfcab = ( mxxzMyyz + mxxzPyyz) * c1o2;
+        mfacb = (-mxxzMyyz + mxxzPyyz) * c1o2;
+        mfbca = ( mxyyMxzz + mxyyPxzz) * c1o2;
+        mfbac = (-mxyyMxzz + mxyyPxzz) * c1o2;
+        //////////////////////////////////////////////////////////////////////////  
+        //////////////////////////////////////////////////////////////////////////
+        //4.
+        // no limiter
+        //! - Relax fourth order cumulants to modified equilibrium for fourth order convergence of diffusion according  to Eq. (43)-(48)
+        //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
+        //!
+        CUMacc = -O4*(c1o1 / omega - c1o2) * (dyuy + dzuz) * c2o3 * A + (c1o1 - O4) * (CUMacc);
+        CUMcac = -O4*(c1o1 / omega - c1o2) * (dxux + dzuz) * c2o3 * A + (c1o1 - O4) * (CUMcac);
+        CUMcca = -O4*(c1o1 / omega - c1o2) * (dyuy + dxux) * c2o3 * A + (c1o1 - O4) * (CUMcca);
+        CUMbbc = -O4*(c1o1 / omega - c1o2) * Dxy           * c1o3 * B + (c1o1 - O4) * (CUMbbc);
+        CUMbcb = -O4*(c1o1 / omega - c1o2) * Dxz           * c1o3 * B + (c1o1 - O4) * (CUMbcb);
+        CUMcbb = -O4*(c1o1 / omega - c1o2) * Dyz           * c1o3 * B + (c1o1 - O4) * (CUMcbb); 
+        //////////////////////////////////////////////////////////////////////////
+        //5.
+        CUMbcc += O5 * (-CUMbcc);
+        CUMcbc += O5 * (-CUMcbc);
+        CUMccb += O5 * (-CUMccb);   
+        //////////////////////////////////////////////////////////////////////////
+        //6.
+        CUMccc += O6 * (-CUMccc);   
+        ////////////////////////////////////////////////////////////////////////////////////
+        //! - Compute central moments from post collision cumulants according to Eq. (53)-(56) in
+        //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
+        //! 
+        //////////////////////////////////////////////////////////////////////////
+        //4.
+        mfcbb = CUMcbb + c1o3*((c3o1*mfcaa + c1o1) * mfabb + c6o1 * mfbba * mfbab) * OOrho;
+        mfbcb = CUMbcb + c1o3*((c3o1*mfaca + c1o1) * mfbab + c6o1 * mfbba * mfabb) * OOrho;
+        mfbbc = CUMbbc + c1o3*((c3o1*mfaac + c1o1) * mfbba + c6o1 * mfbab * mfabb) * OOrho; 
+        mfcca = CUMcca + (((mfcaa * mfaca + c2o1 * mfbba * mfbba)*c9o1 + c3o1 * (mfcaa + mfaca)) * OOrho - (drho *  OOrho))*c1o9;
+        mfcac = CUMcac + (((mfcaa * mfaac + c2o1 * mfbab * mfbab)*c9o1 + c3o1 * (mfcaa + mfaac)) * OOrho - (drho *  OOrho))*c1o9;
+        mfacc = CUMacc + (((mfaac * mfaca + c2o1 * mfabb * mfabb)*c9o1 + c3o1 * (mfaac + mfaca)) * OOrho - (drho *  OOrho))*c1o9; 
+        //////////////////////////////////////////////////////////////////////////
+        //5.
+        mfbcc = CUMbcc + c1o3 *(c3o1*(mfaac * mfbca + mfaca * mfbac + c4o1 * mfabb * mfbbb + c2o1 * (mfbab * mfacb +    mfbba * mfabc)) + (mfbca + mfbac)) * OOrho;
+        mfcbc = CUMcbc + c1o3 *(c3o1*(mfaac * mfcba + mfcaa * mfabc + c4o1 * mfbab * mfbbb + c2o1 * (mfabb * mfcab +    mfbba * mfbac)) + (mfcba + mfabc)) * OOrho;
+        mfccb = CUMccb + c1o3 *(c3o1*(mfcaa * mfacb + mfaca * mfcab + c4o1 * mfbba * mfbbb + c2o1 * (mfbab * mfbca +    mfabb * mfcba)) + (mfacb + mfcab)) * OOrho; 
+        //////////////////////////////////////////////////////////////////////////
+        //6.
+        mfccc =	CUMccc - ((-c4o1 *  mfbbb * mfbbb
+                - (mfcaa * mfacc + mfaca * mfcac + mfaac * mfcca)
+                - c4o1 * (mfabb * mfcbb + mfbab * mfbcb + mfbba * mfbbc)
+                - c2o1 * (mfbca * mfbac + mfcba * mfabc + mfcab * mfacb)) * OOrho
+                + (c4o1 * (mfbab * mfbab * mfaca + mfabb * mfabb * mfcaa + mfbba * mfbba * mfaac)
+                    + c2o1 * (mfcaa * mfaca * mfaac)
+                    + c16o1 *  mfbba * mfbab * mfabb) * OOrho * OOrho
+                - c1o3 * (mfacc + mfcac + mfcca) * OOrho
+                - c1o9 * (mfcaa + mfaca + mfaac) * OOrho
+                + (c2o1 * (mfbab * mfbab + mfabb * mfabb + mfbba * mfbba)
+                    + (mfaac * mfaca + mfaac * mfcaa + mfaca * mfcaa) + c1o3 *(mfaac + mfaca + mfcaa)) * OOrho * OOrho * c2o3
+                + c1o27*((drho * drho - drho) * OOrho * OOrho));    
+        ////////////////////////////////////////////////////////////////////////////////////
+        //! -  Add acceleration (body force) to first order cumulants according to Eq. (85)-(87) in
+        //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015), DOI:10.1016/j.camwa  2015.05.001 ]</b></a>
+        //!
+        mfbaa = -mfbaa;
+        mfaba = -mfaba;
+        mfaab = -mfaab; 
+        ////////////////////////////////////////////////////////////////////////////////////
+        //! - Chimera transform from central moments to well conditioned distributions as defined in Appendix J in
+        //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015), DOI:10.1016/j.camwa  2015.05.001 ]</b></a>
+        //! see also Eq. (88)-(96) in
+        //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
+        //!
+        ////////////////////////////////////////////////////////////////////////////////////
+        // X - Dir
+        VF::LBM::backwardInverseChimeraWithK(mfaaa, mfbaa, mfcaa, vvx, vx2, c1o1, c1o1);
+        VF::LBM::backwardChimera(            mfaba, mfbba, mfcba, vvx, vx2);
+        VF::LBM::backwardInverseChimeraWithK(mfaca, mfbca, mfcca, vvx, vx2, c3o1, c1o3);
+        VF::LBM::backwardChimera(            mfaab, mfbab, mfcab, vvx, vx2);
+        VF::LBM::backwardChimera(            mfabb, mfbbb, mfcbb, vvx, vx2);
+        VF::LBM::backwardChimera(            mfacb, mfbcb, mfccb, vvx, vx2);
+        VF::LBM::backwardInverseChimeraWithK(mfaac, mfbac, mfcac, vvx, vx2, c3o1, c1o3);
+        VF::LBM::backwardChimera(            mfabc, mfbbc, mfcbc, vvx, vx2);
+        VF::LBM::backwardInverseChimeraWithK(mfacc, mfbcc, mfccc, vvx, vx2, c9o1, c1o9);    
+        ////////////////////////////////////////////////////////////////////////////////////
+        // Y - Dir
+        VF::LBM::backwardInverseChimeraWithK(mfaaa, mfaba, mfaca, vvy, vy2,  c6o1,  c1o6);
+        VF::LBM::backwardChimera(            mfaab, mfabb, mfacb, vvy, vy2);
+        VF::LBM::backwardInverseChimeraWithK(mfaac, mfabc, mfacc, vvy, vy2, c18o1, c1o18);
+        VF::LBM::backwardInverseChimeraWithK(mfbaa, mfbba, mfbca, vvy, vy2,  c3o2,  c2o3);
+        VF::LBM::backwardChimera(            mfbab, mfbbb, mfbcb, vvy, vy2);
+        VF::LBM::backwardInverseChimeraWithK(mfbac, mfbbc, mfbcc, vvy, vy2,  c9o2,  c2o9);
+        VF::LBM::backwardInverseChimeraWithK(mfcaa, mfcba, mfcca, vvy, vy2,  c6o1,  c1o6);
+        VF::LBM::backwardChimera(            mfcab, mfcbb, mfccb, vvy, vy2);
+        VF::LBM::backwardInverseChimeraWithK(mfcac, mfcbc, mfccc, vvy, vy2, c18o1, c1o18);  
+        ////////////////////////////////////////////////////////////////////////////////////
+        // Z - Dir
+        VF::LBM::backwardInverseChimeraWithK(mfaaa, mfaab, mfaac, vvz, vz2, c36o1, c1o36);
+        VF::LBM::backwardInverseChimeraWithK(mfaba, mfabb, mfabc, vvz, vz2,  c9o1,  c1o9);
+        VF::LBM::backwardInverseChimeraWithK(mfaca, mfacb, mfacc, vvz, vz2, c36o1, c1o36);
+        VF::LBM::backwardInverseChimeraWithK(mfbaa, mfbab, mfbac, vvz, vz2,  c9o1,  c1o9);
+        VF::LBM::backwardInverseChimeraWithK(mfbba, mfbbb, mfbbc, vvz, vz2,  c9o4,  c4o9);
+        VF::LBM::backwardInverseChimeraWithK(mfbca, mfbcb, mfbcc, vvz, vz2,  c9o1,  c1o9);
+        VF::LBM::backwardInverseChimeraWithK(mfcaa, mfcab, mfcac, vvz, vz2, c36o1, c1o36);
+        VF::LBM::backwardInverseChimeraWithK(mfcba, mfcbb, mfcbc, vvz, vz2,  c9o1,  c1o9);
+        VF::LBM::backwardInverseChimeraWithK(mfcca, mfccb, mfccc, vvz, vz2, c36o1, c1o36);
+
+        ////////////////////////////////////////////////////////////////////////////////////
+        //! - Write distributions: style of reading and writing the distributions from/to 
+        //! stored arrays dependent on timestep is based on the esoteric twist algorithm
+        //! <a href="https://doi.org/10.3390/computation5020019"><b>[ M. Geier et al. (2017), DOI:10.3390/computation5020019 ]</b></a>
+        //!
+        (dist.f[dirE   ])[k   ] = mfabb;
+        (dist.f[dirW   ])[kw  ] = mfcbb;
+        (dist.f[dirN   ])[k   ] = mfbab;
+        (dist.f[dirS   ])[ks  ] = mfbcb;
+        (dist.f[dirT   ])[k   ] = mfbba;
+        (dist.f[dirB   ])[kb  ] = mfbbc;
+        (dist.f[dirNE  ])[k   ] = mfaab;
+        (dist.f[dirSW  ])[ksw ] = mfccb;
+        (dist.f[dirSE  ])[ks  ] = mfacb;
+        (dist.f[dirNW  ])[kw  ] = mfcab;
+        (dist.f[dirTE  ])[k   ] = mfaba;
+        (dist.f[dirBW  ])[kbw ] = mfcbc;
+        (dist.f[dirBE  ])[kb  ] = mfabc;
+        (dist.f[dirTW  ])[kw  ] = mfcba;
+        (dist.f[dirTN  ])[k   ] = mfbaa;
+        (dist.f[dirBS  ])[kbs ] = mfbcc;
+        (dist.f[dirBN  ])[kb  ] = mfbac;
+        (dist.f[dirTS  ])[ks  ] = mfbca;
+        (dist.f[dirREST])[k   ] = mfbbb;
+        (dist.f[dirTNE ])[k   ] = mfaaa;
+        (dist.f[dirTSE ])[ks  ] = mfaca;
+        (dist.f[dirBNE ])[kb  ] = mfaac;
+        (dist.f[dirBSE ])[kbs ] = mfacc;
+        (dist.f[dirTNW ])[kw  ] = mfcaa;
+        (dist.f[dirTSW ])[ksw ] = mfcca;
+        (dist.f[dirBNW ])[kbw ] = mfcac;
+        (dist.f[dirBSW ])[kbsw] = mfccc;
+    }
+}
diff --git a/src/lbm/CMakeLists.txt b/src/lbm/CMakeLists.txt
index 21ac44f7c..6331a0fe7 100644
--- a/src/lbm/CMakeLists.txt
+++ b/src/lbm/CMakeLists.txt
@@ -1,5 +1,7 @@
-project(lbm LANGUAGES CXX)
+project(lbm LANGUAGES CXX CUDA)
 
 vf_add_library(NAME lbm PUBLIC_LINK basics)
 
 vf_add_tests()
+
+set_target_properties(lbm PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
diff --git a/src/lbm/CumulantChimeraPreCompiled.cu b/src/lbm/CumulantChimeraPreCompiled.cu
new file mode 100644
index 000000000..b0a3e6254
--- /dev/null
+++ b/src/lbm/CumulantChimeraPreCompiled.cu
@@ -0,0 +1,433 @@
+#include "CumulantChimeraPreCompiled.h"
+
+#include <cmath>
+
+#include <basics/Core/DataTypes.h>
+#include <basics/Core/RealConstants.h>
+
+#include "D3Q27.h"
+#include "Chimera.h"
+#include "MacroscopicQuantities.h"
+
+namespace VF
+{
+namespace LBM
+{
+
+
+//////////////////////////////////////////////////////////////////////////
+//! Cumulant K17 Kernel is based on \ref
+//! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
+//! and \ref
+//! <a href="https://doi.org/10.1016/j.jcp.2017.07.004"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.07.004 ]</b></a>
+//////////////////////////////////////////////////////////////////////////
+__host__ __device__ void cumulantChimera(Distribution27& distribution, real omega, real* forces)
+{
+    ////////////////////////////////////////////////////////////////////////////////////
+    //! - Read distributions: style of reading and writing the distributions from/to 
+    //! stored arrays dependent on timestep is based on the esoteric twist algorithm
+    //! <a href="https://doi.org/10.3390/computation5020019"><b>[ M. Geier et al. (2017), DOI:10.3390/computation5020019 ]</b></a>
+    //!
+    real mfcbb = distribution.f[DIR::PZZ];
+    real mfabb = distribution.f[DIR::MZZ];
+    real mfbcb = distribution.f[DIR::ZPZ];
+    real mfbab = distribution.f[DIR::ZMZ];
+    real mfbbc = distribution.f[DIR::ZZP];
+    real mfbba = distribution.f[DIR::ZZM];
+    real mfccb = distribution.f[DIR::PPZ];
+    real mfaab = distribution.f[DIR::MMZ];
+    real mfcab = distribution.f[DIR::PMZ];
+    real mfacb = distribution.f[DIR::MPZ];
+    real mfcbc = distribution.f[DIR::PZP];
+    real mfaba = distribution.f[DIR::MZM];
+    real mfcba = distribution.f[DIR::PZM];
+    real mfabc = distribution.f[DIR::MZP];
+    real mfbcc = distribution.f[DIR::ZPP];
+    real mfbaa = distribution.f[DIR::ZMM];
+    real mfbca = distribution.f[DIR::ZPM];
+    real mfbac = distribution.f[DIR::ZMP];
+    real mfccc = distribution.f[DIR::PPP];
+    real mfacc = distribution.f[DIR::MPP];
+    real mfcac = distribution.f[DIR::PMP];
+    real mfaac = distribution.f[DIR::MMP];
+    real mfcca = distribution.f[DIR::PPM];
+    real mfaca = distribution.f[DIR::MPM];
+    real mfcaa = distribution.f[DIR::PMM];
+    real mfaaa = distribution.f[DIR::MMM];
+    real mfbbb = distribution.f[DIR::ZZZ];
+
+    ////////////////////////////////////////////////////////////////////////////////////
+    //! - Calculate density and velocity using pyramid summation for low round-off errors as in Eq. (J1)-(J3) \ref
+    //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015), DOI:10.1016/j.camwa  2015.05.001 ]</b></a>
+    //!
+    real drho =
+        ((((mfccc + mfaaa) + (mfaca + mfcac)) + ((mfacc + mfcaa) + (mfaac + mfcca))) +
+        (((mfbac + mfbca) + (mfbaa + mfbcc)) + ((mfabc + mfcba) + (mfaba + mfcbc)) + ((mfacb + mfcab) + (mfaab + mfccb))) +
+        ((mfabb + mfcbb) + (mfbab + mfbcb) + (mfbba + mfbbc))) + mfbbb; 
+    real rho = c1o1 + drho;
+    real OOrho = c1o1 / rho;    
+    real vvx = 
+        ((((mfccc - mfaaa) + (mfcac - mfaca)) + ((mfcaa - mfacc) + (mfcca - mfaac))) +
+        (((mfcba - mfabc) + (mfcbc - mfaba)) + ((mfcab - mfacb) + (mfccb - mfaab))) +
+        (mfcbb - mfabb)) * OOrho;
+    real vvy = 
+        ((((mfccc - mfaaa) + (mfaca - mfcac)) + ((mfacc - mfcaa) + (mfcca - mfaac))) +
+        (((mfbca - mfbac) + (mfbcc - mfbaa)) + ((mfacb - mfcab) + (mfccb - mfaab))) +
+        (mfbcb - mfbab)) * OOrho;
+    real vvz = 
+        ((((mfccc - mfaaa) + (mfcac - mfaca)) + ((mfacc - mfcaa) + (mfaac - mfcca))) +
+        (((mfbac - mfbca) + (mfbcc - mfbaa)) + ((mfabc - mfcba) + (mfcbc - mfaba))) +
+        (mfbbc - mfbba)) * OOrho;
+    ////////////////////////////////////////////////////////////////////////////////////
+    //! - Add half of the acceleration (body force) to the velocity as in Eq. (42) \ref
+    //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015), DOI:10.1016/j.camwa  2015.05.001 ]</b></a>
+    //!
+    vvx += forces[0] * c1o2;
+    vvy += forces[1] * c1o2;
+    vvz += forces[2] * c1o2;
+    ////////////////////////////////////////////////////////////////////////////////////
+    // calculate the square of velocities for this lattice node
+    real vx2 = vvx*vvx;
+    real vy2 = vvy*vvy;
+    real vz2 = vvz*vvz;
+    ////////////////////////////////////////////////////////////////////////////////////
+    //! - Set relaxation limiters for third order cumulants to default value \f$ \lambda=0.001 \f$ according to section 6 in \ref
+    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
+    //!
+    real wadjust;
+    real qudricLimitP = c1o100;
+    real qudricLimitM = c1o100;
+    real qudricLimitD = c1o100;
+    ////////////////////////////////////////////////////////////////////////////////////
+    //! - Chimera transform from well conditioned distributions to central moments as defined in Appendix J in \ref
+    //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015), DOI:10.1016/j.camwa  2015.05.001 ]</b></a>
+    //! see also Eq. (6)-(14) in \ref
+    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
+    //!
+    ////////////////////////////////////////////////////////////////////////////////////
+    // Z - Dir
+    VF::LBM::forwardInverseChimeraWithK(mfaaa, mfaab, mfaac, vvz, vz2, c36o1, c1o36);
+    VF::LBM::forwardInverseChimeraWithK(mfaba, mfabb, mfabc, vvz, vz2,  c9o1,  c1o9);
+    VF::LBM::forwardInverseChimeraWithK(mfaca, mfacb, mfacc, vvz, vz2, c36o1, c1o36);
+    VF::LBM::forwardInverseChimeraWithK(mfbaa, mfbab, mfbac, vvz, vz2,  c9o1,  c1o9);
+    VF::LBM::forwardInverseChimeraWithK(mfbba, mfbbb, mfbbc, vvz, vz2,  c9o4,  c4o9);
+    VF::LBM::forwardInverseChimeraWithK(mfbca, mfbcb, mfbcc, vvz, vz2,  c9o1,  c1o9);
+    VF::LBM::forwardInverseChimeraWithK(mfcaa, mfcab, mfcac, vvz, vz2, c36o1, c1o36);
+    VF::LBM::forwardInverseChimeraWithK(mfcba, mfcbb, mfcbc, vvz, vz2,  c9o1,  c1o9);
+    VF::LBM::forwardInverseChimeraWithK(mfcca, mfccb, mfccc, vvz, vz2, c36o1, c1o36);   
+    ////////////////////////////////////////////////////////////////////////////////////
+    // Y - Dir
+    VF::LBM::forwardInverseChimeraWithK(mfaaa, mfaba, mfaca, vvy, vy2,  c6o1,  c1o6);
+    VF::LBM::forwardChimera(            mfaab, mfabb, mfacb, vvy, vy2);
+    VF::LBM::forwardInverseChimeraWithK(mfaac, mfabc, mfacc, vvy, vy2, c18o1, c1o18);
+    VF::LBM::forwardInverseChimeraWithK(mfbaa, mfbba, mfbca, vvy, vy2,  c3o2,  c2o3);
+    VF::LBM::forwardChimera(            mfbab, mfbbb, mfbcb, vvy, vy2);
+    VF::LBM::forwardInverseChimeraWithK(mfbac, mfbbc, mfbcc, vvy, vy2,  c9o2,  c2o9);
+    VF::LBM::forwardInverseChimeraWithK(mfcaa, mfcba, mfcca, vvy, vy2,  c6o1,  c1o6);
+    VF::LBM::forwardChimera(            mfcab, mfcbb, mfccb, vvy, vy2);
+    VF::LBM::forwardInverseChimeraWithK(mfcac, mfcbc, mfccc, vvy, vy2, c18o1, c1o18);   
+    ////////////////////////////////////////////////////////////////////////////////////
+    // X - Dir
+    VF::LBM::forwardInverseChimeraWithK(mfaaa, mfbaa, mfcaa, vvx, vx2, c1o1, c1o1);
+    VF::LBM::forwardChimera(            mfaba, mfbba, mfcba, vvx, vx2);
+    VF::LBM::forwardInverseChimeraWithK(mfaca, mfbca, mfcca, vvx, vx2, c3o1, c1o3);
+    VF::LBM::forwardChimera(            mfaab, mfbab, mfcab, vvx, vx2);
+    VF::LBM::forwardChimera(            mfabb, mfbbb, mfcbb, vvx, vx2);
+    VF::LBM::forwardChimera(            mfacb, mfbcb, mfccb, vvx, vx2);
+    VF::LBM::forwardInverseChimeraWithK(mfaac, mfbac, mfcac, vvx, vx2, c3o1, c1o3);
+    VF::LBM::forwardChimera(            mfabc, mfbbc, mfcbc, vvx, vx2);
+    VF::LBM::forwardInverseChimeraWithK(mfacc, mfbcc, mfccc, vvx, vx2, c3o1, c1o9); 
+    ////////////////////////////////////////////////////////////////////////////////////
+    //! - Setting relaxation rates for non-hydrodynamic cumulants (default values). Variable names and equations    according to
+    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
+    //!  => [NAME IN PAPER]=[NAME IN CODE]=[DEFAULT VALUE].
+    //!  - Trace of second order cumulants \f$ C_{200}+C_{020}+C_{002} \f$ used to adjust bulk  viscosity:\f$\omega_2=OxxPyyPzz=1.0 \f$.
+    //!  - Third order cumulants \f$ C_{120}+C_{102}, C_{210}+C_{012}, C_{201}+C_{021} \f$: \f$ \omega_3=OxyyPxzz   \f$ set according to Eq. (111) with simplifications assuming \f$ \omega_2=1.0\f$.
+    //!  - Third order cumulants \f$ C_{120}-C_{102}, C_{210}-C_{012}, C_{201}-C_{021} \f$: \f$ \omega_4 =  OxyyMxzz \f$ set according to Eq. (112) with simplifications assuming \f$ \omega_2 = 1.0\f$.
+    //!  - Third order cumulants \f$ C_{111} \f$: \f$ \omega_5 = Oxyz \f$ set according to Eq. (113) with   simplifications assuming \f$ \omega_2 = 1.0\f$  (modify for different bulk viscosity).
+    //!  - Fourth order cumulants \f$ C_{220}, C_{202}, C_{022}, C_{211}, C_{121}, C_{112} \f$: for simplification  all set to the same default value \f$ \omega_6=\omega_7=\omega_8=O4=1.0 \f$.
+    //!  - Fifth order cumulants \f$ C_{221}, C_{212}, C_{122}\f$: \f$\omega_9=O5=1.0\f$.
+    //!  - Sixth order cumulant \f$ C_{222}\f$: \f$\omega_{10}=O6=1.0\f$.
+    //!
+    ////////////////////////////////////////////////////////////
+    //2.
+    real OxxPyyPzz = c1o1;
+    ////////////////////////////////////////////////////////////
+    //3.
+    real OxyyPxzz = c8o1  * (-c2o1 + omega) * ( c1o1 + c2o1*omega) / (-c8o1 - c14o1*omega + c7o1*omega*omega);
+    real OxyyMxzz = c8o1  * (-c2o1 + omega) * (-c7o1 + c4o1*omega) / (c56o1 - c50o1*omega + c9o1*omega*omega);
+    real Oxyz     = c24o1 * (-c2o1 + omega) * (-c2o1 - c7o1*omega + c3o1*omega*omega) / (c48o1 + c152o1*omega - c130o1*omega*omega + c29o1*omega*omega*omega);
+    ////////////////////////////////////////////////////////////
+    //4.
+    real O4 = c1o1;
+    ////////////////////////////////////////////////////////////
+    //5.
+    real O5 = c1o1;
+    ////////////////////////////////////////////////////////////
+    //6.
+    real O6 = c1o1; 
+    ////////////////////////////////////////////////////////////////////////////////////
+    //! - A and B: parameters for fourth order convergence of the diffusion term according to Eq. (114) and (115) 
+    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
+    //! with simplifications assuming \f$ \omega_2 = 1.0 \f$ (modify for different bulk viscosity).
+    //!
+    real A = (c4o1 + c2o1*omega - c3o1*omega*omega) / (c2o1 - c7o1*omega + c5o1*omega*omega);
+    real B = (c4o1 + c28o1*omega - c14o1*omega*omega) / (c6o1 - c21o1*omega + c15o1*omega*omega);   
+    ////////////////////////////////////////////////////////////////////////////////////
+    //! - Compute cumulants from central moments according to Eq. (20)-(23) in
+    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
+    //!
+    ////////////////////////////////////////////////////////////
+    //4.
+    real CUMcbb = mfcbb - ((mfcaa + c1o3) * mfabb + c2o1 * mfbba * mfbab) * OOrho;
+    real CUMbcb = mfbcb - ((mfaca + c1o3) * mfbab + c2o1 * mfbba * mfabb) * OOrho;
+    real CUMbbc = mfbbc - ((mfaac + c1o3) * mfbba + c2o1 * mfbab * mfabb) * OOrho;  
+    real CUMcca = mfcca - (((mfcaa * mfaca + c2o1 * mfbba * mfbba) + c1o3 * (mfcaa + mfaca)) * OOrho - c1o9*(drho   * OOrho));
+    real CUMcac = mfcac - (((mfcaa * mfaac + c2o1 * mfbab * mfbab) + c1o3 * (mfcaa + mfaac)) * OOrho - c1o9*(drho   * OOrho));
+    real CUMacc = mfacc - (((mfaac * mfaca + c2o1 * mfabb * mfabb) + c1o3 * (mfaac + mfaca)) * OOrho - c1o9*(drho   * OOrho));
+    ////////////////////////////////////////////////////////////
+    //5.
+    real CUMbcc = mfbcc - ((mfaac * mfbca + mfaca * mfbac + c4o1 * mfabb * mfbbb + c2o1 * (mfbab * mfacb + mfbba *  mfabc)) + c1o3 * (mfbca + mfbac)) * OOrho;
+    real CUMcbc = mfcbc - ((mfaac * mfcba + mfcaa * mfabc + c4o1 * mfbab * mfbbb + c2o1 * (mfabb * mfcab + mfbba *  mfbac)) + c1o3 * (mfcba + mfabc)) * OOrho;
+    real CUMccb = mfccb - ((mfcaa * mfacb + mfaca * mfcab + c4o1 * mfbba * mfbbb + c2o1 * (mfbab * mfbca + mfabb *  mfcba)) + c1o3 * (mfacb + mfcab)) * OOrho;
+    ////////////////////////////////////////////////////////////
+    //6.
+    real CUMccc = mfccc + ((-c4o1 *  mfbbb * mfbbb
+        - (mfcaa * mfacc + mfaca * mfcac + mfaac * mfcca)
+        - c4o1 * (mfabb * mfcbb + mfbab * mfbcb + mfbba * mfbbc)
+        - c2o1 * (mfbca * mfbac + mfcba * mfabc + mfcab * mfacb)) * OOrho
+        + (c4o1 * (mfbab * mfbab * mfaca + mfabb * mfabb * mfcaa + mfbba * mfbba * mfaac)
+        + c2o1 * (mfcaa * mfaca * mfaac)
+        + c16o1 *  mfbba * mfbab * mfabb) * OOrho * OOrho
+        - c1o3 * (mfacc + mfcac + mfcca) * OOrho
+        - c1o9 * (mfcaa + mfaca + mfaac) * OOrho
+        + (c2o1 * (mfbab * mfbab + mfabb * mfabb + mfbba * mfbba)
+        + (mfaac * mfaca + mfaac * mfcaa + mfaca * mfcaa) + c1o3 *(mfaac + mfaca + mfcaa)) * OOrho * OOrho  * c2o3
+        + c1o27*((drho * drho - drho) * OOrho * OOrho));    
+    ////////////////////////////////////////////////////////////////////////////////////
+    //! - Compute linear combinations of second and third order cumulants
+    //!
+    ////////////////////////////////////////////////////////////
+    //2.
+    real mxxPyyPzz = mfcaa + mfaca + mfaac;
+    real mxxMyy = mfcaa - mfaca;
+    real mxxMzz = mfcaa - mfaac;
+    ////////////////////////////////////////////////////////////
+    //3.
+    real mxxyPyzz = mfcba + mfabc;
+    real mxxyMyzz = mfcba - mfabc;  
+    real mxxzPyyz = mfcab + mfacb;
+    real mxxzMyyz = mfcab - mfacb;  
+    real mxyyPxzz = mfbca + mfbac;
+    real mxyyMxzz = mfbca - mfbac;  
+    ////////////////////////////////////////////////////////////////////////////////////
+    //incl. correction
+    ////////////////////////////////////////////////////////////
+    //! - Compute velocity  gradients from second order cumulants according to Eq. (27)-(32)
+    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
+    //! Further explanations of the correction in viscosity in Appendix H of
+    //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015), DOI:10.1016/j.camwa  2015.05.001 ]</b></a>
+    //! Note that the division by rho is omitted here as we need rho times the gradients later.
+    //!
+    real Dxy = -c3o1*omega*mfbba;
+    real Dxz = -c3o1*omega*mfbab;
+    real Dyz = -c3o1*omega*mfabb;
+    real dxux = c1o2 * (-omega) *(mxxMyy + mxxMzz) + c1o2 *  OxxPyyPzz * (mfaaa - mxxPyyPzz);
+    real dyuy = dxux + omega * c3o2 * mxxMyy;
+    real dzuz = dxux + omega * c3o2 * mxxMzz;
+    ////////////////////////////////////////////////////////////
+    //! - Relaxation of second order cumulants with correction terms according to Eq. (33)-(35) in
+    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
+    //!
+    mxxPyyPzz += OxxPyyPzz*(mfaaa - mxxPyyPzz) - c3o1 * (c1o1 - c1o2 * OxxPyyPzz) * (vx2 * dxux + vy2 * dyuy + vz2  * dzuz);
+    mxxMyy    += omega * (-mxxMyy) - c3o1 * (c1o1 + c1o2 * (-omega)) * (vx2 * dxux - vy2 * dyuy);
+    mxxMzz    += omega * (-mxxMzz) - c3o1 * (c1o1 + c1o2 * (-omega)) * (vx2 * dxux - vz2 * dzuz);   
+    ////////////////////////////////////////////////////////////////////////////////////
+    ////no correction
+    //mxxPyyPzz += OxxPyyPzz*(mfaaa - mxxPyyPzz);
+    //mxxMyy += -(-omega) * (-mxxMyy);
+    //mxxMzz += -(-omega) * (-mxxMzz);
+    //////////////////////////////////////////////////////////////////////////
+    mfabb += omega * (-mfabb);
+    mfbab += omega * (-mfbab);
+    mfbba += omega * (-mfbba);  
+    ////////////////////////////////////////////////////////////////////////////////////
+    //relax
+    //////////////////////////////////////////////////////////////////////////
+    // incl. limiter
+    //! - Relaxation of third order cumulants including limiter according to Eq. (116)-(123)
+    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
+    //!
+    wadjust   = Oxyz + (c1o1 - Oxyz)*abs_internal(mfbbb) / (abs_internal(mfbbb) + qudricLimitD);
+    mfbbb    += wadjust * (-mfbbb);
+    wadjust   = OxyyPxzz + (c1o1 - OxyyPxzz)*abs_internal(mxxyPyzz) / (abs_internal(mxxyPyzz) + qudricLimitP);
+    mxxyPyzz += wadjust * (-mxxyPyzz);
+    wadjust   = OxyyMxzz + (c1o1 - OxyyMxzz)*abs_internal(mxxyMyzz) / (abs_internal(mxxyMyzz) + qudricLimitM);
+    mxxyMyzz += wadjust * (-mxxyMyzz);
+    wadjust   = OxyyPxzz + (c1o1 - OxyyPxzz)*abs_internal(mxxzPyyz) / (abs_internal(mxxzPyyz) + qudricLimitP);
+    mxxzPyyz += wadjust * (-mxxzPyyz);
+    wadjust   = OxyyMxzz + (c1o1 - OxyyMxzz)*abs_internal(mxxzMyyz) / (abs_internal(mxxzMyyz) + qudricLimitM);
+    mxxzMyyz += wadjust * (-mxxzMyyz);
+    wadjust   = OxyyPxzz + (c1o1 - OxyyPxzz)*abs_internal(mxyyPxzz) / (abs_internal(mxyyPxzz) + qudricLimitP);
+    mxyyPxzz += wadjust * (-mxyyPxzz);
+    wadjust   = OxyyMxzz + (c1o1 - OxyyMxzz)*abs_internal(mxyyMxzz) / (abs_internal(mxyyMxzz) + qudricLimitM);
+    mxyyMxzz += wadjust * (-mxyyMxzz);
+    //////////////////////////////////////////////////////////////////////////
+    // no limiter
+    //mfbbb += OxyyMxzz * (-mfbbb);
+    //mxxyPyzz += OxyyPxzz * (-mxxyPyzz);
+    //mxxyMyzz += OxyyMxzz * (-mxxyMyzz);
+    //mxxzPyyz += OxyyPxzz * (-mxxzPyyz);
+    //mxxzMyyz += OxyyMxzz * (-mxxzMyyz);
+    //mxyyPxzz += OxyyPxzz * (-mxyyPxzz);
+    //mxyyMxzz += OxyyMxzz * (-mxyyMxzz);   
+    ////////////////////////////////////////////////////////////////////////////////////
+    //! - Compute inverse linear combinations of second and third order cumulants
+    //!
+    mfcaa = c1o3 * (mxxMyy + mxxMzz + mxxPyyPzz);
+    mfaca = c1o3 * (-c2o1*  mxxMyy + mxxMzz + mxxPyyPzz);
+    mfaac = c1o3 * (mxxMyy - c2o1* mxxMzz + mxxPyyPzz); 
+    mfcba = ( mxxyMyzz + mxxyPyzz) * c1o2;
+    mfabc = (-mxxyMyzz + mxxyPyzz) * c1o2;
+    mfcab = ( mxxzMyyz + mxxzPyyz) * c1o2;
+    mfacb = (-mxxzMyyz + mxxzPyyz) * c1o2;
+    mfbca = ( mxyyMxzz + mxyyPxzz) * c1o2;
+    mfbac = (-mxyyMxzz + mxyyPxzz) * c1o2;
+    //////////////////////////////////////////////////////////////////////////  
+    //////////////////////////////////////////////////////////////////////////
+    //4.
+    // no limiter
+    //! - Relax fourth order cumulants to modified equilibrium for fourth order convergence of diffusion according  to Eq. (43)-(48)
+    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
+    //!
+    CUMacc = -O4*(c1o1 / omega - c1o2) * (dyuy + dzuz) * c2o3 * A + (c1o1 - O4) * (CUMacc);
+    CUMcac = -O4*(c1o1 / omega - c1o2) * (dxux + dzuz) * c2o3 * A + (c1o1 - O4) * (CUMcac);
+    CUMcca = -O4*(c1o1 / omega - c1o2) * (dyuy + dxux) * c2o3 * A + (c1o1 - O4) * (CUMcca);
+    CUMbbc = -O4*(c1o1 / omega - c1o2) * Dxy           * c1o3 * B + (c1o1 - O4) * (CUMbbc);
+    CUMbcb = -O4*(c1o1 / omega - c1o2) * Dxz           * c1o3 * B + (c1o1 - O4) * (CUMbcb);
+    CUMcbb = -O4*(c1o1 / omega - c1o2) * Dyz           * c1o3 * B + (c1o1 - O4) * (CUMcbb); 
+    //////////////////////////////////////////////////////////////////////////
+    //5.
+    CUMbcc += O5 * (-CUMbcc);
+    CUMcbc += O5 * (-CUMcbc);
+    CUMccb += O5 * (-CUMccb);   
+    //////////////////////////////////////////////////////////////////////////
+    //6.
+    CUMccc += O6 * (-CUMccc);   
+    ////////////////////////////////////////////////////////////////////////////////////
+    //! - Compute central moments from post collision cumulants according to Eq. (53)-(56) in
+    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
+    //! 
+    //////////////////////////////////////////////////////////////////////////
+    //4.
+    mfcbb = CUMcbb + c1o3*((c3o1*mfcaa + c1o1) * mfabb + c6o1 * mfbba * mfbab) * OOrho;
+    mfbcb = CUMbcb + c1o3*((c3o1*mfaca + c1o1) * mfbab + c6o1 * mfbba * mfabb) * OOrho;
+    mfbbc = CUMbbc + c1o3*((c3o1*mfaac + c1o1) * mfbba + c6o1 * mfbab * mfabb) * OOrho; 
+    mfcca = CUMcca + (((mfcaa * mfaca + c2o1 * mfbba * mfbba)*c9o1 + c3o1 * (mfcaa + mfaca)) * OOrho - (drho *  OOrho))*c1o9;
+    mfcac = CUMcac + (((mfcaa * mfaac + c2o1 * mfbab * mfbab)*c9o1 + c3o1 * (mfcaa + mfaac)) * OOrho - (drho *  OOrho))*c1o9;
+    mfacc = CUMacc + (((mfaac * mfaca + c2o1 * mfabb * mfabb)*c9o1 + c3o1 * (mfaac + mfaca)) * OOrho - (drho *  OOrho))*c1o9; 
+    //////////////////////////////////////////////////////////////////////////
+    //5.
+    mfbcc = CUMbcc + c1o3 *(c3o1*(mfaac * mfbca + mfaca * mfbac + c4o1 * mfabb * mfbbb + c2o1 * (mfbab * mfacb +    mfbba * mfabc)) + (mfbca + mfbac)) * OOrho;
+    mfcbc = CUMcbc + c1o3 *(c3o1*(mfaac * mfcba + mfcaa * mfabc + c4o1 * mfbab * mfbbb + c2o1 * (mfabb * mfcab +    mfbba * mfbac)) + (mfcba + mfabc)) * OOrho;
+    mfccb = CUMccb + c1o3 *(c3o1*(mfcaa * mfacb + mfaca * mfcab + c4o1 * mfbba * mfbbb + c2o1 * (mfbab * mfbca +    mfabb * mfcba)) + (mfacb + mfcab)) * OOrho; 
+    //////////////////////////////////////////////////////////////////////////
+    //6.
+    mfccc =	CUMccc - ((-c4o1 *  mfbbb * mfbbb
+            - (mfcaa * mfacc + mfaca * mfcac + mfaac * mfcca)
+            - c4o1 * (mfabb * mfcbb + mfbab * mfbcb + mfbba * mfbbc)
+            - c2o1 * (mfbca * mfbac + mfcba * mfabc + mfcab * mfacb)) * OOrho
+            + (c4o1 * (mfbab * mfbab * mfaca + mfabb * mfabb * mfcaa + mfbba * mfbba * mfaac)
+                + c2o1 * (mfcaa * mfaca * mfaac)
+                + c16o1 *  mfbba * mfbab * mfabb) * OOrho * OOrho
+            - c1o3 * (mfacc + mfcac + mfcca) * OOrho
+            - c1o9 * (mfcaa + mfaca + mfaac) * OOrho
+            + (c2o1 * (mfbab * mfbab + mfabb * mfabb + mfbba * mfbba)
+                + (mfaac * mfaca + mfaac * mfcaa + mfaca * mfcaa) + c1o3 *(mfaac + mfaca + mfcaa)) * OOrho * OOrho * c2o3
+            + c1o27*((drho * drho - drho) * OOrho * OOrho));    
+    ////////////////////////////////////////////////////////////////////////////////////
+    //! -  Add acceleration (body force) to first order cumulants according to Eq. (85)-(87) in
+    //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015), DOI:10.1016/j.camwa  2015.05.001 ]</b></a>
+    //!
+    mfbaa = -mfbaa;
+    mfaba = -mfaba;
+    mfaab = -mfaab; 
+    ////////////////////////////////////////////////////////////////////////////////////
+    //! - Chimera transform from central moments to well conditioned distributions as defined in Appendix J in
+    //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015), DOI:10.1016/j.camwa  2015.05.001 ]</b></a>
+    //! see also Eq. (88)-(96) in
+    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05  040 ]</b></a>
+    //!
+    ////////////////////////////////////////////////////////////////////////////////////
+    // X - Dir
+    VF::LBM::backwardInverseChimeraWithK(mfaaa, mfbaa, mfcaa, vvx, vx2, c1o1, c1o1);
+    VF::LBM::backwardChimera(            mfaba, mfbba, mfcba, vvx, vx2);
+    VF::LBM::backwardInverseChimeraWithK(mfaca, mfbca, mfcca, vvx, vx2, c3o1, c1o3);
+    VF::LBM::backwardChimera(            mfaab, mfbab, mfcab, vvx, vx2);
+    VF::LBM::backwardChimera(            mfabb, mfbbb, mfcbb, vvx, vx2);
+    VF::LBM::backwardChimera(            mfacb, mfbcb, mfccb, vvx, vx2);
+    VF::LBM::backwardInverseChimeraWithK(mfaac, mfbac, mfcac, vvx, vx2, c3o1, c1o3);
+    VF::LBM::backwardChimera(            mfabc, mfbbc, mfcbc, vvx, vx2);
+    VF::LBM::backwardInverseChimeraWithK(mfacc, mfbcc, mfccc, vvx, vx2, c9o1, c1o9);    
+    ////////////////////////////////////////////////////////////////////////////////////
+    // Y - Dir
+    VF::LBM::backwardInverseChimeraWithK(mfaaa, mfaba, mfaca, vvy, vy2,  c6o1,  c1o6);
+    VF::LBM::backwardChimera(            mfaab, mfabb, mfacb, vvy, vy2);
+    VF::LBM::backwardInverseChimeraWithK(mfaac, mfabc, mfacc, vvy, vy2, c18o1, c1o18);
+    VF::LBM::backwardInverseChimeraWithK(mfbaa, mfbba, mfbca, vvy, vy2,  c3o2,  c2o3);
+    VF::LBM::backwardChimera(            mfbab, mfbbb, mfbcb, vvy, vy2);
+    VF::LBM::backwardInverseChimeraWithK(mfbac, mfbbc, mfbcc, vvy, vy2,  c9o2,  c2o9);
+    VF::LBM::backwardInverseChimeraWithK(mfcaa, mfcba, mfcca, vvy, vy2,  c6o1,  c1o6);
+    VF::LBM::backwardChimera(            mfcab, mfcbb, mfccb, vvy, vy2);
+    VF::LBM::backwardInverseChimeraWithK(mfcac, mfcbc, mfccc, vvy, vy2, c18o1, c1o18);  
+    ////////////////////////////////////////////////////////////////////////////////////
+    // Z - Dir
+    VF::LBM::backwardInverseChimeraWithK(mfaaa, mfaab, mfaac, vvz, vz2, c36o1, c1o36);
+    VF::LBM::backwardInverseChimeraWithK(mfaba, mfabb, mfabc, vvz, vz2,  c9o1,  c1o9);
+    VF::LBM::backwardInverseChimeraWithK(mfaca, mfacb, mfacc, vvz, vz2, c36o1, c1o36);
+    VF::LBM::backwardInverseChimeraWithK(mfbaa, mfbab, mfbac, vvz, vz2,  c9o1,  c1o9);
+    VF::LBM::backwardInverseChimeraWithK(mfbba, mfbbb, mfbbc, vvz, vz2,  c9o4,  c4o9);
+    VF::LBM::backwardInverseChimeraWithK(mfbca, mfbcb, mfbcc, vvz, vz2,  c9o1,  c1o9);
+    VF::LBM::backwardInverseChimeraWithK(mfcaa, mfcab, mfcac, vvz, vz2, c36o1, c1o36);
+    VF::LBM::backwardInverseChimeraWithK(mfcba, mfcbb, mfcbc, vvz, vz2,  c9o1,  c1o9);
+    VF::LBM::backwardInverseChimeraWithK(mfcca, mfccb, mfccc, vvz, vz2, c36o1, c1o36);
+
+
+    ////////////////////////////////////////////////////////////////////////////////////
+    //! - Write distributions: style of reading and writing the distributions from/to 
+    //! stored arrays dependent on timestep is based on the esoteric twist algorithm
+    //! <a href="https://doi.org/10.3390/computation5020019"><b>[ M. Geier et al. (2017), DOI:10.3390/computation5020019 ]</b></a>
+    //!
+    distribution.f[VF::LBM::DIR::MZZ] = mfcbb;
+    distribution.f[VF::LBM::DIR::PZZ] = mfabb;
+    distribution.f[VF::LBM::DIR::ZMZ] = mfbcb;
+    distribution.f[VF::LBM::DIR::ZPZ] = mfbab;
+    distribution.f[VF::LBM::DIR::ZZM] = mfbbc;
+    distribution.f[VF::LBM::DIR::ZZP] = mfbba;
+    distribution.f[VF::LBM::DIR::MMZ] = mfccb;
+    distribution.f[VF::LBM::DIR::PPZ] = mfaab;
+    distribution.f[VF::LBM::DIR::MPZ] = mfcab;
+    distribution.f[VF::LBM::DIR::PMZ] = mfacb;
+    distribution.f[VF::LBM::DIR::MZM] = mfcbc;
+    distribution.f[VF::LBM::DIR::PZP] = mfaba;
+    distribution.f[VF::LBM::DIR::MZP] = mfcba;
+    distribution.f[VF::LBM::DIR::PZM] = mfabc;
+    distribution.f[VF::LBM::DIR::ZMM] = mfbcc;
+    distribution.f[VF::LBM::DIR::ZPP] = mfbaa;
+    distribution.f[VF::LBM::DIR::ZMP] = mfbca;
+    distribution.f[VF::LBM::DIR::ZPM] = mfbac;
+    distribution.f[VF::LBM::DIR::MMM] = mfccc;
+    distribution.f[VF::LBM::DIR::PMM] = mfacc;
+    distribution.f[VF::LBM::DIR::MPM] = mfcac;
+    distribution.f[VF::LBM::DIR::PPM] = mfaac;
+    distribution.f[VF::LBM::DIR::MMP] = mfcca;
+    distribution.f[VF::LBM::DIR::PMP] = mfaca;
+    distribution.f[VF::LBM::DIR::MPP] = mfcaa;
+    distribution.f[VF::LBM::DIR::PPP] = mfaaa;
+    distribution.f[VF::LBM::DIR::ZZZ] = mfbbb;
+}
+
+
+}
+}
+
diff --git a/src/lbm/CumulantChimeraPreCompiled.h b/src/lbm/CumulantChimeraPreCompiled.h
new file mode 100644
index 000000000..f76e14ac9
--- /dev/null
+++ b/src/lbm/CumulantChimeraPreCompiled.h
@@ -0,0 +1,59 @@
+#ifndef LBM_CUMULANT_CHIMERA_PRE_H
+#define LBM_CUMULANT_CHIMERA_PRE_H
+
+#ifndef __host__
+#define __host__
+#endif
+#ifndef __device__
+#define __device__
+#endif
+
+#include <cmath>
+
+#include <basics/Core/DataTypes.h>
+#include <basics/Core/RealConstants.h>
+
+#include "D3Q27.h"
+#include "Chimera.h"
+#include "MacroscopicQuantities.h"
+
+namespace VF
+{
+namespace LBM
+{
+
+struct Distribution27
+{
+    real f[27];
+
+    inline __host__ __device__ real getDensity_() const
+    {
+        return getDensity(f);
+    }
+};
+
+
+
+
+inline __host__ __device__ real abs_internal(real value)
+{
+#ifdef __CUDA_ARCH__
+    return ::abs(value);
+#else
+    return std::abs(value);
+#endif
+}
+
+
+
+//////////////////////////////////////////////////////////////////////////
+//! Cumulant K17 Kernel is based on \ref
+//! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
+//! and \ref
+//! <a href="https://doi.org/10.1016/j.jcp.2017.07.004"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.07.004 ]</b></a>
+//////////////////////////////////////////////////////////////////////////
+__host__ __device__ void cumulantChimera(Distribution27& distribution, real omega, real* forces);
+
+}
+}
+#endif
-- 
GitLab