diff --git a/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.cpp b/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.cpp
index c2dfc7512b21cf54ab7da7aaeb74ccc6c4563f00..1989bbbd225fc43554a21eaf61d192a2401b64c5 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.cpp
+++ b/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.cpp
@@ -2105,6 +2105,12 @@ void CudaMemoryManager::cudaAllocFsForCheckPointAndRestart(int lev)
 {
     checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->distributions.f[0] ),           (unsigned long long)parameter->getD3Qxx()*(unsigned long long)parameter->getParH(lev)->memSizeRealLBnodes));
 }
+void CudaMemoryManager::cudaAllocFsForAllLevelsOnHost()
+{
+    for (int level = 0; level <= parameter->getMaxLevel(); level++) {
+        cudaAllocFsForCheckPointAndRestart(level);
+    }
+}
 void CudaMemoryManager::cudaCopyFsForRestart(int lev)
 {
     checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->distributions.f[0],  parameter->getParH(lev)->distributions.f[0],     (unsigned long long)parameter->getD3Qxx()*(unsigned long long)parameter->getParH(lev)->memSizeRealLBnodes , cudaMemcpyHostToDevice));
@@ -2113,6 +2119,11 @@ void CudaMemoryManager::cudaCopyFsForCheckPoint(int lev)
 {
     checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->distributions.f[0],  parameter->getParD(lev)->distributions.f[0],     (unsigned long long)parameter->getD3Qxx()*(unsigned long long)parameter->getParH(lev)->memSizeRealLBnodes , cudaMemcpyDeviceToHost));
 }
+void CudaMemoryManager::cudaCopyFsForAllLevelsToHost()
+{
+    for (int level = 0; level <= parameter->getMaxLevel(); level++)
+        cudaCopyFsForCheckPoint(level);
+}
 void CudaMemoryManager::cudaFreeFsForCheckPointAndRestart(int lev)
 {
     checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->distributions.f[0]));
diff --git a/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h b/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h
index 20c5a1145c6002803d54302e976cc22ddf4e3e10..c8c10eab1094e25dfafa4bc2fabeaa79885facad 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h
+++ b/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h
@@ -275,8 +275,12 @@ public:
     void cudaFreeMeasurePointsIndex(int lev);
 
     void cudaAllocFsForCheckPointAndRestart(int lev);
+    void cudaAllocFsForAllLevelsOnHost();
+    //! \brief copy distributions from host to device
     void cudaCopyFsForRestart(int lev);
+    //! \brief copy distributions from device to host
     void cudaCopyFsForCheckPoint(int lev);
+    void cudaCopyFsForAllLevelsToHost();
     void cudaFreeFsForCheckPointAndRestart(int lev);
 
     void cudaAllocDragLift(int lev, int numofelem);
diff --git a/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp b/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp
index 198aa0165f673300bd399783f09d0348e397e9c2..2425184d8a59390e343e603d5f775ccf88c491ca 100644
--- a/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp
+++ b/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp
@@ -51,6 +51,7 @@
 //////////////////////////////////////////////////////////////////////////
 #include "DataStructureInitializer/GridProvider.h"
 #include "Output/DataWriter.h"
+#include "Output/DistributionDebugWriter.h"
 #include "Kernel/Utilities/KernelFactory/KernelFactory.h"
 #include "PreProcessor/PreProcessorFactory/PreProcessorFactory.h"
 #include "PreProcessor/PreProcessorFactory/PreProcessorFactoryImp.h"
@@ -331,6 +332,9 @@ void Simulation::init(GridProvider &gridProvider, BoundaryConditionFactory *bcFa
         }
     }
 
+    // Allocate host memory for distribution debug writer
+    cudaMemoryManager->cudaAllocFsForAllLevelsOnHost();
+
     //////////////////////////////////////////////////////////////////////////
     // Restart
     //////////////////////////////////////////////////////////////////////////
@@ -1013,6 +1017,11 @@ void Simulation::readAndWriteFiles(uint timestep)
     }
     ////////////////////////////////////////////////////////////////////////
     if (para->getCalcParticles()) copyAndPrintParticles(para.get(), cudaMemoryManager.get(), timestep, false);
+
+    // Write distributions (f's) for debugging purposes.
+    cudaMemoryManager->cudaCopyFsForAllLevelsToHost();
+    DistributionDebugWriter::writeDistributions(para.get(), timestep);
+
     ////////////////////////////////////////////////////////////////////////
     VF_LOG_INFO("... done");
     ////////////////////////////////////////////////////////////////////////
diff --git a/src/gpu/VirtualFluids_GPU/Output/DistributionDebugWriter.cpp b/src/gpu/VirtualFluids_GPU/Output/DistributionDebugWriter.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0fb48b1573c56b042167ea4d08adcd94e7d73a08
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/Output/DistributionDebugWriter.cpp
@@ -0,0 +1,90 @@
+#include <basics/writer/WbWriterVtkXmlBinary.h>
+#include <lbm/constants/D3Q27.h>
+#include "DistributionDebugWriter.h"
+#include "Parameter/Parameter.h"
+#include "WriterUtilities.h"
+
+using namespace vf::lbm::dir;
+
+void DistributionDebugWriter::writeDistributions(const Parameter* para, uint timestep)
+{
+    for (int level = para->getCoarse(); level <= para->getFine(); level++) {
+        DistributionDebugWriter::writeDistributionsForLevel(para, level, timestep);
+    }
+}
+
+void DistributionDebugWriter::writeDistributionsForLevel(const Parameter* para, uint level, uint timestep)
+{
+    const uint numberOfParts = WriterUtilities::calculateNumberOfParts(para, level);
+
+    std::vector<std::string> fileNames;
+    for (uint i = 1; i <= numberOfParts; i++) {
+        fileNames.push_back(para->getFName() + "_bin_distributions" +
+                            WriterUtilities::makePartFileNameEnding(level, para->getMyProcessID(), i, timestep));
+    }
+
+    std::vector<std::string> nodeDataNames(NUMBER_Of_DIRECTIONS);
+
+    for (uint dir = STARTDIR; dir <= ENDDIR; dir++) {
+        const size_t minLenghtOfNumberString = 2; // the number is padded with zeros to this length
+        const auto numberString = std::to_string(dir);
+        nodeDataNames[dir] =
+            "f_" + std::string(minLenghtOfNumberString - std::min(minLenghtOfNumberString, numberString.length()), '0') +
+            numberString;
+    }
+
+    uint sizeOfNodes;
+    uint startPosition;
+    uint endPosition;
+    std::array<uint, 8> indicesOfOct;
+    std::array<uint, 8> relativePosInPart;
+    uint relPosInPart;
+
+    const LBMSimulationParameter* parH = para->getParHConst(level).get();
+    Distributions27 distributions = parH->distributions;
+
+    for (unsigned int part = 0; part < (uint)fileNames.size(); part++) {
+        sizeOfNodes = WriterUtilities::calculateNumberOfNodesInPart(para, level, part);
+        startPosition = part * para->getLimitOfNodesForVTK();
+        endPosition = startPosition + sizeOfNodes;
+
+        std::vector<UbTupleFloat3> nodes(sizeOfNodes);
+        std::vector<UbTupleUInt8> cells;
+        std::vector<std::vector<double>> nodeData(nodeDataNames.size());
+        for (uint i = 0; i < (uint)nodeDataNames.size(); i++)
+            nodeData[i].resize(sizeOfNodes);
+
+        for (unsigned int pos = startPosition; pos < endPosition; pos++) {
+
+            if (parH->typeOfGridNode[pos] != GEO_FLUID)
+                continue;
+
+            relPosInPart = pos - startPosition;
+
+            // node
+            double x1 = parH->coordinateX[pos];
+            double x2 = parH->coordinateY[pos];
+            double x3 = parH->coordinateZ[pos];
+            nodes[relPosInPart] = (makeUbTuple((float)(x1), (float)(x2), (float)(x3)));
+
+            // node data
+            for (uint dir = STARTDIR; dir <= ENDDIR; dir++) {
+                nodeData[dir][relPosInPart] = distributions.f[0][dir*parH->numberOfNodes + pos];
+            }
+
+            WriterUtilities::getIndicesOfAllNodesInOct(indicesOfOct, pos, parH);
+            if (WriterUtilities::isPeriodicCell(parH, indicesOfOct[0], indicesOfOct[6])) {
+                continue;
+            }
+
+            if (WriterUtilities::areAllNodesInOctValidForWriting(indicesOfOct, parH, endPosition)) {
+                WriterUtilities::calculateRelativeNodeIndexInPart(relativePosInPart, indicesOfOct, startPosition);
+                cells.push_back(makeUbTupleFromArray(relativePosInPart));
+            }
+        }
+
+        std::string fileName = WbWriterVtkXmlBinary::getInstance()->writeOctsWithNodeData(fileNames[part], nodes, cells,
+                                                                                          nodeDataNames, nodeData);
+        VF_LOG_DEBUG("DistributionDebugWriter wrote to {} ", fileName);
+    }
+}
diff --git a/src/gpu/VirtualFluids_GPU/Output/DistributionDebugWriter.h b/src/gpu/VirtualFluids_GPU/Output/DistributionDebugWriter.h
new file mode 100644
index 0000000000000000000000000000000000000000..e789fdbc51f0b225a64bb0d6b0043416e8da2b90
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/Output/DistributionDebugWriter.h
@@ -0,0 +1,15 @@
+#ifndef DISTRIBUTION_DEBUG_WRITER
+#define DISTRIBUTION_DEBUG_WRITER
+
+#include <basics/DataTypes.h>
+
+class Parameter;
+
+class DistributionDebugWriter
+{
+public:
+    static void writeDistributions(const Parameter* para, uint timestep);
+    static void writeDistributionsForLevel(const Parameter* para, uint level, uint timestep);
+};
+
+#endif