diff --git a/src/gpu/GridGenerator/grid/Grid.h b/src/gpu/GridGenerator/grid/Grid.h
index 32f57179182b069c3f2804e71e60824e831a4102..d4621562a56e33e632d8e2a9c92dd40fa49f3bbf 100644
--- a/src/gpu/GridGenerator/grid/Grid.h
+++ b/src/gpu/GridGenerator/grid/Grid.h
@@ -143,7 +143,7 @@ public:
 
     // needed for CUDA Streams 
     virtual void findFluidNodeIndices(bool onlyBulk) = 0;
-    virtual uint getNumberOfFluidNodes() const = 0;;
+    virtual uint getNumberOfFluidNodes() const = 0;
     virtual void getFluidNodeIndices(uint *fluidNodeIndices) const = 0;
 
     virtual uint getNumberOfFluidNodesBorder() const = 0;
diff --git a/src/gpu/GridGenerator/grid/GridImp.cu b/src/gpu/GridGenerator/grid/GridImp.cu
index ee54a8b0298a8b01fd923f70e98e47ed0d25eea8..3cb31261714ef6d7b0915bd2049b0db8d4fbf088 100644
--- a/src/gpu/GridGenerator/grid/GridImp.cu
+++ b/src/gpu/GridGenerator/grid/GridImp.cu
@@ -1974,7 +1974,7 @@ CUDA_HOST void GridImp::getFluidNodeIndices(uint *fluidNodeIndices) const
 
 uint GridImp::getNumberOfFluidNodesBorder() const 
 { 
-    return this->fluidNodeIndicesBorder.size(); 
+    return (uint)this->fluidNodeIndicesBorder.size(); 
 }
 
 void GridImp::getFluidNodeIndicesBorder(uint *fluidNodeIndicesBorder) const 
diff --git a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp
index 445244ac761ed7dfa49f9fcaa385987e31955fa0..9d3bd9030b5f82e100d5d913906b60b257df2e4e 100644
--- a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp
+++ b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp
@@ -27,10 +27,10 @@ void updateGrid27(Parameter* para,
     //////////////////////////////////////////////////////////////////////////
 
     if (para->useStreams) {
-        collision(para, pm, level, t, kernels, para->getParD(level)->fluidNodeIndices,
-                  para->getParD(level)->numberOfFluidNodes);
-        collision(para, pm, level, t, kernels, para->getParD(level)->fluidNodeIndicesBorder,
-                  para->getParD(level)->numberOffluidNodesBorder);
+        collisionUsingIndex(para, pm, level, t, kernels, para->getParD(level)->fluidNodeIndices,
+                            para->getParD(level)->numberOfFluidNodes, 0);
+        collisionUsingIndex(para, pm, level, t, kernels, para->getParD(level)->fluidNodeIndicesBorder,
+                            para->getParD(level)->numberOffluidNodesBorder, 1);
     }
     else
         collision(para, pm, level, t, kernels);
@@ -68,15 +68,30 @@ void updateGrid27(Parameter* para,
     }
 }
 
-void collision(Parameter* para, std::vector<std::shared_ptr<PorousMedia>>& pm, int level, unsigned int t, std::vector < SPtr< Kernel>>& kernels, uint* fluidNodeIndices, uint numberOfFluidNodes)
+void collision(Parameter* para, std::vector<std::shared_ptr<PorousMedia>>& pm, int level, unsigned int t, std::vector < SPtr< Kernel>>& kernels)
 {
-    if (para->useStreams)
-        if (fluidNodeIndices != nullptr && numberOfFluidNodes != 0)
-            kernels.at(level)->runOnIndices(fluidNodeIndices, numberOfFluidNodes);
-        else
-            std::cout << "in collision: fluidNodeIndices or numberOfFluidNodes not definded" << std::endl; // better use logger
+    kernels.at(level)->run();
+
+    //////////////////////////////////////////////////////////////////////////
+
+    if (para->getSimulatePorousMedia())
+        collisionPorousMedia(para, pm, level);
+
+    //////////////////////////////////////////////////////////////////////////
+
+    if (para->getDiffOn())
+        collisionAdvectionDiffusion(para, level);
+}
+
+void collisionUsingIndex(Parameter *para, std::vector<std::shared_ptr<PorousMedia>> &pm, int level, unsigned int t,
+                          std::vector<SPtr<Kernel>> &kernels, 
+               uint *fluidNodeIndices, uint numberOfFluidNodes, int stream)
+{
+    if (fluidNodeIndices != nullptr && numberOfFluidNodes != 0)
+        kernels.at(level)->runOnIndices(fluidNodeIndices, numberOfFluidNodes, stream);
     else
-        kernels.at(level)->run();
+        std::cout << "in collision: fluidNodeIndices or numberOfFluidNodes not definded"
+                      << std::endl;
 
     //////////////////////////////////////////////////////////////////////////
 
diff --git a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.h b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.h
index 0ce9383b9dc30e9dcdca4586f6d06c2593dbf027..44b02d36fa4a428e3b0ba299df73430ef491ca7a 100644
--- a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.h
+++ b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.h
@@ -18,7 +18,9 @@ extern "C" void updateGrid27(Parameter* para,
                              unsigned int t, 
                              std::vector < SPtr< Kernel>>& kernels);
 
-extern "C" void collision(Parameter *para, std::vector<std::shared_ptr<PorousMedia>> &pm, int level, unsigned int t,  std::vector<SPtr<Kernel>> &kernels, uint *fluidNodeIndices = nullptr, uint numberOfFluidNodes = 0);
+extern "C" void collision(Parameter *para, std::vector<std::shared_ptr<PorousMedia>> &pm, int level, unsigned int t,  std::vector<SPtr<Kernel>> &kernels);
+
+extern "C" void collisionUsingIndex(Parameter *para, std::vector<std::shared_ptr<PorousMedia>> &pm, int level, unsigned int t,  std::vector<SPtr<Kernel>> &kernels, uint *fluidNodeIndices = nullptr, uint numberOfFluidNodes = 0, int stream = -1);
 
 extern "C" void collisionPorousMedia(Parameter* para, std::vector<std::shared_ptr<PorousMedia>>& pm, int level);
 
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernel.h b/src/gpu/VirtualFluids_GPU/Kernel/Kernel.h
index ceb1638adbfb3abf38dae61714a69356b0bb8361..9f9f7539bc5a1e28612d956ca32234c5a3589f8a 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernel.h
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernel.h
@@ -13,7 +13,7 @@ class Kernel
 public:
     virtual ~Kernel()  = default;
     virtual void run() = 0;
-    virtual void runOnIndices(const unsigned int *indices, unsigned int size_indices) = 0;
+    virtual void runOnIndices(const unsigned int *indices, unsigned int size_indices, int stream = -1) = 0; //if stream == -1: run on default stream
 
     virtual bool checkParameter()                                = 0;
     virtual std::vector<PreProcessorType> getPreProcessorTypes() = 0;
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/KernelImp.cpp b/src/gpu/VirtualFluids_GPU/Kernel/KernelImp.cpp
index cd06a7c94b2ffce4dcbb20d4f564fa82a56321f2..3151e6bedeb6a96666f11f0040de2c95b20cc42c 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/KernelImp.cpp
+++ b/src/gpu/VirtualFluids_GPU/Kernel/KernelImp.cpp
@@ -3,7 +3,7 @@
 #include "Kernel/Utilities/CheckParameterStrategy/CheckParameterStrategy.h"
 
 
-void KernelImp::runOnIndices(const unsigned int *indices, unsigned int size_indices)
+void KernelImp::runOnIndices(const unsigned int *indices, unsigned int size_indices, int stream)
 {
     printf("Method not implemented for this Kernel \n");
 }
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/KernelImp.h b/src/gpu/VirtualFluids_GPU/Kernel/KernelImp.h
index 0986e7b9759a69cf6df6c9624a3e67fcb8716100..c5215dbdd00e41a76b49c6c67808a6e7c44bb9a8 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/KernelImp.h
+++ b/src/gpu/VirtualFluids_GPU/Kernel/KernelImp.h
@@ -14,7 +14,7 @@ class KernelImp : public Kernel
 {
 public:
     virtual void run() = 0;
-    virtual void runOnIndices(const unsigned int *indices, unsigned int size_indices);
+    virtual void runOnIndices(const unsigned int *indices, unsigned int size_indices, int stream = -1);
 
     bool checkParameter();
     std::vector<PreProcessorType> getPreProcessorTypes();
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimSparse/CumulantK17CompChimSparse.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimSparse/CumulantK17CompChimSparse.cu
index 6c3555c8f74762de4f9d57053fe0ec10c959893d..afad3118124851461132cef17a4f8a5801ddc28d 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimSparse/CumulantK17CompChimSparse.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimSparse/CumulantK17CompChimSparse.cu
@@ -3,6 +3,8 @@
 #include "Parameter/Parameter.h"
 #include "CumulantK17CompChimSparse_Device.cuh"
 
+#include <cuda.h>
+
 std::shared_ptr<CumulantK17CompChimSparse> CumulantK17CompChimSparse::getNewInstance(std::shared_ptr<Parameter> para,
                                                                                int level)
 {
@@ -30,25 +32,32 @@ void CumulantK17CompChimSparse::run()
 	getLastCudaError("LB_Kernel_CumulantK17CompChim execution failed");
 }
 
-void CumulantK17CompChimSparse::runOnIndices(const unsigned int *indices, unsigned int size_indices)
+void CumulantK17CompChimSparse::runOnIndices(const unsigned int *indices, unsigned int size_indices, int streamIndex)
 {
     dim3 grid, threads;
     std::tie(grid, threads) = *calcGridDimensions(para->getParD(level)->numberOfFluidNodes);
 
-    LB_Kernel_CumulantK17CompChimSparse<<<grid, threads, 0, para->getStream(0)>>>(
+    cudaStream_t stream;
+    if (streamIndex == -1)
+        stream = CU_STREAM_LEGACY;
+    else
+        stream = para->getStream(streamIndex);
+
+    LB_Kernel_CumulantK17CompChimSparse<<<grid, threads, 0, stream>>>(
         para->getParD(level)->omega, 
-		para->getParD(level)->neighborX_SP, 
-		para->getParD(level)->neighborY_SP,
+	    para->getParD(level)->neighborX_SP, 
+	    para->getParD(level)->neighborY_SP,
         para->getParD(level)->neighborZ_SP, 
-		para->getParD(level)->d0SP.f[0], 
-		para->getParD(level)->size_Mat_SP, 
-		level,
+	    para->getParD(level)->d0SP.f[0], 
+	    para->getParD(level)->size_Mat_SP, 
+	    level,
         para->getForcesDev(), 
-		para->getQuadricLimitersDev(), 
-		para->getParD(level)->evenOrOdd,
+	    para->getQuadricLimitersDev(), 
+	    para->getParD(level)->evenOrOdd,
         indices,
-		size_indices);
+	    size_indices);
     getLastCudaError("LB_Kernel_CumulantK17CompChim execution failed");
+    
 }
 
 CumulantK17CompChimSparse::CumulantK17CompChimSparse(std::shared_ptr<Parameter> para, int level)
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimSparse/CumulantK17CompChimSparse.h b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimSparse/CumulantK17CompChimSparse.h
index f1645cfaec3ca28b0856bffc0ecb45319746216e..906c880337d1747f96bc7a0f5d5b7988d6235ee5 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimSparse/CumulantK17CompChimSparse.h
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimSparse/CumulantK17CompChimSparse.h
@@ -7,8 +7,8 @@ class CumulantK17CompChimSparse : public KernelImp
 {
 public:
     static std::shared_ptr<CumulantK17CompChimSparse> getNewInstance(std::shared_ptr<Parameter> para, int level);
-	void run();
-    void runOnIndices(const unsigned int *indices, unsigned int size_indices) override;
+	void run() override;
+    void runOnIndices(const unsigned int *indices, unsigned int size_indices, int stream = -1) override;
 
 private:
     CumulantK17CompChimSparse();
diff --git a/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp b/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp
index 34c45493d7ce7e8606c1e96a97e770ba57d1e152..9bbfa66b6f1132a66146bba833ba7c5a5c18d041 100644
--- a/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp
+++ b/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp
@@ -106,7 +106,7 @@ void Simulation::init(SPtr<Parameter> para, SPtr<GridProvider> gridProvider, std
    //////////////////////////////////////////////////////////////////////////
    // CUDA streams
    if(para->useStreams)
-	   para->launchStreams((uint)1);
+	   para->launchStreams((uint)2);
    //////////////////////////////////////////////////////////////////////////
    // 
    //output << para->getNeedInterface().at(0) << "\n";
diff --git a/src/gpu/VirtualFluids_GPU/Parameter/Parameter.h b/src/gpu/VirtualFluids_GPU/Parameter/Parameter.h
index 9f70266e4656863ca18fed53afe30196410c5d40..5d24abe4f812e9a898a32c32fcef9aa759b87593 100644
--- a/src/gpu/VirtualFluids_GPU/Parameter/Parameter.h
+++ b/src/gpu/VirtualFluids_GPU/Parameter/Parameter.h
@@ -43,8 +43,8 @@
 
 #include "VirtualFluids_GPU_export.h"
 
+#include <cuda.h>
 #include <cuda_runtime.h>
-#include <helper_cuda.h>
 
 struct curandStateXORWOW;
 typedef struct curandStateXORWOW curandState;