From 9c35d0ef2bc12332f2137c2b74e5806b68b04d61 Mon Sep 17 00:00:00 2001
From: Anna Wellmann <a.wellmann@tu-braunschweig.de>
Date: Fri, 6 Aug 2021 14:29:32 +0200
Subject: [PATCH] Use CUDA streams on multiple GPUs

---
 apps/gpu/LBM/MusselOyster/MusselOyster.cpp    | 23 ++++++++++---------
 .../Calculation/UpdateGrid27.cpp              | 19 ++++++++++-----
 .../Calculation/UpdateGrid27.h                |  3 ++-
 .../Communication/ExchangeData27.cpp          | 13 +++++++----
 .../Communication/ExchangeData27.h            |  7 +++---
 .../GPU/CudaMemoryManager.cpp                 | 19 ++++++++++-----
 .../VirtualFluids_GPU/GPU/CudaMemoryManager.h |  2 +-
 src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp  |  4 ++--
 8 files changed, 55 insertions(+), 35 deletions(-)

diff --git a/apps/gpu/LBM/MusselOyster/MusselOyster.cpp b/apps/gpu/LBM/MusselOyster/MusselOyster.cpp
index 2b9ce1f11..30fbd5ec8 100644
--- a/apps/gpu/LBM/MusselOyster/MusselOyster.cpp
+++ b/apps/gpu/LBM/MusselOyster/MusselOyster.cpp
@@ -109,7 +109,7 @@ void multipleLevel(const std::string& configPath)
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
     bool useGridGenerator = true;
-    bool useMultiGPU = false;
+    bool useMultiGPU = true;
     bool useStreams= true;
 
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -132,8 +132,8 @@ void multipleLevel(const std::string& configPath)
     *logging::out << logging::Logger::INFO_HIGH << "velocity real [m/s] = " << vxLB * para->getVelocityRatio()<< " \n";
     *logging::out << logging::Logger::INFO_HIGH << "viscosity real [m^2/s] = " << viscosityLB * para->getViscosityRatio() << "\n";
 
-    para->setTOut(5000);
-    para->setTEnd(50000);
+    para->setTOut(100);
+    para->setTEnd(100);
 
     para->setCalcDragLift(false);
     para->setUseWale(false);
@@ -148,7 +148,7 @@ void multipleLevel(const std::string& configPath)
     //para->setMainKernel("CumulantK17CompChim");
     para->useStreams = useStreams;
     para->setMainKernel("CumulantK17CompChimSparse");
-   *logging::out << logging::Logger::INFO_HIGH << "Kernel: " << para->getMainKernel() << "\n";
+    *logging::out << logging::Logger::INFO_HIGH << "Kernel: " << para->getMainKernel() << "\n";
 
     if (useMultiGPU) {
         para->setDevices(std::vector<uint>{ (uint)0, (uint)1 });
@@ -186,8 +186,8 @@ void multipleLevel(const std::string& configPath)
 
         TriangularMesh *bivalveSTL =
             TriangularMesh::make("C:/Users/Master/Documents/MasterAnna/STL/" + bivalveType + ".stl");
-         TriangularMesh* bivalveRef_1_STL =
-             TriangularMesh::make("C:/Users/Master/Documents/MasterAnna/STL/" + bivalveType + "_Level1.stl");
+         //TriangularMesh* bivalveRef_1_STL =
+         //    TriangularMesh::make("C:/Users/Master/Documents/MasterAnna/STL/" + bivalveType + "_Level1.stl");
 
         if (useMultiGPU) {
             const uint generatePart = vf::gpu::Communicator::getInstanz()->getPID();
@@ -204,8 +204,8 @@ void multipleLevel(const std::string& configPath)
                                            xGridMax,    yGridMax,           zGridMax,   dxGrid);
             }
 
-             gridBuilder->setNumberOfLayers(6, 8);
-             gridBuilder->addGrid(bivalveRef_1_STL, 1);
+             //gridBuilder->setNumberOfLayers(6, 8);
+             //gridBuilder->addGrid(bivalveRef_1_STL, 1);
 
             gridBuilder->addGeometry(bivalveSTL);
 
@@ -222,8 +222,6 @@ void multipleLevel(const std::string& configPath)
 
             gridBuilder->buildGrids(LBM, true); // buildGrids() has to be called before setting the BCs!!!!
 
-            gridBuilder->findFluidNodes(useStreams);
-
             if (generatePart == 0) {
                 gridBuilder->findCommunicationIndices(CommunicationDirections::PY, LBM);
                 gridBuilder->setCommunicationProcess(CommunicationDirections::PY, 1);
@@ -244,6 +242,8 @@ void multipleLevel(const std::string& configPath)
             gridBuilder->setVelocityBoundaryCondition(SideType::GEOMETRY, 0.0, 0.0, 0.0);
             //////////////////////////////////////////////////////////////////////////
 
+            gridBuilder->findFluidNodes(useStreams);
+
             //gridBuilder->writeGridsToVtk(path + "/" + bivalveType + "/grid/part" + std::to_string(generatePart) + "_");
             //gridBuilder->writeGridsToVtk(path + "/" + bivalveType + "/" + std::to_string(generatePart) + "/grid/");
             //gridBuilder->writeArrows(path + "/" + bivalveType + "/" + std::to_string(generatePart) + " /arrow");
@@ -262,7 +262,6 @@ void multipleLevel(const std::string& configPath)
             gridBuilder->setPeriodicBoundaryCondition(false, false, true);
 
             gridBuilder->buildGrids(LBM, true); // buildGrids() has to be called before setting the BCs!!!!
-            gridBuilder->findFluidNodes(useStreams);
 
             //////////////////////////////////////////////////////////////////////////
             gridBuilder->setVelocityBoundaryCondition(SideType::PY, vxLB, 0.0, 0.0);
@@ -273,6 +272,8 @@ void multipleLevel(const std::string& configPath)
             gridBuilder->setVelocityBoundaryCondition(SideType::GEOMETRY, 0.0, 0.0, 0.0);
             //////////////////////////////////////////////////////////////////////////
 
+            gridBuilder->findFluidNodes(useStreams);
+
             // gridBuilder->writeGridsToVtk("E:/temp/MusselOyster/" + bivalveType + "/grid/");
             // gridBuilder->writeArrows ("E:/temp/MusselOyster/" + bivalveType + "/arrow");
 
diff --git a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp
index 9d3bd9030..bf2be2cbd 100644
--- a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp
+++ b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp
@@ -27,17 +27,20 @@ void updateGrid27(Parameter* para,
     //////////////////////////////////////////////////////////////////////////
 
     if (para->useStreams) {
-        collisionUsingIndex(para, pm, level, t, kernels, para->getParD(level)->fluidNodeIndices,
-                            para->getParD(level)->numberOfFluidNodes, 0);
         collisionUsingIndex(para, pm, level, t, kernels, para->getParD(level)->fluidNodeIndicesBorder,
                             para->getParD(level)->numberOffluidNodesBorder, 1);
+        collisionUsingIndex(para, pm, level, t, kernels, para->getParD(level)->fluidNodeIndices,
+                            para->getParD(level)->numberOfFluidNodes, 0);
     }
     else
         collision(para, pm, level, t, kernels);
 
     //////////////////////////////////////////////////////////////////////////
 
-    exchangeMultiGPU(para, comm, cudaManager, level);
+    if (para->useStreams)
+        exchangeMultiGPU(para, comm, cudaManager, level, 1);
+    else
+        exchangeMultiGPU(para, comm, cudaManager, level, -1);
 
     //////////////////////////////////////////////////////////////////////////
 
@@ -62,7 +65,10 @@ void updateGrid27(Parameter* para,
     {
         fineToCoarse(para, level);
 
-        exchangeMultiGPU(para, comm, cudaManager, level);
+        if (para->useStreams)
+            exchangeMultiGPU(para, comm, cudaManager, level, 1);
+        else
+            exchangeMultiGPU(para, comm, cudaManager, level, -1);
 
         coarseToFine(para, level);
     }
@@ -177,7 +183,8 @@ void collisionAdvectionDiffusion(Parameter* para, int level)
 	}
 }
 
-void exchangeMultiGPU(Parameter* para, vf::gpu::Communicator* comm, CudaMemoryManager* cudaManager, int level)
+void exchangeMultiGPU(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager, int level,
+                      int streamIndex)
 {
     if (para->getNumprocs() > 1)
 	{
@@ -186,7 +193,7 @@ void exchangeMultiGPU(Parameter* para, vf::gpu::Communicator* comm, CudaMemoryMa
 		//////////////////////////////////////////////////////////////////////////
 		//3D domain decomposition
 		exchangePostCollDataXGPU27(para, comm, cudaManager, level);
-		exchangePostCollDataYGPU27(para, comm, cudaManager, level);
+        exchangePostCollDataYGPU27(para, comm, cudaManager, level, streamIndex);
 		exchangePostCollDataZGPU27(para, comm, cudaManager, level);
 
 		//////////////////////////////////////////////////////////////////////////
diff --git a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.h b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.h
index 44b02d36f..8e9fbff04 100644
--- a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.h
+++ b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.h
@@ -26,7 +26,8 @@ extern "C" void collisionPorousMedia(Parameter* para, std::vector<std::shared_pt
 
 extern "C" void collisionAdvectionDiffusion(Parameter* para, int level);
 
-extern "C" void exchangeMultiGPU(Parameter* para, vf::gpu::Communicator* comm, CudaMemoryManager* cudaManager, int level);
+extern "C" void exchangeMultiGPU(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager,
+                                 int level, int streamIndex);
 
 extern "C" void postCollisionBC(Parameter* para, int level, unsigned int t);
 
diff --git a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp
index 8f89656ac..474f5327f 100644
--- a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp
+++ b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp
@@ -176,7 +176,7 @@ void exchangePostCollDataXGPU27(Parameter* para, vf::gpu::Communicator* comm, Cu
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Y
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-void exchangePreCollDataYGPU27(Parameter* para, vf::gpu::Communicator* comm, CudaMemoryManager* cudaManager, int level)
+void exchangePreCollDataYGPU27(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager, int level, int streamIndex)
 {
 	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 	//copy Device to Host
@@ -192,8 +192,10 @@ void exchangePreCollDataYGPU27(Parameter* para, vf::gpu::Communicator* comm, Cud
 						  para->getParD(level)->size_Mat_SP, 
 						  para->getParD(level)->evenOrOdd,
 						  para->getParD(level)->numberofthreads);
-		//////////////////////////////////////////////////////////////////////////
-		cudaManager->cudaCopyProcessNeighborYFsDH(level, i);
+		//////////////////////////////////////////////////////////////////////////);
+		cudaManager->cudaCopyProcessNeighborYFsDH(level, i, streamIndex);
+        if (streamIndex != -1)            
+            cudaStreamSynchronize(para->getStream(streamIndex));
 	}
 	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 	//start non blocking MPI receive
@@ -257,7 +259,8 @@ void exchangePreCollDataYGPU27(Parameter* para, vf::gpu::Communicator* comm, Cud
 	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-void exchangePostCollDataYGPU27(Parameter* para, vf::gpu::Communicator* comm, CudaMemoryManager* cudaManager, int level)
+void exchangePostCollDataYGPU27(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager, int level,
+                                int streamIndex)
 {
 	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 	//copy Device to Host
@@ -274,7 +277,7 @@ void exchangePostCollDataYGPU27(Parameter* para, vf::gpu::Communicator* comm, Cu
 						   para->getParD(level)->evenOrOdd,
 						   para->getParD(level)->numberofthreads);
 		//////////////////////////////////////////////////////////////////////////
-		cudaManager->cudaCopyProcessNeighborYFsDH(level, i);
+		cudaManager->cudaCopyProcessNeighborYFsDH(level, i, streamIndex);
 	}
 	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 	//start non blocking MPI receive
diff --git a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.h b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.h
index 34c9cba80..b0dd8d53b 100644
--- a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.h
+++ b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.h
@@ -14,10 +14,11 @@ extern "C" void exchangePostCollDataGPU27(Parameter* para, vf::gpu::Communicator
 //////////////////////////////////////////////////////////////////////////
 //3D domain decomposition
 extern "C" void exchangePreCollDataXGPU27(Parameter* para, vf::gpu::Communicator* comm, CudaMemoryManager* cudaManager, int level);
-extern "C" void exchangePreCollDataYGPU27(Parameter* para, vf::gpu::Communicator* comm, CudaMemoryManager* cudaManager, int level);
+extern "C" void exchangePreCollDataYGPU27(Parameter* para, vf::gpu::Communicator* comm, CudaMemoryManager* cudaManager, int level, int streamIndex = -1);
 extern "C" void exchangePreCollDataZGPU27(Parameter* para, vf::gpu::Communicator* comm, CudaMemoryManager* cudaManager, int level);
-extern "C" void exchangePostCollDataXGPU27(Parameter* para, vf::gpu::Communicator* comm, CudaMemoryManager* cudaManager, int level);
-extern "C" void exchangePostCollDataYGPU27(Parameter* para, vf::gpu::Communicator* comm, CudaMemoryManager* cudaManager, int level);
+extern "C" void exchangePostCollDataXGPU27(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager, int level);
+extern "C" void exchangePostCollDataYGPU27(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager,
+                                           int level, int streamIndex = -1);
 extern "C" void exchangePostCollDataZGPU27(Parameter* para, vf::gpu::Communicator* comm, CudaMemoryManager* cudaManager, int level);
 //////////////////////////////////////////////////////////////////////////
 //3D domain decomposition convection diffusion
diff --git a/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.cpp b/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.cpp
index e4fe89cb9..827746a06 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.cpp
+++ b/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.cpp
@@ -570,12 +570,19 @@ void CudaMemoryManager::cudaCopyProcessNeighborYFsHD(int lev, unsigned int proce
 								parameter->getD3Qxx() * parameter->getParD(lev)->recvProcessNeighborY[processNeighbor].memsizeFs, 
 								cudaMemcpyHostToDevice));
 }
-void CudaMemoryManager::cudaCopyProcessNeighborYFsDH(int lev, unsigned int processNeighbor)
-{
-	checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->sendProcessNeighborY[processNeighbor].f[0], 
-								parameter->getParD(lev)->sendProcessNeighborY[processNeighbor].f[0], 
-								parameter->getD3Qxx() * parameter->getParD(lev)->sendProcessNeighborY[processNeighbor].memsizeFs, 
-								cudaMemcpyDeviceToHost));
+void CudaMemoryManager::cudaCopyProcessNeighborYFsDH(int lev, unsigned int processNeighbor, int streamIndex)
+{
+    if (streamIndex == -1)
+	    checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->sendProcessNeighborY[processNeighbor].f[0], 
+	    							parameter->getParD(lev)->sendProcessNeighborY[processNeighbor].f[0], 
+	    							parameter->getD3Qxx() * parameter->getParD(lev)->sendProcessNeighborY[processNeighbor].memsizeFs, 
+	    							cudaMemcpyDeviceToHost));
+    else
+        checkCudaErrors(
+            cudaMemcpyAsync(parameter->getParH(lev)->sendProcessNeighborY[processNeighbor].f[0],
+                            parameter->getParD(lev)->sendProcessNeighborY[processNeighbor].f[0],
+                            parameter->getD3Qxx() * parameter->getParD(lev)->sendProcessNeighborY[processNeighbor].memsizeFs,
+                            cudaMemcpyDeviceToHost, parameter->getStream(streamIndex)));
 }
 void CudaMemoryManager::cudaFreeProcessNeighborY(int lev, unsigned int processNeighbor)
 {
diff --git a/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h b/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h
index 492d9b9be..5db28ec44 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h
+++ b/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h
@@ -96,7 +96,7 @@ public:
 	//
 	void cudaAllocProcessNeighborY(int lev, unsigned int processNeighbor);
 	void cudaCopyProcessNeighborYFsHD(int lev, unsigned int processNeighbor);
-	void cudaCopyProcessNeighborYFsDH(int lev, unsigned int processNeighbor);
+    void cudaCopyProcessNeighborYFsDH(int lev, unsigned int processNeighbor, int streamIndex);
 	void cudaCopyProcessNeighborYIndex(int lev, unsigned int processNeighbor);
 	void cudaFreeProcessNeighborY(int lev, unsigned int processNeighbor);
 	//
diff --git a/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp b/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp
index 9bbfa66b6..a90ae2b5b 100644
--- a/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp
+++ b/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp
@@ -433,7 +433,7 @@ void Simulation::run()
         // run Analyzers for kinetic energy and enstrophy for TGV in 3D
         // these analyzers only work on level 0
 	    ////////////////////////////////////////////////////////////////////////////////
-        if( this->kineticEnergyAnalyzer || this->enstrophyAnalyzer ) exchangeMultiGPU(para.get(), comm, cudaManager.get(), 0);
+        if( this->kineticEnergyAnalyzer || this->enstrophyAnalyzer ) exchangeMultiGPU(para.get(), comm, cudaManager.get(), 0, -1);
 
 	    if( this->kineticEnergyAnalyzer ) this->kineticEnergyAnalyzer->run(t);
 	    if( this->enstrophyAnalyzer     ) this->enstrophyAnalyzer->run(t);
@@ -673,7 +673,7 @@ void Simulation::run()
             {
 		        //////////////////////////////////////////////////////////////////////////
 		        //exchange data for valid post process
-		        exchangeMultiGPU(para.get(), comm, cudaManager.get(), lev);
+		        exchangeMultiGPU(para.get(), comm, cudaManager.get(), lev, -1);
                 //////////////////////////////////////////////////////////////////////////
                //if (para->getD3Qxx()==19)
                //{
-- 
GitLab