From f8794af18e1430e9653e11d14b059cc61797342d Mon Sep 17 00:00:00 2001
From: Anna Wellmann <a.wellmann@tu-bs.de>
Date: Tue, 3 Jan 2023 09:26:35 +0000
Subject: [PATCH] Add timers for performance optimization

---
 apps/gpu/LBM/ActuatorLine/ActuatorLine.cpp    |  2 +-
 .../LBM/ActuatorLine/configActuatorLine.txt   |  6 +-
 .../Calculation/UpdateGrid27.cpp              | 31 ++++++++++
 .../Calculation/UpdateGrid27.h                |  4 ++
 src/gpu/VirtualFluids_GPU/Output/Timer.cpp    |  7 +++
 src/gpu/VirtualFluids_GPU/Output/Timer.h      |  2 +
 .../PreCollisionInteractor/ActuatorFarm.cu    | 60 ++++++++++++++++---
 .../PreCollisionInteractor/ActuatorFarm.h     |  3 +
 8 files changed, 102 insertions(+), 13 deletions(-)

diff --git a/apps/gpu/LBM/ActuatorLine/ActuatorLine.cpp b/apps/gpu/LBM/ActuatorLine/ActuatorLine.cpp
index 77fa28738..8b5502329 100644
--- a/apps/gpu/LBM/ActuatorLine/ActuatorLine.cpp
+++ b/apps/gpu/LBM/ActuatorLine/ActuatorLine.cpp
@@ -251,7 +251,7 @@ void multipleLevel(const std::string& configPath)
     const real epsilon = dx*exp2(-level)*2.0; // width of gaussian smearing
     const real density = 1.225f;
     const uint nBlades = 3;
-    const uint nBladeNodes = 404; // passt zu auflösung von 105 Knoten
+    const uint nBladeNodes = reference_diameter * 4 + 3; // NOTE(review): comment inherited from the former fixed value 404 ("matches a resolution of 105 nodes") - confirm it still applies to the derived value
     VF_LOG_INFO("number of blade nodes ALM = {}", nBladeNodes);
     const real tipspeed_ratio = 7.5f; // tipspeed ratio = angular vel * radius / inflow vel
     const real omega = 2*tipspeed_ratio*velocity/reference_diameter;
diff --git a/apps/gpu/LBM/ActuatorLine/configActuatorLine.txt b/apps/gpu/LBM/ActuatorLine/configActuatorLine.txt
index f637c47f2..fc8403bab 100644
--- a/apps/gpu/LBM/ActuatorLine/configActuatorLine.txt
+++ b/apps/gpu/LBM/ActuatorLine/configActuatorLine.txt
@@ -8,12 +8,12 @@ Path = /work/y0078217/Results/ActuatorLine105
 GridPath=.
 ##################################################
 ReferenceDiameter=126
-NodesPerDiameter=105
+NodesPerDiameter=32
 Velocity=9
 ##################################################
 tStartOut=0
-tOut=5000
-tEnd=5000
+tOut=1
+tEnd=1
 ##################################################
 
 # tStartTmpAveraging=100
diff --git a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp
index 4136614df..26632f18d 100644
--- a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp
+++ b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp
@@ -13,8 +13,13 @@
 #include "CollisionStrategy.h"
 #include "RefinementStrategy.h"
 
+#include "Output/Timer.h"
+
 void UpdateGrid27::updateGrid(int level, unsigned int t)
 {
+    // NOTE(review): `timer` is one member shared by all levels; the recursive updateGrid(level + 1, t) call below restarts and resets it.
+    timer->startTimer(); // opens the first timed interval of this call
+
     //////////////////////////////////////////////////////////////////////////
 
     if (level != para->getFine()) {
@@ -22,18 +27,27 @@ void UpdateGrid27::updateGrid(int level, unsigned int t)
         updateGrid(level + 1, t);
     }
 
+
     //////////////////////////////////////////////////////////////////////////
+    std::cout << "updateGrid: level = " << level << ", t = " << t << '\n'; // plain newline: no flush per diagnostic line
+
     
     interactWithProbes(level, t);
+    std::cout << " interactWithProbes, " << timer->startStopGetElapsed() << '\n'; // lap closed by this call; no flush
 
     //////////////////////////////////////////////////////////////////////////
 
     collision(this, para.get(), level, t);
 
+    std::cout << " collision, " << timer->startStopGetElapsed() << '\n'; // lap closed by this call; no flush
+
     //////////////////////////////////////////////////////////////////////////
 
     postCollisionBC(level, t);
 
+    std::cout << " postCollisionBC, " << timer->startStopGetElapsed() << '\n'; // lap closed by this call; no flush
+
+
     //////////////////////////////////////////////////////////////////////////
 
     swapBetweenEvenAndOddTimestep(level);
@@ -45,20 +59,35 @@ void UpdateGrid27::updateGrid(int level, unsigned int t)
 
     calcTurbulentViscosity(level);
 
+    // lap covers everything since the postCollisionBC mark (e.g. swapBetweenEvenAndOddTimestep), not only calcTurbulentViscosity
+    std::cout << " calcTurbulentViscosity, " << timer->startStopGetElapsed() << '\n';
+
     //////////////////////////////////////////////////////////////////////////
 
     this->preCollisionBC(level, t);
 
+    std::cout << " preCollisionBC, " << timer->startStopGetElapsed() << '\n'; // lap closed by this call; no flush
+
+
     //////////////////////////////////////////////////////////////////////////
     if( level != para->getFine() )
     {   
         refinement(this, para.get(), level);
     }
 
+    // printed on every level, also when the refinement branch above was skipped
+    std::cout << " refinement, " << timer->startStopGetElapsed() << '\n';
+
     //////////////////////////////////////////////////////////////////////////
     
     interactWithActuators(level, t);
 
+    std::cout << " interactWithActuators, " << timer->startStopGetElapsed() << '\n';
+    std::cout << "total time, " << timer->getTotalElapsedTime() << std::endl; // the only flush of the per-call report
+    timer->resetTimer(); // clear the timer's elapsed values before the next updateGrid() call
+
+
+
 }
 
 void UpdateGrid27::collisionAllNodes(int level, unsigned int t)
@@ -391,4 +420,6 @@ UpdateGrid27::UpdateGrid27(SPtr<Parameter> para, vf::gpu::Communicator &comm, SP
     this->bcKernelManager = std::make_shared<BCKernelManager>(para, bcFactory);
     this->adKernelManager = std::make_shared<ADKernelManager>(para);
     this->gridScalingKernelManager = std::make_shared<GridScalingKernelManager>(para, scalingFactory);
+    timer = new Timer("ALM blade performance"); // NOTE(review): raw owning pointer, no delete visible in this patch; label is the same as ActuatorFarm's timer
+    timer->initTimer();
 }
diff --git a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.h b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.h
index 8ce2cf5bf..da6d53c17 100644
--- a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.h
+++ b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.h
@@ -17,6 +17,8 @@ class BoundaryConditionFactory;
 class GridScalingFactory;
 class TurbulenceModelFactory;
 class UpdateGrid27;
+class Timer;
+
 using CollisionStrategy = std::function<void (UpdateGrid27* updateGrid, Parameter* para, int level, unsigned int t)>;
 using RefinementStrategy = std::function<void (UpdateGrid27* updateGrid, Parameter* para, int level)>;
 
@@ -84,6 +86,8 @@ private:
     std::shared_ptr<GridScalingKernelManager> gridScalingKernelManager;
     //! \property tmFactory is a shared pointer to an object of TurbulenceModelFactory
     std::shared_ptr<TurbulenceModelFactory> tmFactory;
+    //! \property timer measures the phases of updateGrid(); allocated with new in the constructor (NOTE(review): no delete visible in this patch)
+    Timer* timer = nullptr;
 };
 
 #endif
diff --git a/src/gpu/VirtualFluids_GPU/Output/Timer.cpp b/src/gpu/VirtualFluids_GPU/Output/Timer.cpp
index 74a706165..b9ec5d9ba 100644
--- a/src/gpu/VirtualFluids_GPU/Output/Timer.cpp
+++ b/src/gpu/VirtualFluids_GPU/Output/Timer.cpp
@@ -25,6 +25,13 @@ void Timer::stopTimer()
         this->totalElapsedTime += this->elapsedTime;
 }
 
+float Timer::startStopGetElapsed() // lap helper: closes the running interval, immediately opens the next, returns the closed one
+{
+    this->stopTimer();  // ends the current interval; stopTimer() is also where totalElapsedTime is accumulated
+    this->startTimer(); // restart right away so the next interval begins at this call
+    return this->elapsedTime; // value left by the stopTimer() call above; unit not visible here (presumably milliseconds from CUDA events - confirm)
+}
+
 void Timer::resetTimer()
 {
         this->elapsedTime = 0.0;
diff --git a/src/gpu/VirtualFluids_GPU/Output/Timer.h b/src/gpu/VirtualFluids_GPU/Output/Timer.h
index d035cbb6c..f6b6bad3c 100644
--- a/src/gpu/VirtualFluids_GPU/Output/Timer.h
+++ b/src/gpu/VirtualFluids_GPU/Output/Timer.h
@@ -34,6 +34,8 @@ class Timer
     float getElapsedTime(){ return this->elapsedTime; }
     float getTotalElapsedTime(){ return this->totalElapsedTime; }
 
+    float startStopGetElapsed(); // stops the timer, restarts it, and returns elapsedTime of the interval just stopped
+
     private:
     
     cudaEvent_t start_t, stop_t;
diff --git a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/ActuatorFarm.cu b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/ActuatorFarm.cu
index 0e70df88e..72d349ff5 100644
--- a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/ActuatorFarm.cu
+++ b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/ActuatorFarm.cu
@@ -14,6 +14,9 @@
 #include "GPU/CudaMemoryManager.h"
 #include <lbm/constants/NumericConstants.h>
 #include <logger/Logger.h>
+#include <iostream>
+#include <ostream>
+#include "Output/Timer.h"
 
 using namespace vf::lbm::constant;
 
@@ -267,26 +270,38 @@ void ActuatorFarm::addTurbine(real posX, real posY, real posZ, real diameter, re
 
 void ActuatorFarm::init(Parameter* para, GridProvider* gridProvider, CudaMemoryManager* cudaMemoryManager)
 {
-    if(!para->getIsBodyForce()) throw std::runtime_error("try to allocate ActuatorFarm but BodyForce is not set in Parameter.");
+    if (!para->getIsBodyForce())
+        throw std::runtime_error("try to allocate ActuatorFarm but BodyForce is not set in Parameter.");
     this->forceRatio = para->getForceRatio();
     this->initTurbineGeometries(cudaMemoryManager);
-    this->initBladeCoords(cudaMemoryManager);    
+    this->initBladeCoords(cudaMemoryManager);
     this->initBladeIndices(para, cudaMemoryManager);
     this->initBladeVelocities(cudaMemoryManager);
-    this->initBladeForces(cudaMemoryManager);    
-    this->initBoundingSpheres(para, cudaMemoryManager);  
+    this->initBladeForces(cudaMemoryManager);
+    this->initBoundingSpheres(para, cudaMemoryManager);
     this->streamIndex = 0;
+
+    bladeTimer = new Timer("ALM blade performance"); // NOTE(review): raw owning pointer; a repeated init() would overwrite it - no delete is visible in this patch
+    bladeTimer->initTimer();
 }
 
-void ActuatorFarm::interact(Parameter* para, CudaMemoryManager* cudaMemoryManager, int level, unsigned int t)
+void ActuatorFarm::interact(Parameter* para, CudaMemoryManager* cudaMemoryManager, int currentLevel, unsigned int t)
 {
-    if (level != this->level) return;
+    if (currentLevel != this->level) return;
+    bool useTimer = false;
 
     cudaStream_t stream = para->getStreamManager()->getStream(CudaStreamIndex::ActuatorFarm, this->streamIndex);
 
+    if (useTimer) { // diagnostics are opt-in: announce the call and start the blade timer only when enabled
+        std::cout << "ActuatorFarm::interact: level = " << currentLevel << ", t = " << t << " useHostArrays = " << useHostArrays << std::endl;
+        bladeTimer->startTimer();
+    }
     if(useHostArrays) cudaMemoryManager->cudaCopyBladeCoordsHtoD(this);
 
-    vf::cuda::CudaGrid bladeGrid = vf::cuda::CudaGrid(para->getParH(level)->numberofthreads, this->numberOfNodes);
+    vf::cuda::CudaGrid bladeGrid = vf::cuda::CudaGrid(para->getParH(currentLevel)->numberofthreads, this->numberOfNodes);
+
+    if (useTimer) // lap since the timer was started; also spans the bladeGrid setup above
+        std::cout << " cudaCopyBladeCoordsHtoD, " << bladeTimer->startStopGetElapsed() << std::endl;
 
     interpolateVelocities<<< bladeGrid.grid, bladeGrid.threads, 0, stream >>>(
         para->getParD(this->level)->coordinateX, para->getParD(this->level)->coordinateY, para->getParD(this->level)->coordinateZ,        
@@ -299,14 +314,28 @@ void ActuatorFarm::interact(Parameter* para, CudaMemoryManager* cudaMemoryManage
         this->turbinePosXD, this->turbinePosYD, this->turbinePosZD,
         this->bladeIndicesD, para->getVelocityRatio(), this->invDeltaX);
 
+    if (useTimer) // NOTE(review): taken before cudaStreamSynchronize(stream) below - confirm the lap really covers the kernel's run time
+        std::cout << " interpolateVelocities, " << bladeTimer->startStopGetElapsed() << std::endl;
+
     cudaStreamSynchronize(stream);
     if(useHostArrays) cudaMemoryManager->cudaCopyBladeVelocitiesDtoH(this);
+
+    if (useTimer) // lap spans the stream synchronisation and the optional device-to-host copy above
+        std::cout << " cudaCopyBladeVelocitiesDtoH, " << bladeTimer->startStopGetElapsed() << std::endl;
+
     this->calcBladeForces();
+
+    if (useTimer) // lap for the calcBladeForces() call above
+        std::cout << " calcBladeForces, " << bladeTimer->startStopGetElapsed() << std::endl;
+
     this->swapDeviceArrays();
 
     if(useHostArrays) cudaMemoryManager->cudaCopyBladeForcesHtoD(this);
 
-    vf::cuda::CudaGrid sphereGrid = vf::cuda::CudaGrid(para->getParH(level)->numberofthreads, this->numberOfIndices);
+    if (useTimer) // lap spans swapDeviceArrays() and the optional host-to-device force copy above
+        std::cout << " cudaCopyBladeForcesHtoD, " << bladeTimer->startStopGetElapsed() << std::endl;
+
+    vf::cuda::CudaGrid sphereGrid = vf::cuda::CudaGrid(para->getParH(currentLevel)->numberofthreads, this->numberOfIndices);
 
     applyBodyForces<<<sphereGrid.grid, sphereGrid.threads, 0, stream>>>(
         para->getParD(this->level)->coordinateX, para->getParD(this->level)->coordinateY, para->getParD(this->level)->coordinateZ,        
@@ -318,12 +347,25 @@ void ActuatorFarm::interact(Parameter* para, CudaMemoryManager* cudaMemoryManage
         this->turbinePosXD, this->turbinePosYD, this->turbinePosZD,
         this->boundingSphereIndicesD, this->numberOfIndices,
         this->invEpsilonSqrd, this->factorGaussian);
+
+    if (useTimer) // NOTE(review): taken right after the kernel launch, before any synchronisation - confirm what this lap measures
+        std::cout << " applyBodyForces, " << bladeTimer->startStopGetElapsed() << std::endl;
+
     cudaMemoryManager->cudaCopyBladeOrientationsHtoD(this);
+
+    if (useTimer) { // final lap plus the accumulated total; the timer is cleared only when diagnostics are enabled
+        std::cout << " cudaCopyBladeOrientationsHtoD, " << bladeTimer->startStopGetElapsed() << std::endl;
+        std::cout << "total time, " << bladeTimer->getTotalElapsedTime() << std::endl;
+        bladeTimer->resetTimer();
+    }
+
     cudaStreamSynchronize(stream);
+
+
 }
 
 
-void ActuatorFarm::free(Parameter* para, CudaMemoryManager* cudaMemoryManager)
+void ActuatorFarm::free(Parameter*  /*para*/, CudaMemoryManager* cudaMemoryManager)
 {
     cudaMemoryManager->cudaFreeBladeGeometries(this);
     cudaMemoryManager->cudaFreeBladeOrientations(this);
diff --git a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/ActuatorFarm.h b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/ActuatorFarm.h
index 8e21cdb6b..e6066e286 100644
--- a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/ActuatorFarm.h
+++ b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/ActuatorFarm.h
@@ -10,6 +10,7 @@ using namespace vf::lbm::constant;
 
 class Parameter;
 class GridProvider;
+class Timer;
 using namespace vf::lbm::constant;
 
 class ActuatorFarm : public PreCollisionInteractor
@@ -192,6 +193,8 @@ private:
     uint numberOfNodes;
     real forceRatio, factorGaussian, invEpsilonSqrd, invDeltaX;
     int streamIndex;
+
+    Timer* bladeTimer = nullptr; // timer behind the optional phase diagnostics in interact(); allocated with new in init()
 };
 
 #endif
-- 
GitLab