diff --git a/src/gpu/VirtualFluids_GPU/Communication/Communicator.cpp b/src/gpu/VirtualFluids_GPU/Communication/Communicator.cpp
index a5296b1202103fc1732befe3b6d8bea238841fe6..155251a3273c8976c058eddad760b8808b451433 100644
--- a/src/gpu/VirtualFluids_GPU/Communication/Communicator.cpp
+++ b/src/gpu/VirtualFluids_GPU/Communication/Communicator.cpp
@@ -231,6 +231,16 @@ std::vector<double> Communicator::gatherNUPS(double processNups)
     return std::vector<double>(); 
 }
 
+// Returns the sum of processNups over all ranks of commGPU; collective, so every rank must call it.
+double Communicator::sumNups(double processNups)
+{
+    // Stack storage instead of malloc (the heap buffer was never freed); MPI_Allreduce delivers
+    // the sum to every rank, whereas MPI_Reduce left the receive buffer unwritten on ranks != 0.
+    double nupsSum = 0.0;
+    MPI_Allreduce(&processNups, &nupsSum, 1, MPI_DOUBLE, MPI_SUM, commGPU);
+    return nupsSum;
+}
+
 void vf::gpu::Communicator::exchangeIndices(uint *rbuf, int count_r, int nb_rank_r, uint *sbuf, int count_s,
                                             int nb_rank_s)
 {
diff --git a/src/gpu/VirtualFluids_GPU/Communication/Communicator.h b/src/gpu/VirtualFluids_GPU/Communication/Communicator.h
index 3308c6b3ae964144446777c62781c2c2ad3049cb..6227dbd8210ea27013ad252cf64f399c611a9d75 100644
--- a/src/gpu/VirtualFluids_GPU/Communication/Communicator.h
+++ b/src/gpu/VirtualFluids_GPU/Communication/Communicator.h
@@ -58,6 +58,7 @@ public:
     double getTime();
     int mapCudaDevice(const int &rank, const int &size, const std::vector<unsigned int> &devices, const int &maxdev);
     std::vector<double> gatherNUPS(double processNups);
+    double sumNups(double processNups);
     //////////////////////////////////////////////////////////////////////////
     void exchangeIndices(uint *rbuf, int count_r, int nb_rank_r, uint *sbuf, int count_s, int nb_rank_s);
 private:
diff --git a/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp b/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp
index 706f3da00fe2f98bedd1975951cc8d0a8f189a7d..342c9288ef82fb0a13a3e5e2b66db47db6b0b12e 100644
--- a/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp
+++ b/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp
@@ -706,7 +706,7 @@ void Simulation::run()
 
 		//////////////////////////////////////////////////////////////////////////
 		averageTimer->stopTimer();
-		averageTimer->outputPerformance(t, para.get());
+		averageTimer->outputPerformance(t, para.get(), communicator);
 		//////////////////////////////////////////////////////////////////////////
 
          if( para->getPrintFiles() )
diff --git a/src/gpu/VirtualFluids_GPU/Output/Timer.cpp b/src/gpu/VirtualFluids_GPU/Output/Timer.cpp
index b08a4ea11e6227d460ef5913695ebebf2474a02b..4fb7b223ff8480af20075b54a7037a4d27022708 100644
--- a/src/gpu/VirtualFluids_GPU/Output/Timer.cpp
+++ b/src/gpu/VirtualFluids_GPU/Output/Timer.cpp
@@ -31,7 +31,7 @@ void Timer::resetTimer()
         this->totalElapsedTime = 0.0;
 }
 
-void Timer::outputPerformance(uint t, Parameter* para)
+void Timer::outputPerformance(uint t, Parameter* para, vf::gpu::Communicator& communicator)
 {
     real fnups      = 0.0;
     real bandwidth  = 0.0;
@@ -42,25 +42,17 @@ void Timer::outputPerformance(uint t, Parameter* para)
         bandwidth   += (27.0+1.0) * 4.0 * 1000.0 * (t-para->getTStart()) * para->getParH(lev)->size_Mat_SP  / (this->totalElapsedTime*1.0E9);
     }
 
-    if(this->firstOutput)
+    if(this->firstOutput && communicator.getPID() == 0) //only display the legend once
     {
-        VF_LOG_INFO(" --- {} --- Processing time (ms) \t Nups in Mio \t Bandwidth in GB/sec", this->name );
+        VF_LOG_INFO("PID \t --- {} --- Processing time (ms) \t Nups in Mio \t Bandwidth in GB/sec", this->name );
         this->firstOutput = false;
     }
 
-    VF_LOG_INFO(" --- {} --- {}/{} \t {} \t {}", this->name, this->elapsedTime, this->totalElapsedTime, fnups, bandwidth  );
+    VF_LOG_INFO(" {} \t --- {} --- {}/{} \t {} \t {}",  communicator.getPID(), this->name, this->elapsedTime, this->totalElapsedTime, fnups, bandwidth);
 
-    // When using multiple GPUs, get Nups of all processes
-	if (para->getMaxDev() > 1) {
-        vf::gpu::Communicator& comm=vf::gpu::Communicator::getInstance();
-        std::vector<double> nups = comm.gatherNUPS(fnups);
-        if (comm.getPID() == 0) {
-			double sum = 0;
-            for (uint pid = 0; pid < nups.size(); pid++) {
-                VF_LOG_INFO("Process {}: \t NUPS in Mio: {}", pid, nups[pid]);
-                sum += nups[pid];
-			}
-            VF_LOG_INFO("Sum of all processes: Nups in Mio: {}", sum);
-		}
-	}
+    // When using multiple GPUs, sum the nups of all processes
+    if (communicator.getNummberOfProcess() > 1) {
+        double nupsSum =  communicator.sumNups(fnups);
+        VF_LOG_INFO("Sum of all {}\t processes: Nups in Mio: {}", communicator.getNummberOfProcess(), nupsSum);
+    }
 }
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/Output/Timer.h b/src/gpu/VirtualFluids_GPU/Output/Timer.h
index 6432b347458e68a5089aea3de625017d6facd34b..26be785c7f76b7695656c9600bdb586804dca251 100644
--- a/src/gpu/VirtualFluids_GPU/Output/Timer.h
+++ b/src/gpu/VirtualFluids_GPU/Output/Timer.h
@@ -9,6 +9,10 @@
 #include "logger/Logger.h"
 #include "Parameter/Parameter.h"
 
+namespace vf::gpu{
+    class Communicator; // forward declaration: outputPerformance() below only takes it by reference
+}
+
 class Timer
 {
     public:
@@ -27,7 +31,7 @@ class Timer
     void startTimer();
     void stopTimer();
     void resetTimer();
-    void outputPerformance(uint t, Parameter* para);
+    void outputPerformance(uint t, Parameter* para, vf::gpu::Communicator& communicator);
 
     float getElapsedTime(){ return this->elapsedTime; }
     float getTotalElapsedTime(){ return this->totalElapsedTime; }