From edfbd9916e8aa70ddecf571a47fa2f0e737b0c51 Mon Sep 17 00:00:00 2001
From: Soeren Peters <peters@irmb.tu-bs.de>
Date: Sun, 23 Jul 2023 18:15:58 +0000
Subject: [PATCH] [Communicator] Use the same communicator for GPU and CPU

- renamed the mpi module to parallel
- removed the CommunicationRoutine interface from gpu; the GPU code now also uses the Communicator interface from parallel (see the migration sketch below)
- added missing functionality to parallel/Communicator
- moved the ws2_32 library dependency on Windows from VirtualFluids_GPU to parallel
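
A minimal migration sketch for downstream C++ code. The include path is an
assumption based on the src/mpi -> src/parallel rename; SPtr is the project's
shared-pointer alias:

    // before: gpu code used its own Communicator, cpu code used vf::mpi
    // SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();

    // after: gpu and cpu code both obtain the communicator from parallel
    #include <parallel/MPICommunicator.h>  // assumed header location after the rename
    SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
    int myid = comm->getProcessID();       // process rank, as used throughout the apps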
---
 .clang-format                                 |   2 +-
 CMakeLists.txt                                |   2 +-
 Python/actuator_line/actuator_line.py         |   6 +-
 Python/boundary_layer/boundary_layer.py       |   6 +-
 apps/cpu/AcousticPulse/ap.cpp                 |   2 +-
 apps/cpu/BeadPack/beadpack.cpp                |   2 +-
 apps/cpu/BoxBenchmark/bb.cpp                  |   2 +-
 apps/cpu/CheckpointConverter/cpc.cpp          |   2 +-
 apps/cpu/ConvectionOfVortex/cov.cpp           |   2 +-
 apps/cpu/CouetteFlow/cflow.cpp                |   2 +-
 apps/cpu/DHIT/dhit.cpp                        |   2 +-
 apps/cpu/DLR-F16-Porous/f16.cpp               |   2 +-
 apps/cpu/DLR-F16-Solid/f16.cpp                |   2 +-
 apps/cpu/DLR-F16/f16.cpp                      |   4 +-
 apps/cpu/FallingSphere/FallingSphere.cpp      |   2 +-
 apps/cpu/FlowAroundCylinder/cylinder.cpp      |   2 +-
 apps/cpu/FlowAroundCylinder/cylinder.cpp.old  |   4 +-
 apps/cpu/Hagen_Poiseuille_flow/pflow.cpp      |   4 +-
 apps/cpu/Hagen_Poiseuille_flow2/pflow2.cpp    |   2 +-
 apps/cpu/HerschelBulkleyModel/hbflow.cpp      |   2 +-
 apps/cpu/HerschelBulkleySphere/hbsphere.cpp   |   2 +-
 apps/cpu/InterfaceTest/itest.cpp              |   2 +-
 apps/cpu/JetBreakup/JetBreakup.cpp            |   2 +-
 apps/cpu/JetBreakup/JetBreakup.cpp.new        |   2 +-
 apps/cpu/LaminarTubeFlow/ltf.cpp              |   2 +-
 apps/cpu/LaminarTubeFlowConv/ltf.cpp          |   2 +-
 apps/cpu/LidDrivenCavity/LidDrivenCavity.cpp  |   2 +-
 apps/cpu/LiggghtsApp/LiggghtsApp.cpp          |   2 +-
 .../Multiphase (Droplet Test).cpp.backup      |   2 +-
 ...iphase (Jet breakup on Phoenix).cpp.backup |   2 +-
 apps/cpu/Multiphase/Multiphase.cpp            |   2 +-
 .../backup/Multiphase (Droplet Test).cpp      |   2 +-
 .../Multiphase (Final before automation).cpp  |   2 +-
 .../backup/Multiphase (Flow Focusing).cpp     |   2 +-
 .../Multiphase (Jet breakup on Phoenix).cpp   |   2 +-
 .../backup/Multiphase (T-Junction).cpp        |   2 +-
 .../backup/Multiphase (Thermal).cpp           |   2 +-
 .../Multiphase/backup/Multiphase (Tube).cpp   |   2 +-
 apps/cpu/Multiphase/backup/Multiphase.cpp     |   2 +-
 apps/cpu/MultiphaseDropletTest/droplet.cpp    |   2 +-
 apps/cpu/Nozzle/nozzle.cpp                    |   2 +-
 apps/cpu/OrganPipe/OrganPipe.cpp              |   2 +-
 apps/cpu/PlateWithPorousInlay/plate.cpp       |   2 +-
 apps/cpu/PoiseuilleFlow/pf1.cpp               |   2 +-
 apps/cpu/PoiseuilleFlow/pf2.cpp               |   2 +-
 apps/cpu/PoiseuilleFlow/pf3.cpp               |   2 +-
 apps/cpu/PoiseuilleFlow/pf4.cpp               |   2 +-
 apps/cpu/RisingBubble2D/RisingBubble2D.cpp    |   2 +-
 apps/cpu/TPMSRow/TPMSRow.cpp                  |   2 +-
 apps/cpu/ViskomatXL/viskomat.cpp              |   2 +-
 apps/cpu/Wing/wing.cpp                        |   2 +-
 apps/cpu/aperm/aperm.cpp                      |   2 +-
 apps/cpu/aperm/aperm.cpp.old                  |   2 +-
 apps/cpu/aperm/aperm.cpp.old2                 |   2 +-
 apps/cpu/bChannelA/bChannelA.cpp              |   2 +-
 apps/cpu/bChannelVA/bChannelVA.cpp            |   2 +-
 apps/cpu/bKanal/HLRNb/bKanal.cpp              |   2 +-
 apps/cpu/bKanal/bKanal.cpp                    |   2 +-
 apps/cpu/bKanal/sKanal/bKanal.cpp             |   2 +-
 apps/cpu/bKanal2/bKanal2.cpp                  |   2 +-
 apps/cpu/bKanalAv/bKanal.cpp                  |   2 +-
 apps/cpu/band/band.cpp                        |   2 +-
 apps/cpu/bbone/bbone.cpp                      |   2 +-
 .../block_test/block_test_incompressible.hpp  |   2 +-
 apps/cpu/bond_benchmark/bonb_b_chanel.cpp     |   2 +-
 apps/cpu/bond_benchmark/bond_b.cpp            |   2 +-
 apps/cpu/bond_test/bond_test.cpp              |   2 +-
 apps/cpu/bone/bone.cpp                        |   2 +-
 apps/cpu/f16Test/f16test.cpp                  |   2 +-
 apps/cpu/insitu_demo/insitu_demo.cpp          |   2 +-
 apps/cpu/levels/levels.cpp                    |   2 +-
 apps/cpu/micropart/micropartTestQs3.hpp       |   2 +-
 apps/cpu/mirror/mirror.cpp                    |   2 +-
 apps/cpu/mpi_benchmark/mpib.cpp               |   2 +-
 apps/cpu/pChannel/pChannel.cpp                |   2 +-
 apps/cpu/pChannel/pChannel.cpp.hlrn           |   2 +-
 apps/cpu/pDisk/pdisk.cpp                      |   2 +-
 apps/cpu/perm/perm.cpp                        |   2 +-
 apps/cpu/perm/perm.cpp_s                      |   2 +-
 apps/cpu/plate/plate.cpp                      |   2 +-
 apps/cpu/plate2/plate2.cpp                    |   2 +-
 apps/cpu/poiseuille_example/poiseuille.cpp    |   2 +-
 apps/cpu/porplate2/porplate.cpp               |   2 +-
 apps/cpu/rheometer/rheometer.cpp              |   2 +-
 apps/cpu/sbone/sbone.cpp                      |   2 +-
 apps/cpu/screw/screw.cpp                      |   2 +-
 apps/cpu/sphere/sphere.cpp                    |   2 +-
 apps/cpu/stick/stick.cpp                      |   2 +-
 apps/cpu/teperm/teperm.cpp                    |   2 +-
 apps/cpu/town/town.cpp                        |   2 +-
 apps/gpu/ActuatorLine/ActuatorLine.cpp        |  28 +-
 apps/gpu/ActuatorLine/CMakeLists.txt          |   2 +-
 apps/gpu/BoundaryLayer/BoundaryLayer.cpp      |  34 ++-
 apps/gpu/BoundaryLayer/CMakeLists.txt         |   2 +-
 apps/gpu/ChannelFlow/CMakeLists.txt           |   2 +-
 apps/gpu/ChannelFlow/ChannelFlow.cpp          |  19 +-
 apps/gpu/DrivenCavity/CMakeLists.txt          |   2 +-
 apps/gpu/DrivenCavity/DrivenCavity.cpp        |   5 +-
 apps/gpu/DrivenCavityMultiGPU/CMakeLists.txt  |   2 +-
 .../DrivenCavityMultiGPU.cpp                  |  19 +-
 apps/gpu/DrivenCavityUniform/CMakeLists.txt   |   2 +-
 apps/gpu/DrivenCavityUniform/DrivenCavity.cpp |   9 +-
 apps/gpu/MusselOyster/CMakeLists.txt          |   2 +-
 apps/gpu/MusselOyster/MusselOyster.cpp        |  14 +-
 apps/gpu/SphereGPU/CMakeLists.txt             |   2 +-
 apps/gpu/SphereGPU/Sphere.cpp                 |  11 +-
 apps/gpu/SphereRefined/CMakeLists.txt         |   2 +-
 apps/gpu/SphereRefined/SphereRefined.cpp      |   5 +-
 apps/gpu/SphereScaling/CMakeLists.txt         |   2 +-
 apps/gpu/SphereScaling/SphereScaling.cpp      |  20 +-
 apps/gpu/TGV_3D/TGV_3D.cpp                    |  13 +-
 apps/gpu/WTG_RUB/CMakeLists.txt               |   2 +-
 apps/gpu/WTG_RUB/WTG_RUB.cpp                  |  15 +-
 apps/gpu/gridGeneratorTest/gridGenerator.cpp  |   9 +-
 .../VirtualFluidSimulationFactory.cpp         |   6 +-
 pythonbindings/CMakeLists.txt                 |  32 ++-
 pythonbindings/pyfluids/__init__.py           |  14 +-
 .../{VirtualFluids.cpp => communicator.cpp}   |  41 +--
 pythonbindings/src/gpu/gpu.cpp                |  38 ++-
 .../src/gpu/submodules/communicator.cpp       |  52 ----
 .../src/gpu/submodules/simulation.cpp         |   8 +-
 .../LiggghtsCouplingSimulationObserver.cpp    |   2 +-
 .../LiggghtsCouplingSimulationObserver.h      |   6 +-
 ...MultiphaseQuantitiesSimulationObserver.cpp |   4 +-
 ...teMultiphaseQuantitiesSimulationObserver.h |   6 +-
 ...pInterfaceQuantitiesSimulationObserver.cpp |   4 +-
 ...arpInterfaceQuantitiesSimulationObserver.h |   6 +-
 .../CalculateTorqueSimulationObserver.cpp     |   4 +-
 .../CalculateTorqueSimulationObserver.h       |   6 +-
 ...ThixotropyQuantitiesSimulationObserver.cpp |   2 +-
 ...teThixotropyQuantitiesSimulationObserver.h |   6 +-
 src/cpu/VirtualFluids.h                       |   6 +-
 src/cpu/VirtualFluidsCore/CMakeLists.txt      |   2 +-
 .../Interactors/InteractorsHelper.cpp         |   4 +-
 .../Parallel/BlocksDistributor.cpp            |   2 +-
 .../Parallel/BlocksDistributor.h              |   6 +-
 .../VirtualFluidsCore/Simulation/Grid3D.cpp   |   6 +-
 src/cpu/VirtualFluidsCore/Simulation/Grid3D.h |   8 +-
 .../AdjustForcingSimulationObserver.cpp       |   4 +-
 .../AdjustForcingSimulationObserver.h         |   6 +-
 .../AverageValuesSimulationObserver.cpp       |   6 +-
 .../CalculateForcesSimulationObserver.cpp     |   4 +-
 .../CalculateForcesSimulationObserver.h       |   6 +-
 .../CalculateTorqueSimulationObserver.cpp     |   4 +-
 .../DecreaseViscositySimulationObserver.cpp   |   4 +-
 .../DecreaseViscositySimulationObserver.h     |   6 +-
 .../EmergencyExitSimulationObserver.cpp       |   4 +-
 .../EmergencyExitSimulationObserver.h         |   6 +-
 .../SimulationObservers/ForceCalculator.cpp   |   4 +-
 .../SimulationObservers/ForceCalculator.h     |   6 +-
 .../InSituCatalystSimulationObserver.cpp      |   2 +-
 .../InSituVTKSimulationObserver.cpp           |   4 +-
 .../IntegrateValuesHelper.cpp                 |   4 +-
 .../IntegrateValuesHelper.h                   |   8 +-
 .../LineTimeSeriesSimulationObserver.cpp      |   4 +-
 .../LineTimeSeriesSimulationObserver.h        |   4 +-
 .../MPIIOMigrationBESimulationObserver.cpp    |   4 +-
 .../MPIIOMigrationBESimulationObserver.h      |   4 +-
 .../MPIIOMigrationSimulationObserver.cpp      |   4 +-
 .../MPIIOMigrationSimulationObserver.h        |   4 +-
 .../MPIIORestartSimulationObserver.cpp        |   4 +-
 .../MPIIORestartSimulationObserver.h          |   4 +-
 .../MPIIOSimulationObserver.cpp               |   4 +-
 .../MPIIOSimulationObserver.h                 |   6 +-
 .../MicrophoneArraySimulationObserver.cpp     |   4 +-
 .../MicrophoneArraySimulationObserver.h       |   6 +-
 .../NUPSCounterSimulationObserver.cpp         |   4 +-
 .../NUPSCounterSimulationObserver.h           |   6 +-
 .../PressureCoefficientSimulationObserver.cpp |   4 +-
 .../PressureCoefficientSimulationObserver.h   |   6 +-
 .../PressureDifferenceSimulationObserver.cpp  |   4 +-
 .../PressureDifferenceSimulationObserver.h    |   6 +-
 .../QCriterionSimulationObserver.cpp          |   4 +-
 .../QCriterionSimulationObserver.h            |   6 +-
 .../ShearStressSimulationObserver.cpp         |   8 +-
 .../TimeAveragedValuesSimulationObserver.cpp  |   6 +-
 .../TimeAveragedValuesSimulationObserver.h    |   8 +-
 .../TimeseriesSimulationObserver.cpp          |   4 +-
 .../TimeseriesSimulationObserver.h            |   6 +-
 .../TurbulenceIntensitySimulationObserver.cpp |   4 +-
 .../TurbulenceIntensitySimulationObserver.h   |   6 +-
 .../WriteBlocksSimulationObserver.cpp         |   4 +-
 .../WriteBlocksSimulationObserver.h           |   6 +-
 ...teBoundaryConditionsSimulationObserver.cpp |   4 +-
 ...riteBoundaryConditionsSimulationObserver.h |   6 +-
 .../WriteGbObjectsSimulationObserver.cpp      |   4 +-
 .../WriteGbObjectsSimulationObserver.h        |   6 +-
 ...WriteMQFromSelectionSimulationObserver.cpp |   6 +-
 .../WriteMQFromSelectionSimulationObserver.h  |   6 +-
 ...icQuantitiesPlusMassSimulationObserver.cpp |   2 +-
 ...opicQuantitiesPlusMassSimulationObserver.h |   6 +-
 ...acroscopicQuantitiesSimulationObserver.cpp |   4 +-
 ...eMacroscopicQuantitiesSimulationObserver.h |   6 +-
 .../Utilities/CheckpointConverter.cpp         |   4 +-
 .../Utilities/CheckpointConverter.h           |   6 +-
 .../Visitors/CreateTransmittersHelper.cpp     |   2 +-
 .../Visitors/CreateTransmittersHelper.h       |   4 +-
 .../Visitors/MetisPartitioningGridVisitor.cpp |   4 +-
 .../Visitors/MetisPartitioningGridVisitor.h   |   6 +-
 .../Visitors/RefineAroundGbObjectHelper.cpp   |   4 +-
 .../Visitors/RefineAroundGbObjectHelper.h     |   6 +-
 .../RefineCrossAndInsideGbObjectHelper.cpp    |   4 +-
 .../RefineCrossAndInsideGbObjectHelper.h      |   6 +-
 .../Visitors/RenumberGridVisitor.cpp          |   2 +-
 .../Visitors/RenumberGridVisitor.h            |   6 +-
 .../Visitors/SetConnectorsBlockVisitor.h      |   8 +-
 ...SetInterpolationConnectorsBlockVisitor.cpp |   4 +-
 .../SetInterpolationConnectorsBlockVisitor.h  |   6 +-
 .../ZoltanPartitioningGridVisitor.cpp         |   2 +-
 .../Visitors/ZoltanPartitioningGridVisitor.h  |   6 +-
 src/cpu/simulationconfig/Simulation.cpp       |   4 +-
 src/cpu/simulationconfig/Simulation.h         |   4 +-
 .../grid/GridBuilder/LevelGridBuilder.h       |   2 +-
 src/gpu/VirtualFluids_GPU/CMakeLists.txt      |   6 +-
 .../Calculation/UpdateGrid27.cpp              |   4 +-
 .../Calculation/UpdateGrid27.h                |  18 +-
 .../Communication/CommunicationRoutine.h      |  19 --
 .../Communication/CommunicationRoutineMocks.h |  21 --
 .../Communication/Communicator.h              |  39 ---
 .../Communication/ExchangeData27.cpp          | 193 ++++++--------
 .../Communication/ExchangeData27.h            |  49 ++--
 .../Communication/MpiCommunicator.cpp         | 242 ------------------
 .../Communication/MpiCommunicator.h           |  78 ------
 .../DataStructureInitializer/GridProvider.cpp |   2 +-
 .../DataStructureInitializer/GridProvider.h   |   6 +-
 .../GridReaderGenerator/GridGenerator.cpp     |   7 +-
 .../GridReaderGenerator/GridGenerator.h       |   6 +-
 .../GridReaderGenerator/GridGeneratorTest.cpp |   9 +-
 .../IndexRearrangementForStreams.cpp          |  15 +-
 .../IndexRearrangementForStreams.h            |   8 +-
 .../IndexRearrangementForStreamsTest.cpp      |  63 +++--
 src/gpu/VirtualFluids_GPU/Init/VfReader.cpp   |   1 -
 src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp  |  10 +-
 src/gpu/VirtualFluids_GPU/LBM/Simulation.h    |  14 +-
 .../Output/EdgeNodeDebugWriter.hpp            |  11 +-
 .../Output/InterfaceDebugWriter.hpp           |  18 +-
 .../Output/NeighborDebugWriter.hpp            |   1 -
 .../Output/QDebugVtkWriter.hpp                |   1 -
 src/gpu/VirtualFluids_GPU/Output/Timer.cpp    |  17 +-
 src/gpu/VirtualFluids_GPU/Output/Timer.h      |  14 +-
 .../Parameter/ParameterTest.cpp               |   7 +-
 src/mpi/CMakeLists.txt                        |   2 -
 src/parallel/CMakeLists.txt                   |   6 +
 src/{mpi => parallel}/Communicator.cpp        |   2 +-
 src/{mpi => parallel}/Communicator.h          |  28 +-
 src/{mpi => parallel}/MPICommunicator.cpp     | 140 +++++++++-
 src/{mpi => parallel}/MPICommunicator.h       |  39 ++-
 src/{mpi => parallel}/NullCommunicator.cpp    |  47 +++-
 src/{mpi => parallel}/NullCommunicator.h      |  92 ++++---
 249 files changed, 1006 insertions(+), 1279 deletions(-)
 rename pythonbindings/src/{VirtualFluids.cpp => communicator.cpp} (72%)
 delete mode 100644 pythonbindings/src/gpu/submodules/communicator.cpp
 delete mode 100644 src/gpu/VirtualFluids_GPU/Communication/CommunicationRoutine.h
 delete mode 100644 src/gpu/VirtualFluids_GPU/Communication/CommunicationRoutineMocks.h
 delete mode 100644 src/gpu/VirtualFluids_GPU/Communication/Communicator.h
 delete mode 100644 src/gpu/VirtualFluids_GPU/Communication/MpiCommunicator.cpp
 delete mode 100644 src/gpu/VirtualFluids_GPU/Communication/MpiCommunicator.h
 delete mode 100644 src/mpi/CMakeLists.txt
 create mode 100644 src/parallel/CMakeLists.txt
 rename src/{mpi => parallel}/Communicator.cpp (99%)
 rename src/{mpi => parallel}/Communicator.h (81%)
 rename src/{mpi => parallel}/MPICommunicator.cpp (61%)
 rename src/{mpi => parallel}/MPICommunicator.h (84%)
 rename src/{mpi => parallel}/NullCommunicator.cpp (80%)
 rename src/{mpi => parallel}/NullCommunicator.h (56%)

diff --git a/.clang-format b/.clang-format
index 915697a4a..600543875 100644
--- a/.clang-format
+++ b/.clang-format
@@ -50,7 +50,7 @@ BreakConstructorInitializersBeforeComma: false
 BreakConstructorInitializers: BeforeColon
 BreakAfterJavaFieldAnnotations: false
 BreakStringLiterals: true
-ColumnLimit:     300
+ColumnLimit:     125
 CommentPragmas:  '^ IWYU pragma:'
 CompactNamespaces: false
 ConstructorInitializerAllOnOneLineOrOnePerLine: false
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 91f4f5121..db049189b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -210,7 +210,7 @@ ENDIF()
 
 add_subdirectory(src/logger)
 add_subdirectory(src/basics)
-add_subdirectory(src/mpi)
+add_subdirectory(src/parallel)
 add_subdirectory(src/lbm)
 
 
diff --git a/Python/actuator_line/actuator_line.py b/Python/actuator_line/actuator_line.py
index 3e2a196ab..00fa21bae 100644
--- a/Python/actuator_line/actuator_line.py
+++ b/Python/actuator_line/actuator_line.py
@@ -36,7 +36,7 @@ r"""
 import numpy as np
 from pathlib import Path
 from mpi4py import MPI
-from pyfluids import basics, gpu, logger
+from pyfluids import basics, gpu, logger, communicator
 #%%
 sim_name = "ABL"
 config_file = Path(__file__).parent/"configActuatorLine.txt"
@@ -48,12 +48,12 @@ output_path.mkdir(exist_ok=True)
 logger.Logger.initialize_logger()
 
 grid_builder = gpu.grid_generator.MultipleGridBuilder()
-communicator = gpu.MpiCommunicator.get_instance()
+communicator = communicator.Communicator.get_instance()
 
 config = basics.ConfigurationFile()
 config.load(str(config_file))
 
-para = gpu.Parameter(communicator.get_number_of_process(), communicator.get_pid(), config)
+para = gpu.Parameter(communicator.get_number_of_processes(), communicator.get_process_id(), config)
 bc_factory = gpu.BoundaryConditionFactory()
 
 #%%
diff --git a/Python/boundary_layer/boundary_layer.py b/Python/boundary_layer/boundary_layer.py
index 647f16dbc..d2efffeac 100644
--- a/Python/boundary_layer/boundary_layer.py
+++ b/Python/boundary_layer/boundary_layer.py
@@ -36,7 +36,7 @@ r"""
 import numpy as np
 from pathlib import Path
 from mpi4py import MPI
-from pyfluids import basics, gpu, logger
+from pyfluids import basics, gpu, logger, communicator
 #%%
 sim_name = "ABL"
 config_file = Path(__file__).parent/"configBoundaryLayer.txt"
@@ -49,12 +49,12 @@ logger.Logger.initialize_logger()
 
 #%%
 grid_builder = gpu.grid_generator.MultipleGridBuilder()
-communicator = gpu.MpiCommunicator.get_instance()
+communicator = communicator.Communicator.get_instance()
 
 config = basics.ConfigurationFile()
 config.load(str(config_file))
 
-para = gpu.Parameter(communicator.get_number_of_process(), communicator.get_pid(), config)
+para = gpu.Parameter(communicator.get_number_of_processes(), communicator.get_process_id(), config)
 bc_factory = gpu.BoundaryConditionFactory()
 
 #%%
diff --git a/apps/cpu/AcousticPulse/ap.cpp b/apps/cpu/AcousticPulse/ap.cpp
index ac69eee6d..a04952a35 100644
--- a/apps/cpu/AcousticPulse/ap.cpp
+++ b/apps/cpu/AcousticPulse/ap.cpp
@@ -10,7 +10,7 @@ void run()
 {
    try
    {
-      SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+      SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       int    numOfThreads = 4;
diff --git a/apps/cpu/BeadPack/beadpack.cpp b/apps/cpu/BeadPack/beadpack.cpp
index d683fc445..3da3030bb 100644
--- a/apps/cpu/BeadPack/beadpack.cpp
+++ b/apps/cpu/BeadPack/beadpack.cpp
@@ -23,7 +23,7 @@ void sbonepd(const char *configname)
          throw exceptionText;
       }
 
-      CommunicatorPtr comm = vf::mpi::MPICommunicator::getInstance();
+      CommunicatorPtr comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if (machine == "BOMBADIL")
diff --git a/apps/cpu/BoxBenchmark/bb.cpp b/apps/cpu/BoxBenchmark/bb.cpp
index 0f0c7d6c1..3915f9062 100644
--- a/apps/cpu/BoxBenchmark/bb.cpp
+++ b/apps/cpu/BoxBenchmark/bb.cpp
@@ -37,7 +37,7 @@ void run(string configname)
 
       //UbLog::reportingLevel() = UbLog::logLevelFromString("DEBUG3");
 
-      SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+      SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if (logToFile)
diff --git a/apps/cpu/CheckpointConverter/cpc.cpp b/apps/cpu/CheckpointConverter/cpc.cpp
index 4eb526cc7..a34e758db 100644
--- a/apps/cpu/CheckpointConverter/cpc.cpp
+++ b/apps/cpu/CheckpointConverter/cpc.cpp
@@ -17,7 +17,7 @@ void run(string configname)
       int    step = config.getValue<int>("step");
       int    numberOfProcesses = config.getValue<int>("numberOfProcesses");
       
-      SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+      SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       SPtr<Grid3D> grid(new Grid3D(comm));
diff --git a/apps/cpu/ConvectionOfVortex/cov.cpp b/apps/cpu/ConvectionOfVortex/cov.cpp
index bfe29fc9b..102a1ad7d 100644
--- a/apps/cpu/ConvectionOfVortex/cov.cpp
+++ b/apps/cpu/ConvectionOfVortex/cov.cpp
@@ -12,7 +12,7 @@ void run()
 
    try
    {
-      SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+      SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       int    numOfThreads = 4;
diff --git a/apps/cpu/CouetteFlow/cflow.cpp b/apps/cpu/CouetteFlow/cflow.cpp
index 112c0c96b..6c7e28f5a 100644
--- a/apps/cpu/CouetteFlow/cflow.cpp
+++ b/apps/cpu/CouetteFlow/cflow.cpp
@@ -40,7 +40,7 @@ void bflow(string configname)
 //      double          Re = config.getValue<double>("Re");
 //      double          Bn = config.getValue<double>("Bn");
 
-      SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+      SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if (logToFile)
diff --git a/apps/cpu/DHIT/dhit.cpp b/apps/cpu/DHIT/dhit.cpp
index e06db26b8..3143aa1c8 100644
--- a/apps/cpu/DHIT/dhit.cpp
+++ b/apps/cpu/DHIT/dhit.cpp
@@ -29,7 +29,7 @@ void run(string configname)
       double          lambda = config.getDouble("lambda");
       double          initTime = config.getDouble("initTime");
 
-      SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+      SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if (logToFile)
diff --git a/apps/cpu/DLR-F16-Porous/f16.cpp b/apps/cpu/DLR-F16-Porous/f16.cpp
index 791c1c926..08bafaf3d 100644
--- a/apps/cpu/DLR-F16-Porous/f16.cpp
+++ b/apps/cpu/DLR-F16-Porous/f16.cpp
@@ -95,7 +95,7 @@ void run(string configname)
       
 
 
-      SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+      SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if (logToFile)
diff --git a/apps/cpu/DLR-F16-Solid/f16.cpp b/apps/cpu/DLR-F16-Solid/f16.cpp
index 2a9893ef2..cf08ef73a 100644
--- a/apps/cpu/DLR-F16-Solid/f16.cpp
+++ b/apps/cpu/DLR-F16-Solid/f16.cpp
@@ -47,7 +47,7 @@ void run(string configname)
       double          timeAvStart       = config.getValue<double>("timeAvStart");
       double          timeAvStop        = config.getValue<double>("timeAvStop");
 
-      SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+      SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if (logToFile)
diff --git a/apps/cpu/DLR-F16/f16.cpp b/apps/cpu/DLR-F16/f16.cpp
index 17fef2d15..639a73c38 100644
--- a/apps/cpu/DLR-F16/f16.cpp
+++ b/apps/cpu/DLR-F16/f16.cpp
@@ -13,7 +13,7 @@ double rangeRandom1()
 
 void setBC(SPtr<Grid3D> grid, string pathGeo, string fngFileWhole, string zigZagTape, vector<double>  boundingBox, double uLB, double rhoLB, double blockLength, SPtr<BCProcessor> bcProcessor)
 {
-   SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+   SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
    int myid = comm->getProcessID();
    
    std::vector<std::vector<SPtr<Block3D>> > blockVector;
@@ -205,7 +205,7 @@ void run(string configname)
       int             chunk = config.getValue<int>("chunk");
 
 
-      SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+      SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if (logToFile)
diff --git a/apps/cpu/FallingSphere/FallingSphere.cpp b/apps/cpu/FallingSphere/FallingSphere.cpp
index ba837fcf1..0ce12e62e 100644
--- a/apps/cpu/FallingSphere/FallingSphere.cpp
+++ b/apps/cpu/FallingSphere/FallingSphere.cpp
@@ -11,7 +11,7 @@ using namespace std;
 
 int main(int argc, char *argv[])
 {
-    std::shared_ptr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+    std::shared_ptr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
     int myid                                        = comm->getProcessID();
 
 
diff --git a/apps/cpu/FlowAroundCylinder/cylinder.cpp b/apps/cpu/FlowAroundCylinder/cylinder.cpp
index 5956a48a8..3e5be5e08 100644
--- a/apps/cpu/FlowAroundCylinder/cylinder.cpp
+++ b/apps/cpu/FlowAroundCylinder/cylinder.cpp
@@ -35,7 +35,7 @@ void run(string configname)
       vector<int>     blockNx = config.getVector<int>("blockNx");
       real          dx = config.getValue<real>("dx");
 
-      SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+      SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if (logToFile)
diff --git a/apps/cpu/FlowAroundCylinder/cylinder.cpp.old b/apps/cpu/FlowAroundCylinder/cylinder.cpp.old
index f251ee635..774ed812b 100644
--- a/apps/cpu/FlowAroundCylinder/cylinder.cpp.old
+++ b/apps/cpu/FlowAroundCylinder/cylinder.cpp.old
@@ -15,7 +15,7 @@ void run(const char *cstr)
       int numOfThreads = 1;
       double availMem = 0;
 
-      CommunicatorPtr comm = vf::mpi::MPICommunicator::getInstance();
+      CommunicatorPtr comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if(machine == "BOMBADIL") 
@@ -385,7 +385,7 @@ void run2(const char *cstr)
       int numOfThreads = 1;
       double availMem = 0;
 
-      CommunicatorPtr comm = vf::mpi::MPICommunicator::getInstance();
+      CommunicatorPtr comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if(machine == "BOMBADIL") 
diff --git a/apps/cpu/Hagen_Poiseuille_flow/pflow.cpp b/apps/cpu/Hagen_Poiseuille_flow/pflow.cpp
index 5d5e47fdd..e7f7bb84a 100644
--- a/apps/cpu/Hagen_Poiseuille_flow/pflow.cpp
+++ b/apps/cpu/Hagen_Poiseuille_flow/pflow.cpp
@@ -29,7 +29,7 @@ using namespace std;
 //      double          deltax = config.getDouble("deltax");
 //
 //
-//      SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+//      SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
 //      int myid = comm->getProcessID();
 //
 //      if (logToFile)
@@ -322,7 +322,7 @@ void pflowdp(string configname)
       double          cpStepStart = config.getValue<double>("cpStepStart");
       bool            newStart = config.getValue<bool>("newStart");
 
-      SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+      SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       LBMReal rhoLB = 0.0;
diff --git a/apps/cpu/Hagen_Poiseuille_flow2/pflow2.cpp b/apps/cpu/Hagen_Poiseuille_flow2/pflow2.cpp
index f298d697f..e278fcb78 100644
--- a/apps/cpu/Hagen_Poiseuille_flow2/pflow2.cpp
+++ b/apps/cpu/Hagen_Poiseuille_flow2/pflow2.cpp
@@ -29,7 +29,7 @@ void pflowdp(string configname)
       double          deltax = config.getValue<double>("deltax");
 
 
-      SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+      SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       LBMReal rhoLB = 0.0;
diff --git a/apps/cpu/HerschelBulkleyModel/hbflow.cpp b/apps/cpu/HerschelBulkleyModel/hbflow.cpp
index 6abe6b5c5..67ed5404e 100644
--- a/apps/cpu/HerschelBulkleyModel/hbflow.cpp
+++ b/apps/cpu/HerschelBulkleyModel/hbflow.cpp
@@ -41,7 +41,7 @@ void bflow(string configname)
 //      double          Bn = config.getValue<double>("Bn");
       real          scaleFactor = config.getValue<real>("scaleFactor");
 
-      SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+      SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if (logToFile)
diff --git a/apps/cpu/HerschelBulkleySphere/hbsphere.cpp b/apps/cpu/HerschelBulkleySphere/hbsphere.cpp
index 221b10612..90ae44b62 100644
--- a/apps/cpu/HerschelBulkleySphere/hbsphere.cpp
+++ b/apps/cpu/HerschelBulkleySphere/hbsphere.cpp
@@ -38,7 +38,7 @@ void bflow(string configname)
       real          Bn = config.getValue<real>("Bn");
       vector<real>  sphereCenter = config.getVector<real>("sphereCenter");
 
-      SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+      SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if (logToFile)
diff --git a/apps/cpu/InterfaceTest/itest.cpp b/apps/cpu/InterfaceTest/itest.cpp
index 723802f6d..e8c93d4e5 100644
--- a/apps/cpu/InterfaceTest/itest.cpp
+++ b/apps/cpu/InterfaceTest/itest.cpp
@@ -11,7 +11,7 @@ void run()
 {
    try
    {
-      SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+      SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       int    numOfThreads = 4;
diff --git a/apps/cpu/JetBreakup/JetBreakup.cpp b/apps/cpu/JetBreakup/JetBreakup.cpp
index f4b74ca37..53cdc7370 100644
--- a/apps/cpu/JetBreakup/JetBreakup.cpp
+++ b/apps/cpu/JetBreakup/JetBreakup.cpp
@@ -61,7 +61,7 @@ void run(string configname)
 
         int caseN = config.getValue<int>("case");
 
-        SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+        SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
         int myid = comm->getProcessID();
 
         if (myid == 0)
diff --git a/apps/cpu/JetBreakup/JetBreakup.cpp.new b/apps/cpu/JetBreakup/JetBreakup.cpp.new
index 953a8dee8..1c03deebd 100644
--- a/apps/cpu/JetBreakup/JetBreakup.cpp.new
+++ b/apps/cpu/JetBreakup/JetBreakup.cpp.new
@@ -58,7 +58,7 @@ void run(string configname)
 
         int caseN = config.getValue<int>("case");
 
-        SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+        SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
         int myid = comm->getProcessID();
 
         if (myid == 0)
diff --git a/apps/cpu/LaminarTubeFlow/ltf.cpp b/apps/cpu/LaminarTubeFlow/ltf.cpp
index 315bee6f2..53927b737 100644
--- a/apps/cpu/LaminarTubeFlow/ltf.cpp
+++ b/apps/cpu/LaminarTubeFlow/ltf.cpp
@@ -33,7 +33,7 @@ void run(string configname)
       real          cpStep = config.getValue<real>("cpStep");
       bool            newStart = config.getValue<bool>("newStart");
 
-      SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+      SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if (logToFile)
diff --git a/apps/cpu/LaminarTubeFlowConv/ltf.cpp b/apps/cpu/LaminarTubeFlowConv/ltf.cpp
index 53cd7c1ac..d7515d44b 100644
--- a/apps/cpu/LaminarTubeFlowConv/ltf.cpp
+++ b/apps/cpu/LaminarTubeFlowConv/ltf.cpp
@@ -30,7 +30,7 @@ void run(int tn)
       int numOfThreads = 1;
       double availMem = 0;
 
-      CommunicatorPtr comm = vf::mpi::MPICommunicator::getInstance();
+      CommunicatorPtr comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if(machine == "BOMBADIL") 
diff --git a/apps/cpu/LidDrivenCavity/LidDrivenCavity.cpp b/apps/cpu/LidDrivenCavity/LidDrivenCavity.cpp
index 1819ee0f6..01d6262c2 100644
--- a/apps/cpu/LidDrivenCavity/LidDrivenCavity.cpp
+++ b/apps/cpu/LidDrivenCavity/LidDrivenCavity.cpp
@@ -80,7 +80,7 @@ int main(int  /*argc*/, char*  /*argv*/[])
       double g_maxX3 = 0.5;
 
       // NullCommunicator is a place-holder for interprocess communication
-      SPtr<vf::mpi::Communicator> comm = NullCommunicator::getInstance();
+      SPtr<vf::parallel::Communicator> comm = NullCommunicator::getInstance();
       // new grid object
       SPtr<Grid3D> grid(new Grid3D(comm));
       // set grid spacing
diff --git a/apps/cpu/LiggghtsApp/LiggghtsApp.cpp b/apps/cpu/LiggghtsApp/LiggghtsApp.cpp
index 969243a05..5b3f27d2e 100644
--- a/apps/cpu/LiggghtsApp/LiggghtsApp.cpp
+++ b/apps/cpu/LiggghtsApp/LiggghtsApp.cpp
@@ -19,7 +19,7 @@ int main(int argc, char *argv[])
 {
     //Sleep(30000);
 
-    std::shared_ptr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+    std::shared_ptr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
     int myid                                        = comm->getProcessID();
 
 
diff --git a/apps/cpu/Multiphase/Multiphase (Droplet Test).cpp.backup b/apps/cpu/Multiphase/Multiphase (Droplet Test).cpp.backup
index b783a354f..c0ca32e5c 100644
--- a/apps/cpu/Multiphase/Multiphase (Droplet Test).cpp.backup	
+++ b/apps/cpu/Multiphase/Multiphase (Droplet Test).cpp.backup	
@@ -51,7 +51,7 @@ void run(string configname)
       double beta  = 12*sigma/interfaceThickness;
 	  double kappa = 1.5*interfaceThickness*sigma;
 	  
-	  CommunicatorPtr comm = vf::mpi::MPICommunicator::getInstance();
+	  CommunicatorPtr comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if (logToFile)
diff --git a/apps/cpu/Multiphase/Multiphase (Jet breakup on Phoenix).cpp.backup b/apps/cpu/Multiphase/Multiphase (Jet breakup on Phoenix).cpp.backup
index ebf91e6cb..d43a61d67 100644
--- a/apps/cpu/Multiphase/Multiphase (Jet breakup on Phoenix).cpp.backup	
+++ b/apps/cpu/Multiphase/Multiphase (Jet breakup on Phoenix).cpp.backup	
@@ -51,7 +51,7 @@ void run(string configname)
       double beta  = 12*sigma/interfaceThickness;
 	  double kappa = 1.5*interfaceThickness*sigma;
 	  
-	  CommunicatorPtr comm = vf::mpi::MPICommunicator::getInstance();
+	  CommunicatorPtr comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if (logToFile)
diff --git a/apps/cpu/Multiphase/Multiphase.cpp b/apps/cpu/Multiphase/Multiphase.cpp
index 79d969dd9..07b5a661f 100644
--- a/apps/cpu/Multiphase/Multiphase.cpp
+++ b/apps/cpu/Multiphase/Multiphase.cpp
@@ -55,7 +55,7 @@ void run(string configname)
         real beta = 12 * sigma / interfaceWidth;
         real kappa = 1.5 * interfaceWidth * sigma;
 
-        SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+        SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
         int myid                = comm->getProcessID();
 
         if (myid == 0)
diff --git a/apps/cpu/Multiphase/backup/Multiphase (Droplet Test).cpp b/apps/cpu/Multiphase/backup/Multiphase (Droplet Test).cpp
index e6efac953..868b4abfa 100644
--- a/apps/cpu/Multiphase/backup/Multiphase (Droplet Test).cpp	
+++ b/apps/cpu/Multiphase/backup/Multiphase (Droplet Test).cpp	
@@ -51,7 +51,7 @@ void run(string configname)
       double beta  = 12*sigma/interfaceThickness;
 	  double kappa = 1.5*interfaceThickness*sigma;
 	  
-	  CommunicatorPtr comm = vf::mpi::MPICommunicator::getInstance();
+	  CommunicatorPtr comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if (logToFile)
diff --git a/apps/cpu/Multiphase/backup/Multiphase (Final before automation).cpp b/apps/cpu/Multiphase/backup/Multiphase (Final before automation).cpp
index 61d376600..4b5178359 100644
--- a/apps/cpu/Multiphase/backup/Multiphase (Final before automation).cpp	
+++ b/apps/cpu/Multiphase/backup/Multiphase (Final before automation).cpp	
@@ -67,7 +67,7 @@ void run(string configname)
       double beta  = 12*sigma/interfaceThickness;
 	  double kappa = 1.5*interfaceThickness*sigma;
 	  
-	  CommunicatorPtr comm = vf::mpi::MPICommunicator::getInstance();
+	  CommunicatorPtr comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if (logToFile)
diff --git a/apps/cpu/Multiphase/backup/Multiphase (Flow Focusing).cpp b/apps/cpu/Multiphase/backup/Multiphase (Flow Focusing).cpp
index 32548c12b..c79270b36 100644
--- a/apps/cpu/Multiphase/backup/Multiphase (Flow Focusing).cpp	
+++ b/apps/cpu/Multiphase/backup/Multiphase (Flow Focusing).cpp	
@@ -51,7 +51,7 @@ void run(string configname)
       double beta  = 12*sigma/interfaceThickness;
 	  double kappa = 1.5*interfaceThickness*sigma;
 	  
-	  CommunicatorPtr comm = vf::mpi::MPICommunicator::getInstance();
+	  CommunicatorPtr comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if (logToFile)
diff --git a/apps/cpu/Multiphase/backup/Multiphase (Jet breakup on Phoenix).cpp b/apps/cpu/Multiphase/backup/Multiphase (Jet breakup on Phoenix).cpp
index ebf91e6cb..d43a61d67 100644
--- a/apps/cpu/Multiphase/backup/Multiphase (Jet breakup on Phoenix).cpp	
+++ b/apps/cpu/Multiphase/backup/Multiphase (Jet breakup on Phoenix).cpp	
@@ -51,7 +51,7 @@ void run(string configname)
       double beta  = 12*sigma/interfaceThickness;
 	  double kappa = 1.5*interfaceThickness*sigma;
 	  
-	  CommunicatorPtr comm = vf::mpi::MPICommunicator::getInstance();
+	  CommunicatorPtr comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if (logToFile)
diff --git a/apps/cpu/Multiphase/backup/Multiphase (T-Junction).cpp b/apps/cpu/Multiphase/backup/Multiphase (T-Junction).cpp
index 93844c4a2..c213e6366 100644
--- a/apps/cpu/Multiphase/backup/Multiphase (T-Junction).cpp	
+++ b/apps/cpu/Multiphase/backup/Multiphase (T-Junction).cpp	
@@ -51,7 +51,7 @@ void run(string configname)
       double beta  = 12*sigma/interfaceThickness;
 	  double kappa = 1.5*interfaceThickness*sigma;
 	  
-	  CommunicatorPtr comm = vf::mpi::MPICommunicator::getInstance();
+	  CommunicatorPtr comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if (logToFile)
diff --git a/apps/cpu/Multiphase/backup/Multiphase (Thermal).cpp b/apps/cpu/Multiphase/backup/Multiphase (Thermal).cpp
index 68e07b43c..4f5417397 100644
--- a/apps/cpu/Multiphase/backup/Multiphase (Thermal).cpp	
+++ b/apps/cpu/Multiphase/backup/Multiphase (Thermal).cpp	
@@ -51,7 +51,7 @@ void run(string configname)
       double beta  = 12*sigma/interfaceThickness;
 	  double kappa = 1.5*interfaceThickness*sigma;
 	  
-	  CommunicatorPtr comm = vf::mpi::MPICommunicator::getInstance();
+	  CommunicatorPtr comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if (logToFile)
diff --git a/apps/cpu/Multiphase/backup/Multiphase (Tube).cpp b/apps/cpu/Multiphase/backup/Multiphase (Tube).cpp
index 492a906b8..84de9055d 100644
--- a/apps/cpu/Multiphase/backup/Multiphase (Tube).cpp	
+++ b/apps/cpu/Multiphase/backup/Multiphase (Tube).cpp	
@@ -42,7 +42,7 @@ void run(string configname)
       double beta  = 12*sigma/interfaceThickness;
 	  double kappa = 1.5*interfaceThickness*sigma;
 	  
-	  CommunicatorPtr comm = vf::mpi::MPICommunicator::getInstance();
+	  CommunicatorPtr comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if (logToFile)
diff --git a/apps/cpu/Multiphase/backup/Multiphase.cpp b/apps/cpu/Multiphase/backup/Multiphase.cpp
index b029e3527..8a7708c05 100644
--- a/apps/cpu/Multiphase/backup/Multiphase.cpp
+++ b/apps/cpu/Multiphase/backup/Multiphase.cpp
@@ -78,7 +78,7 @@ void run(string configname)
       double beta  = 12*sigma/interfaceThickness;
 	  double kappa = 1.5*interfaceThickness*sigma;
 	  
-	  CommunicatorPtr comm = vf::mpi::MPICommunicator::getInstance();
+	  CommunicatorPtr comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if (logToFile)
diff --git a/apps/cpu/MultiphaseDropletTest/droplet.cpp b/apps/cpu/MultiphaseDropletTest/droplet.cpp
index eaa3e550f..f65c19633 100644
--- a/apps/cpu/MultiphaseDropletTest/droplet.cpp
+++ b/apps/cpu/MultiphaseDropletTest/droplet.cpp
@@ -51,7 +51,7 @@ void run(string configname)
         bool newStart      = config.getValue<bool>("newStart");
         //double rStep = config.getValue<double>("rStep");
 
-        SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+        SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
         int myid                = comm->getProcessID();
 
         if (myid == 0)
diff --git a/apps/cpu/Nozzle/nozzle.cpp b/apps/cpu/Nozzle/nozzle.cpp
index babee1900..bdaec8e1a 100644
--- a/apps/cpu/Nozzle/nozzle.cpp
+++ b/apps/cpu/Nozzle/nozzle.cpp
@@ -18,7 +18,7 @@ int main(int argc, char *argv[])
 
     try {
 
-        std::shared_ptr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+        std::shared_ptr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
         int myid = comm->getProcessID();
 
         // bounding box
diff --git a/apps/cpu/OrganPipe/OrganPipe.cpp b/apps/cpu/OrganPipe/OrganPipe.cpp
index b23c15911..98b8c2505 100644
--- a/apps/cpu/OrganPipe/OrganPipe.cpp
+++ b/apps/cpu/OrganPipe/OrganPipe.cpp
@@ -8,7 +8,7 @@ void run(string configname)
 {
    try
    {
-      SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+      SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if (myid == 0) UBLOG(logINFO, "Testcase organ pipe");
diff --git a/apps/cpu/PlateWithPorousInlay/plate.cpp b/apps/cpu/PlateWithPorousInlay/plate.cpp
index 315bacfa9..0334da7c3 100644
--- a/apps/cpu/PlateWithPorousInlay/plate.cpp
+++ b/apps/cpu/PlateWithPorousInlay/plate.cpp
@@ -52,7 +52,7 @@ void run(const char *cstr)
       stringstream logFilename;
       double availMem = 0;
 
-      CommunicatorPtr comm = vf::mpi::MPICommunicator::getInstance();
+      CommunicatorPtr comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       string machine = string(cstr);
diff --git a/apps/cpu/PoiseuilleFlow/pf1.cpp b/apps/cpu/PoiseuilleFlow/pf1.cpp
index 0e21dea4d..dffd5fde6 100644
--- a/apps/cpu/PoiseuilleFlow/pf1.cpp
+++ b/apps/cpu/PoiseuilleFlow/pf1.cpp
@@ -9,7 +9,7 @@ void pf1()
 {
     using namespace vf::lbm::dir;
 
-   SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+   SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
    int myid = comm->getProcessID();
 
    //parameters
diff --git a/apps/cpu/PoiseuilleFlow/pf2.cpp b/apps/cpu/PoiseuilleFlow/pf2.cpp
index c339e06a6..7990d1969 100644
--- a/apps/cpu/PoiseuilleFlow/pf2.cpp
+++ b/apps/cpu/PoiseuilleFlow/pf2.cpp
@@ -6,7 +6,7 @@
 ////pipe flow with pressure drop
 //void pf2()
 //{
-//   SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+//   SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
 //   int myid = comm->getProcessID();
 //
 //   //parameters
diff --git a/apps/cpu/PoiseuilleFlow/pf3.cpp b/apps/cpu/PoiseuilleFlow/pf3.cpp
index fa01b6852..0442b1c67 100644
--- a/apps/cpu/PoiseuilleFlow/pf3.cpp
+++ b/apps/cpu/PoiseuilleFlow/pf3.cpp
@@ -6,7 +6,7 @@
 ////two plates flow with forcing
 //void pf3()
 //{
-//   SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+//   SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
 //   int myid = comm->getProcessID();
 //
 //   //parameters
diff --git a/apps/cpu/PoiseuilleFlow/pf4.cpp b/apps/cpu/PoiseuilleFlow/pf4.cpp
index 2e419358e..9568cdd61 100644
--- a/apps/cpu/PoiseuilleFlow/pf4.cpp
+++ b/apps/cpu/PoiseuilleFlow/pf4.cpp
@@ -6,7 +6,7 @@
 ////two plates flow with pressure drop
 //void pf4()
 //{
-//   SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+//   SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
 //   int myid = comm->getProcessID();
 //
 //   //parameters
diff --git a/apps/cpu/RisingBubble2D/RisingBubble2D.cpp b/apps/cpu/RisingBubble2D/RisingBubble2D.cpp
index c9a28efc4..a6f276130 100644
--- a/apps/cpu/RisingBubble2D/RisingBubble2D.cpp
+++ b/apps/cpu/RisingBubble2D/RisingBubble2D.cpp
@@ -52,7 +52,7 @@ void run(string configname)
         bool newStart = config.getValue<bool>("newStart");
         // double rStep = config.getValue<double>("rStep");
 
-        std::shared_ptr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+        std::shared_ptr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
         int myid = comm->getProcessID();
 
         if (myid == 0) UBLOG(logINFO, "2D Rising Bubble: Start!");
diff --git a/apps/cpu/TPMSRow/TPMSRow.cpp b/apps/cpu/TPMSRow/TPMSRow.cpp
index 596b79d05..09be56c68 100644
--- a/apps/cpu/TPMSRow/TPMSRow.cpp
+++ b/apps/cpu/TPMSRow/TPMSRow.cpp
@@ -47,7 +47,7 @@ void run(string configname)
         bool newStart               = config.getValue<bool>("newStart");
 
         //SPtr<Communicator> comm = MPICommunicator::getInstance();
-        SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+        SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
         int myid                = comm->getProcessID();
         //int numOfProcesses      = comm->getNumberOfProcesses();
 
diff --git a/apps/cpu/ViskomatXL/viskomat.cpp b/apps/cpu/ViskomatXL/viskomat.cpp
index 327f25e59..7db98670e 100644
--- a/apps/cpu/ViskomatXL/viskomat.cpp
+++ b/apps/cpu/ViskomatXL/viskomat.cpp
@@ -40,7 +40,7 @@ void bflow(string configname)
 
       vf::basics::ConfigurationFile   viscosity;
 
-      std::shared_ptr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+      std::shared_ptr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if (logToFile)
diff --git a/apps/cpu/Wing/wing.cpp b/apps/cpu/Wing/wing.cpp
index ff6cbcfca..d7e4cd770 100644
--- a/apps/cpu/Wing/wing.cpp
+++ b/apps/cpu/Wing/wing.cpp
@@ -30,7 +30,7 @@ void setup(const char *cstr1, const char *cstr2)
       int refineLevel = UbSystem::stringTo<int>(cf.getValue("refineLevel"));
       int blocknx = UbSystem::stringTo<int>(cf.getValue("blocknx"));
 
-      CommunicatorPtr comm = vf::mpi::MPICommunicator::getInstance();
+      CommunicatorPtr comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if(machine == "Bombadil") int dumy=0; 
diff --git a/apps/cpu/aperm/aperm.cpp b/apps/cpu/aperm/aperm.cpp
index 7591afe4d..44b48bb3c 100644
--- a/apps/cpu/aperm/aperm.cpp
+++ b/apps/cpu/aperm/aperm.cpp
@@ -59,7 +59,7 @@ void run(string configname)
       double          cpStepStart = config.getDouble("cpStepStart");
       bool            newStart = config.getValue<bool>("newStart");
 
-      SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+      SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if (logToFile)
diff --git a/apps/cpu/aperm/aperm.cpp.old b/apps/cpu/aperm/aperm.cpp.old
index fd6f916d8..3776c454f 100644
--- a/apps/cpu/aperm/aperm.cpp.old
+++ b/apps/cpu/aperm/aperm.cpp.old
@@ -58,7 +58,7 @@ void run(string configname)
       bool            yDir = config.getBool("yDir");
       bool            zDir = config.getBool("zDir");
 
-      CommunicatorPtr comm = vf::mpi::MPICommunicator::getInstance();
+      CommunicatorPtr comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if (logToFile)
diff --git a/apps/cpu/aperm/aperm.cpp.old2 b/apps/cpu/aperm/aperm.cpp.old2
index 8f0cf83e2..ece4410e9 100644
--- a/apps/cpu/aperm/aperm.cpp.old2
+++ b/apps/cpu/aperm/aperm.cpp.old2
@@ -55,7 +55,7 @@ void run(string configname)
       bool            yDir = config.getBool("yDir");
       bool            zDir = config.getBool("zDir");
 
-      CommunicatorPtr comm = vf::mpi::MPICommunicator::getInstance();
+      CommunicatorPtr comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if (logToFile)
diff --git a/apps/cpu/bChannelA/bChannelA.cpp b/apps/cpu/bChannelA/bChannelA.cpp
index 01725f2a3..a59d829c3 100644
--- a/apps/cpu/bChannelA/bChannelA.cpp
+++ b/apps/cpu/bChannelA/bChannelA.cpp
@@ -111,7 +111,7 @@ void run(string configname)
       vector<double>  nupsStep          = config.getVector<double>("nupsStep");
       vector<double>  boundingBox       = config.getVector<double>("boundingBox");
 
-      SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+      SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if (logToFile)
diff --git a/apps/cpu/bChannelVA/bChannelVA.cpp b/apps/cpu/bChannelVA/bChannelVA.cpp
index 6cfe5dac2..363d02697 100644
--- a/apps/cpu/bChannelVA/bChannelVA.cpp
+++ b/apps/cpu/bChannelVA/bChannelVA.cpp
@@ -13,7 +13,7 @@ int main(int argc, char* argv[])
    try
    {
       //Sleep(20000);
-      SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+      SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       //Pheonix
diff --git a/apps/cpu/bKanal/HLRNb/bKanal.cpp b/apps/cpu/bKanal/HLRNb/bKanal.cpp
index 0c5c46a0c..99b21eabf 100644
--- a/apps/cpu/bKanal/HLRNb/bKanal.cpp
+++ b/apps/cpu/bKanal/HLRNb/bKanal.cpp
@@ -27,7 +27,7 @@ void run(const char *cstr)
 
       UbLog::reportingLevel() = logINFO;
 
-      CommunicatorPtr comm = vf::mpi::MPICommunicator::getInstance();
+      CommunicatorPtr comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
 
diff --git a/apps/cpu/bKanal/bKanal.cpp b/apps/cpu/bKanal/bKanal.cpp
index 94af8f6aa..33994ad70 100644
--- a/apps/cpu/bKanal/bKanal.cpp
+++ b/apps/cpu/bKanal/bKanal.cpp
@@ -24,7 +24,7 @@ void run(const char *cstr)
 
       UbLog::reportingLevel() = logINFO;
 
-      CommunicatorPtr comm = vf::mpi::MPICommunicator::getInstance();
+      CommunicatorPtr comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       string machine = string(cstr);
diff --git a/apps/cpu/bKanal/sKanal/bKanal.cpp b/apps/cpu/bKanal/sKanal/bKanal.cpp
index 6a9d3c2c6..1048554fb 100644
--- a/apps/cpu/bKanal/sKanal/bKanal.cpp
+++ b/apps/cpu/bKanal/sKanal/bKanal.cpp
@@ -27,7 +27,7 @@ void run(const char *cstr)
 
       UbLog::reportingLevel() = logINFO;
 
-      CommunicatorPtr comm = vf::mpi::MPICommunicator::getInstance();
+      CommunicatorPtr comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if(machine == "PIPPINNEU") 
diff --git a/apps/cpu/bKanal2/bKanal2.cpp b/apps/cpu/bKanal2/bKanal2.cpp
index 10e6f9880..ec6326125 100644
--- a/apps/cpu/bKanal2/bKanal2.cpp
+++ b/apps/cpu/bKanal2/bKanal2.cpp
@@ -24,7 +24,7 @@ void run(const char *cstr)
 
       UbLog::reportingLevel() = logINFO;
 
-      CommunicatorPtr comm = vf::mpi::MPICommunicator::getInstance();
+      CommunicatorPtr comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       string machine = string(cstr);
diff --git a/apps/cpu/bKanalAv/bKanal.cpp b/apps/cpu/bKanalAv/bKanal.cpp
index 71ca1ed04..27bf3c1a6 100644
--- a/apps/cpu/bKanalAv/bKanal.cpp
+++ b/apps/cpu/bKanalAv/bKanal.cpp
@@ -27,7 +27,7 @@ void run(const char *cstr)
 
       UbLog::reportingLevel() = logINFO;
 
-      CommunicatorPtr comm = vf::mpi::MPICommunicator::getInstance();
+      CommunicatorPtr comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
 
diff --git a/apps/cpu/band/band.cpp b/apps/cpu/band/band.cpp
index 370e50341..b454ff128 100644
--- a/apps/cpu/band/band.cpp
+++ b/apps/cpu/band/band.cpp
@@ -20,7 +20,7 @@ void run(const char *cstr)
 
       //UbLog::reportingLevel() = logDEBUG5;
 
-      CommunicatorPtr comm = vf::mpi::MPICommunicator::getInstance();
+      CommunicatorPtr comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       string machine = string(cstr);
diff --git a/apps/cpu/bbone/bbone.cpp b/apps/cpu/bbone/bbone.cpp
index 3eb6c827c..558ff4c98 100644
--- a/apps/cpu/bbone/bbone.cpp
+++ b/apps/cpu/bbone/bbone.cpp
@@ -33,7 +33,7 @@ void sbonepd(string configname)
       bool            logToFile         = config.getBool("logToFile");
       double          deltaT            = config.getDouble("deltaT");
       
-      CommunicatorPtr comm = vf::mpi::MPICommunicator::getInstance();
+      CommunicatorPtr comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if (logToFile)
diff --git a/apps/cpu/block_test/block_test_incompressible.hpp b/apps/cpu/block_test/block_test_incompressible.hpp
index 2ce506c93..61b8d762b 100644
--- a/apps/cpu/block_test/block_test_incompressible.hpp
+++ b/apps/cpu/block_test/block_test_incompressible.hpp
@@ -29,7 +29,7 @@ void block_test_incompressible(const char *cstr1, const char *cstr2)
       int numOfThreads = UbSystem::stringTo<int>(cf.getValue("numOfThreads"));
       double availMem = 0;
 
-      CommunicatorPtr comm = vf::mpi::MPICommunicator::getInstance();
+      CommunicatorPtr comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if(machine == "BOMBADIL") 
diff --git a/apps/cpu/bond_benchmark/bonb_b_chanel.cpp b/apps/cpu/bond_benchmark/bonb_b_chanel.cpp
index b5e63c50d..7df5a0162 100644
--- a/apps/cpu/bond_benchmark/bonb_b_chanel.cpp
+++ b/apps/cpu/bond_benchmark/bonb_b_chanel.cpp
@@ -29,7 +29,7 @@ void chanel(const char *cstr)
 
       string comm_type = cf.getValue("comm");
       if(comm_type == "MPI")
-         comm = vf::mpi::MPICommunicator::getInstance();
+         comm = vf::parallel::MPICommunicator::getInstance();
       else if(comm_type == "BOND")
          comm = BondCommunicator::getInstance();
       
diff --git a/apps/cpu/bond_benchmark/bond_b.cpp b/apps/cpu/bond_benchmark/bond_b.cpp
index 6d607811a..e3924595d 100644
--- a/apps/cpu/bond_benchmark/bond_b.cpp
+++ b/apps/cpu/bond_benchmark/bond_b.cpp
@@ -35,7 +35,7 @@ void periodic(const char *cstr1, const char *cstr2)
 
       string comm_type = cf.getValue("comm");
       if(comm_type == "MPI")
-         comm = vf::mpi::MPICommunicator::getInstance();
+         comm = vf::parallel::MPICommunicator::getInstance();
       else if(comm_type == "BOND")
          comm = BondCommunicator::getInstance();
 
diff --git a/apps/cpu/bond_test/bond_test.cpp b/apps/cpu/bond_test/bond_test.cpp
index b7091184f..fd77c285f 100644
--- a/apps/cpu/bond_test/bond_test.cpp
+++ b/apps/cpu/bond_test/bond_test.cpp
@@ -153,7 +153,7 @@ void simulation(const char *cstr)
       CommunicatorPtr comm;
       string comm_type = cf.getValue("comm");
       if(comm_type == "MPI")
-         comm = vf::mpi::MPICommunicator::getInstance();
+         comm = vf::parallel::MPICommunicator::getInstance();
       else if(comm_type == "BOND")
          comm = BondCommunicator::getInstance();
 
diff --git a/apps/cpu/bone/bone.cpp b/apps/cpu/bone/bone.cpp
index 849241ba2..17d0eca37 100644
--- a/apps/cpu/bone/bone.cpp
+++ b/apps/cpu/bone/bone.cpp
@@ -18,7 +18,7 @@ void run(const char *cstr1, const char *cstr2)
       stringstream logFilename;
       double availMem = 0;
 
-      CommunicatorPtr comm = vf::mpi::MPICommunicator::getInstance();
+      CommunicatorPtr comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       string machine = string(cstr1);
diff --git a/apps/cpu/f16Test/f16test.cpp b/apps/cpu/f16Test/f16test.cpp
index 2360d962a..32f424bdb 100644
--- a/apps/cpu/f16Test/f16test.cpp
+++ b/apps/cpu/f16Test/f16test.cpp
@@ -42,7 +42,7 @@ void run(string configname)
       double          refineDistance = config.getDouble("refineDistance");
       vector<double>  nupsStep = config.getVector<double>("nupsStep");
 
-      SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+      SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if (logToFile)
diff --git a/apps/cpu/insitu_demo/insitu_demo.cpp b/apps/cpu/insitu_demo/insitu_demo.cpp
index 42a1c6b4c..bd5f15019 100644
--- a/apps/cpu/insitu_demo/insitu_demo.cpp
+++ b/apps/cpu/insitu_demo/insitu_demo.cpp
@@ -15,7 +15,7 @@ void chanel(const char *cstr1)
       double availMem = 0;
 
       //CommunicatorPtr comm = FETOLCommunicator::getInstance();
-      CommunicatorPtr comm = vf::mpi::MPICommunicator::getInstance();
+      CommunicatorPtr comm = vf::parallel::MPICommunicator::getInstance();
 
       int myid = comm->getProcessID();
       int mybundle = comm->getBundleID();
diff --git a/apps/cpu/levels/levels.cpp b/apps/cpu/levels/levels.cpp
index a5ac85880..10672abe4 100644
--- a/apps/cpu/levels/levels.cpp
+++ b/apps/cpu/levels/levels.cpp
@@ -14,7 +14,7 @@ void run(string configname)
 
       string machine = QUOTEME(CAB_MACHINE);
 
-      SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+      SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
 
       int myid = comm->getProcessID();
       int mybundle = comm->getBundleID();
diff --git a/apps/cpu/micropart/micropartTestQs3.hpp b/apps/cpu/micropart/micropartTestQs3.hpp
index 14e9a8441..d8c870269 100644
--- a/apps/cpu/micropart/micropartTestQs3.hpp
+++ b/apps/cpu/micropart/micropartTestQs3.hpp
@@ -9,7 +9,7 @@ void micropartTestQs3(const char *cstr)
 {
    try
    {
-      CommunicatorPtr comm = vf::mpi::MPICommunicator::getInstance();
+      CommunicatorPtr comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
       int numprocs = comm->getNumberOfProcesses();
 
diff --git a/apps/cpu/mirror/mirror.cpp b/apps/cpu/mirror/mirror.cpp
index b85d9b249..99ba78ff1 100644
--- a/apps/cpu/mirror/mirror.cpp
+++ b/apps/cpu/mirror/mirror.cpp
@@ -49,7 +49,7 @@ void run(string configname)
       string          VRES1100_Spiegel_fein = config.getValue<string>("VRES1100_Spiegel_fein");
 
 
-      SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+      SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if (logToFile)
diff --git a/apps/cpu/mpi_benchmark/mpib.cpp b/apps/cpu/mpi_benchmark/mpib.cpp
index 797efbc7e..8ddf7bde3 100644
--- a/apps/cpu/mpi_benchmark/mpib.cpp
+++ b/apps/cpu/mpi_benchmark/mpib.cpp
@@ -8,7 +8,7 @@ using namespace std;
 
 void run(string configname)
 {
-   SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+   SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
    int myid = comm->getProcessID();
 
    // Get the name of the processor
diff --git a/apps/cpu/pChannel/pChannel.cpp b/apps/cpu/pChannel/pChannel.cpp
index 72292679f..d30b28d26 100644
--- a/apps/cpu/pChannel/pChannel.cpp
+++ b/apps/cpu/pChannel/pChannel.cpp
@@ -206,7 +206,7 @@ void run(string configname)
       vector<double>  nupsStep          = config.getVector<double>("nupsStep");
       vector<double>  boundingBox       = config.getVector<double>("boundingBox");
 
-      SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+      SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if (logToFile)
diff --git a/apps/cpu/pChannel/pChannel.cpp.hlrn b/apps/cpu/pChannel/pChannel.cpp.hlrn
index f25a0c4c2..812566c96 100644
--- a/apps/cpu/pChannel/pChannel.cpp.hlrn
+++ b/apps/cpu/pChannel/pChannel.cpp.hlrn
@@ -52,7 +52,7 @@ void run(string configname)
       double          timeLineTsStop    = config.getDouble("timeLineTsStop");
 
 
-      CommunicatorPtr comm = vf::mpi::MPICommunicator::getInstance();
+      CommunicatorPtr comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if (logToFile)
diff --git a/apps/cpu/pDisk/pdisk.cpp b/apps/cpu/pDisk/pdisk.cpp
index f19e04ff8..fed4f38b6 100644
--- a/apps/cpu/pDisk/pdisk.cpp
+++ b/apps/cpu/pDisk/pdisk.cpp
@@ -39,7 +39,7 @@ void run(string configname)
 
       //UbLog::reportingLevel() = logDEBUG5;
 
-      CommunicatorPtr comm = vf::mpi::MPICommunicator::getInstance();
+      CommunicatorPtr comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
 
diff --git a/apps/cpu/perm/perm.cpp b/apps/cpu/perm/perm.cpp
index ff0af00b8..4ea9ac937 100644
--- a/apps/cpu/perm/perm.cpp
+++ b/apps/cpu/perm/perm.cpp
@@ -44,7 +44,7 @@ void perm(string configname)
       double          deltax = config.getValue<double>("deltax");
       bool            writeSampleToFile = config.getValue<bool>("writeSampleToFile");
 
-      SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+      SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if (logToFile)
diff --git a/apps/cpu/perm/perm.cpp_s b/apps/cpu/perm/perm.cpp_s
index 21db434d5..e40c55fbf 100644
--- a/apps/cpu/perm/perm.cpp_s
+++ b/apps/cpu/perm/perm.cpp_s
@@ -23,7 +23,7 @@ void perm(const char *configname)
          throw exceptionText;
       }
 
-      CommunicatorPtr comm = vf::mpi::MPICommunicator::getInstance();
+      CommunicatorPtr comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if (machine == "BOMBADIL")
diff --git a/apps/cpu/plate/plate.cpp b/apps/cpu/plate/plate.cpp
index 28db0262f..e4c78c604 100644
--- a/apps/cpu/plate/plate.cpp
+++ b/apps/cpu/plate/plate.cpp
@@ -25,7 +25,7 @@ void run(const char *cstr, double endTime)
 
       //UbLog::reportingLevel() = logDEBUG5;
 
-      CommunicatorPtr comm = vf::mpi::MPICommunicator::getInstance();
+      CommunicatorPtr comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       string machine = string(cstr);
diff --git a/apps/cpu/plate2/plate2.cpp b/apps/cpu/plate2/plate2.cpp
index a908abf5b..1fd5a281e 100644
--- a/apps/cpu/plate2/plate2.cpp
+++ b/apps/cpu/plate2/plate2.cpp
@@ -18,7 +18,7 @@ void run(const char *cstr1, const char *cstr2)
       stringstream logFilename;
       double availMem = 0;
 
-      CommunicatorPtr comm = vf::mpi::MPICommunicator::getInstance();
+      CommunicatorPtr comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       string machine = string(cstr1);
diff --git a/apps/cpu/poiseuille_example/poiseuille.cpp b/apps/cpu/poiseuille_example/poiseuille.cpp
index 52fede221..d2e2c178e 100644
--- a/apps/cpu/poiseuille_example/poiseuille.cpp
+++ b/apps/cpu/poiseuille_example/poiseuille.cpp
@@ -25,7 +25,7 @@ int main()
     const auto lbmUnitConverter = std::make_shared<LBMUnitConverter>();
     const auto writer = WbWriterVtkXmlBinary::getInstance();
 
-    const auto communicator = vf::mpi::MPICommunicator::getInstance();
+    const auto communicator = vf::parallel::MPICommunicator::getInstance();
     const auto kernel = std::make_shared<CompressibleCumulant4thOrderViscosityLBMKernel>();
     kernel->setBCProcessor(std::make_shared<BCProcessor>());
     kernel->setForcingX1(1e-6 * lbmUnitConverter->getFactorForceWToLb());
diff --git a/apps/cpu/porplate2/porplate.cpp b/apps/cpu/porplate2/porplate.cpp
index 2414e0773..fe93f8fa5 100644
--- a/apps/cpu/porplate2/porplate.cpp
+++ b/apps/cpu/porplate2/porplate.cpp
@@ -316,7 +316,7 @@ void run(const char *cstr, bool firststart)
       stringstream logFilename;
       double availMem = 0;
 
-      CommunicatorPtr comm = vf::mpi::MPICommunicator::getInstance();
+      CommunicatorPtr comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       string machine = string(cstr);
diff --git a/apps/cpu/rheometer/rheometer.cpp b/apps/cpu/rheometer/rheometer.cpp
index ca3378bd5..68db541bc 100644
--- a/apps/cpu/rheometer/rheometer.cpp
+++ b/apps/cpu/rheometer/rheometer.cpp
@@ -43,7 +43,7 @@ void bflow(string configname)
 
       //outputPath = outputPath + "/rheometerBingham_" + config.getValue<string>("resolution") + "_" + config.getValue<string>("OmegaLB");
 
-      SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+      SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if (logToFile)
diff --git a/apps/cpu/sbone/sbone.cpp b/apps/cpu/sbone/sbone.cpp
index 321396da6..b52aaa3ac 100644
--- a/apps/cpu/sbone/sbone.cpp
+++ b/apps/cpu/sbone/sbone.cpp
@@ -23,7 +23,7 @@ void sbonepd(const char *configname)
          throw exceptionText;
       }
 
-      CommunicatorPtr comm = vf::mpi::MPICommunicator::getInstance();
+      CommunicatorPtr comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if(machine == "BOMBADIL") 
diff --git a/apps/cpu/screw/screw.cpp b/apps/cpu/screw/screw.cpp
index 7ba90a586..ad7c99774 100644
--- a/apps/cpu/screw/screw.cpp
+++ b/apps/cpu/screw/screw.cpp
@@ -29,7 +29,7 @@ int main(int argc, char* argv[])
       int             restartStep  = config.getValue<int>("restartStep");
 
 
-      SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+      SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       SPtr<LBMUnitConverter> conv = SPtr<LBMUnitConverter>(new LBMUnitConverter());
diff --git a/apps/cpu/sphere/sphere.cpp b/apps/cpu/sphere/sphere.cpp
index 1f0f5c116..5411449c7 100644
--- a/apps/cpu/sphere/sphere.cpp
+++ b/apps/cpu/sphere/sphere.cpp
@@ -11,7 +11,7 @@ void run(string configname)
 
    try
    {
-      SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+      SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
 
       int myid = comm->getProcessID();
 
diff --git a/apps/cpu/stick/stick.cpp b/apps/cpu/stick/stick.cpp
index 62efec809..8bbc82000 100644
--- a/apps/cpu/stick/stick.cpp
+++ b/apps/cpu/stick/stick.cpp
@@ -19,7 +19,7 @@ void main()
       int numOfThreads = 4;
       double availMem = 10e9;
 
-      CommunicatorPtr comm = vf::mpi::MPICommunicator::getInstance();
+      CommunicatorPtr comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       double dx = 1;
diff --git a/apps/cpu/teperm/teperm.cpp b/apps/cpu/teperm/teperm.cpp
index 9c4c1585f..78e57d8ef 100644
--- a/apps/cpu/teperm/teperm.cpp
+++ b/apps/cpu/teperm/teperm.cpp
@@ -63,7 +63,7 @@ void run(string configname)
       int             chunk = config.getValue<int>("chunk");
 
 
-      SPtr<vf::mpi::Communicator> comm = vf::mpi::MPICommunicator::getInstance();
+      SPtr<vf::parallel::Communicator> comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       if (logToFile)
diff --git a/apps/cpu/town/town.cpp b/apps/cpu/town/town.cpp
index ccaf90f8d..7fcb83b31 100644
--- a/apps/cpu/town/town.cpp
+++ b/apps/cpu/town/town.cpp
@@ -18,7 +18,7 @@ void run(const char *cstr1, const char *cstr2)
       stringstream logFilename;
       double availMem = 0;
 
-      CommunicatorPtr comm = vf::mpi::MPICommunicator::getInstance();
+      CommunicatorPtr comm = vf::parallel::MPICommunicator::getInstance();
       int myid = comm->getProcessID();
 
       string machine = string(cstr1);
diff --git a/apps/gpu/ActuatorLine/ActuatorLine.cpp b/apps/gpu/ActuatorLine/ActuatorLine.cpp
index 558f15e19..2ec440fed 100644
--- a/apps/gpu/ActuatorLine/ActuatorLine.cpp
+++ b/apps/gpu/ActuatorLine/ActuatorLine.cpp
@@ -31,28 +31,25 @@
 //! \author Henry Korb, Henrik Asmuth
 //=======================================================================================
 #define _USE_MATH_DEFINES
-#include <math.h>
-#include <string>
-#include <sstream>
-#include <iostream>
-#include <stdexcept>
-#include <fstream>
+#include <cmath>
 #include <exception>
+#include <fstream>
+#include <iostream>
 #include <memory>
+#include <sstream>
+#include <stdexcept>
+#include <string>
 
 //////////////////////////////////////////////////////////////////////////
 
-#include "DataTypes.h"
-#include "PointerDefinitions.h"
-
-#include "StringUtilities/StringUtil.h"
-
-
-
+#include <basics/DataTypes.h>
+#include <basics/PointerDefinitions.h>
+#include <basics/StringUtilities/StringUtil.h>
 #include <basics/config/ConfigurationFile.h>
 
 #include <logger/Logger.h>
 
+#include <parallel/MPICommunicator.h>
 
 //////////////////////////////////////////////////////////////////////////
 
@@ -69,7 +66,6 @@
 //////////////////////////////////////////////////////////////////////////
 
 #include "VirtualFluids_GPU/LBM/Simulation.h"
-#include "VirtualFluids_GPU/Communication/MpiCommunicator.h"
 #include "VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.h"
 #include "VirtualFluids_GPU/DataStructureInitializer/GridProvider.h"
 #include "VirtualFluids_GPU/DataStructureInitializer/GridReaderFiles/GridReader.h"
@@ -105,7 +101,7 @@ std::string simulationName("ActuatorLine");
 
 void multipleLevel(const std::string& configPath)
 {
-    vf::gpu::Communicator& communicator = vf::gpu::MpiCommunicator::getInstance();
+    vf::parallel::Communicator &communicator = *vf::parallel::MPICommunicator::getInstance();
 
     vf::basics::ConfigurationFile config;
     config.load(configPath);
@@ -134,7 +130,7 @@ void multipleLevel(const std::string& configPath)
     const float tStartOutProbe      =  config.getValue<real>("tStartOutProbe");
     const float tOutProbe           =  config.getValue<real>("tOutProbe");
         
-    SPtr<Parameter> para = std::make_shared<Parameter>(communicator.getNumberOfProcess(), communicator.getPID(), &config);
+    SPtr<Parameter> para = std::make_shared<Parameter>(communicator.getNumberOfProcesses(), communicator.getProcessID(), &config);
     BoundaryConditionFactory bcFactory = BoundaryConditionFactory();
     GridScalingFactory scalingFactory  = GridScalingFactory();
 
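Note: the GPU apps change more than the namespace. vf::gpu::MpiCommunicator::getInstance() returned a reference, whereas vf::parallel::MPICommunicator::getInstance() returns a shared pointer, so each call site now dereferences it once into a local reference; getNumberOfProcess() and getPID() become getNumberOfProcesses() and getProcessID(); and the include block is tidied (cmath instead of math.h, basics headers addressed by module path, alphabetical order). A condensed sketch of the new setup, assuming Parameter and ConfigurationFile as included above:

#include <memory>

#include <parallel/MPICommunicator.h>

void setup(vf::basics::ConfigurationFile& config)
{
    // Dereference the shared instance once; the singleton outlives the setup,
    // so holding a reference is safe.
    vf::parallel::Communicator& communicator = *vf::parallel::MPICommunicator::getInstance();

    SPtr<Parameter> para = std::make_shared<Parameter>(communicator.getNumberOfProcesses(),
                                                       communicator.getProcessID(), &config);
}

The companion CMakeLists hunks drop the explicit MPI::MPI_CXX link from the app targets; presumably the MPI dependency now arrives transitively through the parallel library.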
diff --git a/apps/gpu/ActuatorLine/CMakeLists.txt b/apps/gpu/ActuatorLine/CMakeLists.txt
index e0ff4e06e..c437ac81a 100644
--- a/apps/gpu/ActuatorLine/CMakeLists.txt
+++ b/apps/gpu/ActuatorLine/CMakeLists.txt
@@ -1,6 +1,6 @@
 PROJECT(ActuatorLine LANGUAGES CUDA CXX)
 
-vf_add_library(BUILDTYPE binary PRIVATE_LINK basics VirtualFluids_GPU GridGenerator MPI::MPI_CXX FILES ActuatorLine.cpp)
+vf_add_library(BUILDTYPE binary PRIVATE_LINK basics VirtualFluids_GPU GridGenerator FILES ActuatorLine.cpp)
 
 set_source_files_properties(ActuatorLine.cpp PROPERTIES LANGUAGE CUDA)
 
diff --git a/apps/gpu/BoundaryLayer/BoundaryLayer.cpp b/apps/gpu/BoundaryLayer/BoundaryLayer.cpp
index 964f267a0..d62b3352d 100644
--- a/apps/gpu/BoundaryLayer/BoundaryLayer.cpp
+++ b/apps/gpu/BoundaryLayer/BoundaryLayer.cpp
@@ -31,28 +31,27 @@
 //! \author Henry Korb, Henrik Asmuth
 //=======================================================================================
 #define _USE_MATH_DEFINES
-#include <math.h>
-#include <string>
-#include <sstream>
-#include <iostream>
-#include <stdexcept>
-#include <fstream>
+#include <cmath>
 #include <exception>
+#include <fstream>
+#include <iostream>
 #include <memory>
 #include <numeric>
+#include <sstream>
+#include <stdexcept>
+#include <string>
 
 //////////////////////////////////////////////////////////////////////////
 
-#include "DataTypes.h"
-#include "PointerDefinitions.h"
-
-#include "StringUtilities/StringUtil.h"
-
+#include <basics/DataTypes.h>
+#include <basics/PointerDefinitions.h>
+#include <basics/StringUtilities/StringUtil.h>
 #include <basics/config/ConfigurationFile.h>
-#include "basics/constants/NumericConstants.h"
+#include <basics/constants/NumericConstants.h>
 
 #include <logger/Logger.h>
 
+#include <parallel/MPICommunicator.h>
 
 //////////////////////////////////////////////////////////////////////////
 
@@ -71,7 +70,6 @@
 //////////////////////////////////////////////////////////////////////////
 
 #include "VirtualFluids_GPU/LBM/Simulation.h"
-#include "VirtualFluids_GPU/Communication/MpiCommunicator.h"
 #include "VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.h"
 #include "VirtualFluids_GPU/DataStructureInitializer/GridProvider.h"
 #include "VirtualFluids_GPU/DataStructureInitializer/GridReaderFiles/GridReader.h"
@@ -102,19 +100,18 @@ using namespace vf::basics::constant;
 void multipleLevel(const std::string& configPath)
 {
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-    vf::gpu::Communicator& communicator = vf::gpu::MpiCommunicator::getInstance();
+    vf::parallel::Communicator &communicator = *vf::parallel::MPICommunicator::getInstance();
 
     vf::basics::ConfigurationFile config;
     config.load(configPath);
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////^
-    SPtr<Parameter> para = std::make_shared<Parameter>(communicator.getNumberOfProcess(), communicator.getPID(), &config);
+    SPtr<Parameter> para = std::make_shared<Parameter>(communicator.getNumberOfProcesses(), communicator.getProcessID(), &config);
     BoundaryConditionFactory bcFactory = BoundaryConditionFactory();
     GridScalingFactory scalingFactory  = GridScalingFactory();
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     
-    const int  nProcs = communicator.getNumberOfProcess();
-    const uint procID = vf::gpu::MpiCommunicator::getInstance().getPID();
+    const int nProcs = communicator.getNumberOfProcesses();
+    const uint procID = communicator.getProcessID();
     std::vector<uint> devices(10);
     std::iota(devices.begin(), devices.end(), 0);
     para->setDevices(devices);
@@ -422,7 +419,6 @@ void multipleLevel(const std::string& configPath)
         SPtr<PrecursorWriter> precursorWriter = std::make_shared<PrecursorWriter>("precursor", para->getOutputPath()+precursorDirectory, posXPrecursor, 0, L_y, 0, L_z, tStartPrecursor/dt, nTWritePrecursor, useDistributions? OutputVariable::Distributions: OutputVariable::Velocities, 1000);
         para->addProbe(precursorWriter);
     }
-
     auto cudaMemoryManager = std::make_shared<CudaMemoryManager>(para);
     auto gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemoryManager, communicator);
 
diff --git a/apps/gpu/BoundaryLayer/CMakeLists.txt b/apps/gpu/BoundaryLayer/CMakeLists.txt
index 801b63480..248c7a161 100644
--- a/apps/gpu/BoundaryLayer/CMakeLists.txt
+++ b/apps/gpu/BoundaryLayer/CMakeLists.txt
@@ -1,6 +1,6 @@
 PROJECT(BoundaryLayer LANGUAGES CUDA CXX)
 
-vf_add_library(BUILDTYPE binary PRIVATE_LINK basics VirtualFluids_GPU GridGenerator MPI::MPI_CXX FILES BoundaryLayer.cpp)
+vf_add_library(BUILDTYPE binary PRIVATE_LINK basics VirtualFluids_GPU GridGenerator FILES BoundaryLayer.cpp)
 
 set_source_files_properties(BoundaryLayer.cpp PROPERTIES LANGUAGE CUDA)
 
diff --git a/apps/gpu/ChannelFlow/CMakeLists.txt b/apps/gpu/ChannelFlow/CMakeLists.txt
index f5b1bfd40..3884074e0 100644
--- a/apps/gpu/ChannelFlow/CMakeLists.txt
+++ b/apps/gpu/ChannelFlow/CMakeLists.txt
@@ -1,6 +1,6 @@
 PROJECT(ChannelFlow LANGUAGES CUDA CXX)
 
-vf_add_library(BUILDTYPE binary PRIVATE_LINK basics VirtualFluids_GPU GridGenerator MPI::MPI_CXX FILES ChannelFlow.cpp)
+vf_add_library(BUILDTYPE binary PRIVATE_LINK basics VirtualFluids_GPU GridGenerator FILES ChannelFlow.cpp)
 
 set_source_files_properties(ChannelFlow.cpp PROPERTIES LANGUAGE CUDA)
 
diff --git a/apps/gpu/ChannelFlow/ChannelFlow.cpp b/apps/gpu/ChannelFlow/ChannelFlow.cpp
index 3279caba1..bac6f4b5a 100644
--- a/apps/gpu/ChannelFlow/ChannelFlow.cpp
+++ b/apps/gpu/ChannelFlow/ChannelFlow.cpp
@@ -41,8 +41,6 @@
 #include <stdexcept>
 #include <string>
 
-#include "mpi.h"
-
 //////////////////////////////////////////////////////////////////////////
 
 #include "DataTypes.h"
@@ -64,7 +62,6 @@
 //////////////////////////////////////////////////////////////////////////
 
 #include "VirtualFluids_GPU/BoundaryConditions/BoundaryConditionFactory.h"
-#include "VirtualFluids_GPU/Communication/MpiCommunicator.h"
 #include "VirtualFluids_GPU/DataStructureInitializer/GridProvider.h"
 #include "VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.h"
 #include "VirtualFluids_GPU/GPU/CudaMemoryManager.h"
@@ -75,6 +72,8 @@
 
 //////////////////////////////////////////////////////////////////////////
 
+#include <parallel/MPICommunicator.h>
+
 int main(int argc, char *argv[])
 {
     try {
@@ -94,21 +93,21 @@ int main(int argc, char *argv[])
         // setup simulation parameters (without config file)
         //////////////////////////////////////////////////////////////////////////
 
-        vf::gpu::Communicator &communicator = vf::gpu::MpiCommunicator::getInstance();
-        const int numberOfProcesses = communicator.getNumberOfProcess();
-        SPtr<Parameter> para = std::make_shared<Parameter>(numberOfProcesses, communicator.getPID());
+        vf::parallel::Communicator &communicator = *vf::parallel::MPICommunicator::getInstance();
+        const int numberOfProcesses = communicator.getNumberOfProcesses();
+        SPtr<Parameter> para = std::make_shared<Parameter>(numberOfProcesses, communicator.getProcessID());
         std::vector<uint> devices(10);
         std::iota(devices.begin(), devices.end(), 0);
         para->setDevices(devices);
-        para->setMaxDev(communicator.getNumberOfProcess());
+        para->setMaxDev(communicator.getNumberOfProcesses());
         BoundaryConditionFactory bcFactory = BoundaryConditionFactory();
 
         //////////////////////////////////////////////////////////////////////////
         // setup logger
         //////////////////////////////////////////////////////////////////////////
-
+        const auto pid = communicator.getProcessID();
         vf::logging::Logger::changeLogPath("output/vflog_process" +
-                                           std::to_string(vf::gpu::MpiCommunicator::getInstance().getPID()) + ".txt");
+                                           std::to_string(pid) + ".txt");
         vf::logging::Logger::initializeLogger();
 
         //////////////////////////////////////////////////////////////////////////
@@ -150,7 +149,7 @@ int main(int argc, char *argv[])
         para->setOutputPrefix("ChannelFlow");
         para->setMainKernel(vf::CollisionKernel::Compressible::CumulantK17);
 
-        const uint generatePart = vf::gpu::MpiCommunicator::getInstance().getPID();
+        const uint generatePart = pid;
         real overlap = (real)8.0 * dx;
 
         if (numberOfProcesses > 1) {
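Note: ChannelFlow also stops calling getInstance() repeatedly — the rank is fetched once into pid and reused for the per-process log path and for picking this rank's grid part. The pattern in isolation (communicator as obtained at the top of main above):

const auto pid = communicator.getProcessID();
vf::logging::Logger::changeLogPath("output/vflog_process" + std::to_string(pid) + ".txt");
vf::logging::Logger::initializeLogger();
const uint generatePart = pid; // each process generates its own slice of the grid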
diff --git a/apps/gpu/DrivenCavity/CMakeLists.txt b/apps/gpu/DrivenCavity/CMakeLists.txt
index 5ad927a42..8646f29c1 100644
--- a/apps/gpu/DrivenCavity/CMakeLists.txt
+++ b/apps/gpu/DrivenCavity/CMakeLists.txt
@@ -2,4 +2,4 @@ PROJECT(DrivenCavity LANGUAGES CXX)
 
 #LIST(APPEND CS_COMPILER_FLAGS_CXX "-DOMPI_SKIP_MPICXX" )
 
-vf_add_library(BUILDTYPE binary PRIVATE_LINK basics VirtualFluids_GPU GridGenerator MPI::MPI_CXX FILES DrivenCavity.cpp)
+vf_add_library(BUILDTYPE binary PRIVATE_LINK basics VirtualFluids_GPU GridGenerator FILES DrivenCavity.cpp)
diff --git a/apps/gpu/DrivenCavity/DrivenCavity.cpp b/apps/gpu/DrivenCavity/DrivenCavity.cpp
index db92f80f1..807d7dc31 100644
--- a/apps/gpu/DrivenCavity/DrivenCavity.cpp
+++ b/apps/gpu/DrivenCavity/DrivenCavity.cpp
@@ -46,6 +46,8 @@
 
 #include <logger/Logger.h>
 
+#include <parallel/MPICommunicator.h>
+
 //////////////////////////////////////////////////////////////////////////
 
 #include "GridGenerator/grid/BoundaryConditions/Side.h"
@@ -57,7 +59,6 @@
 
 #include "VirtualFluids_GPU/Factories/BoundaryConditionFactory.h"
 #include "VirtualFluids_GPU/Factories/GridScalingFactory.h"
-#include "VirtualFluids_GPU/Communication/MpiCommunicator.h"
 #include "VirtualFluids_GPU/DataStructureInitializer/GridProvider.h"
 #include "VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.h"
 #include "VirtualFluids_GPU/GPU/CudaMemoryManager.h"
@@ -155,7 +156,7 @@ int main()
         // set copy mesh to simulation
         //////////////////////////////////////////////////////////////////////////
 
-        vf::gpu::Communicator &communicator = vf::gpu::MpiCommunicator::getInstance();
+        vf::parallel::Communicator &communicator = *vf::parallel::MPICommunicator::getInstance();
 
         auto cudaMemoryManager = std::make_shared<CudaMemoryManager>(para);
         SPtr<GridProvider> gridGenerator =
diff --git a/apps/gpu/DrivenCavityMultiGPU/CMakeLists.txt b/apps/gpu/DrivenCavityMultiGPU/CMakeLists.txt
index 0043a271f..6460abb1c 100644
--- a/apps/gpu/DrivenCavityMultiGPU/CMakeLists.txt
+++ b/apps/gpu/DrivenCavityMultiGPU/CMakeLists.txt
@@ -1,3 +1,3 @@
 PROJECT(DrivenCavityMultiGPU LANGUAGES CXX)
 
-vf_add_library(BUILDTYPE binary PRIVATE_LINK basics VirtualFluids_GPU GridGenerator MPI::MPI_CXX FILES DrivenCavityMultiGPU.cpp)
+vf_add_library(BUILDTYPE binary PRIVATE_LINK basics VirtualFluids_GPU GridGenerator FILES DrivenCavityMultiGPU.cpp)
diff --git a/apps/gpu/DrivenCavityMultiGPU/DrivenCavityMultiGPU.cpp b/apps/gpu/DrivenCavityMultiGPU/DrivenCavityMultiGPU.cpp
index 2a829fd75..a2f0f087e 100755
--- a/apps/gpu/DrivenCavityMultiGPU/DrivenCavityMultiGPU.cpp
+++ b/apps/gpu/DrivenCavityMultiGPU/DrivenCavityMultiGPU.cpp
@@ -1,14 +1,12 @@
 #define _USE_MATH_DEFINES
 #include <exception>
+#include <filesystem>
 #include <fstream>
 #include <iostream>
 #include <memory>
 #include <sstream>
 #include <stdexcept>
 #include <string>
-#include <filesystem>
-
-#include "mpi.h"
 
 //////////////////////////////////////////////////////////////////////////
 
@@ -35,7 +33,6 @@
 
 //////////////////////////////////////////////////////////////////////////
 
-#include "VirtualFluids_GPU/Communication/MpiCommunicator.h"
 #include "VirtualFluids_GPU/DataStructureInitializer/GridProvider.h"
 #include "VirtualFluids_GPU/DataStructureInitializer/GridReaderFiles/GridReader.h"
 #include "VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.h"
@@ -52,7 +49,7 @@
 #include "VirtualFluids_GPU/GPU/CudaMemoryManager.h"
 
 //////////////////////////////////////////////////////////////////////////
-
+#include <parallel/MPICommunicator.h>
 #include "utilities/communication.h"
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -61,11 +58,11 @@
 
 void runVirtualFluids(const vf::basics::ConfigurationFile& config)
 {
-    vf::gpu::Communicator& communicator = vf::gpu::MpiCommunicator::getInstance();
+    vf::parallel::Communicator &communicator = *vf::parallel::MPICommunicator::getInstance();
 
     auto gridBuilder = std::make_shared<MultipleGridBuilder>();
 
-    SPtr<Parameter> para = std::make_shared<Parameter>(communicator.getNumberOfProcess(), communicator.getPID(), &config);
+    SPtr<Parameter> para = std::make_shared<Parameter>(communicator.getNumberOfProcesses(), communicator.getProcessID(), &config);
     BoundaryConditionFactory bcFactory = BoundaryConditionFactory();
     GridScalingFactory scalingFactory = GridScalingFactory();
 
@@ -142,7 +139,7 @@ void runVirtualFluids(const vf::basics::ConfigurationFile& config)
 
         if (para->getNumprocs() > 1) {
 
-            const uint generatePart = vf::gpu::MpiCommunicator::getInstance().getPID();
+            const uint generatePart = communicator.getProcessID();
             real overlap            = (real)8.0 * dxGrid;
             gridBuilder->setNumberOfLayers(10, 8);
 
@@ -150,7 +147,7 @@ void runVirtualFluids(const vf::basics::ConfigurationFile& config)
             const real ySplit = 0.0;
             const real zSplit = 0.0;
 
-            if (communicator.getNumberOfProcess() == 2) {
+            if (communicator.getNumberOfProcesses() == 2) {
 
                 if (generatePart == 0) {
                     gridBuilder->addCoarseGrid(xGridMin, yGridMin, zGridMin, xGridMax, yGridMax, zSplit + overlap,
@@ -197,7 +194,7 @@ void runVirtualFluids(const vf::basics::ConfigurationFile& config)
                 gridBuilder->setVelocityBoundaryCondition(SideType::PX, 0.0, 0.0, 0.0);
                 gridBuilder->setVelocityBoundaryCondition(SideType::PY, 0.0, 0.0, 0.0);
                 //////////////////////////////////////////////////////////////////////////
-            } else if (communicator.getNumberOfProcess() == 4) {
+            } else if (communicator.getNumberOfProcesses() == 4) {
 
                 if (generatePart == 0) {
                     gridBuilder->addCoarseGrid(xGridMin, yGridMin, zGridMin, xSplit + overlap, yGridMax,
@@ -281,7 +278,7 @@ void runVirtualFluids(const vf::basics::ConfigurationFile& config)
                     gridBuilder->setVelocityBoundaryCondition(SideType::PX, 0.0, 0.0, 0.0);
                 }
                 //////////////////////////////////////////////////////////////////////////
-            } else if (communicator.getNumberOfProcess() == 8) {
+            } else if (communicator.getNumberOfProcesses() == 8) {
 
                 if (generatePart == 0) {
                     gridBuilder->addCoarseGrid(xGridMin, yGridMin, zGridMin, xSplit + overlap, ySplit + overlap,
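Note: the multi-GPU apps branch on the process count to pick a domain decomposition, so the renamed accessors also drive the dispatch. Compressed to its skeleton (uint from basics/DataTypes.h, split coordinates and grid calls elided):

#include <basics/DataTypes.h>
#include <parallel/MPICommunicator.h>

void buildSubdomains()
{
    vf::parallel::Communicator& communicator = *vf::parallel::MPICommunicator::getInstance();
    const uint generatePart = communicator.getProcessID(); // this rank's part of the grid

    if (communicator.getNumberOfProcesses() == 2) {
        // one split plane: rank generatePart builds half the domain plus an overlap layer
    } else if (communicator.getNumberOfProcesses() == 4) {
        // two split planes, four subdomains
    } else if (communicator.getNumberOfProcesses() == 8) {
        // three split planes, eight subdomains
    }
}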
diff --git a/apps/gpu/DrivenCavityUniform/CMakeLists.txt b/apps/gpu/DrivenCavityUniform/CMakeLists.txt
index f5be8b190..78d6d693a 100644
--- a/apps/gpu/DrivenCavityUniform/CMakeLists.txt
+++ b/apps/gpu/DrivenCavityUniform/CMakeLists.txt
@@ -2,4 +2,4 @@ PROJECT(DrivenCavityUniform LANGUAGES CXX)
 
 #LIST(APPEND CS_COMPILER_FLAGS_CXX "-DOMPI_SKIP_MPICXX" )
 
-vf_add_library(BUILDTYPE binary PRIVATE_LINK basics VirtualFluids_GPU GridGenerator MPI::MPI_CXX FILES DrivenCavity.cpp)
+vf_add_library(BUILDTYPE binary PRIVATE_LINK basics VirtualFluids_GPU GridGenerator FILES DrivenCavity.cpp)
diff --git a/apps/gpu/DrivenCavityUniform/DrivenCavity.cpp b/apps/gpu/DrivenCavityUniform/DrivenCavity.cpp
index c9857b503..4d73fedee 100644
--- a/apps/gpu/DrivenCavityUniform/DrivenCavity.cpp
+++ b/apps/gpu/DrivenCavityUniform/DrivenCavity.cpp
@@ -42,9 +42,11 @@
 //////////////////////////////////////////////////////////////////////////
 
 #include <basics/DataTypes.h>
+#include <basics/PointerDefinitions.h>
+
 #include <logger/Logger.h>
 
-#include "PointerDefinitions.h"
+#include <parallel/MPICommunicator.h>
 
 //////////////////////////////////////////////////////////////////////////
 
@@ -57,7 +59,6 @@
 
 #include "VirtualFluids_GPU/Factories/BoundaryConditionFactory.h"
 #include "VirtualFluids_GPU/Factories/GridScalingFactory.h"
-#include "VirtualFluids_GPU/Communication/MpiCommunicator.h"
 #include "VirtualFluids_GPU/DataStructureInitializer/GridProvider.h"
 #include "VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.h"
 #include "VirtualFluids_GPU/GPU/CudaMemoryManager.h"
@@ -66,6 +67,7 @@
 #include "VirtualFluids_GPU/Parameter/Parameter.h"
 #include "VirtualFluids_GPU/Kernel/Utilities/KernelTypes.h"
 
+
 //////////////////////////////////////////////////////////////////////////
 
 int main()
@@ -157,13 +159,12 @@ int main()
         // set copy mesh to simulation
         //////////////////////////////////////////////////////////////////////////
 
-        vf::gpu::Communicator &communicator = vf::gpu::MpiCommunicator::getInstance();
+        vf::parallel::Communicator &communicator = *vf::parallel::MPICommunicator::getInstance();
 
         auto cudaMemoryManager = std::make_shared<CudaMemoryManager>(para);
         SPtr<GridProvider> gridGenerator =
             GridProvider::makeGridGenerator(gridBuilder, para, cudaMemoryManager, communicator);
 
-
         //////////////////////////////////////////////////////////////////////////
         // run simulation
         //////////////////////////////////////////////////////////////////////////
diff --git a/apps/gpu/MusselOyster/CMakeLists.txt b/apps/gpu/MusselOyster/CMakeLists.txt
index db61e916c..966c802b2 100644
--- a/apps/gpu/MusselOyster/CMakeLists.txt
+++ b/apps/gpu/MusselOyster/CMakeLists.txt
@@ -1,3 +1,3 @@
 PROJECT(MusselOyster LANGUAGES CXX)
 
-vf_add_library(BUILDTYPE binary PRIVATE_LINK basics VirtualFluids_GPU GridGenerator MPI::MPI_CXX FILES MusselOyster.cpp)
+vf_add_library(BUILDTYPE binary PRIVATE_LINK basics VirtualFluids_GPU GridGenerator FILES MusselOyster.cpp)
diff --git a/apps/gpu/MusselOyster/MusselOyster.cpp b/apps/gpu/MusselOyster/MusselOyster.cpp
index e1d501ee8..8ee16a231 100644
--- a/apps/gpu/MusselOyster/MusselOyster.cpp
+++ b/apps/gpu/MusselOyster/MusselOyster.cpp
@@ -36,7 +36,6 @@
 
 //////////////////////////////////////////////////////////////////////////
 
-#include "VirtualFluids_GPU/Communication/MpiCommunicator.h"
 #include "VirtualFluids_GPU/DataStructureInitializer/GridProvider.h"
 #include "VirtualFluids_GPU/DataStructureInitializer/GridReaderFiles/GridReader.h"
 #include "VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.h"
@@ -50,6 +49,7 @@
 //////////////////////////////////////////////////////////////////////////
 
 #include "utilities/communication.h"
+#include <parallel/MPICommunicator.h>
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -85,12 +85,12 @@ const std::string simulationName("MusselOyster");
 
 void runVirtualFluids(const vf::basics::ConfigurationFile& config)
 {
-    vf::gpu::Communicator &communicator = vf::gpu::MpiCommunicator::getInstance();
+    vf::parallel::Communicator &communicator = *vf::parallel::MPICommunicator::getInstance();
 
     auto gridBuilder = std::make_shared<MultipleGridBuilder>();
 
     SPtr<Parameter> para =
-        std::make_shared<Parameter>(communicator.getNumberOfProcess(), communicator.getPID(), &config);
+        std::make_shared<Parameter>(communicator.getNumberOfProcesses(), communicator.getProcessID(), &config);
     BoundaryConditionFactory bcFactory = BoundaryConditionFactory();
 
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -188,12 +188,12 @@ void runVirtualFluids(const vf::basics::ConfigurationFile& config)
             bivalveRef_1_STL = std::make_shared<TriangularMesh>(stlPath + bivalveType + "_Level1.stl");
 
         if (para->getNumprocs() > 1) {
-            const uint generatePart = vf::gpu::MpiCommunicator::getInstance().getPID();
+            const uint generatePart = communicator.getProcessID();
 
             real overlap = (real)8.0 * dxGrid;
             gridBuilder->setNumberOfLayers(10, 8);
 
-            if (communicator.getNumberOfProcess() == 2) {
+            if (communicator.getNumberOfProcesses() == 2) {
                 const real zSplit = 0.0; // round(((double)bbzp + bbzm) * 0.5);
 
                 if (generatePart == 0) {
@@ -244,7 +244,7 @@ void runVirtualFluids(const vf::basics::ConfigurationFile& config)
                 gridBuilder->setVelocityBoundaryCondition(SideType::GEOMETRY, 0.0, 0.0, 0.0);
                 gridBuilder->setPressureBoundaryCondition(SideType::PX, 0.0); // set pressure BC after velocity BCs
                 //////////////////////////////////////////////////////////////////////////
-            } else if (communicator.getNumberOfProcess() == 4) {
+            } else if (communicator.getNumberOfProcesses() == 4) {
 
                 const real xSplit = 100.0;
                 const real zSplit = 0.0;
@@ -334,7 +334,7 @@ void runVirtualFluids(const vf::basics::ConfigurationFile& config)
                     gridBuilder->setPressureBoundaryCondition(SideType::PX, 0.0); // set pressure BC after velocity BCs
                 }
                 //////////////////////////////////////////////////////////////////////////
-            } else if (communicator.getNumberOfProcess() == 8) {
+            } else if (communicator.getNumberOfProcesses() == 8) {
                 real xSplit = 140.0; // 100.0 // mit groesserem Level 1 140.0
                 real ySplit = 32.0;  // 32.0
                 real zSplit = 0.0;
diff --git a/apps/gpu/SphereGPU/CMakeLists.txt b/apps/gpu/SphereGPU/CMakeLists.txt
index 3f718aac7..1a162d6d2 100644
--- a/apps/gpu/SphereGPU/CMakeLists.txt
+++ b/apps/gpu/SphereGPU/CMakeLists.txt
@@ -1,3 +1,3 @@
 PROJECT(SphereGPU LANGUAGES CXX)
 
-vf_add_library(BUILDTYPE binary PRIVATE_LINK basics VirtualFluids_GPU GridGenerator MPI::MPI_CXX FILES Sphere.cpp)
+vf_add_library(BUILDTYPE binary PRIVATE_LINK basics VirtualFluids_GPU GridGenerator FILES Sphere.cpp)
diff --git a/apps/gpu/SphereGPU/Sphere.cpp b/apps/gpu/SphereGPU/Sphere.cpp
index f7225b521..40cc55df4 100644
--- a/apps/gpu/SphereGPU/Sphere.cpp
+++ b/apps/gpu/SphereGPU/Sphere.cpp
@@ -41,11 +41,14 @@
 #include <string>
 
 //////////////////////////////////////////////////////////////////////////
-#include <basics/PointerDefinitions.h>
 #include <basics/DataTypes.h>
-#include <logger/Logger.h>
+#include <basics/PointerDefinitions.h>
 #include <basics/config/ConfigurationFile.h>
 
+#include <logger/Logger.h>
+
+#include <parallel/MPICommunicator.h>
+
 //////////////////////////////////////////////////////////////////////////
 
 #include "GridGenerator/grid/BoundaryConditions/Side.h"
@@ -60,7 +63,6 @@
 #include "VirtualFluids_GPU/DataStructureInitializer/GridProvider.h"
 #include "VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.h"
 #include "VirtualFluids_GPU/GPU/CudaMemoryManager.h"
-#include "VirtualFluids_GPU/Communication/MpiCommunicator.h"
 #include "VirtualFluids_GPU/LBM/Simulation.h"
 #include "VirtualFluids_GPU/Output/FileWriter.h"
 #include "VirtualFluids_GPU/Parameter/Parameter.h"
@@ -69,7 +71,6 @@
 #include "VirtualFluids_GPU/PreCollisionInteractor/Probes/PointProbe.h"
 #include "VirtualFluids_GPU/PreCollisionInteractor/Probes/PlaneProbe.h"
 
-
 int main(int argc, char *argv[])
 {
     try {
@@ -215,7 +216,7 @@ int main(int argc, char *argv[])
         //////////////////////////////////////////////////////////////////////////
         // setup to copy mesh to simulation
         //////////////////////////////////////////////////////////////////////////
-        vf::gpu::Communicator& communicator = vf::gpu::MpiCommunicator::getInstance();
+        vf::parallel::Communicator &communicator = *vf::parallel::MPICommunicator::getInstance();
         auto cudaMemoryManager = std::make_shared<CudaMemoryManager>(para);
         SPtr<GridProvider> gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemoryManager, communicator);
 
diff --git a/apps/gpu/SphereRefined/CMakeLists.txt b/apps/gpu/SphereRefined/CMakeLists.txt
index 9ede990de..87432021b 100644
--- a/apps/gpu/SphereRefined/CMakeLists.txt
+++ b/apps/gpu/SphereRefined/CMakeLists.txt
@@ -2,7 +2,7 @@ PROJECT(SphereRefined LANGUAGES CUDA CXX)
 
 #LIST(APPEND CS_COMPILER_FLAGS_CXX "-DOMPI_SKIP_MPICXX" )
 
-vf_add_library(BUILDTYPE binary PRIVATE_LINK basics VirtualFluids_GPU GridGenerator MPI::MPI_CXX FILES SphereRefined.cpp)
+vf_add_library(BUILDTYPE binary PRIVATE_LINK basics VirtualFluids_GPU GridGenerator FILES SphereRefined.cpp)
 
 set_source_files_properties(SphereRefined.cpp PROPERTIES LANGUAGE CUDA)
 
diff --git a/apps/gpu/SphereRefined/SphereRefined.cpp b/apps/gpu/SphereRefined/SphereRefined.cpp
index f91f24766..a66518595 100644
--- a/apps/gpu/SphereRefined/SphereRefined.cpp
+++ b/apps/gpu/SphereRefined/SphereRefined.cpp
@@ -60,7 +60,6 @@
 
 #include "VirtualFluids_GPU/Factories/BoundaryConditionFactory.h"
 #include "VirtualFluids_GPU/Factories/GridScalingFactory.h"
-#include "VirtualFluids_GPU/Communication/MpiCommunicator.h"
 #include "VirtualFluids_GPU/DataStructureInitializer/GridProvider.h"
 #include "VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.h"
 #include "VirtualFluids_GPU/GPU/CudaMemoryManager.h"
@@ -70,12 +69,14 @@
 #include "VirtualFluids_GPU/Factories/GridScalingFactory.h"
 #include "VirtualFluids_GPU/Kernel/Utilities/KernelTypes.h"
 
+#include <parallel/MPICommunicator.h>
+
 //////////////////////////////////////////////////////////////////////////
 
 int main()
 {
     try {
-        vf::gpu::Communicator &communicator = vf::gpu::MpiCommunicator::getInstance();
+        vf::parallel::Communicator &communicator = *vf::parallel::MPICommunicator::getInstance();
         vf::logging::Logger::initializeLogger();
         //////////////////////////////////////////////////////////////////////////
         // Simulation parameters
diff --git a/apps/gpu/SphereScaling/CMakeLists.txt b/apps/gpu/SphereScaling/CMakeLists.txt
index db3747f2b..7d2d77221 100644
--- a/apps/gpu/SphereScaling/CMakeLists.txt
+++ b/apps/gpu/SphereScaling/CMakeLists.txt
@@ -1,6 +1,6 @@
 PROJECT(SphereScaling LANGUAGES CUDA CXX)
 
-vf_add_library(BUILDTYPE binary PRIVATE_LINK basics VirtualFluids_GPU GridGenerator MPI::MPI_CXX FILES SphereScaling.cpp)
+vf_add_library(BUILDTYPE binary PRIVATE_LINK basics VirtualFluids_GPU GridGenerator FILES SphereScaling.cpp)
 
 set_source_files_properties(SphereScaling.cpp PROPERTIES LANGUAGE CUDA)
 
diff --git a/apps/gpu/SphereScaling/SphereScaling.cpp b/apps/gpu/SphereScaling/SphereScaling.cpp
index 248f2074e..eab466a14 100755
--- a/apps/gpu/SphereScaling/SphereScaling.cpp
+++ b/apps/gpu/SphereScaling/SphereScaling.cpp
@@ -1,15 +1,13 @@
 #define _USE_MATH_DEFINES
+#include <cmath>
 #include <exception>
+#include <filesystem>
 #include <fstream>
 #include <iostream>
-#include <math.h>
 #include <memory>
 #include <sstream>
 #include <stdexcept>
 #include <string>
-#include <filesystem>
-
-#include "mpi.h"
 
 //////////////////////////////////////////////////////////////////////////
 
@@ -38,7 +36,6 @@
 
 //////////////////////////////////////////////////////////////////////////
 
-#include "VirtualFluids_GPU/Communication/MpiCommunicator.h"
 #include "VirtualFluids_GPU/DataStructureInitializer/GridProvider.h"
 #include "VirtualFluids_GPU/DataStructureInitializer/GridReaderFiles/GridReader.h"
 #include "VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.h"
@@ -57,6 +54,7 @@
 //////////////////////////////////////////////////////////////////////////
 
 #include "utilities/communication.h"
+#include <parallel/MPICommunicator.h>
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -64,9 +62,9 @@
 
 void runVirtualFluids(const vf::basics::ConfigurationFile& config)
 {
-    vf::gpu::Communicator& communicator = vf::gpu::MpiCommunicator::getInstance();
+    vf::parallel::Communicator &communicator = *vf::parallel::MPICommunicator::getInstance();
 
-    SPtr<Parameter> para = std::make_shared<Parameter>(communicator.getNumberOfProcess(), communicator.getPID(), &config);
+    SPtr<Parameter> para = std::make_shared<Parameter>(communicator.getNumberOfProcesses(), communicator.getProcessID(), &config);
     BoundaryConditionFactory bcFactory = BoundaryConditionFactory();
     GridScalingFactory scalingFactory = GridScalingFactory();
 
@@ -166,12 +164,12 @@ void runVirtualFluids(const vf::basics::ConfigurationFile& config)
         const real dCubeLev1   = 72.0; // Phoenix: 72.0
 
         if (para->getNumprocs() > 1) {
-            const uint generatePart = vf::gpu::MpiCommunicator::getInstance().getPID();
+            const uint generatePart = communicator.getProcessID();
 
             real overlap = (real)8.0 * dxGrid;
             gridBuilder->setNumberOfLayers(10, 8);
 
-            if (communicator.getNumberOfProcess() == 2) {
+            if (communicator.getNumberOfProcesses() == 2) {
                 real zSplit = 0.5 * sideLengthCube;
 
                 if (scalingType == "weak") {
@@ -245,7 +243,7 @@ void runVirtualFluids(const vf::basics::ConfigurationFile& config)
                 // gridBuilder->setVelocityBoundaryCondition(SideType::GEOMETRY, 0.0, 0.0, 0.0);
                 //////////////////////////////////////////////////////////////////////////
 
-            } else if (communicator.getNumberOfProcess() == 4) {
+            } else if (communicator.getNumberOfProcesses() == 4) {
                 real ySplit = 0.5 * sideLengthCube;
                 real zSplit = 0.5 * sideLengthCube;
 
@@ -361,7 +359,7 @@ void runVirtualFluids(const vf::basics::ConfigurationFile& config)
                 gridBuilder->setPressureBoundaryCondition(SideType::PX, 0.0); // set pressure BC after velocity BCs
                 // gridBuilder->setVelocityBoundaryCondition(SideType::GEOMETRY, 0.0, 0.0, 0.0);
                 //////////////////////////////////////////////////////////////////////////
-            } else if (communicator.getNumberOfProcess() == 8) {
+            } else if (communicator.getNumberOfProcesses() == 8) {
                 real xSplit = 0.5 * sideLengthCube;
                 real ySplit = 0.5 * sideLengthCube;
                 real zSplit = 0.5 * sideLengthCube;
diff --git a/apps/gpu/TGV_3D/TGV_3D.cpp b/apps/gpu/TGV_3D/TGV_3D.cpp
index 3ec7ac651..0a2d47dfc 100644
--- a/apps/gpu/TGV_3D/TGV_3D.cpp
+++ b/apps/gpu/TGV_3D/TGV_3D.cpp
@@ -35,14 +35,12 @@
 #include <filesystem>
 #include <fstream>
 #include <iostream>
-#include <math.h>
+#include <cmath>
 #include <memory>
 #include <sstream>
 #include <stdexcept>
 #include <string>
 
-#include "mpi.h"
-
 //////////////////////////////////////////////////////////////////////////
 
 #include "DataTypes.h"
@@ -66,7 +64,6 @@
 
 //////////////////////////////////////////////////////////////////////////
 
-#include "VirtualFluids_GPU/Communication/MpiCommunicator.h"
 #include "VirtualFluids_GPU/DataStructureInitializer/GridProvider.h"
 #include "VirtualFluids_GPU/DataStructureInitializer/GridReaderFiles/GridReader.h"
 #include "VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.h"
@@ -76,7 +73,7 @@
 #include "VirtualFluids_GPU/Output/FileWriter.h"
 #include "VirtualFluids_GPU/Parameter/Parameter.h"
 
-
+#include <parallel/MPICommunicator.h>
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -131,13 +128,11 @@ std::string simulationName("TGV_3D");
 
 void multipleLevel(const std::string& configPath)
 {
-    vf::gpu::Communicator& communicator = vf::gpu::MpiCommunicator::getInstance();
-
-    
+    vf::parallel::Communicator &communicator = *vf::parallel::MPICommunicator::getInstance();
 
     vf::basics::ConfigurationFile config;
     config.load(configPath);
-    SPtr<Parameter> para = std::make_shared<Parameter>(communicator.getNumberOfProcess(), communicator.getPID(), &config);
+    SPtr<Parameter> para = std::make_shared<Parameter>(communicator.getNumberOfProcesses(), communicator.getProcessID(), &config);
     BoundaryConditionFactory bcFactory = BoundaryConditionFactory();
 
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/apps/gpu/WTG_RUB/CMakeLists.txt b/apps/gpu/WTG_RUB/CMakeLists.txt
index 606987dfb..d67ec1c07 100644
--- a/apps/gpu/WTG_RUB/CMakeLists.txt
+++ b/apps/gpu/WTG_RUB/CMakeLists.txt
@@ -1,6 +1,6 @@
 PROJECT(WTG_RUB LANGUAGES CUDA CXX)
 
-vf_add_library(BUILDTYPE binary PRIVATE_LINK basics VirtualFluids_GPU GridGenerator MPI::MPI_CXX FILES WTG_RUB.cpp)
+vf_add_library(BUILDTYPE binary PRIVATE_LINK basics VirtualFluids_GPU GridGenerator FILES WTG_RUB.cpp)
 
 set_source_files_properties(WTG_RUB.cpp PROPERTIES LANGUAGE CUDA)
 
diff --git a/apps/gpu/WTG_RUB/WTG_RUB.cpp b/apps/gpu/WTG_RUB/WTG_RUB.cpp
index 00fe00a24..3d3283b2b 100644
--- a/apps/gpu/WTG_RUB/WTG_RUB.cpp
+++ b/apps/gpu/WTG_RUB/WTG_RUB.cpp
@@ -31,7 +31,7 @@
 //! \author Martin Schoenherr
 //=======================================================================================
 #define _USE_MATH_DEFINES
-#include <math.h>
+#include <cmath>
 #include <string>
 #include <sstream>
 #include <iostream>
@@ -41,8 +41,6 @@
 #include <memory>
 #include <filesystem>
 
-#include "mpi.h"
-
 //////////////////////////////////////////////////////////////////////////
 
 #include "DataTypes.h"
@@ -68,7 +66,6 @@
 //////////////////////////////////////////////////////////////////////////
 
 #include "VirtualFluids_GPU/LBM/Simulation.h"
-#include "VirtualFluids_GPU/Communication/MpiCommunicator.h"
 #include "VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.h"
 #include "VirtualFluids_GPU/DataStructureInitializer/GridProvider.h"
 #include "VirtualFluids_GPU/DataStructureInitializer/GridReaderFiles/GridReader.h"
@@ -77,6 +74,7 @@
 #include "VirtualFluids_GPU/GPU/CudaMemoryManager.h"
 #include "VirtualFluids_GPU/Factories/BoundaryConditionFactory.h"
 
+#include <parallel/MPICommunicator.h>
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -129,11 +127,8 @@ std::string chooseVariation();
 
 void multipleLevel(const std::string& configPath)
 {
-    vf::gpu::Communicator& communicator = vf::gpu::MpiCommunicator::getInstance();
-
-    auto gridFactory = GridFactory::make();
-    gridFactory->setTriangularMeshDiscretizationMethod(TriangularMeshDiscretizationMethod::POINT_IN_OBJECT);
-    auto gridBuilder = MultipleGridBuilder::makeShared(gridFactory);
+    vf::parallel::Communicator &communicator = *vf::parallel::MPICommunicator::getInstance();
+    auto gridBuilder = MultipleGridBuilder::makeShared();
 
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -218,7 +213,7 @@ void multipleLevel(const std::string& configPath)
     vf::basics::ConfigurationFile config;
     config.load(configPath);
 
-    SPtr<Parameter> para = std::make_shared<Parameter>(communicator.getNumberOfProcess(), communicator.getPID(), &config);
+    SPtr<Parameter> para = std::make_shared<Parameter>(communicator.getNumberOfProcesses(), communicator.getProcessID(), &config);
     BoundaryConditionFactory bcFactory = BoundaryConditionFactory();
 
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
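Note: WTG_RUB additionally drops the explicit GridFactory — the point-in-object discretization method is no longer selected at the call site, and the builder is created without arguments. What remains is:

auto gridBuilder = MultipleGridBuilder::makeShared();

If a non-default discretization is still wanted, it would now have to come from MultipleGridBuilder's defaults; nothing in this patch shows an alternative hook.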
diff --git a/apps/gpu/gridGeneratorTest/gridGenerator.cpp b/apps/gpu/gridGeneratorTest/gridGenerator.cpp
index dd74ae56a..501dfd06e 100644
--- a/apps/gpu/gridGeneratorTest/gridGenerator.cpp
+++ b/apps/gpu/gridGeneratorTest/gridGenerator.cpp
@@ -1,4 +1,3 @@
-#include <mpi.h>
 #include <fstream>
 #include <iostream>
 #include <stdexcept>
@@ -10,7 +9,6 @@
 #include "basics/config/ConfigurationFile.h"
 
 #include "VirtualFluids_GPU/LBM/Simulation.h"
-#include "VirtualFluids_GPU/Communication/MpiCommunicator.h"
 #include "VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.h"
 #include "VirtualFluids_GPU/DataStructureInitializer/GridProvider.h"
 #include "VirtualFluids_GPU/DataStructureInitializer/GridReaderFiles/GridReader.h"
@@ -47,13 +45,14 @@
 #include "utilities/communication.h"
 #include "utilities/transformator/TransformatorImp.h"
 
+#include <parallel/MPICommunicator.h>
 
 void runVirtualFluids(const vf::basics::ConfigurationFile &config)
 {
-    vf::gpu::Communicator &communicator = vf::gpu::MpiCommunicator::getInstance();
+    vf::parallel::Communicator &communicator = *vf::parallel::MPICommunicator::getInstance();
     auto gridBuilder = std::make_shared<MultipleGridBuilder>();
 
-    SPtr<Parameter> para = std::make_shared<Parameter>(communicator.getNumberOfProcess(), communicator.getPID(), &config);
+    SPtr<Parameter> para = std::make_shared<Parameter>(communicator.getNumberOfProcesses(), communicator.getProcessID(), &config);
     BoundaryConditionFactory bcFactory = BoundaryConditionFactory();
 
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -621,7 +620,7 @@ void runVirtualFluids(const vf::basics::ConfigurationFile &config)
             para->setMaxDev(2);
 
             //const uint generatePart = 1;
-            const uint generatePart = communicator.getPID();
+            const uint generatePart = communicator.getProcessID();
 
             std::ofstream logFile2;
 
diff --git a/apps/gpu/tests/NumericalTests/Utilities/VirtualFluidSimulationFactory/VirtualFluidSimulationFactory.cpp b/apps/gpu/tests/NumericalTests/Utilities/VirtualFluidSimulationFactory/VirtualFluidSimulationFactory.cpp
index 3564f6502..baa9ed03f 100644
--- a/apps/gpu/tests/NumericalTests/Utilities/VirtualFluidSimulationFactory/VirtualFluidSimulationFactory.cpp
+++ b/apps/gpu/tests/NumericalTests/Utilities/VirtualFluidSimulationFactory/VirtualFluidSimulationFactory.cpp
@@ -9,9 +9,10 @@
 #include "VirtualFluids_GPU/Parameter/Parameter.h"
 
 #include "VirtualFluids_GPU/Factories/BoundaryConditionFactory.h"
-#include "VirtualFluids_GPU/Communication/MpiCommunicator.h"
 #include "VirtualFluids_GPU/LBM/Simulation.h"
 
+#include <parallel/MPICommunicator.h>
+
 std::shared_ptr<Parameter> vf::gpu::tests::makeParameter(std::shared_ptr<SimulationParameter> simPara)
 {
     auto para = std::make_shared<Parameter>(1, 0);
@@ -119,8 +120,9 @@ const std::function<void()> vf::gpu::tests::makeVirtualFluidSimulation(std::shar
     auto cudaManager = std::make_shared<CudaMemoryManager>(para);
     auto grid = makeGridReader(condition, para, cudaManager);
     BoundaryConditionFactory bc_factory;
+    vf::parallel::Communicator &communicator = *vf::parallel::MPICommunicator::getInstance();
     auto simulation =
-        std::make_shared<Simulation>(para, cudaManager, vf::gpu::MpiCommunicator::getInstance(), *grid.get(), &bc_factory);
+        std::make_shared<Simulation>(para, cudaManager, communicator, *grid.get(), &bc_factory);
     simulation->setDataWriter(dataWriter);
 
     return [simulation]() { simulation->run(); };
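Note: Simulation now takes the communicator as a vf::parallel::Communicator& parameter, which is why the test factory dereferences the shared instance into a named reference before the constructor call. The essential shape, with para, cudaManager, grid, and bc_factory as above:

vf::parallel::Communicator& communicator = *vf::parallel::MPICommunicator::getInstance();
auto simulation = std::make_shared<Simulation>(para, cudaManager, communicator, *grid, &bc_factory);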
diff --git a/pythonbindings/CMakeLists.txt b/pythonbindings/CMakeLists.txt
index c9bef9ef0..8dbf9047a 100644
--- a/pythonbindings/CMakeLists.txt
+++ b/pythonbindings/CMakeLists.txt
@@ -41,18 +41,27 @@ target_include_directories(lbm_bindings PRIVATE ${CMAKE_SOURCE_DIR}/src/)
 target_include_directories(lbm_bindings PRIVATE ${CMAKE_BINARY_DIR})
 add_dependencies(python_bindings lbm_bindings)
 
+pybind11_add_module(communicator_bindings MODULE src/communicator.cpp)
+set_target_properties(  communicator_bindings PROPERTIES
+                        LIBRARY_OUTPUT_DIRECTORY ${PYFLUIDS_DIR}
+                        OUTPUT_NAME "communicator")
+target_link_libraries(communicator_bindings PRIVATE parallel)
+target_include_directories(communicator_bindings PRIVATE ${CMAKE_SOURCE_DIR}/src/)
+target_include_directories(communicator_bindings PRIVATE ${CMAKE_BINARY_DIR})
+target_compile_definitions(communicator_bindings PRIVATE VF_MPI)
+add_dependencies(python_bindings communicator_bindings)
+
 
 IF(BUILD_VF_GPU)
     pybind11_add_module(gpu_bindings MODULE src/gpu/gpu.cpp)
     set_target_properties(  gpu_bindings PROPERTIES
                             LIBRARY_OUTPUT_DIRECTORY ${PYFLUIDS_DIR}
                             OUTPUT_NAME "gpu")
-    target_link_libraries(gpu_bindings PRIVATE basics)
     set_source_files_properties(src/gpu/gpu.cpp PROPERTIES LANGUAGE CUDA)
 
     target_include_directories(gpu_bindings PRIVATE ${VF_THIRD_DIR}/cuda_samples/)
 
-    target_link_libraries(gpu_bindings PRIVATE GridGenerator VirtualFluids_GPU)
+    target_link_libraries(gpu_bindings PRIVATE basics GridGenerator VirtualFluids_GPU parallel)
 
     target_include_directories(gpu_bindings PRIVATE ${CMAKE_SOURCE_DIR}/src/)
     target_include_directories(gpu_bindings PRIVATE ${CMAKE_BINARY_DIR})
@@ -70,15 +79,9 @@ IF(BUILD_VF_CPU)
     target_include_directories(cpu_bindings PRIVATE ${CMAKE_SOURCE_DIR}/src/)
     target_include_directories(cpu_bindings PRIVATE ${CMAKE_BINARY_DIR})
 
-    target_compile_definitions(cpu_bindings PUBLIC VF_DOUBLE_ACCURACY) # TODO: remove this and always set it dynamically
-    target_compile_definitions(basics_bindings PUBLIC VF_DOUBLE_ACCURACY)
-    target_compile_definitions(logger_bindings PUBLIC VF_DOUBLE_ACCURACY)
-    target_compile_definitions(lbm_bindings PUBLIC VF_DOUBLE_ACCURACY)
-
     target_compile_definitions(cpu_bindings PRIVATE VF_METIS VF_MPI)
     add_dependencies(python_bindings cpu_bindings)
 
-
     # include bindings for muparsers
     pybind11_add_module(pymuparser MODULE src/muParser.cpp)
 
@@ -91,3 +94,16 @@ IF(BUILD_VF_CPU)
     target_compile_definitions(pymuparser PRIVATE VF_METIS VF_MPI)
     target_link_libraries(pymuparser PRIVATE muparser)
 ENDIF()
+
+if(BUILD_VF_DOUBLE_ACCURACY)
+    if(BUILD_VF_CPU)
+        target_compile_definitions(cpu_bindings PRIVATE VF_DOUBLE_ACCURACY)
+    endif()
+    target_compile_definitions(basics_bindings PRIVATE VF_DOUBLE_ACCURACY)
+    target_compile_definitions(logger_bindings PRIVATE VF_DOUBLE_ACCURACY)
+    target_compile_definitions(lbm_bindings PRIVATE VF_DOUBLE_ACCURACY)
+    if(BUILD_VF_GPU)
+        target_compile_definitions(gpu_bindings PRIVATE VF_DOUBLE_ACCURACY)
+    endif()
+    target_compile_definitions(communicator_bindings PRIVATE VF_DOUBLE_ACCURACY)
+endif()
diff --git a/pythonbindings/pyfluids/__init__.py b/pythonbindings/pyfluids/__init__.py
index f0537b758..5b4197972 100644
--- a/pythonbindings/pyfluids/__init__.py
+++ b/pythonbindings/pyfluids/__init__.py
@@ -33,22 +33,26 @@ r"""
 =======================================================================================
 """
 try:
-    from .bindings import basics
+    from . import basics
 except ImportError:
     print("Basics bindings not included")
 try:
-    from .bindings import logger
+    from . import logger
 except ImportError:
     print("Logger bindings not included")
 try:
-    from .bindings import lbm
+    from . import lbm
 except ImportError:
     print("LBM bindings not included")
 try:
-    from .bindings import gpu
+    from . import communicator
+except ImportError:
+    print("Communicator bindings not included")
+try:
+    from . import gpu
 except ImportError:
     print("GPU bindings not included")
 try:
-    from .bindings import cpu
+    from . import cpu
 except ImportError:
     print("CPU bindings not included")
\ No newline at end of file
diff --git a/pythonbindings/src/VirtualFluids.cpp b/pythonbindings/src/communicator.cpp
similarity index 72%
rename from pythonbindings/src/VirtualFluids.cpp
rename to pythonbindings/src/communicator.cpp
index 91682b79e..fe706bb9e 100644
--- a/pythonbindings/src/VirtualFluids.cpp
+++ b/pythonbindings/src/communicator.cpp
@@ -26,39 +26,24 @@
 //  You should have received a copy of the GNU General Public License along
 //  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
-//! \file VirtualFluids.cpp
-//! \ingroup src
+//! \file communicator.cpp
+//! \ingroup src
 //! \author Henry Korb
 //=======================================================================================
+#include <pybind11/cast.h>
 #include <pybind11/pybind11.h>
-#include "basics/basics.cpp"
-#include "lbm/lbm.cpp"
-#include "logger/logger.cpp"
 
-#ifdef VF_GPU_PYTHONBINDINGS
-#include "gpu/gpu.cpp"
-#endif
-#ifdef VF_CPU_PYTHONBINDINGS
-#include "cpu/cpu.cpp"
-#endif
+#include <parallel/MPICommunicator.h>
 
-
-namespace py_bindings
+namespace communicator_bindings
 {
-    namespace py = pybind11;
+namespace py = pybind11;
 
-    PYBIND11_MODULE(bindings, m)
-    {
-        // because we do not use the old logger (src/basics/logger) anymore and cout is not passed anymore to the old logger, we probably do not need this anymore
-        // pybind11::add_ostream_redirect(m, "ostream_redirect");
-        basics::makeModule(m);
-        lbm::makeModule(m);
-        logging::makeModule(m);
-#ifdef VF_GPU_PYTHONBINDINGS
-        gpu::makeModule(m);
-#endif
-#ifdef VF_CPU_PYTHONBINDINGS
-        cpu::makeModule(m);
-#endif
-    }
+PYBIND11_MODULE(communicator, m)
+{
+    py::class_<vf::parallel::MPICommunicator, std::shared_ptr<vf::parallel::MPICommunicator>>(m, "Communicator")
+        .def_static("get_instance", &vf::parallel::MPICommunicator::getInstance)
+        .def("get_number_of_processes", &vf::parallel::MPICommunicator::getNumberOfProcesses)
+        .def("get_process_id", py::overload_cast<>(&vf::parallel::MPICommunicator::getProcessID, py::const_));
 }
+} // namespace communicator_bindings
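The module above exposes only the singleton accessor and the two query methods. On the C++ side the equivalent calls are (a minimal sketch, assuming the MPI environment is initialized by the communicator itself):

    #include <parallel/MPICommunicator.h>

    auto comm = vf::parallel::MPICommunicator::getInstance();
    const int numberOfProcesses = comm->getNumberOfProcesses(); // bound as get_number_of_processes
    const int processID = comm->getProcessID();                 // bound as get_process_id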
diff --git a/pythonbindings/src/gpu/gpu.cpp b/pythonbindings/src/gpu/gpu.cpp
index 8946b1d8a..dcb4ded4b 100644
--- a/pythonbindings/src/gpu/gpu.cpp
+++ b/pythonbindings/src/gpu/gpu.cpp
@@ -35,7 +35,6 @@
 #include "submodules/simulation.cpp"
 #include "submodules/parameter.cpp"
 #include "submodules/boundary_conditions.cpp"
-#include "submodules/communicator.cpp"
 #include "submodules/cuda_memory_manager.cpp"
 #include "submodules/probes.cpp"
 #include "submodules/precursor_writer.cpp"
@@ -48,23 +47,20 @@
 
 namespace gpu_bindings
 {
-    namespace py = pybind11;
-
-    PYBIND11_MODULE(gpu, m)
-    {
-        simulation::makeModule(m);
-        parameter::makeModule(m);
-        pre_collision_interactor::makeModule(m);
-        actuator_farm::makeModule(m);
-        boundary_conditions::makeModule(m);
-        transient_bc_setter::makeModule(m);
-        communicator::makeModule(m); 
-        cuda_memory_manager::makeModule(m);
-        probes::makeModule(m);
-        precursor_writer::makeModule(m);
-        grid_generator::makeModule(m);
-        grid_provider::makeModule(m);
-        turbulence_model::makeModule(m);
-        grid_scaling_factory::makeModule(m);
-    }
-}
\ No newline at end of file
+PYBIND11_MODULE(gpu, m)
+{
+    simulation::makeModule(m);
+    parameter::makeModule(m);
+    pre_collision_interactor::makeModule(m);
+    actuator_farm::makeModule(m);
+    boundary_conditions::makeModule(m);
+    transient_bc_setter::makeModule(m);
+    cuda_memory_manager::makeModule(m);
+    probes::makeModule(m);
+    precursor_writer::makeModule(m);
+    grid_generator::makeModule(m);
+    grid_provider::makeModule(m);
+    turbulence_model::makeModule(m);
+    grid_scaling_factory::makeModule(m);
+}
+} // namespace gpu_bindings
diff --git a/pythonbindings/src/gpu/submodules/communicator.cpp b/pythonbindings/src/gpu/submodules/communicator.cpp
deleted file mode 100644
index 1cf40090f..000000000
--- a/pythonbindings/src/gpu/submodules/communicator.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-//=======================================================================================
-// ____          ____    __    ______     __________   __      __       __        __
-// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
-//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
-//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
-//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
-//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
-//      \    \  |    |   ________________________________________________________________
-//       \    \ |    |  |  ______________________________________________________________|
-//        \    \|    |  |  |         __          __     __     __     ______      _______
-//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
-//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
-//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
-//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
-//
-//  This file is part of VirtualFluids. VirtualFluids is free software: you can
-//  redistribute it and/or modify it under the terms of the GNU General Public
-//  License as published by the Free Software Foundation, either version 3 of
-//  the License, or (at your option) any later version.
-//
-//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-//  for more details.
-//
-//  You should have received a copy of the GNU General Public License along
-//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
-//
-//! \file communicator.cpp
-//! \ingroup submodules
-//! \author Henry Korb
-//=======================================================================================
-#include <pybind11/pybind11.h>
-#include <gpu/VirtualFluids_GPU/Communication/Communicator.h>
-#include <gpu/VirtualFluids_GPU/Communication/MpiCommunicator.h>
-
-namespace communicator
-{
-    namespace py = pybind11;
-
-    void makeModule(py::module_ &parentModule)
-    {
-        py::class_<vf::gpu::CommunicationRoutine, std::unique_ptr<vf::gpu::CommunicationRoutine, py::nodelete>>(parentModule, "CommunicationRoutine");
-
-        py::class_<vf::gpu::Communicator, vf::gpu::CommunicationRoutine, std::unique_ptr<vf::gpu::Communicator, py::nodelete>>(parentModule, "Communicator")
-            .def("get_number_of_process", &vf::gpu::Communicator::getNumberOfProcess)
-            .def("get_pid", &vf::gpu::Communicator::getPID);
-
-        py::class_<vf::gpu::MpiCommunicator, vf::gpu::Communicator, std::unique_ptr<vf::gpu::MpiCommunicator, py::nodelete>>(parentModule, "MpiCommunicator")
-            .def_static("get_instance", &vf::gpu::MpiCommunicator::getInstance, py::return_value_policy::reference);
-    }
-} // namespace communicator
\ No newline at end of file
diff --git a/pythonbindings/src/gpu/submodules/simulation.cpp b/pythonbindings/src/gpu/submodules/simulation.cpp
index d32ef272a..545fe082f 100644
--- a/pythonbindings/src/gpu/submodules/simulation.cpp
+++ b/pythonbindings/src/gpu/submodules/simulation.cpp
@@ -32,7 +32,6 @@
 //=======================================================================================
 #include <pybind11/pybind11.h>
 #include <gpu/VirtualFluids_GPU/LBM/Simulation.h>
-#include <gpu/VirtualFluids_GPU/Communication/Communicator.h>
 #include <gpu/VirtualFluids_GPU/Kernel/Utilities/KernelFactory/KernelFactory.h>
 #include <gpu/VirtualFluids_GPU/PreProcessor/PreProcessorFactory/PreProcessorFactory.h>
 #include <gpu/VirtualFluids_GPU/DataStructureInitializer/GridProvider.h>
@@ -43,6 +42,7 @@
 #include "gpu/VirtualFluids_GPU/Factories/BoundaryConditionFactory.h"
 #include "gpu/VirtualFluids_GPU/TurbulenceModels/TurbulenceModelFactory.h"
 #include "gpu/VirtualFluids_GPU/Factories/GridScalingFactory.h"
+#include "parallel/Communicator.h"
 
 namespace simulation
 {
@@ -54,7 +54,7 @@ namespace simulation
         py::class_<Simulation>(parentModule, "Simulation")
         .def(py::init<  std::shared_ptr<Parameter>,
                         std::shared_ptr<CudaMemoryManager>,
-                        vf::gpu::Communicator &,
+                        vf::parallel::Communicator &,
                         GridProvider &,
                         BoundaryConditionFactory*,
                         GridScalingFactory*>(), 
@@ -66,7 +66,7 @@ namespace simulation
                         py::arg("gridScalingFactory"))
         .def(py::init<  std::shared_ptr<Parameter>,
                         std::shared_ptr<CudaMemoryManager>,
-                        vf::gpu::Communicator &,
+                        vf::parallel::Communicator &,
                         GridProvider &,
                         BoundaryConditionFactory*>(), 
                         py::arg("parameter"),
@@ -76,7 +76,7 @@ namespace simulation
                         py::arg("bcFactory"))
         .def(py::init<  std::shared_ptr<Parameter>,
                         std::shared_ptr<CudaMemoryManager>,
-                        vf::gpu::Communicator &,
+                        vf::parallel::Communicator &,
                         GridProvider &,
                         BoundaryConditionFactory*,
                         std::shared_ptr<TurbulenceModelFactory>,
diff --git a/src/cpu/LiggghtsCoupling/SimulationObserver/LiggghtsCouplingSimulationObserver.cpp b/src/cpu/LiggghtsCoupling/SimulationObserver/LiggghtsCouplingSimulationObserver.cpp
index 7b7fafd8a..718301257 100644
--- a/src/cpu/LiggghtsCoupling/SimulationObserver/LiggghtsCouplingSimulationObserver.cpp
+++ b/src/cpu/LiggghtsCoupling/SimulationObserver/LiggghtsCouplingSimulationObserver.cpp
@@ -13,7 +13,7 @@
 #include "fix_lb_coupling_onetoone.h"
 
 LiggghtsCouplingSimulationObserver::LiggghtsCouplingSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s,
-                                                         SPtr<vf::mpi::Communicator> comm,
+                                                         SPtr<vf::parallel::Communicator> comm,
                                                          LiggghtsCouplingWrapper &wrapper, int demSteps,
                                                          SPtr<LBMUnitConverter> units)
     : SimulationObserver(grid, s), comm(comm), wrapper(wrapper), demSteps(demSteps), units(units)
diff --git a/src/cpu/LiggghtsCoupling/SimulationObserver/LiggghtsCouplingSimulationObserver.h b/src/cpu/LiggghtsCoupling/SimulationObserver/LiggghtsCouplingSimulationObserver.h
index 0ae1786dd..fb4938328 100644
--- a/src/cpu/LiggghtsCoupling/SimulationObserver/LiggghtsCouplingSimulationObserver.h
+++ b/src/cpu/LiggghtsCoupling/SimulationObserver/LiggghtsCouplingSimulationObserver.h
@@ -46,7 +46,7 @@
 
 
 class SimulationObserver;
-namespace vf::mpi {class Communicator;}
+namespace vf::parallel {class Communicator;}
 class LiggghtsCouplingWrapper;
 class Grid3D;
 class Block3D;
@@ -61,7 +61,7 @@ struct ParticleData {
 class LiggghtsCouplingSimulationObserver : public SimulationObserver
 {
 public:
-    LiggghtsCouplingSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, SPtr<vf::mpi::Communicator> comm,
+    LiggghtsCouplingSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, SPtr<vf::parallel::Communicator> comm,
                                 LiggghtsCouplingWrapper &wrapper, int demSteps, SPtr<LBMUnitConverter> units);
     virtual ~LiggghtsCouplingSimulationObserver();
 
@@ -88,7 +88,7 @@ protected:
     void addTorque(int const partId, int const coord, double const value, double *torque);
 
 private:
-    SPtr<vf::mpi::Communicator> comm;
+    SPtr<vf::parallel::Communicator> comm;
     LiggghtsCouplingWrapper &wrapper;
     SPtr<LBMUnitConverter> units;
     int demSteps;
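As in the other headers touched by this rename, the class only stores a shared pointer to the communicator, so a forward declaration suffices in the header and the full include stays in the .cpp. The idiom, condensed (Foo is a hypothetical class for illustration):

    // Foo.h -- a forward declaration is enough for a smart-pointer member
    #include <memory>
    namespace vf::parallel { class Communicator; }

    class Foo
    {
        std::shared_ptr<vf::parallel::Communicator> comm;
    };

    // Foo.cpp -- include the full definition where members are actually called
    #include <parallel/Communicator.h>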
diff --git a/src/cpu/MultiphaseFlow/SimulationObservers/WriteMultiphaseQuantitiesSimulationObserver.cpp b/src/cpu/MultiphaseFlow/SimulationObservers/WriteMultiphaseQuantitiesSimulationObserver.cpp
index adc6a4f81..820c84855 100644
--- a/src/cpu/MultiphaseFlow/SimulationObservers/WriteMultiphaseQuantitiesSimulationObserver.cpp
+++ b/src/cpu/MultiphaseFlow/SimulationObservers/WriteMultiphaseQuantitiesSimulationObserver.cpp
@@ -39,7 +39,7 @@
 
 #include "BCArray3D.h"
 #include "Block3D.h"
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 #include "DataSet3D.h"
 #include "Grid3D.h"
 #include "LBMUnitConverter.h"
@@ -53,7 +53,7 @@ WriteMultiphaseQuantitiesSimulationObserver::WriteMultiphaseQuantitiesSimulation
                                                                              const std::string &path,
                                                                              WbWriter *const writer,
                                                                              SPtr<LBMUnitConverter> conv,
-                                                                             std::shared_ptr<vf::mpi::Communicator> comm)
+                                                                             std::shared_ptr<vf::parallel::Communicator> comm)
         : SimulationObserver(grid, s), path(path), writer(writer), conv(conv), comm(comm)
 {
     gridRank = comm->getProcessID();
diff --git a/src/cpu/MultiphaseFlow/SimulationObservers/WriteMultiphaseQuantitiesSimulationObserver.h b/src/cpu/MultiphaseFlow/SimulationObservers/WriteMultiphaseQuantitiesSimulationObserver.h
index 452a06d3b..e5ba399ad 100644
--- a/src/cpu/MultiphaseFlow/SimulationObservers/WriteMultiphaseQuantitiesSimulationObserver.h
+++ b/src/cpu/MultiphaseFlow/SimulationObservers/WriteMultiphaseQuantitiesSimulationObserver.h
@@ -42,7 +42,7 @@
 #include "LBMSystem.h"
 #include "UbTuple.h"
 
-namespace vf::mpi {class Communicator;}
+namespace vf::parallel {class Communicator;}
 class Grid3D;
 class UbScheduler;
 class LBMUnitConverter;
@@ -63,7 +63,7 @@ public:
     //! \param conv is LBMUnitConverter object
     //! \param comm is Communicator object
     WriteMultiphaseQuantitiesSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, const std::string &path,
-                                          WbWriter *const writer, SPtr<LBMUnitConverter> conv, std::shared_ptr<vf::mpi::Communicator> comm);
+                                          WbWriter *const writer, SPtr<LBMUnitConverter> conv, std::shared_ptr<vf::parallel::Communicator> comm);
     ~WriteMultiphaseQuantitiesSimulationObserver() override = default;
 
     void update(real step) override;
@@ -90,7 +90,7 @@ private:
     int minInitLevel;
     int maxInitLevel;
     int gridRank;
-    std::shared_ptr<vf::mpi::Communicator> comm;
+    std::shared_ptr<vf::parallel::Communicator> comm;
 
     real gradX1_phi(const real *const &);
     real gradX2_phi(const real *const &);
diff --git a/src/cpu/MultiphaseFlow/SimulationObservers/WriteSharpInterfaceQuantitiesSimulationObserver.cpp b/src/cpu/MultiphaseFlow/SimulationObservers/WriteSharpInterfaceQuantitiesSimulationObserver.cpp
index 64ef23bcf..7a0ae87b6 100644
--- a/src/cpu/MultiphaseFlow/SimulationObservers/WriteSharpInterfaceQuantitiesSimulationObserver.cpp
+++ b/src/cpu/MultiphaseFlow/SimulationObservers/WriteSharpInterfaceQuantitiesSimulationObserver.cpp
@@ -39,7 +39,7 @@
 
 #include "BCArray3D.h"
 #include "Block3D.h"
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 #include "DataSet3D.h"
 #include "Grid3D.h"
 #include "LBMUnitConverter.h"
@@ -52,7 +52,7 @@ WriteSharpInterfaceQuantitiesSimulationObserver::WriteSharpInterfaceQuantitiesSi
                                                                              const std::string &path,
                                                                              WbWriter *const writer,
                                                                              SPtr<LBMUnitConverter> conv,
-                                                                             std::shared_ptr<vf::mpi::Communicator> comm)
+                                                                             std::shared_ptr<vf::parallel::Communicator> comm)
         : SimulationObserver(grid, s), path(path), writer(writer), conv(conv), comm(comm)
 {
     gridRank = comm->getProcessID();
diff --git a/src/cpu/MultiphaseFlow/SimulationObservers/WriteSharpInterfaceQuantitiesSimulationObserver.h b/src/cpu/MultiphaseFlow/SimulationObservers/WriteSharpInterfaceQuantitiesSimulationObserver.h
index 8c1e63478..bf42cbfa4 100644
--- a/src/cpu/MultiphaseFlow/SimulationObservers/WriteSharpInterfaceQuantitiesSimulationObserver.h
+++ b/src/cpu/MultiphaseFlow/SimulationObservers/WriteSharpInterfaceQuantitiesSimulationObserver.h
@@ -42,7 +42,7 @@
 #include "LBMSystem.h"
 #include "UbTuple.h"
 
-namespace vf::mpi {class Communicator;}
+namespace vf::parallel {class Communicator;}
 class Grid3D;
 class UbScheduler;
 class LBMUnitConverter;
@@ -63,7 +63,7 @@ public:
     //! \param conv is LBMUnitConverter object
     //! \param comm is Communicator object
     WriteSharpInterfaceQuantitiesSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, const std::string &path,
-                                          WbWriter *const writer, SPtr<LBMUnitConverter> conv, std::shared_ptr<vf::mpi::Communicator> comm);
+                                          WbWriter *const writer, SPtr<LBMUnitConverter> conv, std::shared_ptr<vf::parallel::Communicator> comm);
     ~WriteSharpInterfaceQuantitiesSimulationObserver() override = default;
 
     void update(double step) override;
@@ -90,7 +90,7 @@ private:
     int minInitLevel;
     int maxInitLevel;
     int gridRank;
-    std::shared_ptr<vf::mpi::Communicator> comm;
+    std::shared_ptr<vf::parallel::Communicator> comm;
 
     real gradX1_phi(const real *const &);
     real gradX2_phi(const real *const &);
diff --git a/src/cpu/NonNewtonianFluids/SimulationObservers/CalculateTorqueSimulationObserver.cpp b/src/cpu/NonNewtonianFluids/SimulationObservers/CalculateTorqueSimulationObserver.cpp
index 82adf2f0e..7ed670deb 100644
--- a/src/cpu/NonNewtonianFluids/SimulationObservers/CalculateTorqueSimulationObserver.cpp
+++ b/src/cpu/NonNewtonianFluids/SimulationObservers/CalculateTorqueSimulationObserver.cpp
@@ -1,7 +1,7 @@
 #include "CalculateTorqueSimulationObserver.h"
 #include "BCSet.h"
 
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 #include "D3Q27Interactor.h"
 #include "UbScheduler.h"
 #include "Grid3D.h"
@@ -14,7 +14,7 @@
 #include "DistributionArray3D.h"
 #include "NonNewtonianFluids/LBM/Rheology.h"
 
-CalculateTorqueSimulationObserver::CalculateTorqueSimulationObserver( SPtr<Grid3D> grid, SPtr<UbScheduler> s, const std::string &path_, std::shared_ptr<vf::mpi::Communicator> comm) : SimulationObserver(grid, s), path(path_), comm(comm), torqueX1global(0), torqueX2global(0), torqueX3global(0)
+CalculateTorqueSimulationObserver::CalculateTorqueSimulationObserver( SPtr<Grid3D> grid, SPtr<UbScheduler> s, const std::string &path_, std::shared_ptr<vf::parallel::Communicator> comm) : SimulationObserver(grid, s), path(path_), comm(comm), torqueX1global(0), torqueX2global(0), torqueX3global(0)
 {
    if (comm->getProcessID() == comm->getRoot())
    {
diff --git a/src/cpu/NonNewtonianFluids/SimulationObservers/CalculateTorqueSimulationObserver.h b/src/cpu/NonNewtonianFluids/SimulationObservers/CalculateTorqueSimulationObserver.h
index d0cd9c41d..e1948d95f 100644
--- a/src/cpu/NonNewtonianFluids/SimulationObservers/CalculateTorqueSimulationObserver.h
+++ b/src/cpu/NonNewtonianFluids/SimulationObservers/CalculateTorqueSimulationObserver.h
@@ -17,7 +17,7 @@
 #include "D3Q27System.h"
 
 class ForceCalculator;
-namespace vf::mpi {class Communicator;}
+namespace vf::parallel {class Communicator;}
 class Grid3D;
 class UbScheduler;
 class D3Q27Interactor;
@@ -29,7 +29,7 @@ class CalculateTorqueSimulationObserver: public SimulationObserver
 {
 public:
    //! Constructor
-   CalculateTorqueSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, const std::string &path, std::shared_ptr<vf::mpi::Communicator> comm);
+   CalculateTorqueSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, const std::string &path, std::shared_ptr<vf::parallel::Communicator> comm);
 	virtual ~CalculateTorqueSimulationObserver();             
 	void update(real step); 
    void addInteractor(SPtr<D3Q27Interactor> interactor);
@@ -42,7 +42,7 @@ protected:
 
 private:
    std::string path;
-   std::shared_ptr<vf::mpi::Communicator> comm;
+   std::shared_ptr<vf::parallel::Communicator> comm;
    std::vector<SPtr<D3Q27Interactor> > interactors;
    real torqueX1global;
    real torqueX2global;
diff --git a/src/cpu/NonNewtonianFluids/SimulationObservers/WriteThixotropyQuantitiesSimulationObserver.cpp b/src/cpu/NonNewtonianFluids/SimulationObservers/WriteThixotropyQuantitiesSimulationObserver.cpp
index 61f13299d..bc3eab4fb 100644
--- a/src/cpu/NonNewtonianFluids/SimulationObservers/WriteThixotropyQuantitiesSimulationObserver.cpp
+++ b/src/cpu/NonNewtonianFluids/SimulationObservers/WriteThixotropyQuantitiesSimulationObserver.cpp
@@ -49,7 +49,7 @@ using namespace std;
 
 WriteThixotropyQuantitiesSimulationObserver::WriteThixotropyQuantitiesSimulationObserver() = default;
 //////////////////////////////////////////////////////////////////////////
-WriteThixotropyQuantitiesSimulationObserver::WriteThixotropyQuantitiesSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, const std::string& path, WbWriter* const writer, SPtr<LBMUnitConverter> conv, std::shared_ptr<vf::mpi::Communicator> comm) : SimulationObserver(grid, s), path(path), writer(writer),	conv(conv),	comm(comm)
+WriteThixotropyQuantitiesSimulationObserver::WriteThixotropyQuantitiesSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, const std::string& path, WbWriter* const writer, SPtr<LBMUnitConverter> conv, std::shared_ptr<vf::parallel::Communicator> comm) : SimulationObserver(grid, s), path(path), writer(writer),	conv(conv),	comm(comm)
 {
 	gridRank = comm->getProcessID();
 	minInitLevel = this->grid->getCoarsestInitializedLevel();
diff --git a/src/cpu/NonNewtonianFluids/SimulationObservers/WriteThixotropyQuantitiesSimulationObserver.h b/src/cpu/NonNewtonianFluids/SimulationObservers/WriteThixotropyQuantitiesSimulationObserver.h
index 3ac9664e5..5ef994a08 100644
--- a/src/cpu/NonNewtonianFluids/SimulationObservers/WriteThixotropyQuantitiesSimulationObserver.h
+++ b/src/cpu/NonNewtonianFluids/SimulationObservers/WriteThixotropyQuantitiesSimulationObserver.h
@@ -38,14 +38,14 @@
 #include "Grid3D.h"
 #include "Block3D.h"
 #include "LBMUnitConverter.h"
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 #include "WbWriter.h"
 
 class WriteThixotropyQuantitiesSimulationObserver : public  SimulationObserver
 {
 public:
 	WriteThixotropyQuantitiesSimulationObserver();
-	WriteThixotropyQuantitiesSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, const std::string& path, WbWriter* const writer, SPtr<LBMUnitConverter> conv, std::shared_ptr<vf::mpi::Communicator> comm);
+	WriteThixotropyQuantitiesSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, const std::string& path, WbWriter* const writer, SPtr<LBMUnitConverter> conv, std::shared_ptr<vf::parallel::Communicator> comm);
 	~WriteThixotropyQuantitiesSimulationObserver() = default;
 
    void update(real step) override;
@@ -69,7 +69,7 @@ private:
    int minInitLevel;
    int maxInitLevel;
    int gridRank;
-   std::shared_ptr<vf::mpi::Communicator> comm;
+   std::shared_ptr<vf::parallel::Communicator> comm;
 //	double ConcentrationSum;
 };
 #endif
diff --git a/src/cpu/VirtualFluids.h b/src/cpu/VirtualFluids.h
index d8f79867c..e3adfb6ef 100644
--- a/src/cpu/VirtualFluids.h
+++ b/src/cpu/VirtualFluids.h
@@ -40,9 +40,9 @@
 #include <omp.h>
 #endif
 
-#include <mpi/Communicator.h>
-#include <mpi/MPICommunicator.h>
-#include <mpi/NullCommunicator.h>
+#include <parallel/Communicator.h>
+#include <parallel/MPICommunicator.h>
+#include <parallel/NullCommunicator.h>
 
 #include <basics/PointerDefinitions.h>
 
diff --git a/src/cpu/VirtualFluidsCore/CMakeLists.txt b/src/cpu/VirtualFluidsCore/CMakeLists.txt
index aae663e80..3e767e49c 100644
--- a/src/cpu/VirtualFluidsCore/CMakeLists.txt
+++ b/src/cpu/VirtualFluidsCore/CMakeLists.txt
@@ -20,7 +20,7 @@ if(BUILD_USE_OPENMP)
     list(APPEND VF_LIBRARIES OpenMP::OpenMP_CXX)
 endif()
 
-vf_add_library(BUILDTYPE static PUBLIC_LINK basics muparser ${VF_LIBRARIES} PRIVATE_LINK lbm mpi logger)
+vf_add_library(BUILDTYPE static PUBLIC_LINK basics muparser ${VF_LIBRARIES} parallel PRIVATE_LINK lbm logger)
 
 vf_add_tests()
 
diff --git a/src/cpu/VirtualFluidsCore/Interactors/InteractorsHelper.cpp b/src/cpu/VirtualFluidsCore/Interactors/InteractorsHelper.cpp
index 38e5be2e5..159d04d97 100644
--- a/src/cpu/VirtualFluidsCore/Interactors/InteractorsHelper.cpp
+++ b/src/cpu/VirtualFluidsCore/Interactors/InteractorsHelper.cpp
@@ -34,7 +34,7 @@
 #include "InteractorsHelper.h"
 
 #include "Block3D.h"
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 #include "SetBcBlocksBlockVisitor.h"
 #include "SetSolidBlocksBlockVisitor.h"
 #include <Grid3D.h>
@@ -100,6 +100,6 @@ void InteractorsHelper::updateGrid()
         ids.push_back(block->getGlobalID());
 
     std::vector<int> rids;
-    vf::mpi::Communicator::getInstance()->allGather(ids, rids);
+    vf::parallel::Communicator::getInstance()->allGather(ids, rids);
     grid->deleteBlocks(rids);
 }
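allGather() here collects the block IDs owned by each rank into one vector that ends up identical on all processes; stripped of the surrounding grid logic, the call is simply (a sketch):

    #include <parallel/Communicator.h>
    #include <vector>

    std::vector<int> ids = { /* global IDs of blocks handled on this rank */ };
    std::vector<int> rids; // receives the IDs from all ranks, on every rank
    vf::parallel::Communicator::getInstance()->allGather(ids, rids);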
diff --git a/src/cpu/VirtualFluidsCore/Parallel/BlocksDistributor.cpp b/src/cpu/VirtualFluidsCore/Parallel/BlocksDistributor.cpp
index eef54a862..718267be6 100644
--- a/src/cpu/VirtualFluidsCore/Parallel/BlocksDistributor.cpp
+++ b/src/cpu/VirtualFluidsCore/Parallel/BlocksDistributor.cpp
@@ -1,3 +1,3 @@
 #include "BlocksDistributor.h"
 
-BlocksDistributor::BlocksDistributor(SPtr<Grid3D> grid, std::shared_ptr<vf::mpi::Communicator> comm) : grid(grid), comm(comm) {}
+BlocksDistributor::BlocksDistributor(SPtr<Grid3D> grid, std::shared_ptr<vf::parallel::Communicator> comm) : grid(grid), comm(comm) {}
diff --git a/src/cpu/VirtualFluidsCore/Parallel/BlocksDistributor.h b/src/cpu/VirtualFluidsCore/Parallel/BlocksDistributor.h
index 85aa52d05..7db87d088 100644
--- a/src/cpu/VirtualFluidsCore/Parallel/BlocksDistributor.h
+++ b/src/cpu/VirtualFluidsCore/Parallel/BlocksDistributor.h
@@ -1,7 +1,7 @@
 #ifndef BlocksDistributor_H
 #define BlocksDistributor_H
 
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 #include "Grid3D.h"
 
 #include <PointerDefinitions.h>
@@ -9,13 +9,13 @@
 class BlocksDistributor
 {
 public:
-    BlocksDistributor(SPtr<Grid3D> grid, std::shared_ptr<vf::mpi::Communicator> comm);
+    BlocksDistributor(SPtr<Grid3D> grid, std::shared_ptr<vf::parallel::Communicator> comm);
     ~BlocksDistributor();
 
 protected:
 private:
     SPtr<Grid3D> grid;
-    std::shared_ptr<vf::mpi::Communicator> comm;
+    std::shared_ptr<vf::parallel::Communicator> comm;
 };
 
 #endif
diff --git a/src/cpu/VirtualFluidsCore/Simulation/Grid3D.cpp b/src/cpu/VirtualFluidsCore/Simulation/Grid3D.cpp
index a214b4bd0..14987f641 100644
--- a/src/cpu/VirtualFluidsCore/Simulation/Grid3D.cpp
+++ b/src/cpu/VirtualFluidsCore/Simulation/Grid3D.cpp
@@ -51,7 +51,7 @@ using namespace std;
 
 Grid3D::Grid3D() { levelSet.resize(D3Q27System::MAXLEVEL + 1); }
 //////////////////////////////////////////////////////////////////////////
-Grid3D::Grid3D(std::shared_ptr<vf::mpi::Communicator> comm)
+Grid3D::Grid3D(std::shared_ptr<vf::parallel::Communicator> comm)
 
 {
     levelSet.resize(D3Q27System::MAXLEVEL + 1);
@@ -59,7 +59,7 @@ Grid3D::Grid3D(std::shared_ptr<vf::mpi::Communicator> comm)
     rank = comm->getProcessID();
 }
 //////////////////////////////////////////////////////////////////////////
-Grid3D::Grid3D(std::shared_ptr<vf::mpi::Communicator> comm, int blockNx1, int blockNx2, int blockNx3, int gridNx1, int gridNx2, int gridNx3)
+Grid3D::Grid3D(std::shared_ptr<vf::parallel::Communicator> comm, int blockNx1, int blockNx2, int blockNx3, int gridNx1, int gridNx2, int gridNx3)
     :
 
       blockNx1(blockNx1), blockNx2(blockNx2), blockNx3(blockNx2), nx1(gridNx1), nx2(gridNx2), nx3(gridNx3)
@@ -2314,7 +2314,7 @@ void Grid3D::renumberBlockIDs()
 
 
 //////////////////////////////////////////////////////////////////////////
-void Grid3D::updateDistributedBlocks(std::shared_ptr<vf::mpi::Communicator> comm)
+void Grid3D::updateDistributedBlocks(std::shared_ptr<vf::parallel::Communicator> comm)
 {
 
     std::vector<int> blocks;
diff --git a/src/cpu/VirtualFluidsCore/Simulation/Grid3D.h b/src/cpu/VirtualFluidsCore/Simulation/Grid3D.h
index 821adff47..50f3ac53a 100644
--- a/src/cpu/VirtualFluidsCore/Simulation/Grid3D.h
+++ b/src/cpu/VirtualFluidsCore/Simulation/Grid3D.h
@@ -48,7 +48,7 @@ class CoordinateTransformation3D;
 #include <Block3DVisitor.h>
 #include <Grid3DVisitor.h>
 
-namespace vf::mpi {class Communicator;}
+namespace vf::parallel {class Communicator;}
 class Block3D;
 class Interactor3D;
 
@@ -65,8 +65,8 @@ public:
 
 public:
     Grid3D();
-    Grid3D(std::shared_ptr<vf::mpi::Communicator> comm);
-    Grid3D(std::shared_ptr<vf::mpi::Communicator> comm, int blockNx1, int blockNx2, int blockNx3, int gridNx1, int gridNx2, int gridNx3);
+    Grid3D(std::shared_ptr<vf::parallel::Communicator> comm);
+    Grid3D(std::shared_ptr<vf::parallel::Communicator> comm, int blockNx1, int blockNx2, int blockNx3, int gridNx1, int gridNx2, int gridNx3);
     virtual ~Grid3D() = default;
     //////////////////////////////////////////////////////////////////////////
     // blocks control
@@ -95,7 +95,7 @@ public:
     BlockIDMap &getBlockIDs();
     void deleteBlockIDs();
     void renumberBlockIDs();
-    void updateDistributedBlocks(std::shared_ptr<vf::mpi::Communicator> comm);
+    void updateDistributedBlocks(std::shared_ptr<vf::parallel::Communicator> comm);
     SPtr<Block3D> getSuperBlock(SPtr<Block3D> block);
     SPtr<Block3D> getSuperBlock(int ix1, int ix2, int ix3, int level);
     void getSubBlocks(SPtr<Block3D> block, int levelDepth, std::vector<SPtr<Block3D>> &blocks);
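Constructing a grid against the renamed interface would look roughly like this (a sketch; the block and grid extents are placeholder values, not values from this patch):

    #include <parallel/MPICommunicator.h>

    auto comm = vf::parallel::MPICommunicator::getInstance();
    // 8x8x8 nodes per block, 4x4x4 blocks -- placeholders for illustration
    auto grid = std::make_shared<Grid3D>(comm, 8, 8, 8, 4, 4, 4);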
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/AdjustForcingSimulationObserver.cpp b/src/cpu/VirtualFluidsCore/SimulationObservers/AdjustForcingSimulationObserver.cpp
index 2254b9b02..aeae2f617 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/AdjustForcingSimulationObserver.cpp
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/AdjustForcingSimulationObserver.cpp
@@ -6,7 +6,7 @@
 
 #include <fstream>
 
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 #include "Grid3D.h"
 #include "IntegrateValuesHelper.h"
 #include "UbScheduler.h"
@@ -14,7 +14,7 @@
 
 AdjustForcingSimulationObserver::AdjustForcingSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, const std::string &path,
                                                    SPtr<IntegrateValuesHelper> integrateValues, real vTarged,
-                                                   std::shared_ptr<vf::mpi::Communicator> comm)
+                                                   std::shared_ptr<vf::parallel::Communicator> comm)
 
     : SimulationObserver(grid, s), path(path), integrateValues(integrateValues), comm(comm), vx1Targed(vTarged)
 {
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/AdjustForcingSimulationObserver.h b/src/cpu/VirtualFluidsCore/SimulationObservers/AdjustForcingSimulationObserver.h
index 9e570e34d..13f88c711 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/AdjustForcingSimulationObserver.h
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/AdjustForcingSimulationObserver.h
@@ -7,7 +7,7 @@
 #include "SimulationObserver.h"
 #include "lbm/constants/D3Q27.h"
 
-namespace vf::mpi {class Communicator;}
+namespace vf::parallel {class Communicator;}
 class UbScheduler;
 class Grid3D;
 class IntegrateValuesHelper;
@@ -22,7 +22,7 @@ class AdjustForcingSimulationObserver : public SimulationObserver
 {
 public:
     AdjustForcingSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, const std::string &path,
-                             SPtr<IntegrateValuesHelper> integrateValues, real vTarged, std::shared_ptr<vf::mpi::Communicator> comm);
+                             SPtr<IntegrateValuesHelper> integrateValues, real vTarged, std::shared_ptr<vf::parallel::Communicator> comm);
     //!< calls collect PostprocessData
     void update(real step) override;
 
@@ -31,7 +31,7 @@ protected:
     SPtr<IntegrateValuesHelper> integrateValues;
     //!< compares velocity in integrateValues with target velocity and adjusts forcing accordingly.
     void collectData(real step);
-    std::shared_ptr<vf::mpi::Communicator> comm;
+    std::shared_ptr<vf::parallel::Communicator> comm;
 
 private:
     real vx1Targed; //!< target velocity.
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/AverageValuesSimulationObserver.cpp b/src/cpu/VirtualFluidsCore/SimulationObservers/AverageValuesSimulationObserver.cpp
index 1adf3ad99..d87dddb97 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/AverageValuesSimulationObserver.cpp
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/AverageValuesSimulationObserver.cpp
@@ -7,7 +7,7 @@
 
 #include "BCArray3D.h"
 #include "Block3D.h"
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 #include "DataSet3D.h"
 #include "Grid3D.h"
 #include "UbScheduler.h"
@@ -185,7 +185,7 @@ void AverageValuesSimulationObserver::collectData(real step)
     piece           = subfolder + "/" + piece;
 
     vector<string> cellDataNames;
-    std::shared_ptr<vf::mpi::Communicator> comm = vf::mpi::Communicator::getInstance();
+    std::shared_ptr<vf::parallel::Communicator> comm = vf::parallel::Communicator::getInstance();
     vector<string> pieces   = comm->gather(piece);
     if (comm->getProcessID() == comm->getRoot()) {
         string pname =
@@ -448,7 +448,7 @@ void AverageValuesSimulationObserver::calculateAverageValues(real timeStep)
 ////////////////////////////////////////////////////////////////////////////
 // void AverageValuesSimulationObserver::initPlotData(double step)
 //{
-//   std::shared_ptr<vf::mpi::Communicator> comm = vf::mpi::Communicator::getInstance();
+//   std::shared_ptr<vf::parallel::Communicator> comm = vf::parallel::Communicator::getInstance();
 //	if (comm->getProcessID() == comm->getRoot())
 //	{
 //		std::ofstream ostr;
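The gather-plus-root check above is the usual rank-0 output idiom of the communicator interface; condensed (a sketch, with piece standing for this rank's output filename):

    #include <parallel/Communicator.h>
    #include <string>
    #include <vector>

    std::shared_ptr<vf::parallel::Communicator> comm = vf::parallel::Communicator::getInstance();
    std::vector<std::string> pieces = comm->gather(piece); // collected on the root rank
    if (comm->getProcessID() == comm->getRoot())
    {
        // only the root rank writes the collection file referencing all pieces
    }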
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/CalculateForcesSimulationObserver.cpp b/src/cpu/VirtualFluidsCore/SimulationObservers/CalculateForcesSimulationObserver.cpp
index 8610c5df9..f1c8060ca 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/CalculateForcesSimulationObserver.cpp
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/CalculateForcesSimulationObserver.cpp
@@ -4,7 +4,7 @@
 #include "BCArray3D.h"
 #include "Block3D.h"
 #include "BoundaryConditions.h"
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 #include "D3Q27Interactor.h"
 #include "DataSet3D.h"
 #include "DistributionArray3D.h"
@@ -14,7 +14,7 @@
 #include "UbScheduler.h"
 
 CalculateForcesSimulationObserver::CalculateForcesSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, const std::string &path,
-                                                       std::shared_ptr<vf::mpi::Communicator> comm, real v, real a)
+                                                       std::shared_ptr<vf::parallel::Communicator> comm, real v, real a)
     : SimulationObserver(grid, s), path(path), comm(comm), v(v), a(a), forceX1global(0), forceX2global(0), forceX3global(0)
 {
     if (comm->getProcessID() == comm->getRoot()) {
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/CalculateForcesSimulationObserver.h b/src/cpu/VirtualFluidsCore/SimulationObservers/CalculateForcesSimulationObserver.h
index 02b76e77b..e1d376f4c 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/CalculateForcesSimulationObserver.h
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/CalculateForcesSimulationObserver.h
@@ -17,7 +17,7 @@
 #include "lbm/constants/D3Q27.h"
 
 class ForceCalculator;
-namespace vf::mpi {class Communicator;}
+namespace vf::parallel {class Communicator;}
 class Grid3D;
 class UbScheduler;
 class D3Q27Interactor;
@@ -30,7 +30,7 @@ public:
     //! Constructor
     //! \param v - velocity of fluid in LB units
     //! \param a - area of object in LB units
-    CalculateForcesSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, const std::string &path, std::shared_ptr<vf::mpi::Communicator> comm,
+    CalculateForcesSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, const std::string &path, std::shared_ptr<vf::parallel::Communicator> comm,
                                real v, real a);
     ~CalculateForcesSimulationObserver() override;
     void update(real step) override;
@@ -46,7 +46,7 @@ protected:
 
 private:
     std::string path;
-    std::shared_ptr<vf::mpi::Communicator> comm;
+    std::shared_ptr<vf::parallel::Communicator> comm;
     std::vector<SPtr<D3Q27Interactor>> interactors;
     real forceX1global;
     real forceX2global;
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/CalculateTorqueSimulationObserver.cpp b/src/cpu/VirtualFluidsCore/SimulationObservers/CalculateTorqueSimulationObserver.cpp
index 768fbbb26..6d3b22d64 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/CalculateTorqueSimulationObserver.cpp
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/CalculateTorqueSimulationObserver.cpp
@@ -1,7 +1,7 @@
 #include "NonNewtonianFluids/SimulationObservers/CalculateTorqueSimulationObserver.h"
 #include "BCSet.h"
 
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 #include "D3Q27Interactor.h"
 #include "UbScheduler.h"
 #include "Grid3D.h"
@@ -14,7 +14,7 @@
 #include "DistributionArray3D.h"
 #include "NonNewtonianFluids/LBM/Rheology.h"
 
-CalculateTorqueSimulationObserver::CalculateTorqueSimulationObserver( SPtr<Grid3D> grid, SPtr<UbScheduler> s, const std::string &path_, std::shared_ptr<vf::mpi::Communicator> comm) : SimulationObserver(grid, s), path(path_), comm(comm), torqueX1global(0), torqueX2global(0), torqueX3global(0)
+CalculateTorqueSimulationObserver::CalculateTorqueSimulationObserver( SPtr<Grid3D> grid, SPtr<UbScheduler> s, const std::string &path_, std::shared_ptr<vf::parallel::Communicator> comm) : SimulationObserver(grid, s), path(path_), comm(comm), torqueX1global(0), torqueX2global(0), torqueX3global(0)
 {
    if (comm->getProcessID() == comm->getRoot())
    {
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/DecreaseViscositySimulationObserver.cpp b/src/cpu/VirtualFluidsCore/SimulationObservers/DecreaseViscositySimulationObserver.cpp
index 075449176..ffbfde51c 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/DecreaseViscositySimulationObserver.cpp
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/DecreaseViscositySimulationObserver.cpp
@@ -10,13 +10,13 @@
 #include <vector>
 
 #include "Block3D.h"
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 #include "Grid3D.h"
 #include "LBMKernel.h"
 #include "UbScheduler.h"
 
 DecreaseViscositySimulationObserver::DecreaseViscositySimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, mu::Parser *nueFunc,
-                                                           std::shared_ptr<vf::mpi::Communicator> comm)
+                                                           std::shared_ptr<vf::parallel::Communicator> comm)
 
     : SimulationObserver(grid, s), nueFunc(nueFunc), comm(comm)
 {
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/DecreaseViscositySimulationObserver.h b/src/cpu/VirtualFluidsCore/SimulationObservers/DecreaseViscositySimulationObserver.h
index 2e2c655d2..741b65783 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/DecreaseViscositySimulationObserver.h
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/DecreaseViscositySimulationObserver.h
@@ -11,7 +11,7 @@
 
 class UbScheduler;
 class Grid3D;
-namespace vf::mpi {class Communicator;}
+namespace vf::parallel {class Communicator;}
 
 //! \brief The class sets viscosity/collision factor according to a previously defined function in time.
 //! \details initialization in test case (example):
@@ -28,7 +28,7 @@ namespace vf::mpi {class Communicator;}
 class DecreaseViscositySimulationObserver : public SimulationObserver
 {
 public:
-    DecreaseViscositySimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, mu::Parser *nueFunc, std::shared_ptr<vf::mpi::Communicator> comm);
+    DecreaseViscositySimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, mu::Parser *nueFunc, std::shared_ptr<vf::parallel::Communicator> comm);
     ~DecreaseViscositySimulationObserver() override;
     //! calls collect PostprocessData.
     void update(real step) override;
@@ -36,7 +36,7 @@ public:
 protected:
     //! resets the collision factor depending on the current timestep.
     void setViscosity(real step);
-    std::shared_ptr<vf::mpi::Communicator> comm;
+    std::shared_ptr<vf::parallel::Communicator> comm;
 
 private:
     mutable mu::value_type timeStep;
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/EmergencyExitSimulationObserver.cpp b/src/cpu/VirtualFluidsCore/SimulationObservers/EmergencyExitSimulationObserver.cpp
index a6826a713..ea6287ff3 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/EmergencyExitSimulationObserver.cpp
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/EmergencyExitSimulationObserver.cpp
@@ -1,5 +1,5 @@
 #include "EmergencyExitSimulationObserver.h"
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 #include "Grid3D.h"
 #include "MPIIORestartSimulationObserver.h"
 #include "UbLogger.h"
@@ -8,7 +8,7 @@
 #include <basics/utilities/UbFileOutputASCII.h>
 
 EmergencyExitSimulationObserver::EmergencyExitSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, const std::string &path,
-                                                   SPtr<MPIIORestartSimulationObserver> rp, std::shared_ptr<vf::mpi::Communicator> comm)
+                                                   SPtr<MPIIORestartSimulationObserver> rp, std::shared_ptr<vf::parallel::Communicator> comm)
     : SimulationObserver(grid, s), path(path), rp(rp), comm(comm)
 {
     this->path = path + "/exit";
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/EmergencyExitSimulationObserver.h b/src/cpu/VirtualFluidsCore/SimulationObservers/EmergencyExitSimulationObserver.h
index f4a8e79f6..f2757d8ed 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/EmergencyExitSimulationObserver.h
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/EmergencyExitSimulationObserver.h
@@ -14,7 +14,7 @@
 #include "SimulationObserver.h"
 
 class MPIIORestartSimulationObserver;
-namespace vf::mpi {class Communicator;}
+namespace vf::parallel {class Communicator;}
 class Grid3D;
 class UbScheduler;
 
@@ -22,7 +22,7 @@ class EmergencyExitSimulationObserver : public SimulationObserver
 {
 public:
     EmergencyExitSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, const std::string &path,
-                             SPtr<MPIIORestartSimulationObserver> rp, std::shared_ptr<vf::mpi::Communicator> comm);
+                             SPtr<MPIIORestartSimulationObserver> rp, std::shared_ptr<vf::parallel::Communicator> comm);
     ~EmergencyExitSimulationObserver() override;
 
     void update(real step) override;
@@ -35,7 +35,7 @@ protected:
 
 private:
     std::string path;
-    std::shared_ptr<vf::mpi::Communicator> comm;
+    std::shared_ptr<vf::parallel::Communicator> comm;
     SPtr<MPIIORestartSimulationObserver> rp;
     std::string metafile;
 };
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/ForceCalculator.cpp b/src/cpu/VirtualFluidsCore/SimulationObservers/ForceCalculator.cpp
index 9a39ce11e..7f2b30a96 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/ForceCalculator.cpp
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/ForceCalculator.cpp
@@ -4,13 +4,13 @@
 #include "BCArray3D.h"
 #include "Block3D.h"
 #include "BoundaryConditions.h"
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 #include "D3Q27Interactor.h"
 #include "DataSet3D.h"
 #include "DistributionArray3D.h"
 #include "LBMKernel.h"
 
-ForceCalculator::ForceCalculator(std::shared_ptr<vf::mpi::Communicator> comm)
+ForceCalculator::ForceCalculator(std::shared_ptr<vf::parallel::Communicator> comm)
     : comm(comm), forceX1global(0), forceX2global(0), forceX3global(0)
 {
 }
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/ForceCalculator.h b/src/cpu/VirtualFluidsCore/SimulationObservers/ForceCalculator.h
index 03b00f360..6f7266d8e 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/ForceCalculator.h
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/ForceCalculator.h
@@ -15,14 +15,14 @@
 #include "Vector3D.h"
 
 class D3Q27Interactor;
-namespace vf::mpi {class Communicator;}
+namespace vf::parallel {class Communicator;}
 class DistributionArray3D;
 class BoundaryConditions;
 
 class ForceCalculator
 {
 public:
-    ForceCalculator(std::shared_ptr<vf::mpi::Communicator> comm);
+    ForceCalculator(std::shared_ptr<vf::parallel::Communicator> comm);
     virtual ~ForceCalculator();
 
     void calculateForces(std::vector<std::shared_ptr<D3Q27Interactor>> interactors);
@@ -35,7 +35,7 @@ public:
 private:
     void gatherGlobalForces();
 
-    std::shared_ptr<vf::mpi::Communicator> comm;
+    std::shared_ptr<vf::parallel::Communicator> comm;
 
     real forceX1global;
     real forceX2global;
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/InSituCatalystSimulationObserver.cpp b/src/cpu/VirtualFluidsCore/SimulationObservers/InSituCatalystSimulationObserver.cpp
index 4e8fd6d5f..07a27f074 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/InSituCatalystSimulationObserver.cpp
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/InSituCatalystSimulationObserver.cpp
@@ -20,7 +20,7 @@ InSituCatalystSimulationObserver::InSituCatalystSimulationObserver() {}
 InSituCatalystSimulationObserver::InSituCatalystSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, std::string script)
     : SimulationObserver(grid, s)
 {
-    gridRank     = vf::mpi::Communicator::getInstance()->getProcessID();
+    gridRank     = vf::parallel::Communicator::getInstance()->getProcessID();
     minInitLevel = this->grid->getCoarsestInitializedLevel();
     maxInitLevel = this->grid->getFinestInitializedLevel();
 
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/InSituVTKSimulationObserver.cpp b/src/cpu/VirtualFluidsCore/SimulationObservers/InSituVTKSimulationObserver.cpp
index 74c1b653b..2dbdcb637 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/InSituVTKSimulationObserver.cpp
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/InSituVTKSimulationObserver.cpp
@@ -30,7 +30,7 @@ InSituVTKSimulationObserver::InSituVTKSimulationObserver(SPtr<Grid3D> grid, SPtr
                                            SPtr<LBMUnitConverter> conv)
     : SimulationObserver(grid, s), conv(conv)
 {
-    gridRank     = vf::mpi::Communicator::getInstance()->getProcessID();
+    gridRank     = vf::parallel::Communicator::getInstance()->getProcessID();
     minInitLevel = this->grid->getCoarsestInitializedLevel();
     maxInitLevel = this->grid->getFinestInitializedLevel();
 
@@ -269,7 +269,7 @@ void InSituVTKSimulationObserver::readConfigFile(const std::string &configFile)
     string dummy;
     int wRank = 0;
     getline(ifs, dummy);
-    int np = vf::mpi::Communicator::getInstance()->getNumberOfProcesses();
+    int np = vf::parallel::Communicator::getInstance()->getNumberOfProcesses();
 
     while (ifs.good()) {
         getline(ifs, dummy, ';');
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/IntegrateValuesHelper.cpp b/src/cpu/VirtualFluidsCore/SimulationObservers/IntegrateValuesHelper.cpp
index 7eabcd284..da55dbee1 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/IntegrateValuesHelper.cpp
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/IntegrateValuesHelper.cpp
@@ -10,7 +10,7 @@
 #include "LBMKernel.h"
 
 //////////////////////////////////////////////////////////////////////////
-IntegrateValuesHelper::IntegrateValuesHelper(SPtr<Grid3D> grid, std::shared_ptr<vf::mpi::Communicator> comm, real minX1, real minX2,
+IntegrateValuesHelper::IntegrateValuesHelper(SPtr<Grid3D> grid, std::shared_ptr<vf::parallel::Communicator> comm, real minX1, real minX2,
                                              real minX3, real maxX1, real maxX2, real maxX3)
     :
 
@@ -21,7 +21,7 @@ IntegrateValuesHelper::IntegrateValuesHelper(SPtr<Grid3D> grid, std::shared_ptr<
     init(-1);
 }
 //////////////////////////////////////////////////////////////////////////
-IntegrateValuesHelper::IntegrateValuesHelper(SPtr<Grid3D> grid, std::shared_ptr<vf::mpi::Communicator> comm, real minX1, real minX2,
+IntegrateValuesHelper::IntegrateValuesHelper(SPtr<Grid3D> grid, std::shared_ptr<vf::parallel::Communicator> comm, real minX1, real minX2,
                                              real minX3, real maxX1, real maxX2, real maxX3, int level)
     :
 
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/IntegrateValuesHelper.h b/src/cpu/VirtualFluidsCore/SimulationObservers/IntegrateValuesHelper.h
index c804d7462..6404ca7bf 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/IntegrateValuesHelper.h
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/IntegrateValuesHelper.h
@@ -5,7 +5,7 @@
 
 #include "Block3D.h"
 #include "CbArray2D.h"
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 #include "D3Q27System.h"
 #include "GbCuboid3D.h"
 #include "Grid3D.h"
@@ -36,9 +36,9 @@ public:
     };
 
 public:
-    IntegrateValuesHelper(SPtr<Grid3D> grid, std::shared_ptr<vf::mpi::Communicator> comm, real minX1, real minX2, real minX3,
+    IntegrateValuesHelper(SPtr<Grid3D> grid, std::shared_ptr<vf::parallel::Communicator> comm, real minX1, real minX2, real minX3,
                           real maxX1, real maxX2, real maxX3);
-    IntegrateValuesHelper(SPtr<Grid3D> grid, std::shared_ptr<vf::mpi::Communicator> comm, real minX1, real minX2, real minX3,
+    IntegrateValuesHelper(SPtr<Grid3D> grid, std::shared_ptr<vf::parallel::Communicator> comm, real minX1, real minX2, real minX3,
                           real maxX1, real maxX2, real maxX3, int level);
     virtual ~IntegrateValuesHelper();
 
@@ -77,7 +77,7 @@ private:
     real sAvVx1, sAvVx2, sAvVx3, sTSx1, sTSx2, sTSx3, sTSx1x3;
     std::vector<CalcNodes> cnodes;
     GbCuboid3DPtr boundingBox;
-    std::shared_ptr<vf::mpi::Communicator> comm;
+    std::shared_ptr<vf::parallel::Communicator> comm;
     CbArray2D<Node> cnodes2DMatrix;
     enum Values { AvVx = 0, AvVy = 1, AvVz = 2, AvVxx = 3, AvVyy = 4, AvVzz = 5, AvVxy = 6, AvVyz = 7, AvVxz = 8 };
 };
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/LineTimeSeriesSimulationObserver.cpp b/src/cpu/VirtualFluidsCore/SimulationObservers/LineTimeSeriesSimulationObserver.cpp
index 75350fb6e..e312bf2b2 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/LineTimeSeriesSimulationObserver.cpp
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/LineTimeSeriesSimulationObserver.cpp
@@ -3,7 +3,7 @@
 #include "WbWriterVtkXmlASCII.h"
 
 #include "Block3D.h"
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 #include "CompressibleCumulantLBMKernel.h"
 #include "CoordinateTransformation3D.h"
 #include "DataSet3D.h"
@@ -13,7 +13,7 @@
 #include "UbScheduler.h"
 
 LineTimeSeriesSimulationObserver::LineTimeSeriesSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, const std::string &path,
-                                                     SPtr<GbLine3D> line, int level, std::shared_ptr<vf::mpi::Communicator> comm)
+                                                     SPtr<GbLine3D> line, int level, std::shared_ptr<vf::parallel::Communicator> comm)
     : SimulationObserver(grid, s), path(path), length(0), ix1(0), ix2(0), ix3(0), level(level), line(line)
 {
     root  = comm->isRoot();
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/LineTimeSeriesSimulationObserver.h b/src/cpu/VirtualFluidsCore/SimulationObservers/LineTimeSeriesSimulationObserver.h
index 0f8a9ab44..db4fea82f 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/LineTimeSeriesSimulationObserver.h
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/LineTimeSeriesSimulationObserver.h
@@ -9,7 +9,7 @@
 #include "SimulationObserver.h"
 #include "LBMSystem.h"
 
-namespace vf::mpi {class Communicator;}
+namespace vf::parallel {class Communicator;}
 class Grid3D;
 class UbScheduler;
 class GbLine3D;
@@ -27,7 +27,7 @@ public:
 
 public:
     LineTimeSeriesSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, const std::string &path, SPtr<GbLine3D> line,
-                              int level, std::shared_ptr<vf::mpi::Communicator> comm);
+                              int level, std::shared_ptr<vf::parallel::Communicator> comm);
     ~LineTimeSeriesSimulationObserver() override = default;
 
     void update(real step) override;
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/MPIIOMigrationBESimulationObserver.cpp b/src/cpu/VirtualFluidsCore/SimulationObservers/MPIIOMigrationBESimulationObserver.cpp
index 5cdc87c68..6fc3eb9b7 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/MPIIOMigrationBESimulationObserver.cpp
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/MPIIOMigrationBESimulationObserver.cpp
@@ -3,7 +3,7 @@
 #include "BCSet.h"
 #include "Block3D.h"
 #include "BoundaryConditions.h"
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 #include "CoordinateTransformation3D.h"
 #include "D3Q27EsoTwist3DSplittedVector.h"
 #include "D3Q27System.h"
@@ -25,7 +25,7 @@ using namespace MPIIODataStructures;
 #define MESSAGE_TAG 80
 #define SEND_BLOCK_SIZE 100000
 
-MPIIOMigrationBESimulationObserver::MPIIOMigrationBESimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, SPtr<Grid3DVisitor> mV, const std::string &path, std::shared_ptr<vf::mpi::Communicator> comm)
+MPIIOMigrationBESimulationObserver::MPIIOMigrationBESimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, SPtr<Grid3DVisitor> mV, const std::string &path, std::shared_ptr<vf::parallel::Communicator> comm)
     : MPIIOSimulationObserver(grid, s, path, comm), nue(-999.999), nuL(-999.999), nuG(-999.999), densityRatio(-999.999)
 {
     memset(&boundCondParamStr, 0, sizeof(boundCondParamStr));
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/MPIIOMigrationBESimulationObserver.h b/src/cpu/VirtualFluidsCore/SimulationObservers/MPIIOMigrationBESimulationObserver.h
index fa55ea17a..cec360a73 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/MPIIOMigrationBESimulationObserver.h
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/MPIIOMigrationBESimulationObserver.h
@@ -10,7 +10,7 @@
 
 class Grid3D;
 class UbScheduler;
-namespace vf::mpi {class Communicator;}
+namespace vf::parallel {class Communicator;}
 class BCSet;
 class LBMKernel;
 class Grid3DVisitor;
@@ -33,7 +33,7 @@ class MPIIOMigrationBESimulationObserver : public MPIIOSimulationObserver
 
 public:
     MPIIOMigrationBESimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, SPtr<Grid3DVisitor> mV, const std::string &path,
-                                std::shared_ptr<vf::mpi::Communicator> comm);
+                                std::shared_ptr<vf::parallel::Communicator> comm);
     ~MPIIOMigrationBESimulationObserver() override;
     //! Each timestep writes the grid into the files
     void update(real step) override;
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/MPIIOMigrationSimulationObserver.cpp b/src/cpu/VirtualFluidsCore/SimulationObservers/MPIIOMigrationSimulationObserver.cpp
index 860b3f02a..b2bbc1bcd 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/MPIIOMigrationSimulationObserver.cpp
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/MPIIOMigrationSimulationObserver.cpp
@@ -3,7 +3,7 @@
 #include "BCSet.h"
 #include "Block3D.h"
 #include "BoundaryConditions.h"
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 #include "CoordinateTransformation3D.h"
 #include "D3Q27EsoTwist3DSplittedVector.h"
 #include "D3Q27System.h"
@@ -22,7 +22,7 @@
 
 using namespace MPIIODataStructures;
 
-MPIIOMigrationSimulationObserver::MPIIOMigrationSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, SPtr<Grid3DVisitor> mV, const std::string &path, std::shared_ptr<vf::mpi::Communicator> comm)
+MPIIOMigrationSimulationObserver::MPIIOMigrationSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, SPtr<Grid3DVisitor> mV, const std::string &path, std::shared_ptr<vf::parallel::Communicator> comm)
     : MPIIOSimulationObserver(grid, s, path, comm)
 {
     memset(&boundCondParamStr, 0, sizeof(boundCondParamStr));
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/MPIIOMigrationSimulationObserver.h b/src/cpu/VirtualFluidsCore/SimulationObservers/MPIIOMigrationSimulationObserver.h
index 588366a64..bf70641fd 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/MPIIOMigrationSimulationObserver.h
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/MPIIOMigrationSimulationObserver.h
@@ -9,7 +9,7 @@
 
 class Grid3D;
 class UbScheduler;
-namespace vf::mpi {class Communicator;}
+namespace vf::parallel {class Communicator;}
 class BCSet;
 class LBMKernel;
 class Grid3DVisitor;
@@ -31,7 +31,7 @@ public:
         PressureField = 9
     };
 
-    MPIIOMigrationSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, SPtr<Grid3DVisitor> mV, const std::string &path, std::shared_ptr<vf::mpi::Communicator> comm);
+    MPIIOMigrationSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, SPtr<Grid3DVisitor> mV, const std::string &path, std::shared_ptr<vf::parallel::Communicator> comm);
     ~MPIIOMigrationSimulationObserver() override;
     //! Each timestep writes the grid into the files
     void update(real step) override;
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/MPIIORestartSimulationObserver.cpp b/src/cpu/VirtualFluidsCore/SimulationObservers/MPIIORestartSimulationObserver.cpp
index fdc3f4d43..e4722b31f 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/MPIIORestartSimulationObserver.cpp
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/MPIIORestartSimulationObserver.cpp
@@ -3,7 +3,7 @@
 #include "BCSet.h"
 #include "Block3D.h"
 #include "BoundaryConditions.h"
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 #include "CoordinateTransformation3D.h"
 #include "D3Q27EsoTwist3DSplittedVector.h"
 #include "D3Q27System.h"
@@ -25,7 +25,7 @@
 
 using namespace MPIIODataStructures;
 
-MPIIORestartSimulationObserver::MPIIORestartSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, const std::string &path, std::shared_ptr<vf::mpi::Communicator> comm)
+MPIIORestartSimulationObserver::MPIIORestartSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, const std::string &path, std::shared_ptr<vf::parallel::Communicator> comm)
     : MPIIOSimulationObserver(grid, s, path, comm)
 {
     memset(&boundCondParamStr, 0, sizeof(boundCondParamStr));
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/MPIIORestartSimulationObserver.h b/src/cpu/VirtualFluidsCore/SimulationObservers/MPIIORestartSimulationObserver.h
index 5681d9886..d07bf6b77 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/MPIIORestartSimulationObserver.h
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/MPIIORestartSimulationObserver.h
@@ -11,7 +11,7 @@
 
 class Grid3D;
 class UbScheduler;
-namespace vf::mpi {class Communicator;}
+namespace vf::parallel {class Communicator;}
 class BCSet;
 class LBMKernel;
 
@@ -32,7 +32,7 @@ public:
         PressureField = 9
     };
 
-    MPIIORestartSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, const std::string &path, std::shared_ptr<vf::mpi::Communicator> comm);
+    MPIIORestartSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, const std::string &path, std::shared_ptr<vf::parallel::Communicator> comm);
     ~MPIIORestartSimulationObserver() override;
     //! Each timestep writes the grid into the files
     void update(real step) override;
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/MPIIOSimulationObserver.cpp b/src/cpu/VirtualFluidsCore/SimulationObservers/MPIIOSimulationObserver.cpp
index adb47a75a..19ea0482a 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/MPIIOSimulationObserver.cpp
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/MPIIOSimulationObserver.cpp
@@ -1,6 +1,6 @@
 #include "MPIIOSimulationObserver.h"
 #include "Block3D.h"
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 #include "CoordinateTransformation3D.h"
 #include "Grid3D.h"
 #include "MPIIODataStructures.h"
@@ -13,7 +13,7 @@
 using namespace MPIIODataStructures;
 
 MPIIOSimulationObserver::MPIIOSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, const std::string &path,
-                                   std::shared_ptr<vf::mpi::Communicator> comm)
+                                   std::shared_ptr<vf::parallel::Communicator> comm)
     : SimulationObserver(grid, s), path(path), comm(comm)
 {
     UbSystem::makeDirectory(path + "/mpi_io_cp");
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/MPIIOSimulationObserver.h b/src/cpu/VirtualFluidsCore/SimulationObservers/MPIIOSimulationObserver.h
index c04938a71..9ecaf89a7 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/MPIIOSimulationObserver.h
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/MPIIOSimulationObserver.h
@@ -8,14 +8,14 @@
 
 class Grid3D;
 class UbScheduler;
-namespace vf::mpi {class Communicator;}
+namespace vf::parallel {class Communicator;}
 
 //! \class MPIIOSimulationObserver
 //! \brief Writes the grid into the files each timestep and reads the grid from the files before regenerating
 class MPIIOSimulationObserver : public SimulationObserver
 {
 public:
-    MPIIOSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, const std::string &path, std::shared_ptr<vf::mpi::Communicator> comm);
+    MPIIOSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, const std::string &path, std::shared_ptr<vf::parallel::Communicator> comm);
     ~MPIIOSimulationObserver() override;
 
     //! Each timestep writes the grid into the files
@@ -37,7 +37,7 @@ public:
 
 protected:
     std::string path;
-    std::shared_ptr<vf::mpi::Communicator> comm;
+    std::shared_ptr<vf::parallel::Communicator> comm;
     MPI_Datatype gridParamType, block3dType, dataSetParamType, boundCondType, arrayPresenceType;
 };
 #endif // ! _MPIIOSimulationObserver_H_
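
A note on the convention this hunk repeats in every header of the patch: headers only forward-declare the renamed communicator, while the matching .cpp includes <parallel/Communicator.h>. Below is a minimal sketch of that pattern, not code from the repository; the class name MyObserver is hypothetical:

    // MyObserver.h -- the forward declaration keeps <parallel/Communicator.h>
    // (and the MPI headers behind it) out of every file that includes this one.
    #include <memory>

    namespace vf::parallel { class Communicator; }

    class MyObserver
    {
    public:
        explicit MyObserver(std::shared_ptr<vf::parallel::Communicator> comm);

    private:
        std::shared_ptr<vf::parallel::Communicator> comm;
    };

    // MyObserver.cpp -- only the translation unit needs the full definition.
    #include <parallel/Communicator.h>

    MyObserver::MyObserver(std::shared_ptr<vf::parallel::Communicator> comm) : comm(comm) {}

This works because std::shared_ptr may hold an incomplete type, so the member declaration compiles without the full Communicator definition.
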
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/MicrophoneArraySimulationObserver.cpp b/src/cpu/VirtualFluidsCore/SimulationObservers/MicrophoneArraySimulationObserver.cpp
index 2979c841c..10749b513 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/MicrophoneArraySimulationObserver.cpp
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/MicrophoneArraySimulationObserver.cpp
@@ -2,7 +2,7 @@
 #include "BCArray3D.h"
 #include "BCSet.h"
 #include "Block3D.h"
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 #include "D3Q27System.h"
 #include "DataSet3D.h"
 #include "DistributionArray3D.h"
@@ -13,7 +13,7 @@
 #include <sstream>
 
 MicrophoneArraySimulationObserver::MicrophoneArraySimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, const std::string &path,
-                                                       std::shared_ptr<vf::mpi::Communicator> comm)
+                                                       std::shared_ptr<vf::parallel::Communicator> comm)
     : SimulationObserver(grid, s), path(path), comm(comm)
 {
     count = 0;
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/MicrophoneArraySimulationObserver.h b/src/cpu/VirtualFluidsCore/SimulationObservers/MicrophoneArraySimulationObserver.h
index e87954fa2..f95b435ba 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/MicrophoneArraySimulationObserver.h
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/MicrophoneArraySimulationObserver.h
@@ -8,7 +8,7 @@
 #include <string>
 #include <vector>
 
-namespace vf::mpi {class Communicator;}
+namespace vf::parallel {class Communicator;}
 class Grid3D;
 class UbScheduler;
 class Vector3D;
@@ -23,7 +23,7 @@ class MicrophoneArraySimulationObserver : public SimulationObserver
 {
 public:
     MicrophoneArraySimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, const std::string &path,
-                               std::shared_ptr<vf::mpi::Communicator> comm);
+                               std::shared_ptr<vf::parallel::Communicator> comm);
     ~MicrophoneArraySimulationObserver() override;
 
     //! calls collectData.
@@ -38,7 +38,7 @@ protected:
 
 private:
     std::string path;
-    std::shared_ptr<vf::mpi::Communicator> comm;
+    std::shared_ptr<vf::parallel::Communicator> comm;
 
     struct Mic {
         unsigned int id;
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/NUPSCounterSimulationObserver.cpp b/src/cpu/VirtualFluidsCore/SimulationObservers/NUPSCounterSimulationObserver.cpp
index 3bd0bd6f8..81d7217c4 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/NUPSCounterSimulationObserver.cpp
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/NUPSCounterSimulationObserver.cpp
@@ -33,12 +33,12 @@
 
 #include "NUPSCounterSimulationObserver.h"
 
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 #include "Grid3D.h"
 #include "UbScheduler.h"
 
 NUPSCounterSimulationObserver::NUPSCounterSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, int numOfThreads,
-                                               std::shared_ptr<vf::mpi::Communicator> comm)
+                                               std::shared_ptr<vf::parallel::Communicator> comm)
     : SimulationObserver(grid, s), numOfThreads(numOfThreads), nup(0), nup_t(0), nupsStep(0.0), comm(comm)
 {
     if (comm->getProcessID() == comm->getRoot()) {
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/NUPSCounterSimulationObserver.h b/src/cpu/VirtualFluidsCore/SimulationObservers/NUPSCounterSimulationObserver.h
index fdce1c4d6..f0585bbdb 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/NUPSCounterSimulationObserver.h
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/NUPSCounterSimulationObserver.h
@@ -39,7 +39,7 @@
 #include "SimulationObserver.h"
 #include "basics/utilities/UbTiming.h"
 
-namespace vf::mpi {class Communicator;}
+namespace vf::parallel {class Communicator;}
 class Grid3D;
 class UbScheduler;
 
@@ -54,7 +54,7 @@ public:
     //! \param s is UbScheduler object for scheduling of observer
     //! \param numOfThreads is number of threads
     //! \param comm is Communicator object
-    NUPSCounterSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, int numOfThreads, std::shared_ptr<vf::mpi::Communicator> comm);
+    NUPSCounterSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, int numOfThreads, std::shared_ptr<vf::parallel::Communicator> comm);
     ~NUPSCounterSimulationObserver() override;
 
     void update(real step) override;
@@ -70,7 +70,7 @@ protected:
     real nup;
     real nup_t;
     real nupsStep;
-    std::shared_ptr<vf::mpi::Communicator> comm;
+    std::shared_ptr<vf::parallel::Communicator> comm;
 };
 
 #endif
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/PressureCoefficientSimulationObserver.cpp b/src/cpu/VirtualFluidsCore/SimulationObservers/PressureCoefficientSimulationObserver.cpp
index 7c9cd4b85..f36997c05 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/PressureCoefficientSimulationObserver.cpp
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/PressureCoefficientSimulationObserver.cpp
@@ -4,7 +4,7 @@
 #include "BCArray3D.h"
 #include "BCSet.h"
 #include "Block3D.h"
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 #include "D3Q27Interactor.h"
 #include "DataSet3D.h"
 #include "GbCuboid3D.h"
@@ -14,7 +14,7 @@
 
 PressureCoefficientSimulationObserver::PressureCoefficientSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s,
                                                                GbCuboid3DPtr plane, const std::string &path,
-                                                               std::shared_ptr<vf::mpi::Communicator> comm)
+                                                               std::shared_ptr<vf::parallel::Communicator> comm)
     : SimulationObserver(grid, s), plane(plane), path(path), comm(comm)
 {
     maxStep       = scheduler->getMaxEnd();
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/PressureCoefficientSimulationObserver.h b/src/cpu/VirtualFluidsCore/SimulationObservers/PressureCoefficientSimulationObserver.h
index bfb56a65d..16e14af64 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/PressureCoefficientSimulationObserver.h
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/PressureCoefficientSimulationObserver.h
@@ -11,7 +11,7 @@
 
 class GbCuboid3D;
 class D3Q27Interactor;
-namespace vf::mpi {class Communicator;}
+namespace vf::parallel {class Communicator;}
 class Grid3D;
 class UbScheduler;
 
@@ -19,7 +19,7 @@ class PressureCoefficientSimulationObserver : public SimulationObserver
 {
 public:
     PressureCoefficientSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, SPtr<GbCuboid3D> plane,
-                                   const std::string &path, std::shared_ptr<vf::mpi::Communicator> comm);
+                                   const std::string &path, std::shared_ptr<vf::parallel::Communicator> comm);
     ~PressureCoefficientSimulationObserver() override;
 
     void update(real step) override;
@@ -35,7 +35,7 @@ protected:
 private:
     SPtr<GbCuboid3D> plane;
     std::string path;
-    std::shared_ptr<vf::mpi::Communicator> comm;
+    std::shared_ptr<vf::parallel::Communicator> comm;
     std::vector<SPtr<D3Q27Interactor>> interactors;
     int numberOfSteps;
     real maxStep;
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/PressureDifferenceSimulationObserver.cpp b/src/cpu/VirtualFluidsCore/SimulationObservers/PressureDifferenceSimulationObserver.cpp
index 9b3c63f40..c8726bd57 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/PressureDifferenceSimulationObserver.cpp
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/PressureDifferenceSimulationObserver.cpp
@@ -9,7 +9,7 @@
 
 #include <fstream>
 
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 #include "Grid3D.h"
 #include "IntegrateValuesHelper.h"
 #include "LBMUnitConverter.h"
@@ -18,7 +18,7 @@
 PressureDifferenceSimulationObserver::PressureDifferenceSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s,
                                                              const std::string &path, SPtr<IntegrateValuesHelper> h1,
                                                              SPtr<IntegrateValuesHelper> h2, real rhoReal,
-                                                             real uReal, real uLB, std::shared_ptr<vf::mpi::Communicator> comm)
+                                                             real uReal, real uLB, std::shared_ptr<vf::parallel::Communicator> comm)
 
     : SimulationObserver(grid, s), path(path), h1(h1), h2(h2), comm(comm)
 {
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/PressureDifferenceSimulationObserver.h b/src/cpu/VirtualFluidsCore/SimulationObservers/PressureDifferenceSimulationObserver.h
index 35356d25f..df9d5364b 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/PressureDifferenceSimulationObserver.h
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/PressureDifferenceSimulationObserver.h
@@ -14,7 +14,7 @@
 #include "SimulationObserver.h"
 #include "LBMSystem.h"
 
-namespace vf::mpi {class Communicator;}
+namespace vf::parallel {class Communicator;}
 class Grid3D;
 class UbScheduler;
 class LBMUnitConverter;
@@ -26,7 +26,7 @@ public:
     PressureDifferenceSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, const std::string &path,
                                   SPtr<IntegrateValuesHelper> h1, SPtr<IntegrateValuesHelper> h2, real rhoReal,
                                   real uReal, real uLB,
-                                  /*const SPtr<LBMUnitConverter> conv,*/ std::shared_ptr<vf::mpi::Communicator> comm);
+                                  /*const SPtr<LBMUnitConverter> conv,*/ std::shared_ptr<vf::parallel::Communicator> comm);
     ~PressureDifferenceSimulationObserver() override;
 
     void update(real step) override;
@@ -36,7 +36,7 @@ protected:
     std::string path;
     SPtr<LBMUnitConverter> conv;
     void collectData(real step);
-    std::shared_ptr<vf::mpi::Communicator> comm;
+    std::shared_ptr<vf::parallel::Communicator> comm;
     real factor1; //= (1/3)*rhoReal*(uReal/uLB)^2 for calculation pReal = rhoLB * (1/3)*rhoReal*(uReal/uLB)^2,
                      //rhoReal and uReal in SI
     real factor2; //= rhoReal*(uReal/uLB)^2       for calculation pReal = press * rhoReal*(uReal/uLB)^2, rhoReal and
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/QCriterionSimulationObserver.cpp b/src/cpu/VirtualFluidsCore/SimulationObservers/QCriterionSimulationObserver.cpp
index f94b1b447..2060160f9 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/QCriterionSimulationObserver.cpp
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/QCriterionSimulationObserver.cpp
@@ -7,11 +7,11 @@
 #include "basics/writer/WbWriterVtkXmlASCII.h"
 
 #include "BCArray3D.h"
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 #include "UbScheduler.h"
 
 QCriterionSimulationObserver::QCriterionSimulationObserver(SPtr<Grid3D> grid, const std::string &path, WbWriter *const writer,
-                                             SPtr<UbScheduler> s, std::shared_ptr<vf::mpi::Communicator> comm)
+                                             SPtr<UbScheduler> s, std::shared_ptr<vf::parallel::Communicator> comm)
     : SimulationObserver(grid, s), path(path), comm(comm), writer(writer)
 {
     init();
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/QCriterionSimulationObserver.h b/src/cpu/VirtualFluidsCore/SimulationObservers/QCriterionSimulationObserver.h
index 1d5aec23f..45eddf04a 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/QCriterionSimulationObserver.h
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/QCriterionSimulationObserver.h
@@ -13,7 +13,7 @@
 #include "LBMSystem.h"
 #include "UbTuple.h"
 
-namespace vf::mpi {class Communicator;}
+namespace vf::parallel {class Communicator;}
 class Grid3D;
 class UbScheduler;
 class WbWriter;
@@ -29,7 +29,7 @@ class QCriterionSimulationObserver : public SimulationObserver
 {
 public:
     QCriterionSimulationObserver(SPtr<Grid3D> grid, const std::string &path, WbWriter *const writer, SPtr<UbScheduler> s,
-                          std::shared_ptr<vf::mpi::Communicator> comm);
+                          std::shared_ptr<vf::parallel::Communicator> comm);
     //! Make update if timestep is write-timestep specified in SPtr<UbScheduler> s
     void update(real step) override;
 
@@ -58,7 +58,7 @@ private:
     int gridRank; // communicator rank of the current process
     std::string path;
     WbWriter *writer;
-    std::shared_ptr<vf::mpi::Communicator> comm;
+    std::shared_ptr<vf::parallel::Communicator> comm;
     enum Values { xdir = 0, ydir = 1, zdir = 2 }; // labels for the different components
 };
 
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/ShearStressSimulationObserver.cpp b/src/cpu/VirtualFluidsCore/SimulationObservers/ShearStressSimulationObserver.cpp
index 2d7863292..b90dd53d7 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/ShearStressSimulationObserver.cpp
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/ShearStressSimulationObserver.cpp
@@ -4,7 +4,7 @@
 
 #include "BCArray3D.h"
 #include "Block3D.h"
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 #include "D3Q27Interactor.h"
 #include "DataSet3D.h"
 #include "Grid3D.h"
@@ -16,7 +16,7 @@ ShearStressSimulationObserver::ShearStressSimulationObserver(SPtr<Grid3D> grid,
                                                SPtr<UbScheduler> s, SPtr<UbScheduler> rs)
     : SimulationObserver(grid, s), Resetscheduler(rs), path(path), writer(writer)
 {
-    std::shared_ptr<vf::mpi::Communicator> comm = vf::mpi::Communicator::getInstance();
+    std::shared_ptr<vf::parallel::Communicator> comm = vf::parallel::Communicator::getInstance();
     normals.push_back(0);
     normals.push_back(0);
     normals.push_back(1);
@@ -62,7 +62,7 @@ void ShearStressSimulationObserver::collectData(real step)
 
     // vector<string> cellDataNames;
 
-    // std::shared_ptr<vf::mpi::Communicator> comm = vf::mpi::Communicator::getInstance();
+    // std::shared_ptr<vf::parallel::Communicator> comm = vf::parallel::Communicator::getInstance();
     // vector<string> pieces = comm->gatherStrings(piece);
     // if (comm->getProcessID() == comm->getRoot())
     //{
@@ -94,7 +94,7 @@ void ShearStressSimulationObserver::collectData(real step)
     piece           = subfolder + "/" + piece;
 
     vector<string> cellDataNames;
-    std::shared_ptr<vf::mpi::Communicator> comm = vf::mpi::Communicator::getInstance();
+    std::shared_ptr<vf::parallel::Communicator> comm = vf::parallel::Communicator::getInstance();
     vector<string> pieces   = comm->gather(piece);
     if (comm->getProcessID() == comm->getRoot()) {
         string pname =
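
The hunks in this file show the collective-output idiom that the rename leaves untouched: every rank contributes a piece, the pieces are gathered, and only the root rank assembles the result. The following is a minimal sketch of that idiom using only communicator members visible in this patch (getInstance, gather, getProcessID, getRoot); writeCollectionFile is a hypothetical stand-in for the VTK collection writing done here:

    #include <parallel/Communicator.h>

    #include <memory>
    #include <string>
    #include <vector>

    // Hypothetical root-only writer, e.g. one collection file referencing all pieces.
    void writeCollectionFile(const std::vector<std::string> &pieces) { (void)pieces; }

    void collectPieces(const std::string &piece)
    {
        std::shared_ptr<vf::parallel::Communicator> comm = vf::parallel::Communicator::getInstance();

        // Every rank sends its piece; the gathered list is consumed on the root rank only.
        std::vector<std::string> pieces = comm->gather(piece);

        if (comm->getProcessID() == comm->getRoot())
            writeCollectionFile(pieces);
    }
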
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/TimeAveragedValuesSimulationObserver.cpp b/src/cpu/VirtualFluidsCore/SimulationObservers/TimeAveragedValuesSimulationObserver.cpp
index ebd65f625..6c1d833cc 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/TimeAveragedValuesSimulationObserver.cpp
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/TimeAveragedValuesSimulationObserver.cpp
@@ -4,7 +4,7 @@
 #include "LBMKernel.h"
 
 #include "Block3D.h"
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 #include "DataSet3D.h"
 #include "Grid3D.h"
 #include "UbScheduler.h"
@@ -16,7 +16,7 @@ TimeAveragedValuesSimulationObserver::TimeAveragedValuesSimulationObserver() = d
 //////////////////////////////////////////////////////////////////////////
 TimeAveragedValuesSimulationObserver::TimeAveragedValuesSimulationObserver(SPtr<Grid3D> grid, const std::string &path,
                                                              WbWriter *const writer, SPtr<UbScheduler> s,
-                                                             std::shared_ptr<vf::mpi::Communicator> comm, int options)
+                                                             std::shared_ptr<vf::parallel::Communicator> comm, int options)
     : SimulationObserver(grid, s), path(path), writer(writer), comm(comm), options(options)
 {
     init();
@@ -26,7 +26,7 @@ TimeAveragedValuesSimulationObserver::TimeAveragedValuesSimulationObserver(SPtr<
 //////////////////////////////////////////////////////////////////////////
 TimeAveragedValuesSimulationObserver::TimeAveragedValuesSimulationObserver(SPtr<Grid3D> grid, const std::string &path,
                                                              WbWriter *const writer, SPtr<UbScheduler> s,
-                                                             std::shared_ptr<vf::mpi::Communicator> comm, int options,
+                                                             std::shared_ptr<vf::parallel::Communicator> comm, int options,
                                                              std::vector<int> levels, std::vector<real> &levelCoords,
                                                              std::vector<real> &bounds, bool timeAveraging)
     : SimulationObserver(grid, s), path(path), writer(writer), comm(comm), options(options), levels(levels),
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/TimeAveragedValuesSimulationObserver.h b/src/cpu/VirtualFluidsCore/SimulationObservers/TimeAveragedValuesSimulationObserver.h
index 14a1f6354..a9f78137b 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/TimeAveragedValuesSimulationObserver.h
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/TimeAveragedValuesSimulationObserver.h
@@ -9,7 +9,7 @@
 #include "IntegrateValuesHelper.h"
 #include "LBMSystem.h"
 
-namespace vf::mpi {class Communicator;}
+namespace vf::parallel {class Communicator;}
 class Grid3D;
 class UbScheduler;
 class WbWriter;
@@ -41,9 +41,9 @@ public:
 public:
     TimeAveragedValuesSimulationObserver();
     TimeAveragedValuesSimulationObserver(SPtr<Grid3D> grid, const std::string &path, WbWriter *const writer,
-                                  SPtr<UbScheduler> s, std::shared_ptr<vf::mpi::Communicator> comm, int options);
+                                  SPtr<UbScheduler> s, std::shared_ptr<vf::parallel::Communicator> comm, int options);
     TimeAveragedValuesSimulationObserver(SPtr<Grid3D> grid, const std::string &path, WbWriter *const writer,
-                                  SPtr<UbScheduler> s, std::shared_ptr<vf::mpi::Communicator> comm, int options, std::vector<int> levels,
+                                  SPtr<UbScheduler> s, std::shared_ptr<vf::parallel::Communicator> comm, int options, std::vector<int> levels,
                                   std::vector<real> &levelCoords, std::vector<real> &bounds,
                                   bool timeAveraging = true);
     //! Make update
@@ -70,7 +70,7 @@ protected:
     void calculateAverageValuesForPlane(std::vector<IntegrateValuesHelper::CalcNodes> &cnodes);
 
 private:
-    std::shared_ptr<vf::mpi::Communicator> comm;
+    std::shared_ptr<vf::parallel::Communicator> comm;
     std::vector<UbTupleFloat3> nodes;
     std::vector<UbTupleUInt8> cells;
     std::vector<std::string> datanames;
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/TimeseriesSimulationObserver.cpp b/src/cpu/VirtualFluidsCore/SimulationObservers/TimeseriesSimulationObserver.cpp
index e0560e276..5245c51d9 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/TimeseriesSimulationObserver.cpp
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/TimeseriesSimulationObserver.cpp
@@ -9,14 +9,14 @@
 
 #include <fstream>
 
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 #include "Grid3D.h"
 #include "IntegrateValuesHelper.h"
 #include "LBMUnitConverter.h"
 #include "UbScheduler.h"
 
 TimeseriesSimulationObserver::TimeseriesSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, SPtr<IntegrateValuesHelper> h1,
-                                             const std::string &path, std::shared_ptr<vf::mpi::Communicator> comm)
+                                             const std::string &path, std::shared_ptr<vf::parallel::Communicator> comm)
     : SimulationObserver(grid, s), h1(h1), path(path), comm(comm)
 {
     if (comm->getProcessID() == comm->getRoot()) {
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/TimeseriesSimulationObserver.h b/src/cpu/VirtualFluidsCore/SimulationObservers/TimeseriesSimulationObserver.h
index db41bd2ec..d467b2301 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/TimeseriesSimulationObserver.h
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/TimeseriesSimulationObserver.h
@@ -13,7 +13,7 @@
 
 #include "SimulationObserver.h"
 
-namespace vf::mpi {class Communicator;}
+namespace vf::parallel {class Communicator;}
 class Grid3D;
 class UbScheduler;
 class IntegrateValuesHelper;
@@ -27,7 +27,7 @@ class TimeseriesSimulationObserver : public SimulationObserver
 {
 public:
     TimeseriesSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, SPtr<IntegrateValuesHelper> h1,
-                          const std::string &path, std::shared_ptr<vf::mpi::Communicator> comm);
+                          const std::string &path, std::shared_ptr<vf::parallel::Communicator> comm);
     ~TimeseriesSimulationObserver() override;
 
     //! calls collectData.
@@ -38,7 +38,7 @@ protected:
 
     //! object that can compute spatial average values in a 3D subdomain.
     SPtr<IntegrateValuesHelper> h1;
-    std::shared_ptr<vf::mpi::Communicator> comm;
+    std::shared_ptr<vf::parallel::Communicator> comm;
 
 private:
     std::string path; //! output filename, e.g. pathname + "/steps/timeseries"
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/TurbulenceIntensitySimulationObserver.cpp b/src/cpu/VirtualFluidsCore/SimulationObservers/TurbulenceIntensitySimulationObserver.cpp
index 47b865ed7..d5a0ccb59 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/TurbulenceIntensitySimulationObserver.cpp
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/TurbulenceIntensitySimulationObserver.cpp
@@ -3,7 +3,7 @@
 #include "BCArray3D.h"
 #include "BCSet.h"
 #include "Block3D.h"
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 #include "DataSet3D.h"
 #include "Grid3D.h"
 #include "LBMKernel.h"
@@ -14,7 +14,7 @@
 
 TurbulenceIntensitySimulationObserver::TurbulenceIntensitySimulationObserver(SPtr<Grid3D> grid, const std::string &path,
                                                                WbWriter *const writer, SPtr<UbScheduler> s,
-                                                               std::shared_ptr<vf::mpi::Communicator> comm)
+                                                               std::shared_ptr<vf::parallel::Communicator> comm)
     : SimulationObserver(grid, s), path(path), comm(comm), writer(writer)
 {
     init();
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/TurbulenceIntensitySimulationObserver.h b/src/cpu/VirtualFluidsCore/SimulationObservers/TurbulenceIntensitySimulationObserver.h
index c615bbda5..cffaf49ed 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/TurbulenceIntensitySimulationObserver.h
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/TurbulenceIntensitySimulationObserver.h
@@ -8,7 +8,7 @@
 #include "SimulationObserver.h"
 #include "UbTuple.h"
 
-namespace vf::mpi {class Communicator;}
+namespace vf::parallel {class Communicator;}
 class Grid3D;
 class UbScheduler;
 class WbWriter;
@@ -18,7 +18,7 @@ class TurbulenceIntensitySimulationObserver : public SimulationObserver
 {
 public:
     TurbulenceIntensitySimulationObserver(SPtr<Grid3D> grid, const std::string &path, WbWriter *const writer,
-                                   SPtr<UbScheduler> s, std::shared_ptr<vf::mpi::Communicator> comm);
+                                   SPtr<UbScheduler> s, std::shared_ptr<vf::parallel::Communicator> comm);
     void update(real step) override;
 
 protected:
@@ -39,7 +39,7 @@ private:
     int gridRank;
     std::string path;
     WbWriter *writer;
-    std::shared_ptr<vf::mpi::Communicator> comm;
+    std::shared_ptr<vf::parallel::Communicator> comm;
     enum Values { AvVx = 0, AvVy = 1, AvVz = 2, AvVxxyyzz = 3 };
 };
 #endif
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/WriteBlocksSimulationObserver.cpp b/src/cpu/VirtualFluidsCore/SimulationObservers/WriteBlocksSimulationObserver.cpp
index fd983bd02..15a1c39cf 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/WriteBlocksSimulationObserver.cpp
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/WriteBlocksSimulationObserver.cpp
@@ -36,13 +36,13 @@
 #include <logger/Logger.h>
 
 #include "Block3D.h"
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 #include "D3Q27System.h"
 #include "Grid3D.h"
 #include "UbScheduler.h"
 
 WriteBlocksSimulationObserver::WriteBlocksSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, const std::string &path,
-                                               WbWriter *const writer, std::shared_ptr<vf::mpi::Communicator> comm)
+                                               WbWriter *const writer, std::shared_ptr<vf::parallel::Communicator> comm)
     : SimulationObserver(grid, s), path(path), writer(writer), comm(comm)
 {
 }
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/WriteBlocksSimulationObserver.h b/src/cpu/VirtualFluidsCore/SimulationObservers/WriteBlocksSimulationObserver.h
index 805605b64..636dc9f18 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/WriteBlocksSimulationObserver.h
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/WriteBlocksSimulationObserver.h
@@ -39,7 +39,7 @@
 
 #include "SimulationObserver.h"
 
-namespace vf::mpi {class Communicator;}
+namespace vf::parallel {class Communicator;}
 class Grid3D;
 class UbScheduler;
 class WbWriter;
@@ -57,7 +57,7 @@ public:
     //! \param writer is WbWriter object
     //! \param comm is Communicator object
     WriteBlocksSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, const std::string &path, WbWriter *const writer,
-                           std::shared_ptr<vf::mpi::Communicator> comm);
+                           std::shared_ptr<vf::parallel::Communicator> comm);
     ~WriteBlocksSimulationObserver() override;
 
     void update(real step) override;
@@ -69,7 +69,7 @@ protected:
 
     std::string path;
     WbWriter *writer;
-    std::shared_ptr<vf::mpi::Communicator> comm;
+    std::shared_ptr<vf::parallel::Communicator> comm;
 };
 
 #endif
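
Because every constructor in the patch now takes the communicator explicitly, wiring an observer amounts to fetching the shared instance once and passing it down. A hypothetical wiring sketch against the constructor declared above; grid, s, path, and writer are assumed to come from the application, and the factory itself is not part of the repository:

    #include <memory>
    #include <string>

    #include "WriteBlocksSimulationObserver.h" // assumed to bring in SPtr and the forward declarations used below
    #include <parallel/Communicator.h>

    SPtr<WriteBlocksSimulationObserver> makeBlockWriter(SPtr<Grid3D> grid, SPtr<UbScheduler> s,
                                                        const std::string &path, WbWriter *writer)
    {
        // One communicator instance is shared by the whole run and handed to each observer.
        auto comm = vf::parallel::Communicator::getInstance();
        return std::make_shared<WriteBlocksSimulationObserver>(grid, s, path, writer, comm);
    }
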
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/WriteBoundaryConditionsSimulationObserver.cpp b/src/cpu/VirtualFluidsCore/SimulationObservers/WriteBoundaryConditionsSimulationObserver.cpp
index 9d09db9e2..d88315475 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/WriteBoundaryConditionsSimulationObserver.cpp
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/WriteBoundaryConditionsSimulationObserver.cpp
@@ -42,7 +42,7 @@
 #include "BCArray3D.h"
 #include "Block3D.h"
 #include "CbArray3D.h"
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 #include "Grid3D.h"
 #include "LBMUnitConverter.h"
 #include "UbScheduler.h"
@@ -55,7 +55,7 @@ WriteBoundaryConditionsSimulationObserver::WriteBoundaryConditionsSimulationObse
 //////////////////////////////////////////////////////////////////////////
 WriteBoundaryConditionsSimulationObserver::WriteBoundaryConditionsSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s,
                                                                        const std::string &path, WbWriter *const writer,
-                                                                       std::shared_ptr<vf::mpi::Communicator> comm)
+                                                                       std::shared_ptr<vf::parallel::Communicator> comm)
     : SimulationObserver(grid, s), path(path), writer(writer), comm(comm)
 {
     gridRank     = comm->getProcessID();
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/WriteBoundaryConditionsSimulationObserver.h b/src/cpu/VirtualFluidsCore/SimulationObservers/WriteBoundaryConditionsSimulationObserver.h
index ad5b20df9..aff789343 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/WriteBoundaryConditionsSimulationObserver.h
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/WriteBoundaryConditionsSimulationObserver.h
@@ -41,7 +41,7 @@
 #include "SimulationObserver.h"
 #include "UbTuple.h"
 
-namespace vf::mpi {class Communicator;}
+namespace vf::parallel {class Communicator;}
 class Grid3D;
 class UbScheduler;
 class WbWriter;
@@ -61,7 +61,7 @@ public:
     //! \param writer is WbWriter object
     //! \param comm is Communicator object
     WriteBoundaryConditionsSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, const std::string &path,
-                                       WbWriter *const writer, std::shared_ptr<vf::mpi::Communicator> comm);
+                                       WbWriter *const writer, std::shared_ptr<vf::parallel::Communicator> comm);
     ~WriteBoundaryConditionsSimulationObserver() override = default;
 
     void update(real step) override;
@@ -84,6 +84,6 @@ private:
     int minInitLevel;
     int maxInitLevel;
     int gridRank;
-    std::shared_ptr<vf::mpi::Communicator> comm;
+    std::shared_ptr<vf::parallel::Communicator> comm;
 };
 #endif
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/WriteGbObjectsSimulationObserver.cpp b/src/cpu/VirtualFluidsCore/SimulationObservers/WriteGbObjectsSimulationObserver.cpp
index 62178444f..7ad8c2dc1 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/WriteGbObjectsSimulationObserver.cpp
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/WriteGbObjectsSimulationObserver.cpp
@@ -1,5 +1,5 @@
 #include "WriteGbObjectsSimulationObserver.h"
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 #include "GbObject3D.h"
 #include "UbScheduler.h"
 #include "WbWriterVtkXmlASCII.h"
@@ -7,7 +7,7 @@
 #include <vector>
 
 WriteGbObjectsSimulationObserver::WriteGbObjectsSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, const std::string &path,
-                                                     WbWriter *const writer, std::shared_ptr<vf::mpi::Communicator> comm)
+                                                     WbWriter *const writer, std::shared_ptr<vf::parallel::Communicator> comm)
     : SimulationObserver(grid, s), path(path), writer(writer), comm(comm)
 {
 }
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/WriteGbObjectsSimulationObserver.h b/src/cpu/VirtualFluidsCore/SimulationObservers/WriteGbObjectsSimulationObserver.h
index 50f88c65d..44e466ee9 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/WriteGbObjectsSimulationObserver.h
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/WriteGbObjectsSimulationObserver.h
@@ -7,7 +7,7 @@
 #include <vector>
 
 class GbObject3D;
-namespace vf::mpi {class Communicator;}
+namespace vf::parallel {class Communicator;}
 class Grid3D;
 class UbScheduler;
 class WbWriter;
@@ -21,7 +21,7 @@ class WriteGbObjectsSimulationObserver : public SimulationObserver
 {
 public:
     WriteGbObjectsSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, const std::string &path, WbWriter *const writer,
-                              std::shared_ptr<vf::mpi::Communicator> comm);
+                              std::shared_ptr<vf::parallel::Communicator> comm);
     ~WriteGbObjectsSimulationObserver() override;
     //! calls collectData.
     void update(real step) override;
@@ -35,7 +35,7 @@ private:
     std::vector<SPtr<GbObject3D>> objects;
     std::string path;
     WbWriter *writer;
-    std::shared_ptr<vf::mpi::Communicator> comm;
+    std::shared_ptr<vf::parallel::Communicator> comm;
 };
 
 #endif // WriteGbObjectsSimulationObserver_h__
\ No newline at end of file
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/WriteMQFromSelectionSimulationObserver.cpp b/src/cpu/VirtualFluidsCore/SimulationObservers/WriteMQFromSelectionSimulationObserver.cpp
index caf1e8c1e..945058c8e 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/WriteMQFromSelectionSimulationObserver.cpp
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/WriteMQFromSelectionSimulationObserver.cpp
@@ -6,7 +6,7 @@
 
 #include "BCArray3D.h"
 #include "Block3D.h"
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 #include "DataSet3D.h"
 #include "GbObject3D.h"
 #include "Grid3D.h"
@@ -19,7 +19,7 @@ WriteMQFromSelectionSimulationObserver::WriteMQFromSelectionSimulationObserver()
 WriteMQFromSelectionSimulationObserver::WriteMQFromSelectionSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s,
                                                                  SPtr<GbObject3D> gbObject, const std::string &path,
                                                                  WbWriter *const writer, SPtr<LBMUnitConverter> conv,
-                                                                 std::shared_ptr<vf::mpi::Communicator> comm)
+                                                                 std::shared_ptr<vf::parallel::Communicator> comm)
     : SimulationObserver(grid, s), gbObject(gbObject), path(path), writer(writer), conv(conv), comm(comm)
 {
     gridRank     = comm->getProcessID();
@@ -80,7 +80,7 @@ void WriteMQFromSelectionSimulationObserver::collectData(real step)
     piece                = subfolder + "/" + piece;
 
     std::vector<std::string> cellDataNames;
-    std::shared_ptr<vf::mpi::Communicator> comm         = vf::mpi::Communicator::getInstance();
+    std::shared_ptr<vf::parallel::Communicator> comm         = vf::parallel::Communicator::getInstance();
     std::vector<std::string> pieces = comm->gather(piece);
     if (comm->getProcessID() == comm->getRoot()) {
         std::string pname =
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/WriteMQFromSelectionSimulationObserver.h b/src/cpu/VirtualFluidsCore/SimulationObservers/WriteMQFromSelectionSimulationObserver.h
index e91fc369e..107d49720 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/WriteMQFromSelectionSimulationObserver.h
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/WriteMQFromSelectionSimulationObserver.h
@@ -10,7 +10,7 @@
 #include "LBMSystem.h"
 #include "UbTuple.h"
 
-namespace vf::mpi {class Communicator;}
+namespace vf::parallel {class Communicator;}
 class Grid3D;
 class UbScheduler;
 class LBMUnitConverter;
@@ -24,7 +24,7 @@ public:
     WriteMQFromSelectionSimulationObserver();
     WriteMQFromSelectionSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, SPtr<GbObject3D> gbObject,
                                     const std::string &path, WbWriter *const writer, SPtr<LBMUnitConverter> conv,
-                                    std::shared_ptr<vf::mpi::Communicator> comm);
+                                    std::shared_ptr<vf::parallel::Communicator> comm);
     ~WriteMQFromSelectionSimulationObserver() override = default;
 
     void update(real step) override;
@@ -47,7 +47,7 @@ private:
     int minInitLevel;
     int maxInitLevel;
     int gridRank;
-    std::shared_ptr<vf::mpi::Communicator> comm;
+    std::shared_ptr<vf::parallel::Communicator> comm;
     SPtr<GbObject3D> gbObject;
 
     using CalcMacrosFct = void (*)(const real *const &, real &, real &, real &, real &);
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/WriteMacroscopicQuantitiesPlusMassSimulationObserver.cpp b/src/cpu/VirtualFluidsCore/SimulationObservers/WriteMacroscopicQuantitiesPlusMassSimulationObserver.cpp
index 142bcc52b..f098a21f5 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/WriteMacroscopicQuantitiesPlusMassSimulationObserver.cpp
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/WriteMacroscopicQuantitiesPlusMassSimulationObserver.cpp
@@ -52,7 +52,7 @@ WriteMacroscopicQuantitiesPlusMassSimulationObserver::WriteMacroscopicQuantities
                                                                              const std::string &path,
                                                                              WbWriter *const writer,
                                                                              SPtr<LBMUnitConverter> conv,
-                                                                             std::shared_ptr<vf::mpi::Communicator> comm)
+                                                                             std::shared_ptr<vf::parallel::Communicator> comm)
         : SimulationObserver(grid, s), path(path), writer(writer), conv(conv), comm(comm)
 {
     gridRank = comm->getProcessID();
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/WriteMacroscopicQuantitiesPlusMassSimulationObserver.h b/src/cpu/VirtualFluidsCore/SimulationObservers/WriteMacroscopicQuantitiesPlusMassSimulationObserver.h
index ce6946528..ee892f41a 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/WriteMacroscopicQuantitiesPlusMassSimulationObserver.h
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/WriteMacroscopicQuantitiesPlusMassSimulationObserver.h
@@ -42,7 +42,7 @@
 #include "LBMSystem.h"
 #include "UbTuple.h"
 
-namespace vf::mpi {class Communicator;}
+namespace vf::parallel {class Communicator;}
 class Grid3D;
 class UbScheduler;
 class LBMUnitConverter;
@@ -63,7 +63,7 @@ public:
     //! \param conv is LBMUnitConverter object
     //! \param comm is Communicator object
     WriteMacroscopicQuantitiesPlusMassSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, const std::string &path,
-                                          WbWriter *const writer, SPtr<LBMUnitConverter> conv, std::shared_ptr<vf::mpi::Communicator> comm);
+                                          WbWriter *const writer, SPtr<LBMUnitConverter> conv, std::shared_ptr<vf::parallel::Communicator> comm);
     ~WriteMacroscopicQuantitiesPlusMassSimulationObserver() override = default;
 
     void update(real step) override;
@@ -90,7 +90,7 @@ private:
     int minInitLevel;
     int maxInitLevel;
     int gridRank;
-    std::shared_ptr<vf::mpi::Communicator> comm;
+    std::shared_ptr<vf::parallel::Communicator> comm;
 
     using CalcMacrosFct = void (*)(const real *const &, real &, real &, real &, real &);
     CalcMacrosFct calcMacros;
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/WriteMacroscopicQuantitiesSimulationObserver.cpp b/src/cpu/VirtualFluidsCore/SimulationObservers/WriteMacroscopicQuantitiesSimulationObserver.cpp
index b87b5cfcf..d5c80b4df 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/WriteMacroscopicQuantitiesSimulationObserver.cpp
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/WriteMacroscopicQuantitiesSimulationObserver.cpp
@@ -39,7 +39,7 @@
 
 #include "BCArray3D.h"
 #include "Block3D.h"
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 #include "DataSet3D.h"
 #include "Grid3D.h"
 #include "LBMUnitConverter.h"
@@ -52,7 +52,7 @@ WriteMacroscopicQuantitiesSimulationObserver::WriteMacroscopicQuantitiesSimulati
                                                                              const std::string &path,
                                                                              WbWriter *const writer,
                                                                              SPtr<LBMUnitConverter> conv,
-                                                                             std::shared_ptr<vf::mpi::Communicator> comm)
+                                                                             std::shared_ptr<vf::parallel::Communicator> comm)
         : SimulationObserver(grid, s), path(path), writer(writer), conv(conv), comm(comm)
 {
     gridRank = comm->getProcessID();
diff --git a/src/cpu/VirtualFluidsCore/SimulationObservers/WriteMacroscopicQuantitiesSimulationObserver.h b/src/cpu/VirtualFluidsCore/SimulationObservers/WriteMacroscopicQuantitiesSimulationObserver.h
index 85de03364..279f9dfba 100644
--- a/src/cpu/VirtualFluidsCore/SimulationObservers/WriteMacroscopicQuantitiesSimulationObserver.h
+++ b/src/cpu/VirtualFluidsCore/SimulationObservers/WriteMacroscopicQuantitiesSimulationObserver.h
@@ -42,7 +42,7 @@
 #include "LBMSystem.h"
 #include "UbTuple.h"
 
-namespace vf::mpi {class Communicator;}
+namespace vf::parallel {class Communicator;}
 class Grid3D;
 class UbScheduler;
 class LBMUnitConverter;
@@ -63,7 +63,7 @@ public:
     //! \param conv is LBMUnitConverter object
     //! \param comm is Communicator object
     WriteMacroscopicQuantitiesSimulationObserver(SPtr<Grid3D> grid, SPtr<UbScheduler> s, const std::string &path,
-                                          WbWriter *const writer, SPtr<LBMUnitConverter> conv, std::shared_ptr<vf::mpi::Communicator> comm);
+                                          WbWriter *const writer, SPtr<LBMUnitConverter> conv, std::shared_ptr<vf::parallel::Communicator> comm);
     ~WriteMacroscopicQuantitiesSimulationObserver() override = default;
 
     void update(real step) override;
@@ -90,7 +90,7 @@ private:
     int minInitLevel;
     int maxInitLevel;
     int gridRank;
-    std::shared_ptr<vf::mpi::Communicator> comm;
+    std::shared_ptr<vf::parallel::Communicator> comm;
 
     using CalcMacrosFct = void (*)(const real *const &, real &, real &, real &, real &);
     CalcMacrosFct calcMacros;
diff --git a/src/cpu/VirtualFluidsCore/Utilities/CheckpointConverter.cpp b/src/cpu/VirtualFluidsCore/Utilities/CheckpointConverter.cpp
index 358dabf43..0f41364ad 100644
--- a/src/cpu/VirtualFluidsCore/Utilities/CheckpointConverter.cpp
+++ b/src/cpu/VirtualFluidsCore/Utilities/CheckpointConverter.cpp
@@ -1,7 +1,7 @@
 #include "CheckpointConverter.h"
 #include "Block3D.h"
 #include "BoundaryConditions.h"
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 #include "CoordinateTransformation3D.h"
 #include "DataSet3D.h"
 #include "Grid3D.h"
@@ -12,7 +12,7 @@
 
 using namespace MPIIODataStructures;
 
-CheckpointConverter::CheckpointConverter(SPtr<Grid3D> grid, const std::string &path, std::shared_ptr<vf::mpi::Communicator> comm)
+CheckpointConverter::CheckpointConverter(SPtr<Grid3D> grid, const std::string &path, std::shared_ptr<vf::parallel::Communicator> comm)
     : grid(grid), path(path), comm(comm)
 {
     UbSystem::makeDirectory(path + "/mpi_io_cp");
diff --git a/src/cpu/VirtualFluidsCore/Utilities/CheckpointConverter.h b/src/cpu/VirtualFluidsCore/Utilities/CheckpointConverter.h
index bab67ae66..a2902b366 100644
--- a/src/cpu/VirtualFluidsCore/Utilities/CheckpointConverter.h
+++ b/src/cpu/VirtualFluidsCore/Utilities/CheckpointConverter.h
@@ -8,14 +8,14 @@
 #include <vector>
 
 class Grid3D;
-namespace vf::mpi {class Communicator;}
+namespace vf::parallel {class Communicator;}
 
 //! \class CheckpointConverter
 //! \brief Converts timestep data from MPIIORestartSimulationObserver format into MPIIOMigrationSimulationObserver format
 class CheckpointConverter
 {
 public:
-    CheckpointConverter(SPtr<Grid3D> grid, const std::string &path, std::shared_ptr<vf::mpi::Communicator> comm);
+    CheckpointConverter(SPtr<Grid3D> grid, const std::string &path, std::shared_ptr<vf::parallel::Communicator> comm);
     virtual ~CheckpointConverter();
 
     void convert(int step, int procCount);
@@ -26,7 +26,7 @@ public:
 
 protected:
     std::string path;
-    std::shared_ptr<vf::mpi::Communicator> comm;
+    std::shared_ptr<vf::parallel::Communicator> comm;
     SPtr<Grid3D> grid;
 
 private:
diff --git a/src/cpu/VirtualFluidsCore/Visitors/CreateTransmittersHelper.cpp b/src/cpu/VirtualFluidsCore/Visitors/CreateTransmittersHelper.cpp
index b931cbbbd..35816e3d5 100644
--- a/src/cpu/VirtualFluidsCore/Visitors/CreateTransmittersHelper.cpp
+++ b/src/cpu/VirtualFluidsCore/Visitors/CreateTransmittersHelper.cpp
@@ -50,7 +50,7 @@ CreateTransmittersHelper::CreateTransmittersHelper() = default;
 //////////////////////////////////////////////////////////////////////////
 void CreateTransmittersHelper::createTransmitters(SPtr<Block3D> sblock, SPtr<Block3D> tblock, int dir, IBlock ib,
                                                   TransmitterPtr &sender, TransmitterPtr &receiver,
-                                                  std::shared_ptr<vf::mpi::Communicator> comm, TransmitterType tType)
+                                                  std::shared_ptr<vf::parallel::Communicator> comm, TransmitterType tType)
 {
     // SourceBlock
     int srcLevel = sblock->getLevel();
diff --git a/src/cpu/VirtualFluidsCore/Visitors/CreateTransmittersHelper.h b/src/cpu/VirtualFluidsCore/Visitors/CreateTransmittersHelper.h
index af60de0a2..1a52078fa 100644
--- a/src/cpu/VirtualFluidsCore/Visitors/CreateTransmittersHelper.h
+++ b/src/cpu/VirtualFluidsCore/Visitors/CreateTransmittersHelper.h
@@ -35,7 +35,7 @@
 #define CREATETRANSMITTERSHELPER_H
 
 #include "Block3D.h"
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 
 #include "LBMSystem.h"
 
@@ -61,7 +61,7 @@ public:
 public:
     CreateTransmittersHelper();
     void createTransmitters(const SPtr<Block3D> sblock, const SPtr<Block3D> tblock, int dir, IBlock ib,
-                            TransmitterPtr &sender, TransmitterPtr &receiver, std::shared_ptr<vf::mpi::Communicator> comm,
+                            TransmitterPtr &sender, TransmitterPtr &receiver, std::shared_ptr<vf::parallel::Communicator> comm,
                             TransmitterType tType);
 
 protected:
diff --git a/src/cpu/VirtualFluidsCore/Visitors/MetisPartitioningGridVisitor.cpp b/src/cpu/VirtualFluidsCore/Visitors/MetisPartitioningGridVisitor.cpp
index 1e62e0a2c..30708d664 100644
--- a/src/cpu/VirtualFluidsCore/Visitors/MetisPartitioningGridVisitor.cpp
+++ b/src/cpu/VirtualFluidsCore/Visitors/MetisPartitioningGridVisitor.cpp
@@ -2,14 +2,14 @@
 
 #include "MetisPartitioningGridVisitor.h"
 #include "Block3D.h"
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 #include "D3Q27System.h"
 #include "Grid3D.h"
 #include <cmath>
 
 using namespace std;
 
-MetisPartitioningGridVisitor::MetisPartitioningGridVisitor(std::shared_ptr<vf::mpi::Communicator> comm, GraphType graphType, int numOfDirs,
+MetisPartitioningGridVisitor::MetisPartitioningGridVisitor(std::shared_ptr<vf::parallel::Communicator> comm, GraphType graphType, int numOfDirs,
                                                            MetisPartitioner::PartType partType, bool threads,
                                                            int numberOfThreads)
     : Grid3DVisitor(), numberOfThreads(numberOfThreads), numOfDirs(numOfDirs), comm(comm), threads(threads),
diff --git a/src/cpu/VirtualFluidsCore/Visitors/MetisPartitioningGridVisitor.h b/src/cpu/VirtualFluidsCore/Visitors/MetisPartitioningGridVisitor.h
index c270d3ce3..d4e290609 100644
--- a/src/cpu/VirtualFluidsCore/Visitors/MetisPartitioningGridVisitor.h
+++ b/src/cpu/VirtualFluidsCore/Visitors/MetisPartitioningGridVisitor.h
@@ -9,7 +9,7 @@
 #include "Grid3DVisitor.h"
 #include "MetisPartitioner.h"
 
-namespace vf::mpi {class Communicator;}
+namespace vf::parallel {class Communicator;}
 
 ////////////////////////////////////////////////////////////////////////
 //! \brief The class implements domain decomposition with METIS library
@@ -32,7 +32,7 @@ public:
     //! \param numOfDirs - maximum number of neighbors for each process
     //! \param threads - on/off decomposition for threads
     //! \param numberOfThreads - number of threads
-    MetisPartitioningGridVisitor(std::shared_ptr<vf::mpi::Communicator> comm, GraphType graphType, int numOfDirs,
+    MetisPartitioningGridVisitor(std::shared_ptr<vf::parallel::Communicator> comm, GraphType graphType, int numOfDirs,
                                  MetisPartitioner::PartType partType = MetisPartitioner::KWAY, bool threads = false,
                                  int numberOfThreads = 0);
     ~MetisPartitioningGridVisitor() override;
@@ -52,7 +52,7 @@ protected:
     int numOfDirs;
     std::vector<int> blockID;
     std::vector<idx_t> parts;
-    std::shared_ptr<vf::mpi::Communicator> comm;
+    std::shared_ptr<vf::parallel::Communicator> comm;
     int bundleRoot;
     int processRoot;
     int bundleID;
diff --git a/src/cpu/VirtualFluidsCore/Visitors/RefineAroundGbObjectHelper.cpp b/src/cpu/VirtualFluidsCore/Visitors/RefineAroundGbObjectHelper.cpp
index a73965641..b2eefc859 100644
--- a/src/cpu/VirtualFluidsCore/Visitors/RefineAroundGbObjectHelper.cpp
+++ b/src/cpu/VirtualFluidsCore/Visitors/RefineAroundGbObjectHelper.cpp
@@ -1,5 +1,5 @@
 #include "RefineAroundGbObjectHelper.h"
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 #include "OverlapBlockVisitor.h"
 #include "RatioBlockVisitor.h"
 #include "RatioSmoothBlockVisitor.h"
@@ -11,7 +11,7 @@
 RefineAroundGbObjectHelper::RefineAroundGbObjectHelper(SPtr<Grid3D> grid, int refineLevel,
                                                        SPtr<D3Q27TriFaceMeshInteractor> objectIter,
                                                        real startDistance, real stopDistance,
-                                                       std::shared_ptr<vf::mpi::Communicator> comm)
+                                                       std::shared_ptr<vf::parallel::Communicator> comm)
     : grid(grid), refineLevel(refineLevel), objectIter(objectIter), startDistance(startDistance),
       stopDistance(stopDistance), comm(comm)
 {
diff --git a/src/cpu/VirtualFluidsCore/Visitors/RefineAroundGbObjectHelper.h b/src/cpu/VirtualFluidsCore/Visitors/RefineAroundGbObjectHelper.h
index 76874ce76..c5cc4d6fc 100644
--- a/src/cpu/VirtualFluidsCore/Visitors/RefineAroundGbObjectHelper.h
+++ b/src/cpu/VirtualFluidsCore/Visitors/RefineAroundGbObjectHelper.h
@@ -5,7 +5,7 @@
 #include "lbm/constants/D3Q27.h"
 
 class Grid3D;
-namespace vf::mpi {class Communicator;}
+namespace vf::parallel {class Communicator;}
 class D3Q27TriFaceMeshInteractor;
 
 //! \brief Refine blocks based on bounding boxes.
@@ -21,7 +21,7 @@ public:
     //! \param startDistance start distance from geometry for refinement
     //! \param stopDistance stop distance from geometry for refinement
     RefineAroundGbObjectHelper(SPtr<Grid3D> grid, int maxRefineLevel, SPtr<D3Q27TriFaceMeshInteractor> objectIter,
-                               real startDistance, real stopDistance, std::shared_ptr<vf::mpi::Communicator> comm);
+                               real startDistance, real stopDistance, std::shared_ptr<vf::parallel::Communicator> comm);
     virtual ~RefineAroundGbObjectHelper();
     //! start refinement
     void refine();
@@ -31,7 +31,7 @@ private:
     SPtr<D3Q27TriFaceMeshInteractor> objectIter;
     int refineLevel;
     real startDistance, stopDistance;
-    std::shared_ptr<vf::mpi::Communicator> comm;
+    std::shared_ptr<vf::parallel::Communicator> comm;
 };
 
 #endif
diff --git a/src/cpu/VirtualFluidsCore/Visitors/RefineCrossAndInsideGbObjectHelper.cpp b/src/cpu/VirtualFluidsCore/Visitors/RefineCrossAndInsideGbObjectHelper.cpp
index 52c7c3ac1..f11e8e5ce 100644
--- a/src/cpu/VirtualFluidsCore/Visitors/RefineCrossAndInsideGbObjectHelper.cpp
+++ b/src/cpu/VirtualFluidsCore/Visitors/RefineCrossAndInsideGbObjectHelper.cpp
@@ -1,6 +1,6 @@
 #include "RefineCrossAndInsideGbObjectHelper.h"
 #include "CheckRatioBlockVisitor.h"
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 #include "OverlapBlockVisitor.h"
 #include "RatioBlockVisitor.h"
 #include "RatioSmoothBlockVisitor.h"
@@ -11,7 +11,7 @@
 #include <Grid3D.h>
 
 RefineCrossAndInsideGbObjectHelper::RefineCrossAndInsideGbObjectHelper(SPtr<Grid3D> grid, int maxRefineLevel,
-                                                                       std::shared_ptr<vf::mpi::Communicator> comm)
+                                                                       std::shared_ptr<vf::parallel::Communicator> comm)
     : grid(grid), maxRefineLevel(maxRefineLevel), comm(comm)
 {
 }
diff --git a/src/cpu/VirtualFluidsCore/Visitors/RefineCrossAndInsideGbObjectHelper.h b/src/cpu/VirtualFluidsCore/Visitors/RefineCrossAndInsideGbObjectHelper.h
index d0a9ac448..28caf212a 100644
--- a/src/cpu/VirtualFluidsCore/Visitors/RefineCrossAndInsideGbObjectHelper.h
+++ b/src/cpu/VirtualFluidsCore/Visitors/RefineCrossAndInsideGbObjectHelper.h
@@ -4,7 +4,7 @@
 #include <PointerDefinitions.h>
 #include <vector>
 
-namespace vf::mpi {class Communicator;}
+namespace vf::parallel {class Communicator;}
 class Grid3D;
 class GbObject3D;
 
@@ -17,7 +17,7 @@ public:
     //! Constructor
     //! \param grid a smart pointer to the grid object
     //! \param maxRefineLevel an integer for maximal refinement level
-    RefineCrossAndInsideGbObjectHelper(SPtr<Grid3D> grid, int maxRefineLevel, std::shared_ptr<vf::mpi::Communicator> comm);
+    RefineCrossAndInsideGbObjectHelper(SPtr<Grid3D> grid, int maxRefineLevel, std::shared_ptr<vf::parallel::Communicator> comm);
     virtual ~RefineCrossAndInsideGbObjectHelper();
     //! add geometric object
     //! \param object a smart pointer to bounding box
@@ -31,7 +31,7 @@ private:
     std::vector<SPtr<GbObject3D>> objects;
     std::vector<int> levels;
     int maxRefineLevel;
-    std::shared_ptr<vf::mpi::Communicator> comm;
+    std::shared_ptr<vf::parallel::Communicator> comm;
 };
 
 #endif
diff --git a/src/cpu/VirtualFluidsCore/Visitors/RenumberGridVisitor.cpp b/src/cpu/VirtualFluidsCore/Visitors/RenumberGridVisitor.cpp
index ed9a3ee59..a21b8b8a6 100644
--- a/src/cpu/VirtualFluidsCore/Visitors/RenumberGridVisitor.cpp
+++ b/src/cpu/VirtualFluidsCore/Visitors/RenumberGridVisitor.cpp
@@ -4,7 +4,7 @@
 #include "D3Q27System.h"
 //#include <mpi.h>
 
-RenumberGridVisitor::RenumberGridVisitor(std::shared_ptr<vf::mpi::Communicator> com) : comm(com) {}
+RenumberGridVisitor::RenumberGridVisitor(std::shared_ptr<vf::parallel::Communicator> com) : comm(com) {}
 
 //////////////////////////////////////////////////////////////////////////
 void RenumberGridVisitor::visit(SPtr<Grid3D> grid)
diff --git a/src/cpu/VirtualFluidsCore/Visitors/RenumberGridVisitor.h b/src/cpu/VirtualFluidsCore/Visitors/RenumberGridVisitor.h
index 993bccd10..aa56b4691 100644
--- a/src/cpu/VirtualFluidsCore/Visitors/RenumberGridVisitor.h
+++ b/src/cpu/VirtualFluidsCore/Visitors/RenumberGridVisitor.h
@@ -8,7 +8,7 @@
 #ifndef RenumberGridVisitor_h
 #define RenumberGridVisitor_h
 
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 #include "Grid3DVisitor.h"
 
 class Grid3D;
@@ -19,14 +19,14 @@ class Grid3D;
 class RenumberGridVisitor : public Grid3DVisitor
 {
 public:
-    RenumberGridVisitor(std::shared_ptr<vf::mpi::Communicator> com);
+    RenumberGridVisitor(std::shared_ptr<vf::parallel::Communicator> com);
 
     ~RenumberGridVisitor() override = default;
 
     void visit(SPtr<Grid3D> grid) override;
 
 private:
-    std::shared_ptr<vf::mpi::Communicator> comm;
+    std::shared_ptr<vf::parallel::Communicator> comm;
     //   static int counter;
 };
 
diff --git a/src/cpu/VirtualFluidsCore/Visitors/SetConnectorsBlockVisitor.h b/src/cpu/VirtualFluidsCore/Visitors/SetConnectorsBlockVisitor.h
index ae214c77c..fcf2c93d2 100644
--- a/src/cpu/VirtualFluidsCore/Visitors/SetConnectorsBlockVisitor.h
+++ b/src/cpu/VirtualFluidsCore/Visitors/SetConnectorsBlockVisitor.h
@@ -41,7 +41,7 @@
 #include "D3Q27System.h"
 #include "Grid3D.h"
 #include "CreateTransmittersHelper.h"
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 #include "OneDistributionFullDirectConnector.h"
 #include "OneDistributionFullVectorConnector.h"
 #include "TwoDistributionsFullDirectConnector.h"
@@ -62,19 +62,19 @@ public:
     using LocalConnector  = T1;
     using RemoteConnector = T2;
 public:
-    SetConnectorsBlockVisitor(std::shared_ptr<vf::mpi::Communicator> comm);
+    SetConnectorsBlockVisitor(std::shared_ptr<vf::parallel::Communicator> comm);
     ~SetConnectorsBlockVisitor() override;
     void visit(SPtr<Grid3D> grid, SPtr<Block3D> block) override;
     //////////////////////////////////////////////////////////////////////////
 protected:
     void setSameLevelConnectors(SPtr<Grid3D> grid, SPtr<Block3D> block);
     void setRemoteConnectors(SPtr<Block3D> sblock, SPtr<Block3D> tblock, int dir);
-    std::shared_ptr<vf::mpi::Communicator> comm;
+    std::shared_ptr<vf::parallel::Communicator> comm;
     int gridRank{0};
 };
 
 template <class T1, class T2>
-SetConnectorsBlockVisitor<T1, T2>::SetConnectorsBlockVisitor(std::shared_ptr<vf::mpi::Communicator> comm)
+SetConnectorsBlockVisitor<T1, T2>::SetConnectorsBlockVisitor(std::shared_ptr<vf::parallel::Communicator> comm)
     : Block3DVisitor(0, D3Q27System::MAXLEVEL), comm(comm)
 {
 }
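
For illustration, a hypothetical instantiation of the template visitor with the one-distribution connectors included above; the grid->accept(...) dispatch is assumed from the usual Block3DVisitor usage and is not part of this hunk:

    // sketch only: connector types taken from the includes above
    SetConnectorsBlockVisitor<OneDistributionFullDirectConnector, OneDistributionFullVectorConnector>
        setConnectorsVisitor(comm); // comm is a std::shared_ptr<vf::parallel::Communicator>
    grid->accept(setConnectorsVisitor);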
diff --git a/src/cpu/VirtualFluidsCore/Visitors/SetInterpolationConnectorsBlockVisitor.cpp b/src/cpu/VirtualFluidsCore/Visitors/SetInterpolationConnectorsBlockVisitor.cpp
index 362e8c927..bdf851025 100644
--- a/src/cpu/VirtualFluidsCore/Visitors/SetInterpolationConnectorsBlockVisitor.cpp
+++ b/src/cpu/VirtualFluidsCore/Visitors/SetInterpolationConnectorsBlockVisitor.cpp
@@ -39,10 +39,10 @@
 #include "D3Q27System.h"
 #include <basics/transmitter/TbTransmitterLocal.h>
 
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 #include "Interpolator.h"
 
-SetInterpolationConnectorsBlockVisitor::SetInterpolationConnectorsBlockVisitor(std::shared_ptr<vf::mpi::Communicator> comm, real nue, SPtr<Interpolator> iProcessor) :
+SetInterpolationConnectorsBlockVisitor::SetInterpolationConnectorsBlockVisitor(std::shared_ptr<vf::parallel::Communicator> comm, real nue, SPtr<Interpolator> iProcessor) :
 Block3DVisitor(0, D3Q27System::MAXLEVEL), 
 	comm(comm),
 	nue(nue),
diff --git a/src/cpu/VirtualFluidsCore/Visitors/SetInterpolationConnectorsBlockVisitor.h b/src/cpu/VirtualFluidsCore/Visitors/SetInterpolationConnectorsBlockVisitor.h
index b1f6f99e1..4e4b20539 100644
--- a/src/cpu/VirtualFluidsCore/Visitors/SetInterpolationConnectorsBlockVisitor.h
+++ b/src/cpu/VirtualFluidsCore/Visitors/SetInterpolationConnectorsBlockVisitor.h
@@ -43,14 +43,14 @@
 
 class Grid3D;
 class Block3D;
-namespace vf::mpi {class Communicator;}
+namespace vf::parallel {class Communicator;}
 class Interpolator;
 
 //! \brief A class that sets connectors between blocks.
 class SetInterpolationConnectorsBlockVisitor : public Block3DVisitor
 {
 public:
-    SetInterpolationConnectorsBlockVisitor(std::shared_ptr<vf::mpi::Communicator> comm, real nue, SPtr<Interpolator> iProcessor);
+    SetInterpolationConnectorsBlockVisitor(std::shared_ptr<vf::parallel::Communicator> comm, real nue, SPtr<Interpolator> iProcessor);
     ~SetInterpolationConnectorsBlockVisitor() override;
     void visit(SPtr<Grid3D> grid, SPtr<Block3D> block) override;
     //////////////////////////////////////////////////////////////////////////
@@ -63,7 +63,7 @@ protected:
                             CreateTransmittersHelper::TransmitterPtr &receiverCF,
                             CreateTransmittersHelper::TransmitterPtr &senderFC,
                             CreateTransmittersHelper::TransmitterPtr &receiverFC);
-    std::shared_ptr<vf::mpi::Communicator> comm;
+    std::shared_ptr<vf::parallel::Communicator> comm;
     int gridRank;
     real nue;
     SPtr<Interpolator> iProcessor;
diff --git a/src/cpu/VirtualFluidsCore/Visitors/ZoltanPartitioningGridVisitor.cpp b/src/cpu/VirtualFluidsCore/Visitors/ZoltanPartitioningGridVisitor.cpp
index 7d9f5e8d4..c168cd664 100644
--- a/src/cpu/VirtualFluidsCore/Visitors/ZoltanPartitioningGridVisitor.cpp
+++ b/src/cpu/VirtualFluidsCore/Visitors/ZoltanPartitioningGridVisitor.cpp
@@ -9,7 +9,7 @@
 
 using namespace std;
 
-ZoltanPartitioningGridVisitor::ZoltanPartitioningGridVisitor(std::shared_ptr<vf::mpi::Communicator> comm, int numOfDirs,
+ZoltanPartitioningGridVisitor::ZoltanPartitioningGridVisitor(std::shared_ptr<vf::parallel::Communicator> comm, int numOfDirs,
                                                              int numOfLocalParts)
     : comm(comm), numOfDirs(numOfDirs), numOfLocalParts(numOfLocalParts)
 {
diff --git a/src/cpu/VirtualFluidsCore/Visitors/ZoltanPartitioningGridVisitor.h b/src/cpu/VirtualFluidsCore/Visitors/ZoltanPartitioningGridVisitor.h
index aeaf4d705..1f02c5efa 100644
--- a/src/cpu/VirtualFluidsCore/Visitors/ZoltanPartitioningGridVisitor.h
+++ b/src/cpu/VirtualFluidsCore/Visitors/ZoltanPartitioningGridVisitor.h
@@ -10,14 +10,14 @@
 
 #if defined VF_ZOLTAN && defined VF_MPI
 
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 #include "Grid3DVisitor.h"
 #include "ZoltanPartitioner.h"
 
 class ZoltanPartitioningGridVisitor : public Grid3DVisitor
 {
 public:
-    ZoltanPartitioningGridVisitor(std::shared_ptr<vf::mpi::Communicator> comm, int numOfDirs, int numOfLocalParts = 1);
+    ZoltanPartitioningGridVisitor(std::shared_ptr<vf::parallel::Communicator> comm, int numOfDirs, int numOfLocalParts = 1);
     ~ZoltanPartitioningGridVisitor();
     void visit(SPtr<Grid3D> grid);
 
@@ -26,7 +26,7 @@ protected:
     void repartGrid(SPtr<Grid3D> grid, ZoltanPartitioner &zp);
 
 private:
-    std::shared_ptr<vf::mpi::Communicator> comm;
+    std::shared_ptr<vf::parallel::Communicator> comm;
     int numOfDirs;
     int numOfLocalParts;
     ZoltanGraph *graph;
diff --git a/src/cpu/simulationconfig/Simulation.cpp b/src/cpu/simulationconfig/Simulation.cpp
index 67c4e469d..4b7d52286 100644
--- a/src/cpu/simulationconfig/Simulation.cpp
+++ b/src/cpu/simulationconfig/Simulation.cpp
@@ -29,7 +29,7 @@
 #include <LBM/Interpolation/CompressibleOffsetMomentsInterpolator.h>
 #include <LBM/LBMKernel.h>
 #include <LBM/LBMUnitConverter.h>
-#include <mpi/MPICommunicator.h>
+#include <parallel/MPICommunicator.h>
 #include <Visitors/GenBlocksGridVisitor.h>
 #include <Visitors/InitDistributionsBlockVisitor.h>
 #include <Visitors/MetisPartitioningGridVisitor.h>
@@ -45,7 +45,7 @@
 
 CPUSimulation::CPUSimulation()
 {
-    this->communicator = vf::mpi::MPICommunicator::getInstance();
+    this->communicator = vf::parallel::MPICommunicator::getInstance();
     this->grid = std::make_shared<Grid3D>(communicator);
 }
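
CPU application code now bootstraps the communicator the same way the GPU side does. A minimal sketch of the new setup, with both lines taken from the constructor in this hunk:

    #include <parallel/MPICommunicator.h>

    auto communicator = vf::parallel::MPICommunicator::getInstance();
    auto grid = std::make_shared<Grid3D>(communicator);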
 
diff --git a/src/cpu/simulationconfig/Simulation.h b/src/cpu/simulationconfig/Simulation.h
index ee8fc911c..be2953918 100644
--- a/src/cpu/simulationconfig/Simulation.h
+++ b/src/cpu/simulationconfig/Simulation.h
@@ -5,7 +5,7 @@
 #include <memory>
 #include <set>
 
-#include <mpi/Communicator.h>
+#include <parallel/Communicator.h>
 
 #include <geometry3d/GbPoint3D.h>
 #include <Interactors/Interactor3D.h>
@@ -78,7 +78,7 @@ private:
 
     std::shared_ptr<LBMKernel> lbmKernel;
     std::shared_ptr<AbstractLBMSystem> lbmSystem;
-    std::shared_ptr<vf::mpi::Communicator> communicator;
+    std::shared_ptr<vf::parallel::Communicator> communicator;
 
     std::shared_ptr<Grid3D> grid;
     std::vector<std::shared_ptr<Interactor3D>> interactors;
diff --git a/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.h b/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.h
index 4924432db..fddcc7f79 100644
--- a/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.h
+++ b/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.h
@@ -50,7 +50,7 @@
 using namespace vf::basics::constant;
 
 struct Vertex;
-class  Grid;
+class Grid;
 class Transformator;
 class ArrowTransformator;
 class PolyDataWriterWrapper;
diff --git a/src/gpu/VirtualFluids_GPU/CMakeLists.txt b/src/gpu/VirtualFluids_GPU/CMakeLists.txt
index a6eb9d861..a817e9cd4 100644
--- a/src/gpu/VirtualFluids_GPU/CMakeLists.txt
+++ b/src/gpu/VirtualFluids_GPU/CMakeLists.txt
@@ -1,11 +1,7 @@
 project(VirtualFluids_GPU LANGUAGES CUDA CXX)
 
-set(additional_libraries "")
-if(MSVC)
-    set(additional_libraries ws2_32 Traffic) # ws_32 throws an error on Phoenix
-endif()
 
-vf_add_library(PUBLIC_LINK basics lbm PRIVATE_LINK ${additional_libraries} GridGenerator MPI::MPI_CXX vf_cuda)
+vf_add_library(PUBLIC_LINK basics lbm parallel PRIVATE_LINK GridGenerator vf_cuda)
 
 #SET(TPN_WIN32 "/EHsc")
 #https://stackoverflow.com/questions/6832666/lnk2019-when-including-asio-headers-solution-generated-with-cmake
diff --git a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp
index a2b1039af..cf1aaa398 100644
--- a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp
+++ b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp
@@ -13,6 +13,8 @@
 #include "CollisionStrategy.h"
 #include "RefinementStrategy.h"
 
+#include <parallel/Communicator.h>
+
 void UpdateGrid27::updateGrid(int level, unsigned int t)
 {
     //////////////////////////////////////////////////////////////////////////
@@ -381,7 +383,7 @@ void UpdateGrid27::exchangeData(int level)
     exchangeMultiGPU_noStreams_withPrepare(level, false);
 }
 
-UpdateGrid27::UpdateGrid27(SPtr<Parameter> para, vf::gpu::Communicator &comm, SPtr<CudaMemoryManager> cudaMemoryManager,
+UpdateGrid27::UpdateGrid27(SPtr<Parameter> para, vf::parallel::Communicator &comm, SPtr<CudaMemoryManager> cudaMemoryManager,
                            std::vector<std::shared_ptr<PorousMedia>> &pm, std::vector<SPtr<Kernel>> &kernels , BoundaryConditionFactory* bcFactory, SPtr<TurbulenceModelFactory>  tmFactory, GridScalingFactory* scalingFactory)
     : para(para), comm(comm), cudaMemoryManager(cudaMemoryManager), pm(pm), kernels(kernels), tmFactory(tmFactory)
 {
diff --git a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.h b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.h
index 9c6ff4872..9de7e73ec 100644
--- a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.h
+++ b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.h
@@ -1,13 +1,17 @@
 #ifndef UPDATEGRID27_H
 #define UPDATEGRID27_H
 
-#include "LBM/LB.h"
+#include "Calculation/PorousMedia.h"
+#include "GPU/CudaMemoryManager.h"
 #include "GPU/GPU_Interface.h"
-#include "Parameter/Parameter.h"
+#include "LBM/LB.h"
 #include "Parameter/CudaStreamManager.h"
-#include "GPU/CudaMemoryManager.h"
-#include "Communication/Communicator.h"
-#include "Calculation/PorousMedia.h"
+#include "Parameter/Parameter.h"
+
+namespace vf::parallel
+{
+class Communicator;
+}
 
 class BCKernelManager;
 class ADKernelManager;
@@ -24,7 +28,7 @@ using RefinementStrategy = std::function<void (UpdateGrid27* updateGrid, Paramet
 class UpdateGrid27
 {
 public:
-    UpdateGrid27(SPtr<Parameter> para, vf::gpu::Communicator &comm, SPtr<CudaMemoryManager> cudaMemoryManager,
+    UpdateGrid27(SPtr<Parameter> para, vf::parallel::Communicator& comm, SPtr<CudaMemoryManager> cudaMemoryManager,
                  std::vector<std::shared_ptr<PorousMedia>> &pm, std::vector<SPtr<Kernel>> &kernels, BoundaryConditionFactory* bcFactory, SPtr<TurbulenceModelFactory> tmFactory, GridScalingFactory* scalingFactory);
     void updateGrid(int level, unsigned int t);
     void exchangeData(int level);
@@ -72,7 +76,7 @@ private:
 
 private:
     SPtr<Parameter> para;
-    vf::gpu::Communicator& comm;
+    vf::parallel::Communicator& comm;
     SPtr<CudaMemoryManager> cudaMemoryManager;
     std::vector<std::shared_ptr<PorousMedia>> pm;
     std::vector<SPtr<Kernel>> kernels;
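
Note the dependency direction here: the header now only forward-declares vf::parallel::Communicator and stores a reference, while the full include moves into UpdateGrid27.cpp (see the hunk above). The pattern, condensed:

    // UpdateGrid27.h: a declaration suffices for the reference member
    namespace vf::parallel
    {
    class Communicator;
    }

    vf::parallel::Communicator& comm;

    // UpdateGrid27.cpp: the definition is needed where members are called
    #include <parallel/Communicator.h>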
diff --git a/src/gpu/VirtualFluids_GPU/Communication/CommunicationRoutine.h b/src/gpu/VirtualFluids_GPU/Communication/CommunicationRoutine.h
deleted file mode 100644
index 26c017f93..000000000
--- a/src/gpu/VirtualFluids_GPU/Communication/CommunicationRoutine.h
+++ /dev/null
@@ -1,19 +0,0 @@
-#ifndef INDEX_EXCHANGE
-#define INDEX_EXCHANGE
-
-#include <basics/DataTypes.h>
-
-namespace vf::gpu
-{
-class CommunicationRoutine
-{
-public:
-    virtual ~CommunicationRoutine() = default;
-
-    virtual void receive_send(uint *buffer_receive, int size_buffer_recv, int neighbor_rank_recv, uint *buffer_send,
-                              int size_buffer_send, int neighbor_rank_send) const = 0;
-    virtual int getPID() const = 0;
-};
-} // namespace vf::gpu
-
-#endif
diff --git a/src/gpu/VirtualFluids_GPU/Communication/CommunicationRoutineMocks.h b/src/gpu/VirtualFluids_GPU/Communication/CommunicationRoutineMocks.h
deleted file mode 100644
index d05e5b6a3..000000000
--- a/src/gpu/VirtualFluids_GPU/Communication/CommunicationRoutineMocks.h
+++ /dev/null
@@ -1,21 +0,0 @@
-#ifndef VF_GPU_COMMUNICATIONROUTINEMOCKS_H
-#define VF_GPU_COMMUNICATIONROUTINEMOCKS_H
-
-#include "CommunicationRoutine.h"
-
-namespace vf::gpu::test 
-{
-
-class CommunicationRoutineTestDouble : public vf::gpu::CommunicationRoutine
-{
-public:
-    void receive_send(uint *buffer_receive, int size_buffer_recv, int neighbor_rank_recv, uint *buffer_send,
-                              int size_buffer_send, int neighbor_rank_send) const override { } 
-    int getPID() const override { return 0; }
-};
-
-}
-
-
-
-#endif
diff --git a/src/gpu/VirtualFluids_GPU/Communication/Communicator.h b/src/gpu/VirtualFluids_GPU/Communication/Communicator.h
deleted file mode 100644
index c52d5af9c..000000000
--- a/src/gpu/VirtualFluids_GPU/Communication/Communicator.h
+++ /dev/null
@@ -1,39 +0,0 @@
-#ifndef COMMUNICATOR_GPU_H
-#define COMMUNICATOR_GPU_H
-
-#include <vector>
-#include <basics/DataTypes.h>
-
-#include "VirtualFluids_GPU_export.h"
-#include "CommunicationRoutine.h"
-
-namespace vf::gpu
-{
-
-class VIRTUALFLUIDS_GPU_EXPORT Communicator : public CommunicationRoutine
-{
-public:
-    virtual void waitAll() = 0;
-    virtual int getPID() const override = 0;
-    virtual int getNumberOfProcess() const = 0;
-    virtual void exchngData(float *sbuf_t, float *rbuf_t, float *sbuf_b, float *rbuf_b, int count) = 0;
-    //////////////////////////////////////////////////////////////////////////
-    virtual void exchngDataGPU(real *sbuf, int count_s, real *rbuf, int count_r, int nb_rank) = 0;
-    virtual void nbRecvDataGPU(real *rbuf, int count_r, int nb_rank) = 0;
-    virtual void nbSendDataGPU(real *sbuf, int count_s, int nb_rank) = 0;
-    virtual void waitallGPU() = 0;
-    virtual void sendDataGPU(real *sbuf, int count_s, int nb_rank) = 0;
-    virtual void waitGPU(int id) = 0;
-    virtual void resetRequest() = 0;
-    //////////////////////////////////////////////////////////////////////////
-    virtual int mapCudaDevice(const int &rank, const int &size, const std::vector<unsigned int> &devices, const int &maxdev) = 0;
-    virtual double reduceSum(double quantityPerProcess) = 0;
-    //////////////////////////////////////////////////////////////////////////
-    virtual void receive_send(uint *buffer_receive, int size_buffer_recv, int neighbor_rank_recv, uint *buffer_send,
-                              int size_buffer_send, int neighbor_rank_send) const override = 0;
-
-};
-
-} // namespace vf::gpu
-
-#endif
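
The migrated call sites below imply the subset of vf::parallel::Communicator that replaces this deleted interface. A reconstructed, abridged sketch; member names are taken from the new call sites in this patch, while the exact signatures and the full class live in parallel/Communicator.h:

    namespace vf::parallel
    {
    class Communicator
    {
    public:
        virtual void send(real* buffer, int count, int neighborRank) = 0;               // was sendDataGPU
        virtual void receiveNonBlocking(real* buffer, int count, int neighborRank) = 0; // was nbRecvDataGPU
        virtual void receiveSend(real* sendBuffer, int sendCount, real* receiveBuffer,
                                 int receiveCount, int neighborRank) = 0;               // was exchngDataGPU
        virtual void waitAll() = 0;                                                     // replaces per-request waitGPU(i)
        virtual void resetRequests() = 0;                                               // was resetRequest
    };
    } // namespace vf::parallel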
diff --git a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp
index 48a27efa6..ff5e39c70 100644
--- a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp
+++ b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp
@@ -1,9 +1,11 @@
-#include <helper_cuda.h>
 #include <cuda_runtime.h>
+#include <helper_cuda.h>
 
 #include "Communication/ExchangeData27.h"
 #include "Parameter/CudaStreamManager.h"
 
+#include <parallel/Communicator.h>
+
 using namespace vf::lbm::dir;
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -52,22 +54,20 @@ void scatterNodesFromRecvBufferGPU(Parameter *para, int level, CudaStreamIndex s
     }
 }
 
-void startBlockingMpiSend(unsigned int numberOfSendProcessNeighbors, vf::gpu::Communicator &comm,
+void startBlockingMpiSend(unsigned int numberOfSendProcessNeighbors, vf::parallel::Communicator &comm,
                           std::vector<ProcessNeighbor27> *sendProcessNeighborHost)
 {
     for (unsigned int i = 0; i < numberOfSendProcessNeighbors; i++) {
-            comm.sendDataGPU((*sendProcessNeighborHost)[i].f[0], 
-                            (*sendProcessNeighborHost)[i].numberOfFs,
-                            (*sendProcessNeighborHost)[i].rankNeighbor);
+        comm.send((*sendProcessNeighborHost)[i].f[0], (*sendProcessNeighborHost)[i].numberOfFs,
+                  (*sendProcessNeighborHost)[i].rankNeighbor);
     }
 }
 
-void startNonBlockingMpiReceive(unsigned int numberOfSendProcessNeighbors, vf::gpu::Communicator &comm,
+void startNonBlockingMpiReceive(unsigned int numberOfSendProcessNeighbors, vf::parallel::Communicator &comm,
                                 std::vector<ProcessNeighbor27> *recvProcessNeighborHost)
 {
     for (unsigned int i = 0; i < numberOfSendProcessNeighbors; i++) {
-            comm.nbRecvDataGPU((*recvProcessNeighborHost)[i].f[0], 
-                                (*recvProcessNeighborHost)[i].numberOfFs,
+        comm.receiveNonBlocking((*recvProcessNeighborHost)[i].f[0], (*recvProcessNeighborHost)[i].numberOfFs,
                                 (*recvProcessNeighborHost)[i].rankNeighbor);
     }
 }
@@ -117,7 +117,7 @@ void prepareExchangeCollDataXGPU27AfterFtoC(Parameter *para, int level, CudaStre
                                 (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")));
 }
 
-void exchangeCollDataXGPU27AllNodes(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager,
+void exchangeCollDataXGPU27AllNodes(Parameter *para, vf::parallel::Communicator &comm, CudaMemoryManager *cudaMemoryManager,
                                     int level, CudaStreamIndex streamIndex)
 {
     exchangeCollDataXGPU27(para, comm, cudaMemoryManager, level, streamIndex,
@@ -127,7 +127,7 @@ void exchangeCollDataXGPU27AllNodes(Parameter *para, vf::gpu::Communicator &comm
                            &para->getParH(level)->recvProcessNeighborX);
 }
 
-void exchangeCollDataXGPU27AfterFtoC(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager,
+void exchangeCollDataXGPU27AfterFtoC(Parameter *para, vf::parallel::Communicator &comm, CudaMemoryManager *cudaMemoryManager,
                                      int level, CudaStreamIndex streamIndex)
 {
     exchangeCollDataXGPU27(para, comm, cudaMemoryManager, level, streamIndex,
@@ -149,7 +149,7 @@ void scatterNodesFromRecvBufferXGPU27AfterFtoC(Parameter *para, int level, CudaS
                                   (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")));
 }
 
-void exchangeCollDataXGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, 
+void exchangeCollDataXGPU27(Parameter *para, vf::parallel::Communicator& comm, CudaMemoryManager *cudaMemoryManager, 
                             int level, CudaStreamIndex streamIndex,
                             std::vector<ProcessNeighbor27> *sendProcessNeighborDev,
                             std::vector<ProcessNeighbor27> *recvProcessNeighborDev,
@@ -158,7 +158,7 @@ void exchangeCollDataXGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMe
 {
     cudaStream_t stream = para->getStreamManager()->getStream(streamIndex);
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    //! \details steps: 
+    //! \details steps:
     //! 1. copy data from device to host
     for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
         cudaMemoryManager->cudaCopyProcessNeighborXFsDH(level, i, (*sendProcessNeighborDev)[i].memsizeFs);
@@ -174,10 +174,10 @@ void exchangeCollDataXGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMe
     startBlockingMpiSend((unsigned int)(*sendProcessNeighborHost).size(), comm, sendProcessNeighborHost);
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //! 5. wait until data is received
-    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) comm.waitGPU(i);
+    comm.waitAll();
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //! 6. reset the request array, which was used for the mpi communication
-    if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send"))) comm.resetRequest();
+    if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send"))) comm.resetRequests();
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //! 7. copy received data from host to device
     for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
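
The wait/reset idiom simplifies the same way throughout this file: all outstanding requests are awaited at once instead of one request per neighbor. Condensed before/after, where numberOfSendNeighbors stands in for para->getNumberOfProcessNeighborsX(level, "send"):

    // before
    for (unsigned int i = 0; i < numberOfSendNeighbors; i++) comm.waitGPU(i);
    if (0 < numberOfSendNeighbors) comm.resetRequest();

    // after
    comm.waitAll();
    if (0 < numberOfSendNeighbors) comm.resetRequests();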
@@ -201,7 +201,7 @@ void prepareExchangeCollDataYGPU27AfterFtoC(Parameter *para, int level, CudaStre
                                 (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")));
 }
 
-void exchangeCollDataYGPU27AllNodes(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager,
+void exchangeCollDataYGPU27AllNodes(Parameter *para, vf::parallel::Communicator& comm, CudaMemoryManager *cudaMemoryManager,
                                     int level, CudaStreamIndex streamIndex)
 {
     exchangeCollDataYGPU27(para, comm, cudaMemoryManager, level, streamIndex,
@@ -211,7 +211,7 @@ void exchangeCollDataYGPU27AllNodes(Parameter *para, vf::gpu::Communicator &comm
                            &para->getParH(level)->recvProcessNeighborY);
 }
 
-void exchangeCollDataYGPU27AfterFtoC(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager,
+void exchangeCollDataYGPU27AfterFtoC(Parameter *para, vf::parallel::Communicator& comm, CudaMemoryManager *cudaMemoryManager,
                                      int level, CudaStreamIndex streamIndex)
 {
     exchangeCollDataYGPU27(para, comm, cudaMemoryManager, level, streamIndex,
@@ -233,7 +233,7 @@ void scatterNodesFromRecvBufferYGPU27AfterFtoC(Parameter *para, int level, CudaS
                                   (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")));
 }
 
-void exchangeCollDataYGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, int level,
+void exchangeCollDataYGPU27(Parameter *para, vf::parallel::Communicator& comm, CudaMemoryManager *cudaMemoryManager, int level,
                             CudaStreamIndex streamIndex,
                             std::vector<ProcessNeighbor27> *sendProcessNeighborDev,
                             std::vector<ProcessNeighbor27> *recvProcessNeighborDev,
@@ -270,10 +270,10 @@ void exchangeCollDataYGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMe
     startBlockingMpiSend((unsigned int)(*sendProcessNeighborHost).size(), comm, sendProcessNeighborHost);
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     // wait
-    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) comm.waitGPU(i);
+    comm.waitAll();
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     // reset the request array
-    if (0 < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send"))) comm.resetRequest();
+    if (0 < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send"))) comm.resetRequests();
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     // copy Host to Device
     for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) {
@@ -298,7 +298,7 @@ void prepareExchangeCollDataZGPU27AfterFtoC(Parameter *para, int level, CudaStre
                                 (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")));
 }
 
-void exchangeCollDataZGPU27AllNodes(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager,
+void exchangeCollDataZGPU27AllNodes(Parameter *para, vf::parallel::Communicator &comm, CudaMemoryManager *cudaMemoryManager,
                                     int level, CudaStreamIndex streamIndex)
 {
     exchangeCollDataZGPU27(para, comm, cudaMemoryManager, level, streamIndex,
@@ -307,7 +307,7 @@ void exchangeCollDataZGPU27AllNodes(Parameter *para, vf::gpu::Communicator &comm
                            &para->getParH(level)->sendProcessNeighborZ,
                            &para->getParH(level)->recvProcessNeighborZ);
 }
-void exchangeCollDataZGPU27AfterFtoC(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager,
+void exchangeCollDataZGPU27AfterFtoC(Parameter *para, vf::parallel::Communicator &comm, CudaMemoryManager *cudaMemoryManager,
                                      int level, CudaStreamIndex streamIndex)
 {
     exchangeCollDataZGPU27(para, comm, cudaMemoryManager, level, streamIndex,
@@ -330,7 +330,7 @@ void scatterNodesFromRecvBufferZGPU27AfterFtoC(Parameter *para, int level, CudaS
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-void exchangeCollDataZGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, int level, 
+void exchangeCollDataZGPU27(Parameter *para, vf::parallel::Communicator &comm, CudaMemoryManager *cudaMemoryManager, int level, 
                             CudaStreamIndex streamIndex,
                             std::vector<ProcessNeighbor27> *sendProcessNeighborDev,
                             std::vector<ProcessNeighbor27> *recvProcessNeighborDev,
@@ -380,10 +380,10 @@ void exchangeCollDataZGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMe
     startBlockingMpiSend((unsigned int)(*sendProcessNeighborHost).size(), comm, sendProcessNeighborHost);
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     // wait
-    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) comm.waitGPU(i);
+    comm.waitAll();
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     // reset the request array
-    if (0 < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send"))) comm.resetRequest();
+    if (0 < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send"))) comm.resetRequests();
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     // copy Host to Device
     for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
@@ -416,7 +416,7 @@ void exchangeCollDataZGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMe
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 //1D domain decomposition
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-void exchangePreCollDataGPU27(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaMemoryManager, int level)
+void exchangePreCollDataGPU27(Parameter* para, vf::parallel::Communicator& comm, CudaMemoryManager* cudaMemoryManager, int level)
 {
     for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighbors(level, "send")); i++)
     {
@@ -434,7 +434,7 @@ void exchangePreCollDataGPU27(Parameter* para, vf::gpu::Communicator& comm, Cuda
         //////////////////////////////////////////////////////////////////////////
         cudaMemoryManager->cudaCopyProcessNeighborFsDH(level, i);
         //////////////////////////////////////////////////////////////////////////
-        comm.exchngDataGPU(para->getParH(level)->sendProcessNeighbor[i].f[0], 
+        comm.receiveSend(para->getParH(level)->sendProcessNeighbor[i].f[0], 
                             para->getParH(level)->sendProcessNeighbor[i].numberOfFs,
                             para->getParH(level)->recvProcessNeighbor[i].f[0],
                             para->getParH(level)->recvProcessNeighbor[i].numberOfFs,
@@ -461,7 +461,7 @@ void exchangePreCollDataGPU27(Parameter* para, vf::gpu::Communicator& comm, Cuda
 
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-void exchangePostCollDataGPU27(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaMemoryManager, int level)
+void exchangePostCollDataGPU27(Parameter* para, vf::parallel::Communicator& comm, CudaMemoryManager* cudaMemoryManager, int level)
 {
     for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighbors(level, "send")); i++)
     {
@@ -479,7 +479,7 @@ void exchangePostCollDataGPU27(Parameter* para, vf::gpu::Communicator& comm, Cud
         //////////////////////////////////////////////////////////////////////////
         cudaMemoryManager->cudaCopyProcessNeighborFsDH(level, i);
         //////////////////////////////////////////////////////////////////////////
-        comm.exchngDataGPU(para->getParH(level)->sendProcessNeighbor[i].f[0], 
+        comm.receiveSend(para->getParH(level)->sendProcessNeighbor[i].f[0], 
                             para->getParH(level)->sendProcessNeighbor[i].numberOfFs,
                             para->getParH(level)->recvProcessNeighbor[i].f[0],
                             para->getParH(level)->recvProcessNeighbor[i].numberOfFs,
@@ -502,7 +502,7 @@ void exchangePostCollDataGPU27(Parameter* para, vf::gpu::Communicator& comm, Cud
 }
 
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-//void exchangePostCollDataXGPU27(Parameter* para, vf::gpu::Communicator& comm, int level)
+//void exchangePostCollDataXGPU27(Parameter* para, vf::parallel::Communicator& comm, int level)
 //{
 //    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
 //    {
@@ -520,7 +520,7 @@ void exchangePostCollDataGPU27(Parameter* para, vf::gpu::Communicator& comm, Cud
 //        //////////////////////////////////////////////////////////////////////////
 //        para->cudaCopyProcessNeighborXFsDH(level, i);
 //        //////////////////////////////////////////////////////////////////////////
-//        comm.exchngDataGPU(para->getParH(level)->sendProcessNeighborX[i].f[0], 
+//        comm.receiveSend(para->getParH(level)->sendProcessNeighborX[i].f[0], 
 //                            para->getParH(level)->sendProcessNeighborX[i].numberOfFs,
 //                            para->getParH(level)->recvProcessNeighborX[i].f[0],
 //                            para->getParH(level)->recvProcessNeighborX[i].numberOfFs,
@@ -549,7 +549,7 @@ void exchangePostCollDataGPU27(Parameter* para, vf::gpu::Communicator& comm, Cud
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 //// Y
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-//void exchangePreCollDataYGPU27(Parameter* para, vf::gpu::Communicator& comm, int level)
+//void exchangePreCollDataYGPU27(Parameter* para, vf::parallel::Communicator& comm, int level)
 //{
 //    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
 //    {
@@ -567,7 +567,7 @@ void exchangePostCollDataGPU27(Parameter* para, vf::gpu::Communicator& comm, Cud
 //        //////////////////////////////////////////////////////////////////////////
 //        para->cudaCopyProcessNeighborYFsDH(level, i);
 //        //////////////////////////////////////////////////////////////////////////
-//        comm.exchngDataGPU(para->getParH(level)->sendProcessNeighborY[i].f[0], 
+//        comm.receiveSend(para->getParH(level)->sendProcessNeighborY[i].f[0], 
 //                            para->getParH(level)->sendProcessNeighborY[i].numberOfFs,
 //                            para->getParH(level)->recvProcessNeighborY[i].f[0],
 //                            para->getParH(level)->recvProcessNeighborY[i].numberOfFs,
@@ -589,7 +589,7 @@ void exchangePostCollDataGPU27(Parameter* para, vf::gpu::Communicator& comm, Cud
 //    }
 //}
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-//void exchangePostCollDataYGPU27(Parameter* para, vf::gpu::Communicator& comm, int level)
+//void exchangePostCollDataYGPU27(Parameter* para, vf::parallel::Communicator& comm, int level)
 //{
 //    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
 //    {
@@ -607,7 +607,7 @@ void exchangePostCollDataGPU27(Parameter* para, vf::gpu::Communicator& comm, Cud
 //        //////////////////////////////////////////////////////////////////////////
 //        para->cudaCopyProcessNeighborYFsDH(level, i);
 //        //////////////////////////////////////////////////////////////////////////
-//        comm.exchngDataGPU(para->getParH(level)->sendProcessNeighborY[i].f[0], 
+//        comm.receiveSend(para->getParH(level)->sendProcessNeighborY[i].f[0], 
 //                            para->getParH(level)->sendProcessNeighborY[i].numberOfFs,
 //                            para->getParH(level)->recvProcessNeighborY[i].f[0],
 //                            para->getParH(level)->recvProcessNeighborY[i].numberOfFs,
@@ -636,7 +636,7 @@ void exchangePostCollDataGPU27(Parameter* para, vf::gpu::Communicator& comm, Cud
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 //// Z
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-//void exchangePreCollDataZGPU27(Parameter* para, vf::gpu::Communicator& comm, int level)
+//void exchangePreCollDataZGPU27(Parameter* para, vf::parallel::Communicator& comm, int level)
 //{
 //    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
 //    {
@@ -654,7 +654,7 @@ void exchangePostCollDataGPU27(Parameter* para, vf::gpu::Communicator& comm, Cud
 //        //////////////////////////////////////////////////////////////////////////
 //        para->cudaCopyProcessNeighborZFsDH(level, i);
 //        //////////////////////////////////////////////////////////////////////////
-//        comm.exchngDataGPU(para->getParH(level)->sendProcessNeighborZ[i].f[0], 
+//        comm.receiveSend(para->getParH(level)->sendProcessNeighborZ[i].f[0], 
 //                            para->getParH(level)->sendProcessNeighborZ[i].numberOfFs,
 //                            para->getParH(level)->recvProcessNeighborZ[i].f[0],
 //                            para->getParH(level)->recvProcessNeighborZ[i].numberOfFs,
@@ -676,7 +676,7 @@ void exchangePostCollDataGPU27(Parameter* para, vf::gpu::Communicator& comm, Cud
 //    }
 //}
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-//void exchangePostCollDataZGPU27(Parameter* para, vf::gpu::Communicator& comm, int level)
+//void exchangePostCollDataZGPU27(Parameter* para, vf::parallel::Communicator& comm, int level)
 //{
 //    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
 //    {
@@ -694,7 +694,7 @@ void exchangePostCollDataGPU27(Parameter* para, vf::gpu::Communicator& comm, Cud
 //        //////////////////////////////////////////////////////////////////////////
 //        para->cudaCopyProcessNeighborZFsDH(level, i);
 //        //////////////////////////////////////////////////////////////////////////
-//        comm.exchngDataGPU(para->getParH(level)->sendProcessNeighborZ[i].f[0], 
+//        comm.receiveSend(para->getParH(level)->sendProcessNeighborZ[i].f[0], 
 //                            para->getParH(level)->sendProcessNeighborZ[i].numberOfFs,
 //                            para->getParH(level)->recvProcessNeighborZ[i].f[0],
 //                            para->getParH(level)->recvProcessNeighborZ[i].numberOfFs,
@@ -771,7 +771,7 @@ void exchangePostCollDataGPU27(Parameter* para, vf::gpu::Communicator& comm, Cud
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // X
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-void exchangePreCollDataADXGPU27(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaMemoryManager, int level)
+void exchangePreCollDataADXGPU27(Parameter* para, vf::parallel::Communicator& comm, CudaMemoryManager* cudaMemoryManager, int level)
 {
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //copy Device to Host
@@ -794,7 +794,7 @@ void exchangePreCollDataADXGPU27(Parameter* para, vf::gpu::Communicator& comm, C
     //start non blocking MPI receive
     for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
     {
-        comm.nbRecvDataGPU(para->getParH(level)->recvProcessNeighborADX[i].f[0],
+        comm.receiveNonBlocking(para->getParH(level)->recvProcessNeighborADX[i].f[0],
                             para->getParH(level)->recvProcessNeighborADX[i].numberOfFs,
                             para->getParH(level)->recvProcessNeighborADX[i].rankNeighbor);
     }
@@ -816,21 +816,18 @@ void exchangePreCollDataADXGPU27(Parameter* para, vf::gpu::Communicator& comm, C
     //start blocking MPI send
     for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
     {
-        comm.sendDataGPU(para->getParH(level)->sendProcessNeighborADX[i].f[0],
+        comm.send(para->getParH(level)->sendProcessNeighborADX[i].f[0],
                           para->getParH(level)->sendProcessNeighborADX[i].numberOfFs,
                           para->getParH(level)->sendProcessNeighborADX[i].rankNeighbor);
     }
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //Wait
-    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
-    {
-        comm.waitGPU(i);
-    }
+    comm.waitAll();
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //reset the request array
     if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")))
     {
-        comm.resetRequest();
+        comm.resetRequests();
     }
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //copy Host to Device
@@ -852,7 +849,7 @@ void exchangePreCollDataADXGPU27(Parameter* para, vf::gpu::Communicator& comm, C
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-void exchangePostCollDataADXGPU27(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaMemoryManager, int level)
+void exchangePostCollDataADXGPU27(Parameter* para, vf::parallel::Communicator& comm, CudaMemoryManager* cudaMemoryManager, int level)
 {
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //copy Device to Host
@@ -875,7 +872,7 @@ void exchangePostCollDataADXGPU27(Parameter* para, vf::gpu::Communicator& comm,
     //start non blocking MPI receive
     for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
     {
-        comm.nbRecvDataGPU(para->getParH(level)->recvProcessNeighborADX[i].f[0],
+        comm.receiveNonBlocking(para->getParH(level)->recvProcessNeighborADX[i].f[0],
                             para->getParH(level)->recvProcessNeighborADX[i].numberOfFs,
                             para->getParH(level)->recvProcessNeighborADX[i].rankNeighbor);
     }
@@ -897,21 +894,18 @@ void exchangePostCollDataADXGPU27(Parameter* para, vf::gpu::Communicator& comm,
     //start blocking MPI send
     for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
     {
-        comm.sendDataGPU(para->getParH(level)->sendProcessNeighborADX[i].f[0],
+        comm.send(para->getParH(level)->sendProcessNeighborADX[i].f[0],
                           para->getParH(level)->sendProcessNeighborADX[i].numberOfFs,
                           para->getParH(level)->sendProcessNeighborADX[i].rankNeighbor);
     }
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //Wait
-    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
-    {
-        comm.waitGPU(i);
-    }
+    comm.waitAll();
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //reset the request array
     if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")))
     {
-        comm.resetRequest();
+        comm.resetRequests();
     }
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //copy Host to Device
@@ -940,7 +934,7 @@ void exchangePostCollDataADXGPU27(Parameter* para, vf::gpu::Communicator& comm,
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Y
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-void exchangePreCollDataADYGPU27(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaMemoryManager, int level)
+void exchangePreCollDataADYGPU27(Parameter* para, vf::parallel::Communicator& comm, CudaMemoryManager* cudaMemoryManager, int level)
 {
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //copy Device to Host
@@ -963,7 +957,7 @@ void exchangePreCollDataADYGPU27(Parameter* para, vf::gpu::Communicator& comm, C
     //start non blocking MPI receive
     for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
     {
-        comm.nbRecvDataGPU(para->getParH(level)->recvProcessNeighborADY[i].f[0],
+        comm.receiveNonBlocking(para->getParH(level)->recvProcessNeighborADY[i].f[0],
                             para->getParH(level)->recvProcessNeighborADY[i].numberOfFs,
                             para->getParH(level)->recvProcessNeighborADY[i].rankNeighbor);
     }
@@ -985,21 +979,18 @@ void exchangePreCollDataADYGPU27(Parameter* para, vf::gpu::Communicator& comm, C
     //start blocking MPI send
     for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
     {
-        comm.sendDataGPU(para->getParH(level)->sendProcessNeighborADY[i].f[0],
+        comm.send(para->getParH(level)->sendProcessNeighborADY[i].f[0],
                           para->getParH(level)->sendProcessNeighborADY[i].numberOfFs,
                           para->getParH(level)->sendProcessNeighborADY[i].rankNeighbor);
     }
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //Wait
-    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
-    {
-        comm.waitGPU(i);
-    }
+    comm.waitAll();
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //reset the request array
     if (0 < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")))
     {
-        comm.resetRequest();
+        comm.resetRequests();
     }
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //copy Host to Device
@@ -1021,7 +1012,7 @@ void exchangePreCollDataADYGPU27(Parameter* para, vf::gpu::Communicator& comm, C
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-void exchangePostCollDataADYGPU27(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaMemoryManager, int level)
+void exchangePostCollDataADYGPU27(Parameter* para, vf::parallel::Communicator& comm, CudaMemoryManager* cudaMemoryManager, int level)
 {
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //copy Device to Host
@@ -1044,7 +1035,7 @@ void exchangePostCollDataADYGPU27(Parameter* para, vf::gpu::Communicator& comm,
     //start non blocking MPI receive
     for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
     {
-        comm.nbRecvDataGPU(para->getParH(level)->recvProcessNeighborADY[i].f[0],
+        comm.receiveNonBlocking(para->getParH(level)->recvProcessNeighborADY[i].f[0],
                             para->getParH(level)->recvProcessNeighborADY[i].numberOfFs,
                             para->getParH(level)->recvProcessNeighborADY[i].rankNeighbor);
     }
@@ -1066,21 +1057,18 @@ void exchangePostCollDataADYGPU27(Parameter* para, vf::gpu::Communicator& comm,
     //start blocking MPI send
     for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
     {
-        comm.sendDataGPU(para->getParH(level)->sendProcessNeighborADY[i].f[0],
+        comm.send(para->getParH(level)->sendProcessNeighborADY[i].f[0],
                           para->getParH(level)->sendProcessNeighborADY[i].numberOfFs,
                           para->getParH(level)->sendProcessNeighborADY[i].rankNeighbor);
     }
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //Wait
-    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
-    {
-        comm.waitGPU(i);
-    }
+    comm.waitAll();
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //reset the request array
     if (0 < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")))
     {
-        comm.resetRequest();
+        comm.resetRequests();
     }
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //copy Host to Device
@@ -1109,7 +1097,7 @@ void exchangePostCollDataADYGPU27(Parameter* para, vf::gpu::Communicator& comm,
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Z
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-void exchangePreCollDataADZGPU27(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaMemoryManager, int level)
+void exchangePreCollDataADZGPU27(Parameter* para, vf::parallel::Communicator& comm, CudaMemoryManager* cudaMemoryManager, int level)
 {
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //copy Device to Host
@@ -1132,7 +1120,7 @@ void exchangePreCollDataADZGPU27(Parameter* para, vf::gpu::Communicator& comm, C
     //start non blocking MPI receive
     for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
     {
-        comm.nbRecvDataGPU(para->getParH(level)->recvProcessNeighborADZ[i].f[0],
+        comm.receiveNonBlocking(para->getParH(level)->recvProcessNeighborADZ[i].f[0],
                             para->getParH(level)->recvProcessNeighborADZ[i].numberOfFs,
                             para->getParH(level)->recvProcessNeighborADZ[i].rankNeighbor);
     }
@@ -1154,21 +1142,18 @@ void exchangePreCollDataADZGPU27(Parameter* para, vf::gpu::Communicator& comm, C
     //start blocking MPI send
     for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
     {
-        comm.sendDataGPU(para->getParH(level)->sendProcessNeighborADZ[i].f[0],
+        comm.send(para->getParH(level)->sendProcessNeighborADZ[i].f[0],
                           para->getParH(level)->sendProcessNeighborADZ[i].numberOfFs,
                           para->getParH(level)->sendProcessNeighborADZ[i].rankNeighbor);
     }
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //Wait
-    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
-    {
-        comm.waitGPU(i);
-    }
+    comm.waitAll();
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //reset the request array
     if (0 < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")))
     {
-        comm.resetRequest();
+        comm.resetRequests();
     }
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //copy Host to Device
@@ -1190,7 +1175,7 @@ void exchangePreCollDataADZGPU27(Parameter* para, vf::gpu::Communicator& comm, C
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-void exchangePostCollDataADZGPU27(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaMemoryManager, int level)
+void exchangePostCollDataADZGPU27(Parameter* para, vf::parallel::Communicator& comm, CudaMemoryManager* cudaMemoryManager, int level)
 {
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //copy Device to Host
@@ -1213,7 +1198,7 @@ void exchangePostCollDataADZGPU27(Parameter* para, vf::gpu::Communicator& comm,
     //start non blocking MPI receive
     for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
     {
-        comm.nbRecvDataGPU(para->getParH(level)->recvProcessNeighborADZ[i].f[0],
+        comm.receiveNonBlocking(para->getParH(level)->recvProcessNeighborADZ[i].f[0],
                             para->getParH(level)->recvProcessNeighborADZ[i].numberOfFs,
                             para->getParH(level)->recvProcessNeighborADZ[i].rankNeighbor);
     }
@@ -1235,21 +1220,18 @@ void exchangePostCollDataADZGPU27(Parameter* para, vf::gpu::Communicator& comm,
     //start blocking MPI send
     for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
     {
-        comm.sendDataGPU(para->getParH(level)->sendProcessNeighborADZ[i].f[0],
+        comm.send(para->getParH(level)->sendProcessNeighborADZ[i].f[0],
                           para->getParH(level)->sendProcessNeighborADZ[i].numberOfFs,
                           para->getParH(level)->sendProcessNeighborADZ[i].rankNeighbor);
     }
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //Wait
-    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
-    {
-        comm.waitGPU(i);
-    }
+    comm.waitAll();
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //reset the request array
     if (0 < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")))
     {
-        comm.resetRequest();
+        comm.resetRequests();
     }
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //copy Host to Device
@@ -1325,7 +1307,7 @@ void exchangePostCollDataADZGPU27(Parameter* para, vf::gpu::Communicator& comm,
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // X
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-void exchangeCollDataF3XGPU(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaMemoryManager, int level)
+void exchangeCollDataF3XGPU(Parameter* para, vf::parallel::Communicator& comm, CudaMemoryManager* cudaMemoryManager, int level)
 {
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //copy Device to Host
@@ -1349,7 +1331,7 @@ void exchangeCollDataF3XGPU(Parameter* para, vf::gpu::Communicator& comm, CudaMe
     //start non blocking MPI receive
     for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
     {
-        comm.nbRecvDataGPU(
+        comm.receiveNonBlocking(
             para->getParH(level)->recvProcessNeighborF3X[i].g[0],
             para->getParH(level)->recvProcessNeighborF3X[i].numberOfGs,
             para->getParH(level)->recvProcessNeighborF3X[i].rankNeighbor);
@@ -1358,22 +1340,19 @@ void exchangeCollDataF3XGPU(Parameter* para, vf::gpu::Communicator& comm, CudaMe
     //start blocking MPI send
     for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
     {
-        comm.sendDataGPU(
+        comm.send(
             para->getParH(level)->sendProcessNeighborF3X[i].g[0],
             para->getParH(level)->sendProcessNeighborF3X[i].numberOfGs,
             para->getParH(level)->sendProcessNeighborF3X[i].rankNeighbor);
     }
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //Wait
-    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
-    {
-        comm.waitGPU(i);
-    }
+    comm.waitAll();
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //reset the request array
     if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")))
     {
-        comm.resetRequest();
+        comm.resetRequests();
     }
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //copy Host to Device
@@ -1403,7 +1382,7 @@ void exchangeCollDataF3XGPU(Parameter* para, vf::gpu::Communicator& comm, CudaMe
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Y
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-void exchangeCollDataF3YGPU(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaMemoryManager, int level)
+void exchangeCollDataF3YGPU(Parameter* para, vf::parallel::Communicator& comm, CudaMemoryManager* cudaMemoryManager, int level)
 {
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //copy Device to Host
@@ -1427,7 +1406,7 @@ void exchangeCollDataF3YGPU(Parameter* para, vf::gpu::Communicator& comm, CudaMe
     //start non blocking MPI receive
     for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
     {
-        comm.nbRecvDataGPU(
+        comm.receiveNonBlocking(
             para->getParH(level)->recvProcessNeighborF3Y[i].g[0],
             para->getParH(level)->recvProcessNeighborF3Y[i].numberOfGs,
             para->getParH(level)->recvProcessNeighborF3Y[i].rankNeighbor);
@@ -1436,22 +1415,19 @@ void exchangeCollDataF3YGPU(Parameter* para, vf::gpu::Communicator& comm, CudaMe
     //start blocking MPI send
     for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
     {
-        comm.sendDataGPU(
+        comm.send(
             para->getParH(level)->sendProcessNeighborF3Y[i].g[0],
             para->getParH(level)->sendProcessNeighborF3Y[i].numberOfGs,
             para->getParH(level)->sendProcessNeighborF3Y[i].rankNeighbor);
     }
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //Wait
-    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
-    {
-        comm.waitGPU(i);
-    }
+    comm.waitAll();
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //reset the request array
     if (0 < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")))
     {
-        comm.resetRequest();
+        comm.resetRequests();
     }
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //copy Host to Device
@@ -1481,7 +1457,7 @@ void exchangeCollDataF3YGPU(Parameter* para, vf::gpu::Communicator& comm, CudaMe
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Z
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-void exchangeCollDataF3ZGPU(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaMemoryManager, int level)
+void exchangeCollDataF3ZGPU(Parameter* para, vf::parallel::Communicator& comm, CudaMemoryManager* cudaMemoryManager, int level)
 {
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //copy Device to Host
@@ -1505,7 +1481,7 @@ void exchangeCollDataF3ZGPU(Parameter* para, vf::gpu::Communicator& comm, CudaMe
     //start non blocking MPI receive
     for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
     {
-        comm.nbRecvDataGPU(
+        comm.receiveNonBlocking(
             para->getParH(level)->recvProcessNeighborF3Z[i].g[0],
             para->getParH(level)->recvProcessNeighborF3Z[i].numberOfGs,
             para->getParH(level)->recvProcessNeighborF3Z[i].rankNeighbor);
@@ -1514,22 +1490,19 @@ void exchangeCollDataF3ZGPU(Parameter* para, vf::gpu::Communicator& comm, CudaMe
     //start blocking MPI send
     for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
     {
-        comm.sendDataGPU(
+        comm.send(
             para->getParH(level)->sendProcessNeighborF3Z[i].g[0],
             para->getParH(level)->sendProcessNeighborF3Z[i].numberOfGs,
             para->getParH(level)->sendProcessNeighborF3Z[i].rankNeighbor);
     }
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //Wait
-    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
-    {
-        comm.waitGPU(i);
-    }
+    comm.waitAll();
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //reset the request array
     if (0 < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")))
     {
-        comm.resetRequest();
+        comm.resetRequests();
     }
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //copy Host to Device
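 
Every exchange function in this file now follows the same four-step pattern against the vf::parallel::Communicator interface: post the non-blocking receives first (so the matching blocking sends cannot deadlock), issue the sends, wait on all outstanding requests, and reset the request array. A minimal sketch of that pattern, with an illustrative Neighbor struct standing in for the ProcessNeighbor structs used above (double stands in for the project's real type):

    #include <vector>
    #include <parallel/Communicator.h>

    struct Neighbor { double *buffer; int count; int rank; }; // stand-in for ProcessNeighbor27/ADX/F3

    void exchange(vf::parallel::Communicator &comm,
                  const std::vector<Neighbor> &recvNeighbors,
                  const std::vector<Neighbor> &sendNeighbors)
    {
        for (const auto &n : recvNeighbors)   // 1. start non-blocking MPI receives
            comm.receiveNonBlocking(n.buffer, n.count, n.rank);
        for (const auto &n : sendNeighbors)   // 2. start blocking MPI sends
            comm.send(n.buffer, n.count, n.rank);
        comm.waitAll();                       // 3. wait until all posted requests have completed
        comm.resetRequests();                 // 4. reset the request array for the next exchange
    }
 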
diff --git a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.h b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.h
index 8302ffdc4..8b03b2b10 100644
--- a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.h
+++ b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.h
@@ -1,13 +1,17 @@
 #ifndef EXCHANGEDATA27_H
 #define EXCHANGEDATA27_H
 
-#include "Communication/Communicator.h"
 #include "GPU/CudaMemoryManager.h"
 #include "GPU/GPU_Interface.h"
 #include "LBM/LB.h"
 #include "Parameter/Parameter.h"
 #include "Parameter/CudaStreamManager.h"
 
+namespace vf::parallel
+{
+class Communicator;
+}
+
 //! \file ExchangeData27.h
 //! \ingroup GPU
 //! \author Martin Schoenherr, Anna Wellmann
@@ -15,9 +19,9 @@
 
 //////////////////////////////////////////////////////////////////////////
 // 1D domain decomposition
-void exchangePreCollDataGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, 
+void exchangePreCollDataGPU27(Parameter *para, vf::parallel::Communicator& comm, CudaMemoryManager *cudaMemoryManager, 
                                          int level);
-void exchangePostCollDataGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, 
+void exchangePostCollDataGPU27(Parameter *para, vf::parallel::Communicator& comm, CudaMemoryManager *cudaMemoryManager, 
                                           int level);
 //////////////////////////////////////////////////////////////////////////
 // 3D domain decomposition
@@ -62,7 +66,7 @@ void prepareExchangeCollDataXGPU27AfterFtoC(Parameter *para, int level, CudaStre
 //! \param CudaMemoryManager is needed for moving the data between host and device
 //! \param sendProcessNeighborDev, recvProcessNeighborDev, sendProcessNeighborHost, recvProcessNeighborHost are pointers
 //! to the send and receive arrays, both on the device and the host
-void exchangeCollDataXGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager,
+void exchangeCollDataXGPU27(Parameter *para, vf::parallel::Communicator& comm, CudaMemoryManager *cudaMemoryManager,
                                        int level, CudaStreamIndex streamIndex,
                                        std::vector<ProcessNeighbor27> *sendProcessNeighborDev,
                                        std::vector<ProcessNeighbor27> *recvProcessNeighborDev,
@@ -70,14 +74,14 @@ void exchangeCollDataXGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMe
                                        std::vector<ProcessNeighbor27> *recvProcessNeighborHost);
 //! \brief Calls exchangeCollDataXGPU27() for exchanging all nodes
 //! \details Used in the communication after collision step
-void exchangeCollDataXGPU27AllNodes(Parameter *para, vf::gpu::Communicator &comm,
+void exchangeCollDataXGPU27AllNodes(Parameter *para, vf::parallel::Communicator& comm,
                                                CudaMemoryManager *cudaMemoryManager, int level, CudaStreamIndex streamIndex);
 //! \brief Calls exchangeCollDataXGPU27() for exchanging the nodes, which are part of the communication between the two
 //! interpolation processes on refined grids 
 //! \details Only exchange nodes which are part of the interpolation process on
 //! refined grids. This function is used in the exchange which takes place after the interpolation fine to coarse and
 //! before the interpolation coarse to fine. See [master thesis of Anna Wellmann]
-void exchangeCollDataXGPU27AfterFtoC(Parameter *para, vf::gpu::Communicator &comm,
+void exchangeCollDataXGPU27AfterFtoC(Parameter *para, vf::parallel::Communicator& comm,
                                                 CudaMemoryManager *cudaMemoryManager, int level, CudaStreamIndex streamIndex);
 //! \brief Distribute the receive nodes (x direction) from the buffer on the gpu
 //! \details Needed to exchange all nodes, used in the communication after collision step
@@ -94,15 +98,15 @@ void scatterNodesFromRecvBufferXGPU27AfterFtoC(Parameter *para, int level, CudaS
 void prepareExchangeCollDataYGPU27AllNodes(Parameter *para, int level, CudaStreamIndex streamIndex);
 void prepareExchangeCollDataYGPU27AfterFtoC(Parameter *para, int level, CudaStreamIndex streamIndex);
 
-void exchangeCollDataYGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager,
+void exchangeCollDataYGPU27(Parameter *para, vf::parallel::Communicator& comm, CudaMemoryManager *cudaMemoryManager,
                                        int level,CudaStreamIndex streamIndex,
                                        std::vector<ProcessNeighbor27> *sendProcessNeighborDev,
                                        std::vector<ProcessNeighbor27> *recvProcessNeighborDev,
                                        std::vector<ProcessNeighbor27> *sendProcessNeighborHost,
                                        std::vector<ProcessNeighbor27> *recvProcessNeighborHost);
-void exchangeCollDataYGPU27AllNodes(Parameter *para, vf::gpu::Communicator &comm,
+void exchangeCollDataYGPU27AllNodes(Parameter *para, vf::parallel::Communicator& comm,
                                                CudaMemoryManager *cudaMemoryManager, int level, CudaStreamIndex streamIndex);
-void exchangeCollDataYGPU27AfterFtoC(Parameter *para, vf::gpu::Communicator &comm,
+void exchangeCollDataYGPU27AfterFtoC(Parameter *para, vf::parallel::Communicator& comm,
                                                 CudaMemoryManager *cudaMemoryManager, int level, CudaStreamIndex streamIndex);
 void scatterNodesFromRecvBufferYGPU27AllNodes(Parameter *para, int level, CudaStreamIndex streamIndex);
 void scatterNodesFromRecvBufferYGPU27AfterFtoC(Parameter *para, int level, CudaStreamIndex streamIndex);
@@ -111,15 +115,15 @@ void scatterNodesFromRecvBufferYGPU27AfterFtoC(Parameter *para, int level, CudaS
 void prepareExchangeCollDataZGPU27AllNodes(Parameter *para, int level, CudaStreamIndex streamIndex);
 void prepareExchangeCollDataZGPU27AfterFtoC(Parameter *para, int level, CudaStreamIndex streamIndex);
 
-void exchangeCollDataZGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager,
+void exchangeCollDataZGPU27(Parameter *para, vf::parallel::Communicator& comm, CudaMemoryManager *cudaMemoryManager,
                                        int level, CudaStreamIndex streamIndex,
                                        std::vector<ProcessNeighbor27> *sendProcessNeighborDev,
                                        std::vector<ProcessNeighbor27> *recvProcessNeighborDev,
                                        std::vector<ProcessNeighbor27> *sendProcessNeighborHost,
                                        std::vector<ProcessNeighbor27> *recvProcessNeighborHost);
-void exchangeCollDataZGPU27AllNodes(Parameter *para, vf::gpu::Communicator &comm,
+void exchangeCollDataZGPU27AllNodes(Parameter *para, vf::parallel::Communicator& comm,
                                                CudaMemoryManager *cudaMemoryManager, int level, CudaStreamIndex streamIndex);
-void exchangeCollDataZGPU27AfterFtoC(Parameter *para, vf::gpu::Communicator &comm,
+void exchangeCollDataZGPU27AfterFtoC(Parameter *para, vf::parallel::Communicator& comm,
                                                 CudaMemoryManager *cudaMemoryManager, int level, CudaStreamIndex streamIndex);
 
 void scatterNodesFromRecvBufferZGPU27AllNodes(Parameter *para, int level, CudaStreamIndex streamIndex);
@@ -127,28 +131,25 @@ void scatterNodesFromRecvBufferZGPU27AfterFtoC(Parameter *para, int level, CudaS
 
 //////////////////////////////////////////////////////////////////////////
 // 3D domain decomposition convection diffusion
-void exchangePreCollDataADXGPU27(Parameter *para, vf::gpu::Communicator &comm,
+void exchangePreCollDataADXGPU27(Parameter *para, vf::parallel::Communicator& comm,
                                             CudaMemoryManager *cudaMemoryManager, int level);
-void exchangePreCollDataADYGPU27(Parameter *para, vf::gpu::Communicator &comm,
+void exchangePreCollDataADYGPU27(Parameter *para, vf::parallel::Communicator& comm,
                                             CudaMemoryManager *cudaMemoryManager, int level);
-void exchangePreCollDataADZGPU27(Parameter *para, vf::gpu::Communicator &comm,
+void exchangePreCollDataADZGPU27(Parameter *para, vf::parallel::Communicator& comm,
                                             CudaMemoryManager *cudaMemoryManager, int level);
-void exchangePostCollDataADXGPU27(Parameter *para, vf::gpu::Communicator &comm,
+void exchangePostCollDataADXGPU27(Parameter *para, vf::parallel::Communicator& comm,
                                              CudaMemoryManager *cudaMemoryManager, int level);
-void exchangePostCollDataADYGPU27(Parameter *para, vf::gpu::Communicator &comm,
+void exchangePostCollDataADYGPU27(Parameter *para, vf::parallel::Communicator& comm,
                                              CudaMemoryManager *cudaMemoryManager, int level);
-void exchangePostCollDataADZGPU27(Parameter *para, vf::gpu::Communicator &comm,
+void exchangePostCollDataADZGPU27(Parameter *para, vf::parallel::Communicator& comm,
                                              CudaMemoryManager *cudaMemoryManager, int level);
 //////////////////////////////////////////////////////////////////////////
 // 3D domain decomposition F3 - K18/K20
-void exchangeCollDataF3XGPU(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager,
+void exchangeCollDataF3XGPU(Parameter *para, vf::parallel::Communicator& comm, CudaMemoryManager *cudaMemoryManager,
                                        int level);
-void exchangeCollDataF3YGPU(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager,
+void exchangeCollDataF3YGPU(Parameter *para, vf::parallel::Communicator& comm, CudaMemoryManager *cudaMemoryManager,
                                        int level);
-void exchangeCollDataF3ZGPU(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager,
+void exchangeCollDataF3ZGPU(Parameter *para, vf::parallel::Communicator& comm, CudaMemoryManager *cudaMemoryManager,
                                        int level);
-//////////////////////////////////////////////////////////////////////////
-void barrierGPU(vf::gpu::Communicator &comm);
-//////////////////////////////////////////////////////////////////////////
 
 #endif
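 
The include of Communication/Communicator.h is replaced by a forward declaration: every function in this header takes the communicator by reference, so the complete type is never needed here. A short sketch of the idiom (illustrative, assuming the full definition lives in <parallel/Communicator.h>):

    namespace vf::parallel { class Communicator; }      // cheap forward declaration

    void exchange(vf::parallel::Communicator &comm);    // fine: only a reference is declared

    // Only the .cpp that actually calls members includes the real header:
    // #include <parallel/Communicator.h>
    // void exchange(vf::parallel::Communicator &comm) { comm.waitAll(); }

This keeps the parallel module out of the compile-time interface of the GPU headers.
 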
diff --git a/src/gpu/VirtualFluids_GPU/Communication/MpiCommunicator.cpp b/src/gpu/VirtualFluids_GPU/Communication/MpiCommunicator.cpp
deleted file mode 100644
index 8af5931ce..000000000
--- a/src/gpu/VirtualFluids_GPU/Communication/MpiCommunicator.cpp
+++ /dev/null
@@ -1,242 +0,0 @@
-#include "MpiCommunicator.h"
-
-#include <mpi.h>
-#include <vector>
-
-#include <logger/Logger.h>
-
-#if defined (_WIN32) || defined (_WIN64)
-   #include <Winsock2.h>
-#elif defined (__unix__)
-   #include <unistd.h>
-#endif
-//lib for windows Ws2_32.lib
-
-namespace vf::gpu
-{
-
-
-MpiCommunicator::MpiCommunicator()
-{
-    int mpiInitialized = 0; // false
-    MPI_Initialized(&mpiInitialized);
-    if (!mpiInitialized) {
-        MPI_Init(NULL, NULL);
-        VF_LOG_TRACE("vf::gpu::MpiCommunicator(): MPI_Init");
-    }
-
-    MPI_Comm_rank(MPI_COMM_WORLD, &PID);
-    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
-
-    commGPU = MPI_COMM_WORLD;
-    requestGPU.resize(0);
-    rcount = 0;
-
-    // Get a new communicator for a decomposition of the domain
-    int isperiodic[1] = { 0 };
-    MPI_Cart_create(MPI_COMM_WORLD, 1, &numprocs, isperiodic, 1, &comm1d);
-
-    // Get my position in this communicator, and my neighbors
-    MPI_Cart_shift(comm1d, 0, 1, &nbrbottom, &nbrtop);
-}
-
-MpiCommunicator::~MpiCommunicator()
-{
-    // check whether MPI has been finalized
-    int _mpiFinalized = 0; // false
-    MPI_Finalized(&_mpiFinalized);
-    if (!_mpiFinalized) {
-        MPI_Finalize();
-        VF_LOG_TRACE("vf::gpu::~MpiCommunicator(): MPI_Finalize");
-    }
-}
-
-
-// C++11 thread-safe singleton implementation:
-// https://stackoverflow.com/questions/1661529/is-meyers-implementation-of-the-singleton-pattern-thread-safe
-MpiCommunicator& MpiCommunicator::getInstance()
-{
-    static MpiCommunicator comm;
-    return comm;
-}
-
-void MpiCommunicator::exchngBottomToTop(float *sbuf, float *rbuf, int count)
-{
-    MPI_Sendrecv(sbuf, count, MPI_FLOAT, nbrtop, 0, rbuf, count, MPI_FLOAT, nbrbottom, 0, comm1d, status);
-}
-void MpiCommunicator::exchngTopToBottom(float *sbuf, float *rbuf, int count)
-{
-    MPI_Sendrecv(sbuf, count, MPI_FLOAT, nbrbottom, 0, rbuf, count, MPI_FLOAT, nbrtop, 0, comm1d, status);
-}
-void MpiCommunicator::waitAll() { MPI_Waitall(4, request, status); }
-void MpiCommunicator::exchngData(float *sbuf_t, float *rbuf_t, float *sbuf_b, float *rbuf_b, int count)
-{
-    MPI_Sendrecv(sbuf_t, count, MPI_FLOAT, nbrtop, 0, rbuf_t, count, MPI_FLOAT, nbrbottom, 0, comm1d, status);
-    MPI_Sendrecv(sbuf_b, count, MPI_FLOAT, nbrbottom, 0, rbuf_b, count, MPI_FLOAT, nbrtop, 0, comm1d, status);
-}
-void MpiCommunicator::exchngDataNB(float *sbuf_t, int count_st, float *rbuf_t, int count_rt, float *sbuf_b, int count_sb,
-                                float *rbuf_b, int count_rb)
-{
-    MPI_Irecv(rbuf_t, count_rt, MPI_FLOAT, nbrbottom, 0, comm1d, &request[0]);
-    MPI_Irecv(rbuf_b, count_rb, MPI_FLOAT, nbrtop, 0, comm1d, &request[1]);
-    MPI_Isend(sbuf_t, count_st, MPI_FLOAT, nbrtop, 0, comm1d, &request[2]);
-    MPI_Isend(sbuf_b, count_sb, MPI_FLOAT, nbrbottom, 0, comm1d, &request[3]);
-    MPI_Waitall(4, request, status);
-}
-//////////////////////////////////////////////////////////////////////////
-// Crap by Martin Sch.
-void MpiCommunicator::exchngDataGPU(real *sbuf, int count_s, real *rbuf, int count_r, int nb_rank)
-{
-    MPI_Status MSstatus;
-    MPI_Send(sbuf, count_s, MPI_Type_GPU, nb_rank, 0, commGPU);
-    MPI_Recv(rbuf, count_r, MPI_Type_GPU, nb_rank, 0, commGPU, &MSstatus);
-    ////test only - please don't use
-    // MPI_Sendrecv(sbuf, count_s, MPI_Type_GPU, nb_rank, 0, rbuf, count_r, MPI_Type_GPU, nb_rank, 0, comm1d,
-    // MPI_STATUSES_IGNORE);
-}
-void MpiCommunicator::sendRecvGPU(real *sbuf, int count_s, real *rbuf, int count_r, int nb_rank)
-{
-    // test only - please don't use
-    MPI_Sendrecv(sbuf, count_s, MPI_Type_GPU, nb_rank, 0, rbuf, count_r, MPI_Type_GPU, nb_rank, 0, commGPU,
-                 MPI_STATUSES_IGNORE);
-}
-void MpiCommunicator::nbRecvDataGPU(real *rbuf, int count_r, int nb_rank)
-{
-    // printf("\n Start Recv Rank: %d, neighbor Rank: %d, request = %d \n", PID, nb_rank, (int)requestGPU.size());
-    // fflush(stdout);
-
-    requestGPU.push_back(0);
-    MPI_Irecv(rbuf, count_r, MPI_Type_GPU, nb_rank, 0, commGPU, &requestGPU[rcount]);
-    rcount++;
-
-    // printf("\n End Recv - Rank: %d , neighbor Rank: %d \n", PID, nb_rank);
-    // fflush(stdout);
-}
-void MpiCommunicator::nbSendDataGPU(real *sbuf, int count_s, int nb_rank)
-{
-    // printf("\n Start Send Rank: %d, neighbor Rank: %d, request = %d \n", PID, nb_rank, (int)requestGPU.size());
-    // fflush(stdout);
-
-    requestGPU.push_back(0);
-    MPI_Isend(sbuf, count_s, MPI_Type_GPU, nb_rank, 0, commGPU, &requestGPU[rcount]);
-    rcount++;
-
-    // printf("\n End Send - Rank: %d , neighbor Rank: %d \n", PID, nb_rank);
-    // fflush(stdout);
-}
-void MpiCommunicator::waitallGPU()
-{
-    // printf("\n Start Waitall Rank: %d, request = %d \n", PID, (int)requestGPU.size());
-    // fflush(stdout);
-    if (requestGPU.size() > 0) {
-        MPI_Waitall(static_cast<int>(requestGPU.size()), &requestGPU[0], MPI_STATUSES_IGNORE);
-        requestGPU.resize(0);
-        rcount = 0;
-    }
-    // printf("\n End Waitall \n");
-    // fflush(stdout);
-}
-void MpiCommunicator::sendDataGPU(real *sbuf, int count_s, int nb_rank)
-{
-    MPI_Send(sbuf, count_s, MPI_Type_GPU, nb_rank, 0, commGPU);
-}
-void MpiCommunicator::waitGPU(int id) { MPI_Wait(&requestGPU[id], MPI_STATUSES_IGNORE); }
-void MpiCommunicator::resetRequest()
-{
-    if (requestGPU.size() > 0) {
-        requestGPU.resize(0);
-        rcount = 0;
-    }
-}
-void MpiCommunicator::barrierGPU()
-{
-    // printf("\n Start Waitall Rank: %d, request = %d \n", PID, (int)requestGPU.size());
-    // fflush(stdout);
-    if (requestGPU.size() > 0) {
-        MPI_Barrier(commGPU);
-    }
-    // printf("\n End Waitall \n");
-    // fflush(stdout);
-}
-void MpiCommunicator::barrier() { MPI_Barrier(commGPU); }
-
-//////////////////////////////////////////////////////////////////////////
-void MpiCommunicator::exchngDataGeo(int *sbuf_t, int *rbuf_t, int *sbuf_b, int *rbuf_b, int count)
-{
-    MPI_Irecv(rbuf_t, count, MPI_INT, nbrbottom, 0, comm1d, &request[0]);
-    MPI_Irecv(rbuf_b, count, MPI_INT, nbrtop, 0, comm1d, &request[1]);
-    MPI_Isend(sbuf_t, count, MPI_INT, nbrtop, 0, comm1d, &request[2]);
-    MPI_Isend(sbuf_b, count, MPI_INT, nbrbottom, 0, comm1d, &request[3]);
-    MPI_Waitall(4, request, status);
-}
-int MpiCommunicator::getPID() const { return PID; }
-int MpiCommunicator::getNumberOfProcess() const { return numprocs; }
-int MpiCommunicator::getNeighbourTop() { return nbrtop; }
-int MpiCommunicator::getNeighbourBottom() { return nbrbottom; }
-MPI_Comm MpiCommunicator::getMpiCommunicator() { return comm1d; }
-void MpiCommunicator::distributeGeometry(unsigned int *dataRoot, unsigned int *dataNode, int dataSizePerNode)
-{
-    MPI_Scatter(dataRoot, dataSizePerNode, MPI_UNSIGNED, dataNode, dataSizePerNode, MPI_UNSIGNED, 0, MPI_COMM_WORLD);
-}
-int MpiCommunicator::mapCudaDevice(const int &rank, const int &size, const std::vector<unsigned int> &devices,
-                                const int &maxdev)
-{
-    int device        = -1;
-    char *host        = (char *)malloc(sizeof(char) * size * 255);
-    unsigned int *map = (unsigned int *)malloc(sizeof(unsigned int) * size);
-
-    char hostname[255];
-    gethostname(hostname, 254);
-    hostname[254] = 0;
-
-    MPI_Gather(hostname, 255, MPI_BYTE, host, 255, MPI_BYTE, 0, MPI_COMM_WORLD);
-
-    int i, j;
-    if (rank == 0) {
-        for (i = 0; i < size; i++) {
-            int counter = 0;
-            for (j = 0; j < i; j++) {
-                if (strcmp(&host[i * 255], &host[j * 255]) == 0)
-                    counter++;
-            }
-            if (counter >= maxdev) {
-                VF_LOG_CRITICAL("More processes than GPUs!");
-                exit(1);
-            }
-            map[i] = devices[counter];
-        }
-    }
-
-    MPI_Scatter(map, 1, MPI_UNSIGNED, &device, 1, MPI_UNSIGNED, 0, MPI_COMM_WORLD);
-
-    VF_LOG_INFO("Rank: {} runs on host: {} with GPU: {}", rank, hostname, device);
-
-    free(map);
-    free(host);
-    return device;
-}
-
-double MpiCommunicator::reduceSum(double quantityPerProcess)
-{ 
-    double *buffer_send = &quantityPerProcess;
-    double *buffer_recv = (double *)malloc(sizeof(double));
-
-    MPI_Reduce(buffer_send, buffer_recv, 1, MPI_DOUBLE, MPI_SUM, 0, commGPU);
-
-    return *buffer_recv;
-}
-
-void MpiCommunicator::receive_send(uint *buffer_receive, int size_buffer_recv, int neighbor_rank_recv, uint *buffer_send,
-                         int size_buffer_send, int neighbor_rank_send) const
-{
-    MPI_Request recv_request;
-    MPI_Irecv(buffer_receive, size_buffer_recv, MPI_UNSIGNED, neighbor_rank_recv, 0, commGPU, &recv_request);
-    //printf("receive_send PID: %i,   nbRev: nb_rank_recv: %i", this->getPID(), nb_rank_r);
-    //fflush(stdout);
-    MPI_Send(buffer_send, size_buffer_send, MPI_UNSIGNED, neighbor_rank_send, 0, commGPU);
-    //printf("receive_send PID: %i,   sendUintGPU: nb_rank_send: %i", this->getPID(), nb_rank_s);
-    //fflush(stdout);
-    MPI_Wait(&recv_request, MPI_STATUSES_IGNORE);
-}
-
-} // namespace vf::gpu
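 
The deleted class kept one MPI_Request per non-blocking call in a growing vector and drained it in waitallGPU(). A minimal standalone sketch of that bookkeeping, which the unified vf::parallel communicator now has to provide (plain MPI, not the library API; waitAll here also plays the role of resetRequest):

    #include <mpi.h>
    #include <vector>

    static std::vector<MPI_Request> requests;

    void receiveNonBlocking(float *buffer, int count, int neighborRank)
    {
        requests.emplace_back();                                  // one handle per MPI_Irecv
        MPI_Irecv(buffer, count, MPI_FLOAT, neighborRank, 0,
                  MPI_COMM_WORLD, &requests.back());
    }

    void waitAll()
    {
        if (requests.empty())
            return;
        MPI_Waitall(static_cast<int>(requests.size()), requests.data(), MPI_STATUSES_IGNORE);
        requests.clear();                                         // reset for the next exchange
    }
 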
diff --git a/src/gpu/VirtualFluids_GPU/Communication/MpiCommunicator.h b/src/gpu/VirtualFluids_GPU/Communication/MpiCommunicator.h
deleted file mode 100644
index c6a71c0bf..000000000
--- a/src/gpu/VirtualFluids_GPU/Communication/MpiCommunicator.h
+++ /dev/null
@@ -1,78 +0,0 @@
-#ifndef MPIMpiCommunicator_GPU_H
-#define MPIMpiCommunicator_GPU_H
-
-#include <vector>
-
-#include <mpi.h>
-
-#include "VirtualFluids_GPU_export.h"
-
-#include "Communicator.h"
-#include <basics/DataTypes.h>
-
-//////////////////////////////////
-#ifdef VF_DOUBLE_ACCURACY
-#define MPI_Type_GPU MPI_DOUBLE
-#else
-#define MPI_Type_GPU MPI_FLOAT
-#endif
-//////////////////////////////////
-
-namespace vf::gpu
-{
-
-class VIRTUALFLUIDS_GPU_EXPORT MpiCommunicator : public Communicator
-{
-public:
-    static MpiCommunicator &getInstance();
-    MpiCommunicator(const MpiCommunicator &) = delete;
-    MpiCommunicator &operator=(const MpiCommunicator &) = delete;
-    ~MpiCommunicator() override;
-
-    void exchngBottomToTop(float *sbuf, float *rbuf, int count);
-    void exchngTopToBottom(float *sbuf, float *rbuf, int count);
-    void waitAll() override;
-    void distributeGeometry(unsigned int *dataRoot, unsigned int *dataNode, int dataSizePerNode);
-    int getPID() const override;
-    int getNumberOfProcess() const override;
-    int getNeighbourTop();
-    int getNeighbourBottom();
-    void exchngData(float *sbuf_t, float *rbuf_t, float *sbuf_b, float *rbuf_b, int count) override;
-    void exchngDataNB(float *sbuf_t, int count_st, float *rbuf_t, int count_rt, float *sbuf_b, int count_sb,
-                      float *rbuf_b, int count_rb);
-    //////////////////////////////////////////////////////////////////////////
-    void exchngDataGPU(real *sbuf, int count_s, real *rbuf, int count_r, int nb_rank) override;
-    void sendRecvGPU(real *sbuf, int count_s, real *rbuf, int count_r, int nb_rank);
-    void nbRecvDataGPU(real *rbuf, int count_r, int nb_rank) override;
-    void nbSendDataGPU(real *sbuf, int count_s, int nb_rank) override;
-    void waitallGPU() override;
-    void sendDataGPU(real *sbuf, int count_s, int nb_rank) override;
-    void waitGPU(int id) override;
-    void resetRequest() override;
-    void barrierGPU();
-    void barrier();
-    //////////////////////////////////////////////////////////////////////////
-    void exchngDataGeo(int *sbuf_t, int *rbuf_t, int *sbuf_b, int *rbuf_b, int count);
-    MPI_Comm getMpiCommunicator();
-    int mapCudaDevice(const int &rank, const int &size, const std::vector<unsigned int> &devices, const int &maxdev) override;
-    double reduceSum(double quantityPerProcess) override;
-    //////////////////////////////////////////////////////////////////////////
-    void receive_send(uint *buffer_receive, int size_buffer_recv, int neighbor_rank_recv, uint *buffer_send,
-                      int size_buffer_send, int neighbor_rank_send) const override;
-
-private:
-    int numprocs, PID;
-    int nbrbottom, nbrtop;
-    MPI_Comm comm1d, commGPU;
-    MPI_Status status[4];
-    MPI_Request request[4];
-    //////////////////////////////////////////////////////////////////////////
-    std::vector<MPI_Request> requestGPU;
-    int rcount;
-    //////////////////////////////////////////////////////////////////////////
-    MpiCommunicator();
-};
-
-} // namespace vf::gpu
-
-#endif
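 
The MPI_Type_GPU macro above tied the MPI datatype to the project's precision switch so that every call site could pass buffers of real uniformly. A sketch of that idiom (MPI_TYPE_REAL is an illustrative name; the deleted header called it MPI_Type_GPU):

    #ifdef VF_DOUBLE_ACCURACY
    using real = double;
    #define MPI_TYPE_REAL MPI_DOUBLE
    #else
    using real = float;
    #define MPI_TYPE_REAL MPI_FLOAT
    #endif

    // e.g. MPI_Send(sendBuffer, count, MPI_TYPE_REAL, neighborRank, 0, MPI_COMM_WORLD);
 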
diff --git a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridProvider.cpp b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridProvider.cpp
index 9b2d1c4f5..e96c96ec2 100644
--- a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridProvider.cpp
+++ b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridProvider.cpp
@@ -9,7 +9,7 @@
 #include <GPU/CudaMemoryManager.h>
 
 
-std::shared_ptr<GridProvider> GridProvider::makeGridGenerator(std::shared_ptr<GridBuilder> builder, std::shared_ptr<Parameter> para, std::shared_ptr<CudaMemoryManager> cudaMemoryManager, vf::gpu::CommunicationRoutine& communicator)
+std::shared_ptr<GridProvider> GridProvider::makeGridGenerator(std::shared_ptr<GridBuilder> builder, std::shared_ptr<Parameter> para, std::shared_ptr<CudaMemoryManager> cudaMemoryManager, vf::parallel::Communicator& communicator)
 {
     return std::shared_ptr<GridProvider>(new GridGenerator(builder, para, cudaMemoryManager, communicator));
 }
diff --git a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridProvider.h b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridProvider.h
index ee6c93a5f..28d2f39e8 100644
--- a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridProvider.h
+++ b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridProvider.h
@@ -9,9 +9,9 @@
 #include "PointerDefinitions.h"
 #include "VirtualFluids_GPU_export.h"
 #include "gpu/GridGenerator/io/SimulationFileWriter/SimulationFileWriter.h"
-namespace vf::gpu
+namespace vf::parallel
 {
-class CommunicationRoutine;
+class Communicator;
 }
 
 class Parameter;
@@ -21,7 +21,7 @@ class CudaMemoryManager;
 class VIRTUALFLUIDS_GPU_EXPORT GridProvider
 {
 public:
-    static std::shared_ptr<GridProvider> makeGridGenerator(std::shared_ptr<GridBuilder> builder, std::shared_ptr<Parameter> para, std::shared_ptr<CudaMemoryManager> cudaMemoryManager, vf::gpu::CommunicationRoutine& communicator);
+    static std::shared_ptr<GridProvider> makeGridGenerator(std::shared_ptr<GridBuilder> builder, std::shared_ptr<Parameter> para, std::shared_ptr<CudaMemoryManager> cudaMemoryManager, vf::parallel::Communicator& communicator);
     static std::shared_ptr<GridProvider> makeGridReader(FILEFORMAT format, std::shared_ptr<Parameter> para, std::shared_ptr<CudaMemoryManager> cudaMemoryManager);
 
     virtual void allocArrays_CoordNeighborGeo() = 0;
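 
Call sites of the factory now hand in the parallel communicator instead of the GPU-specific CommunicationRoutine. A hypothetical call site (the MPICommunicator accessor is an assumption; this patch only shows the changed signature):

    // assumption: the parallel module exposes an MPI-backed singleton accessor
    vf::parallel::Communicator &communicator = vf::parallel::MPICommunicator::getInstance();
    auto gridProvider = GridProvider::makeGridGenerator(builder, para, cudaMemoryManager, communicator);
 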
diff --git a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.cpp b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.cpp
index e3c86317c..db07322dd 100644
--- a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.cpp
+++ b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.cpp
@@ -15,15 +15,16 @@
 #include "GridGenerator/TransientBCSetter/TransientBCSetter.h"
 
 #include "utilities/communication.h"
-#include "Communication/CommunicationRoutine.h"
+
+#include <parallel/Communicator.h>
 
 #include <logger/Logger.h>
 
 using namespace vf::lbm::dir;
 
 GridGenerator::GridGenerator(std::shared_ptr<GridBuilder> builder, std::shared_ptr<Parameter> para,
-                             std::shared_ptr<CudaMemoryManager> cudaMemoryManager, vf::gpu::CommunicationRoutine &communicator)
-    : mpiProcessID(communicator.getPID()), builder(builder)
+                             std::shared_ptr<CudaMemoryManager> cudaMemoryManager, vf::parallel::Communicator &communicator)
+    : mpiProcessID(communicator.getProcessID()), builder(builder)
 {
     this->para = para;
     this->cudaMemoryManager = cudaMemoryManager;
diff --git a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.h b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.h
index b03de24ec..9c0d50a06 100644
--- a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.h
+++ b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.h
@@ -45,9 +45,9 @@ class Parameter;
 class GridBuilder;
 class IndexRearrangementForStreams;
 class InterpolationCellGrouper;
-namespace vf::gpu
+namespace vf::parallel
 {
-class CommunicationRoutine;
+class Communicator;
 }
 
 //! \class GridGenerator derived class of GridProvider
@@ -67,7 +67,7 @@ private:
     const uint mpiProcessID;
 
 public:
-    VIRTUALFLUIDS_GPU_EXPORT GridGenerator(std::shared_ptr<GridBuilder> builder, std::shared_ptr<Parameter> para, std::shared_ptr<CudaMemoryManager> cudaMemoryManager, vf::gpu::CommunicationRoutine& communicator);
+    VIRTUALFLUIDS_GPU_EXPORT GridGenerator(std::shared_ptr<GridBuilder> builder, std::shared_ptr<Parameter> para, std::shared_ptr<CudaMemoryManager> cudaMemoryManager, vf::parallel::Communicator& communicator);
     ~GridGenerator() override;
     //! \brief overwrites the default IndexRearrangementForStreams
     void setIndexRearrangementForStreams(std::unique_ptr<IndexRearrangementForStreams>&& indexRearrangement);
diff --git a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGeneratorTest.cpp b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGeneratorTest.cpp
index 8685ea9db..8fc0f78d1 100644
--- a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGeneratorTest.cpp
+++ b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGeneratorTest.cpp
@@ -1,17 +1,16 @@
 #include "GridGenerator.h"
 #include <gmock/gmock.h>
 
-#include "Communication/CommunicationRoutine.h"
-#include "Communication/MpiCommunicator.h"
 #include "DataTypes.h"
 #include "GPU/CudaMemoryManager.h"
 #include "IndexRearrangementForStreams.h"
+#include "NullCommunicator.h"
 #include "Parameter/Parameter.h"
 #include "gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.h"
 #include "gpu/GridGenerator/grid/GridImp.h"
 #include "gpu/GridGenerator/utilities/communication.h"
 
-#include "Communication/CommunicationRoutineMocks.h"
+#include <parallel/NullCommunicator.h>
 
 namespace GridGeneratorTest
 {
@@ -72,7 +71,7 @@ class IndexRearrangementForStreamsDouble : public IndexRearrangementForStreams
 {
 public:
     IndexRearrangementForStreamsDouble(std::shared_ptr<Parameter> para, std::shared_ptr<GridBuilder> builder,
-                                       vf::gpu::CommunicationRoutine &communicator)
+                                       vf::parallel::Communicator &communicator)
         : IndexRearrangementForStreams(para, builder, communicator){};
 
     void initCommunicationArraysForCommAfterFinetoCoarseX(uint level, int indexOfProcessNeighbor,
@@ -116,7 +115,7 @@ private:
         para->setNumprocs(2);
 
         builder = std::make_shared<LevelGridBuilderStub>(nullptr);
-        vf::gpu::test::CommunicationRoutineTestDouble communicator;
+        vf::parallel::NullCommunicator communicator;
 
         gridGenerator = std::make_shared<GridGenerator>(builder, para, std::make_shared<CudaMemoryManagerDouble>(para),
                                                         communicator);
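 
Because GridGenerator now depends only on the abstract vf::parallel::Communicator, the test can use NullCommunicator as a no-op null object instead of a hand-written mock; condensed from the setup above:

    vf::parallel::NullCommunicator communicator;   // no-op stand-in, no MPI initialization required
    auto gridGenerator = std::make_shared<GridGenerator>(
        builder, para, std::make_shared<CudaMemoryManagerDouble>(para), communicator);
 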
diff --git a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/IndexRearrangementForStreams.cpp b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/IndexRearrangementForStreams.cpp
index d59fa8d01..bfd112b7c 100644
--- a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/IndexRearrangementForStreams.cpp
+++ b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/IndexRearrangementForStreams.cpp
@@ -1,17 +1,18 @@
 #include "IndexRearrangementForStreams.h"
 
-#include "Communication/Communicator.h"
 #include "Logger.h"
 #include "Parameter/Parameter.h"
 #include <GridGenerator/grid/Grid.h>
 #include <GridGenerator/grid/GridBuilder/GridBuilder.h>
 
+#include <parallel/Communicator.h>
+
 #include <algorithm>
 #include <iostream>
 
 IndexRearrangementForStreams::IndexRearrangementForStreams(std::shared_ptr<Parameter> para,
                                                            std::shared_ptr<GridBuilder> builder,
-                                                           vf::gpu::CommunicationRoutine &communicator)
+                                                           vf::parallel::Communicator &communicator)
     : para(para), builder(builder), communicator(communicator)
 {
 }
@@ -108,7 +109,7 @@ std::vector<uint> IndexRearrangementForStreams::exchangeIndicesForCommAfterFtoCX
     std::vector<uint> recvIndicesForCommAfterFtoCPositions(
         (size_t)para->getParH(level)->sendProcessNeighborsAfterFtoCX[indexOfProcessNeighbor].numberOfNodes * 2, 0);
 
-    communicator.receive_send(
+    communicator.receiveSend(
         recvIndicesForCommAfterFtoCPositions.data(), (int)recvIndicesForCommAfterFtoCPositions.size(),
         para->getParH(level)->recvProcessNeighborX[indexOfProcessNeighbor].rankNeighbor,
         sendIndicesForCommAfterFtoCPositions.data(), (int)sendIndicesForCommAfterFtoCPositions.size(),
@@ -135,7 +136,7 @@ std::vector<uint> IndexRearrangementForStreams::exchangeIndicesForCommAfterFtoCY
     std::vector<uint> recvIndicesForCommAfterFtoCPositions(
         (size_t)para->getParH(level)->sendProcessNeighborsAfterFtoCY[indexOfProcessNeighbor].numberOfNodes * 2, 0);
 
-    communicator.receive_send(
+    communicator.receiveSend(
         recvIndicesForCommAfterFtoCPositions.data(), (int)recvIndicesForCommAfterFtoCPositions.size(),
         para->getParH(level)->recvProcessNeighborY[indexOfProcessNeighbor].rankNeighbor,
         sendIndicesForCommAfterFtoCPositions.data(), (int)sendIndicesForCommAfterFtoCPositions.size(),
@@ -162,7 +163,7 @@ std::vector<uint> IndexRearrangementForStreams::exchangeIndicesForCommAfterFtoCZ
     std::vector<uint> recvIndicesForCommAfterFtoCPositions(
         (size_t)para->getParH(level)->sendProcessNeighborsAfterFtoCZ[indexOfProcessNeighbor].numberOfNodes * 2, 0);
 
-    communicator.receive_send(
+    communicator.receiveSend(
         recvIndicesForCommAfterFtoCPositions.data(), (int)recvIndicesForCommAfterFtoCPositions.size(),
         para->getParH(level)->recvProcessNeighborZ[indexOfProcessNeighbor].rankNeighbor,
         sendIndicesForCommAfterFtoCPositions.data(), (int)sendIndicesForCommAfterFtoCPositions.size(),
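 
receive_send is renamed to receiveSend; as the deleted vf::gpu implementation showed, it pairs a non-blocking receive with a blocking send and waits afterwards, so two ranks can swap their index lists without deadlocking. A minimal standalone MPI sketch of that combined exchange:

    #include <mpi.h>

    void receiveSend(unsigned *recvBuffer, int recvCount, int recvRank,
                     unsigned *sendBuffer, int sendCount, int sendRank)
    {
        MPI_Request recvRequest;
        MPI_Irecv(recvBuffer, recvCount, MPI_UNSIGNED, recvRank, 0,
                  MPI_COMM_WORLD, &recvRequest);     // post the receive first
        MPI_Send(sendBuffer, sendCount, MPI_UNSIGNED, sendRank, 0,
                 MPI_COMM_WORLD);                    // then the blocking send
        MPI_Wait(&recvRequest, MPI_STATUS_IGNORE);   // complete the receive
    }
 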
@@ -368,7 +369,7 @@ void IndexRearrangementForStreams::reorderSendIndicesForCommAfterFtoC(
     for (uint i = 0; i < (uint)sendIndicesOther.size(); i++)
         sendIndices[i + numberOfSendNodesAfterFtoC] = sendIndicesOther[i];
 
-    VF_LOG_INFO("Reorder send indices: process {}, numberOfSendNodesAfterFtoC {}", communicator.getPID(),
+    VF_LOG_INFO("Reorder send indices: process {}, numberOfSendNodesAfterFtoC {}", communicator.getProcessID(),
                 numberOfSendNodesAfterFtoC);
 
     if (numberOfSendNodesAfterFtoC + sendIndicesOther.size() != numberOfSendIndices) {
@@ -514,7 +515,7 @@ void IndexRearrangementForStreams::reorderRecvIndicesForCommAfterFtoC(
     for (uint i = 0; i < (uint)recvIndicesOther.size(); i++)
         recvIndices[i + numberOfRecvNodesAfterFtoC] = recvIndicesOther[i];
 
-    VF_LOG_INFO("Reorder send indices: process {}, numberOfRecvNodesAfterFtoC {}", communicator.getPID(),
+    VF_LOG_INFO("Reorder send indices: process {}, numberOfRecvNodesAfterFtoC {}", communicator.getProcessID(),
                 numberOfRecvNodesAfterFtoC);
 
     if (numberOfRecvNodesAfterFtoC + recvIndicesOther.size() != numberOfRecvIndices) {
diff --git a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/IndexRearrangementForStreams.h b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/IndexRearrangementForStreams.h
index 0b0401d34..421e5aa4c 100644
--- a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/IndexRearrangementForStreams.h
+++ b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/IndexRearrangementForStreams.h
@@ -13,16 +13,16 @@
 
 class Parameter;
 class GridBuilder;
-namespace vf::gpu
+namespace vf::parallel
 {
-class CommunicationRoutine;
+class Communicator;
 }
 
 class IndexRearrangementForStreams
 {
 public:
     //! \brief Construct IndexRearrangementForStreams object
-    IndexRearrangementForStreams(std::shared_ptr<Parameter> para, std::shared_ptr<GridBuilder> builder, vf::gpu::CommunicationRoutine& communicator);
+    IndexRearrangementForStreams(std::shared_ptr<Parameter> para, std::shared_ptr<GridBuilder> builder, vf::parallel::Communicator& communicator);
 
     virtual ~IndexRearrangementForStreams() = default;
 
@@ -133,7 +133,7 @@ protected:
 private:
     std::shared_ptr<GridBuilder> builder;
     std::shared_ptr<Parameter> para;
-    vf::gpu::CommunicationRoutine& communicator;
+    vf::parallel::Communicator &communicator;
 
     // used for tests
     friend class IndexRearrangementForStreamsTest_reorderSendIndices;
diff --git a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/IndexRearrangementForStreamsTest.cpp b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/IndexRearrangementForStreamsTest.cpp
index a8bc58488..bcdb22b9c 100644
--- a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/IndexRearrangementForStreamsTest.cpp
+++ b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/IndexRearrangementForStreamsTest.cpp
@@ -6,7 +6,6 @@
 
 #include "Utilities/testUtilitiesGPU.h"
 
-#include "Communication/Communicator.h"
 #include "DataStructureInitializer/GridReaderGenerator/IndexRearrangementForStreams.h"
 #include "Parameter/Parameter.h"
 #include "basics/config/ConfigurationFile.h"
@@ -14,7 +13,7 @@
 #include "gpu/GridGenerator/grid/GridImp.h"
 #include "gpu/GridGenerator/utilities/communication.h"
 
-#include "Communication/CommunicationRoutineMocks.h"
+#include <parallel/NullCommunicator.h>
 
 namespace indexRearrangementTests
 {
@@ -152,7 +151,7 @@ private:
             IndexRearrangementForStreams(para, builder, communicator));
     };
 
-    vf::gpu::test::CommunicationRoutineTestDouble communicator;
+    vf::parallel::NullCommunicator communicator;
 };
 
 TEST_F(IndexRearrangementForStreamsTest_reorderSendIndices, reorderSendIndicesForCommAfterFtoCX)
@@ -174,19 +173,19 @@ TEST_F(IndexRearrangementForStreamsTest_reorderSendIndices, reorderSendIndicesFo
 // Test exchangeIndicesForCommAfterFtoC
 //////////////////////////////////////////////////////////////////////////
 
-class CommunicationRoutineDouble : public vf::gpu::CommunicationRoutine
+class CommunicatorDouble : public vf::parallel::NullCommunicator
 {
 public:
-    void receive_send(uint *buffer_receive, int, int, uint *, int, int) const override
+    void receiveSend(uint *buffer_receive, int, int, uint *, int, int) const override
     {
         for (int i = 0; i < (int)receivedIndices.size(); ++i) {
             *(buffer_receive + i) = receivedIndices[i];
         }
     }
 
-    int getPID() const override
+    void receiveSend(real *buffer_send, int size_buffer_send, real *buffer_receive, int size_buffer_recv,
+                     int neighbor_rank) const override
     {
-        return 0;
     }
 
     void setReceivedIndices(const std::vector<uint>& receivedIndices)
@@ -202,9 +201,9 @@ class IndexRearrangementForStreamsTest_exchangeIndicesForCommAfterFtoCX : public
 {
 
 public:
-    void createTestSubject(vf::gpu::CommunicationRoutine &CommunicationRoutine)
+    void createTestSubject(vf::parallel::Communicator &Communicator)
     {
-        sut = std::make_unique<IndexRearrangementForStreams>(para, builder, CommunicationRoutine);
+        sut = std::make_unique<IndexRearrangementForStreams>(para, builder, Communicator);
     }
 
 protected:
@@ -243,7 +242,7 @@ private:
 
 TEST_F(IndexRearrangementForStreamsTest_exchangeIndicesForCommAfterFtoCX, emptyRecvInX)
 {
-    CommunicationRoutineDouble communicator;
+    CommunicatorDouble communicator;
     communicator.setReceivedIndices(std::vector<uint>());
     createTestSubject(communicator);
 
@@ -253,7 +252,7 @@ TEST_F(IndexRearrangementForStreamsTest_exchangeIndicesForCommAfterFtoCX, emptyR
 
 TEST_F(IndexRearrangementForStreamsTest_exchangeIndicesForCommAfterFtoCX, zeroRecvIndexX)
 {
-    CommunicationRoutineDouble communicator;
+    CommunicatorDouble communicator;
     communicator.setReceivedIndices({ 0 });
     createTestSubject(communicator);
 
@@ -263,7 +262,7 @@ TEST_F(IndexRearrangementForStreamsTest_exchangeIndicesForCommAfterFtoCX, zeroRe
 
 TEST_F(IndexRearrangementForStreamsTest_exchangeIndicesForCommAfterFtoCX, oneRecvIndexX)
 {
-    CommunicationRoutineDouble communicator;
+    CommunicatorDouble communicator;
     std::vector<uint> expected = { 10 };
     std::vector<uint> receivedIndicesByComm(4, 0);
     std::copy(expected.begin(), expected.end(), receivedIndicesByComm.begin());
@@ -277,7 +276,7 @@ TEST_F(IndexRearrangementForStreamsTest_exchangeIndicesForCommAfterFtoCX, oneRec
 
 TEST_F(IndexRearrangementForStreamsTest_exchangeIndicesForCommAfterFtoCX, threeRecvIndicesX)
 {
-    CommunicationRoutineDouble communicator;
+    CommunicatorDouble communicator;
     std::vector<uint> expected = { 10, 20, 30 };
     std::vector<uint> receivedIndicesByComm(5, 0);
     std::copy(expected.begin(), expected.end(), receivedIndicesByComm.begin());
@@ -292,7 +291,7 @@ TEST_F(IndexRearrangementForStreamsTest_exchangeIndicesForCommAfterFtoCX, threeR
 TEST_F(IndexRearrangementForStreamsTest_exchangeIndicesForCommAfterFtoCX, sixRecvIndicesX)
 {
     // this test shows the limits of the current approach. The last index is always deleted
-    CommunicationRoutineDouble communicator;
+    CommunicatorDouble communicator;
     std::vector<uint> expected = { 10, 20, 30, 40, 50 };
     std::vector<uint> receivedIndicesByComm = { 10, 20, 30, 40, 50, 60 };
     communicator.setReceivedIndices(receivedIndicesByComm);
@@ -305,7 +304,7 @@ TEST_F(IndexRearrangementForStreamsTest_exchangeIndicesForCommAfterFtoCX, sixRec
 
 TEST_F(IndexRearrangementForStreamsTest_exchangeIndicesForCommAfterFtoCX, recvIndicesXContainZero)
 {
-    CommunicationRoutineDouble communicator;
+    CommunicatorDouble communicator;
     std::vector<uint> expected = { 0, 20, 30, 40 };
     std::vector<uint> receivedIndicesByComm(6, 0);
     std::copy(expected.begin(), expected.end(), receivedIndicesByComm.begin());
@@ -321,9 +320,9 @@ class IndexRearrangementForStreamsTest_exchangeIndicesForCommAfterFtoCY : public
 {
 
 public:
-    void createTestSubject(vf::gpu::CommunicationRoutine &CommunicationRoutine)
+    void createTestSubject(vf::parallel::Communicator &Communicator)
     {
-        sut = std::make_unique<IndexRearrangementForStreams>(para, builder, CommunicationRoutine);
+        sut = std::make_unique<IndexRearrangementForStreams>(para, builder, Communicator);
     }
 
 protected:
@@ -362,7 +361,7 @@ private:
 
 TEST_F(IndexRearrangementForStreamsTest_exchangeIndicesForCommAfterFtoCY, emptyRecvInY)
 {
-    CommunicationRoutineDouble communicator;
+    CommunicatorDouble communicator;
     communicator.setReceivedIndices(std::vector<uint>());
     createTestSubject(communicator);
 
@@ -372,7 +371,7 @@ TEST_F(IndexRearrangementForStreamsTest_exchangeIndicesForCommAfterFtoCY, emptyR
 
 TEST_F(IndexRearrangementForStreamsTest_exchangeIndicesForCommAfterFtoCY, zeroRecvIndexY)
 {
-    CommunicationRoutineDouble communicator;
+    CommunicatorDouble communicator;
     communicator.setReceivedIndices({ 0 });
     createTestSubject(communicator);
 
@@ -382,7 +381,7 @@ TEST_F(IndexRearrangementForStreamsTest_exchangeIndicesForCommAfterFtoCY, zeroRe
 
 TEST_F(IndexRearrangementForStreamsTest_exchangeIndicesForCommAfterFtoCY, oneRecvIndexY)
 {
-    CommunicationRoutineDouble communicator;
+    CommunicatorDouble communicator;
     std::vector<uint> expected = { 10 };
     std::vector<uint> receivedIndicesByComm(4, 0);
     std::copy(expected.begin(), expected.end(), receivedIndicesByComm.begin());
@@ -396,7 +395,7 @@ TEST_F(IndexRearrangementForStreamsTest_exchangeIndicesForCommAfterFtoCY, oneRec
 
 TEST_F(IndexRearrangementForStreamsTest_exchangeIndicesForCommAfterFtoCY, threeRecvIndicesY)
 {
-    CommunicationRoutineDouble communicator;
+    CommunicatorDouble communicator;
     std::vector<uint> expected = { 10, 20, 30 };
     std::vector<uint> receivedIndicesByComm(5, 0);
     std::copy(expected.begin(), expected.end(), receivedIndicesByComm.begin());
@@ -411,7 +410,7 @@ TEST_F(IndexRearrangementForStreamsTest_exchangeIndicesForCommAfterFtoCY, threeR
 TEST_F(IndexRearrangementForStreamsTest_exchangeIndicesForCommAfterFtoCY, sixRecvIndicesY)
 {
     // this test shows the limits of the current approach. The last index is always deleted
-    CommunicationRoutineDouble communicator;
+    CommunicatorDouble communicator;
     std::vector<uint> expected = { 10, 20, 30, 40, 50 };
     std::vector<uint> receivedIndicesByComm = { 10, 20, 30, 40, 50, 60 };
     communicator.setReceivedIndices(receivedIndicesByComm);
@@ -424,7 +423,7 @@ TEST_F(IndexRearrangementForStreamsTest_exchangeIndicesForCommAfterFtoCY, sixRec
 
 TEST_F(IndexRearrangementForStreamsTest_exchangeIndicesForCommAfterFtoCY, recvIndicesYContainZero)
 {
-    CommunicationRoutineDouble communicator;
+    CommunicatorDouble communicator;
     std::vector<uint> expected = { 0, 20, 30, 40 };
     std::vector<uint> receivedIndicesByComm(6, 0);
     std::copy(expected.begin(), expected.end(), receivedIndicesByComm.begin());
@@ -440,9 +439,9 @@ class IndexRearrangementForStreamsTest_exchangeIndicesForCommAfterFtoCZ : public
 {
 
 public:
-    void createTestSubject(vf::gpu::CommunicationRoutine &CommunicationRoutine)
+    void createTestSubject(vf::parallel::Communicator &communicator)
     {
-        sut = std::make_unique<IndexRearrangementForStreams>(para, builder, CommunicationRoutine);
+        sut = std::make_unique<IndexRearrangementForStreams>(para, builder, communicator);
     }
 
 protected:
@@ -481,7 +480,7 @@ private:
 
 TEST_F(IndexRearrangementForStreamsTest_exchangeIndicesForCommAfterFtoCZ, emptyRecvInZ)
 {
-    CommunicationRoutineDouble communicator;
+    CommunicatorDouble communicator;
     communicator.setReceivedIndices(std::vector<uint>());
     createTestSubject(communicator);
 
@@ -491,7 +490,7 @@ TEST_F(IndexRearrangementForStreamsTest_exchangeIndicesForCommAfterFtoCZ, emptyR
 
 TEST_F(IndexRearrangementForStreamsTest_exchangeIndicesForCommAfterFtoCZ, zeroRecvIndexZ)
 {
-    CommunicationRoutineDouble communicator;
+    CommunicatorDouble communicator;
     communicator.setReceivedIndices({ 0 });
     createTestSubject(communicator);
 
@@ -501,7 +500,7 @@ TEST_F(IndexRearrangementForStreamsTest_exchangeIndicesForCommAfterFtoCZ, zeroRe
 
 TEST_F(IndexRearrangementForStreamsTest_exchangeIndicesForCommAfterFtoCZ, oneRecvIndexZ)
 {
-    CommunicationRoutineDouble communicator;
+    CommunicatorDouble communicator;
     std::vector<uint> expected = { 10 };
     std::vector<uint> receivedIndicesBZComm(4, 0);
     std::copy(expected.begin(), expected.end(), receivedIndicesBZComm.begin());
@@ -515,7 +514,7 @@ TEST_F(IndexRearrangementForStreamsTest_exchangeIndicesForCommAfterFtoCZ, oneRec
 
 TEST_F(IndexRearrangementForStreamsTest_exchangeIndicesForCommAfterFtoCZ, threeRecvIndicesZ)
 {
-    CommunicationRoutineDouble communicator;
+    CommunicatorDouble communicator;
     std::vector<uint> expected = { 10, 20, 30 };
     std::vector<uint> receivedIndicesBZComm(5, 0);
     std::copy(expected.begin(), expected.end(), receivedIndicesBZComm.begin());
@@ -530,7 +529,7 @@ TEST_F(IndexRearrangementForStreamsTest_exchangeIndicesForCommAfterFtoCZ, threeR
 TEST_F(IndexRearrangementForStreamsTest_exchangeIndicesForCommAfterFtoCZ, sixRecvIndicesYZ)
 {
     // This test shows a limitation of the current approach: the last index is always deleted.
-    CommunicationRoutineDouble communicator;
+    CommunicatorDouble communicator;
     std::vector<uint> expected = { 10, 20, 30, 40, 50 };
     std::vector<uint> receivedIndicesByComm = { 10, 20, 30, 40, 50, 60 };
     communicator.setReceivedIndices(receivedIndicesByComm);
@@ -543,7 +542,7 @@ TEST_F(IndexRearrangementForStreamsTest_exchangeIndicesForCommAfterFtoCZ, sixRec
 
 TEST_F(IndexRearrangementForStreamsTest_exchangeIndicesForCommAfterFtoCZ, recvIndicesZContainZero)
 {
-    CommunicationRoutineDouble communicator;
+    CommunicatorDouble communicator;
     std::vector<uint> expected = { 0, 20, 30, 40 };
     std::vector<uint> receivedIndicesByComm(6, 0);
     std::copy(expected.begin(), expected.end(), receivedIndicesByComm.begin());
@@ -614,7 +613,7 @@ private:
             IndexRearrangementForStreams(para, builder, communicator));
     };
 
-    vf::gpu::test::CommunicationRoutineTestDouble communicator;
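+    // The NullCommunicator is a no-op stand-in, so these tests run without real MPI communication.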
+    vf::parallel::NullCommunicator communicator;
 };
 
 TEST_F(IndexRearrangementForStreamsTest_reorderRecvIndicesX, noSendIndicesForCommunicationAfterScalingFineToCoarse_receiveIndicesAreUnchanged)
diff --git a/src/gpu/VirtualFluids_GPU/Init/VfReader.cpp b/src/gpu/VirtualFluids_GPU/Init/VfReader.cpp
index 46f6254f7..1406abff0 100644
--- a/src/gpu/VirtualFluids_GPU/Init/VfReader.cpp
+++ b/src/gpu/VirtualFluids_GPU/Init/VfReader.cpp
@@ -1,7 +1,6 @@
 #include "Init/VfReader.h"
 
 #include "Parameter/Parameter.h"
-#include "Communication/Communicator.h"
 #include "Init/PositionReader.h"
 #include "GPU/CudaMemoryManager.h"
 
diff --git a/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp b/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp
index dddc795cc..65906f4e2 100644
--- a/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp
+++ b/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp
@@ -4,10 +4,8 @@
 
 #include <helper_timer.h>
 
-
 #include "Factories/GridScalingFactory.h"
 #include "LBM/LB.h"
-#include "Communication/Communicator.h"
 #include "Communication/ExchangeData27.h"
 #include "Parameter/Parameter.h"
 #include "Parameter/CudaStreamManager.h"
@@ -62,6 +60,7 @@
 
 #include <logger/Logger.h>
 
+#include <parallel/Communicator.h>
 
 
 std::string getFileName(const std::string& fname, int step, int myID)
@@ -70,7 +69,7 @@ std::string getFileName(const std::string& fname, int step, int myID)
 }
 
 Simulation::Simulation(std::shared_ptr<Parameter> para, std::shared_ptr<CudaMemoryManager> memoryManager,
-                       vf::gpu::Communicator &communicator, GridProvider &gridProvider, BoundaryConditionFactory* bcFactory, GridScalingFactory* scalingFactory)
+                       vf::parallel::Communicator &communicator, GridProvider &gridProvider, BoundaryConditionFactory* bcFactory, GridScalingFactory* scalingFactory)
     : para(para), cudaMemoryManager(memoryManager), communicator(communicator), kernelFactory(std::make_unique<KernelFactoryImp>()),
       preProcessorFactory(std::make_shared<PreProcessorFactoryImp>()), dataWriter(std::make_unique<FileWriter>())
 {
@@ -79,7 +78,7 @@ Simulation::Simulation(std::shared_ptr<Parameter> para, std::shared_ptr<CudaMemo
 }
 
 Simulation::Simulation(std::shared_ptr<Parameter> para, std::shared_ptr<CudaMemoryManager> memoryManager,
-                       vf::gpu::Communicator &communicator, GridProvider &gridProvider, BoundaryConditionFactory* bcFactory, SPtr<TurbulenceModelFactory> tmFactory, GridScalingFactory* scalingFactory)
+                       vf::parallel::Communicator &communicator, GridProvider &gridProvider, BoundaryConditionFactory* bcFactory, SPtr<TurbulenceModelFactory> tmFactory, GridScalingFactory* scalingFactory)
     : para(para), cudaMemoryManager(memoryManager), communicator(communicator), kernelFactory(std::make_unique<KernelFactoryImp>()),
       preProcessorFactory(std::make_shared<PreProcessorFactoryImp>()), dataWriter(std::make_unique<FileWriter>())
 {
@@ -90,8 +89,7 @@ void Simulation::init(GridProvider &gridProvider, BoundaryConditionFactory *bcFa
 {
     gridProvider.initalGridInformations();
 
-    vf::cuda::verifyAndSetDevice(
-        communicator.mapCudaDevice(para->getMyProcessID(), para->getNumprocs(), para->getDevices(), para->getMaxDev()));
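+    // The communicator now derives each rank's CUDA device from the ranks-per-host layout.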
+    vf::cuda::verifyAndSetDevice(communicator.mapCudaDevicesOnHosts(para->getDevices(), para->getMaxDev()));
 
     para->initLBMSimulationParameter();
 
diff --git a/src/gpu/VirtualFluids_GPU/LBM/Simulation.h b/src/gpu/VirtualFluids_GPU/LBM/Simulation.h
index ba2a32170..146ab4cf6 100644
--- a/src/gpu/VirtualFluids_GPU/LBM/Simulation.h
+++ b/src/gpu/VirtualFluids_GPU/LBM/Simulation.h
@@ -6,11 +6,13 @@
 
 #include <PointerDefinitions.h>
 
-#include "Utilities/Buffer2D.hpp"
 #include "LBM/LB.h"
+#include "Utilities/Buffer2D.hpp"
 
-
-namespace vf::gpu { class Communicator; }
+namespace vf::parallel
+{
+class Communicator;
+}
 
 class CudaMemoryManager;
 class Parameter;
@@ -37,9 +39,9 @@ class Simulation
 {
 public:
     Simulation(std::shared_ptr<Parameter> para, std::shared_ptr<CudaMemoryManager> memoryManager,
-               vf::gpu::Communicator &communicator, GridProvider &gridProvider, BoundaryConditionFactory* bcFactory, GridScalingFactory* scalingFactory = nullptr);	
+               vf::parallel::Communicator &communicator, GridProvider &gridProvider, BoundaryConditionFactory* bcFactory, GridScalingFactory* scalingFactory = nullptr);	
 	Simulation(std::shared_ptr<Parameter> para, std::shared_ptr<CudaMemoryManager> memoryManager,
-               vf::gpu::Communicator &communicator, GridProvider &gridProvider, BoundaryConditionFactory* bcFactory, SPtr<TurbulenceModelFactory> tmFactory, GridScalingFactory* scalingFactory = nullptr);
+               vf::parallel::Communicator &communicator, GridProvider &gridProvider, BoundaryConditionFactory* bcFactory, SPtr<TurbulenceModelFactory> tmFactory, GridScalingFactory* scalingFactory = nullptr);
 
     ~Simulation();
     void run();
@@ -76,7 +78,7 @@ private:
 	Buffer2D <int> geo_rbuf_b;
 
 
-	vf::gpu::Communicator& communicator;
+	vf::parallel::Communicator& communicator;
     SPtr<Parameter> para;
     std::shared_ptr<DataWriter> dataWriter;
 	std::shared_ptr<CudaMemoryManager> cudaMemoryManager;
diff --git a/src/gpu/VirtualFluids_GPU/Output/EdgeNodeDebugWriter.hpp b/src/gpu/VirtualFluids_GPU/Output/EdgeNodeDebugWriter.hpp
index ee5333dfc..eb43a5f1f 100644
--- a/src/gpu/VirtualFluids_GPU/Output/EdgeNodeDebugWriter.hpp
+++ b/src/gpu/VirtualFluids_GPU/Output/EdgeNodeDebugWriter.hpp
@@ -3,7 +3,7 @@
 
 #include <fstream>
 #include <sstream>
-#include <stdio.h>
+#include <cstdio>
 // #include <math.h>
 #include "StringUtilities/StringUtil.h"
 #include "lbm/constants/D3Q27.h"
@@ -13,7 +13,6 @@
 #include <basics/writer/WbWriterVtkXmlBinary.h>
 #include <cmath>
 
-#include "VirtualFluids_GPU/Communication/MpiCommunicator.h"
 
 namespace EdgeNodeDebugWriter
 {
@@ -25,7 +24,7 @@ void addCoordinatesToNodeVector(SPtr<LBMSimulationParameter> parH, std::vector<U
             nodesVec[indexInNodesVector] = (makeUbTuple((float)(x1), (float)(x2), (float)(x3)));
 }
 
-void writeEdgeNodesXZ_Send(SPtr<Parameter> para)
+void writeEdgeNodesXZ_Send(SPtr<Parameter> para, int processID = 0)
 {
     std::vector<UbTupleFloat3> nodesVec;
     std::vector<std::string> datanames = { "SparseIndex", "ProcessNeighbor", "IndexInSendVector", "AfterFtoC" };
@@ -54,14 +53,14 @@ void writeEdgeNodesXZ_Send(SPtr<Parameter> para)
             nodeCount++;
         }
         std::string filenameVec = para->getFName() + "_writeEdgeNodesXZ_Send_PID_" +
-                                  std::to_string(vf::gpu::MpiCommunicator::getInstance().getPID()) + "_" +
+                                  std::to_string(processID) + "_" +
                                   StringUtil::toString<int>(level);
 
         WbWriterVtkXmlBinary::getInstance()->writeNodesWithNodeData(filenameVec, nodesVec, datanames, nodedata);
     }
 }
 
-void writeEdgeNodesXZ_Recv(SPtr<Parameter> para)
+void writeEdgeNodesXZ_Recv(SPtr<Parameter> para, int processID = 0)
 {
     std::vector<UbTupleFloat3> nodesVec;
     std::vector<std::string> datanames = { "SparseIndex", "ProcessNeighbor", "IndexInRecvVector", "AfterFtoC" };
@@ -90,7 +89,7 @@ void writeEdgeNodesXZ_Recv(SPtr<Parameter> para)
             nodeCount++;
         }
         std::string filenameVec = para->getFName() + "_writeEdgeNodesXZ_Recv_PID_" +
-                                  std::to_string(vf::gpu::MpiCommunicator::getInstance().getPID()) + "_" +
+                                  std::to_string(processID) + "_" +
                                   StringUtil::toString<int>(level);
 
         WbWriterVtkXmlBinary::getInstance()->writeNodesWithNodeData(filenameVec, nodesVec, datanames, nodedata);
diff --git a/src/gpu/VirtualFluids_GPU/Output/InterfaceDebugWriter.hpp b/src/gpu/VirtualFluids_GPU/Output/InterfaceDebugWriter.hpp
index 4af9a50a1..da5307c3d 100644
--- a/src/gpu/VirtualFluids_GPU/Output/InterfaceDebugWriter.hpp
+++ b/src/gpu/VirtualFluids_GPU/Output/InterfaceDebugWriter.hpp
@@ -11,8 +11,6 @@
 #include <basics/writer/WbWriterVtkXmlBinary.h>
 #include <cmath>
 
-#include "VirtualFluids_GPU/Communication/MpiCommunicator.h"
-
 namespace InterfaceDebugWriter
 {
 
@@ -603,7 +601,7 @@ void checkForSendNodeZ(int pos, int &sendDir, int &sendDirectionInCommAfterFtoC,
                            para->getParH(level)->sendProcessNeighborsAfterFtoCZ, 8.0);
 }
 
-void writeInterfaceFCC_Send(Parameter *para)
+void writeInterfaceFCC_Send(Parameter *para, int processID = 0)
 {
     std::vector<UbTupleFloat3> nodesVec;
     int nodeNumberVec = 0;
@@ -650,14 +648,14 @@ void writeInterfaceFCC_Send(Parameter *para)
             nodeCount++;
         }
         std::string filenameVec = para->getFName() + "_writeInterfaceFCC_Send_PID_" +
-                                  std::to_string(vf::gpu::MpiCommunicator::getInstance().getPID()) + "_" +
+                                  std::to_string(processID) + "_" +
                                   StringUtil::toString<int>(level);
 
         WbWriterVtkXmlBinary::getInstance()->writeNodesWithNodeData(filenameVec, nodesVec, datanames, nodedata);
     }
 }
 
-void writeInterfaceCFC_Recv(Parameter *para)
+void writeInterfaceCFC_Recv(Parameter *para, int processID = 0)
 {
     std::vector<UbTupleFloat3> nodesVec;
     int nodeNumberVec = 0;
@@ -703,7 +701,7 @@ void writeInterfaceCFC_Recv(Parameter *para)
             nodeCount++;
         }
         std::string filenameVec = para->getFName() + "_writeInterfaceCFC_Recv_PID_" +
-                                  std::to_string(vf::gpu::MpiCommunicator::getInstance().getPID()) + "_" +
+                                  std::to_string(processID) + "_" +
                                   StringUtil::toString<int>(level);
 
         WbWriterVtkXmlBinary::getInstance()->writeNodesWithNodeData(filenameVec, nodesVec, datanames, nodedata);
@@ -718,7 +716,7 @@ void addToNodesVector(const int level, const int pos, std::vector<UbTupleFloat3>
     nodesVec.push_back(makeUbTuple((float)(x1), (float)(x2), (float)(x3)));
 }
 
-void writeSendNodesStream(Parameter *para)
+void writeSendNodesStream(Parameter *para, int processID = 0)
 {
     std::vector<UbTupleFloat3> nodesVec;
 
@@ -808,14 +806,14 @@ void writeSendNodesStream(Parameter *para)
             }
         }
         std::string filenameVec = para->getFName() + "_writeSendNodesStreams_PID_" +
-                                  std::to_string(vf::gpu::MpiCommunicator::getInstance().getPID()) + "_" +
+                                  std::to_string(processID) + "_" +
                                   StringUtil::toString<int>(level);
 
         WbWriterVtkXmlBinary::getInstance()->writeNodesWithNodeData(filenameVec, nodesVec, datanames, nodedata);
     }
 }
 
-void writeRecvNodesStream(Parameter *para)
+void writeRecvNodesStream(Parameter *para, int processID = 0)
 {
     std::vector<UbTupleFloat3> nodesVec;
 
@@ -894,7 +892,7 @@ void writeRecvNodesStream(Parameter *para)
         // Recv nodes are ghost nodes and therefore they can't be coarse cells for the interpolation from coarse to fine
 
         std::string filenameVec = para->getFName() + "_writeRecvNodesStreams_PID_" +
-                                  std::to_string(vf::gpu::MpiCommunicator::getInstance().getPID()) + "_" +
+                                  std::to_string(processID) + "_" +
                                   StringUtil::toString<int>(level);
 
         WbWriterVtkXmlBinary::getInstance()->writeNodesWithNodeData(filenameVec, nodesVec, datanames, nodedata);
diff --git a/src/gpu/VirtualFluids_GPU/Output/NeighborDebugWriter.hpp b/src/gpu/VirtualFluids_GPU/Output/NeighborDebugWriter.hpp
index e506a56bb..a05aad821 100644
--- a/src/gpu/VirtualFluids_GPU/Output/NeighborDebugWriter.hpp
+++ b/src/gpu/VirtualFluids_GPU/Output/NeighborDebugWriter.hpp
@@ -11,7 +11,6 @@
 
 #include "StringUtilities/StringUtil.h"
 #include "Utilities/FindNeighbors.h"
-#include "gpu/VirtualFluids_GPU/Communication/Communicator.h"
 
 namespace NeighborDebugWriter
 {
diff --git a/src/gpu/VirtualFluids_GPU/Output/QDebugVtkWriter.hpp b/src/gpu/VirtualFluids_GPU/Output/QDebugVtkWriter.hpp
index d075c78e5..5448db132 100644
--- a/src/gpu/VirtualFluids_GPU/Output/QDebugVtkWriter.hpp
+++ b/src/gpu/VirtualFluids_GPU/Output/QDebugVtkWriter.hpp
@@ -11,7 +11,6 @@
 #include <logger/Logger.h>
 
 #include "gpu/GridGenerator/grid/NodeValues.h"
-#include "gpu/VirtualFluids_GPU/Communication/Communicator.h"
 #include "gpu/VirtualFluids_GPU/LBM/LB.h"
 #include "gpu/VirtualFluids_GPU/Parameter/Parameter.h"
 #include "gpu/VirtualFluids_GPU/Utilities/FindNeighbors.h"
diff --git a/src/gpu/VirtualFluids_GPU/Output/Timer.cpp b/src/gpu/VirtualFluids_GPU/Output/Timer.cpp
index f6efff584..a3048e62d 100644
--- a/src/gpu/VirtualFluids_GPU/Output/Timer.cpp
+++ b/src/gpu/VirtualFluids_GPU/Output/Timer.cpp
@@ -4,7 +4,8 @@
 
 #include "UbScheduler.h"
 #include "Parameter/Parameter.h"
-#include "VirtualFluids_GPU/Communication/Communicator.h"
+
+#include <parallel/Communicator.h>
 
 void Timer::initTimer()
 {
@@ -31,7 +32,7 @@ void Timer::resetTimer()
         this->totalElapsedTime = 0.0;
 }
 
-void Timer::outputPerformance(uint t, Parameter* para, vf::gpu::Communicator& communicator)
+void Timer::outputPerformance(uint t, Parameter* para, vf::parallel::Communicator& communicator)
 {
     real fnups      = 0.0;
     real bandwidth  = 0.0;
@@ -42,18 +43,18 @@ void Timer::outputPerformance(uint t, Parameter* para, vf::gpu::Communicator& co
         bandwidth   += (27.0+1.0) * 4.0 * 1000.0 * (t-para->getTimestepStart()) * para->getParH(lev)->numberOfNodes  / (this->totalElapsedTime*1.0E9);
     }
 
-    if(this->firstOutput && communicator.getPID() == 0) //only display the legend once
+    if(this->firstOutput && communicator.getProcessID() == 0) //only display the legend once
     {
         VF_LOG_INFO("PID \t --- {} ---  Processing time (ms) \t Nups in Mio \t Bandwidth in GB/sec", this->name );
         this->firstOutput = false;
     }
 
-    VF_LOG_INFO(" {} \t --- {} --- {:>8.1f}/ {:<8.1f} \t   {:5.1f} \t       {:4.1f}",  communicator.getPID(), this->name, this->elapsedTime, this->totalElapsedTime, fnups, bandwidth);
+    VF_LOG_INFO(" {} \t --- {} --- {:>8.1f}/ {:<8.1f} \t   {:5.1f} \t       {:4.1f}",  communicator.getProcessID(), this->name, this->elapsedTime, this->totalElapsedTime, fnups, bandwidth);
 
     // When using multiple GPUs, sum the nups of all processes
-    if (communicator.getNumberOfProcess() > 1) {
-        double nupsSum =  communicator.reduceSum(fnups);
-        if (communicator.getPID() == 0)
-            VF_LOG_INFO("Sum of all {} processes: Nups in Mio: {:.1f}", communicator.getNumberOfProcess(), nupsSum);
+    if (communicator.getNumberOfProcesses() > 1) {
+        double nupsSum = communicator.reduceSum(fnups);
+        if (communicator.getProcessID() == 0)
+            VF_LOG_INFO("Sum of all {} processes: Nups in Mio: {:.1f}", communicator.getNumberOfProcesses(), nupsSum);
     }
 }
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/Output/Timer.h b/src/gpu/VirtualFluids_GPU/Output/Timer.h
index 55ada64ad..fd76c6670 100644
--- a/src/gpu/VirtualFluids_GPU/Output/Timer.h
+++ b/src/gpu/VirtualFluids_GPU/Output/Timer.h
@@ -6,14 +6,15 @@
 #include "Parameter/Parameter.h"
 #include <logger/Logger.h>
 
-namespace vf::gpu{
-    class Communicator;
+namespace vf::parallel
+{
+class Communicator;
 }
 class Parameter;
 
 class Timer
 {
-    public:
+public:
     Timer(std::string _name): name(_name)
     {
         this->initTimer();
@@ -29,13 +30,12 @@ class Timer
     void startTimer();
     void stopTimer();
     void resetTimer();
-    void outputPerformance(uint t, Parameter* para, vf::gpu::Communicator& communicator);
+    void outputPerformance(uint t, Parameter* para, vf::parallel::Communicator& communicator);
 
     float getElapsedTime(){ return this->elapsedTime; }
     float getTotalElapsedTime(){ return this->totalElapsedTime; }
 
-    private:
-    
+private:
     cudaEvent_t start_t, stop_t;
     float elapsedTime = 0.0;
     float totalElapsedTime = 0.0;
@@ -44,6 +44,4 @@ class Timer
     bool firstOutput = true;
 };
 
-
-
 #endif 
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/Parameter/ParameterTest.cpp b/src/gpu/VirtualFluids_GPU/Parameter/ParameterTest.cpp
index bed52dc97..12ba280b9 100644
--- a/src/gpu/VirtualFluids_GPU/Parameter/ParameterTest.cpp
+++ b/src/gpu/VirtualFluids_GPU/Parameter/ParameterTest.cpp
@@ -9,13 +9,14 @@
 #include "PointerDefinitions.h"
 #include "basics/config/ConfigurationFile.h"
 
+#include "DataStructureInitializer/GridReaderGenerator/GridGenerator.h"
 #include "Factories/BoundaryConditionFactory.h"
 #include "Factories/GridScalingFactory.h"
-#include "Communication/Communicator.h"
-#include "DataStructureInitializer/GridReaderGenerator/GridGenerator.h"
 #include "GPU/CudaMemoryManager.h"
 #include "gpu/GridGenerator/grid/GridBuilder/MultipleGridBuilder.h"
 
+#include <parallel/Communicator.h>
+
 TEST(ParameterTest, passingEmptyFileWithoutPath_ShouldNotThrow)
 {
     // assuming that the config files is stored parallel to this file.
@@ -212,7 +213,7 @@ class MockGridGenerator : public GridGenerator
 
 public:
     MockGridGenerator(std::shared_ptr<GridBuilder> builder, std::shared_ptr<Parameter> para,
-                      std::shared_ptr<CudaMemoryManager> cudaMemoryManager, vf::gpu::Communicator &communicator)
+                      std::shared_ptr<CudaMemoryManager> cudaMemoryManager, vf::parallel::Communicator &communicator)
         : GridGenerator(builder, para, cudaMemoryManager, communicator)
     {
     }
diff --git a/src/mpi/CMakeLists.txt b/src/mpi/CMakeLists.txt
deleted file mode 100644
index de1d58f5b..000000000
--- a/src/mpi/CMakeLists.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-
-vf_add_library(NAME mpi PUBLIC_LINK logger PRIVATE_LINK MPI::MPI_CXX basics)
diff --git a/src/parallel/CMakeLists.txt b/src/parallel/CMakeLists.txt
new file mode 100644
index 000000000..742ebc363
--- /dev/null
+++ b/src/parallel/CMakeLists.txt
@@ -0,0 +1,6 @@
+
+vf_add_library(PUBLIC_LINK logger MPI::MPI_CXX basics)
+
+if(MSVC)
+    target_link_libraries(parallel PRIVATE ws2_32)
+endif()
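
Consumer targets then link against the new module by name, for example (a sketch; "MyApp" is an illustrative target name, not one from the tree):

    # in an application's CMakeLists.txt
    target_link_libraries(MyApp PRIVATE parallel)
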
diff --git a/src/mpi/Communicator.cpp b/src/parallel/Communicator.cpp
similarity index 99%
rename from src/mpi/Communicator.cpp
rename to src/parallel/Communicator.cpp
index 937f4d819..b7708cb5a 100644
--- a/src/mpi/Communicator.cpp
+++ b/src/parallel/Communicator.cpp
@@ -34,7 +34,7 @@
 #include "Communicator.h"
 #include <basics/utilities/UbException.h>
 
-namespace vf::mpi 
+namespace vf::parallel
 {
 std::mutex Communicator::instantiation_mutex = std::mutex();
 std::shared_ptr<Communicator> Communicator::instance = std::shared_ptr<Communicator>();
diff --git a/src/mpi/Communicator.h b/src/parallel/Communicator.h
similarity index 81%
rename from src/mpi/Communicator.h
rename to src/parallel/Communicator.h
index bcec064a2..81aaee952 100644
--- a/src/mpi/Communicator.h
+++ b/src/parallel/Communicator.h
@@ -34,14 +34,15 @@
 #ifndef MPI_COMMUNICATOR_H
 #define MPI_COMMUNICATOR_H
 
-#include <string>
-#include <vector>
 #include <memory>
-#include <sstream>
 #include <mutex>
+#include <sstream>
+#include <string>
+#include <vector>
 
+#include <basics/DataTypes.h>
 
-namespace vf::mpi 
+namespace vf::parallel
 {
 
 //! \brief An abstract class for communication between processes in parallel computation
@@ -56,10 +57,9 @@ public:
 
     virtual int getBundleID()                      = 0;
     virtual int getNumberOfBundles()               = 0;
-    virtual int getProcessID()                     = 0;
+    virtual int getProcessID() const               = 0;
     virtual int getProcessID(int bundle, int rank) = 0;
-    virtual int getNumberOfProcesses()             = 0;
-    virtual bool isRoot()                          = 0;
+    virtual bool isRoot() const                    = 0;
     virtual void *getNativeCommunicator()          = 0;
 
     virtual void sendSerializedObject(std::stringstream &ss, int target)    = 0;
@@ -92,6 +92,20 @@ public:
     virtual void broadcast(std::vector<double> &values)   = 0;
     virtual void broadcast(std::vector<long int> &values) = 0;
 
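+    // Additional operations required by the GPU communication path: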
+    virtual void receiveSend(uint *buffer_receive, int size_buffer_recv, int neighbor_rank_recv, uint *buffer_send,
+                             int size_buffer_send, int neighbor_rank_send) const = 0;
+    virtual int getNumberOfProcesses() const = 0;
+    virtual void send(real *sbuf, int count_s, int nb_rank) const = 0;
+    virtual double reduceSum(double quantityPerProcess) const = 0;
+    virtual int mapCudaDevicesOnHosts(const std::vector<unsigned int> &devices, int numberOfDevices) const = 0;
+    virtual void receiveSend(real *buffer_send, int size_buffer_send, real *buffer_receive, int size_buffer_recv,
+                             int neighbor_rank) const = 0;
+    virtual void receiveNonBlocking(real *rbuf, int count_r, int sourceRank) = 0;
+    virtual void sendNonBlocking(real *sbuf, int count_s, int destinationRank) = 0;
+    virtual void send(real *sbuf, int count_s, int destinationRank) = 0;
+    virtual void waitAll() = 0;
+    virtual void resetRequests() = 0;
+
 protected:
     Communicator() = default;
 
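
With the request bookkeeping now inside the Communicator, a point-to-point exchange on the calling side is expected to look roughly like this (a minimal sketch; communicator, sendBuffer, recvBuffer, numberOfNodes and neighborRank are illustrative names, only the member functions come from the interface above):

    communicator.receiveNonBlocking(recvBuffer, numberOfNodes, neighborRank);
    communicator.sendNonBlocking(sendBuffer, numberOfNodes, neighborRank);
    communicator.waitAll();       // complete all outstanding non-blocking calls
    communicator.resetRequests(); // clear the request list for the next exchange
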
diff --git a/src/mpi/MPICommunicator.cpp b/src/parallel/MPICommunicator.cpp
similarity index 61%
rename from src/mpi/MPICommunicator.cpp
rename to src/parallel/MPICommunicator.cpp
index 4e7a155ef..08fa2878e 100644
--- a/src/mpi/MPICommunicator.cpp
+++ b/src/parallel/MPICommunicator.cpp
@@ -1,12 +1,21 @@
 #if defined VF_MPI
+#if defined (_WIN32) || defined (_WIN64)
+   #include <Winsock2.h>
+#elif defined (__unix__)
+   #include <unistd.h>
+#endif
 
 #include "MPICommunicator.h"
+
 #include <mpi.h>
 
 #include <sstream>
+
+#include <logger/Logger.h>
+
 using namespace std;
 
-namespace vf::mpi 
+namespace vf::parallel
 {
 std::shared_ptr<Communicator> MPICommunicator::getInstance()
 {
@@ -22,13 +31,11 @@ MPICommunicator::MPICommunicator()
     // check whether MPI has already been initialized
     int mpiInitialized = 0; // false
     MPI_Initialized(&mpiInitialized);
-    if (!mpiInitialized) {
+    if (mpiInitialized == 0) {
         MPI_Init(NULL, NULL);
-        // MPI_Init_thread(NULL, NULL, MPI_THREAD_FUNNELED, NULL);
     }
     MPI_Comm_rank(MPI_COMM_WORLD, &PID);
     MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
-    // numprocs = 1000;
     comm = MPI_COMM_WORLD;
     root = 0;
 }
@@ -38,9 +45,8 @@ MPICommunicator::~MPICommunicator()
     // check whether MPI has already been finalized
     int _mpiFinalized = 0; // false
     MPI_Finalized(&_mpiFinalized);
-    if (!_mpiFinalized) {
+    if (_mpiFinalized == 0) {
         MPI_Finalize();
-        // UBLOG(logINFO, "MPI_Finalize()");
     }
 }
 //////////////////////////////////////////////////////////////////////////
@@ -88,11 +94,11 @@ std::vector<unsigned long long> MPICommunicator::gather(std::vector<unsigned lon
     return gather<unsigned long long>(values);
 }
 //////////////////////////////////////////////////////////////////////////
-int MPICommunicator::getProcessID() { return PID; }
+int MPICommunicator::getProcessID() const { return PID; }
 //////////////////////////////////////////////////////////////////////////
 int MPICommunicator::getProcessID(int /*bundle*/, int /*rank*/) { return PID; }
 //////////////////////////////////////////////////////////////////////////
-int MPICommunicator::getNumberOfProcesses() { return numprocs; }
+int MPICommunicator::getNumberOfProcesses() const { return numprocs; }
 //////////////////////////////////////////////////////////////////////////
 void *MPICommunicator::getNativeCommunicator() { return &comm; }
 //////////////////////////////////////////////////////////////////////////
@@ -108,7 +114,7 @@ int MPICommunicator::getProcessRoot() { return 0; }
 //////////////////////////////////////////////////////////////////////////
 int MPICommunicator::getNumberOfProcessesInBundle(int /*bundle*/) { return numprocs; }
 //////////////////////////////////////////////////////////////////////////
-bool MPICommunicator::isRoot() { return PID == root; }
+bool MPICommunicator::isRoot() const { return PID == root; }
 //////////////////////////////////////////////////////////////////////////
 void MPICommunicator::sendSerializedObject(std::stringstream &ss, int target)
 {
@@ -169,6 +175,120 @@ void MPICommunicator::broadcast(double &value) { broadcast<double>(value); }
 //////////////////////////////////////////////////////////////////////////
 void MPICommunicator::broadcast(long int &value) { broadcast<long int>(value); }
 
+void MPICommunicator::receiveSend(uint *buffer_receive, int size_buffer_recv,
+                                  int neighbor_rank_recv, uint *buffer_send, int size_buffer_send,
+                                  int neighbor_rank_send) const
+{
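+    // Post the non-blocking receive before the blocking send so that two ranks
+    // exchanging buffers with each other cannot deadlock on their sends.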
+    MPI_Request recv_request;
+    MPI_Irecv(buffer_receive, size_buffer_recv, MPI_UNSIGNED, neighbor_rank_recv, 0, comm,
+              &recv_request);
+    MPI_Send(buffer_send, size_buffer_send, MPI_UNSIGNED, neighbor_rank_send, 0, comm);
+    MPI_Wait(&recv_request, MPI_STATUS_IGNORE); // TODO: Is there a benefit here, or could we simply do a blocking receive?
+}
+
+void MPICommunicator::receiveSend(real *buffer_send, int size_buffer_send, real *buffer_receive, int size_buffer_recv,
+                     int neighbor_rank) const
+{
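+    // Blocking exchange with a single neighbor: send our buffer first, then
+    // receive the neighbor's buffer on the same tag.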
+    MPI_Send(buffer_send, size_buffer_send, VF_MPI_REAL, neighbor_rank, 0, comm);
+    MPI_Recv(buffer_receive, size_buffer_recv, VF_MPI_REAL, neighbor_rank, 0, comm, MPI_STATUS_IGNORE);
+}
+
+void MPICommunicator::send(real *sbuf, int count_s, int nb_rank) const
+{
+    MPI_Send(sbuf, count_s, VF_MPI_REAL, nb_rank, 0, comm);
+}
+
+double MPICommunicator::reduceSum(double quantityPerProcess) const
+{
+    double sum = 0.0;
+
+    MPI_Reduce(&quantityPerProcess, &sum, 1, MPI_DOUBLE, MPI_SUM, 0, comm);
+
+    // Note: the result is only meaningful on the root process.
+    return sum;
+}
+
+int MPICommunicator::mapCudaDevicesOnHosts(const std::vector<unsigned int> &devices, int numberOfDevices) const
+{
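+    // Gather every rank's hostname on the root, count how many lower-ranked
+    // processes already run on the same host, and assign each rank the next
+    // device from the list; the chosen IDs are then scattered back to all ranks.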
+    int device        = -1;
+    char *host        = (char *)malloc(sizeof(char) * getNumberOfProcesses() * 255);
+    unsigned int *map = (unsigned int *)malloc(sizeof(unsigned int) * getNumberOfProcesses());
+
+    char hostname[255];
+    gethostname(hostname, 254);
+    hostname[254] = 0;
+
+    MPI_Gather(hostname, 255, MPI_BYTE, host, 255, MPI_BYTE, 0, comm);
+
+    if (isRoot()) {
+        for (int i = 0; i < getNumberOfProcesses(); i++) {
+            int counter = 0;
+            for (int j = 0; j < i; j++) {
+                if (strcmp(&host[i * 255], &host[j * 255]) == 0)
+                    counter++;
+            }
+            if (counter >= numberOfDevices) {
+                VF_LOG_CRITICAL("More processes than GPUs!");
+                exit(1);
+            }
+            map[i] = devices[counter];
+        }
+    }
+
+    MPI_Scatter(map, 1, MPI_UNSIGNED, &device, 1, MPI_UNSIGNED, 0, comm);
+
+    VF_LOG_INFO("Rank: {} runs on host: {} with GPU: {}", getProcessID(), hostname, device);
+
+    free(map);
+    free(host);
+    return device;
+}
+
+void MPICommunicator::receiveNonBlocking(real *rbuf, int count_r, int sourceRank)
+{
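+    // Only records the request; completion is deferred to waitAll().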
+    MPI_Request request;
+    MPI_Irecv(rbuf, count_r, VF_MPI_REAL, sourceRank, 0, comm, &request);
+    requests.push_back(request);
+}
+
+void MPICommunicator::sendNonBlocking(real *sbuf, int count_s, int destinationRank)
+{
+    MPI_Request request;
+    MPI_Isend(sbuf, count_s, VF_MPI_REAL, destinationRank, 0, comm, &request);
+    requests.push_back(request);
+}
+
+void MPICommunicator::send(real *sbuf, int count_s, int destinationRank)
+{
+    MPI_Send(sbuf, count_s, VF_MPI_REAL, destinationRank, 0, comm);
+}
+
+void MPICommunicator::waitAll()
+{
+    MPI_Waitall((int)requests.size(), requests.data(), MPI_STATUSES_IGNORE);
+}
+
+void MPICommunicator::resetRequests()
+{
+    requests.clear();
+}
+
 }
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/mpi/MPICommunicator.h b/src/parallel/MPICommunicator.h
similarity index 84%
rename from src/mpi/MPICommunicator.h
rename to src/parallel/MPICommunicator.h
index 941bdac8f..d011fa130 100644
--- a/src/mpi/MPICommunicator.h
+++ b/src/parallel/MPICommunicator.h
@@ -4,14 +4,22 @@
 #define MPI_MPICOMMUNICATOR_H
 
 #include "Communicator.h"
-#include <PointerDefinitions.h>
+#include <basics/PointerDefinitions.h>
 #include <basics/utilities/UbException.h>
 #include <basics/utilities/UbLogger.h>
 #include <mpi.h>
 #include <string>
 #include <vector>
 
-namespace vf::mpi 
+//////////////////////////////////
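+// Map the project's floating-point type 'real' to the matching MPI datatype.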
+#ifdef VF_DOUBLE_ACCURACY
+#define VF_MPI_REAL MPI_DOUBLE
+#else
+#define VF_MPI_REAL MPI_FLOAT
+#endif
+//////////////////////////////////
+
+namespace vf::parallel
 {
 
 //! \brief A class that uses the MPI library for communication.
@@ -27,15 +35,15 @@ public:
     static std::shared_ptr<Communicator> getInstance();
     int getBundleID() override;
     int getNumberOfBundles() override;
-    int getProcessID() override;
+    int getProcessID() const override;
     int getProcessID(int bundle, int rank) override;
-    int getNumberOfProcesses() override;
+    int getNumberOfProcesses() const override;
     void *getNativeCommunicator() override;
     int getRoot() override;
     int getBundleRoot() override;
     int getProcessRoot() override;
     int getNumberOfProcessesInBundle(int bundle) override;
-    bool isRoot() override;
+    bool isRoot() const override;
     void abort(int errorcode) override;
 
     void sendSerializedObject(std::stringstream &ss, int target) override;
@@ -75,12 +83,30 @@ public:
     template <class T>
     void broadcast(T &value);
 
+    void receiveSend(uint *buffer_receive, int size_buffer_recv, int neighbor_rank_recv, uint *buffer_send,
+                     int size_buffer_send, int neighbor_rank_send) const override;
+
+    void send(real *sbuf, int count_s, int nb_rank) const override;
+    double reduceSum(double quantityPerProcess) const override;
+
+    int mapCudaDevicesOnHosts(const std::vector<unsigned int> &devices, int numberOfDevices) const override;
+    void receiveSend(real *buffer_send, int size_buffer_send, real *buffer_receive, int size_buffer_recv,
+                     int neighbor_rank) const override;
+
+    void receiveNonBlocking(real *rbuf, int count_r, int sourceRank) override;
+    void sendNonBlocking(real *sbuf, int count_s, int destinationRank) override;
+    void send(real *sbuf, int count_s, int destinationRank) override;
+    void waitAll() override;
+    void resetRequests() override;
+
 private:
     MPICommunicator();
 
     int numprocs, PID;
     MPI_Comm comm;
     int root;
+
+    std::vector<MPI_Request> requests;
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -96,6 +122,8 @@ std::vector<T> MPICommunicator::gather(std::vector<T> &values)
         mpiDataType = MPI_INT;
     else if ((std::string) typeid(T).name() == (std::string) typeid(unsigned long long).name())
         mpiDataType = MPI_UNSIGNED_LONG_LONG;
+    else if ((std::string) typeid(T).name() == (std::string) typeid(char).name())
+        mpiDataType = MPI_CHAR;
     else
         throw UbException(UB_EXARGS, "no MpiDataType for T" + (std::string) typeid(T).name());
 
@@ -209,6 +237,7 @@ void MPICommunicator::broadcast(T &value)
 }
 //////////////////////////////////////////////////////////////////////////
 
+
 #endif
 
 }
diff --git a/src/mpi/NullCommunicator.cpp b/src/parallel/NullCommunicator.cpp
similarity index 80%
rename from src/mpi/NullCommunicator.cpp
rename to src/parallel/NullCommunicator.cpp
index 267942895..b319fbd25 100644
--- a/src/mpi/NullCommunicator.cpp
+++ b/src/parallel/NullCommunicator.cpp
@@ -33,7 +33,7 @@
 
 #include "NullCommunicator.h"
 
-namespace vf::mpi
+namespace vf::parallel
 {
 
     std::shared_ptr<Communicator> NullCommunicator::getInstance()
@@ -49,7 +49,7 @@ namespace vf::mpi
     //////////////////////////////////////////////////////////////////////////
     int NullCommunicator::getNumberOfBundles() { return 0; }
     //////////////////////////////////////////////////////////////////////////
-    int NullCommunicator::getProcessID() { return 0; }
+    int NullCommunicator::getProcessID() const { return 0; }
     //////////////////////////////////////////////////////////////////////////
-    int NullCommunicator::getNumberOfProcesses() { return 0; }
     //////////////////////////////////////////////////////////////////////////
@@ -69,7 +69,7 @@ namespace vf::mpi
     void NullCommunicator::receiveSerializedObject(std::stringstream &ss, int source) {}
 
     int NullCommunicator::getProcessID(int bundle, int rank) { return 0; }
-    bool NullCommunicator::isRoot() {return true; }
+    bool NullCommunicator::isRoot() const { return true; }
 
     int NullCommunicator::getNumberOfProcessesInBundle(int bundle) {return 0;}
     void NullCommunicator::barrier() {}
@@ -94,4 +94,45 @@ namespace vf::mpi
     void NullCommunicator::broadcast(std::vector<float> &values){ }
     void NullCommunicator::broadcast(std::vector<double> &values){ }
     void NullCommunicator::broadcast(std::vector<long int> &values){ }
+
+    void NullCommunicator::receiveSend(uint *buffer_receive, int size_buffer_recv, int neighbor_rank_recv,
+                uint *buffer_send, int size_buffer_send,
+                int neighbor_rank_send) const {}
+
+    void NullCommunicator::send(real *sbuf, int count_s, int nb_rank) const {}
+    // With a single process, the "sum" is just the local value.
+    double NullCommunicator::reduceSum(double quantityPerProcess) const { return quantityPerProcess; }
+    int NullCommunicator::getNumberOfProcesses() const
+    {
+        return 1;
+    }
+
+    int NullCommunicator::mapCudaDevicesOnHosts(const std::vector<unsigned int> &devices, int numberOfDevices) const
+    {
+        return 0;
+    }
+
+    void NullCommunicator::receiveSend(real *buffer_send, int size_buffer_send, real *buffer_receive, int size_buffer_recv,
+                     int neighbor_rank) const
+    {
+    }
+
+
+    void NullCommunicator::receiveNonBlocking(real *rbuf, int count_r, int sourceRank)
+    {
+    }
+    void NullCommunicator::sendNonBlocking(real *sbuf, int count_s, int destinationRank)
+    {
+    }
+
+    void NullCommunicator::send(real *sbuf, int count_s, int destinationRank)
+    {
+    }
+
+    void NullCommunicator::waitAll()
+    {
+    }
+
+    void NullCommunicator::resetRequests()
+    {
+    }
 }
diff --git a/src/mpi/NullCommunicator.h b/src/parallel/NullCommunicator.h
similarity index 56%
rename from src/mpi/NullCommunicator.h
rename to src/parallel/NullCommunicator.h
index 836f801ab..312a40f08 100644
--- a/src/mpi/NullCommunicator.h
+++ b/src/parallel/NullCommunicator.h
@@ -36,7 +36,7 @@
 
 #include "Communicator.h"
 
-namespace vf::mpi
+namespace vf::parallel
 {
 
 //! \brief A class that implements Communicator for shared memory.
@@ -46,43 +46,61 @@ class NullCommunicator : public Communicator
 public:
     static std::shared_ptr<Communicator> getInstance();
 
-    int getBundleID();
-    int getNumberOfBundles();
-    int getProcessID();
-    int getProcessID(int bundle, int rank);
+    int getBundleID() override;
+    int getNumberOfBundles() override;
+    int getProcessID() const override;
+    int getProcessID(int bundle, int rank) override;
-    int getNumberOfProcesses();
-    bool isRoot();
-    void *getNativeCommunicator();
-
-    void sendSerializedObject(std::stringstream &ss, int target);
-    void receiveSerializedObject(std::stringstream &ss, int source);
-
-    int getRoot();
-    int getBundleRoot();
-    int getProcessRoot();
-    int getNumberOfProcessesInBundle(int bundle);
-    void barrier();
-    void abort(int errorcode);
-
-    std::vector<std::string> gather(const std::string &str);
-    std::vector<int> gather(std::vector<int> &values);
-    std::vector<float> gather(std::vector<float> &values);
-    std::vector<double> gather(std::vector<double> &values);
-    std::vector<unsigned long long> gather(std::vector<unsigned long long> &values);
-
-    void allGather(std::vector<int> &svalues, std::vector<int> &rvalues);
-    void allGather(std::vector<float> &svalues, std::vector<float> &rvalues);
-    void allGather(std::vector<double> &svalues, std::vector<double> &rvalues);
-    void allGather(std::vector<unsigned long long> &svalues, std::vector<unsigned long long> &rvalues);
-
-    void broadcast(int &value);
-    void broadcast(float &value);
-    void broadcast(double &value);
-    void broadcast(long int &value);
-    void broadcast(std::vector<int> &values);
-    void broadcast(std::vector<float> &values);
-    void broadcast(std::vector<double> &values);
-    void broadcast(std::vector<long int> &values);
+    bool isRoot() const override;
+    void *getNativeCommunicator() override;
+
+    void sendSerializedObject(std::stringstream &ss, int target) override;
+    void receiveSerializedObject(std::stringstream &ss, int source) override;
+
+    int getRoot() override;
+    int getBundleRoot() override;
+    int getProcessRoot() override;
+    int getNumberOfProcessesInBundle(int bundle) override;
+    void barrier() override;
+    void abort(int errorcode) override;
+
+    std::vector<std::string> gather(const std::string &str) override;
+    std::vector<int> gather(std::vector<int> &values) override;
+    std::vector<float> gather(std::vector<float> &values) override;
+    std::vector<double> gather(std::vector<double> &values) override;
+    std::vector<unsigned long long> gather(std::vector<unsigned long long> &values) override;
+
+    void allGather(std::vector<int> &svalues, std::vector<int> &rvalues) override;
+    void allGather(std::vector<float> &svalues, std::vector<float> &rvalues) override;
+    void allGather(std::vector<double> &svalues, std::vector<double> &rvalues) override;
+    void allGather(std::vector<unsigned long long> &svalues, std::vector<unsigned long long> &rvalues) override;
+
+    void broadcast(int &value) override;
+    void broadcast(float &value) override;
+    void broadcast(double &value) override;
+    void broadcast(long int &value) override;
+    void broadcast(std::vector<int> &values) override;
+    void broadcast(std::vector<float> &values) override;
+    void broadcast(std::vector<double> &values) override;
+    void broadcast(std::vector<long int> &values) override;
+
+    void receiveSend(uint *buffer_receive, int size_buffer_recv, int neighbor_rank_recv, uint *buffer_send,
+                     int size_buffer_send, int neighbor_rank_send) const override;
+
+    void send(real *sbuf, int count_s, int nb_rank) const override;
+
+    double reduceSum(double quantityPerProcess) const override;
+    int getNumberOfProcesses() const override;
+    int mapCudaDevicesOnHosts(const std::vector<unsigned int> &devices, int numberOfDevices) const override;
+
+    void receiveSend(real *buffer_send, int size_buffer_send, real *buffer_receive, int size_buffer_recv,
+                     int neighbor_rank) const override;
+
+    void receiveNonBlocking(real *rbuf, int count_r, int sourceRank) override;
+    void sendNonBlocking(real *sbuf, int count_s, int destinationRank) override;
+    void send(real *sbuf, int count_s, int destinationRank) override;
+    void waitAll() override;
+    void resetRequests() override;
 };
 
 }
-- 
GitLab