diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 2cf3e6a6497efc437fee5a3281fa72c75ee59eae..84b1c5ee16b28e2b173b6d8731e09de49320f58e 100755 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -275,6 +275,7 @@ trigger-regression-tests: regression_test_4gpu: image: python:latest stage: test + needs: [] rules: - if: $REMOTE_USER && $REMOTE_HOST && $PRIVATE_KEY && $CI_PIPELINE_SOURCE == "schedule" @@ -287,7 +288,10 @@ regression_test_4gpu: - pip install "fieldcompare[all]" script: - - hpc-rocket launch --watch regression-tests/multigpu_test/rocket4GPU.yml + - hpc-rocket launch regression-tests/multigpu_test/rocket4GPU.yml |& tee hpcrocket4GPU.log + - hpc-rocket watch regression-tests/multigpu_test/rocket4GPU.yml $(python regression-tests/multigpu_test/utilities/parsejobid.py hpcrocket4GPU.log) + - hpc-rocket finalize regression-tests/multigpu_test/rocket4GPU.yml + - cat output/4GPU/slurm4GPU.out - git clone --depth 1 --filter=blob:none --sparse https://github.com/irmb/test_data - cd test_data - git sparse-checkout set regression_tests/gpu/DrivenCavity_4GPU_2Levels regression_tests/gpu/SphereScaling_4GPU_2Levels @@ -295,10 +299,19 @@ regression_test_4gpu: - fieldcompare dir output/4GPU test_data/regression_tests/gpu/DrivenCavity_4GPU_2Levels --include-files "DrivenCavityMultiGPU*.vtu" - fieldcompare dir output/4GPU test_data/regression_tests/gpu/SphereScaling_4GPU_2Levels --include-files "SphereScaling*.vtu" + after_script: + - hpc-rocket cancel regression-tests/multigpu_test/rocket4GPU.yml $(python regression-tests/multigpu_test/utilities/parsejobid.py hpcrocket4GPU.log) + + artifacts: + when: always + paths: + - output/4GPU/slurm4GPU.out + expire_in: 1 week ############################################################################### regression_test_8gpu: image: python:latest stage: test + needs: [] rules: - if: $REMOTE_USER && $REMOTE_HOST && $PRIVATE_KEY && $CI_PIPELINE_SOURCE == "schedule" @@ -311,13 +324,25 @@ regression_test_8gpu: - pip install "fieldcompare[all]" script: - - hpc-rocket launch --watch regression-tests/multigpu_test/rocket8GPU.yml + - hpc-rocket launch regression-tests/multigpu_test/rocket8GPU.yml |& tee hpcrocket8GPU.log + - hpc-rocket watch regression-tests/multigpu_test/rocket8GPU.yml $(python regression-tests/multigpu_test/utilities/parsejobid.py hpcrocket8GPU.log) + - hpc-rocket finalize regression-tests/multigpu_test/rocket8GPU.yml + - cat output/8GPU/slurm8GPU.out - git clone --depth 1 --filter=blob:none --sparse https://github.com/irmb/test_data - cd test_data - git sparse-checkout set regression_tests/gpu/DrivenCavity_8GPU_2Levels regression_tests/gpu/SphereScaling_8GPU_2Levels - cd .. - fieldcompare dir output/8GPU test_data/regression_tests/gpu/DrivenCavity_8GPU_2Levels --include-files "DrivenCavityMultiGPU*.vtu" - fieldcompare dir output/8GPU test_data/regression_tests/gpu/SphereScaling_8GPU_2Levels --include-files "SphereScaling*.vtu" + + after_script: + - hpc-rocket cancel regression-tests/multigpu_test/rocket8GPU.yml $(python regression-tests/multigpu_test/utilities/parsejobid.py hpcrocket8GPU.log) + + artifacts: + when: always + paths: + - output/8GPU/slurm8GPU.out + expire_in: 1 week ############################################################################### ## Benchmark ## ############################################################################### diff --git a/apps/gpu/LBM/DrivenCavityMultiGPU/configPhoenix4GPU_regressionTest.txt b/apps/gpu/LBM/DrivenCavityMultiGPU/configPhoenix4GPU_regressionTest.txt index c5789cdf96049b7c0a31ce693c29cd2db4952a58..9b8c6e42de49997cc218a2a0ee6f832b903d142b 100644 --- a/apps/gpu/LBM/DrivenCavityMultiGPU/configPhoenix4GPU_regressionTest.txt +++ b/apps/gpu/LBM/DrivenCavityMultiGPU/configPhoenix4GPU_regressionTest.txt @@ -1,7 +1,7 @@ ################################################## #GPU Mapping ################################################## -Devices="0 1 2 3" +Devices=0 1 2 3 NumberOfDevices=4 ################################################## diff --git a/apps/gpu/LBM/DrivenCavityMultiGPU/configPhoenix8GPU_regressionTest.txt b/apps/gpu/LBM/DrivenCavityMultiGPU/configPhoenix8GPU_regressionTest.txt index c5789cdf96049b7c0a31ce693c29cd2db4952a58..9b8c6e42de49997cc218a2a0ee6f832b903d142b 100644 --- a/apps/gpu/LBM/DrivenCavityMultiGPU/configPhoenix8GPU_regressionTest.txt +++ b/apps/gpu/LBM/DrivenCavityMultiGPU/configPhoenix8GPU_regressionTest.txt @@ -1,7 +1,7 @@ ################################################## #GPU Mapping ################################################## -Devices="0 1 2 3" +Devices=0 1 2 3 NumberOfDevices=4 ################################################## diff --git a/apps/gpu/LBM/SphereScaling/configPhoenix4GPU_regressionTest.txt b/apps/gpu/LBM/SphereScaling/configPhoenix4GPU_regressionTest.txt index c5789cdf96049b7c0a31ce693c29cd2db4952a58..9b8c6e42de49997cc218a2a0ee6f832b903d142b 100644 --- a/apps/gpu/LBM/SphereScaling/configPhoenix4GPU_regressionTest.txt +++ b/apps/gpu/LBM/SphereScaling/configPhoenix4GPU_regressionTest.txt @@ -1,7 +1,7 @@ ################################################## #GPU Mapping ################################################## -Devices="0 1 2 3" +Devices=0 1 2 3 NumberOfDevices=4 ################################################## diff --git a/apps/gpu/LBM/SphereScaling/configPhoenix8GPU_regressionTest.txt b/apps/gpu/LBM/SphereScaling/configPhoenix8GPU_regressionTest.txt index c5789cdf96049b7c0a31ce693c29cd2db4952a58..9b8c6e42de49997cc218a2a0ee6f832b903d142b 100644 --- a/apps/gpu/LBM/SphereScaling/configPhoenix8GPU_regressionTest.txt +++ b/apps/gpu/LBM/SphereScaling/configPhoenix8GPU_regressionTest.txt @@ -1,7 +1,7 @@ ################################################## #GPU Mapping ################################################## -Devices="0 1 2 3" +Devices=0 1 2 3 NumberOfDevices=4 ################################################## diff --git a/regression-tests/multigpu_test/rocket4GPU.yml b/regression-tests/multigpu_test/rocket4GPU.yml index 33b7178372f0ce869565e8eee567a75ac8759ee4..a7ece055707c26a7a27e9c12be3447dc1b77855f 100755 --- a/regression-tests/multigpu_test/rocket4GPU.yml +++ b/regression-tests/multigpu_test/rocket4GPU.yml @@ -37,7 +37,7 @@ copy: collect: - from: multigpu_test/output/4GPU/ - to: output/4GPU + to: output/4GPU/ overwrite: true - from: multigpu_test/slurm4GPU.out @@ -45,8 +45,7 @@ collect: overwrite: true clean: - - multigpu_test/output/* - - multigpu_test/src/* + - multigpu_test/* sbatch: multigpu_test/slurm4GPU.job continue_if_job_fails: true diff --git a/regression-tests/multigpu_test/rocket8GPU.yml b/regression-tests/multigpu_test/rocket8GPU.yml index 9c59b4ce52984ad552bb3d32233a4d694589721f..4b434fc8a2433dab513649800dbe3f160d986edd 100755 --- a/regression-tests/multigpu_test/rocket8GPU.yml +++ b/regression-tests/multigpu_test/rocket8GPU.yml @@ -37,7 +37,7 @@ copy: collect: - from: multigpu_test/output/8GPU/ - to: output/8GPU + to: output/8GPU/ overwrite: true - from: multigpu_test/slurm8GPU.out @@ -45,8 +45,7 @@ collect: overwrite: true clean: - - multigpu_test/output/* - - multigpu_test/src/* + - multigpu_test/* sbatch: multigpu_test/slurm8GPU.job continue_if_job_fails: true diff --git a/regression-tests/multigpu_test/slurm4GPU.job b/regression-tests/multigpu_test/slurm4GPU.job index 70b33f07f4a4a7be7f5b50990098f3322238af4a..0be42c51bac9a341b56eb705f9bdb518883f507d 100755 --- a/regression-tests/multigpu_test/slurm4GPU.job +++ b/regression-tests/multigpu_test/slurm4GPU.job @@ -2,14 +2,19 @@ #SBATCH --partition=gpu01_queue #SBATCH --nodes=1 -#SBATCH --time=10:00:00 +#SBATCH --time=03:00:00 #SBATCH --job-name=Regr4GPU #SBATCH --ntasks-per-node=4 #SBATCH --gres=gpu:4 #SBATCH --output=multigpu_test/slurm4GPU.out ##SBATCH --exclusive -module purge +echo "SLURM_JOBID="$SLURM_JOBID +echo "SLURM_JOB_NODELIST"=$SLURM_JOB_NODELIST +echo "SLURM_NNODES"=$SLURM_NNODES +echo "SLURM_TASKS_PER_NODE"=$SLURM_TASKS_PER_NODE + +module purge module load comp/gcc/10.2.0 module load mpi/openmpi/4.0.5_gcc_9.3/openmpi module load cuda/11.3 @@ -19,7 +24,7 @@ PATH=/home/irmb/tools/cmake-3.20.3-linux-x86_64/bin:$PATH module list cd multigpu_test -rm -r build && mkdir -p build +rm -rf build && mkdir -p build cd build cmake .. -DBUILD_VF_GPU=ON -DCMAKE_CUDA_ARCHITECTURES=60 -DUSER_APPS=apps/gpu/LBM/DrivenCavityMultiGPU\;apps/gpu/LBM/SphereScaling make -j 16 diff --git a/regression-tests/multigpu_test/slurm8GPU.job b/regression-tests/multigpu_test/slurm8GPU.job index b91d7d473d935d4d0fbe8deba344dbaa58cf5080..bb7bf55c70eb6b178eff3f52e18c35d7cafd6938 100755 --- a/regression-tests/multigpu_test/slurm8GPU.job +++ b/regression-tests/multigpu_test/slurm8GPU.job @@ -2,14 +2,19 @@ #SBATCH --partition=gpu01_queue #SBATCH --nodes=2 -#SBATCH --time=10:00:00 +#SBATCH --time=03:00:00 #SBATCH --job-name=Regr8GPU #SBATCH --ntasks-per-node=4 #SBATCH --gres=gpu:4 #SBATCH --output=multigpu_test/slurm8GPU.out ##SBATCH --exclusive -module purge +echo "SLURM_JOBID="$SLURM_JOBID +echo "SLURM_JOB_NODELIST"=$SLURM_JOB_NODELIST +echo "SLURM_NNODES"=$SLURM_NNODES +echo "SLURM_TASKS_PER_NODE"=$SLURM_TASKS_PER_NODE + +module purge module load comp/gcc/10.2.0 module load mpi/openmpi/4.0.5_gcc_9.3/openmpi module load cuda/11.3 @@ -19,7 +24,7 @@ PATH=/home/irmb/tools/cmake-3.20.3-linux-x86_64/bin:$PATH module list cd multigpu_test -rm -r build && mkdir -p build +rm -rf build && mkdir -p build cd build cmake .. -DBUILD_VF_GPU=ON -DCMAKE_CUDA_ARCHITECTURES=60 -DUSER_APPS=apps/gpu/LBM/DrivenCavityMultiGPU\;apps/gpu/LBM/SphereScaling make -j 16 diff --git a/regression-tests/multigpu_test/utilities/parsejobid.py b/regression-tests/multigpu_test/utilities/parsejobid.py new file mode 100644 index 0000000000000000000000000000000000000000..f209f6d3d0ac20243a94d16f89d2aaddeeae24f7 --- /dev/null +++ b/regression-tests/multigpu_test/utilities/parsejobid.py @@ -0,0 +1,20 @@ +from pathlib import Path +import sys + +LAUNCH_MESSAGE = "Launched job" + + +def parsejobid(file: str) -> str: + file_path = Path(file) + if not file_path.exists(): + raise FileNotFoundError(file) + + text_content = file_path.read_text().strip() + launch_line = next( + filter(lambda line: LAUNCH_MESSAGE in line, text_content.splitlines()) + ) + return launch_line.split()[-1].strip() + + +if __name__ == "__main__": + print(parsejobid(sys.argv[1])) diff --git a/src/cuda/DeviceInfo.cpp b/src/cuda/DeviceInfo.cpp index 20ea2c4f6ba098b17e444f55625a6791e46141e5..81fea1cf21418ef98217293985f8c1101f8635be 100644 --- a/src/cuda/DeviceInfo.cpp +++ b/src/cuda/DeviceInfo.cpp @@ -15,7 +15,7 @@ void verifyNumberOfDevices(int deviceId) int device_count = 0; cudaError_t errorId = cudaGetDeviceCount(&device_count); if(errorId != cudaSuccess) { - VF_LOG_CRITICAL("Error while accessing the device count: {}", cudaGetErrorString(errorId)); + VF_LOG_CRITICAL("Device {}: Error while accessing the device count: {}", deviceId, cudaGetErrorString(errorId)); } if (deviceId > device_count) { throw std::runtime_error("chosen gpudevice >= device_count ... exiting\n"); @@ -28,13 +28,13 @@ void verifyComputeCapability(int deviceId) cudaError_t errorId = cudaGetDeviceProperties(&deviceProp, deviceId); if(errorId != cudaSuccess){ - VF_LOG_CRITICAL("Error while accessing the device properties occurs: {}", cudaGetErrorString(errorId)); + VF_LOG_CRITICAL("Device {}: Error while accessing the device properties occurs: {}", deviceId, cudaGetErrorString(errorId)); } VF_LOG_INFO("[compute capability] = [{}.{}]\n", deviceProp.major, deviceProp.minor); if (deviceProp.major > 999) { - throw std::runtime_error("warning, CUDA Device Emulation (CPU) detected, exiting\n"); + throw std::runtime_error("Warning, CUDA Device Emulation (CPU) detected, exiting\n"); } } @@ -43,13 +43,13 @@ void setCudaDevice(int deviceId) // choose a cuda device for kernel execution cudaError_t errorId = cudaSetDevice(deviceId); if (errorId != cudaSuccess) { - VF_LOG_CRITICAL("Error while setting the device to {}: {}", deviceId, cudaGetErrorString(errorId)); + VF_LOG_CRITICAL("Device {}: Error while setting the device to: {}", deviceId, cudaGetErrorString(errorId)); } else { int device; // double check that device was properly selected errorId = cudaGetDevice(&device); if(errorId != cudaSuccess) { - VF_LOG_CRITICAL("Error while getting the device: {}", cudaGetErrorString(errorId)); + VF_LOG_CRITICAL("Device {}: Error while getting the device: {}", deviceId, cudaGetErrorString(errorId)); } } } @@ -70,7 +70,7 @@ void printCudaInformation(int deviceId) cudaError_t errorId = cudaGetDeviceProperties(&prop, deviceId); if(errorId != cudaSuccess){ - VF_LOG_CRITICAL("Error while accessing the device properties occurs: {}", cudaGetErrorString(errorId)); + VF_LOG_CRITICAL("Device {}: Error while accessing the device properties occurs: {}", deviceId, cudaGetErrorString(errorId)); } printf(" --- General Information for device %d ---\n", deviceId); diff --git a/src/logger/Logger.h b/src/logger/Logger.h index 3a25fea02eb7d5ea1ab9bffebea08bfc9f512b04..f3c41c0e3bfb8aa12c94677040273fdaaaff64a3 100644 --- a/src/logger/Logger.h +++ b/src/logger/Logger.h @@ -39,10 +39,10 @@ // The default log level is set to trace. Supported levels: trace < debug < info < warning < critical // // The logging is realized in 3 different log sinks: -// 1. colorded console output +// 1. colored console output // 2. a daily log file // 3. a log file from the last run of VirtualFluids -// The default file path is relativ to executed command logs/ +// The default file path is relative to executed command logs/ // File path can be changed via changeLogPath() #define VF_LOG_TRACE(...) spdlog::trace(__VA_ARGS__) @@ -57,7 +57,7 @@ namespace vf::logging class Logger { public: - // initalizing the above named logger + // initializing the above named logger static void initializeLogger(); // changing the path of the log files