diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index aafc918d018fd9d60beab6a57a5f71d6a8d079f6..26196513dc02377203cbc09896c43f894d957ad7 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -359,6 +359,9 @@ include_what_you_use_clang_10:
 cppcheck:
   stage: analyze
 
+  only:
+    - develop@irmb/VirtualFluids_dev
+
   image: irmb/virtualfluids-deps-ubuntu20.04
 
   needs: []
@@ -385,6 +388,9 @@ cppcheck:
 lizard:
   stage: analyze
 
+  only:
+    - develop@irmb/VirtualFluids_dev
+
   image: irmb/virtualfluids-python-deps-ubuntu20.04
 
   needs: []
@@ -409,6 +415,9 @@ lizard:
 gcov_gcc_9:
   stage: analyze
 
+  only:
+    - develop@irmb/VirtualFluids_dev
+
   image: irmb/virtualfluids-python-deps-ubuntu20.04
 
   needs: []
@@ -450,6 +459,9 @@ gcov_gcc_9:
 clang-tidy:
   stage: analyze
 
+  only:
+    - develop@irmb/VirtualFluids_dev
+
   image: irmb/virtualfluids-python-deps-ubuntu20.04
 
   needs: []
diff --git a/3rdParty/cuda_samples/README b/3rdParty/cuda_samples/README
new file mode 100644
index 0000000000000000000000000000000000000000..5db13e7bda8365d792e3e68840d02c442348596e
--- /dev/null
+++ b/3rdParty/cuda_samples/README
@@ -0,0 +1,2 @@
+# 3rd party cuda
+The files in this folder are added from here https://github.com/NVIDIA/cuda-samples/blob/v11.2/Common/.
\ No newline at end of file
diff --git a/3rdParty/cuda_samples/exception.h b/3rdParty/cuda_samples/exception.h
new file mode 100644
index 0000000000000000000000000000000000000000..84e348b59fb6892439e8057b03093bac48cadcd4
--- /dev/null
+++ b/3rdParty/cuda_samples/exception.h
@@ -0,0 +1,151 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* CUda UTility Library */
+#ifndef COMMON_EXCEPTION_H_
+#define COMMON_EXCEPTION_H_
+
+// includes, system
+#include <stdlib.h>
+#include <exception>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+
+//! Exception wrapper.
+//! @param Std_Exception Exception out of namespace std for easy typing.
+template <class Std_Exception>
+class Exception : public Std_Exception {
+public:
+    //! @brief Static construction interface
+    //! @return Alwayss throws ( Located_Exception<Exception>)
+    //! @param file file in which the Exception occurs
+    //! @param line line in which the Exception occurs
+    //! @param detailed details on the code fragment causing the Exception
+    static void throw_it(const char *file, const int line,
+                         const char *detailed = "-");
+
+    //! Static construction interface
+    //! @return Alwayss throws ( Located_Exception<Exception>)
+    //! @param file file in which the Exception occurs
+    //! @param line line in which the Exception occurs
+    //! @param detailed details on the code fragment causing the Exception
+    static void throw_it(const char *file, const int line,
+                         const std::string &detailed);
+
+    //! Destructor
+    virtual ~Exception() throw();
+
+private:
+    //! Constructor, default (private)
+    Exception();
+
+    //! Constructor, standard
+    //! @param str string returned by what()
+    explicit Exception(const std::string &str);
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//! Exception handler function for arbitrary exceptions
+//! @param ex exception to handle
+////////////////////////////////////////////////////////////////////////////////
+template <class Exception_Typ>
+inline void handleException(const Exception_Typ &ex) {
+    std::cerr << ex.what() << std::endl;
+
+    exit(EXIT_FAILURE);
+}
+
+//! Convenience macros
+
+//! Exception caused by dynamic program behavior, e.g. file does not exist
+#define RUNTIME_EXCEPTION(msg) \
+  Exception<std::runtime_error>::throw_it(__FILE__, __LINE__, msg)
+
+//! Logic exception in program, e.g. an assert failed
+#define LOGIC_EXCEPTION(msg) \
+  Exception<std::logic_error>::throw_it(__FILE__, __LINE__, msg)
+
+//! Out of range exception
+#define RANGE_EXCEPTION(msg) \
+  Exception<std::range_error>::throw_it(__FILE__, __LINE__, msg)
+
+////////////////////////////////////////////////////////////////////////////////
+//! Implementation
+
+// includes, system
+#include <sstream>
+
+////////////////////////////////////////////////////////////////////////////////
+//! Static construction interface.
+//! @param  Exception causing code fragment (file and line) and detailed infos.
+////////////////////////////////////////////////////////////////////////////////
+/*static*/ template <class Std_Exception>
+void Exception<Std_Exception>::throw_it(const char *file, const int line,
+                                        const char *detailed) {
+    std::stringstream s;
+
+    // Quiet heavy-weight but exceptions are not for
+    // performance / release versions
+    s << "Exception in file '" << file << "' in line " << line << "\n"
+      << "Detailed description: " << detailed << "\n";
+
+    throw Exception(s.str());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Static construction interface.
+//! @param  Exception causing code fragment (file and line) and detailed infos.
+////////////////////////////////////////////////////////////////////////////////
+/*static*/ template <class Std_Exception>
+void Exception<Std_Exception>::throw_it(const char *file, const int line,
+                                        const std::string &msg) {
+    throw_it(file, line, msg.c_str());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Constructor, default (private).
+////////////////////////////////////////////////////////////////////////////////
+template <class Std_Exception>
+Exception<Std_Exception>::Exception() : Std_Exception("Unknown Exception.\n") {}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Constructor, standard (private).
+//! String returned by what().
+////////////////////////////////////////////////////////////////////////////////
+template <class Std_Exception>
+Exception<Std_Exception>::Exception(const std::string &s) : Std_Exception(s) {}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Destructor
+////////////////////////////////////////////////////////////////////////////////
+template <class Std_Exception>
+Exception<Std_Exception>::~Exception() throw() {}
+
+// functions, exported
+
+#endif  // COMMON_EXCEPTION_H_
\ No newline at end of file
diff --git a/3rdParty/cuda_samples/helper_cuda.h b/3rdParty/cuda_samples/helper_cuda.h
new file mode 100644
index 0000000000000000000000000000000000000000..81d3f9e76983a32daff4c2649ab0880e29f9881a
--- /dev/null
+++ b/3rdParty/cuda_samples/helper_cuda.h
@@ -0,0 +1,967 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+////////////////////////////////////////////////////////////////////////////////
+// These are CUDA Helper functions for initialization and error checking
+
+#ifndef COMMON_HELPER_CUDA_H_
+#define COMMON_HELPER_CUDA_H_
+
+#pragma once
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <helper_string.h>
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// Note, it is required that your SDK sample to include the proper header
+// files, please refer the CUDA examples for examples of the needed CUDA
+// headers, which may change depending on which CUDA functions are used.
+
+// CUDA Runtime error messages
+#ifdef __DRIVER_TYPES_H__
+static const char *_cudaGetErrorEnum(cudaError_t error) {
+  return cudaGetErrorName(error);
+}
+#endif
+
+#ifdef CUDA_DRIVER_API
+// CUDA Driver API errors
+static const char *_cudaGetErrorEnum(CUresult error) {
+  static char unknown[] = "<unknown>";
+  const char *ret = NULL;
+  cuGetErrorName(error, &ret);
+  return ret ? ret : unknown;
+}
+#endif
+
+#ifdef CUBLAS_API_H_
+// cuBLAS API errors
+static const char *_cudaGetErrorEnum(cublasStatus_t error) {
+  switch (error) {
+    case CUBLAS_STATUS_SUCCESS:
+      return "CUBLAS_STATUS_SUCCESS";
+
+    case CUBLAS_STATUS_NOT_INITIALIZED:
+      return "CUBLAS_STATUS_NOT_INITIALIZED";
+
+    case CUBLAS_STATUS_ALLOC_FAILED:
+      return "CUBLAS_STATUS_ALLOC_FAILED";
+
+    case CUBLAS_STATUS_INVALID_VALUE:
+      return "CUBLAS_STATUS_INVALID_VALUE";
+
+    case CUBLAS_STATUS_ARCH_MISMATCH:
+      return "CUBLAS_STATUS_ARCH_MISMATCH";
+
+    case CUBLAS_STATUS_MAPPING_ERROR:
+      return "CUBLAS_STATUS_MAPPING_ERROR";
+
+    case CUBLAS_STATUS_EXECUTION_FAILED:
+      return "CUBLAS_STATUS_EXECUTION_FAILED";
+
+    case CUBLAS_STATUS_INTERNAL_ERROR:
+      return "CUBLAS_STATUS_INTERNAL_ERROR";
+
+    case CUBLAS_STATUS_NOT_SUPPORTED:
+      return "CUBLAS_STATUS_NOT_SUPPORTED";
+
+    case CUBLAS_STATUS_LICENSE_ERROR:
+      return "CUBLAS_STATUS_LICENSE_ERROR";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef _CUFFT_H_
+// cuFFT API errors
+static const char *_cudaGetErrorEnum(cufftResult error) {
+  switch (error) {
+    case CUFFT_SUCCESS:
+      return "CUFFT_SUCCESS";
+
+    case CUFFT_INVALID_PLAN:
+      return "CUFFT_INVALID_PLAN";
+
+    case CUFFT_ALLOC_FAILED:
+      return "CUFFT_ALLOC_FAILED";
+
+    case CUFFT_INVALID_TYPE:
+      return "CUFFT_INVALID_TYPE";
+
+    case CUFFT_INVALID_VALUE:
+      return "CUFFT_INVALID_VALUE";
+
+    case CUFFT_INTERNAL_ERROR:
+      return "CUFFT_INTERNAL_ERROR";
+
+    case CUFFT_EXEC_FAILED:
+      return "CUFFT_EXEC_FAILED";
+
+    case CUFFT_SETUP_FAILED:
+      return "CUFFT_SETUP_FAILED";
+
+    case CUFFT_INVALID_SIZE:
+      return "CUFFT_INVALID_SIZE";
+
+    case CUFFT_UNALIGNED_DATA:
+      return "CUFFT_UNALIGNED_DATA";
+
+    case CUFFT_INCOMPLETE_PARAMETER_LIST:
+      return "CUFFT_INCOMPLETE_PARAMETER_LIST";
+
+    case CUFFT_INVALID_DEVICE:
+      return "CUFFT_INVALID_DEVICE";
+
+    case CUFFT_PARSE_ERROR:
+      return "CUFFT_PARSE_ERROR";
+
+    case CUFFT_NO_WORKSPACE:
+      return "CUFFT_NO_WORKSPACE";
+
+    case CUFFT_NOT_IMPLEMENTED:
+      return "CUFFT_NOT_IMPLEMENTED";
+
+    case CUFFT_LICENSE_ERROR:
+      return "CUFFT_LICENSE_ERROR";
+
+    case CUFFT_NOT_SUPPORTED:
+      return "CUFFT_NOT_SUPPORTED";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef CUSPARSEAPI
+// cuSPARSE API errors
+static const char *_cudaGetErrorEnum(cusparseStatus_t error) {
+  switch (error) {
+    case CUSPARSE_STATUS_SUCCESS:
+      return "CUSPARSE_STATUS_SUCCESS";
+
+    case CUSPARSE_STATUS_NOT_INITIALIZED:
+      return "CUSPARSE_STATUS_NOT_INITIALIZED";
+
+    case CUSPARSE_STATUS_ALLOC_FAILED:
+      return "CUSPARSE_STATUS_ALLOC_FAILED";
+
+    case CUSPARSE_STATUS_INVALID_VALUE:
+      return "CUSPARSE_STATUS_INVALID_VALUE";
+
+    case CUSPARSE_STATUS_ARCH_MISMATCH:
+      return "CUSPARSE_STATUS_ARCH_MISMATCH";
+
+    case CUSPARSE_STATUS_MAPPING_ERROR:
+      return "CUSPARSE_STATUS_MAPPING_ERROR";
+
+    case CUSPARSE_STATUS_EXECUTION_FAILED:
+      return "CUSPARSE_STATUS_EXECUTION_FAILED";
+
+    case CUSPARSE_STATUS_INTERNAL_ERROR:
+      return "CUSPARSE_STATUS_INTERNAL_ERROR";
+
+    case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+      return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef CUSOLVER_COMMON_H_
+// cuSOLVER API errors
+static const char *_cudaGetErrorEnum(cusolverStatus_t error) {
+  switch (error) {
+    case CUSOLVER_STATUS_SUCCESS:
+      return "CUSOLVER_STATUS_SUCCESS";
+    case CUSOLVER_STATUS_NOT_INITIALIZED:
+      return "CUSOLVER_STATUS_NOT_INITIALIZED";
+    case CUSOLVER_STATUS_ALLOC_FAILED:
+      return "CUSOLVER_STATUS_ALLOC_FAILED";
+    case CUSOLVER_STATUS_INVALID_VALUE:
+      return "CUSOLVER_STATUS_INVALID_VALUE";
+    case CUSOLVER_STATUS_ARCH_MISMATCH:
+      return "CUSOLVER_STATUS_ARCH_MISMATCH";
+    case CUSOLVER_STATUS_MAPPING_ERROR:
+      return "CUSOLVER_STATUS_MAPPING_ERROR";
+    case CUSOLVER_STATUS_EXECUTION_FAILED:
+      return "CUSOLVER_STATUS_EXECUTION_FAILED";
+    case CUSOLVER_STATUS_INTERNAL_ERROR:
+      return "CUSOLVER_STATUS_INTERNAL_ERROR";
+    case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+      return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+    case CUSOLVER_STATUS_NOT_SUPPORTED:
+      return "CUSOLVER_STATUS_NOT_SUPPORTED ";
+    case CUSOLVER_STATUS_ZERO_PIVOT:
+      return "CUSOLVER_STATUS_ZERO_PIVOT";
+    case CUSOLVER_STATUS_INVALID_LICENSE:
+      return "CUSOLVER_STATUS_INVALID_LICENSE";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef CURAND_H_
+// cuRAND API errors
+static const char *_cudaGetErrorEnum(curandStatus_t error) {
+  switch (error) {
+    case CURAND_STATUS_SUCCESS:
+      return "CURAND_STATUS_SUCCESS";
+
+    case CURAND_STATUS_VERSION_MISMATCH:
+      return "CURAND_STATUS_VERSION_MISMATCH";
+
+    case CURAND_STATUS_NOT_INITIALIZED:
+      return "CURAND_STATUS_NOT_INITIALIZED";
+
+    case CURAND_STATUS_ALLOCATION_FAILED:
+      return "CURAND_STATUS_ALLOCATION_FAILED";
+
+    case CURAND_STATUS_TYPE_ERROR:
+      return "CURAND_STATUS_TYPE_ERROR";
+
+    case CURAND_STATUS_OUT_OF_RANGE:
+      return "CURAND_STATUS_OUT_OF_RANGE";
+
+    case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
+      return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
+
+    case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
+      return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
+
+    case CURAND_STATUS_LAUNCH_FAILURE:
+      return "CURAND_STATUS_LAUNCH_FAILURE";
+
+    case CURAND_STATUS_PREEXISTING_FAILURE:
+      return "CURAND_STATUS_PREEXISTING_FAILURE";
+
+    case CURAND_STATUS_INITIALIZATION_FAILED:
+      return "CURAND_STATUS_INITIALIZATION_FAILED";
+
+    case CURAND_STATUS_ARCH_MISMATCH:
+      return "CURAND_STATUS_ARCH_MISMATCH";
+
+    case CURAND_STATUS_INTERNAL_ERROR:
+      return "CURAND_STATUS_INTERNAL_ERROR";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef NVJPEGAPI
+// nvJPEG API errors
+static const char *_cudaGetErrorEnum(nvjpegStatus_t error) {
+  switch (error) {
+    case NVJPEG_STATUS_SUCCESS:
+      return "NVJPEG_STATUS_SUCCESS";
+
+    case NVJPEG_STATUS_NOT_INITIALIZED:
+      return "NVJPEG_STATUS_NOT_INITIALIZED";
+
+    case NVJPEG_STATUS_INVALID_PARAMETER:
+      return "NVJPEG_STATUS_INVALID_PARAMETER";
+
+    case NVJPEG_STATUS_BAD_JPEG:
+      return "NVJPEG_STATUS_BAD_JPEG";
+
+    case NVJPEG_STATUS_JPEG_NOT_SUPPORTED:
+      return "NVJPEG_STATUS_JPEG_NOT_SUPPORTED";
+
+    case NVJPEG_STATUS_ALLOCATOR_FAILURE:
+      return "NVJPEG_STATUS_ALLOCATOR_FAILURE";
+
+    case NVJPEG_STATUS_EXECUTION_FAILED:
+      return "NVJPEG_STATUS_EXECUTION_FAILED";
+
+    case NVJPEG_STATUS_ARCH_MISMATCH:
+      return "NVJPEG_STATUS_ARCH_MISMATCH";
+
+    case NVJPEG_STATUS_INTERNAL_ERROR:
+      return "NVJPEG_STATUS_INTERNAL_ERROR";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef NV_NPPIDEFS_H
+// NPP API errors
+static const char *_cudaGetErrorEnum(NppStatus error) {
+  switch (error) {
+    case NPP_NOT_SUPPORTED_MODE_ERROR:
+      return "NPP_NOT_SUPPORTED_MODE_ERROR";
+
+    case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR:
+      return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR";
+
+    case NPP_RESIZE_NO_OPERATION_ERROR:
+      return "NPP_RESIZE_NO_OPERATION_ERROR";
+
+    case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY:
+      return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY";
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
+
+    case NPP_BAD_ARG_ERROR:
+      return "NPP_BAD_ARGUMENT_ERROR";
+
+    case NPP_COEFF_ERROR:
+      return "NPP_COEFFICIENT_ERROR";
+
+    case NPP_RECT_ERROR:
+      return "NPP_RECTANGLE_ERROR";
+
+    case NPP_QUAD_ERROR:
+      return "NPP_QUADRANGLE_ERROR";
+
+    case NPP_MEM_ALLOC_ERR:
+      return "NPP_MEMORY_ALLOCATION_ERROR";
+
+    case NPP_HISTO_NUMBER_OF_LEVELS_ERROR:
+      return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
+
+    case NPP_INVALID_INPUT:
+      return "NPP_INVALID_INPUT";
+
+    case NPP_POINTER_ERROR:
+      return "NPP_POINTER_ERROR";
+
+    case NPP_WARNING:
+      return "NPP_WARNING";
+
+    case NPP_ODD_ROI_WARNING:
+      return "NPP_ODD_ROI_WARNING";
+#else
+
+    // These are for CUDA 5.5 or higher
+    case NPP_BAD_ARGUMENT_ERROR:
+      return "NPP_BAD_ARGUMENT_ERROR";
+
+    case NPP_COEFFICIENT_ERROR:
+      return "NPP_COEFFICIENT_ERROR";
+
+    case NPP_RECTANGLE_ERROR:
+      return "NPP_RECTANGLE_ERROR";
+
+    case NPP_QUADRANGLE_ERROR:
+      return "NPP_QUADRANGLE_ERROR";
+
+    case NPP_MEMORY_ALLOCATION_ERR:
+      return "NPP_MEMORY_ALLOCATION_ERROR";
+
+    case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR:
+      return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
+
+    case NPP_INVALID_HOST_POINTER_ERROR:
+      return "NPP_INVALID_HOST_POINTER_ERROR";
+
+    case NPP_INVALID_DEVICE_POINTER_ERROR:
+      return "NPP_INVALID_DEVICE_POINTER_ERROR";
+#endif
+
+    case NPP_LUT_NUMBER_OF_LEVELS_ERROR:
+      return "NPP_LUT_NUMBER_OF_LEVELS_ERROR";
+
+    case NPP_TEXTURE_BIND_ERROR:
+      return "NPP_TEXTURE_BIND_ERROR";
+
+    case NPP_WRONG_INTERSECTION_ROI_ERROR:
+      return "NPP_WRONG_INTERSECTION_ROI_ERROR";
+
+    case NPP_NOT_EVEN_STEP_ERROR:
+      return "NPP_NOT_EVEN_STEP_ERROR";
+
+    case NPP_INTERPOLATION_ERROR:
+      return "NPP_INTERPOLATION_ERROR";
+
+    case NPP_RESIZE_FACTOR_ERROR:
+      return "NPP_RESIZE_FACTOR_ERROR";
+
+    case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR:
+      return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR";
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
+
+    case NPP_MEMFREE_ERR:
+      return "NPP_MEMFREE_ERR";
+
+    case NPP_MEMSET_ERR:
+      return "NPP_MEMSET_ERR";
+
+    case NPP_MEMCPY_ERR:
+      return "NPP_MEMCPY_ERROR";
+
+    case NPP_MIRROR_FLIP_ERR:
+      return "NPP_MIRROR_FLIP_ERR";
+#else
+
+    case NPP_MEMFREE_ERROR:
+      return "NPP_MEMFREE_ERROR";
+
+    case NPP_MEMSET_ERROR:
+      return "NPP_MEMSET_ERROR";
+
+    case NPP_MEMCPY_ERROR:
+      return "NPP_MEMCPY_ERROR";
+
+    case NPP_MIRROR_FLIP_ERROR:
+      return "NPP_MIRROR_FLIP_ERROR";
+#endif
+
+    case NPP_ALIGNMENT_ERROR:
+      return "NPP_ALIGNMENT_ERROR";
+
+    case NPP_STEP_ERROR:
+      return "NPP_STEP_ERROR";
+
+    case NPP_SIZE_ERROR:
+      return "NPP_SIZE_ERROR";
+
+    case NPP_NULL_POINTER_ERROR:
+      return "NPP_NULL_POINTER_ERROR";
+
+    case NPP_CUDA_KERNEL_EXECUTION_ERROR:
+      return "NPP_CUDA_KERNEL_EXECUTION_ERROR";
+
+    case NPP_NOT_IMPLEMENTED_ERROR:
+      return "NPP_NOT_IMPLEMENTED_ERROR";
+
+    case NPP_ERROR:
+      return "NPP_ERROR";
+
+    case NPP_SUCCESS:
+      return "NPP_SUCCESS";
+
+    case NPP_WRONG_INTERSECTION_QUAD_WARNING:
+      return "NPP_WRONG_INTERSECTION_QUAD_WARNING";
+
+    case NPP_MISALIGNED_DST_ROI_WARNING:
+      return "NPP_MISALIGNED_DST_ROI_WARNING";
+
+    case NPP_AFFINE_QUAD_INCORRECT_WARNING:
+      return "NPP_AFFINE_QUAD_INCORRECT_WARNING";
+
+    case NPP_DOUBLE_SIZE_WARNING:
+      return "NPP_DOUBLE_SIZE_WARNING";
+
+    case NPP_WRONG_INTERSECTION_ROI_WARNING:
+      return "NPP_WRONG_INTERSECTION_ROI_WARNING";
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000
+    /* These are 6.0 or higher */
+    case NPP_LUT_PALETTE_BITSIZE_ERROR:
+      return "NPP_LUT_PALETTE_BITSIZE_ERROR";
+
+    case NPP_ZC_MODE_NOT_SUPPORTED_ERROR:
+      return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR";
+
+    case NPP_QUALITY_INDEX_ERROR:
+      return "NPP_QUALITY_INDEX_ERROR";
+
+    case NPP_CHANNEL_ORDER_ERROR:
+      return "NPP_CHANNEL_ORDER_ERROR";
+
+    case NPP_ZERO_MASK_VALUE_ERROR:
+      return "NPP_ZERO_MASK_VALUE_ERROR";
+
+    case NPP_NUMBER_OF_CHANNELS_ERROR:
+      return "NPP_NUMBER_OF_CHANNELS_ERROR";
+
+    case NPP_COI_ERROR:
+      return "NPP_COI_ERROR";
+
+    case NPP_DIVISOR_ERROR:
+      return "NPP_DIVISOR_ERROR";
+
+    case NPP_CHANNEL_ERROR:
+      return "NPP_CHANNEL_ERROR";
+
+    case NPP_STRIDE_ERROR:
+      return "NPP_STRIDE_ERROR";
+
+    case NPP_ANCHOR_ERROR:
+      return "NPP_ANCHOR_ERROR";
+
+    case NPP_MASK_SIZE_ERROR:
+      return "NPP_MASK_SIZE_ERROR";
+
+    case NPP_MOMENT_00_ZERO_ERROR:
+      return "NPP_MOMENT_00_ZERO_ERROR";
+
+    case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR:
+      return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR";
+
+    case NPP_THRESHOLD_ERROR:
+      return "NPP_THRESHOLD_ERROR";
+
+    case NPP_CONTEXT_MATCH_ERROR:
+      return "NPP_CONTEXT_MATCH_ERROR";
+
+    case NPP_FFT_FLAG_ERROR:
+      return "NPP_FFT_FLAG_ERROR";
+
+    case NPP_FFT_ORDER_ERROR:
+      return "NPP_FFT_ORDER_ERROR";
+
+    case NPP_SCALE_RANGE_ERROR:
+      return "NPP_SCALE_RANGE_ERROR";
+
+    case NPP_DATA_TYPE_ERROR:
+      return "NPP_DATA_TYPE_ERROR";
+
+    case NPP_OUT_OFF_RANGE_ERROR:
+      return "NPP_OUT_OFF_RANGE_ERROR";
+
+    case NPP_DIVIDE_BY_ZERO_ERROR:
+      return "NPP_DIVIDE_BY_ZERO_ERROR";
+
+    case NPP_RANGE_ERROR:
+      return "NPP_RANGE_ERROR";
+
+    case NPP_NO_MEMORY_ERROR:
+      return "NPP_NO_MEMORY_ERROR";
+
+    case NPP_ERROR_RESERVED:
+      return "NPP_ERROR_RESERVED";
+
+    case NPP_NO_OPERATION_WARNING:
+      return "NPP_NO_OPERATION_WARNING";
+
+    case NPP_DIVIDE_BY_ZERO_WARNING:
+      return "NPP_DIVIDE_BY_ZERO_WARNING";
+#endif
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x7000
+    /* These are 7.0 or higher */
+    case NPP_OVERFLOW_ERROR:
+      return "NPP_OVERFLOW_ERROR";
+
+    case NPP_CORRUPTED_DATA_ERROR:
+      return "NPP_CORRUPTED_DATA_ERROR";
+#endif
+  }
+
+  return "<unknown>";
+}
+#endif
+
+template <typename T>
+void check(T result, char const *const func, const char *const file,
+           int const line) {
+    if (result) {
+        fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
+                static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
+        exit(EXIT_FAILURE);
+    }
+}
+
+#ifdef __DRIVER_TYPES_H__
+// This will output the proper CUDA error strings in the event
+// that a CUDA host call returns an error
+#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
+
+// This will output the proper error string when calling cudaGetLastError
+#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
+
+inline void __getLastCudaError(const char *errorMessage, const char *file,
+                               const int line) {
+  cudaError_t err = cudaGetLastError();
+
+  if (cudaSuccess != err) {
+    fprintf(stderr,
+            "%s(%i) : getLastCudaError() CUDA error :"
+            " %s : (%d) %s.\n",
+            file, line, errorMessage, static_cast<int>(err),
+            cudaGetErrorString(err));
+    exit(EXIT_FAILURE);
+  }
+}
+
+// This will only print the proper error string when calling cudaGetLastError
+// but not exit program incase error detected.
+#define printLastCudaError(msg) __printLastCudaError(msg, __FILE__, __LINE__)
+
+inline void __printLastCudaError(const char *errorMessage, const char *file,
+                                 const int line) {
+  cudaError_t err = cudaGetLastError();
+
+  if (cudaSuccess != err) {
+    fprintf(stderr,
+            "%s(%i) : getLastCudaError() CUDA error :"
+            " %s : (%d) %s.\n",
+            file, line, errorMessage, static_cast<int>(err),
+            cudaGetErrorString(err));
+  }
+}
+#endif
+
+#ifndef MAX
+#define MAX(a, b) (a > b ? a : b)
+#endif
+
+// Float To Int conversion
+inline int ftoi(float value) {
+    return (value >= 0 ? static_cast<int>(value + 0.5)
+                       : static_cast<int>(value - 0.5));
+}
+
+// Beginning of GPU Architecture definitions
+inline int _ConvertSMVer2Cores(int major, int minor) {
+    // Defines for GPU Architecture types (using the SM version to determine
+    // the # of cores per SM
+    typedef struct {
+        int SM;  // 0xMm (hexidecimal notation), M = SM Major version,
+        // and m = SM minor version
+        int Cores;
+    } sSMtoCores;
+
+    sSMtoCores nGpuArchCoresPerSM[] = {
+        {0x30, 192},
+        {0x32, 192},
+        {0x35, 192},
+        {0x37, 192},
+        {0x50, 128},
+        {0x52, 128},
+        {0x53, 128},
+        {0x60,  64},
+        {0x61, 128},
+        {0x62, 128},
+        {0x70,  64},
+        {0x72,  64},
+        {0x75,  64},
+        {0x80,  64},
+        {0x86, 128},
+        {-1, -1}};
+
+    int index = 0;
+
+    while (nGpuArchCoresPerSM[index].SM != -1) {
+        if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) {
+            return nGpuArchCoresPerSM[index].Cores;
+        }
+
+        index++;
+    }
+
+    // If we don't find the values, we default use the previous one
+    // to run properly
+    printf(
+        "MapSMtoCores for SM %d.%d is undefined."
+        "  Default to use %d Cores/SM\n",
+        major, minor, nGpuArchCoresPerSM[index - 1].Cores);
+    return nGpuArchCoresPerSM[index - 1].Cores;
+}
+
+inline const char* _ConvertSMVer2ArchName(int major, int minor) {
+    // Defines for GPU Architecture types (using the SM version to determine
+    // the GPU Arch name)
+    typedef struct {
+        int SM;  // 0xMm (hexidecimal notation), M = SM Major version,
+        // and m = SM minor version
+        const char* name;
+    } sSMtoArchName;
+
+    sSMtoArchName nGpuArchNameSM[] = {
+        {0x30, "Kepler"},
+        {0x32, "Kepler"},
+        {0x35, "Kepler"},
+        {0x37, "Kepler"},
+        {0x50, "Maxwell"},
+        {0x52, "Maxwell"},
+        {0x53, "Maxwell"},
+        {0x60, "Pascal"},
+        {0x61, "Pascal"},
+        {0x62, "Pascal"},
+        {0x70, "Volta"},
+        {0x72, "Xavier"},
+        {0x75, "Turing"},
+        {0x80, "Ampere"},
+        {0x86, "Ampere"},
+        {-1, "Graphics Device"}};
+
+    int index = 0;
+
+    while (nGpuArchNameSM[index].SM != -1) {
+        if (nGpuArchNameSM[index].SM == ((major << 4) + minor)) {
+            return nGpuArchNameSM[index].name;
+        }
+
+        index++;
+    }
+
+    // If we don't find the values, we default use the previous one
+    // to run properly
+    printf(
+        "MapSMtoArchName for SM %d.%d is undefined."
+        "  Default to use %s\n",
+        major, minor, nGpuArchNameSM[index - 1].name);
+    return nGpuArchNameSM[index - 1].name;
+}
+// end of GPU Architecture definitions
+
+#ifdef __CUDA_RUNTIME_H__
+// General GPU Device CUDA Initialization
+inline int gpuDeviceInit(int devID) {
+  int device_count;
+  checkCudaErrors(cudaGetDeviceCount(&device_count));
+
+  if (device_count == 0) {
+    fprintf(stderr,
+            "gpuDeviceInit() CUDA error: "
+            "no devices supporting CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  if (devID < 0) {
+    devID = 0;
+  }
+
+  if (devID > device_count - 1) {
+    fprintf(stderr, "\n");
+    fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n",
+            device_count);
+    fprintf(stderr,
+            ">> gpuDeviceInit (-device=%d) is not a valid"
+            " GPU device. <<\n",
+            devID);
+    fprintf(stderr, "\n");
+    return -devID;
+  }
+
+  int computeMode = -1, major = 0, minor = 0;
+  checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, devID));
+  checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID));
+  checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID));
+  if (computeMode == cudaComputeModeProhibited) {
+    fprintf(stderr,
+            "Error: device is running in <Compute Mode "
+            "Prohibited>, no threads can use cudaSetDevice().\n");
+    return -1;
+  }
+
+  if (major < 1) {
+    fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  checkCudaErrors(cudaSetDevice(devID));
+  printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, _ConvertSMVer2ArchName(major, minor));
+
+  return devID;
+}
+
+// This function returns the best GPU (with maximum GFLOPS)
+inline int gpuGetMaxGflopsDeviceId() {
+  int current_device = 0, sm_per_multiproc = 0;
+  int max_perf_device = 0;
+  int device_count = 0;
+  int devices_prohibited = 0;
+
+  uint64_t max_compute_perf = 0;
+  checkCudaErrors(cudaGetDeviceCount(&device_count));
+
+  if (device_count == 0) {
+    fprintf(stderr,
+            "gpuGetMaxGflopsDeviceId() CUDA error:"
+            " no devices supporting CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  // Find the best CUDA capable GPU device
+  current_device = 0;
+
+  while (current_device < device_count) {
+    int computeMode = -1, major = 0, minor = 0;
+    checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, current_device));
+    checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, current_device));
+    checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, current_device));
+
+    // If this GPU is not running on Compute Mode prohibited,
+    // then we can add it to the list
+    if (computeMode != cudaComputeModeProhibited) {
+      if (major == 9999 && minor == 9999) {
+        sm_per_multiproc = 1;
+      } else {
+        sm_per_multiproc =
+            _ConvertSMVer2Cores(major,  minor);
+      }
+      int multiProcessorCount = 0, clockRate = 0;
+      checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, current_device));
+      cudaError_t result = cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, current_device);
+      if (result != cudaSuccess) {
+        // If cudaDevAttrClockRate attribute is not supported we
+        // set clockRate as 1, to consider GPU with most SMs and CUDA Cores.
+        if(result == cudaErrorInvalidValue) {
+          clockRate = 1;
+        }
+        else {
+          fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \n", __FILE__, __LINE__,
+            static_cast<unsigned int>(result), _cudaGetErrorEnum(result));
+          exit(EXIT_FAILURE);
+        }
+      }
+      uint64_t compute_perf = (uint64_t)multiProcessorCount * sm_per_multiproc * clockRate;
+
+      if (compute_perf > max_compute_perf) {
+        max_compute_perf = compute_perf;
+        max_perf_device = current_device;
+      }
+    } else {
+      devices_prohibited++;
+    }
+
+    ++current_device;
+  }
+
+  if (devices_prohibited == device_count) {
+    fprintf(stderr,
+            "gpuGetMaxGflopsDeviceId() CUDA error:"
+            " all devices have compute mode prohibited.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  return max_perf_device;
+}
+
+// Initialization code to find the best CUDA Device
+inline int findCudaDevice(int argc, const char **argv) {
+  int devID = 0;
+
+  // If the command-line has a device number specified, use it
+  if (checkCmdLineFlag(argc, argv, "device")) {
+    devID = getCmdLineArgumentInt(argc, argv, "device=");
+
+    if (devID < 0) {
+      printf("Invalid command line parameter\n ");
+      exit(EXIT_FAILURE);
+    } else {
+      devID = gpuDeviceInit(devID);
+
+      if (devID < 0) {
+        printf("exiting...\n");
+        exit(EXIT_FAILURE);
+      }
+    }
+  } else {
+    // Otherwise pick the device with highest Gflops/s
+    devID = gpuGetMaxGflopsDeviceId();
+    checkCudaErrors(cudaSetDevice(devID));
+    int major = 0, minor = 0;
+    checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID));
+    checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID));
+    printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
+           devID, _ConvertSMVer2ArchName(major, minor), major, minor);
+
+  }
+
+  return devID;
+}
+
+inline int findIntegratedGPU() {
+  int current_device = 0;
+  int device_count = 0;
+  int devices_prohibited = 0;
+
+  checkCudaErrors(cudaGetDeviceCount(&device_count));
+
+  if (device_count == 0) {
+    fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  // Find the integrated GPU which is compute capable
+  while (current_device < device_count) {
+    int computeMode = -1, integrated = -1;
+    checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, current_device));
+    checkCudaErrors(cudaDeviceGetAttribute(&integrated, cudaDevAttrIntegrated, current_device));
+    // If GPU is integrated and is not running on Compute Mode prohibited,
+    // then cuda can map to GLES resource
+    if (integrated && (computeMode != cudaComputeModeProhibited)) {
+      checkCudaErrors(cudaSetDevice(current_device));
+
+      int major = 0, minor = 0;
+      checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, current_device));
+      checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, current_device));
+      printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
+             current_device, _ConvertSMVer2ArchName(major, minor), major, minor);
+
+      return current_device;
+    } else {
+      devices_prohibited++;
+    }
+
+    current_device++;
+  }
+
+  if (devices_prohibited == device_count) {
+    fprintf(stderr,
+            "CUDA error:"
+            " No GLES-CUDA Interop capable GPU found.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  return -1;
+}
+
+// General check for CUDA GPU SM Capabilities
+inline bool checkCudaCapabilities(int major_version, int minor_version) {
+  int dev;
+  int major = 0, minor = 0;
+
+  checkCudaErrors(cudaGetDevice(&dev));
+  checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));
+  checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, dev));
+
+  if ((major > major_version) ||
+      (major == major_version &&
+       minor >= minor_version)) {
+    printf("  Device %d: <%16s >, Compute SM %d.%d detected\n", dev,
+           _ConvertSMVer2ArchName(major, minor), major, minor);
+    return true;
+  } else {
+    printf(
+        "  No GPU device was found that can support "
+        "CUDA compute capability %d.%d.\n",
+        major_version, minor_version);
+    return false;
+  }
+}
+#endif
+
+// end of CUDA Helper Functions
+
+#endif  // COMMON_HELPER_CUDA_H_
\ No newline at end of file
diff --git a/3rdParty/cuda_samples/helper_functions.h b/3rdParty/cuda_samples/helper_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..3fc2ea47ba7d39a4bf6a882f65b16779de6de0ac
--- /dev/null
+++ b/3rdParty/cuda_samples/helper_functions.h
@@ -0,0 +1,59 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// These are helper functions for the SDK samples (string parsing,
+// timers, image helpers, etc)
+#ifndef COMMON_HELPER_FUNCTIONS_H_
+#define COMMON_HELPER_FUNCTIONS_H_
+
+#ifdef WIN32
+#pragma warning(disable : 4996)
+#endif
+
+// includes, project
+#include <assert.h>
+#include <exception.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+// includes, timer, string parsing, image helpers
+#include <helper_image.h>  // helper functions for image compare, dump, data comparisons
+#include <helper_string.h>  // helper functions for string parsing
+#include <helper_timer.h>   // helper functions for timers
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+#endif  // COMMON_HELPER_FUNCTIONS_H_
\ No newline at end of file
diff --git a/3rdParty/cuda_samples/helper_image.h b/3rdParty/cuda_samples/helper_image.h
new file mode 100644
index 0000000000000000000000000000000000000000..eb7190c21b39463222beb579337d14c1a54b0000
--- /dev/null
+++ b/3rdParty/cuda_samples/helper_image.h
@@ -0,0 +1,1009 @@
+//
+// Created by Soeren Peters on 05.02.21.
+//
+
+#ifndef VIRTUALFLUIDS_HELPER_IMAGE_H
+#define VIRTUALFLUIDS_HELPER_IMAGE_H
+
+#endif // VIRTUALFLUIDS_HELPER_IMAGE_H
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// These are helper functions for the SDK samples (image,bitmap)
+#ifndef COMMON_HELPER_IMAGE_H_
+#define COMMON_HELPER_IMAGE_H_
+
+#include <assert.h>
+#include <exception.h>
+#include <math.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#ifndef MIN
+#define MIN(a, b) ((a < b) ? a : b)
+#endif
+#ifndef MAX
+#define MAX(a, b) ((a > b) ? a : b)
+#endif
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+#include <helper_string.h>
+
+// namespace unnamed (internal)
+namespace helper_image_internal {
+//! size of PGM file header
+const unsigned int PGMHeaderSize = 0x40;
+
+// types
+
+//! Data converter from unsigned char / unsigned byte to type T
+template <class T>
+struct ConverterFromUByte;
+
+//! Data converter from unsigned char / unsigned byte
+template <>
+struct ConverterFromUByte<unsigned char> {
+    //! Conversion operator
+    //! @return converted value
+    //! @param  val  value to convert
+    float operator()(const unsigned char &val) {
+        return static_cast<unsigned char>(val);
+    }
+};
+
+//! Data converter from unsigned char / unsigned byte to float
+template <>
+struct ConverterFromUByte<float> {
+    //! Conversion operator
+    //! @return converted value
+    //! @param  val  value to convert
+    float operator()(const unsigned char &val) {
+        return static_cast<float>(val) / 255.0f;
+    }
+};
+
+//! Data converter from unsigned char / unsigned byte to type T
+template <class T>
+struct ConverterToUByte;
+
+//! Data converter from unsigned char / unsigned byte to unsigned int
+template <>
+struct ConverterToUByte<unsigned char> {
+    //! Conversion operator (essentially a passthru
+    //! @return converted value
+    //! @param  val  value to convert
+    unsigned char operator()(const unsigned char &val) { return val; }
+};
+
+//! Data converter from unsigned char / unsigned byte to unsigned int
+template <>
+struct ConverterToUByte<float> {
+    //! Conversion operator
+    //! @return converted value
+    //! @param  val  value to convert
+    unsigned char operator()(const float &val) {
+        return static_cast<unsigned char>(val * 255.0f);
+    }
+};
+}  // namespace helper_image_internal
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+#ifndef FOPEN
+#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode)
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result != 0)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf_s
+#endif
+#else
+#ifndef FOPEN
+#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode))
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result == NULL)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf
+#endif
+#endif
+
+inline bool __loadPPM(const char *file, unsigned char **data, unsigned int *w,
+                      unsigned int *h, unsigned int *channels) {
+    FILE *fp = NULL;
+
+    if (FOPEN_FAIL(FOPEN(fp, file, "rb"))) {
+        std::cerr << "__LoadPPM() : Failed to open file: " << file << std::endl;
+        return false;
+    }
+
+    // check header
+    char header[helper_image_internal::PGMHeaderSize];
+
+    if (fgets(header, helper_image_internal::PGMHeaderSize, fp) == NULL) {
+        std::cerr << "__LoadPPM() : reading PGM header returned NULL" << std::endl;
+        return false;
+    }
+
+    if (strncmp(header, "P5", 2) == 0) {
+        *channels = 1;
+    } else if (strncmp(header, "P6", 2) == 0) {
+        *channels = 3;
+    } else {
+        std::cerr << "__LoadPPM() : File is not a PPM or PGM image" << std::endl;
+        *channels = 0;
+        return false;
+    }
+
+    // parse header, read maxval, width and height
+    unsigned int width = 0;
+    unsigned int height = 0;
+    unsigned int maxval = 0;
+    unsigned int i = 0;
+
+    while (i < 3) {
+        if (fgets(header, helper_image_internal::PGMHeaderSize, fp) == NULL) {
+            std::cerr << "__LoadPPM() : reading PGM header returned NULL"
+                      << std::endl;
+            return false;
+        }
+
+        if (header[0] == '#') {
+            continue;
+        }
+
+        if (i == 0) {
+            i += SSCANF(header, "%u %u %u", &width, &height, &maxval);
+        } else if (i == 1) {
+            i += SSCANF(header, "%u %u", &height, &maxval);
+        } else if (i == 2) {
+            i += SSCANF(header, "%u", &maxval);
+        }
+    }
+
+    // check if given handle for the data is initialized
+    if (NULL != *data) {
+        if (*w != width || *h != height) {
+            std::cerr << "__LoadPPM() : Invalid image dimensions." << std::endl;
+        }
+    } else {
+        *data = (unsigned char *)malloc(sizeof(unsigned char) * width * height *
+                                        *channels);
+        *w = width;
+        *h = height;
+    }
+
+    // read and close file
+    if (fread(*data, sizeof(unsigned char), width * height * *channels, fp) ==
+        0) {
+        std::cerr << "__LoadPPM() read data returned error." << std::endl;
+    }
+
+    fclose(fp);
+
+    return true;
+}
+
+template <class T>
+inline bool sdkLoadPGM(const char *file, T **data, unsigned int *w,
+                       unsigned int *h) {
+    unsigned char *idata = NULL;
+    unsigned int channels;
+
+    if (true != __loadPPM(file, &idata, w, h, &channels)) {
+        return false;
+    }
+
+    unsigned int size = *w * *h * channels;
+
+    // initialize mem if necessary
+    // the correct size is checked / set in loadPGMc()
+    if (NULL == *data) {
+        *data = reinterpret_cast<T *>(malloc(sizeof(T) * size));
+    }
+
+    // copy and cast data
+    std::transform(idata, idata + size, *data,
+                   helper_image_internal::ConverterFromUByte<T>());
+
+    free(idata);
+
+    return true;
+}
+
+template <class T>
+inline bool sdkLoadPPM4(const char *file, T **data, unsigned int *w,
+                        unsigned int *h) {
+    unsigned char *idata = 0;
+    unsigned int channels;
+
+    if (__loadPPM(file, &idata, w, h, &channels)) {
+        // pad 4th component
+        int size = *w * *h;
+        // keep the original pointer
+        unsigned char *idata_orig = idata;
+        *data = reinterpret_cast<T *>(malloc(sizeof(T) * size * 4));
+        unsigned char *ptr = *data;
+
+        for (int i = 0; i < size; i++) {
+            *ptr++ = *idata++;
+            *ptr++ = *idata++;
+            *ptr++ = *idata++;
+            *ptr++ = 0;
+        }
+
+        free(idata_orig);
+        return true;
+    } else {
+        free(idata);
+        return false;
+    }
+}
+
+inline bool __savePPM(const char *file, unsigned char *data, unsigned int w,
+                      unsigned int h, unsigned int channels) {
+    assert(NULL != data);
+    assert(w > 0);
+    assert(h > 0);
+
+    std::fstream fh(file, std::fstream::out | std::fstream::binary);
+
+    if (fh.bad()) {
+        std::cerr << "__savePPM() : Opening file failed." << std::endl;
+        return false;
+    }
+
+    if (channels == 1) {
+        fh << "P5\n";
+    } else if (channels == 3) {
+        fh << "P6\n";
+    } else {
+        std::cerr << "__savePPM() : Invalid number of channels." << std::endl;
+        return false;
+    }
+
+    fh << w << "\n" << h << "\n" << 0xff << std::endl;
+
+    for (unsigned int i = 0; (i < (w * h * channels)) && fh.good(); ++i) {
+        fh << data[i];
+    }
+
+    fh.flush();
+
+    if (fh.bad()) {
+        std::cerr << "__savePPM() : Writing data failed." << std::endl;
+        return false;
+    }
+
+    fh.close();
+
+    return true;
+}
+
+template <class T>
+inline bool sdkSavePGM(const char *file, T *data, unsigned int w,
+                       unsigned int h) {
+    unsigned int size = w * h;
+    unsigned char *idata = (unsigned char *)malloc(sizeof(unsigned char) * size);
+
+    std::transform(data, data + size, idata,
+                   helper_image_internal::ConverterToUByte<T>());
+
+    // write file
+    bool result = __savePPM(file, idata, w, h, 1);
+
+    // cleanup
+    free(idata);
+
+    return result;
+}
+
+inline bool sdkSavePPM4ub(const char *file, unsigned char *data, unsigned int w,
+                          unsigned int h) {
+    // strip 4th component
+    int size = w * h;
+    unsigned char *ndata =
+        (unsigned char *)malloc(sizeof(unsigned char) * size * 3);
+    unsigned char *ptr = ndata;
+
+    for (int i = 0; i < size; i++) {
+        *ptr++ = *data++;
+        *ptr++ = *data++;
+        *ptr++ = *data++;
+        data++;
+    }
+
+    bool result = __savePPM(file, ndata, w, h, 3);
+    free(ndata);
+    return result;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Read file \filename and return the data
+//! @return bool if reading the file succeeded, otherwise false
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//////////////////////////////////////////////////////////////////////////////
+template <class T>
+inline bool sdkReadFile(const char *filename, T **data, unsigned int *len,
+                        bool verbose) {
+    // check input arguments
+    assert(NULL != filename);
+    assert(NULL != len);
+
+    // intermediate storage for the data read
+    std::vector<T> data_read;
+
+    // open file for reading
+    FILE *fh = NULL;
+
+    // check if filestream is valid
+    if (FOPEN_FAIL(FOPEN(fh, filename, "r"))) {
+        printf("Unable to open input file: %s\n", filename);
+        return false;
+    }
+
+    // read all data elements
+    T token;
+
+    while (!feof(fh)) {
+        fscanf(fh, "%f", &token);
+        data_read.push_back(token);
+    }
+
+    // the last element is read twice
+    data_read.pop_back();
+    fclose(fh);
+
+    // check if the given handle is already initialized
+    if (NULL != *data) {
+        if (*len != data_read.size()) {
+            std::cerr << "sdkReadFile() : Initialized memory given but "
+                      << "size  mismatch with signal read "
+                      << "(data read / data init = " << (unsigned int)data_read.size()
+                      << " / " << *len << ")" << std::endl;
+
+            return false;
+        }
+    } else {
+        // allocate storage for the data read
+        *data = reinterpret_cast<T *>(malloc(sizeof(T) * data_read.size()));
+        // store signal size
+        *len = static_cast<unsigned int>(data_read.size());
+    }
+
+    // copy data
+    memcpy(*data, &data_read.front(), sizeof(T) * data_read.size());
+
+    return true;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Read file \filename and return the data
+//! @return bool if reading the file succeeded, otherwise false
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//////////////////////////////////////////////////////////////////////////////
+template <class T>
+inline bool sdkReadFileBlocks(const char *filename, T **data, unsigned int *len,
+                              unsigned int block_num, unsigned int block_size,
+                              bool verbose) {
+    // check input arguments
+    assert(NULL != filename);
+    assert(NULL != len);
+
+    // open file for reading
+    FILE *fh = fopen(filename, "rb");
+
+    if (fh == NULL && verbose) {
+        std::cerr << "sdkReadFile() : Opening file failed." << std::endl;
+        return false;
+    }
+
+    // check if the given handle is already initialized
+    // allocate storage for the data read
+    data[block_num] = reinterpret_cast<T *>(malloc(block_size));
+
+    // read all data elements
+    fseek(fh, block_num * block_size, SEEK_SET);
+    *len = fread(data[block_num], sizeof(T), block_size / sizeof(T), fh);
+
+    fclose(fh);
+
+    return true;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename
+//! @return true if writing the file succeeded, otherwise false
+//! @param filename name of the source file
+//! @param data  data to write
+//! @param len  number of data elements in data, -1 on error
+//! @param epsilon  epsilon for comparison
+//////////////////////////////////////////////////////////////////////////////
+template <class T, class S>
+inline bool sdkWriteFile(const char *filename, const T *data, unsigned int len,
+                         const S epsilon, bool verbose, bool append = false) {
+    assert(NULL != filename);
+    assert(NULL != data);
+
+    // open file for writing
+    //    if (append) {
+    std::fstream fh(filename, std::fstream::out | std::fstream::ate);
+
+    if (verbose) {
+        std::cerr << "sdkWriteFile() : Open file " << filename
+                  << " for write/append." << std::endl;
+    }
+
+    /*    } else {
+            std::fstream fh(filename, std::fstream::out);
+            if (verbose) {
+                std::cerr << "sdkWriteFile() : Open file " << filename << " for
+       write." << std::endl;
+            }
+        }
+    */
+
+    // check if filestream is valid
+    if (!fh.good()) {
+        if (verbose) {
+            std::cerr << "sdkWriteFile() : Opening file failed." << std::endl;
+        }
+
+        return false;
+    }
+
+    // first write epsilon
+    fh << "# " << epsilon << "\n";
+
+    // write data
+    for (unsigned int i = 0; (i < len) && (fh.good()); ++i) {
+        fh << data[i] << ' ';
+    }
+
+    // Check if writing succeeded
+    if (!fh.good()) {
+        if (verbose) {
+            std::cerr << "sdkWriteFile() : Writing file failed." << std::endl;
+        }
+
+        return false;
+    }
+
+    // file ends with nl
+    fh << std::endl;
+
+    return true;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Compare two arrays of arbitrary type
+//! @return  true if \a reference and \a data are identical, otherwise false
+//! @param reference  timer_interface to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+//////////////////////////////////////////////////////////////////////////////
+template <class T, class S>
+inline bool compareData(const T *reference, const T *data,
+                        const unsigned int len, const S epsilon,
+                        const float threshold) {
+    assert(epsilon >= 0);
+
+    bool result = true;
+    unsigned int error_count = 0;
+
+    for (unsigned int i = 0; i < len; ++i) {
+        float diff = static_cast<float>(reference[i]) - static_cast<float>(data[i]);
+        bool comp = (diff <= epsilon) && (diff >= -epsilon);
+        result &= comp;
+
+        error_count += !comp;
+
+#if 0
+
+        if (!comp) {
+      std::cerr << "ERROR, i = " << i << ",\t "
+                << reference[i] << " / "
+                << data[i]
+                << " (reference / data)\n";
+    }
+
+#endif
+    }
+
+    if (threshold == 0.0f) {
+        return (result) ? true : false;
+    } else {
+        if (error_count) {
+            printf("%4.2f(%%) of bytes mismatched (count=%d)\n",
+                   static_cast<float>(error_count) * 100 / static_cast<float>(len),
+                   error_count);
+        }
+
+        return (len * threshold > error_count) ? true : false;
+    }
+}
+
+#ifndef __MIN_EPSILON_ERROR
+#define __MIN_EPSILON_ERROR 1e-3f
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+//! Compare two arrays of arbitrary type
+//! @return  true if \a reference and \a data are identical, otherwise false
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+//! @param epsilon    threshold % of (# of bytes) for pass/fail
+//////////////////////////////////////////////////////////////////////////////
+template <class T, class S>
+inline bool compareDataAsFloatThreshold(const T *reference, const T *data,
+                                        const unsigned int len, const S epsilon,
+                                        const float threshold) {
+    assert(epsilon >= 0);
+
+    // If we set epsilon to be 0, let's set a minimum threshold
+    float max_error = MAX((float)epsilon, __MIN_EPSILON_ERROR);
+    int error_count = 0;
+    bool result = true;
+
+    for (unsigned int i = 0; i < len; ++i) {
+        float diff =
+            fabs(static_cast<float>(reference[i]) - static_cast<float>(data[i]));
+        bool comp = (diff < max_error);
+        result &= comp;
+
+        if (!comp) {
+            error_count++;
+        }
+    }
+
+    if (threshold == 0.0f) {
+        if (error_count) {
+            printf("total # of errors = %d\n", error_count);
+        }
+
+        return (error_count == 0) ? true : false;
+    } else {
+        if (error_count) {
+            printf("%4.2f(%%) of bytes mismatched (count=%d)\n",
+                   static_cast<float>(error_count) * 100 / static_cast<float>(len),
+                   error_count);
+        }
+
+        return ((len * threshold > error_count) ? true : false);
+    }
+}
+
+inline void sdkDumpBin(void *data, unsigned int bytes, const char *filename) {
+    printf("sdkDumpBin: <%s>\n", filename);
+    FILE *fp;
+    FOPEN(fp, filename, "wb");
+    fwrite(data, bytes, 1, fp);
+    fflush(fp);
+    fclose(fp);
+}
+
+inline bool sdkCompareBin2BinUint(const char *src_file, const char *ref_file,
+                                  unsigned int nelements, const float epsilon,
+                                  const float threshold, char *exec_path) {
+    unsigned int *src_buffer, *ref_buffer;
+    FILE *src_fp = NULL, *ref_fp = NULL;
+
+    uint64_t error_count = 0;
+    size_t fsize = 0;
+
+    if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb"))) {
+        printf("compareBin2Bin <unsigned int> unable to open src_file: %s\n",
+               src_file);
+        error_count++;
+    }
+
+    char *ref_file_path = sdkFindFilePath(ref_file, exec_path);
+
+    if (ref_file_path == NULL) {
+        printf("compareBin2Bin <unsigned int>  unable to find <%s> in <%s>\n",
+               ref_file, exec_path);
+        printf(">>> Check info.xml and [project//data] folder <%s> <<<\n",
+               ref_file);
+        printf("Aborting comparison!\n");
+        printf("  FAILED\n");
+        error_count++;
+
+        if (src_fp) {
+            fclose(src_fp);
+        }
+
+        if (ref_fp) {
+            fclose(ref_fp);
+        }
+    } else {
+        if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb"))) {
+            printf(
+                "compareBin2Bin <unsigned int>"
+                " unable to open ref_file: %s\n",
+                ref_file_path);
+            error_count++;
+        }
+
+        if (src_fp && ref_fp) {
+            src_buffer = (unsigned int *)malloc(nelements * sizeof(unsigned int));
+            ref_buffer = (unsigned int *)malloc(nelements * sizeof(unsigned int));
+
+            fsize = fread(src_buffer, nelements, sizeof(unsigned int), src_fp);
+            fsize = fread(ref_buffer, nelements, sizeof(unsigned int), ref_fp);
+
+            printf(
+                "> compareBin2Bin <unsigned int> nelements=%d,"
+                " epsilon=%4.2f, threshold=%4.2f\n",
+                nelements, epsilon, threshold);
+            printf("   src_file <%s>, size=%d bytes\n", src_file,
+                   static_cast<int>(fsize));
+            printf("   ref_file <%s>, size=%d bytes\n", ref_file_path,
+                   static_cast<int>(fsize));
+
+            if (!compareData<unsigned int, float>(ref_buffer, src_buffer, nelements,
+                                                  epsilon, threshold)) {
+                error_count++;
+            }
+
+            fclose(src_fp);
+            fclose(ref_fp);
+
+            free(src_buffer);
+            free(ref_buffer);
+        } else {
+            if (src_fp) {
+                fclose(src_fp);
+            }
+
+            if (ref_fp) {
+                fclose(ref_fp);
+            }
+        }
+    }
+
+    if (error_count == 0) {
+        printf("  OK\n");
+    } else {
+        printf("  FAILURE: %d errors...\n", (unsigned int)error_count);
+    }
+
+    return (error_count == 0);  // returns true if all pixels pass
+}
+
+inline bool sdkCompareBin2BinFloat(const char *src_file, const char *ref_file,
+                                   unsigned int nelements, const float epsilon,
+                                   const float threshold, char *exec_path) {
+    float *src_buffer = NULL, *ref_buffer = NULL;
+    FILE *src_fp = NULL, *ref_fp = NULL;
+    size_t fsize = 0;
+
+    uint64_t error_count = 0;
+
+    if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb"))) {
+        printf("compareBin2Bin <float> unable to open src_file: %s\n", src_file);
+        error_count = 1;
+    }
+
+    char *ref_file_path = sdkFindFilePath(ref_file, exec_path);
+
+    if (ref_file_path == NULL) {
+        printf("compareBin2Bin <float> unable to find <%s> in <%s>\n", ref_file,
+               exec_path);
+        printf(">>> Check info.xml and [project//data] folder <%s> <<<\n",
+               exec_path);
+        printf("Aborting comparison!\n");
+        printf("  FAILED\n");
+        error_count++;
+
+        if (src_fp) {
+            fclose(src_fp);
+        }
+
+        if (ref_fp) {
+            fclose(ref_fp);
+        }
+    } else {
+        if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb"))) {
+            printf("compareBin2Bin <float> unable to open ref_file: %s\n",
+                   ref_file_path);
+            error_count = 1;
+        }
+
+        if (src_fp && ref_fp) {
+            src_buffer = reinterpret_cast<float *>(malloc(nelements * sizeof(float)));
+            ref_buffer = reinterpret_cast<float *>(malloc(nelements * sizeof(float)));
+
+            printf(
+                "> compareBin2Bin <float> nelements=%d, epsilon=%4.2f,"
+                " threshold=%4.2f\n",
+                nelements, epsilon, threshold);
+            fsize = fread(src_buffer, sizeof(float), nelements, src_fp);
+            printf("   src_file <%s>, size=%d bytes\n", src_file,
+                   static_cast<int>(fsize * sizeof(float)));
+            fsize = fread(ref_buffer, sizeof(float), nelements, ref_fp);
+            printf("   ref_file <%s>, size=%d bytes\n", ref_file_path,
+                   static_cast<int>(fsize * sizeof(float)));
+
+            if (!compareDataAsFloatThreshold<float, float>(
+                ref_buffer, src_buffer, nelements, epsilon, threshold)) {
+                error_count++;
+            }
+
+            fclose(src_fp);
+            fclose(ref_fp);
+
+            free(src_buffer);
+            free(ref_buffer);
+        } else {
+            if (src_fp) {
+                fclose(src_fp);
+            }
+
+            if (ref_fp) {
+                fclose(ref_fp);
+            }
+        }
+    }
+
+    if (error_count == 0) {
+        printf("  OK\n");
+    } else {
+        printf("  FAILURE: %d errors...\n", (unsigned int)error_count);
+    }
+
+    return (error_count == 0);  // returns true if all pixels pass
+}
+
+inline bool sdkCompareL2fe(const float *reference, const float *data,
+                           const unsigned int len, const float epsilon) {
+    assert(epsilon >= 0);
+
+    float error = 0;
+    float ref = 0;
+
+    for (unsigned int i = 0; i < len; ++i) {
+        float diff = reference[i] - data[i];
+        error += diff * diff;
+        ref += reference[i] * reference[i];
+    }
+
+    float normRef = sqrtf(ref);
+
+    if (fabs(ref) < 1e-7) {
+#ifdef _DEBUG
+        std::cerr << "ERROR, reference l2-norm is 0\n";
+#endif
+        return false;
+    }
+
+    float normError = sqrtf(error);
+    error = normError / normRef;
+    bool result = error < epsilon;
+#ifdef _DEBUG
+
+    if (!result) {
+    std::cerr << "ERROR, l2-norm error " << error << " is greater than epsilon "
+              << epsilon << "\n";
+  }
+
+#endif
+
+    return result;
+}
+
+inline bool sdkLoadPPMub(const char *file, unsigned char **data,
+                         unsigned int *w, unsigned int *h) {
+    unsigned int channels;
+    return __loadPPM(file, data, w, h, &channels);
+}
+
+inline bool sdkLoadPPM4ub(const char *file, unsigned char **data,
+                          unsigned int *w, unsigned int *h) {
+    unsigned char *idata = 0;
+    unsigned int channels;
+
+    if (__loadPPM(file, &idata, w, h, &channels)) {
+        // pad 4th component
+        int size = *w * *h;
+        // keep the original pointer
+        unsigned char *idata_orig = idata;
+        *data = (unsigned char *)malloc(sizeof(unsigned char) * size * 4);
+        unsigned char *ptr = *data;
+
+        for (int i = 0; i < size; i++) {
+            *ptr++ = *idata++;
+            *ptr++ = *idata++;
+            *ptr++ = *idata++;
+            *ptr++ = 0;
+        }
+
+        free(idata_orig);
+        return true;
+    } else {
+        free(idata);
+        return false;
+    }
+}
+
+inline bool sdkComparePPM(const char *src_file, const char *ref_file,
+                          const float epsilon, const float threshold,
+                          bool verboseErrors) {
+    unsigned char *src_data, *ref_data;
+    uint64_t error_count = 0;
+    unsigned int ref_width, ref_height;
+    unsigned int src_width, src_height;
+
+    if (src_file == NULL || ref_file == NULL) {
+        if (verboseErrors) {
+            std::cerr << "PPMvsPPM: src_file or ref_file is NULL."
+                         "  Aborting comparison\n";
+        }
+
+        return false;
+    }
+
+    if (verboseErrors) {
+        std::cerr << "> Compare (a)rendered:  <" << src_file << ">\n";
+        std::cerr << ">         (b)reference: <" << ref_file << ">\n";
+    }
+
+    if (sdkLoadPPM4ub(ref_file, &ref_data, &ref_width, &ref_height) != true) {
+        if (verboseErrors) {
+            std::cerr << "PPMvsPPM: unable to load ref image file: " << ref_file
+                      << "\n";
+        }
+
+        return false;
+    }
+
+    if (sdkLoadPPM4ub(src_file, &src_data, &src_width, &src_height) != true) {
+        std::cerr << "PPMvsPPM: unable to load src image file: " << src_file
+                  << "\n";
+        return false;
+    }
+
+    if (src_height != ref_height || src_width != ref_width) {
+        if (verboseErrors) {
+            std::cerr << "PPMvsPPM: source and ref size mismatch (" << src_width
+                      << "," << src_height << ")vs(" << ref_width << "," << ref_height
+                      << ")\n";
+        }
+    }
+
+    if (verboseErrors) {
+        std::cerr << "PPMvsPPM: comparing images size (" << src_width << ","
+                  << src_height << ") epsilon(" << epsilon << "), threshold("
+                  << threshold * 100 << "%)\n";
+    }
+
+    if (compareData(ref_data, src_data, src_width * src_height * 4, epsilon,
+                    threshold) == false) {
+        error_count = 1;
+    }
+
+    if (error_count == 0) {
+        if (verboseErrors) {
+            std::cerr << "    OK\n\n";
+        }
+    } else {
+        if (verboseErrors) {
+            std::cerr << "    FAILURE!  " << error_count << " errors...\n\n";
+        }
+    }
+
+    // returns true if all pixels pass
+    return (error_count == 0) ? true : false;
+}
+
+inline bool sdkComparePGM(const char *src_file, const char *ref_file,
+                          const float epsilon, const float threshold,
+                          bool verboseErrors) {
+    unsigned char *src_data = 0, *ref_data = 0;
+    uint64_t error_count = 0;
+    unsigned int ref_width, ref_height;
+    unsigned int src_width, src_height;
+
+    if (src_file == NULL || ref_file == NULL) {
+        if (verboseErrors) {
+            std::cerr << "PGMvsPGM: src_file or ref_file is NULL."
+                         "  Aborting comparison\n";
+        }
+
+        return false;
+    }
+
+    if (verboseErrors) {
+        std::cerr << "> Compare (a)rendered:  <" << src_file << ">\n";
+        std::cerr << ">         (b)reference: <" << ref_file << ">\n";
+    }
+
+    if (sdkLoadPPMub(ref_file, &ref_data, &ref_width, &ref_height) != true) {
+        if (verboseErrors) {
+            std::cerr << "PGMvsPGM: unable to load ref image file: " << ref_file
+                      << "\n";
+        }
+
+        return false;
+    }
+
+    if (sdkLoadPPMub(src_file, &src_data, &src_width, &src_height) != true) {
+        std::cerr << "PGMvsPGM: unable to load src image file: " << src_file
+                  << "\n";
+        return false;
+    }
+
+    if (src_height != ref_height || src_width != ref_width) {
+        if (verboseErrors) {
+            std::cerr << "PGMvsPGM: source and ref size mismatch (" << src_width
+                      << "," << src_height << ")vs(" << ref_width << "," << ref_height
+                      << ")\n";
+        }
+    }
+
+    if (verboseErrors)
+        std::cerr << "PGMvsPGM: comparing images size (" << src_width << ","
+                  << src_height << ") epsilon(" << epsilon << "), threshold("
+                  << threshold * 100 << "%)\n";
+
+    if (compareData(ref_data, src_data, src_width * src_height, epsilon,
+                    threshold) == false) {
+        error_count = 1;
+    }
+
+    if (error_count == 0) {
+        if (verboseErrors) {
+            std::cerr << "    OK\n\n";
+        }
+    } else {
+        if (verboseErrors) {
+            std::cerr << "    FAILURE!  " << error_count << " errors...\n\n";
+        }
+    }
+
+    // returns true if all pixels pass
+    return (error_count == 0) ? true : false;
+}
+
+#endif  // COMMON_HELPER_IMAGE_H_
\ No newline at end of file
diff --git a/3rdParty/cuda_samples/helper_string.h b/3rdParty/cuda_samples/helper_string.h
new file mode 100644
index 0000000000000000000000000000000000000000..c09935174ec057757c20263df080cb8d77b53f52
--- /dev/null
+++ b/3rdParty/cuda_samples/helper_string.h
@@ -0,0 +1,368 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// These are helper functions for the SDK samples (string parsing, timers, etc)
+#ifndef COMMON_HELPER_STRING_H_
+#define COMMON_HELPER_STRING_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fstream>
+#include <string>
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+#ifndef _CRT_SECURE_NO_DEPRECATE
+#define _CRT_SECURE_NO_DEPRECATE
+#endif
+#ifndef STRCASECMP
+#define STRCASECMP _stricmp
+#endif
+#ifndef STRNCASECMP
+#define STRNCASECMP _strnicmp
+#endif
+#ifndef STRCPY
+#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath)
+#endif
+
+#ifndef FOPEN
+#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode)
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result != 0)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf_s
+#endif
+#ifndef SPRINTF
+#define SPRINTF sprintf_s
+#endif
+#else  // Linux Includes
+#include <string.h>
+#include <strings.h>
+
+#ifndef STRCASECMP
+#define STRCASECMP strcasecmp
+#endif
+#ifndef STRNCASECMP
+#define STRNCASECMP strncasecmp
+#endif
+#ifndef STRCPY
+#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath)
+#endif
+
+#ifndef FOPEN
+#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode))
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result == NULL)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf
+#endif
+#ifndef SPRINTF
+#define SPRINTF sprintf
+#endif
+#endif
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// CUDA Utility Helper Functions
+inline int stringRemoveDelimiter(char delimiter, const char *string) {
+    int string_start = 0;
+
+    while (string[string_start] == delimiter) {
+        string_start++;
+    }
+
+    if (string_start >= static_cast<int>(strlen(string) - 1)) {
+        return 0;
+    }
+
+    return string_start;
+}
+
+inline int getFileExtension(char *filename, char **extension) {
+    int string_length = static_cast<int>(strlen(filename));
+
+    while (filename[string_length--] != '.') {
+        if (string_length == 0) break;
+    }
+
+    if (string_length > 0) string_length += 2;
+
+    if (string_length == 0)
+        *extension = NULL;
+    else
+        *extension = &filename[string_length];
+
+    return string_length;
+}
+
+inline bool checkCmdLineFlag(const int argc, const char **argv,
+                             const char *string_ref) {
+    bool bFound = false;
+
+    if (argc >= 1) {
+        for (int i = 1; i < argc; i++) {
+            int string_start = stringRemoveDelimiter('-', argv[i]);
+            const char *string_argv = &argv[i][string_start];
+
+            const char *equal_pos = strchr(string_argv, '=');
+            int argv_length = static_cast<int>(
+                equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv);
+
+            int length = static_cast<int>(strlen(string_ref));
+
+            if (length == argv_length &&
+                !STRNCASECMP(string_argv, string_ref, length)) {
+                bFound = true;
+                continue;
+            }
+        }
+    }
+
+    return bFound;
+}
+
+// This function wraps the CUDA Driver API into a template function
+template <class T>
+inline bool getCmdLineArgumentValue(const int argc, const char **argv,
+                                    const char *string_ref, T *value) {
+    bool bFound = false;
+
+    if (argc >= 1) {
+        for (int i = 1; i < argc; i++) {
+            int string_start = stringRemoveDelimiter('-', argv[i]);
+            const char *string_argv = &argv[i][string_start];
+            int length = static_cast<int>(strlen(string_ref));
+
+            if (!STRNCASECMP(string_argv, string_ref, length)) {
+                if (length + 1 <= static_cast<int>(strlen(string_argv))) {
+                    int auto_inc = (string_argv[length] == '=') ? 1 : 0;
+                    *value = (T)atoi(&string_argv[length + auto_inc]);
+                }
+
+                bFound = true;
+                i = argc;
+            }
+        }
+    }
+
+    return bFound;
+}
+
+inline int getCmdLineArgumentInt(const int argc, const char **argv,
+                                 const char *string_ref) {
+    bool bFound = false;
+    int value = -1;
+
+    if (argc >= 1) {
+        for (int i = 1; i < argc; i++) {
+            int string_start = stringRemoveDelimiter('-', argv[i]);
+            const char *string_argv = &argv[i][string_start];
+            int length = static_cast<int>(strlen(string_ref));
+
+            if (!STRNCASECMP(string_argv, string_ref, length)) {
+                if (length + 1 <= static_cast<int>(strlen(string_argv))) {
+                    int auto_inc = (string_argv[length] == '=') ? 1 : 0;
+                    value = atoi(&string_argv[length + auto_inc]);
+                } else {
+                    value = 0;
+                }
+
+                bFound = true;
+                continue;
+            }
+        }
+    }
+
+    if (bFound) {
+        return value;
+    } else {
+        return 0;
+    }
+}
+
+inline float getCmdLineArgumentFloat(const int argc, const char **argv,
+                                     const char *string_ref) {
+    bool bFound = false;
+    float value = -1;
+
+    if (argc >= 1) {
+        for (int i = 1; i < argc; i++) {
+            int string_start = stringRemoveDelimiter('-', argv[i]);
+            const char *string_argv = &argv[i][string_start];
+            int length = static_cast<int>(strlen(string_ref));
+
+            if (!STRNCASECMP(string_argv, string_ref, length)) {
+                if (length + 1 <= static_cast<int>(strlen(string_argv))) {
+                    int auto_inc = (string_argv[length] == '=') ? 1 : 0;
+                    value = static_cast<float>(atof(&string_argv[length + auto_inc]));
+                } else {
+                    value = 0.f;
+                }
+
+                bFound = true;
+                continue;
+            }
+        }
+    }
+
+    if (bFound) {
+        return value;
+    } else {
+        return 0;
+    }
+}
+
+inline bool getCmdLineArgumentString(const int argc, const char **argv,
+                                     const char *string_ref,
+                                     char **string_retval) {
+    bool bFound = false;
+
+    if (argc >= 1) {
+        for (int i = 1; i < argc; i++) {
+            int string_start = stringRemoveDelimiter('-', argv[i]);
+            char *string_argv = const_cast<char *>(&argv[i][string_start]);
+            int length = static_cast<int>(strlen(string_ref));
+
+            if (!STRNCASECMP(string_argv, string_ref, length)) {
+                *string_retval = &string_argv[length + 1];
+                bFound = true;
+                continue;
+            }
+        }
+    }
+
+    if (!bFound) {
+        *string_retval = NULL;
+    }
+
+    return bFound;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Find the path for a file assuming that
+//! files are found in the searchPath.
+//!
+//! @return the path if succeeded, otherwise 0
+//! @param filename         name of the file
+//! @param executable_path  optional absolute path of the executable
+//////////////////////////////////////////////////////////////////////////////
+inline char *sdkFindFilePath(const char *filename,
+                             const char *executable_path) {
+    // <executable_name> defines a variable that is replaced with the name of the
+    // executable
+
+    // Typical relative search paths to locate needed companion files (e.g. sample
+    // input data, or JIT source files) The origin for the relative search may be
+    // the .exe file, a .bat file launching an .exe, a browser .exe launching the
+    // .exe or .bat, etc
+    const char *searchPath[] = {
+        "./",                                          // same dir
+        "./data/",                                      // same dir
+        "../../../../Samples/<executable_name>/",       // up 4 in tree
+        "../../../Samples/<executable_name>/",          // up 3 in tree
+        "../../Samples/<executable_name>/",             // up 2 in tree
+        "../../../../Samples/<executable_name>/data/",  // up 4 in tree
+        "../../../Samples/<executable_name>/data/",     // up 3 in tree
+        "../../Samples/<executable_name>/data/",        // up 2 in tree
+        "../../../../Common/data/",                     // up 4 in tree
+        "../../../Common/data/",                        // up 3 in tree
+        "../../Common/data/"                            // up 2 in tree
+    };
+
+    // Extract the executable name
+    std::string executable_name;
+
+    if (executable_path != 0) {
+        executable_name = std::string(executable_path);
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+        // Windows path delimiter
+    size_t delimiter_pos = executable_name.find_last_of('\\');
+    executable_name.erase(0, delimiter_pos + 1);
+
+    if (executable_name.rfind(".exe") != std::string::npos) {
+      // we strip .exe, only if the .exe is found
+      executable_name.resize(executable_name.size() - 4);
+    }
+
+#else
+        // Linux & OSX path delimiter
+        size_t delimiter_pos = executable_name.find_last_of('/');
+        executable_name.erase(0, delimiter_pos + 1);
+#endif
+    }
+
+    // Loop over all search paths and return the first hit
+    for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char *); ++i) {
+        std::string path(searchPath[i]);
+        size_t executable_name_pos = path.find("<executable_name>");
+
+        // If there is executable_name variable in the searchPath
+        // replace it with the value
+        if (executable_name_pos != std::string::npos) {
+            if (executable_path != 0) {
+                path.replace(executable_name_pos, strlen("<executable_name>"),
+                             executable_name);
+            } else {
+                // Skip this path entry if no executable argument is given
+                continue;
+            }
+        }
+
+#ifdef _DEBUG
+        printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str());
+#endif
+
+        // Test if the file exists
+        path.append(filename);
+        FILE *fp;
+        FOPEN(fp, path.c_str(), "rb");
+
+        if (fp != NULL) {
+            fclose(fp);
+            // File found
+            // returning an allocated array here for backwards compatibility reasons
+            char *file_path = reinterpret_cast<char *>(malloc(path.length() + 1));
+            STRCPY(file_path, path.length() + 1, path.c_str());
+            return file_path;
+        }
+
+        if (fp) {
+            fclose(fp);
+        }
+    }
+
+    // File not found
+    return 0;
+}
+
+#endif  // COMMON_HELPER_STRING_H_
\ No newline at end of file
diff --git a/3rdParty/cuda_samples/helper_timer.h b/3rdParty/cuda_samples/helper_timer.h
new file mode 100644
index 0000000000000000000000000000000000000000..51efd720993057092323ea224377e87c95d70ff2
--- /dev/null
+++ b/3rdParty/cuda_samples/helper_timer.h
@@ -0,0 +1,465 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// Helper Timing Functions
+#ifndef COMMON_HELPER_TIMER_H_
+#define COMMON_HELPER_TIMER_H_
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// includes, system
+#include <vector>
+
+// includes, project
+#include <exception.h>
+
+// Definition of the StopWatch Interface, this is used if we don't want to use
+// the CUT functions But rather in a self contained class interface
+class StopWatchInterface {
+public:
+    StopWatchInterface() {}
+    virtual ~StopWatchInterface() {}
+
+public:
+    //! Start time measurement
+    virtual void start() = 0;
+
+    //! Stop time measurement
+    virtual void stop() = 0;
+
+    //! Reset time counters to zero
+    virtual void reset() = 0;
+
+    //! Time in msec. after start. If the stop watch is still running (i.e. there
+    //! was no call to stop()) then the elapsed time is returned, otherwise the
+    //! time between the last start() and stop call is returned
+    virtual float getTime() = 0;
+
+    //! Mean time to date based on the number of times the stopwatch has been
+    //! _stopped_ (ie finished sessions) and the current total time
+    virtual float getAverageTime() = 0;
+};
+
+//////////////////////////////////////////////////////////////////
+// Begin Stopwatch timer class definitions for all OS platforms //
+//////////////////////////////////////////////////////////////////
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+// includes, system
+#define WINDOWS_LEAN_AND_MEAN
+#include <windows.h>
+#undef min
+#undef max
+
+//! Windows specific implementation of StopWatch
+class StopWatchWin : public StopWatchInterface {
+ public:
+  //! Constructor, default
+  StopWatchWin()
+      : start_time(),
+        end_time(),
+        diff_time(0.0f),
+        total_time(0.0f),
+        running(false),
+        clock_sessions(0),
+        freq(0),
+        freq_set(false) {
+    if (!freq_set) {
+      // helper variable
+      LARGE_INTEGER temp;
+
+      // get the tick frequency from the OS
+      QueryPerformanceFrequency(reinterpret_cast<LARGE_INTEGER *>(&temp));
+
+      // convert to type in which it is needed
+      freq = (static_cast<double>(temp.QuadPart)) / 1000.0;
+
+      // rememeber query
+      freq_set = true;
+    }
+  }
+
+  // Destructor
+  ~StopWatchWin() {}
+
+ public:
+  //! Start time measurement
+  inline void start();
+
+  //! Stop time measurement
+  inline void stop();
+
+  //! Reset time counters to zero
+  inline void reset();
+
+  //! Time in msec. after start. If the stop watch is still running (i.e. there
+  //! was no call to stop()) then the elapsed time is returned, otherwise the
+  //! time between the last start() and stop call is returned
+  inline float getTime();
+
+  //! Mean time to date based on the number of times the stopwatch has been
+  //! _stopped_ (ie finished sessions) and the current total time
+  inline float getAverageTime();
+
+ private:
+  // member variables
+
+  //! Start of measurement
+  LARGE_INTEGER start_time;
+  //! End of measurement
+  LARGE_INTEGER end_time;
+
+  //! Time difference between the last start and stop
+  float diff_time;
+
+  //! TOTAL time difference between starts and stops
+  float total_time;
+
+  //! flag if the stop watch is running
+  bool running;
+
+  //! Number of times clock has been started
+  //! and stopped to allow averaging
+  int clock_sessions;
+
+  //! tick frequency
+  double freq;
+
+  //! flag if the frequency has been set
+  bool freq_set;
+};
+
+// functions, inlined
+
+////////////////////////////////////////////////////////////////////////////////
+//! Start time measurement
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchWin::start() {
+  QueryPerformanceCounter(reinterpret_cast<LARGE_INTEGER *>(&start_time));
+  running = true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Stop time measurement and increment add to the current diff_time summation
+//! variable. Also increment the number of times this clock has been run.
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchWin::stop() {
+  QueryPerformanceCounter(reinterpret_cast<LARGE_INTEGER *>(&end_time));
+  diff_time = static_cast<float>(((static_cast<double>(end_time.QuadPart) -
+                                   static_cast<double>(start_time.QuadPart)) /
+                                  freq));
+
+  total_time += diff_time;
+  clock_sessions++;
+  running = false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Reset the timer to 0. Does not change the timer running state but does
+//! recapture this point in time as the current start time if it is running.
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchWin::reset() {
+  diff_time = 0;
+  total_time = 0;
+  clock_sessions = 0;
+
+  if (running) {
+    QueryPerformanceCounter(reinterpret_cast<LARGE_INTEGER *>(&start_time));
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. after start. If the stop watch is still running (i.e. there
+//! was no call to stop()) then the elapsed time is returned added to the
+//! current diff_time sum, otherwise the current summed time difference alone
+//! is returned.
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchWin::getTime() {
+  // Return the TOTAL time to date
+  float retval = total_time;
+
+  if (running) {
+    LARGE_INTEGER temp;
+    QueryPerformanceCounter(reinterpret_cast<LARGE_INTEGER *>(&temp));
+    retval += static_cast<float>(((static_cast<double>(temp.QuadPart) -
+                                   static_cast<double>(start_time.QuadPart)) /
+                                  freq));
+  }
+
+  return retval;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. for a single run based on the total number of COMPLETED runs
+//! and the total time.
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchWin::getAverageTime() {
+  return (clock_sessions > 0) ? (total_time / clock_sessions) : 0.0f;
+}
+#else
+// Declarations for Stopwatch on Linux and Mac OSX
+// includes, system
+#include <sys/time.h>
+#include <ctime>
+
+//! Windows specific implementation of StopWatch
+class StopWatchLinux : public StopWatchInterface {
+public:
+    //! Constructor, default
+    StopWatchLinux()
+        : start_time(),
+          diff_time(0.0),
+          total_time(0.0),
+          running(false),
+          clock_sessions(0) {}
+
+    // Destructor
+    virtual ~StopWatchLinux() {}
+
+public:
+    //! Start time measurement
+    inline void start();
+
+    //! Stop time measurement
+    inline void stop();
+
+    //! Reset time counters to zero
+    inline void reset();
+
+    //! Time in msec. after start. If the stop watch is still running (i.e. there
+    //! was no call to stop()) then the elapsed time is returned, otherwise the
+    //! time between the last start() and stop call is returned
+    inline float getTime();
+
+    //! Mean time to date based on the number of times the stopwatch has been
+    //! _stopped_ (ie finished sessions) and the current total time
+    inline float getAverageTime();
+
+private:
+    // helper functions
+
+    //! Get difference between start time and current time
+    inline float getDiffTime();
+
+private:
+    // member variables
+
+    //! Start of measurement
+    struct timeval start_time;
+
+    //! Time difference between the last start and stop
+    float diff_time;
+
+    //! TOTAL time difference between starts and stops
+    float total_time;
+
+    //! flag if the stop watch is running
+    bool running;
+
+    //! Number of times clock has been started
+    //! and stopped to allow averaging
+    int clock_sessions;
+};
+
+// functions, inlined
+
+////////////////////////////////////////////////////////////////////////////////
+//! Start time measurement
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchLinux::start() {
+    gettimeofday(&start_time, 0);
+    running = true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Stop time measurement and increment add to the current diff_time summation
+//! variable. Also increment the number of times this clock has been run.
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchLinux::stop() {
+    diff_time = getDiffTime();
+    total_time += diff_time;
+    running = false;
+    clock_sessions++;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Reset the timer to 0. Does not change the timer running state but does
+//! recapture this point in time as the current start time if it is running.
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchLinux::reset() {
+    diff_time = 0;
+    total_time = 0;
+    clock_sessions = 0;
+
+    if (running) {
+        gettimeofday(&start_time, 0);
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. after start. If the stop watch is still running (i.e. there
+//! was no call to stop()) then the elapsed time is returned added to the
+//! current diff_time sum, otherwise the current summed time difference alone
+//! is returned.
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchLinux::getTime() {
+    // Return the TOTAL time to date
+    float retval = total_time;
+
+    if (running) {
+        retval += getDiffTime();
+    }
+
+    return retval;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. for a single run based on the total number of COMPLETED runs
+//! and the total time.
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchLinux::getAverageTime() {
+    return (clock_sessions > 0) ? (total_time / clock_sessions) : 0.0f;
+}
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchLinux::getDiffTime() {
+    struct timeval t_time;
+    gettimeofday(&t_time, 0);
+
+    // time difference in milli-seconds
+    return static_cast<float>(1000.0 * (t_time.tv_sec - start_time.tv_sec) +
+                              (0.001 * (t_time.tv_usec - start_time.tv_usec)));
+}
+#endif  // WIN32
+
+////////////////////////////////////////////////////////////////////////////////
+//! Timer functionality exported
+
+////////////////////////////////////////////////////////////////////////////////
+//! Create a new timer
+//! @return true if a time has been created, otherwise false
+//! @param  name of the new timer, 0 if the creation failed
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkCreateTimer(StopWatchInterface **timer_interface) {
+// printf("sdkCreateTimer called object %08x\n", (void *)*timer_interface);
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+    *timer_interface = reinterpret_cast<StopWatchInterface *>(new StopWatchWin());
+#else
+    *timer_interface =
+        reinterpret_cast<StopWatchInterface *>(new StopWatchLinux());
+#endif
+    return (*timer_interface != NULL) ? true : false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Delete a timer
+//! @return true if a time has been deleted, otherwise false
+//! @param  name of the timer to delete
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkDeleteTimer(StopWatchInterface **timer_interface) {
+    // printf("sdkDeleteTimer called object %08x\n", (void *)*timer_interface);
+    if (*timer_interface) {
+        delete *timer_interface;
+        *timer_interface = NULL;
+    }
+
+    return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Start the time with name \a name
+//! @param name  name of the timer to start
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkStartTimer(StopWatchInterface **timer_interface) {
+    // printf("sdkStartTimer called object %08x\n", (void *)*timer_interface);
+    if (*timer_interface) {
+        (*timer_interface)->start();
+    }
+
+    return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Stop the time with name \a name. Does not reset.
+//! @param name  name of the timer to stop
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkStopTimer(StopWatchInterface **timer_interface) {
+    // printf("sdkStopTimer called object %08x\n", (void *)*timer_interface);
+    if (*timer_interface) {
+        (*timer_interface)->stop();
+    }
+
+    return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Resets the timer's counter.
+//! @param name  name of the timer to reset.
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkResetTimer(StopWatchInterface **timer_interface) {
+    // printf("sdkResetTimer called object %08x\n", (void *)*timer_interface);
+    if (*timer_interface) {
+        (*timer_interface)->reset();
+    }
+
+    return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Return the average time for timer execution as the total time
+//! for the timer dividied by the number of completed (stopped) runs the timer
+//! has made.
+//! Excludes the current running time if the timer is currently running.
+//! @param name  name of the timer to return the time of
+////////////////////////////////////////////////////////////////////////////////
+inline float sdkGetAverageTimerValue(StopWatchInterface **timer_interface) {
+    //  printf("sdkGetAverageTimerValue called object %08x\n", (void
+    //  *)*timer_interface);
+    if (*timer_interface) {
+        return (*timer_interface)->getAverageTime();
+    } else {
+        return 0.0f;
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Total execution time for the timer over all runs since the last reset
+//! or timer creation.
+//! @param name  name of the timer to obtain the value of.
+////////////////////////////////////////////////////////////////////////////////
+inline float sdkGetTimerValue(StopWatchInterface **timer_interface) {
+    // printf("sdkGetTimerValue called object %08x\n", (void *)*timer_interface);
+    if (*timer_interface) {
+        return (*timer_interface)->getTime();
+    } else {
+        return 0.0f;
+    }
+}
+
+#endif  // COMMON_HELPER_TIMER_H_
\ No newline at end of file
diff --git a/CMake/3rd/cuda.cmake b/CMake/3rd/cuda.cmake
index 7214deaa4d2a4324668e11d9620e467719b64c27..73d9b8647e2fa92dc760b60040ef727e4b4785f0 100644
--- a/CMake/3rd/cuda.cmake
+++ b/CMake/3rd/cuda.cmake
@@ -1,20 +1,12 @@
 
 function(linkCUDA)
 
-    find_path(CUDA_CUT_INCLUDE_DIR
-      helper_cuda.h
-      PATHS "$ENV{NVCUDASAMPLES_ROOT}" "${NVCUDASAMPLES_ROOT}"
-      PATH_SUFFIXES "common/inc" "Common"
-      DOC "Location of helper_cuda.h"
-      NO_DEFAULT_PATH
-    )
+    set(CUDA_CUT_INCLUDE_DIR "${VF_THIRD_DIR}/cuda_samples/")
 
     vf_get_library_name(library_name)
     target_include_directories(${library_name} PRIVATE ${CUDA_CUT_INCLUDE_DIR})
     target_include_directories(${library_name} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
 
-    message("CUDA INCLUDE PATH: ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}")
-
     # set the following properties only for specific targets
     # set_property(TARGET ${targetName} PROPERTY CUDA_SEPARABLE_COMPILATION ON)
     # set_property(TARGET ${targetName} PROPERTY CUDA_64_BIT_DEVICE_CODE ON)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 40056a9e1f8c9cdb1c2626474e24f76bc7335149..ac6b72f19cc2f46e00b9e88af46b068ce920c8ba 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -101,7 +101,6 @@ if(BUILD_VF_GPU)
         set(CMAKE_CUDA_ARCHITECTURES 30)
     endif()
 
-    message("CUDA Compiler: ${CMAKE_CUDA_COMPILER}")
     message("CUDA Architecture: ${CMAKE_CUDA_ARCHITECTURES}")
 
     include (gpu.cmake)