diff --git a/CMakeLists.txt b/CMakeLists.txt
index cafae167f..ed2dd190d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -368,7 +368,8 @@ set(TARGET_CORE_SOURCES
     src/pipeline/node/DetectionNetwork.cpp
     src/pipeline/node/Script.cpp
     src/pipeline/node/Pool.cpp
-    src/pipeline/node/Benchmark.cpp
+    src/pipeline/node/BenchmarkIn.cpp
+    src/pipeline/node/BenchmarkOut.cpp
     src/pipeline/node/SpatialDetectionNetwork.cpp
     src/pipeline/node/SystemLogger.cpp
     src/pipeline/node/SpatialLocationCalculator.cpp
diff --git a/bindings/python/src/pipeline/node/BenchmarkBindings.cpp b/bindings/python/src/pipeline/node/BenchmarkBindings.cpp
index 781fbc1bc..79e6d23cc 100644
--- a/bindings/python/src/pipeline/node/BenchmarkBindings.cpp
+++ b/bindings/python/src/pipeline/node/BenchmarkBindings.cpp
@@ -30,9 +30,13 @@ void bind_benchmark(pybind11::module& m, void* pCallstack) {
     benchmarkOut.def_readonly("out", &BenchmarkOut::out, DOC(dai, node, BenchmarkOut, out))
         .def_readonly("input", &BenchmarkOut::input, DOC(dai, node, BenchmarkOut, input))
         .def("setNumMessagesToSend", &BenchmarkOut::setNumMessagesToSend, py::arg("num"), DOC(dai, node, BenchmarkOut, setNumMessagesToSend))
+        .def("setRunOnHost", &BenchmarkOut::setRunOnHost, py::arg("runOnHost"), DOC(dai, node, BenchmarkOut, setRunOnHost))
         .def("setFps", &BenchmarkOut::setFps, py::arg("fps"), DOC(dai, node, BenchmarkOut, setFps));
     benchmarkIn.def_readonly("input", &BenchmarkIn::input, DOC(dai, node, BenchmarkIn, input))
         .def_readonly("report", &BenchmarkIn::report, DOC(dai, node, BenchmarkIn, report))
         .def_readonly("passthrough", &BenchmarkIn::passthrough, DOC(dai, node, BenchmarkIn, passthrough))
-        .def("setNumMessagesToGet", &BenchmarkIn::setNumMessagesToGet, py::arg("num"), DOC(dai, node, BenchmarkIn, setNumMessagesToGet));
+        .def("setRunOnHost", &BenchmarkIn::setRunOnHost, py::arg("runOnHost"), DOC(dai, node, BenchmarkIn, setRunOnHost))
+        .def("logReportsAsWarnings", &BenchmarkIn::logReportsAsWarnings, py::arg("logReportsAsWarnings"), DOC(dai, node, BenchmarkIn, logReportsAsWarnings))
+        .def("measureIndividualLatencies", &BenchmarkIn::measureIndividualLatencies, py::arg("attachLatencies"), DOC(dai, node, BenchmarkIn, measureIndividualLatencies))
+        .def("sendReportEveryNMessages", &BenchmarkIn::sendReportEveryNMessages, py::arg("num"), DOC(dai, node, BenchmarkIn, sendReportEveryNMessages));
 }
diff --git a/cmake/Depthai/DepthaiDeviceRVC4Config.cmake b/cmake/Depthai/DepthaiDeviceRVC4Config.cmake
index 982160495..bda073a99 100644
--- a/cmake/Depthai/DepthaiDeviceRVC4Config.cmake
+++ b/cmake/Depthai/DepthaiDeviceRVC4Config.cmake
@@ -4,4 +4,4 @@ set(DEPTHAI_DEVICE_RVC4_MATURITY "snapshot")
 
 # "version if applicable"
 # set(DEPTHAI_DEVICE_RVC4_VERSION "0.0.1+93f7b75a885aa32f44c5e9f53b74470c49d2b1af")
-set(DEPTHAI_DEVICE_RVC4_VERSION "0.0.1+81617bcfe7b7da9eda9654b5b3d3d3254b59a47d")
+set(DEPTHAI_DEVICE_RVC4_VERSION "0.0.1+19b67f81b54c146d079d2cbd4485fa153337dc5a")
diff --git a/cmake/Depthai/DepthaiDeviceSideConfig.cmake b/cmake/Depthai/DepthaiDeviceSideConfig.cmake
index 9a0146539..20e2509ad 100644
--- a/cmake/Depthai/DepthaiDeviceSideConfig.cmake
+++ b/cmake/Depthai/DepthaiDeviceSideConfig.cmake
@@ -2,7 +2,7 @@
 set(DEPTHAI_DEVICE_SIDE_MATURITY "snapshot")
 
 # "full commit hash of device side binary"
-set(DEPTHAI_DEVICE_SIDE_COMMIT "c3e98b39b6a5445b2187b4109d03a146c6df37dd")
+set(DEPTHAI_DEVICE_SIDE_COMMIT "5e016a328ac84324fb3c6bd8904141191f29dc2e")
 
 # "version if applicable"
 set(DEPTHAI_DEVICE_SIDE_VERSION "")
diff --git a/examples/python/Benchmark/benchmark_camera.py b/examples/python/Benchmark/benchmark_camera.py
new file mode 100644
index 000000000..e89398673
--- /dev/null
+++ b/examples/python/Benchmark/benchmark_camera.py
@@ -0,0 +1,16 @@
+#!/usr/bin/env python3
+import depthai as dai
+import time
+
+# Measure the frame rate of a camera's full-resolution output with a BenchmarkIn node
+with dai.Pipeline() as pipeline:
+    # Create the camera and the benchmark node that receives its frames
+    cam = pipeline.create(dai.node.Camera).build()
+    benchmarkIn = pipeline.create(dai.node.BenchmarkIn)
+    # benchmarkIn.setRunOnHost(True) # Optional: run the node on host so the measurement also includes the transfer limitation, default is False
+    output = cam.requestFullResolutionOutput()
+    output.link(benchmarkIn.input)
+
+    pipeline.start()
+    while pipeline.isRunning():
+        time.sleep(1) # Keep the pipeline alive; BenchmarkIn logs its reports (FPS, latency) as warnings by default
diff --git a/examples/python/Benchmark/benchmark_nn.py b/examples/python/Benchmark/benchmark_nn.py
new file mode 100644
index 000000000..154584086
--- /dev/null
+++ b/examples/python/Benchmark/benchmark_nn.py
@@ -0,0 +1,51 @@
+import depthai as dai
+import numpy as np
+
+# Benchmark neural-network inference: BenchmarkOut replays one frame into the
+# network as fast as possible and BenchmarkIn reports the resulting FPS.
+# First prepare the model for benchmarking
+device = dai.Device()
+modelPath = dai.getModelFromZoo(dai.NNModelDescription("yolov6-nano", platform=device.getPlatformAsString()))
+modelArchive = dai.NNArchive(modelPath)
+inputSize = modelArchive.getInputSize()
+daiType = modelArchive.getConfig().model.inputs[0].preprocessing.daiType
+
+# Use the frame type requested by the model when it names a valid ImgFrame.Type member.
+# getattr with a default covers both a missing and an unknown type name.
+frameType = getattr(dai.ImgFrame.Type, daiType, None) if daiType else None
+
+# Otherwise fall back to a platform default
+if frameType is None:
+    if device.getPlatform() == dai.Platform.RVC2:
+        frameType = dai.ImgFrame.Type.BGR888p
+    else:
+        frameType = dai.ImgFrame.Type.BGR888i
+
+
+# Construct the input (white) image for benchmarking
+img = np.ones((inputSize[1], inputSize[0], 3), np.uint8) * 255
+inputFrame = dai.ImgFrame()
+inputFrame.setCvFrame(img, frameType)
+
+with dai.Pipeline(device) as p:
+    benchmarkOut = p.create(dai.node.BenchmarkOut)
+    benchmarkOut.setRunOnHost(False) # The node can run on host or on device
+    benchmarkOut.setFps(-1) # As fast as possible
+
+    neuralNetwork = p.create(dai.node.NeuralNetwork).build(benchmarkOut.out, modelArchive)
+
+    benchmarkIn = p.create(dai.node.BenchmarkIn)
+    benchmarkIn.setRunOnHost(False) # The node can run on host or on device
+    benchmarkIn.sendReportEveryNMessages(100)
+    benchmarkIn.logReportsAsWarnings(False)
+    neuralNetwork.out.link(benchmarkIn.input)
+
+    outputQueue = benchmarkIn.report.createOutputQueue()
+    inputQueue = benchmarkOut.input.createInputQueue()
+
+    p.start()
+    inputQueue.send(inputFrame) # Send the input image only once
+    while p.isRunning():
+        benchmarkReport = outputQueue.get()
+        assert isinstance(benchmarkReport, dai.BenchmarkReport)
+        print(f"FPS is {benchmarkReport.fps}")
diff --git a/examples/python/Benchmark/benchmark_simple.py b/examples/python/Benchmark/benchmark_simple.py
new file mode 100644
index 000000000..ac377c07b
--- /dev/null
+++ b/examples/python/Benchmark/benchmark_simple.py
@@ -0,0 +1,29 @@
+import depthai as dai
+
+with dai.Pipeline(createImplicitDevice=False) as p:
+    # Create a BenchmarkOut node
+    # It waits for the first message on its input and then re-sends it at the specified rate.
+    # Messages are not deep copied: ImgFrames get a new message object that shares the original data, other types are re-sent as-is.
+    benchmarkOut = p.create(dai.node.BenchmarkOut)
+    benchmarkOut.setRunOnHost(True) # The node can run on host or on device
+    benchmarkOut.setFps(30)
+
+    # Create a BenchmarkIn node
+    # This node receives the messages on its input and measures the FPS and latency.
+    # When the input comes from BenchmarkOut, the latency is only meaningful for messages that get a fresh timestamp on every send (ImgFrames);
+    # messages that are re-sent unchanged keep their original timestamp, so their measured latency keeps growing over time.
+    benchmarkIn = p.create(dai.node.BenchmarkIn)
+    benchmarkIn.setRunOnHost(True) # The node can run on host or on device
+    benchmarkIn.sendReportEveryNMessages(100)
+
+    benchmarkOut.out.link(benchmarkIn.input)
+    outputQueue = benchmarkIn.report.createOutputQueue()
+    inputQueue = benchmarkOut.input.createInputQueue()
+
+    p.start()
+    imgFrame = dai.ImgFrame()
+    inputQueue.send(imgFrame)
+    while p.isRunning():
+        benchmarkReport = outputQueue.get()
+        assert isinstance(benchmarkReport, dai.BenchmarkReport)
+        print(f"FPS is {benchmarkReport.fps}")
diff --git a/examples/python/CMakeLists.txt b/examples/python/CMakeLists.txt
index 2578d1b27..7ac0f52e6 100644
--- a/examples/python/CMakeLists.txt
+++ b/examples/python/CMakeLists.txt
@@ -40,6 +40,7 @@ function(add_python_example example_name python_script_path)
         # Python path (to find compiled module)
         "PYTHONPATH=$<TARGET_FILE_DIR:${TARGET_NAME}>${SYS_PATH_SEPARATOR}$ENV{PYTHONPATH}"
         "DEPTHAI_SEARCH_TIMEOUT=15000"
+        "DEPTHAI_CONNECT_TIMEOUT=15000"
         "DEPTHAI_RECONNECT_TIMEOUT=0"
         # ASAN in case of sanitizers
         "${ASAN_ENVIRONMENT_VARS}"
@@ -60,6 +61,7 @@ function(add_python_example example_name python_script_path)
             # Python path (to find compiled module)
             "PYTHONPATH=$<TARGET_FILE_DIR:${TARGET_NAME}>${SYS_PATH_SEPARATOR}$ENV{PYTHONPATH}"
             "DEPTHAI_SEARCH_TIMEOUT=30000"
+            "DEPTHAI_CONNECT_TIMEOUT=30000"
             "DEPTHAI_RECONNECT_TIMEOUT=0"
             # ASAN in case of sanitizers
             ${ASAN_ENVIRONMENT_VARS}
@@ -223,3 +225,15 @@ set_tests_properties(py_script_simple PROPERTIES FAIL_REGULAR_EXPRESSION "\\[err
 
 add_python_example(script_all_cameras Script/script_switch_all_cameras.py)
 dai_set_example_test_labels(script_all_cameras ondevice rvc2_all rvc4 ci)
+
+## Benchmark node
+add_python_example(benchmark_node Benchmark/benchmark_simple.py)
+dai_set_example_test_labels(benchmark_node ondevice rvc2_all rvc4 ci)
+set_tests_properties(py_benchmark_node PROPERTIES FAIL_REGULAR_EXPRESSION "\\[error\\];\\[critical\\]")
+
+add_python_example(benchmark_cameras Benchmark/benchmark_camera.py)
+dai_set_example_test_labels(benchmark_cameras ondevice rvc2_all rvc4 ci)
+set_tests_properties(py_benchmark_cameras PROPERTIES FAIL_REGULAR_EXPRESSION "\\[error\\];\\[critical\\]")
+
+add_python_example(benchmark_nn Benchmark/benchmark_nn.py)
+dai_set_example_test_labels(benchmark_nn ondevice rvc2_all rvc4 ci)
diff --git a/include/depthai/pipeline/datatype/BenchmarkReport.hpp b/include/depthai/pipeline/datatype/BenchmarkReport.hpp
index 4d496449f..b70e53ca9 100644
--- a/include/depthai/pipeline/datatype/BenchmarkReport.hpp
+++ b/include/depthai/pipeline/datatype/BenchmarkReport.hpp
@@ -3,22 +3,20 @@
 #include "depthai/pipeline/datatype/Buffer.hpp"
 namespace dai {
 
-// TODO(before mainline) - API not supported on RVC2
 /**
  * BenchmarkReport message.
  */
 class BenchmarkReport : public Buffer {
    public:
     BenchmarkReport() = default;
-    virtual ~BenchmarkReport() = default;
 
-    float fps;
-    float timeTotal;  // seconds
-    float numMessagesReceived;
-    float averageLatency;
+    float fps = 0.0f;
+    float timeTotal = 0.0f;  // seconds
+    float numMessagesReceived = 0;
+    float averageLatency = 0.0f;  // seconds
+
+    // Only filled if measureIndividualLatencies is set to true
     std::vector<float> latencies;
-    // TODO Add jitter, timestamps for start/end, possibly a vector of timestamps for all messages
-    // TODO BEFORE MAINLINE add setters and getters
 
     void serialize(std::vector<std::uint8_t>& metadata, DatatypeEnum& datatype) const override {
         metadata = utility::serialize(*this);
diff --git a/include/depthai/pipeline/node/BenchmarkIn.hpp b/include/depthai/pipeline/node/BenchmarkIn.hpp
index 80166c1b8..6b6aa9e94 100644
--- a/include/depthai/pipeline/node/BenchmarkIn.hpp
+++ b/include/depthai/pipeline/node/BenchmarkIn.hpp
@@ -3,13 +3,12 @@
 #include <depthai/pipeline/DeviceNode.hpp>
 
 // shared
-#include <depthai/properties/BenchmarkPropertiesIn.hpp>
+#include <depthai/properties/BenchmarkInProperties.hpp>
 
 namespace dai {
 namespace node {
 
-// TODO(before mainline) - API not supported on RVC2
-class BenchmarkIn : public DeviceNodeCRTP<DeviceNode, BenchmarkIn, BenchmarkPropertiesIn> {
+class BenchmarkIn : public DeviceNodeCRTP<DeviceNode, BenchmarkIn, BenchmarkInProperties>, public HostRunnable {
    public:
     constexpr static const char* NAME = "BenchmarkIn";
     using DeviceNodeCRTP::DeviceNodeCRTP;
@@ -30,11 +29,35 @@ class BenchmarkIn : public DeviceNodeCRTP<DeviceNode, BenchmarkIn, BenchmarkProp
     Output report{*this, {"report", DEFAULT_GROUP, {{{DatatypeEnum::BenchmarkReport, false}}}}};
 
     /**
-     * Set number of messages that the nodes retrieves before sending the report
-     * The passthrough keeps getting forwarded after the report is sent
-     * @param num of messages to get for report
+     * Specify how many messages to measure for each report
      */
-    void setNumMessagesToGet(int num);
+    void sendReportEveryNMessages(uint32_t n);
+
+    /**
+     * Specify whether to run on host or device
+     * By default, the node will run on device.
+     */
+    void setRunOnHost(bool runOnHost);
+
+    /**
+     * Check if the node is set to run on host
+     */
+    bool runOnHost() const override;
+
+    /**
+     * Log each report as a warning (enabled by default)
+     */
+    void logReportsAsWarnings(bool logReportsAsWarnings);
+
+    /**
+     * Attach latencies to the report
+     */
+    void measureIndividualLatencies(bool attachLatencies);
+
+    void run() override;
+
+   private:
+    bool runOnHostVar = false;
 };
 
 }  // namespace node
diff --git a/include/depthai/pipeline/node/BenchmarkOut.hpp b/include/depthai/pipeline/node/BenchmarkOut.hpp
index f21b4fdce..4cff5a8a8 100644
--- a/include/depthai/pipeline/node/BenchmarkOut.hpp
+++ b/include/depthai/pipeline/node/BenchmarkOut.hpp
@@ -3,12 +3,12 @@
 #include <depthai/pipeline/DeviceNode.hpp>
 
 // shared
-#include <depthai/properties/BenchmarkPropertiesOut.hpp>
+#include <depthai/properties/BenchmarkOutProperties.hpp>
 
 namespace dai {
 namespace node {
 
-class BenchmarkOut : public DeviceNodeCRTP<DeviceNode, BenchmarkOut, BenchmarkPropertiesOut> {
+class BenchmarkOut : public DeviceNodeCRTP<DeviceNode, BenchmarkOut, BenchmarkOutProperties>, public HostRunnable{
    public:
     constexpr static const char* NAME = "BenchmarkOut";
     using DeviceNodeCRTP::DeviceNodeCRTP;
@@ -34,7 +34,21 @@ class BenchmarkOut : public DeviceNodeCRTP<DeviceNode, BenchmarkOut, BenchmarkPr
      */
     void setFps(float fps);
 
-    void buildInternal() override;
+    /**
+     * Specify whether to run on host or device
+     * By default, the node will run on device.
+     */
+    void setRunOnHost(bool runOnHost);
+
+    /**
+     * Check if the node is set to run on host
+     */
+    bool runOnHost() const override;
+
+    void run() override;
+    
+   private:
+    bool runOnHostVar = false;
 };
 
 }  // namespace node
diff --git a/include/depthai/properties/BenchmarkInProperties.hpp b/include/depthai/properties/BenchmarkInProperties.hpp
new file mode 100644
index 000000000..93a9349b8
--- /dev/null
+++ b/include/depthai/properties/BenchmarkInProperties.hpp
@@ -0,0 +1,32 @@
+#pragma once
+
+#include "depthai/common/ProcessorType.hpp"
+#include "depthai/common/optional.hpp"
+#include "depthai/pipeline/datatype/DatatypeEnum.hpp"
+#include "depthai/properties/Properties.hpp"
+
+namespace dai {
+
+/**
+ * Properties of the BenchmarkIn node (report batch size, latency attachment and report logging)
+ */
+struct BenchmarkInProperties : PropertiesSerializable<Properties, BenchmarkInProperties> {
+    /**
+     * Specify how many messages to measure for each report
+     */
+    uint32_t reportEveryNMessages = 50;
+
+    /**
+     * Specify whether the latencies of the individual messages are attached to the report
+     */
+    bool attachLatencies = false;
+
+    /**
+     * Send the reports also as logger warnings
+     */
+    bool logReportsAsWarnings = true;
+};
+
+DEPTHAI_SERIALIZE_EXT(BenchmarkInProperties, reportEveryNMessages, attachLatencies, logReportsAsWarnings);
+
+}  // namespace dai
diff --git a/include/depthai/properties/BenchmarkPropertiesOut.hpp b/include/depthai/properties/BenchmarkOutProperties.hpp
similarity index 71%
rename from include/depthai/properties/BenchmarkPropertiesOut.hpp
rename to include/depthai/properties/BenchmarkOutProperties.hpp
index 19ee57c22..def89ad74 100644
--- a/include/depthai/properties/BenchmarkPropertiesOut.hpp
+++ b/include/depthai/properties/BenchmarkOutProperties.hpp
@@ -10,11 +10,11 @@ namespace dai {
 /**
  * Specify benchmark properties (number of messages to send/receive)
  */
-struct BenchmarkPropertiesOut : PropertiesSerializable<Properties, BenchmarkPropertiesOut> {
+struct BenchmarkOutProperties : PropertiesSerializable<Properties, BenchmarkOutProperties> {
     /**
      * Number of messages to send
      */
-    int numMessages = 50;
+    int numMessages = -1;
 
     /**
      * FPS for sending, 0 means as fast as possible
@@ -22,6 +22,6 @@ struct BenchmarkPropertiesOut : PropertiesSerializable<Properties, BenchmarkProp
     float fps = 0;
 };
 
-DEPTHAI_SERIALIZE_EXT(BenchmarkPropertiesOut, numMessages, fps);
+DEPTHAI_SERIALIZE_EXT(BenchmarkOutProperties, numMessages, fps);
 
 }  // namespace dai
diff --git a/include/depthai/properties/BenchmarkProperties.hpp b/include/depthai/properties/BenchmarkProperties.hpp
deleted file mode 100644
index 2b02f7361..000000000
--- a/include/depthai/properties/BenchmarkProperties.hpp
+++ /dev/null
@@ -1,22 +0,0 @@
-#pragma once
-
-#include "depthai/common/ProcessorType.hpp"
-#include "depthai/common/optional.hpp"
-#include "depthai/pipeline/datatype/DatatypeEnum.hpp"
-#include "depthai/properties/Properties.hpp"
-
-namespace dai {
-
-/**
- * Specify benchmark properties (number of messages to send/receive)
- */
-struct BenchmarkProperties : PropertiesSerializable<Properties, BenchmarkProperties> {
-    /**
-     * Number of messages to send
-     */
-    int numMessages = 50;
-};
-
-DEPTHAI_SERIALIZE_EXT(BenchmarkProperties, numMessages);
-
-}  // namespace dai
diff --git a/include/depthai/properties/BenchmarkPropertiesIn.hpp b/include/depthai/properties/BenchmarkPropertiesIn.hpp
deleted file mode 100644
index 8bbe350f3..000000000
--- a/include/depthai/properties/BenchmarkPropertiesIn.hpp
+++ /dev/null
@@ -1,22 +0,0 @@
-#pragma once
-
-#include "depthai/common/ProcessorType.hpp"
-#include "depthai/common/optional.hpp"
-#include "depthai/pipeline/datatype/DatatypeEnum.hpp"
-#include "depthai/properties/Properties.hpp"
-
-namespace dai {
-
-/**
- * Specify benchmark properties (number of messages to send/receive)
- */
-struct BenchmarkPropertiesIn : PropertiesSerializable<Properties, BenchmarkPropertiesIn> {
-    /**
-     * Number of messages to send
-     */
-    int numMessages = 50;
-};
-
-DEPTHAI_SERIALIZE_EXT(BenchmarkPropertiesIn, numMessages);
-
-}  // namespace dai
diff --git a/src/pipeline/node/Benchmark.cpp b/src/pipeline/node/Benchmark.cpp
deleted file mode 100644
index f4af2492e..000000000
--- a/src/pipeline/node/Benchmark.cpp
+++ /dev/null
@@ -1,24 +0,0 @@
-#include "depthai/pipeline/node/BenchmarkIn.hpp"
-#include "depthai/pipeline/node/BenchmarkOut.hpp"
-
-namespace dai {
-namespace node {
-
-void BenchmarkOut::setNumMessagesToSend(int num) {
-    properties.numMessages = num;
-}
-
-void BenchmarkOut::setFps(float fps) {
-    properties.fps = fps;
-}
-
-void BenchmarkIn::setNumMessagesToGet(int num) {
-    properties.numMessages = num;
-}
-
-void BenchmarkOut::buildInternal() {
-    properties.numMessages = -1;  // By default send messages indefinitely
-}
-
-}  // namespace node
-}  // namespace dai
diff --git a/src/pipeline/node/BenchmarkIn.cpp b/src/pipeline/node/BenchmarkIn.cpp
new file mode 100644
index 000000000..40bbf4bf1
--- /dev/null
+++ b/src/pipeline/node/BenchmarkIn.cpp
@@ -0,0 +1,136 @@
+#include "depthai/pipeline/node/BenchmarkIn.hpp"
+#include "depthai/pipeline/datatype/BenchmarkReport.hpp"
+#include <chrono>
+
+namespace dai {
+namespace node {
+
+// Set the batch size: how many messages are measured for each report.
+void BenchmarkIn::sendReportEveryNMessages(uint32_t num) {
+    properties.reportEveryNMessages = num;
+}
+
+// Select whether the node runs on the host (true) or on the device (false, the default).
+void BenchmarkIn::setRunOnHost(bool runOnHost) {
+    runOnHostVar = runOnHost;
+}
+
+bool BenchmarkIn::runOnHost() const {
+    return runOnHostVar;
+}
+
+// Log every report at warn level (the property default) instead of trace level.
+void BenchmarkIn::logReportsAsWarnings(bool logReportsAsWarnings) {
+    properties.logReportsAsWarnings = logReportsAsWarnings;
+}
+
+// Ask for the individual per-message latencies to be attached to each report.
+void BenchmarkIn::measureIndividualLatencies(bool attachLatencies) {
+    properties.attachLatencies = attachLatencies;
+}
+
+// Host-side measurement loop. For every batch of reportEveryNMessages messages it measures the FPS and the
+// average latency (optionally every individual latency), sends a BenchmarkReport on `report` and logs it.
+// Every received message is forwarded unchanged on `passthrough`.
+void BenchmarkIn::run() {
+    using namespace std::chrono;
+
+    // A batch size of 0 would divide the latency sum by zero (NaN average), so always measure at least one message
+    const uint32_t numMessages = properties.reportEveryNMessages > 0 ? properties.reportEveryNMessages : 1;
+
+    // Decide if we will store latencies or not
+    bool storeLatencies = false;
+    if(properties.attachLatencies) {
+        if(numMessages <= 1000) {
+            // We'll store latencies for this batch
+            storeLatencies = true;
+        } else {
+            // Warn upfront if user wanted latencies but # messages is too high
+            logger->warn("Number of messages > 1000, latencies not individually attached.");
+        }
+    }
+
+    uint32_t messageCount = 0;
+    float totalLatency = 0.0f;
+
+    std::vector<float> latencies;
+    if(storeLatencies) {
+        // Reserve once for all batches
+        latencies.reserve(numMessages);
+    }
+
+    auto start = steady_clock::now();
+
+    while(isRunning()) {
+        auto inMessage = input.get<dai::Buffer>();
+
+        // If this is the first message of the batch, reset counters
+        if(messageCount == 0) {
+            start = steady_clock::now();
+            totalLatency = 0.0f;
+            // clear() keeps the capacity reserved above (and is a no-op when latencies are not stored)
+            latencies.clear();
+        }
+
+        if(messageCount < numMessages) {
+            const auto currentTs = steady_clock::now();
+            // Latency is measured against the host timestamp when running on host, otherwise the device timestamp
+            const steady_clock::time_point messageTs = runOnHostVar ? inMessage->getTimestamp() : inMessage->getTimestampDevice();
+
+            const duration<float> diff = currentTs - messageTs;
+            logger->trace("Frame latency: {} s", diff.count());
+
+            // Accumulate for average
+            totalLatency += diff.count();
+
+            // Optionally store individual latencies
+            if(storeLatencies) {
+                latencies.push_back(diff.count());
+            }
+
+            messageCount++;
+        } else {
+            // The first message after a full batch closes it, so start..now spans exactly numMessages
+            // arrival intervals; that message itself is forwarded below but not measured.
+            const duration<float> durationS = steady_clock::now() - start;
+
+            auto reportMessage = std::make_shared<dai::BenchmarkReport>();
+            reportMessage->numMessagesReceived = numMessages;
+            reportMessage->timeTotal = durationS.count();
+            reportMessage->fps = numMessages / durationS.count();
+            reportMessage->averageLatency = totalLatency / numMessages;
+
+            // Attach latencies only if we're storing them
+            if(storeLatencies) {
+                reportMessage->latencies = latencies;
+            }
+
+            // Decide how to log (warn or trace) once, then do all the logs
+            auto logFunc = [&](auto fmt, auto... args) {
+                if(properties.logReportsAsWarnings) {
+                    logger->warn(fmt, std::forward<decltype(args)>(args)...);
+                } else {
+                    logger->trace(fmt, std::forward<decltype(args)>(args)...);
+                }
+            };
+
+            // Unconditional logs, using chosen severity
+            logFunc("FPS: {}", reportMessage->fps);
+            logFunc("Messages took {} s", reportMessage->timeTotal);
+            logFunc("Average latency: {} s", reportMessage->averageLatency);
+
+            // Send out the report
+            report.send(reportMessage);
+            logger->trace("Sent report message");
+
+            // Reset for next batch
+            messageCount = 0;
+        }
+
+        // Passthrough the message
+        passthrough.send(inMessage);
+    }
+}
+
+}  // namespace node
+}  // namespace dai
diff --git a/src/pipeline/node/BenchmarkOut.cpp b/src/pipeline/node/BenchmarkOut.cpp
new file mode 100644
index 000000000..afbed9634
--- /dev/null
+++ b/src/pipeline/node/BenchmarkOut.cpp
@@ -0,0 +1,70 @@
+#include "depthai/pipeline/node/BenchmarkOut.hpp"
+
+namespace dai {
+namespace node {
+
+// Number of messages to emit; -1 (the property default) means send until the node stops.
+void BenchmarkOut::setNumMessagesToSend(int num) {
+    properties.numMessages = num;
+}
+
+// Target send rate; run() only paces the output when fps > 0, otherwise it sends as fast as possible.
+void BenchmarkOut::setFps(float fps) {
+    properties.fps = fps;
+}
+
+// Select whether the node runs on the host (true) or on the device (false, the default).
+void BenchmarkOut::setRunOnHost(bool runOnHost) {
+    runOnHostVar = runOnHost;
+}
+
+bool BenchmarkOut::runOnHost() const {
+    return runOnHostVar;
+}
+
+// Host-side loop: wait for one message, then re-send it numMessages times (endlessly for -1), paced to fps when fps > 0.
+void BenchmarkOut::run() {
+    using namespace std::chrono;
+    logger->trace("Wait for the input message.");
+    auto inMessage = input.get();
+    // The message never changes, so resolve its concrete type once rather than on every iteration
+    auto imgMessage = std::dynamic_pointer_cast<dai::ImgFrame>(inMessage);
+    // Derive the frame period only when pacing is on: for the default fps == 0 the period is infinite, and
+    // converting an infinite double to the integral steady_clock::duration is undefined behaviour
+    const bool useTiming = (properties.fps > 0);
+    const auto frameDuration = useTiming ? duration_cast<steady_clock::duration>(duration<double>(1.0 / properties.fps)) : steady_clock::duration::zero();
+
+    auto nextFrameTime = steady_clock::now();
+    // Wide counter: an endless run (numMessages == -1) must not overflow an int
+    for(long long i = 0; (i < properties.numMessages || properties.numMessages == -1) && isRunning(); i++) {
+        if(imgMessage != nullptr) {
+            logger->trace("Sending img message with id {}", i);
+            // Copying metadata and pointing to same data
+            auto newMessage = std::make_shared<dai::ImgFrame>();
+            newMessage->setMetadata(imgMessage);
+            newMessage->data = imgMessage->data;
+            if(runOnHostVar) {
+                newMessage->setTimestamp(steady_clock::now());
+            } else {
+                newMessage->setTimestampDevice(steady_clock::now());
+            }
+            out.send(newMessage);
+        } else {
+            logger->trace("Sending message with id {}", i);
+            out.send(inMessage);
+        }
+
+        if(useTiming) {
+            nextFrameTime += frameDuration;
+            auto now = steady_clock::now();
+            if(nextFrameTime > now) {
+                std::this_thread::sleep_for(nextFrameTime - now);
+            }
+        }
+    }
+
+    logger->trace("Benchmark out sent all messages!");
+}
+
+}  // namespace node
+}  // namespace dai
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index c83c632fd..20017176f 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -400,3 +400,7 @@ dai_set_test_labels(stereo_depth_node_test ondevice rvc2_all rvc4 ci)
 # ImageManipV2 test
 dai_add_test(image_manip_v2_node_test src/ondevice_tests/pipeline/node/image_manip_v2_test.cpp)
 dai_set_test_labels(image_manip_v2_node_test ondevice rvc2_all rvc4 ci)
+
+# Benchmark tests
+dai_add_test(benchmark_test src/ondevice_tests/pipeline/node/benchmark_test.cpp)
+dai_set_test_labels(benchmark_test ondevice rvc2_all rvc4 ci)
diff --git a/tests/src/ondevice_tests/pipeline/node/benchmark_test.cpp b/tests/src/ondevice_tests/pipeline/node/benchmark_test.cpp
new file mode 100644
index 000000000..cdb3688ed
--- /dev/null
+++ b/tests/src/ondevice_tests/pipeline/node/benchmark_test.cpp
@@ -0,0 +1,80 @@
+#include <catch2/catch_all.hpp>
+#include <catch2/catch_test_macros.hpp>
+#include "depthai/capabilities/ImgFrameCapability.hpp"
+#include "depthai/common/CameraBoardSocket.hpp"
+#include "depthai/depthai.hpp"
+#include "depthai/pipeline/MessageQueue.hpp"
+#include "depthai/pipeline/datatype/ImgFrame.hpp"
+#include "depthai/pipeline/node/Camera.hpp"
+
+void testBenchmarkIn(bool benchmarkInRunOnHost, bool benchmarkOutRunOnHost, float fps, bool passthrough) {
+    // Create a pipeline that links BenchmarkOut straight into BenchmarkIn, each placed on host or device as requested
+    dai::Pipeline p;
+    auto benchmarkIn = p.create<dai::node::BenchmarkIn>();
+    benchmarkIn->setRunOnHost(benchmarkInRunOnHost);
+    auto benchmarkOut = p.create<dai::node::BenchmarkOut>();
+    benchmarkOut->setRunOnHost(benchmarkOutRunOnHost);
+    benchmarkOut->setFps(fps);
+    benchmarkOut->out.link(benchmarkIn->input);
+
+    auto inputQueue = benchmarkOut->input.createInputQueue();
+    auto reportQueue = benchmarkIn->report.createOutputQueue();
+    std::shared_ptr<dai::MessageQueue> passthroughQueue;
+    if(passthrough) {
+        passthroughQueue = benchmarkIn->passthrough.createOutputQueue(10, false);  // NOTE(review): presumably (maxSize, blocking) - confirm
+    }
+    p.start();
+    auto inputFrame = std::make_shared<dai::ImgFrame>();
+    inputQueue->send(inputFrame);  // Only one frame is sent; the reports below rely on BenchmarkOut re-sending it
+    for(int i = 0; i < 10; i++) {  // Check 10 consecutive reports
+        if(passthrough) {
+            auto passthroughFrame = passthroughQueue->get<dai::ImgFrame>();
+            REQUIRE(passthroughFrame != nullptr);
+        }
+        auto reportData = reportQueue->get<dai::BenchmarkReport>();
+        REQUIRE(reportData != nullptr);
+        REQUIRE(reportData->numMessagesReceived > 1);
+        REQUIRE(reportData->fps == Catch::Approx(fps).epsilon(0.1));  // Within 10% of the requested rate
+    }
+}
+
+void testCameraBenchmarking(float fps) {  // Checks that a camera output requested at `fps` is reported at that rate by BenchmarkIn
+    dai::Pipeline p;
+    auto cam = p.create<dai::node::Camera>()->build(dai::CameraBoardSocket::CAM_A);
+    auto* output = cam->requestOutput(std::pair(640, 400), std::nullopt, dai::ImgResizeMode::CROP, fps);  // 640x400, cropped, at the requested rate
+    REQUIRE(output != nullptr);
+    auto benchmarkIn = p.create<dai::node::BenchmarkIn>();  // Left on its defaults: runs on device, reports every 50 messages
+    output->link(benchmarkIn->input);
+    auto reportQueue = benchmarkIn->report.createOutputQueue();
+    p.start();
+    for(int i = 0; i < 10; i++) {  // Check 10 consecutive reports
+        auto reportData = reportQueue->get<dai::BenchmarkReport>();
+        REQUIRE(reportData != nullptr);
+        REQUIRE(reportData->numMessagesReceived > 1);
+        REQUIRE(reportData->fps == Catch::Approx(fps).epsilon(0.1));  // Within 10% of the requested rate
+    }
+}
+
+TEST_CASE("BenchmarkIn and BenchmarkOut run on device") {
+    testBenchmarkIn(false, false, 30.0f, true);  // BenchmarkIn on device, BenchmarkOut on device, 30 FPS, passthrough checked
+}
+
+TEST_CASE("BenchmarkIn run on host, BenchmarkOut run on device") {
+    testBenchmarkIn(true, false, 30.0f, true);  // BenchmarkIn on host, BenchmarkOut on device, 30 FPS, passthrough checked
+}
+
+TEST_CASE("BenchmarkIn run on device, BenchmarkOut run on host") {
+    testBenchmarkIn(false, true, 30.0f, true);  // BenchmarkIn on device, BenchmarkOut on host, 30 FPS, passthrough checked
+}
+
+TEST_CASE("BenchmarkIn and BenchmarkOut run on host") {
+    testBenchmarkIn(true, true, 30.0f, true);  // BenchmarkIn on host, BenchmarkOut on host, 30 FPS, passthrough checked
+}
+
+TEST_CASE("BenchmarkIn and BenchmarkOut run on device - high FPS") {
+    testBenchmarkIn(false, false, 1000.0f, false);  // Both on device, 1000 FPS, passthrough output left unconnected
+}
+
+TEST_CASE("Camera benchmarking") {
+    testCameraBenchmarking(30.0f);  // Camera output requested at 30 FPS
+}
diff --git a/tests/src/ondevice_tests/regression/camera_concurrency.cpp b/tests/src/ondevice_tests/regression/camera_concurrency.cpp
index a3ab0f25f..fbe09ddd7 100644
--- a/tests/src/ondevice_tests/regression/camera_concurrency.cpp
+++ b/tests/src/ondevice_tests/regression/camera_concurrency.cpp
@@ -27,7 +27,7 @@ TEST_CASE("camera_concurrency") {
     for(auto* output : cameraOutputs) {
         auto node = pipeline.create<dai::node::BenchmarkIn>();
         output->link(node->input);
-        node->setNumMessagesToGet(numMessagesToGet);
+        node->sendReportEveryNMessages(numMessagesToGet);
         queues.push_back(node->report.createOutputQueue());
         benchmarkNodes.push_back(node);
     }