diff --git a/assets/imx500_mobilenet_ssd.json b/assets/imx500_mobilenet_ssd.json new file mode 100644 index 00000000..0a45abf4 --- /dev/null +++ b/assets/imx500_mobilenet_ssd.json @@ -0,0 +1,123 @@ +{ + "imx500_object_detection": + { + "max_detections" : 5, + "threshold" : 0.6, + "network_file": "/usr/share/imx500-models/imx500_network_ssd_mobilenetv2_fpnlite_320x320_pp.rpk", + + "save_input_tensor": + { + "filename": "/home/pi/input_tensor.raw", + "num_tensors": 10, + "norm_val": [384, 384, 384, 0], + "norm_shift": [0, 0, 0, 0] + }, + + "temporal_filter": + { + "tolerance": 0.1, + "factor": 0.2, + "visible_frames": 4, + "hidden_frames": 2 + }, + + "classes": + [ + "person", + "bicycle", + "car", + "motorcycle", + "airplane", + "bus", + "train", + "truck", + "boat", + "traffic light", + "fire hydrant", + "-", + "stop sign", + "parking meter", + "bench", + "bird", + "cat", + "dog", + "horse", + "sheep", + "cow", + "elephant", + "bear", + "zebra", + "giraffe", + "-", + "backpack", + "umbrella", + "-", + "-", + "handbag", + "tie", + "suitcase", + "frisbee", + "skis", + "snowboard", + "sports ball", + "kite", + "baseball bat", + "baseball glove", + "skateboard", + "surfboard", + "tennis racket", + "bottle", + "-", + "wine glass", + "cup", + "fork", + "knife", + "spoon", + "bowl", + "banana", + "apple", + "sandwich", + "orange", + "broccoli", + "carrot", + "hot dog", + "pizza", + "donut", + "cake", + "chair", + "couch", + "potted plant", + "bed", + "-", + "dining table", + "-", + "-", + "toilet", + "-", + "tv", + "laptop", + "mouse", + "remote", + "keyboard", + "cell phone", + "microwave", + "oven", + "toaster", + "sink", + "refrigerator", + "-", + "book", + "clock", + "vase", + "scissors", + "teddy bear", + "hair drier", + "toothbrush" + ] + }, + + "object_detect_draw_cv": + { + "line_thickness" : 2 + } +} diff --git a/assets/imx500_posenet.json b/assets/imx500_posenet.json new file mode 100644 index 00000000..c008f2a5 --- /dev/null +++ b/assets/imx500_posenet.json @@ 
-0,0 +1,29 @@ +{ + "imx500_posenet": + { + "max_detections" : 5, + "threshold" : 0.4, + "offset_refinement_steps": 5, + "nms_radius": 10.0, + "network_file": "/usr/share/imx500-models/imx500_network_posenet.rpk", + + "save_input_tensor": + { + "filename": "/home/pi/posenet_input_tensor.raw", + "num_tensors": 10 + }, + + "temporal_filter": + { + "tolerance": 0.3, + "factor": 0.3, + "visible_frames": 8, + "hidden_frames": 2 + } + }, + + "plot_pose_cv": + { + "confidence_threshold" : 0.2 + } +} diff --git a/meson.build b/meson.build index 7d4c61c8..1a742df3 100644 --- a/meson.build +++ b/meson.build @@ -80,5 +80,6 @@ summary({ 'OpenCV postprocessing' : enable_opencv, 'TFLite postprocessing' : enable_tflite, 'Hailo postprocessing' : enable_hailo, + 'IMX500 postprocessing' : get_option('enable_imx500'), }, bool_yn : true, section : 'Build configuration') diff --git a/meson_options.txt b/meson_options.txt index 13b6de16..0f24d546 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -43,3 +43,13 @@ option('download_hailo_models', type : 'boolean', value : true, description : 'Download and install the Hailo postprocessing models') + +option('enable_imx500', + type : 'boolean', + value : true, + description : 'Enable IMX500 postprocessing support') + +option('download_imx500_models', + type : 'boolean', + value : true, + description : 'Download and install the IMX500 postprocessing models') diff --git a/post_processing_stages/imx500/imx500_object_detection.cpp b/post_processing_stages/imx500/imx500_object_detection.cpp new file mode 100644 index 00000000..310340f9 --- /dev/null +++ b/post_processing_stages/imx500/imx500_object_detection.cpp @@ -0,0 +1,337 @@ +/* SPDX-License-Identifier: BSD-2-Clause */ +/* + * Copyright (C) 2024, Raspberry Pi Ltd + * + * imx500_object_detection.cpp - IMX500 object detection for various networks + */ + +#include +#include +#include +#include +#include + +#include +#include + +#include "core/rpicam_app.hpp" +#include 
"post_processing_stages/object_detect.hpp" +#include "post_processing_stages/post_processing_stage.hpp" + +#include "imx500_post_processing_stage.hpp" + +using Rectangle = libcamera::Rectangle; +using Size = libcamera::Size; +namespace controls = libcamera::controls; + +#define NAME "imx500_object_detection" + +struct Bbox +{ + float x0; + float y0; + float x1; + float y1; +}; + +struct ObjectDetectionOutput +{ + unsigned int num_detections = 0; + std::vector bboxes; + std::vector scores; + std::vector classes; +}; + +class ObjectDetection : public IMX500PostProcessingStage +{ +public: + ObjectDetection(RPiCamApp *app) : IMX500PostProcessingStage(app) {} + + char const *Name() const override; + + void Read(boost::property_tree::ptree const ¶ms) override; + + void Configure() override; + + bool Process(CompletedRequestPtr &completed_request) override; + +private: + int processOutputTensor(std::vector &objects, const std::vector &output_tensor, + const CnnOutputTensorInfo &output_tensor_info, const Rectangle &scaler_crop) const; + void filterOutputObjects(std::vector &objects); + + struct LtObject + { + Detection params; + unsigned int visible; + unsigned int hidden; + bool matched; + }; + + std::vector lt_objects_; + std::mutex lt_lock_; + + // Config params + unsigned int max_detections_; + float threshold_; + std::vector classes_; + bool temporal_filtering_; + float tolerance_; + float factor_; + unsigned int visible_frames_; + unsigned int hidden_frames_; +}; + +char const *ObjectDetection::Name() const +{ + return NAME; +} + +void ObjectDetection::Read(boost::property_tree::ptree const ¶ms) +{ + max_detections_ = params.get("max_detections"); + threshold_ = params.get("threshold", 0.5f); + classes_ = PostProcessingStage::GetJsonArray(params, "classes"); + + if (params.find("temporal_filter") != params.not_found()) + { + temporal_filtering_ = true; + tolerance_ = params.get("temporal_filter.tolerance", 0.05); + factor_ = params.get("temporal_filter.factor", 0.2); 
+ visible_frames_ = params.get("temporal_filter.visible_frames", 5); + hidden_frames_ = params.get("temporal_filter.hidden_frames", 2); + } + else + temporal_filtering_ = false; + + IMX500PostProcessingStage::Read(params); +} + +void ObjectDetection::Configure() +{ + IMX500PostProcessingStage::Configure(); + IMX500PostProcessingStage::ShowFwProgressBar(); +} + +bool ObjectDetection::Process(CompletedRequestPtr &completed_request) +{ + auto scaler_crop = completed_request->metadata.get(controls::ScalerCrop); + if (!raw_stream_ || !scaler_crop) + { + LOG_ERROR("Must have RAW stream and scaler crop available to get sensor dimensions!"); + return false; + } + + auto output = completed_request->metadata.get(controls::rpi::CnnOutputTensor); + auto info = completed_request->metadata.get(controls::rpi::CnnOutputTensorInfo); + std::vector objects; + + // Process() can be concurrently called through different threads for consecutive CompletedRequests if + // things are running behind. So protect access to the lt_objects_ state object. + std::scoped_lock l(lt_lock_); + + if (output && info) + { + std::vector output_tensor(output->data(), output->data() + output->size()); + CnnOutputTensorInfo output_tensor_info = *reinterpret_cast(info->data()); + + processOutputTensor(objects, output_tensor, output_tensor_info, *scaler_crop); + + if (temporal_filtering_) + { + filterOutputObjects(objects); + if (lt_objects_.size()) + { + objects.clear(); + for (auto const &obj : lt_objects_) + { + if (!obj.hidden) + objects.push_back(obj.params); + } + } + } + else + { + // If no temporal filtering, make sure we fill lt_objects_ with the current result as it may be used if + // there is no output tensor on subsequent frames. + for (auto const &obj : objects) + lt_objects_.push_back({ obj, 0, 0, 0 }); + } + } + else + { + // No output tensor, so simply reuse the results from the lt_objects_. 
+ for (auto const &obj : lt_objects_) + { + if (!obj.hidden) + objects.push_back(obj.params); + } + } + + if (objects.size()) + completed_request->post_process_metadata.Set("object_detect.results", objects); + + return IMX500PostProcessingStage::Process(completed_request); +} + +static int createObjectDetectionData(ObjectDetectionOutput &output, const std::vector &data, + unsigned int total_detections) +{ + unsigned int count = 0; + + // Extract bounding box co-ordinates + for (unsigned int i = 0; i < total_detections; i++) + { + Bbox bbox; + bbox.y0 = data.at(count + i); + bbox.x0 = data.at(count + i + (1 * total_detections)); + bbox.y1 = data.at(count + i + (2 * total_detections)); + bbox.x1 = data.at(count + i + (3 * total_detections)); + output.bboxes.push_back(bbox); + } + count += (total_detections * 4); + + // Extract scores + for (unsigned int i = 0; i < total_detections; i++) + { + output.scores.push_back(data.at(count)); + count++; + } + + // Extract class indices + for (unsigned int i = 0; i < total_detections; i++) + { + output.classes.push_back(data.at(count)); + count++; + } + + // Extract number of detections + unsigned int num_detections = data.at(count); + if (num_detections > total_detections) + { + LOG(1, "Unexpected value for num_detections: " << num_detections << ", setting it to " << total_detections); + num_detections = total_detections; + } + + output.num_detections = num_detections; + return 0; +} + +int ObjectDetection::processOutputTensor(std::vector &objects, const std::vector &output_tensor, + const CnnOutputTensorInfo &output_tensor_info, + const Rectangle &scaler_crop) const +{ + if (output_tensor_info.num_tensors != 4) + { + LOG_ERROR("Invalid number of tensors " << output_tensor_info.num_tensors << ", expected 4"); + return -1; + } + + const unsigned int total_detections = output_tensor_info.info[0].tensor_data_num / 4; + ObjectDetectionOutput output; + + // 4x coords + 1x labels + 1x confidences + 1 total detections + if 
(output_tensor.size() != 6 * total_detections + 1) + { + LOG_ERROR("Invalid tensor size " << output_tensor.size() << ", expected " << 6 * total_detections + 1); + return -1; + } + + int ret = createObjectDetectionData(output, output_tensor, total_detections); + if (ret) + { + LOG_ERROR("Failed to create object detection data"); + return -1; + } + + for (unsigned int i = 0; i < std::min(output.num_detections, max_detections_); i++) + { + uint8_t class_index = (uint8_t)output.classes[i]; + + // Filter detections + if (output.scores[i] < threshold_ || class_index >= classes_.size()) + continue; + + // Extract bounding box co-ordinates in the inference image co-ordinates and convert to the final ISP output + // co-ordinates. + std::vector<float> coords{ output.bboxes[i].x0, output.bboxes[i].y0, + output.bboxes[i].x1 - output.bboxes[i].x0, + output.bboxes[i].y1 - output.bboxes[i].y0 }; + const Rectangle obj_scaled = ConvertInferenceCoordinates(coords, scaler_crop); + + objects.emplace_back(class_index, classes_[class_index], output.scores[i], + obj_scaled.x, obj_scaled.y, obj_scaled.width, obj_scaled.height); + } + + LOG(2, "Number of objects detected: " << objects.size()); + for (unsigned i = 0; i < objects.size(); i++) + LOG(2, "[" << i << "] : " << objects[i].toString()); + + return 0; +} + +void ObjectDetection::filterOutputObjects(std::vector<Detection> &objects) +{ + const Size isp_output_size = output_stream_->configuration().size; + + for (auto &lt_obj : lt_objects_) + lt_obj.matched = false; + + for (auto const &object : objects) + { + bool matched = false; + for (auto &lt_obj : lt_objects_) + { + // Try and match a detected object in our long term list.
+ if (object.category == lt_obj.params.category && + std::abs(object.box.x - lt_obj.params.box.x) < tolerance_ * isp_output_size.width && + std::abs(object.box.y - lt_obj.params.box.y) < tolerance_ * isp_output_size.height && + std::abs((int)object.box.width - (int)lt_obj.params.box.width) < tolerance_ * isp_output_size.width && + std::abs((int)object.box.height - (int)lt_obj.params.box.height) < tolerance_ * isp_output_size.height) + { + lt_obj.matched = matched = true; + lt_obj.params.confidence = object.confidence; + lt_obj.params.box.x = factor_ * object.box.x + (1 - factor_) * lt_obj.params.box.x; + lt_obj.params.box.y = factor_ * object.box.y + (1 - factor_) * lt_obj.params.box.y; + lt_obj.params.box.width = factor_ * object.box.width + (1 - factor_) * lt_obj.params.box.width; + lt_obj.params.box.height = factor_ * object.box.height + (1 - factor_) * lt_obj.params.box.height; + // Reset the visibility counter for when the object next disappears. + lt_obj.visible = visible_frames_; + // Decrement the hidden counter until the object becomes visible in the list. + lt_obj.hidden = std::max(0, (int)lt_obj.hidden - 1); + break; + } + } + + // Add the object to the long term list if not found. This object will remain hidden for hidden_frames_ + // consecutive frames. + if (!matched) + lt_objects_.push_back({ object, visible_frames_, hidden_frames_, 1 }); + } + + for (auto &lt_obj : lt_objects_) + { + if (!lt_obj.matched) + { + // If a non matched object in the long term list is still hidden, set visible count to 0 so that it must be + // matched for hidden_frames_ consecutive frames before becoming visible. Otherwise, decrement the visible + // count of unmatched objects in the long term list. + if (lt_obj.hidden) + lt_obj.visible = 0; + else + lt_obj.visible--; + } + } + + // Remove now invisible objects from the long term list.
+ lt_objects_.erase(std::remove_if(lt_objects_.begin(), lt_objects_.end(), + [] (const LtObject &obj) { return !obj.matched && !obj.visible; }), + lt_objects_.end()); +} + +static PostProcessingStage *Create(RPiCamApp *app) +{ + return new ObjectDetection(app); +} + +static RegisterStage reg(NAME, &Create); diff --git a/post_processing_stages/imx500/imx500_posenet.cpp b/post_processing_stages/imx500/imx500_posenet.cpp new file mode 100644 index 00000000..a965bd20 --- /dev/null +++ b/post_processing_stages/imx500/imx500_posenet.cpp @@ -0,0 +1,861 @@ +/* SPDX-License-Identifier: BSD-2-Clause */ +/* + * Copyright (C) 2024, Raspberry Pi Ltd + * + * imx500_posenet.cpp - IMX500 inference for PoseNet + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "core/rpicam_app.hpp" +#include "post_processing_stages/post_processing_stage.hpp" + +#include "imx500_post_processing_stage.hpp" + +using Rectangle = libcamera::Rectangle; +using Size = libcamera::Size; +namespace controls = libcamera::controls; + +namespace +{ + +constexpr Size INPUT_TENSOR_SIZE = { 481, 353 }; +constexpr Size MAP_SIZE = { 31, 23 }; +constexpr unsigned int NUM_KEYPOINTS = 17; +// These 16 edges allow traversing of the pose graph along the mid_offsets (see +// paper for details). 
+static constexpr int NUM_EDGES = 16; +constexpr unsigned int STRIDE = 16; +constexpr unsigned int NUM_HEATMAPS = NUM_KEYPOINTS * MAP_SIZE.width * MAP_SIZE.height; +constexpr unsigned int NUM_SHORT_OFFSETS = 2 * NUM_KEYPOINTS * MAP_SIZE.width * MAP_SIZE.height; +constexpr unsigned int NUM_MID_OFFSETS = 64 * MAP_SIZE.width * MAP_SIZE.height; + +enum KeypointType +{ + Nose, + LeftEye, + RightEye, + LeftEar, + RightEar, + LeftShoulder, + RightShoulder, + LeftElbow, + RightElbow, + LeftWrist, + RightWrist, + LeftHip, + RightHip, + LeftKnee, + RightKnee, + LeftAnkle, + RightAnkle +}; + +const std::array, 32> EdgeList = { { + // Forward edges + { Nose, LeftEye }, + { LeftEye, LeftEar }, + { Nose, RightEye }, + { RightEye, RightEar }, + { Nose, LeftShoulder }, + { LeftShoulder, LeftElbow }, + { LeftElbow, LeftWrist }, + { LeftShoulder, LeftHip }, + { LeftHip, LeftKnee }, + { LeftKnee, LeftAnkle }, + { Nose, RightShoulder }, + { RightShoulder, RightElbow }, + { RightElbow, RightWrist }, + { RightShoulder, RightHip }, + { RightHip, RightKnee }, + { RightKnee, RightAnkle }, + + // Backward edges + { LeftEye, Nose }, + { LeftEar, LeftEye }, + { RightEye, Nose }, + { RightEar, RightEye }, + { LeftShoulder, Nose }, + { LeftElbow, LeftShoulder }, + { LeftWrist, LeftElbow }, + { LeftHip, LeftShoulder }, + { LeftKnee, LeftHip }, + { LeftAnkle, LeftKnee }, + { RightShoulder, Nose }, + { RightElbow, RightShoulder }, + { RightWrist, RightElbow }, + { RightHip, RightShoulder }, + { RightKnee, RightHip }, + { RightAnkle, RightKnee }, +} }; + +struct Point +{ + float y, x; +}; + +// An adjacency list representing the directed edges connecting keypoints. +struct AdjacencyList +{ + explicit AdjacencyList(const int num_nodes) : child_ids(num_nodes), edge_ids(num_nodes) {} + + // child_ids[i] is a vector holding the node ids of all children of the i-th + // node and edge_ids[i] is a vector holding the edge ids of all edges stemming + // from the i-th node. 
If the k-th edge in the graph starts at the i-th node + // and ends at the j-th node, then child_ids[i] and edge_ids will contain j + // and k, respectively, at corresponding positions. + std::vector> child_ids; + std::vector> edge_ids; +}; + +// Defines a 2-D keypoint with (x, y) float coordinates and its type id. +struct KeypointWithScore +{ + KeypointWithScore(const Point &_pt, const int _id, const float _score) : point(_pt), id(_id), score(_score) {} + + [[maybe_unused]] friend std::ostream &operator<<(std::ostream &ost, const KeypointWithScore &keypoint) + { + return ost << keypoint.point.y << ", " << keypoint.point.x << ", " << keypoint.id << ", " << keypoint.score; + } + + bool operator<(const KeypointWithScore &other) const { return score < other.score; } + bool operator>(const KeypointWithScore &other) const { return score > other.score; } + + Point point; + int id; + float score; +}; + +using KeypointQueue = std::priority_queue>; +using PoseKeypoints = std::array; +using PoseKeypointScores = std::array; + +struct PoseResults +{ + float pose_score; + PoseKeypoints pose_keypoints; + PoseKeypointScores pose_keypoint_scores; +}; + +float sigmoid(const float x) +{ + return 1.0f / (1.0f + std::exp(-x)); +} + +float log_odds(const float x) +{ + return -std::log(1.0f / (x + 1E-6) - 1.0f); +} + +// Computes the squared distance between a pair of 2-D points. 
+float squared_distance(const Point &a, const Point &b) +{ + const float dy = b.y - a.y; + const float dx = b.x - a.x; + return dy * dy + dx * dx; +} + +std::vector format_tensor(const float *data, unsigned int size, unsigned int div) +{ + std::vector tensor(size * MAP_SIZE.width * MAP_SIZE.height); + + for (unsigned int i = 0; i < size; i++) + { + for (unsigned int j = 0; j < MAP_SIZE.width; j++) + { + for (unsigned int k = 0; k < MAP_SIZE.height; k++) + { + tensor[(size * MAP_SIZE.width * k) + (size * j) + i] = + data[(MAP_SIZE.width * MAP_SIZE.height * i) + (j * MAP_SIZE.height) + k] / div; + } + } + } + + return tensor; +} + +// Build an adjacency list of the pose graph. +AdjacencyList build_agency_list() +{ + AdjacencyList adjacency_list(NUM_KEYPOINTS); + + for (unsigned int k = 0; k < EdgeList.size(); ++k) + { + const int parent_id = EdgeList[k].first; + const int child_id = EdgeList[k].second; + adjacency_list.child_ids[parent_id].push_back(child_id); + adjacency_list.edge_ids[parent_id].push_back(k); + } + + return adjacency_list; +} + +bool pass_keypoint_nms(std::vector poses, const size_t num_poses, const KeypointWithScore &keypoint, + const float squared_nms_radius) +{ + for (unsigned int i = 0; i < num_poses; ++i) + { + if (squared_distance(keypoint.point, poses[i][keypoint.id]) <= squared_nms_radius) + return false; + } + + return true; +} + +// Finds the indices of the scores if we sort them in decreasing order. +void decreasing_arg_sort(std::vector &indices, const float *scores, unsigned int num_scores) +{ + indices.resize(num_scores); + std::iota(indices.begin(), indices.end(), 0); + std::sort(indices.begin(), indices.end(), [&scores](const int i, const int j) { return scores[i] > scores[j]; }); +} + +// Finds the indices of the scores if we sort them in decreasing order. 
+void decreasing_arg_sort(std::vector &indices, const std::vector &scores) +{ + decreasing_arg_sort(indices, scores.data(), scores.size()); +} + +// Helper function for 1-D linear interpolation. It computes the floor and the +// ceiling of the input coordinate, as well as the weighting factor between the +// two interpolation endpoints, such that: +// y = (1 - x_lerp) * vec[x_floor] + x_lerp * vec[x_ceil] +void build_linear_interpolation(const float x, const int n, int *x_floor, int *x_ceil, float *x_lerp) +{ + const float x_proj = std::clamp(x, 0.0f, n - 1.0f); + *x_floor = static_cast(std::floor(x_proj)); + *x_ceil = static_cast(std::ceil(x_proj)); + *x_lerp = x - (*x_floor); +} + +// Helper function for 2-D bilinear interpolation. It computes the four corners +// of the 2x2 cell that contain the input coordinates (x, y), as well as the +// weighting factor between the four interpolation endpoints, such that: +// y = +// (1 - y_lerp) * ((1 - x_lerp) * vec[top_left] + x_lerp * vec[top_right]) + +// y_lerp * ((1 - x_lerp) * tensor[bottom_left] + x_lerp * vec[bottom_right]) +void build_bilinear_interpolation(const Point point, const unsigned int num_channels, int *top_left, int *top_right, + int *bottom_left, int *bottom_right, float *y_lerp, float *x_lerp) +{ + int y_floor; + int y_ceil; + build_linear_interpolation(point.y, MAP_SIZE.height, &y_floor, &y_ceil, y_lerp); + int x_floor; + int x_ceil; + build_linear_interpolation(point.x, MAP_SIZE.width, &x_floor, &x_ceil, x_lerp); + *top_left = (y_floor * MAP_SIZE.width + x_floor) * num_channels; + *top_right = (y_floor * MAP_SIZE.width + x_ceil) * num_channels; + *bottom_left = (y_ceil * MAP_SIZE.width + x_floor) * num_channels; + *bottom_right = (y_ceil * MAP_SIZE.width + x_ceil) * num_channels; +} + +// Sample the input tensor values at position (x, y) and at multiple channels. +// The input tensor has shape [height, width, num_channels]. 
We bilinearly +// sample its value at tensor(y, x, c), for c in the channels specified. This +// is faster than calling the single channel interpolation function multiple +// times because the computation of the positions needs to be done only once. +std::vector sample_tensor_at_multiple_channels(const std::vector &tensor, const Point &point, + const std::vector &result_channels, + unsigned int num_channels) +{ + int top_left, top_right, bottom_left, bottom_right; + float y_lerp, x_lerp; + + build_bilinear_interpolation(point, num_channels, &top_left, &top_right, &bottom_left, &bottom_right, &y_lerp, + &x_lerp); + + std::vector result; + for (auto &c : result_channels) + { + result.push_back((1 - y_lerp) * ((1 - x_lerp) * tensor[top_left + c] + x_lerp * tensor[top_right + c]) + + y_lerp * ((1 - x_lerp) * tensor[bottom_left + c] + x_lerp * tensor[bottom_right + c])); + } + + return result; +} + +// Sample the input tensor values at position (x, y) and at a single channel. +// The input tensor has shape [height, width, num_channels]. We bilinearly +// sample its value at tensor(y, x, channel). +float sample_tensor_at_single_channel(const std::vector &tensor, const Point &point, unsigned int num_channels, + const int c) +{ + std::vector result = sample_tensor_at_multiple_channels(tensor, point, std::vector { c }, num_channels); + return result[0]; +} + +KeypointQueue build_keypoint_queue(const std::vector &scores, const std::vector &short_offsets, + const float score_threshold) +{ + constexpr unsigned int local_maximum_radius = 1; + unsigned int score_index = 0; + KeypointQueue queue; + + for (unsigned int y = 0; y < MAP_SIZE.height; ++y) + { + for (unsigned int x = 0; x < MAP_SIZE.width; ++x) + { + unsigned int offset_index = 2 * score_index; + for (unsigned int j = 0; j < NUM_KEYPOINTS; ++j) + { + const float score = scores[score_index]; + if (score >= score_threshold) + { + // Only consider keypoints whose score is maximum in a local window. 
+ bool local_maximum = true; + const unsigned int y_start = std::max((int)y - (int)local_maximum_radius, 0); + const unsigned int y_end = std::min(y + local_maximum_radius + 1, MAP_SIZE.height); + for (unsigned int y_current = y_start; y_current < y_end; ++y_current) + { + const unsigned int x_start = std::max((int)x - (int)local_maximum_radius, 0); + const unsigned int x_end = std::min(x + local_maximum_radius + 1, MAP_SIZE.width); + for (unsigned int x_current = x_start; x_current < x_end; ++x_current) + { + if (scores[y_current * MAP_SIZE.width * NUM_KEYPOINTS + x_current * NUM_KEYPOINTS + j] > + score) + { + local_maximum = false; + break; + } + } + if (!local_maximum) + break; + } + if (local_maximum) + { + const float dy = short_offsets[offset_index]; + const float dx = short_offsets[offset_index + NUM_KEYPOINTS]; + const float y_refined = std::clamp(y + dy, 0.0f, MAP_SIZE.height - 1.0f); + const float x_refined = std::clamp(x + dx, 0.0f, MAP_SIZE.width - 1.0f); + queue.emplace(Point { y_refined, x_refined }, j, score); + } + } + + ++score_index; + ++offset_index; + } + } + } + + return queue; +} + +// Follows the mid-range offsets, and then refines the position by the short- +// range offsets for a fixed number of steps. +Point find_displaced_position(const std::vector &short_offsets, const std::vector &mid_offsets, + const Point &source, const int edge_id, const int target_id, + const int offset_refinement_steps) +{ + float y = source.y, x = source.x; + + // Follow the mid-range offsets. + std::vector channels = { edge_id, NUM_EDGES + edge_id }; + + // Total size of mid_offsets is height x width x 2*2*num_edges + std::vector offsets = sample_tensor_at_multiple_channels(mid_offsets, source, channels, 2 * 2 * NUM_EDGES); + y = std::clamp(y + offsets[0], 0.0f, MAP_SIZE.height - 1.0f); + x = std::clamp(x + offsets[1], 0.0f, MAP_SIZE.width - 1.0f); + + // Refine by the short-range offsets. 
+ channels[0] = target_id; + channels[1] = NUM_KEYPOINTS + target_id; + for (int i = 0; i < offset_refinement_steps; ++i) + { + offsets = sample_tensor_at_multiple_channels(short_offsets, Point { y, x }, channels, 2 * NUM_KEYPOINTS); + y = std::clamp(y + offsets[0], 0.0f, MAP_SIZE.height - 1.0f); + x = std::clamp(x + offsets[1], 0.0f, MAP_SIZE.width - 1.0f); + } + + return Point { y, x }; +} + +void backtrack_decode_pose(const std::vector<float> &scores, const std::vector<float> &short_offsets, + const std::vector<float> &mid_offsets, const KeypointWithScore &root, + const AdjacencyList &adjacency_list, PoseKeypoints &pose_keypoints, + PoseKeypointScores &keypoint_scores, unsigned int offset_refinement_steps) +{ + const float root_score = sample_tensor_at_single_channel(scores, root.point, NUM_KEYPOINTS, root.id); + + // Used in order to put candidate keypoints in a priority queue w.r.t. their + // score. Keypoints with higher score have higher priority and will be + // decoded/processed first. + KeypointQueue decode_queue; + decode_queue.push(KeypointWithScore(root.point, root.id, root_score)); + + // Keeps track of the keypoints whose position has already been decoded. + std::vector<bool> keypoint_decoded(NUM_KEYPOINTS, false); + + while (!decode_queue.empty()) + { + // The top element in the queue is the next keypoint to be processed. + const KeypointWithScore current_keypoint = decode_queue.top(); + decode_queue.pop(); + + if (keypoint_decoded[current_keypoint.id]) + continue; + + pose_keypoints[current_keypoint.id] = current_keypoint.point; + keypoint_scores[current_keypoint.id] = current_keypoint.score; + + keypoint_decoded[current_keypoint.id] = true; + + // Add the children of the current keypoint that have not been decoded yet + // to the priority queue.
+ const int num_children = adjacency_list.child_ids[current_keypoint.id].size(); + for (int j = 0; j < num_children; ++j) + { + const int child_id = adjacency_list.child_ids[current_keypoint.id][j]; + int edge_id = adjacency_list.edge_ids[current_keypoint.id][j]; + if (keypoint_decoded[child_id]) + continue; + + // The mid-offsets block is organized as 4 blocks of NUM_EDGES: + // [fwd Y offsets][fwd X offsets][bwd Y offsets][bwd X offsets] + // OTOH edge_id is [0,NUM_EDGES) for forward edges and + // [NUM_EDGES, 2*NUM_EDGES) for backward edges. + // Thus if the edge is a backward edge (>=NUM_EDGES) then we need + // to start 16 indices later to be correctly aligned with the mid-offsets. + if (edge_id >= NUM_EDGES) + edge_id += NUM_EDGES; + + const Point child_point = find_displaced_position(short_offsets, mid_offsets, current_keypoint.point, + edge_id, child_id, offset_refinement_steps); + const float child_score = sample_tensor_at_single_channel(scores, child_point, NUM_KEYPOINTS, child_id); + decode_queue.emplace(child_point, child_id, child_score); + } + } +} + +void find_overlapping_keypoints(std::vector<bool> &mask, const PoseKeypoints &pose1, const PoseKeypoints &pose2, + const float squared_radius) +{ + for (unsigned int k = 0; k < mask.size(); ++k) + { + if (squared_distance(pose1[k], pose2[k]) <= squared_radius) + mask[k] = true; + } +} + +void perform_soft_keypoint_NMS(std::vector<float> &all_instance_scores, const std::vector<int> &decreasing_indices, + const std::vector<PoseKeypoints> &all_keypoint_coords, + const std::vector<PoseKeypointScores> &all_keypoint_scores, + const float squared_nms_radius) +{ + const int num_instances = decreasing_indices.size(); + + all_instance_scores.resize(num_instances); + // Indicates the occlusion status of the keypoints of the active instance. + std::vector<bool> keypoint_occluded(NUM_KEYPOINTS); + // Indices of the keypoints of the active instance in decreasing score value.
+ std::vector indices(NUM_KEYPOINTS); + for (int i = 0; i < num_instances; ++i) + { + const int current_index = decreasing_indices[i]; + // Find the keypoints of the current instance which are overlapping with + // the corresponding keypoints of the higher-scoring instances and + // zero-out their contribution to the score of the current instance. + std::fill(keypoint_occluded.begin(), keypoint_occluded.end(), false); + for (int j = 0; j < i; ++j) + { + const int previous_index = decreasing_indices[j]; + find_overlapping_keypoints(keypoint_occluded, all_keypoint_coords[current_index], + all_keypoint_coords[previous_index], squared_nms_radius); + } + // We compute the argsort keypoint indices based on the original keypoint + // scores, but we do not let them contribute to the instance score if they + // have been non-maximum suppressed. + decreasing_arg_sort(indices, all_keypoint_scores[current_index].data(), + all_keypoint_scores[current_index].size()); + float total_score = 0.0f; + for (unsigned int k = 0; k < NUM_KEYPOINTS; ++k) + { + if (!keypoint_occluded[indices[k]]) + total_score += all_keypoint_scores[current_index][indices[k]]; + } + all_instance_scores[current_index] = total_score / NUM_KEYPOINTS; + } +} + +} // namespace + +#define NAME "imx500_posenet" + +class PoseNet : public IMX500PostProcessingStage +{ +public: + PoseNet(RPiCamApp *app) : IMX500PostProcessingStage(app) {} + + char const *Name() const override; + + void Read(boost::property_tree::ptree const ¶ms) override; + + void Configure() override; + + bool Process(CompletedRequestPtr &completed_request) override; + +private: + std::vector decodeAllPoses(const std::vector &scores, const std::vector &short_offsets, + const std::vector &mid_offsets); + void translateCoordinates(std::vector &results, const Rectangle &scaler_crop) const; + void filterOutputObjects(const std::vector &results); + + struct LtResults + { + PoseResults results; + unsigned int visible; + unsigned int hidden; + bool 
matched; + }; + + std::vector lt_results_; + std::mutex lt_lock_; + + // Config params: + float threshold_; + unsigned int max_detections_; + unsigned int offset_refinement_steps_; + float nms_radius_; + bool temporal_filtering_; + + float tolerance_; + float factor_; + unsigned int visible_frames_; + unsigned int hidden_frames_; +}; + +char const *PoseNet::Name() const +{ + return NAME; +} + +void PoseNet::Read(boost::property_tree::ptree const ¶ms) +{ + max_detections_ = params.get("max_detections", 10); + threshold_ = params.get("threshold", 0.5f); + offset_refinement_steps_ = params.get("offset_refinement_steps", 5); + nms_radius_ = params.get("nms_radius", 10) / STRIDE; + + if (params.find("temporal_filter") != params.not_found()) + { + temporal_filtering_ = true; + tolerance_ = params.get("temporal_filter.tolerance", 0.05); + factor_ = params.get("temporal_filter.factor", 0.2); + visible_frames_ = params.get("temporal_filter.visible_frames", 5); + hidden_frames_ = params.get("temporal_filter.hidden_frames", 2); + } + else + temporal_filtering_ = false; + + IMX500PostProcessingStage::Read(params); +} + +void PoseNet::Configure() +{ + IMX500PostProcessingStage::Configure(); + IMX500PostProcessingStage::ShowFwProgressBar(); +} + +bool PoseNet::Process(CompletedRequestPtr &completed_request) +{ + auto scaler_crop = completed_request->metadata.get(controls::ScalerCrop); + if (!raw_stream_ || !scaler_crop) + { + LOG_ERROR("Must have RAW stream and scaler crop available to get sensor dimensions!"); + return false; + } + + auto output = completed_request->metadata.get(controls::rpi::CnnOutputTensor); + if (!output) + { + LOG_ERROR("No output tensor found in metadata!"); + return false; + } + + if (output->size() < NUM_HEATMAPS + NUM_SHORT_OFFSETS + NUM_MID_OFFSETS) + { + LOG_ERROR("Unexpected output tensor size: " << output->size()); + return false; + } + + std::vector scores = + format_tensor((float *)output->data(), NUM_HEATMAPS / (MAP_SIZE.width * 
MAP_SIZE.height), 1); + std::vector short_offsets = format_tensor((float *)output->data() + NUM_HEATMAPS, + NUM_SHORT_OFFSETS / (MAP_SIZE.width * MAP_SIZE.height), STRIDE); + std::vector mid_offsets = format_tensor((float *)output->data() + NUM_HEATMAPS + NUM_SHORT_OFFSETS, + NUM_MID_OFFSETS / (MAP_SIZE.width * MAP_SIZE.height), STRIDE); + + std::vector results = decodeAllPoses(scores, short_offsets, mid_offsets); + translateCoordinates(results, *scaler_crop); + + std::vector> locations; + std::vector> confidences; + + if (temporal_filtering_) + { + // Process() can be concurrently called through different threads for consecutive CompletedRequests if + // things are running behind. So protect access to the lt_results_ state object. + std::scoped_lock l(lt_lock_); + + filterOutputObjects(results); + for (auto const <_r : lt_results_) + { + if (lt_r.hidden) + continue; + + locations.push_back({}); + confidences.push_back({}); + + for (auto const &s : lt_r.results.pose_keypoint_scores) + confidences.back().push_back(s); + for (auto const &k : lt_r.results.pose_keypoints) + locations.back().emplace_back(k.x, k.y); + } + } + else if (results.size()) + { + for (auto const &result : results) + { + locations.push_back({}); + confidences.push_back({}); + + for (auto const &s : result.pose_keypoint_scores) + confidences.back().push_back(s); + for (auto const &k : result.pose_keypoints) + locations.back().emplace_back(k.x, k.y); + } + } + + if (locations.size() && locations.size() == confidences.size()) + { + completed_request->post_process_metadata.Set("pose_estimation.locations", locations); + completed_request->post_process_metadata.Set("pose_estimation.confidences", confidences); + } + + return IMX500PostProcessingStage::Process(completed_request); +} + +// Decodes poses from the score map, the short and mid offsets. +// "Block space" refers to the output y and z size of the network. 
+// For example if the network that takes a (353,481) (y,x) input image will have +// an output field of 23, 31. Thus the sizes of the input vectors to this +// functions will be +// scores: 23 x 31 x NUM_KEYPOINTS +// short_offsets 23 x 31 x 2 x NUM_KEYPOINTS (x and y per keypoint) +// mid_offsets 23 x 31 x 2 x 2 2 NUM_EDGES (x and y for each fwd and bwd edge) +// Thus height and width need to be set to 23 and 31 respectively. +// nms_radius must also be given in these units. +// The output coordinates will be in pixel coordinates. +// +// For details see https://arxiv.org/abs/1803.08225 +// PersonLab: Person Pose Estimation and Instance Segmentation with a +// Bottom-Up, Part-Based, Geometric Embedding Model +// George Papandreou, Tyler Zhu, Liang-Chieh Chen, Spyros Gidaris, +// Jonathan Tompson, Kevin Murphy +std::vector PoseNet::decodeAllPoses(const std::vector &scores, + const std::vector &short_offsets, + const std::vector &mid_offsets) +{ + const float min_score_logit = log_odds(threshold_); + + KeypointQueue queue = build_keypoint_queue(scores, short_offsets, min_score_logit); + AdjacencyList adjacency_list = build_agency_list(); + + std::vector indices(NUM_KEYPOINTS); + + // Generate at most max_detections object instances per image in decreasing + // root part score order. + std::vector all_instance_scores; + + std::vector scratch_poses(NUM_KEYPOINTS); + std::vector scratch_keypoint_scores(max_detections_); + + unsigned int pose_counter = 0; + while (pose_counter < max_detections_ && !queue.empty()) + { + // The top element in the queue is the next root candidate. + const KeypointWithScore root = queue.top(); + queue.pop(); + + // Reject a root candidate if it is within a disk of nms_radius_ pixels + // from the corresponding part of a previously detected instance. 
+ if (!pass_keypoint_nms(scratch_poses, pose_counter, root, nms_radius_ * nms_radius_)) + continue; + + auto &next_pose = scratch_poses[pose_counter]; + auto &next_scores = scratch_keypoint_scores[pose_counter]; + for (unsigned int k = 0; k < NUM_KEYPOINTS; ++k) + { + next_pose[k].x = -1.0f; + next_pose[k].y = -1.0f; + next_scores[k] = -1E5; + } + + backtrack_decode_pose(scores, short_offsets, mid_offsets, root, adjacency_list, next_pose, next_scores, + offset_refinement_steps_); + + // Convert keypoint-level scores from log-odds to probabilities and compute + // an initial instance-level score as the average of the scores of the top-k + // scoring keypoints. + for (unsigned int k = 0; k < NUM_KEYPOINTS; ++k) + next_scores[k] = sigmoid(next_scores[k]); + + decreasing_arg_sort(indices, next_scores.data(), next_scores.size()); + + float instance_score = 0.0f; + for (unsigned int j = 0; j < NUM_KEYPOINTS; ++j) + instance_score += next_scores[indices[j]]; + + instance_score /= NUM_KEYPOINTS; + + if (instance_score >= threshold_) + { + pose_counter++; + all_instance_scores.push_back(instance_score); + } + } + + // Sort the detections in decreasing order of their instance-level scores. + std::vector decreasing_indices; + decreasing_arg_sort(decreasing_indices, all_instance_scores); + + // Keypoint-level soft non-maximum suppression and instance-level rescoring as + // the average of the top-k keypoints in terms of their keypoint-level scores. + perform_soft_keypoint_NMS(all_instance_scores, decreasing_indices, scratch_poses, scratch_keypoint_scores, + nms_radius_ * nms_radius_); + + // Sort the detections in decreasing order of their final instance-level + // scores. Usually the order does not change but this is not guaranteed. + decreasing_arg_sort(decreasing_indices, all_instance_scores); + + std::vector results; + for (int index : decreasing_indices) + { + if (all_instance_scores[index] < threshold_) + break; + + // New result. 
+ results.push_back({}); + + // Rescale keypoint coordinates into pixel space (much more useful for user). + for (unsigned int k = 0; k < NUM_KEYPOINTS; ++k) + { + results.back().pose_keypoints[k].y = scratch_poses[index][k].y * STRIDE; + results.back().pose_keypoints[k].x = scratch_poses[index][k].x * STRIDE; + } + + std::copy(scratch_keypoint_scores[index].begin(), scratch_keypoint_scores[index].end(), + results.back().pose_keypoint_scores.begin()); + results.back().pose_score = all_instance_scores[index]; + } + + return results; +} + +void PoseNet::translateCoordinates(std::vector &results, const Rectangle &scaler_crop) const +{ + for (auto &r : results) + { + for (auto &keypoint : r.pose_keypoints) + { + std::vector coords{ keypoint.x / (INPUT_TENSOR_SIZE.width - 1), + keypoint.y / (INPUT_TENSOR_SIZE.height - 1), + 0, 0 }; + Rectangle translated = ConvertInferenceCoordinates(coords, scaler_crop); + keypoint.x = translated.x; + keypoint.y = translated.y; + } + } +} + +void PoseNet::filterOutputObjects(const std::vector &results) +{ + const Size isp_output_size = output_stream_->configuration().size; + + for (auto <_r : lt_results_) + lt_r.matched = false; + + for (auto const &r : results) + { + bool matched = false; + for (auto <_r : lt_results_) + { + // Try and match a detected object in our long term list. 
+ bool found = true; + for (unsigned int i = 0; i < NUM_KEYPOINTS; i++) + { + if (std::abs(lt_r.results.pose_keypoints[i].x - r.pose_keypoints[i].x) > + tolerance_ * isp_output_size.width || + std::abs(lt_r.results.pose_keypoints[i].y - r.pose_keypoints[i].y) > + tolerance_ * isp_output_size.height) + { + found = false; + break; + } + } + + if (found) + { + lt_r.matched = matched = true; + lt_r.results.pose_score = r.pose_score; + + for (unsigned int i = 0; i < NUM_KEYPOINTS; i++) + { + lt_r.results.pose_keypoint_scores[i] = + factor_ * r.pose_keypoint_scores[i] + (1 - factor_) * lt_r.results.pose_keypoint_scores[i]; + lt_r.results.pose_keypoints[i].x = + factor_ * r.pose_keypoints[i].x + (1 - factor_) * lt_r.results.pose_keypoints[i].x; + lt_r.results.pose_keypoints[i].y = + factor_ * r.pose_keypoints[i].y + (1 - factor_) * lt_r.results.pose_keypoints[i].y; + } + + // Reset the visibility counter for when the result next disappears. + lt_r.visible = visible_frames_; + // Decrement the hidden counter until the result becomes visible in the list. + lt_r.hidden = std::max(0, (int)lt_r.hidden - 1); + break; + } + } + + // Add the result to the long term list if not found. This result will remain hidden for hidden_frames_ + // consecutive frames. + if (!matched) + lt_results_.push_back({ r, visible_frames_, hidden_frames_, true }); + } + + for (auto <_r : lt_results_) + { + if (!lt_r.matched) + { + // If a non matched result in the long term list is still hidden, set visible count to 0 so that it must be + // matched for hidden_frames_ consecutive frames before becoming visible. Otherwise, decrement the visible + // count of unmatched objects in the long term list. + if (lt_r.hidden) + lt_r.visible = 0; + else + lt_r.visible--; + } + } + + // Remove now invisible objects from the long term list. 
+ lt_results_.erase(std::remove_if(lt_results_.begin(), lt_results_.end(), + [] (const LtResults &obj) { return !obj.matched && !obj.visible; }), + lt_results_.end()); +} + +PostProcessingStage *Create(RPiCamApp *app) +{ + return new PoseNet(app); +} + +RegisterStage reg(NAME, &Create); diff --git a/post_processing_stages/imx500/imx500_post_processing_stage.cpp b/post_processing_stages/imx500/imx500_post_processing_stage.cpp new file mode 100644 index 00000000..5114871f --- /dev/null +++ b/post_processing_stages/imx500/imx500_post_processing_stage.cpp @@ -0,0 +1,280 @@ +/* SPDX-License-Identifier: BSD-2-Clause */ +/* + * Copyright (C) 2024, Raspberry Pi Ltd + * + * imx500_post_rpocessing_stage.cpp - IMX500 post processing stage base class + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "imx500_post_processing_stage.hpp" + +#include + +using Stream = libcamera::Stream; +using Rectangle = libcamera::Rectangle; +using Size = libcamera::Size; + +using namespace std::chrono_literals; + +namespace fs = std::filesystem; + +namespace +{ + +const unsigned int ROI_CTRL_ID = 0x00982900; +const unsigned int NETWORK_FW_CTRL_ID = 0x00982901; + +inline int16_t conv_reg_signed(int16_t reg) +{ + constexpr unsigned int ROT_DNN_NORM_SIGNED_SHT = 8; + constexpr unsigned int ROT_DNN_NORM_MASK = 0x01FF; + + if (!((reg >> ROT_DNN_NORM_SIGNED_SHT) & 1)) + return reg; + else + return -((~reg + 1) & ROT_DNN_NORM_MASK); +} + +} // namespace + + +IMX500PostProcessingStage::IMX500PostProcessingStage(RPiCamApp *app) + : PostProcessingStage(app), device_fd_(-1) +{ + for (unsigned int i = 0; i < 16; i++) + { + const fs::path test_dir { "/sys/class/video4linux/v4l-subdev" + std::to_string(i) + "/device" }; + const fs::path module_dir { test_dir.string() + "/driver/module" }; + const fs::path id_dir { test_dir.string() + "/of_node" }; + + if (fs::exists(module_dir) && fs::is_symlink(module_dir)) + { + fs::path ln = 
fs::read_symlink(module_dir); + if (ln.string().find("imx500") != std::string::npos) + { + const std::string dev_node { "/dev/v4l-subdev" + std::to_string(i) }; + device_fd_ = open(dev_node.c_str(), O_RDONLY, 0); + + /* Find the progress indicator sysfs dev nodes. */ + const std::string test_dir_str = fs::read_symlink(test_dir).string(); + const std::size_t pos = test_dir_str.find_last_of("/") + 1; + assert(pos != std::string::npos); + + const std::string imx500_device_id = test_dir_str.substr(pos); + std::string spi_device_id = imx500_device_id; + const std::size_t rep = spi_device_id.find("001a"); + spi_device_id.replace(rep, 4, "0040"); + + const fs::path imx500_progress { "/sys/kernel/debug/imx500-fw:" + imx500_device_id + "/fw_progress" }; + const fs::path spi_progress { "/sys/kernel/debug/rp2040-spi:" + spi_device_id + "/transfer_progress" }; + + fw_progress_.open(imx500_progress.c_str(), std::ios_base::in); + fw_progress_chunk_.open(spi_progress.c_str(), std::ios_base::in); + + break; + } + } + } + + if (device_fd_ < 0) + throw std::runtime_error("Cannot open imx500 device node"); + + SetInferenceRoiAbs({ 0, 0, 4056, 3040 }); +} + +IMX500PostProcessingStage::~IMX500PostProcessingStage() +{ + if (device_fd_ >= 0) + close(device_fd_); +} + +void IMX500PostProcessingStage::Read(boost::property_tree::ptree const ¶ms) +{ + if (params.find("save_input_tensor") != params.not_found()) + { + auto const &pt = params.get_child("save_input_tensor"); + + std::string filename = pt.get("filename"); + num_input_tensors_saved_ = pt.get("num_tensors", 1); + input_tensor_file_ = std::ofstream(filename, std::ios::out | std::ios::binary); + + norm_val_ = PostProcessingStage::GetJsonArray(pt, "norm_val", { 0, 0, 0, 0 }); + norm_shift_ = PostProcessingStage::GetJsonArray(pt, "norm_shift", { 0, 0, 0, 0 }); + div_val_ = PostProcessingStage::GetJsonArray(pt, "div_val", { 1, 1, 1, 1 }); + div_shift_ = pt.get("div_shift", 0); + } + + /* Load the network firmware. 
*/ + std::string network_file = params.get("network_file"); + if (!fs::exists(network_file)) + throw std::runtime_error(network_file + " not found!"); + + int fd = open(network_file.c_str(), O_RDONLY, 0); + + v4l2_control ctrl { NETWORK_FW_CTRL_ID, fd }; + int ret = ioctl(device_fd_, VIDIOC_S_CTRL, &ctrl); + if (ret) + throw std::runtime_error("failed to set network fw ioctl"); + + close(fd); + + LOG(1, "\n------------------------------------------------------------------------------------------------------------------\n" + "NOTE: Loading network firmware onto the IMX500 can take several minutes, please do not close down the application." + "\n------------------------------------------------------------------------------------------------------------------\n"); +} + +void IMX500PostProcessingStage::Configure() +{ + output_stream_ = app_->GetMainStream(); + raw_stream_ = app_->RawStream(); + save_frames_ = num_input_tensors_saved_; +} + +bool IMX500PostProcessingStage::Process(CompletedRequestPtr &completed_request) +{ + auto input = completed_request->metadata.get(controls::rpi::CnnInputTensor); + + if (input && input_tensor_file_.is_open()) + { + // There is a chance that this may be called through multiple threads, so serialize the file access. + std::scoped_lock l(lock_); + + for (unsigned int i = 0; i < input->size(); i++) + { + const unsigned int channel = i % 3; // Assume RGB interleaved format. + int16_t sample = static_cast(input->data()[i]); + sample = (sample << norm_shift_[channel]) - conv_reg_signed(norm_val_[channel]); + sample = ((sample << div_shift_) / div_val_[channel]) & 0xFF; + input_tensor_file_.put(static_cast(sample)); + } + + if (--save_frames_ == 0) + input_tensor_file_.close(); + } + + return false; +} + +Rectangle IMX500PostProcessingStage::ConvertInferenceCoordinates(const std::vector &coords, + const Rectangle &scaler_crop) const +{ + // Convert the inference image co-ordinates into the final ISP output co-ordinates. 
+ const Size &isp_output_size = output_stream_->configuration().size; + const Size &sensor_output_size = raw_stream_->configuration().size; + const Rectangle sensor_crop = scaler_crop.scaledBy(sensor_output_size, full_sensor_resolution_.size()); + + if (coords.size() != 4) + return {}; + + // Object scaled to the full sensor resolution + Rectangle obj; + obj.x = std::round(coords[0] * (full_sensor_resolution_.width - 1)); + obj.y = std::round(coords[1] * (full_sensor_resolution_.height - 1)); + obj.width = std::round(coords[2] * (full_sensor_resolution_.width - 1)); + obj.height = std::round(coords[3] * (full_sensor_resolution_.height - 1)); + + // Object on inference image -> sensor image + const Rectangle obj_sensor = obj.scaledBy(sensor_output_size, full_sensor_resolution_.size()); + // -> bounded to the ISP crop on the sensor image + const Rectangle obj_bound = obj_sensor.boundedTo(sensor_crop); + // -> translated by the start of the crop offset + const Rectangle obj_translated = obj_bound.translatedBy(-sensor_crop.topLeft()); + // -> and finally scaled to the ISP output. 
+ const Rectangle obj_scaled = obj_translated.scaledBy(isp_output_size, sensor_crop.size()); + + LOG(2, obj << " -> (sensor) " << obj_sensor << " -> (bound) " << obj_bound + << " -> (translate) " << obj_translated << " -> (scaled) " << obj_scaled); + + return obj_scaled; +} + +void IMX500PostProcessingStage::SetInferenceRoiAbs(const Rectangle &roi_) const +{ + Rectangle roi = roi_.boundedTo(full_sensor_resolution_); + const uint32_t roi_array[4] = { (uint32_t)roi.x, (uint32_t)roi.y, (uint32_t)roi.width, (uint32_t)roi.height }; + + v4l2_ext_control roi_ctrl; + roi_ctrl.id = ROI_CTRL_ID; + roi_ctrl.p_u32 = (unsigned int *)&roi_array; + roi_ctrl.size = 16; + + v4l2_ext_controls ctrls; + ctrls.count = 1; + ctrls.controls = (v4l2_ext_control *)&roi_ctrl; + + int ret = ioctl(device_fd_, VIDIOC_S_EXT_CTRLS, &ctrls); + if (ret) + LOG_ERROR("IMX500: Unable to set absolute ROI"); +} + +void IMX500PostProcessingStage::SetInferenceRoiAuto(const unsigned int width, const unsigned int height) const +{ + Size s = full_sensor_resolution_.size().boundedToAspectRatio(Size(width, height)); + Rectangle r = s.centeredTo(full_sensor_resolution_.center()).enclosedIn(full_sensor_resolution_); + SetInferenceRoiAbs(r); +} + +void IMX500PostProcessingStage::ShowFwProgressBar() +{ + if (fw_progress_.is_open() && fw_progress_chunk_.is_open()) + { + std::thread progress_thread { &IMX500PostProcessingStage::doProgressBar, this }; + progress_thread.detach(); + } +} + +std::vector split(std::stringstream &stream) +{ + std::vector result; + unsigned int n; + + while (stream >> n) + result.push_back(n); + + return result; +} + +void IMX500PostProcessingStage::doProgressBar() +{ + while (1) + { + std::stringstream fw_progress_str, block_str; + unsigned int block_progress; + + fw_progress_.seekg(0); + fw_progress_chunk_.seekg(0); + fw_progress_str << fw_progress_.rdbuf(); + block_str << fw_progress_chunk_.rdbuf(); + + std::vector progress = split(fw_progress_str); + block_str >> block_progress; + + // 
[0] == FW state, [1] == current size, [2] == total size. + if (progress.size() == 3 && progress[0] == 2) + { + unsigned int current = progress[1] + block_progress; + std::cerr << "Network Firmware Upload: " << current * 100 / progress[2] << "% (" << current / 1024 << "/" + << progress[2] / 1024 << " KB)\r"; + std::cerr.flush(); + + if (progress[2] && progress[1] == progress[2]) + { + std::cerr << std::endl; + break; + } + } + + std::this_thread::sleep_for(500ms); + } +} diff --git a/post_processing_stages/imx500/imx500_post_processing_stage.hpp b/post_processing_stages/imx500/imx500_post_processing_stage.hpp new file mode 100644 index 00000000..4fcd76b5 --- /dev/null +++ b/post_processing_stages/imx500/imx500_post_processing_stage.hpp @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: BSD-2-Clause */ +/* + * Copyright (C) 2024, Raspberry Pi Ltd + * + * imx500_post_rpocessing_stage.hpp - IMX500 post processing stage base class + */ + +#pragma once + +#include +#include + +#include + +#include +#include + +#include "core/completed_request.hpp" +#include "core/rpicam_app.hpp" +#include "post_processing_stages/post_processing_stage.hpp" + +class IMX500PostProcessingStage : public PostProcessingStage +{ +public: + static constexpr unsigned int Max_Num_Tensors = 8; + static constexpr unsigned int Network_Name_Len = 64; + + struct OutputTensorInfo + { + uint32_t tensor_data_num; + uint16_t size; + uint8_t ordinal; + uint8_t serialization_index; + }; + + struct CnnOutputTensorInfo + { + char network_name[Network_Name_Len]; + uint32_t num_tensors; + OutputTensorInfo info[Max_Num_Tensors]; + }; + + IMX500PostProcessingStage(RPiCamApp *app); + ~IMX500PostProcessingStage(); + + void Read(boost::property_tree::ptree const ¶ms) override; + + void Configure() override; + + bool Process(CompletedRequestPtr &completed_request) override; + + libcamera::Rectangle ConvertInferenceCoordinates(const std::vector &coords, + const libcamera::Rectangle &scalerCrop) const; + void 
SetInferenceRoiAbs(const libcamera::Rectangle &roi_) const; + void SetInferenceRoiAuto(const unsigned int width, const unsigned int height) const; + void ShowFwProgressBar(); + +protected: + libcamera::Rectangle full_sensor_resolution_ = libcamera::Rectangle(0, 0, 4056, 3040); + libcamera::Stream *output_stream_; + libcamera::Stream *raw_stream_; + +private: + void doProgressBar(); + + int device_fd_; + std::ifstream fw_progress_; + std::ifstream fw_progress_chunk_; + + std::ofstream input_tensor_file_; + unsigned int num_input_tensors_saved_; + unsigned int save_frames_; + std::vector norm_val_; + std::vector norm_shift_; + std::vector div_val_; + unsigned int div_shift_; + std::mutex lock_; +}; diff --git a/post_processing_stages/imx500/meson.build b/post_processing_stages/imx500/meson.build new file mode 100644 index 00000000..985fbc8f --- /dev/null +++ b/post_processing_stages/imx500/meson.build @@ -0,0 +1,31 @@ +imx500_postprocessing_src = files([ + # Base stage + 'imx500_post_processing_stage.cpp', + # Object detection + 'imx500_object_detection.cpp', + # Posenet + 'imx500_posenet.cpp', +]) + +postproc_assets += files([ + assets_dir / 'imx500_mobilenet_ssd.json', + assets_dir / 'imx500_posenet.json', +]) + +imx500_postprocessing_lib = shared_module('imx500-postproc', imx500_postprocessing_src, + dependencies : libcamera_dep, + include_directories : '../..', + install : true, + install_dir : posproc_libdir, + name_prefix : '', + ) + +if get_option('download_imx500_models') + download_script = meson.project_source_root() / 'utils' / 'download-imx500-models.sh' + custom_target('imx500-models', + command : [ download_script, '@OUTPUT@' ], + output : 'imx500-models', + install : true, + install_dir : get_option('datadir'), + ) +endif diff --git a/post_processing_stages/meson.build b/post_processing_stages/meson.build index 1dd6edf7..aa101413 100644 --- a/post_processing_stages/meson.build +++ b/post_processing_stages/meson.build @@ -109,6 +109,11 @@ if 
hailort_dep.found() and hailo_tappas_dep.found()
     enable_hailo = true
 endif
 
+# IMX500 postprocessing stages.
+if get_option('enable_imx500')
+    subdir('imx500')
+endif
+
 post_processing_headers = files([
     'histogram.hpp',
     'object_detect.hpp',
diff --git a/post_processing_stages/post_processing_stage.hpp b/post_processing_stages/post_processing_stage.hpp
index e745a491..30237ba5 100644
--- a/post_processing_stages/post_processing_stage.hpp
+++ b/post_processing_stages/post_processing_stage.hpp
@@ -5,9 +5,12 @@
  * post_processing_stage.hpp - Post processing stage base class definition.
  */
 
+#pragma once
+
 #include <chrono>
 #include <map>
 #include <string>
+#include <vector>
 
 // Prevents compiler warnings in Boost headers with more recent versions of GCC.
 #define BOOST_BIND_GLOBAL_PLACEHOLDERS
@@ -71,6 +74,24 @@ class PostProcessingStage
 		return std::chrono::duration<double>(t2 - t1);
 	}
 
+	// Read a JSON array under "key" into a vector<T>; any missing trailing
+	// entries are filled from default_value so callers always see at least
+	// default_value.size() elements.
+	// NOTE(review): template syntax was stripped during extraction and has
+	// been reconstructed — confirm against the original file.
+	template <typename T>
+	static std::vector<T> GetJsonArray(const boost::property_tree::ptree &pt, const std::string &key,
+									   const std::vector<T> &default_value = {})
+	{
+		std::vector<T> vec;
+
+		if (pt.find(key) != pt.not_found())
+		{
+			for (auto &v : pt.get_child(key))
+				vec.push_back(v.second.get_value<T>());
+		}
+
+		for (unsigned int i = vec.size(); i < default_value.size(); i++)
+			vec.push_back(default_value[i]);
+
+		return vec;
+	}
+
 	RPiCamApp *app_;
 };
diff --git a/utils/download-imx500-models.sh b/utils/download-imx500-models.sh
new file mode 100755
index 00000000..04e2ef3f
--- /dev/null
+++ b/utils/download-imx500-models.sh
@@ -0,0 +1,42 @@
#!/bin/bash
# Download the published IMX500 network firmware blobs into the given directory.
set -e

networks=(
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_deeplabv3plus.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_efficientdet_lite0_pp.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_efficientnet_bo.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_efficientnet_lite0.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_efficientnetv2_b0.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_efficientnetv2_b1.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_efficientnetv2_b2.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_higherhrnet_coco.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_inputtensoronly.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_levit_128s.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_mnasnet1.0.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_mobilenet_v2.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_mobilevit_xs.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_mobilevit_xxs.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_nanodet_plus_416x416_pp.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_nanodet_plus_416x416.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_posenet.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_regnetx_002.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_regnety_002.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_regnety_004.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_resnet18.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_shufflenet_v2_x1_5.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_squeezenet1.0.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_ssd_mobilenetv2_fpnlite_320x320_pp.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_yolov8n_pp.rpk
)

# The "<output dir>" text below was stripped during extraction; restored.
if [ $# -ne 1 ]; then
    echo "Usage: $0 <output dir>"
    exit 1
fi

dir=$1
# Quote all expansions so paths with spaces work (the original was unquoted).
mkdir -p "$dir"

for url in "${networks[@]}"; do
    wget -nv -N -P "$dir" "$url"
done