diff --git a/assets/imx500_mobilenet_ssd.json b/assets/imx500_mobilenet_ssd.json new file mode 100644 index 00000000..0a45abf4 --- /dev/null +++ b/assets/imx500_mobilenet_ssd.json @@ -0,0 +1,123 @@ +{ + "imx500_object_detection": + { + "max_detections" : 5, + "threshold" : 0.6, + "network_file": "/usr/share/imx500-models/imx500_network_ssd_mobilenetv2_fpnlite_320x320_pp.rpk", + + "save_input_tensor": + { + "filename": "/home/pi/input_tensor.raw", + "num_tensors": 10, + "norm_val": [384, 384, 384, 0], + "norm_shift": [0, 0, 0, 0] + }, + + "temporal_filter": + { + "tolerance": 0.1, + "factor": 0.2, + "visible_frames": 4, + "hidden_frames": 2 + }, + + "classes": + [ + "person", + "bicycle", + "car", + "motorcycle", + "airplane", + "bus", + "train", + "truck", + "boat", + "traffic light", + "fire hydrant", + "-", + "stop sign", + "parking meter", + "bench", + "bird", + "cat", + "dog", + "horse", + "sheep", + "cow", + "elephant", + "bear", + "zebra", + "giraffe", + "-", + "backpack", + "umbrella", + "-", + "-", + "handbag", + "tie", + "suitcase", + "frisbee", + "skis", + "snowboard", + "sports ball", + "kite", + "baseball bat", + "baseball glove", + "skateboard", + "surfboard", + "tennis racket", + "bottle", + "-", + "wine glass", + "cup", + "fork", + "knife", + "spoon", + "bowl", + "banana", + "apple", + "sandwich", + "orange", + "broccoli", + "carrot", + "hot dog", + "pizza", + "donut", + "cake", + "chair", + "couch", + "potted plant", + "bed", + "-", + "dining table", + "-", + "-", + "toilet", + "-", + "tv", + "laptop", + "mouse", + "remote", + "keyboard", + "cell phone", + "microwave", + "oven", + "toaster", + "sink", + "refrigerator", + "-", + "book", + "clock", + "vase", + "scissors", + "teddy bear", + "hair drier", + "toothbrush" + ] + }, + + "object_detect_draw_cv": + { + "line_thickness" : 2 + } +} diff --git a/assets/imx500_posenet.json b/assets/imx500_posenet.json new file mode 100644 index 00000000..c008f2a5 --- /dev/null +++ b/assets/imx500_posenet.json @@ 
-0,0 +1,29 @@ +{ + "imx500_posenet": + { + "max_detections" : 5, + "threshold" : 0.4, + "offset_refinement_steps": 5, + "nms_radius": 10.0, + "network_file": "/usr/share/imx500-models/imx500_network_posenet.rpk", + + "save_input_tensor": + { + "filename": "/home/pi/posenet_input_tensor.raw", + "num_tensors": 10 + }, + + "temporal_filter": + { + "tolerance": 0.3, + "factor": 0.3, + "visible_frames": 8, + "hidden_frames": 2 + } + }, + + "plot_pose_cv": + { + "confidence_threshold" : 0.2 + } +} diff --git a/meson.build b/meson.build index 7d4c61c8..1a742df3 100644 --- a/meson.build +++ b/meson.build @@ -80,5 +80,6 @@ summary({ 'OpenCV postprocessing' : enable_opencv, 'TFLite postprocessing' : enable_tflite, 'Hailo postprocessing' : enable_hailo, + 'IMX500 postprocessing' : get_option('enable_imx500'), }, bool_yn : true, section : 'Build configuration') diff --git a/meson_options.txt b/meson_options.txt index 13b6de16..0f24d546 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -43,3 +43,13 @@ option('download_hailo_models', type : 'boolean', value : true, description : 'Download and install the Hailo postprocessing models') + +option('enable_imx500', + type : 'boolean', + value : true, + description : 'Enable IMX500 postprocessing support') + +option('download_imx500_models', + type : 'boolean', + value : true, + description : 'Download and install the IMX500 postprocessing models') diff --git a/post_processing_stages/imx500/imx500_object_detection.cpp b/post_processing_stages/imx500/imx500_object_detection.cpp new file mode 100644 index 00000000..310340f9 --- /dev/null +++ b/post_processing_stages/imx500/imx500_object_detection.cpp @@ -0,0 +1,337 @@ +/* SPDX-License-Identifier: BSD-2-Clause */ +/* + * Copyright (C) 2024, Raspberry Pi Ltd + * + * imx500_object_detection.cpp - IMX500 object detection for various networks + */ + +#include +#include +#include +#include +#include + +#include +#include + +#include "core/rpicam_app.hpp" +#include 
"post_processing_stages/object_detect.hpp" +#include "post_processing_stages/post_processing_stage.hpp" + +#include "imx500_post_processing_stage.hpp" + +using Rectangle = libcamera::Rectangle; +using Size = libcamera::Size; +namespace controls = libcamera::controls; + +#define NAME "imx500_object_detection" + +struct Bbox +{ + float x0; + float y0; + float x1; + float y1; +}; + +struct ObjectDetectionOutput +{ + unsigned int num_detections = 0; + std::vector bboxes; + std::vector scores; + std::vector classes; +}; + +class ObjectDetection : public IMX500PostProcessingStage +{ +public: + ObjectDetection(RPiCamApp *app) : IMX500PostProcessingStage(app) {} + + char const *Name() const override; + + void Read(boost::property_tree::ptree const ¶ms) override; + + void Configure() override; + + bool Process(CompletedRequestPtr &completed_request) override; + +private: + int processOutputTensor(std::vector &objects, const std::vector &output_tensor, + const CnnOutputTensorInfo &output_tensor_info, const Rectangle &scaler_crop) const; + void filterOutputObjects(std::vector &objects); + + struct LtObject + { + Detection params; + unsigned int visible; + unsigned int hidden; + bool matched; + }; + + std::vector lt_objects_; + std::mutex lt_lock_; + + // Config params + unsigned int max_detections_; + float threshold_; + std::vector classes_; + bool temporal_filtering_; + float tolerance_; + float factor_; + unsigned int visible_frames_; + unsigned int hidden_frames_; +}; + +char const *ObjectDetection::Name() const +{ + return NAME; +} + +void ObjectDetection::Read(boost::property_tree::ptree const ¶ms) +{ + max_detections_ = params.get("max_detections"); + threshold_ = params.get("threshold", 0.5f); + classes_ = PostProcessingStage::GetJsonArray(params, "classes"); + + if (params.find("temporal_filter") != params.not_found()) + { + temporal_filtering_ = true; + tolerance_ = params.get("temporal_filter.tolerance", 0.05); + factor_ = params.get("temporal_filter.factor", 0.2); 
+ visible_frames_ = params.get("temporal_filter.visible_frames", 5); + hidden_frames_ = params.get("temporal_filter.hidden_frames", 2); + } + else + temporal_filtering_ = false; + + IMX500PostProcessingStage::Read(params); +} + +void ObjectDetection::Configure() +{ + IMX500PostProcessingStage::Configure(); + IMX500PostProcessingStage::ShowFwProgressBar(); +} + +bool ObjectDetection::Process(CompletedRequestPtr &completed_request) +{ + auto scaler_crop = completed_request->metadata.get(controls::ScalerCrop); + if (!raw_stream_ || !scaler_crop) + { + LOG_ERROR("Must have RAW stream and scaler crop available to get sensor dimensions!"); + return false; + } + + auto output = completed_request->metadata.get(controls::rpi::CnnOutputTensor); + auto info = completed_request->metadata.get(controls::rpi::CnnOutputTensorInfo); + std::vector objects; + + // Process() can be concurrently called through different threads for consecutive CompletedRequests if + // things are running behind. So protect access to the lt_objects_ state object. + std::scoped_lock l(lt_lock_); + + if (output && info) + { + std::vector output_tensor(output->data(), output->data() + output->size()); + CnnOutputTensorInfo output_tensor_info = *reinterpret_cast(info->data()); + + processOutputTensor(objects, output_tensor, output_tensor_info, *scaler_crop); + + if (temporal_filtering_) + { + filterOutputObjects(objects); + if (lt_objects_.size()) + { + objects.clear(); + for (auto const &obj : lt_objects_) + { + if (!obj.hidden) + objects.push_back(obj.params); + } + } + } + else + { + // If no temporal filtering, make sure we fill lt_objects_ with the current result as it may be used if + // there is no output tensor on subsequent frames. + for (auto const &obj : objects) + lt_objects_.push_back({ obj, 0, 0, 0 }); + } + } + else + { + // No output tensor, so simply reuse the results from the lt_objects_. 
+ for (auto const &obj : lt_objects_) + { + if (!obj.hidden) + objects.push_back(obj.params); + } + } + + if (objects.size()) + completed_request->post_process_metadata.Set("object_detect.results", objects); + + return IMX500PostProcessingStage::Process(completed_request); +} + +static int createObjectDetectionData(ObjectDetectionOutput &output, const std::vector &data, + unsigned int total_detections) +{ + unsigned int count = 0; + + // Extract bounding box co-ordinates + for (unsigned int i = 0; i < total_detections; i++) + { + Bbox bbox; + bbox.y0 = data.at(count + i); + bbox.x0 = data.at(count + i + (1 * total_detections)); + bbox.y1 = data.at(count + i + (2 * total_detections)); + bbox.x1 = data.at(count + i + (3 * total_detections)); + output.bboxes.push_back(bbox); + } + count += (total_detections * 4); + + // Extract scores + for (unsigned int i = 0; i < total_detections; i++) + { + output.scores.push_back(data.at(count)); + count++; + } + + // Extract class indices + for (unsigned int i = 0; i < total_detections; i++) + { + output.classes.push_back(data.at(count)); + count++; + } + + // Extract number of detections + unsigned int num_detections = data.at(count); + if (num_detections > total_detections) + { + LOG(1, "Unexpected value for num_detections: " << num_detections << ", setting it to " << total_detections); + num_detections = total_detections; + } + + output.num_detections = num_detections; + return 0; +} + +int ObjectDetection::processOutputTensor(std::vector &objects, const std::vector &output_tensor, + const CnnOutputTensorInfo &output_tensor_info, + const Rectangle &scaler_crop) const +{ + if (output_tensor_info.num_tensors != 4) + { + LOG_ERROR("Invalid number of tensors " << output_tensor_info.num_tensors << ", expected 4"); + return -1; + } + + const unsigned int total_detections = output_tensor_info.info[0].tensor_data_num / 4; + ObjectDetectionOutput output; + + // 4x coords + 1x labels + 1x confidences + 1 total detections + if 
(output_tensor.size() != 6 * total_detections + 1) + { + LOG_ERROR("Invalid tensor size " << output_tensor.size() << ", expected " << 6 * total_detections + 1); + return -1; + } + + int ret = createObjectDetectionData(output, output_tensor, total_detections); + if (ret) + { + LOG_ERROR("Failed to create object detection data"); + return -1; + } + + for (unsigned int i = 0; i < std::min(output.num_detections, max_detections_); i++) + { + uint8_t class_index = (uint8_t)output.classes[i]; + + // Filter detections + if (output.scores[i] < threshold_ || class_index >= classes_.size()) + continue; + + // Extract bounding box co-ordinates in the inference image co-ordinates and convert to the final ISP output + // co-ordinates. + std::vector<float> coords{ output.bboxes[i].x0, output.bboxes[i].y0, + output.bboxes[i].x1 - output.bboxes[i].x0, + output.bboxes[i].y1 - output.bboxes[i].y0 }; + const Rectangle obj_scaled = ConvertInferenceCoordinates(coords, scaler_crop); + + objects.emplace_back(class_index, classes_[class_index], output.scores[i], + obj_scaled.x, obj_scaled.y, obj_scaled.width, obj_scaled.height); + } + + LOG(2, "Number of objects detected: " << objects.size()); + for (unsigned i = 0; i < objects.size(); i++) + LOG(2, "[" << i << "] : " << objects[i].toString()); + + return 0; +} + +void ObjectDetection::filterOutputObjects(std::vector<Detection> &objects) +{ + const Size isp_output_size = output_stream_->configuration().size; + + for (auto &lt_obj : lt_objects_) + lt_obj.matched = false; + + for (auto const &object : objects) + { + bool matched = false; + for (auto &lt_obj : lt_objects_) + { + // Try and match a detected object in our long term list.
+ if (object.category == lt_obj.params.category && + std::abs(object.box.x - lt_obj.params.box.x) < tolerance_ * isp_output_size.width && + std::abs(object.box.y - lt_obj.params.box.y) < tolerance_ * isp_output_size.height && + std::abs((int)object.box.width - (int)lt_obj.params.box.width) < tolerance_ * isp_output_size.width && + std::abs((int)object.box.height - (int)lt_obj.params.box.height) < tolerance_ * isp_output_size.height) + { + lt_obj.matched = matched = true; + lt_obj.params.confidence = object.confidence; + lt_obj.params.box.x = factor_ * object.box.x + (1 - factor_) * lt_obj.params.box.x; + lt_obj.params.box.y = factor_ * object.box.y + (1 - factor_) * lt_obj.params.box.y; + lt_obj.params.box.width = factor_ * object.box.width + (1 - factor_) * lt_obj.params.box.width; + lt_obj.params.box.height = factor_ * object.box.height + (1 - factor_) * lt_obj.params.box.height; + // Reset the visibility counter for when the object next disappears. + lt_obj.visible = visible_frames_; + // Decrement the hidden counter until the object becomes visible in the list. + lt_obj.hidden = std::max(0, (int)lt_obj.hidden - 1); + break; + } + } + + // Add the object to the long term list if not found. This object will remain hidden for hidden_frames_ + // consecutive frames. + if (!matched) + lt_objects_.push_back({ object, visible_frames_, hidden_frames_, 1 }); + } + + for (auto &lt_obj : lt_objects_) + { + if (!lt_obj.matched) + { + // If a non matched object in the long term list is still hidden, set visible count to 0 so that it must be + // matched for hidden_frames_ consecutive frames before becoming visible. Otherwise, decrement the visible + // count of unmatched objects in the long term list. + if (lt_obj.hidden) + lt_obj.visible = 0; + else + lt_obj.visible--; + } + } + + // Remove now invisible objects from the long term list.
+ lt_objects_.erase(std::remove_if(lt_objects_.begin(), lt_objects_.end(), + [] (const LtObject &obj) { return !obj.matched && !obj.visible; }), + lt_objects_.end()); +} + +static PostProcessingStage *Create(RPiCamApp *app) +{ + return new ObjectDetection(app); +} + +static RegisterStage reg(NAME, &Create); diff --git a/post_processing_stages/imx500/imx500_posenet.cpp b/post_processing_stages/imx500/imx500_posenet.cpp new file mode 100644 index 00000000..a965bd20 --- /dev/null +++ b/post_processing_stages/imx500/imx500_posenet.cpp @@ -0,0 +1,861 @@ +/* SPDX-License-Identifier: BSD-2-Clause */ +/* + * Copyright (C) 2024, Raspberry Pi Ltd + * + * imx500_posenet.cpp - IMX500 inference for PoseNet + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "core/rpicam_app.hpp" +#include "post_processing_stages/post_processing_stage.hpp" + +#include "imx500_post_processing_stage.hpp" + +using Rectangle = libcamera::Rectangle; +using Size = libcamera::Size; +namespace controls = libcamera::controls; + +namespace +{ + +constexpr Size INPUT_TENSOR_SIZE = { 481, 353 }; +constexpr Size MAP_SIZE = { 31, 23 }; +constexpr unsigned int NUM_KEYPOINTS = 17; +// These 16 edges allow traversing of the pose graph along the mid_offsets (see +// paper for details). 
+static constexpr int NUM_EDGES = 16; +constexpr unsigned int STRIDE = 16; +constexpr unsigned int NUM_HEATMAPS = NUM_KEYPOINTS * MAP_SIZE.width * MAP_SIZE.height; +constexpr unsigned int NUM_SHORT_OFFSETS = 2 * NUM_KEYPOINTS * MAP_SIZE.width * MAP_SIZE.height; +constexpr unsigned int NUM_MID_OFFSETS = 64 * MAP_SIZE.width * MAP_SIZE.height; + +enum KeypointType +{ + Nose, + LeftEye, + RightEye, + LeftEar, + RightEar, + LeftShoulder, + RightShoulder, + LeftElbow, + RightElbow, + LeftWrist, + RightWrist, + LeftHip, + RightHip, + LeftKnee, + RightKnee, + LeftAnkle, + RightAnkle +}; + +const std::array, 32> EdgeList = { { + // Forward edges + { Nose, LeftEye }, + { LeftEye, LeftEar }, + { Nose, RightEye }, + { RightEye, RightEar }, + { Nose, LeftShoulder }, + { LeftShoulder, LeftElbow }, + { LeftElbow, LeftWrist }, + { LeftShoulder, LeftHip }, + { LeftHip, LeftKnee }, + { LeftKnee, LeftAnkle }, + { Nose, RightShoulder }, + { RightShoulder, RightElbow }, + { RightElbow, RightWrist }, + { RightShoulder, RightHip }, + { RightHip, RightKnee }, + { RightKnee, RightAnkle }, + + // Backward edges + { LeftEye, Nose }, + { LeftEar, LeftEye }, + { RightEye, Nose }, + { RightEar, RightEye }, + { LeftShoulder, Nose }, + { LeftElbow, LeftShoulder }, + { LeftWrist, LeftElbow }, + { LeftHip, LeftShoulder }, + { LeftKnee, LeftHip }, + { LeftAnkle, LeftKnee }, + { RightShoulder, Nose }, + { RightElbow, RightShoulder }, + { RightWrist, RightElbow }, + { RightHip, RightShoulder }, + { RightKnee, RightHip }, + { RightAnkle, RightKnee }, +} }; + +struct Point +{ + float y, x; +}; + +// An adjacency list representing the directed edges connecting keypoints. +struct AdjacencyList +{ + explicit AdjacencyList(const int num_nodes) : child_ids(num_nodes), edge_ids(num_nodes) {} + + // child_ids[i] is a vector holding the node ids of all children of the i-th + // node and edge_ids[i] is a vector holding the edge ids of all edges stemming + // from the i-th node. 
If the k-th edge in the graph starts at the i-th node + // and ends at the j-th node, then child_ids[i] and edge_ids will contain j + // and k, respectively, at corresponding positions. + std::vector> child_ids; + std::vector> edge_ids; +}; + +// Defines a 2-D keypoint with (x, y) float coordinates and its type id. +struct KeypointWithScore +{ + KeypointWithScore(const Point &_pt, const int _id, const float _score) : point(_pt), id(_id), score(_score) {} + + [[maybe_unused]] friend std::ostream &operator<<(std::ostream &ost, const KeypointWithScore &keypoint) + { + return ost << keypoint.point.y << ", " << keypoint.point.x << ", " << keypoint.id << ", " << keypoint.score; + } + + bool operator<(const KeypointWithScore &other) const { return score < other.score; } + bool operator>(const KeypointWithScore &other) const { return score > other.score; } + + Point point; + int id; + float score; +}; + +using KeypointQueue = std::priority_queue>; +using PoseKeypoints = std::array; +using PoseKeypointScores = std::array; + +struct PoseResults +{ + float pose_score; + PoseKeypoints pose_keypoints; + PoseKeypointScores pose_keypoint_scores; +}; + +float sigmoid(const float x) +{ + return 1.0f / (1.0f + std::exp(-x)); +} + +float log_odds(const float x) +{ + return -std::log(1.0f / (x + 1E-6) - 1.0f); +} + +// Computes the squared distance between a pair of 2-D points. 
+float squared_distance(const Point &a, const Point &b) +{ + const float dy = b.y - a.y; + const float dx = b.x - a.x; + return dy * dy + dx * dx; +} + +std::vector format_tensor(const float *data, unsigned int size, unsigned int div) +{ + std::vector tensor(size * MAP_SIZE.width * MAP_SIZE.height); + + for (unsigned int i = 0; i < size; i++) + { + for (unsigned int j = 0; j < MAP_SIZE.width; j++) + { + for (unsigned int k = 0; k < MAP_SIZE.height; k++) + { + tensor[(size * MAP_SIZE.width * k) + (size * j) + i] = + data[(MAP_SIZE.width * MAP_SIZE.height * i) + (j * MAP_SIZE.height) + k] / div; + } + } + } + + return tensor; +} + +// Build an adjacency list of the pose graph. +AdjacencyList build_agency_list() +{ + AdjacencyList adjacency_list(NUM_KEYPOINTS); + + for (unsigned int k = 0; k < EdgeList.size(); ++k) + { + const int parent_id = EdgeList[k].first; + const int child_id = EdgeList[k].second; + adjacency_list.child_ids[parent_id].push_back(child_id); + adjacency_list.edge_ids[parent_id].push_back(k); + } + + return adjacency_list; +} + +bool pass_keypoint_nms(std::vector poses, const size_t num_poses, const KeypointWithScore &keypoint, + const float squared_nms_radius) +{ + for (unsigned int i = 0; i < num_poses; ++i) + { + if (squared_distance(keypoint.point, poses[i][keypoint.id]) <= squared_nms_radius) + return false; + } + + return true; +} + +// Finds the indices of the scores if we sort them in decreasing order. +void decreasing_arg_sort(std::vector &indices, const float *scores, unsigned int num_scores) +{ + indices.resize(num_scores); + std::iota(indices.begin(), indices.end(), 0); + std::sort(indices.begin(), indices.end(), [&scores](const int i, const int j) { return scores[i] > scores[j]; }); +} + +// Finds the indices of the scores if we sort them in decreasing order. 
+void decreasing_arg_sort(std::vector &indices, const std::vector &scores) +{ + decreasing_arg_sort(indices, scores.data(), scores.size()); +} + +// Helper function for 1-D linear interpolation. It computes the floor and the +// ceiling of the input coordinate, as well as the weighting factor between the +// two interpolation endpoints, such that: +// y = (1 - x_lerp) * vec[x_floor] + x_lerp * vec[x_ceil] +void build_linear_interpolation(const float x, const int n, int *x_floor, int *x_ceil, float *x_lerp) +{ + const float x_proj = std::clamp(x, 0.0f, n - 1.0f); + *x_floor = static_cast(std::floor(x_proj)); + *x_ceil = static_cast(std::ceil(x_proj)); + *x_lerp = x - (*x_floor); +} + +// Helper function for 2-D bilinear interpolation. It computes the four corners +// of the 2x2 cell that contain the input coordinates (x, y), as well as the +// weighting factor between the four interpolation endpoints, such that: +// y = +// (1 - y_lerp) * ((1 - x_lerp) * vec[top_left] + x_lerp * vec[top_right]) + +// y_lerp * ((1 - x_lerp) * tensor[bottom_left] + x_lerp * vec[bottom_right]) +void build_bilinear_interpolation(const Point point, const unsigned int num_channels, int *top_left, int *top_right, + int *bottom_left, int *bottom_right, float *y_lerp, float *x_lerp) +{ + int y_floor; + int y_ceil; + build_linear_interpolation(point.y, MAP_SIZE.height, &y_floor, &y_ceil, y_lerp); + int x_floor; + int x_ceil; + build_linear_interpolation(point.x, MAP_SIZE.width, &x_floor, &x_ceil, x_lerp); + *top_left = (y_floor * MAP_SIZE.width + x_floor) * num_channels; + *top_right = (y_floor * MAP_SIZE.width + x_ceil) * num_channels; + *bottom_left = (y_ceil * MAP_SIZE.width + x_floor) * num_channels; + *bottom_right = (y_ceil * MAP_SIZE.width + x_ceil) * num_channels; +} + +// Sample the input tensor values at position (x, y) and at multiple channels. +// The input tensor has shape [height, width, num_channels]. 
We bilinearly +// sample its value at tensor(y, x, c), for c in the channels specified. This +// is faster than calling the single channel interpolation function multiple +// times because the computation of the positions needs to be done only once. +std::vector sample_tensor_at_multiple_channels(const std::vector &tensor, const Point &point, + const std::vector &result_channels, + unsigned int num_channels) +{ + int top_left, top_right, bottom_left, bottom_right; + float y_lerp, x_lerp; + + build_bilinear_interpolation(point, num_channels, &top_left, &top_right, &bottom_left, &bottom_right, &y_lerp, + &x_lerp); + + std::vector result; + for (auto &c : result_channels) + { + result.push_back((1 - y_lerp) * ((1 - x_lerp) * tensor[top_left + c] + x_lerp * tensor[top_right + c]) + + y_lerp * ((1 - x_lerp) * tensor[bottom_left + c] + x_lerp * tensor[bottom_right + c])); + } + + return result; +} + +// Sample the input tensor values at position (x, y) and at a single channel. +// The input tensor has shape [height, width, num_channels]. We bilinearly +// sample its value at tensor(y, x, channel). +float sample_tensor_at_single_channel(const std::vector &tensor, const Point &point, unsigned int num_channels, + const int c) +{ + std::vector result = sample_tensor_at_multiple_channels(tensor, point, std::vector { c }, num_channels); + return result[0]; +} + +KeypointQueue build_keypoint_queue(const std::vector &scores, const std::vector &short_offsets, + const float score_threshold) +{ + constexpr unsigned int local_maximum_radius = 1; + unsigned int score_index = 0; + KeypointQueue queue; + + for (unsigned int y = 0; y < MAP_SIZE.height; ++y) + { + for (unsigned int x = 0; x < MAP_SIZE.width; ++x) + { + unsigned int offset_index = 2 * score_index; + for (unsigned int j = 0; j < NUM_KEYPOINTS; ++j) + { + const float score = scores[score_index]; + if (score >= score_threshold) + { + // Only consider keypoints whose score is maximum in a local window. 
+ bool local_maximum = true; + const unsigned int y_start = std::max((int)y - (int)local_maximum_radius, 0); + const unsigned int y_end = std::min(y + local_maximum_radius + 1, MAP_SIZE.height); + for (unsigned int y_current = y_start; y_current < y_end; ++y_current) + { + const unsigned int x_start = std::max((int)x - (int)local_maximum_radius, 0); + const unsigned int x_end = std::min(x + local_maximum_radius + 1, MAP_SIZE.width); + for (unsigned int x_current = x_start; x_current < x_end; ++x_current) + { + if (scores[y_current * MAP_SIZE.width * NUM_KEYPOINTS + x_current * NUM_KEYPOINTS + j] > + score) + { + local_maximum = false; + break; + } + } + if (!local_maximum) + break; + } + if (local_maximum) + { + const float dy = short_offsets[offset_index]; + const float dx = short_offsets[offset_index + NUM_KEYPOINTS]; + const float y_refined = std::clamp(y + dy, 0.0f, MAP_SIZE.height - 1.0f); + const float x_refined = std::clamp(x + dx, 0.0f, MAP_SIZE.width - 1.0f); + queue.emplace(Point { y_refined, x_refined }, j, score); + } + } + + ++score_index; + ++offset_index; + } + } + } + + return queue; +} + +// Follows the mid-range offsets, and then refines the position by the short- +// range offsets for a fixed number of steps. +Point find_displaced_position(const std::vector &short_offsets, const std::vector &mid_offsets, + const Point &source, const int edge_id, const int target_id, + const int offset_refinement_steps) +{ + float y = source.y, x = source.x; + + // Follow the mid-range offsets. + std::vector channels = { edge_id, NUM_EDGES + edge_id }; + + // Total size of mid_offsets is height x width x 2*2*num_edges + std::vector offsets = sample_tensor_at_multiple_channels(mid_offsets, source, channels, 2 * 2 * NUM_EDGES); + y = std::clamp(y + offsets[0], 0.0f, MAP_SIZE.height - 1.0f); + x = std::clamp(x + offsets[1], 0.0f, MAP_SIZE.width - 1.0f); + + // Refine by the short-range offsets. 
+ channels[0] = target_id; + channels[1] = NUM_KEYPOINTS + target_id; + for (int i = 0; i < offset_refinement_steps; ++i) + { + offsets = sample_tensor_at_multiple_channels(short_offsets, Point { y, x }, channels, 2 * NUM_KEYPOINTS); + y = std::clamp(y + offsets[0], 0.0f, MAP_SIZE.height - 1.0f); + x = std::clamp(x + offsets[1], 0.0f, MAP_SIZE.width - 1.0f); + } + + return Point { y, x }; +} + +void backtrack_decode_pose(const std::vector<float> &scores, const std::vector<float> &short_offsets, + const std::vector<float> &mid_offsets, const KeypointWithScore &root, + const AdjacencyList &adjacency_list, PoseKeypoints &pose_keypoints, + PoseKeypointScores &keypoint_scores, unsigned int offset_refinement_steps) +{ + const float root_score = sample_tensor_at_single_channel(scores, root.point, NUM_KEYPOINTS, root.id); + + // Used in order to put candidate keypoints in a priority queue w.r.t. their + // score. Keypoints with higher score have higher priority and will be + // decoded/processed first. + KeypointQueue decode_queue; + decode_queue.push(KeypointWithScore(root.point, root.id, root_score)); + + // Keeps track of the keypoints whose position has already been decoded. + std::vector<bool> keypoint_decoded(NUM_KEYPOINTS, false); + + while (!decode_queue.empty()) + { + // The top element in the queue is the next keypoint to be processed. + const KeypointWithScore current_keypoint = decode_queue.top(); + decode_queue.pop(); + + if (keypoint_decoded[current_keypoint.id]) + continue; + + pose_keypoints[current_keypoint.id] = current_keypoint.point; + keypoint_scores[current_keypoint.id] = current_keypoint.score; + + keypoint_decoded[current_keypoint.id] = true; + + // Add the children of the current keypoint that have not been decoded yet + // to the priority queue.
+ const int num_children = adjacency_list.child_ids[current_keypoint.id].size(); + for (int j = 0; j < num_children; ++j) + { + const int child_id = adjacency_list.child_ids[current_keypoint.id][j]; + int edge_id = adjacency_list.edge_ids[current_keypoint.id][j]; + if (keypoint_decoded[child_id]) + continue; + + // The mid-offsets block is organized as 4 blocks of NUM_EDGES: + // [fwd Y offsets][fwd X offsets][bwd Y offsets][bwd X offsets] + // OTOH edge_id is [0,NUM_EDGES) for forward edges and + // [NUM_EDGES, 2*NUM_EDGES) for backward edges. + // Thus if the edge is a backward edge (>=NUM_EDGES) then we need + // to start 16 indices later to be correctly aligned with the mid-offsets. + if (edge_id >= NUM_EDGES) + edge_id += NUM_EDGES; + + const Point child_point = find_displaced_position(short_offsets, mid_offsets, current_keypoint.point, + edge_id, child_id, offset_refinement_steps); + const float child_score = sample_tensor_at_single_channel(scores, child_point, NUM_KEYPOINTS, child_id); + decode_queue.emplace(child_point, child_id, child_score); + } + } +} + +void find_overlapping_keypoints(std::vector<bool> &mask, const PoseKeypoints &pose1, const PoseKeypoints &pose2, + const float squared_radius) +{ + for (unsigned int k = 0; k < mask.size(); ++k) + { + if (squared_distance(pose1[k], pose2[k]) <= squared_radius) + mask[k] = true; + } +} + +void perform_soft_keypoint_NMS(std::vector<float> &all_instance_scores, const std::vector<int> &decreasing_indices, + const std::vector<PoseKeypoints> &all_keypoint_coords, + const std::vector<PoseKeypointScores> &all_keypoint_scores, + const float squared_nms_radius) +{ + const int num_instances = decreasing_indices.size(); + + all_instance_scores.resize(num_instances); + // Indicates the occlusion status of the keypoints of the active instance. + std::vector<bool> keypoint_occluded(NUM_KEYPOINTS); + // Indices of the keypoints of the active instance in decreasing score value.
+ std::vector indices(NUM_KEYPOINTS); + for (int i = 0; i < num_instances; ++i) + { + const int current_index = decreasing_indices[i]; + // Find the keypoints of the current instance which are overlapping with + // the corresponding keypoints of the higher-scoring instances and + // zero-out their contribution to the score of the current instance. + std::fill(keypoint_occluded.begin(), keypoint_occluded.end(), false); + for (int j = 0; j < i; ++j) + { + const int previous_index = decreasing_indices[j]; + find_overlapping_keypoints(keypoint_occluded, all_keypoint_coords[current_index], + all_keypoint_coords[previous_index], squared_nms_radius); + } + // We compute the argsort keypoint indices based on the original keypoint + // scores, but we do not let them contribute to the instance score if they + // have been non-maximum suppressed. + decreasing_arg_sort(indices, all_keypoint_scores[current_index].data(), + all_keypoint_scores[current_index].size()); + float total_score = 0.0f; + for (unsigned int k = 0; k < NUM_KEYPOINTS; ++k) + { + if (!keypoint_occluded[indices[k]]) + total_score += all_keypoint_scores[current_index][indices[k]]; + } + all_instance_scores[current_index] = total_score / NUM_KEYPOINTS; + } +} + +} // namespace + +#define NAME "imx500_posenet" + +class PoseNet : public IMX500PostProcessingStage +{ +public: + PoseNet(RPiCamApp *app) : IMX500PostProcessingStage(app) {} + + char const *Name() const override; + + void Read(boost::property_tree::ptree const ¶ms) override; + + void Configure() override; + + bool Process(CompletedRequestPtr &completed_request) override; + +private: + std::vector decodeAllPoses(const std::vector &scores, const std::vector &short_offsets, + const std::vector &mid_offsets); + void translateCoordinates(std::vector &results, const Rectangle &scaler_crop) const; + void filterOutputObjects(const std::vector &results); + + struct LtResults + { + PoseResults results; + unsigned int visible; + unsigned int hidden; + bool 
matched; + }; + + std::vector lt_results_; + std::mutex lt_lock_; + + // Config params: + float threshold_; + unsigned int max_detections_; + unsigned int offset_refinement_steps_; + float nms_radius_; + bool temporal_filtering_; + + float tolerance_; + float factor_; + unsigned int visible_frames_; + unsigned int hidden_frames_; +}; + +char const *PoseNet::Name() const +{ + return NAME; +} + +void PoseNet::Read(boost::property_tree::ptree const ¶ms) +{ + max_detections_ = params.get("max_detections", 10); + threshold_ = params.get("threshold", 0.5f); + offset_refinement_steps_ = params.get("offset_refinement_steps", 5); + nms_radius_ = params.get("nms_radius", 10) / STRIDE; + + if (params.find("temporal_filter") != params.not_found()) + { + temporal_filtering_ = true; + tolerance_ = params.get("temporal_filter.tolerance", 0.05); + factor_ = params.get("temporal_filter.factor", 0.2); + visible_frames_ = params.get("temporal_filter.visible_frames", 5); + hidden_frames_ = params.get("temporal_filter.hidden_frames", 2); + } + else + temporal_filtering_ = false; + + IMX500PostProcessingStage::Read(params); +} + +void PoseNet::Configure() +{ + IMX500PostProcessingStage::Configure(); + IMX500PostProcessingStage::ShowFwProgressBar(); +} + +bool PoseNet::Process(CompletedRequestPtr &completed_request) +{ + auto scaler_crop = completed_request->metadata.get(controls::ScalerCrop); + if (!raw_stream_ || !scaler_crop) + { + LOG_ERROR("Must have RAW stream and scaler crop available to get sensor dimensions!"); + return false; + } + + auto output = completed_request->metadata.get(controls::rpi::CnnOutputTensor); + if (!output) + { + LOG_ERROR("No output tensor found in metadata!"); + return false; + } + + if (output->size() < NUM_HEATMAPS + NUM_SHORT_OFFSETS + NUM_MID_OFFSETS) + { + LOG_ERROR("Unexpected output tensor size: " << output->size()); + return false; + } + + std::vector scores = + format_tensor((float *)output->data(), NUM_HEATMAPS / (MAP_SIZE.width * 
MAP_SIZE.height), 1); + std::vector short_offsets = format_tensor((float *)output->data() + NUM_HEATMAPS, + NUM_SHORT_OFFSETS / (MAP_SIZE.width * MAP_SIZE.height), STRIDE); + std::vector mid_offsets = format_tensor((float *)output->data() + NUM_HEATMAPS + NUM_SHORT_OFFSETS, + NUM_MID_OFFSETS / (MAP_SIZE.width * MAP_SIZE.height), STRIDE); + + std::vector results = decodeAllPoses(scores, short_offsets, mid_offsets); + translateCoordinates(results, *scaler_crop); + + std::vector> locations; + std::vector> confidences; + + if (temporal_filtering_) + { + // Process() can be concurrently called through different threads for consecutive CompletedRequests if + // things are running behind. So protect access to the lt_results_ state object. + std::scoped_lock l(lt_lock_); + + filterOutputObjects(results); + for (auto const <_r : lt_results_) + { + if (lt_r.hidden) + continue; + + locations.push_back({}); + confidences.push_back({}); + + for (auto const &s : lt_r.results.pose_keypoint_scores) + confidences.back().push_back(s); + for (auto const &k : lt_r.results.pose_keypoints) + locations.back().emplace_back(k.x, k.y); + } + } + else if (results.size()) + { + for (auto const &result : results) + { + locations.push_back({}); + confidences.push_back({}); + + for (auto const &s : result.pose_keypoint_scores) + confidences.back().push_back(s); + for (auto const &k : result.pose_keypoints) + locations.back().emplace_back(k.x, k.y); + } + } + + if (locations.size() && locations.size() == confidences.size()) + { + completed_request->post_process_metadata.Set("pose_estimation.locations", locations); + completed_request->post_process_metadata.Set("pose_estimation.confidences", confidences); + } + + return IMX500PostProcessingStage::Process(completed_request); +} + +// Decodes poses from the score map, the short and mid offsets. +// "Block space" refers to the output y and z size of the network. 
+// For example if the network that takes a (353,481) (y,x) input image will have +// an output field of 23, 31. Thus the sizes of the input vectors to this +// functions will be +// scores: 23 x 31 x NUM_KEYPOINTS +// short_offsets 23 x 31 x 2 x NUM_KEYPOINTS (x and y per keypoint) +// mid_offsets 23 x 31 x 2 x 2 2 NUM_EDGES (x and y for each fwd and bwd edge) +// Thus height and width need to be set to 23 and 31 respectively. +// nms_radius must also be given in these units. +// The output coordinates will be in pixel coordinates. +// +// For details see https://arxiv.org/abs/1803.08225 +// PersonLab: Person Pose Estimation and Instance Segmentation with a +// Bottom-Up, Part-Based, Geometric Embedding Model +// George Papandreou, Tyler Zhu, Liang-Chieh Chen, Spyros Gidaris, +// Jonathan Tompson, Kevin Murphy +std::vector PoseNet::decodeAllPoses(const std::vector &scores, + const std::vector &short_offsets, + const std::vector &mid_offsets) +{ + const float min_score_logit = log_odds(threshold_); + + KeypointQueue queue = build_keypoint_queue(scores, short_offsets, min_score_logit); + AdjacencyList adjacency_list = build_agency_list(); + + std::vector indices(NUM_KEYPOINTS); + + // Generate at most max_detections object instances per image in decreasing + // root part score order. + std::vector all_instance_scores; + + std::vector scratch_poses(NUM_KEYPOINTS); + std::vector scratch_keypoint_scores(max_detections_); + + unsigned int pose_counter = 0; + while (pose_counter < max_detections_ && !queue.empty()) + { + // The top element in the queue is the next root candidate. + const KeypointWithScore root = queue.top(); + queue.pop(); + + // Reject a root candidate if it is within a disk of nms_radius_ pixels + // from the corresponding part of a previously detected instance. 
+ if (!pass_keypoint_nms(scratch_poses, pose_counter, root, nms_radius_ * nms_radius_)) + continue; + + auto &next_pose = scratch_poses[pose_counter]; + auto &next_scores = scratch_keypoint_scores[pose_counter]; + for (unsigned int k = 0; k < NUM_KEYPOINTS; ++k) + { + next_pose[k].x = -1.0f; + next_pose[k].y = -1.0f; + next_scores[k] = -1E5; + } + + backtrack_decode_pose(scores, short_offsets, mid_offsets, root, adjacency_list, next_pose, next_scores, + offset_refinement_steps_); + + // Convert keypoint-level scores from log-odds to probabilities and compute + // an initial instance-level score as the average of the scores of the top-k + // scoring keypoints. + for (unsigned int k = 0; k < NUM_KEYPOINTS; ++k) + next_scores[k] = sigmoid(next_scores[k]); + + decreasing_arg_sort(indices, next_scores.data(), next_scores.size()); + + float instance_score = 0.0f; + for (unsigned int j = 0; j < NUM_KEYPOINTS; ++j) + instance_score += next_scores[indices[j]]; + + instance_score /= NUM_KEYPOINTS; + + if (instance_score >= threshold_) + { + pose_counter++; + all_instance_scores.push_back(instance_score); + } + } + + // Sort the detections in decreasing order of their instance-level scores. + std::vector decreasing_indices; + decreasing_arg_sort(decreasing_indices, all_instance_scores); + + // Keypoint-level soft non-maximum suppression and instance-level rescoring as + // the average of the top-k keypoints in terms of their keypoint-level scores. + perform_soft_keypoint_NMS(all_instance_scores, decreasing_indices, scratch_poses, scratch_keypoint_scores, + nms_radius_ * nms_radius_); + + // Sort the detections in decreasing order of their final instance-level + // scores. Usually the order does not change but this is not guaranteed. + decreasing_arg_sort(decreasing_indices, all_instance_scores); + + std::vector results; + for (int index : decreasing_indices) + { + if (all_instance_scores[index] < threshold_) + break; + + // New result. 
+ results.push_back({}); + + // Rescale keypoint coordinates into pixel space (much more useful for user). + for (unsigned int k = 0; k < NUM_KEYPOINTS; ++k) + { + results.back().pose_keypoints[k].y = scratch_poses[index][k].y * STRIDE; + results.back().pose_keypoints[k].x = scratch_poses[index][k].x * STRIDE; + } + + std::copy(scratch_keypoint_scores[index].begin(), scratch_keypoint_scores[index].end(), + results.back().pose_keypoint_scores.begin()); + results.back().pose_score = all_instance_scores[index]; + } + + return results; +} + +void PoseNet::translateCoordinates(std::vector &results, const Rectangle &scaler_crop) const +{ + for (auto &r : results) + { + for (auto &keypoint : r.pose_keypoints) + { + std::vector coords{ keypoint.x / (INPUT_TENSOR_SIZE.width - 1), + keypoint.y / (INPUT_TENSOR_SIZE.height - 1), + 0, 0 }; + Rectangle translated = ConvertInferenceCoordinates(coords, scaler_crop); + keypoint.x = translated.x; + keypoint.y = translated.y; + } + } +} + +void PoseNet::filterOutputObjects(const std::vector &results) +{ + const Size isp_output_size = output_stream_->configuration().size; + + for (auto <_r : lt_results_) + lt_r.matched = false; + + for (auto const &r : results) + { + bool matched = false; + for (auto <_r : lt_results_) + { + // Try and match a detected object in our long term list. 
+ bool found = true; + for (unsigned int i = 0; i < NUM_KEYPOINTS; i++) + { + if (std::abs(lt_r.results.pose_keypoints[i].x - r.pose_keypoints[i].x) > + tolerance_ * isp_output_size.width || + std::abs(lt_r.results.pose_keypoints[i].y - r.pose_keypoints[i].y) > + tolerance_ * isp_output_size.height) + { + found = false; + break; + } + } + + if (found) + { + lt_r.matched = matched = true; + lt_r.results.pose_score = r.pose_score; + + for (unsigned int i = 0; i < NUM_KEYPOINTS; i++) + { + lt_r.results.pose_keypoint_scores[i] = + factor_ * r.pose_keypoint_scores[i] + (1 - factor_) * lt_r.results.pose_keypoint_scores[i]; + lt_r.results.pose_keypoints[i].x = + factor_ * r.pose_keypoints[i].x + (1 - factor_) * lt_r.results.pose_keypoints[i].x; + lt_r.results.pose_keypoints[i].y = + factor_ * r.pose_keypoints[i].y + (1 - factor_) * lt_r.results.pose_keypoints[i].y; + } + + // Reset the visibility counter for when the result next disappears. + lt_r.visible = visible_frames_; + // Decrement the hidden counter until the result becomes visible in the list. + lt_r.hidden = std::max(0, (int)lt_r.hidden - 1); + break; + } + } + + // Add the result to the long term list if not found. This result will remain hidden for hidden_frames_ + // consecutive frames. + if (!matched) + lt_results_.push_back({ r, visible_frames_, hidden_frames_, true }); + } + + for (auto <_r : lt_results_) + { + if (!lt_r.matched) + { + // If a non matched result in the long term list is still hidden, set visible count to 0 so that it must be + // matched for hidden_frames_ consecutive frames before becoming visible. Otherwise, decrement the visible + // count of unmatched objects in the long term list. + if (lt_r.hidden) + lt_r.visible = 0; + else + lt_r.visible--; + } + } + + // Remove now invisible objects from the long term list. 
+ lt_results_.erase(std::remove_if(lt_results_.begin(), lt_results_.end(), + [] (const LtResults &obj) { return !obj.matched && !obj.visible; }), + lt_results_.end()); +} + +PostProcessingStage *Create(RPiCamApp *app) +{ + return new PoseNet(app); +} + +RegisterStage reg(NAME, &Create); diff --git a/post_processing_stages/imx500/imx500_post_processing_stage.cpp b/post_processing_stages/imx500/imx500_post_processing_stage.cpp new file mode 100644 index 00000000..5114871f --- /dev/null +++ b/post_processing_stages/imx500/imx500_post_processing_stage.cpp @@ -0,0 +1,280 @@ +/* SPDX-License-Identifier: BSD-2-Clause */ +/* + * Copyright (C) 2024, Raspberry Pi Ltd + * + * imx500_post_rpocessing_stage.cpp - IMX500 post processing stage base class + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "imx500_post_processing_stage.hpp" + +#include + +using Stream = libcamera::Stream; +using Rectangle = libcamera::Rectangle; +using Size = libcamera::Size; + +using namespace std::chrono_literals; + +namespace fs = std::filesystem; + +namespace +{ + +const unsigned int ROI_CTRL_ID = 0x00982900; +const unsigned int NETWORK_FW_CTRL_ID = 0x00982901; + +inline int16_t conv_reg_signed(int16_t reg) +{ + constexpr unsigned int ROT_DNN_NORM_SIGNED_SHT = 8; + constexpr unsigned int ROT_DNN_NORM_MASK = 0x01FF; + + if (!((reg >> ROT_DNN_NORM_SIGNED_SHT) & 1)) + return reg; + else + return -((~reg + 1) & ROT_DNN_NORM_MASK); +} + +} // namespace + + +IMX500PostProcessingStage::IMX500PostProcessingStage(RPiCamApp *app) + : PostProcessingStage(app), device_fd_(-1) +{ + for (unsigned int i = 0; i < 16; i++) + { + const fs::path test_dir { "/sys/class/video4linux/v4l-subdev" + std::to_string(i) + "/device" }; + const fs::path module_dir { test_dir.string() + "/driver/module" }; + const fs::path id_dir { test_dir.string() + "/of_node" }; + + if (fs::exists(module_dir) && fs::is_symlink(module_dir)) + { + fs::path ln = 
fs::read_symlink(module_dir); + if (ln.string().find("imx500") != std::string::npos) + { + const std::string dev_node { "/dev/v4l-subdev" + std::to_string(i) }; + device_fd_ = open(dev_node.c_str(), O_RDONLY, 0); + + /* Find the progress indicator sysfs dev nodes. */ + const std::string test_dir_str = fs::read_symlink(test_dir).string(); + const std::size_t pos = test_dir_str.find_last_of("/") + 1; + assert(pos != std::string::npos); + + const std::string imx500_device_id = test_dir_str.substr(pos); + std::string spi_device_id = imx500_device_id; + const std::size_t rep = spi_device_id.find("001a"); + spi_device_id.replace(rep, 4, "0040"); + + const fs::path imx500_progress { "/sys/kernel/debug/imx500-fw:" + imx500_device_id + "/fw_progress" }; + const fs::path spi_progress { "/sys/kernel/debug/rp2040-spi:" + spi_device_id + "/transfer_progress" }; + + fw_progress_.open(imx500_progress.c_str(), std::ios_base::in); + fw_progress_chunk_.open(spi_progress.c_str(), std::ios_base::in); + + break; + } + } + } + + if (device_fd_ < 0) + throw std::runtime_error("Cannot open imx500 device node"); + + SetInferenceRoiAbs({ 0, 0, 4056, 3040 }); +} + +IMX500PostProcessingStage::~IMX500PostProcessingStage() +{ + if (device_fd_ >= 0) + close(device_fd_); +} + +void IMX500PostProcessingStage::Read(boost::property_tree::ptree const ¶ms) +{ + if (params.find("save_input_tensor") != params.not_found()) + { + auto const &pt = params.get_child("save_input_tensor"); + + std::string filename = pt.get("filename"); + num_input_tensors_saved_ = pt.get("num_tensors", 1); + input_tensor_file_ = std::ofstream(filename, std::ios::out | std::ios::binary); + + norm_val_ = PostProcessingStage::GetJsonArray(pt, "norm_val", { 0, 0, 0, 0 }); + norm_shift_ = PostProcessingStage::GetJsonArray(pt, "norm_shift", { 0, 0, 0, 0 }); + div_val_ = PostProcessingStage::GetJsonArray(pt, "div_val", { 1, 1, 1, 1 }); + div_shift_ = pt.get("div_shift", 0); + } + + /* Load the network firmware. 
*/ + std::string network_file = params.get("network_file"); + if (!fs::exists(network_file)) + throw std::runtime_error(network_file + " not found!"); + + int fd = open(network_file.c_str(), O_RDONLY, 0); + + v4l2_control ctrl { NETWORK_FW_CTRL_ID, fd }; + int ret = ioctl(device_fd_, VIDIOC_S_CTRL, &ctrl); + if (ret) + throw std::runtime_error("failed to set network fw ioctl"); + + close(fd); + + LOG(1, "\n------------------------------------------------------------------------------------------------------------------\n" + "NOTE: Loading network firmware onto the IMX500 can take several minutes, please do not close down the application." + "\n------------------------------------------------------------------------------------------------------------------\n"); +} + +void IMX500PostProcessingStage::Configure() +{ + output_stream_ = app_->GetMainStream(); + raw_stream_ = app_->RawStream(); + save_frames_ = num_input_tensors_saved_; +} + +bool IMX500PostProcessingStage::Process(CompletedRequestPtr &completed_request) +{ + auto input = completed_request->metadata.get(controls::rpi::CnnInputTensor); + + if (input && input_tensor_file_.is_open()) + { + // There is a chance that this may be called through multiple threads, so serialize the file access. + std::scoped_lock l(lock_); + + for (unsigned int i = 0; i < input->size(); i++) + { + const unsigned int channel = i % 3; // Assume RGB interleaved format. + int16_t sample = static_cast(input->data()[i]); + sample = (sample << norm_shift_[channel]) - conv_reg_signed(norm_val_[channel]); + sample = ((sample << div_shift_) / div_val_[channel]) & 0xFF; + input_tensor_file_.put(static_cast(sample)); + } + + if (--save_frames_ == 0) + input_tensor_file_.close(); + } + + return false; +} + +Rectangle IMX500PostProcessingStage::ConvertInferenceCoordinates(const std::vector &coords, + const Rectangle &scaler_crop) const +{ + // Convert the inference image co-ordinates into the final ISP output co-ordinates. 
+ const Size &isp_output_size = output_stream_->configuration().size; + const Size &sensor_output_size = raw_stream_->configuration().size; + const Rectangle sensor_crop = scaler_crop.scaledBy(sensor_output_size, full_sensor_resolution_.size()); + + if (coords.size() != 4) + return {}; + + // Object scaled to the full sensor resolution + Rectangle obj; + obj.x = std::round(coords[0] * (full_sensor_resolution_.width - 1)); + obj.y = std::round(coords[1] * (full_sensor_resolution_.height - 1)); + obj.width = std::round(coords[2] * (full_sensor_resolution_.width - 1)); + obj.height = std::round(coords[3] * (full_sensor_resolution_.height - 1)); + + // Object on inference image -> sensor image + const Rectangle obj_sensor = obj.scaledBy(sensor_output_size, full_sensor_resolution_.size()); + // -> bounded to the ISP crop on the sensor image + const Rectangle obj_bound = obj_sensor.boundedTo(sensor_crop); + // -> translated by the start of the crop offset + const Rectangle obj_translated = obj_bound.translatedBy(-sensor_crop.topLeft()); + // -> and finally scaled to the ISP output. 
+ const Rectangle obj_scaled = obj_translated.scaledBy(isp_output_size, sensor_crop.size()); + + LOG(2, obj << " -> (sensor) " << obj_sensor << " -> (bound) " << obj_bound + << " -> (translate) " << obj_translated << " -> (scaled) " << obj_scaled); + + return obj_scaled; +} + +void IMX500PostProcessingStage::SetInferenceRoiAbs(const Rectangle &roi_) const +{ + Rectangle roi = roi_.boundedTo(full_sensor_resolution_); + const uint32_t roi_array[4] = { (uint32_t)roi.x, (uint32_t)roi.y, (uint32_t)roi.width, (uint32_t)roi.height }; + + v4l2_ext_control roi_ctrl; + roi_ctrl.id = ROI_CTRL_ID; + roi_ctrl.p_u32 = (unsigned int *)&roi_array; + roi_ctrl.size = 16; + + v4l2_ext_controls ctrls; + ctrls.count = 1; + ctrls.controls = (v4l2_ext_control *)&roi_ctrl; + + int ret = ioctl(device_fd_, VIDIOC_S_EXT_CTRLS, &ctrls); + if (ret) + LOG_ERROR("IMX500: Unable to set absolute ROI"); +} + +void IMX500PostProcessingStage::SetInferenceRoiAuto(const unsigned int width, const unsigned int height) const +{ + Size s = full_sensor_resolution_.size().boundedToAspectRatio(Size(width, height)); + Rectangle r = s.centeredTo(full_sensor_resolution_.center()).enclosedIn(full_sensor_resolution_); + SetInferenceRoiAbs(r); +} + +void IMX500PostProcessingStage::ShowFwProgressBar() +{ + if (fw_progress_.is_open() && fw_progress_chunk_.is_open()) + { + std::thread progress_thread { &IMX500PostProcessingStage::doProgressBar, this }; + progress_thread.detach(); + } +} + +std::vector split(std::stringstream &stream) +{ + std::vector result; + unsigned int n; + + while (stream >> n) + result.push_back(n); + + return result; +} + +void IMX500PostProcessingStage::doProgressBar() +{ + while (1) + { + std::stringstream fw_progress_str, block_str; + unsigned int block_progress; + + fw_progress_.seekg(0); + fw_progress_chunk_.seekg(0); + fw_progress_str << fw_progress_.rdbuf(); + block_str << fw_progress_chunk_.rdbuf(); + + std::vector progress = split(fw_progress_str); + block_str >> block_progress; + + // 
[0] == FW state, [1] == current size, [2] == total size. + if (progress.size() == 3 && progress[0] == 2) + { + unsigned int current = progress[1] + block_progress; + std::cerr << "Network Firmware Upload: " << current * 100 / progress[2] << "% (" << current / 1024 << "/" + << progress[2] / 1024 << " KB)\r"; + std::cerr.flush(); + + if (progress[2] && progress[1] == progress[2]) + { + std::cerr << std::endl; + break; + } + } + + std::this_thread::sleep_for(500ms); + } +} diff --git a/post_processing_stages/imx500/imx500_post_processing_stage.hpp b/post_processing_stages/imx500/imx500_post_processing_stage.hpp new file mode 100644 index 00000000..4fcd76b5 --- /dev/null +++ b/post_processing_stages/imx500/imx500_post_processing_stage.hpp @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: BSD-2-Clause */ +/* + * Copyright (C) 2024, Raspberry Pi Ltd + * + * imx500_post_rpocessing_stage.hpp - IMX500 post processing stage base class + */ + +#pragma once + +#include +#include + +#include + +#include +#include + +#include "core/completed_request.hpp" +#include "core/rpicam_app.hpp" +#include "post_processing_stages/post_processing_stage.hpp" + +class IMX500PostProcessingStage : public PostProcessingStage +{ +public: + static constexpr unsigned int Max_Num_Tensors = 8; + static constexpr unsigned int Network_Name_Len = 64; + + struct OutputTensorInfo + { + uint32_t tensor_data_num; + uint16_t size; + uint8_t ordinal; + uint8_t serialization_index; + }; + + struct CnnOutputTensorInfo + { + char network_name[Network_Name_Len]; + uint32_t num_tensors; + OutputTensorInfo info[Max_Num_Tensors]; + }; + + IMX500PostProcessingStage(RPiCamApp *app); + ~IMX500PostProcessingStage(); + + void Read(boost::property_tree::ptree const ¶ms) override; + + void Configure() override; + + bool Process(CompletedRequestPtr &completed_request) override; + + libcamera::Rectangle ConvertInferenceCoordinates(const std::vector &coords, + const libcamera::Rectangle &scalerCrop) const; + void 
SetInferenceRoiAbs(const libcamera::Rectangle &roi_) const; + void SetInferenceRoiAuto(const unsigned int width, const unsigned int height) const; + void ShowFwProgressBar(); + +protected: + libcamera::Rectangle full_sensor_resolution_ = libcamera::Rectangle(0, 0, 4056, 3040); + libcamera::Stream *output_stream_; + libcamera::Stream *raw_stream_; + +private: + void doProgressBar(); + + int device_fd_; + std::ifstream fw_progress_; + std::ifstream fw_progress_chunk_; + + std::ofstream input_tensor_file_; + unsigned int num_input_tensors_saved_; + unsigned int save_frames_; + std::vector norm_val_; + std::vector norm_shift_; + std::vector div_val_; + unsigned int div_shift_; + std::mutex lock_; +}; diff --git a/post_processing_stages/imx500/meson.build b/post_processing_stages/imx500/meson.build new file mode 100644 index 00000000..985fbc8f --- /dev/null +++ b/post_processing_stages/imx500/meson.build @@ -0,0 +1,31 @@ +imx500_postprocessing_src = files([ + # Base stage + 'imx500_post_processing_stage.cpp', + # Object detection + 'imx500_object_detection.cpp', + # Posenet + 'imx500_posenet.cpp', +]) + +postproc_assets += files([ + assets_dir / 'imx500_mobilenet_ssd.json', + assets_dir / 'imx500_posenet.json', +]) + +imx500_postprocessing_lib = shared_module('imx500-postproc', imx500_postprocessing_src, + dependencies : libcamera_dep, + include_directories : '../..', + install : true, + install_dir : posproc_libdir, + name_prefix : '', + ) + +if get_option('download_imx500_models') + download_script = meson.project_source_root() / 'utils' / 'download-imx500-models.sh' + custom_target('imx500-models', + command : [ download_script, '@OUTPUT@' ], + output : 'imx500-models', + install : true, + install_dir : get_option('datadir'), + ) +endif diff --git a/post_processing_stages/meson.build b/post_processing_stages/meson.build index 1dd6edf7..aa101413 100644 --- a/post_processing_stages/meson.build +++ b/post_processing_stages/meson.build @@ -109,6 +109,11 @@ if 
hailort_dep.found() and hailo_tappas_dep.found()
     enable_hailo = true
 endif
 
+# IMX500 postprocessing stages.
+if get_option('enable_imx500')
+    subdir('imx500')
+endif
+
 post_processing_headers = files([
     'histogram.hpp',
     'object_detect.hpp',
diff --git a/post_processing_stages/post_processing_stage.hpp b/post_processing_stages/post_processing_stage.hpp
index e745a491..30237ba5 100644
--- a/post_processing_stages/post_processing_stage.hpp
+++ b/post_processing_stages/post_processing_stage.hpp
@@ -5,9 +5,12 @@
  * post_processing_stage.hpp - Post processing stage base class definition.
  */
 
+#pragma once
+
 #include <chrono>
 #include <map>
 #include <string>
+#include <vector>
 
 // Prevents compiler warnings in Boost headers with more recent versions of GCC.
 #define BOOST_BIND_GLOBAL_PLACEHOLDERS
@@ -71,6 +74,24 @@ class PostProcessingStage
 		return std::chrono::duration<double>(t2 - t1);
 	}
 
+	// Read a JSON array under "key" into a vector<T>; any missing trailing
+	// entries are filled from default_value so callers always see at least
+	// default_value.size() elements.
+	// NOTE(review): template syntax was stripped during extraction and has
+	// been reconstructed — confirm against the original file.
+	template <typename T>
+	static std::vector<T> GetJsonArray(const boost::property_tree::ptree &pt, const std::string &key,
+									   const std::vector<T> &default_value = {})
+	{
+		std::vector<T> vec;
+
+		if (pt.find(key) != pt.not_found())
+		{
+			for (auto &v : pt.get_child(key))
+				vec.push_back(v.second.get_value<T>());
+		}
+
+		for (unsigned int i = vec.size(); i < default_value.size(); i++)
+			vec.push_back(default_value[i]);
+
+		return vec;
+	}
+
 	RPiCamApp *app_;
 };
diff --git a/utils/download-imx500-models.sh b/utils/download-imx500-models.sh
new file mode 100755
index 00000000..04e2ef3f
--- /dev/null
+++ b/utils/download-imx500-models.sh
@@ -0,0 +1,42 @@
#!/bin/bash
# Download the published IMX500 network firmware blobs into the given directory.
set -e

networks=(
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_deeplabv3plus.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_efficientdet_lite0_pp.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_efficientnet_bo.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_efficientnet_lite0.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_efficientnetv2_b0.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_efficientnetv2_b1.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_efficientnetv2_b2.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_higherhrnet_coco.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_inputtensoronly.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_levit_128s.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_mnasnet1.0.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_mobilenet_v2.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_mobilevit_xs.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_mobilevit_xxs.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_nanodet_plus_416x416_pp.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_nanodet_plus_416x416.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_posenet.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_regnetx_002.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_regnety_002.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_regnety_004.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_resnet18.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_shufflenet_v2_x1_5.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_squeezenet1.0.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_ssd_mobilenetv2_fpnlite_320x320_pp.rpk
    https://github.com/raspberrypi/imx500-models/raw/main/imx500_network_yolov8n_pp.rpk
)

# The "<output dir>" text below was stripped during extraction; restored.
if [ $# -ne 1 ]; then
    echo "Usage: $0 <output dir>"
    exit 1
fi

dir=$1
# Quote all expansions so paths with spaces work (the original was unquoted).
mkdir -p "$dir"

for url in "${networks[@]}"; do
    wget -nv -N -P "$dir" "$url"
done