Face Detection Depth Fix (#130)
* Ignore mouth points no farther away than the fork tip

* Using depth_octomap to filter out the fork/food and fitting a plane to the face

* Formatting cleanup

* Addressed bugs & verified both face detection methods work on the real robot

---------

Co-authored-by: taylorkf <[email protected]>
Co-authored-by: Amal Nanavati <[email protected]>
3 people authored Nov 14, 2023
1 parent 7295169 commit 48aeced
Showing 4 changed files with 164 additions and 37 deletions.
6 changes: 5 additions & 1 deletion .pylintrc
@@ -193,7 +193,11 @@ function-naming-style=snake_case
#function-rgx=

# Good variable names which should always be accepted, separated by a comma.
good-names=i,
good-names=a,
b,
c,
d,
i,
j,
k,
x,
2 changes: 1 addition & 1 deletion ada_feeding/README.md
@@ -6,7 +6,7 @@ The `ada_feeding` package contains the overarching code to run the robot-assiste

This code has been developed and tested with the Kinova JACO Gen2 Arm, on computers running Ubuntu 22.04. To get started:
- Install [ROS2 Humble](https://docs.ros.org/en/humble/Installation.html)
- Install Python dependencies: `python3 -m pip install overrides pyrealsense2 pyyaml py_trees pymongo tornado trimesh`
- Install Python dependencies: `python3 -m pip install overrides pyrealsense2 pyyaml py_trees pymongo tornado trimesh scikit-spatial`
- NOTE: [`pyrealsense2` is not released for ARM systems](https://github.com/IntelRealSense/librealsense/issues/6449#issuecomment-650784066), so ARM users will have to [build from source](https://github.com/IntelRealSense/librealsense/blob/master/wrappers/python/readme.md#building-from-source). When running `sudo make install`, pay close attention to which path `pyrealsense2` is installed to and add *that path* to the `PYTHONPATH` -- it should be `/usr/local/lib` but may be `/usr/local/OFF`.
- Install the code to command the real robot ([instructions here](https://github.com/personalrobotics/ada_ros2/blob/main/README.md))
- Git clone the [PRL fork of pymoveit2 (branch: `amaln/move_collision`)](https://github.com/personalrobotics/pymoveit2/tree/amaln/move_collision), the [PRL fork of py_trees_ros (branch: `amaln/service_client`)](https://github.com/personalrobotics/py_trees_ros/tree/amaln/service_client), and [ros2_numpy (branch: `humble`)](https://github.com/Box-Robotics/ros2_numpy) into your ROS2 workspace's `src` folder.
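- As a quick, optional sanity check that the Python dependencies above installed into the active environment (a minimal sketch; adjust to your setup): `python3 -c "import pyrealsense2, skspatial, py_trees, pymongo, tornado, trimesh, overrides; print('dependencies OK')"`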
187 changes: 153 additions & 34 deletions ada_feeding_perception/ada_feeding_perception/face_detection.py
@@ -7,9 +7,10 @@

# Standard Imports
import collections
from enum import Enum
import os
import threading
from typing import Tuple, Union
from typing import List, Tuple, Union

# Third-party imports
import cv2
@@ -24,8 +25,10 @@
from rclpy.executors import MultiThreadedExecutor
from rcl_interfaces.msg import ParameterDescriptor, ParameterType
from sensor_msgs.msg import CompressedImage, CameraInfo, Image
from skspatial.objects import Plane, Points
from std_srvs.srv import SetBool


# Local imports
from ada_feeding_msgs.msg import FaceDetection
from ada_feeding_perception.helpers import (
Expand All @@ -35,6 +38,15 @@
)


class DepthComputationMethod(Enum):
"""
Enum for the different methods of computing the depth of the mouth.
"""

AVERAGE_MOUTH_POINTS = "average_mouth_points"
FACIAL_PLANE = "facial_plane"


class FaceDetectionNode(Node):
"""
This node publishes a 3d PointStamped location
@@ -337,7 +349,7 @@ def detect_largest_face(
is_face_detected: Whether a face was detected in the image.
mouth_center: The (u,v) coordinates of the somion of the largest face
detected in the image.
mouth_points: A list of (u,v) coordinates of the facial landmarks of the mouth.
face_points: A list of (u,v) coordinates of the landmarks of the face.
face_bbox: The bounding box of the largest face detected in the image,
in the form (x, y, w, h).
"""
@@ -354,7 +366,7 @@ def detect_largest_face(
faces = self.detector.detectMultiScale(image_gray)
is_face_detected = len(faces) > 0
img_mouth_center = (0, 0)
img_mouth_points = []
img_face_points = []
face_bbox = (0, 0, 0, 0)

if is_face_detected:
@@ -377,30 +389,138 @@
# Find stomion (mouth center) in image. See below for the landmark indices:
# https://pyimagesearch.com/2017/04/03/facial-landmarks-dlib-opencv-python/
img_mouth_center = landmarks[largest_face[1]][0][66]
img_mouth_points = landmarks[largest_face[1]][0][48:68]
img_face_points = landmarks[largest_face[1]][0]

return is_face_detected, img_mouth_center, img_face_points, face_bbox

def depth_method_average_mouth_points(
self,
mouth_points: npt.NDArray,
image_depth: npt.NDArray,
threshold: float = 0.5,
) -> Tuple[bool, float]:
"""
Compute the depth of the mouth stomion by averaging the depth of all
viable mouth points.
Parameters
----------
mouth_points: A list of (u,v) coordinates of all facial landmarks.
image_depth: The depth image corresponding to the RGB image that the
face was detected in.
threshold: The minimum fraction of mouth points that must be valid for
the depth to be computed.
Returns
-------
detected: whether the mouth was detected in the depth image.
depth_mm: The depth of the mouth in mm.
"""
# pylint: disable=too-many-locals
# This function is not too complex, but it does have a lot of local variables.

# Retrieve the depth value averaged over all viable mouth coordinates
depth_sum = 0
num_valid_points = 0
for point in mouth_points:
u, v = int(point[0]), int(point[1])
# Ensure that point is contained within the depth image frame
if 0 <= u < image_depth.shape[1] and 0 <= v < image_depth.shape[0]:
if image_depth[v][u] != 0:
num_valid_points += 1
depth_sum += image_depth[v][u]
if num_valid_points >= threshold * len(mouth_points):
# If at least a `threshold` fraction of the mouth points are valid,
# use the average depth of those points
depth_mm = depth_sum / float(num_valid_points)
return True, depth_mm

self.get_logger().warn(
"The majority of mouth points were invalid (outside the frame or depth not detected). "
"Unable to compute mouth depth by averaging mouth points."
)
return False, 0
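
# A worked toy example (editor's sketch, not repo code), assuming a
# 480x640 uint16 depth image in millimeters:
#   depth = np.zeros((480, 640), dtype=np.uint16)
#   depth[100:120, 200:240] = 450  # mouth region ~450 mm away
#   pts = np.array([[210, 105], [225, 110], [235, 118]])
#   # All three (u, v) points are in frame with nonzero depth, so
#   # depth_method_average_mouth_points(pts, depth) returns (True, 450.0)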

return is_face_detected, img_mouth_center, img_mouth_points, face_bbox
def depth_method_facial_plane(
self, face_points: npt.NDArray, image_depth: npt.NDArray, threshold: float = 0.5
) -> Tuple[bool, float]:
"""
Compute the depth of the mouth stomion by fitting a plane to the
internal points of the face and calculating the depth of the stomion
from the plane.
Parameters
----------
face_points: A list of (u,v) coordinates of all facial landmarks.
image_depth: The depth image corresponding to the RGB image that the
face was detected in.
threshold: The minimum fraction of face points that must be valid for
the plane to be fit.
Returns
-------
detected: whether the mouth was detected in the depth image.
depth_mm: The depth of the mouth in mm.
"""
# pylint: disable=too-many-locals
# This function is not too complex, but it does have a lot of local variables.

internal_face_points = face_points[18:] # Ignore points along the face outline
face_points_3d = []
for point in internal_face_points:
u, v = int(point[0]), int(point[1])
# Ensure that point is contained within the depth image frame
if 0 <= u < image_depth.shape[1] and 0 <= v < image_depth.shape[0]:
if image_depth[v][u] != 0:
z = image_depth[v][u]
face_points_3d.append([u, v, z])
if len(face_points_3d) >= threshold * len(internal_face_points):
# Fit a plane to detected face points
plane = Plane.best_fit(Points(face_points_3d))
a, b, c, d = plane.cartesian()

# Locate u,v of stomion. See below for the landmark indices:
# https://pyimagesearch.com/2017/04/03/facial-landmarks-dlib-opencv-python/
stomion_u, stomion_v = face_points[66]
# Calculate depth of stomion u,v from fit plane
depth_mm = (d + (a * stomion_u) + (b * stomion_v)) / (-c)
return True, depth_mm
self.get_logger().warn(
"The majority of internal face points were invalid (outside the frame or depth not detected). "
"Unable to compute mouth depth from facial plane."
)
return False, 0
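
# Editor's sketch (not repo code): checking the plane-depth formula with
# scikit-spatial on four synthetic landmarks from a flat "face" 500 mm away:
#   from skspatial.objects import Plane, Points
#   plane = Plane.best_fit(
#       Points([[0, 0, 500], [10, 0, 500], [0, 10, 500], [10, 10, 500]])
#   )
#   a, b, c, d = plane.cartesian()  # ~(0, 0, 1, -500), up to sign
#   u, v = 5, 5
#   depth_mm = (d + a * u + b * v) / (-c)  # = 500.0, as expected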

def get_mouth_depth(
self, rgb_msg: Union[CompressedImage, Image], face_points: np.ndarray
self,
rgb_msg: Union[CompressedImage, Image],
face_points: npt.NDArray,
methods: List[DepthComputationMethod] = [
DepthComputationMethod.AVERAGE_MOUTH_POINTS,
DepthComputationMethod.FACIAL_PLANE,
],
) -> Tuple[bool, float]:
"""
Get the depth of the mouth of the largest detected face in the image.
Parameters
----------
rgb_msg: The RGB image that the face was detected in.
face_points: A list of (u,v) coordinates of the facial landmarks whose depth
should be averaged to get the predicted depth of the mouth.
face_points: A list of (u,v) coordinates of all facial landmarks.
Returns
-------
detected: whether the mouth was detected in the depth image.
depth_mm: The depth of the mouth in mm.
"""

# pylint: disable=too-many-locals
# pylint: disable=too-many-locals, dangerous-default-value
# This function is not too complex, but it does have a lot of local variables.
# In this case, a list as the default value is okay since we don't change it.

# Pull out the (u,v) coordinates of the mouth landmarks, which can be
# averaged to approximate the mouth center
mouth_points = face_points[48:68]

# Find depth image closest in time to RGB image that face was in
with self.depth_buffer_lock:
@@ -432,25 +552,24 @@ def get_mouth_depth(
desired_encoding="passthrough",
)

# Retrieve the depth value averaged over all mouth coordinates
depth_sum = 0
num_points_in_frame = 0
for point in face_points:
x, y = int(point[0]), int(point[1])
if 0 <= x < image_depth.shape[1] and 0 <= y < image_depth.shape[0]:
# Points with depth 0mm are invalid (typically out-of-range)
if image_depth[y][x] != 0:
num_points_in_frame += 1
depth_sum += image_depth[y][x]
if num_points_in_frame < 0.5 * len(face_points):
self.get_logger().warn(
"Detected face in the RGB image, but majority of mouth points "
"were invalid. Ignoring this face."
)
return False, 0
# Compute the depth of the mouth
for method in methods:
if method == DepthComputationMethod.AVERAGE_MOUTH_POINTS:
detected, depth_mm = self.depth_method_average_mouth_points(
mouth_points, image_depth
)
elif method == DepthComputationMethod.FACIAL_PLANE:
detected, depth_mm = self.depth_method_facial_plane(
face_points, image_depth
)
else:
self.get_logger().warn(
f"Invalid depth computation method: {method}. Skipping."
)
# Skip the `detected` check below, since no method ran
continue
if detected:
return detected, depth_mm

depth_mm = depth_sum / float(num_points_in_frame)
return True, depth_mm
return False, 0
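
# The default order tries averaging first and falls back to the plane
# fit; a hypothetical caller that trusts only the plane fit could pass
# methods=[DepthComputationMethod.FACIAL_PLANE] instead.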

def get_stomion_point(self, u: int, v: int, depth_mm: float) -> Point:
"""
@@ -522,14 +641,14 @@ def run(self) -> None:
(
is_face_detected,
img_mouth_center,
img_mouth_points,
img_face_points,
face_bbox,
) = self.detect_largest_face(image_bgr)

if is_face_detected:
# Get the depth of the mouth
is_face_detected_depth, depth_mm = self.get_mouth_depth(
rgb_msg, img_mouth_points
rgb_msg, img_face_points
)
if is_face_detected_depth:
# Get the 3d location of the mouth
@@ -544,7 +663,7 @@

# Annotate the image with the face
if is_face_detected:
self.annotate_image(image_bgr, img_mouth_points, face_bbox)
self.annotate_image(image_bgr, img_face_points, face_bbox)

# Publish the face detection image
self.publisher_image.publish(
@@ -558,16 +677,16 @@
def annotate_image(
self,
image_bgr: npt.NDArray,
img_mouth_points: npt.NDArray,
img_face_points: npt.NDArray,
face_bbox: Tuple[int, int, int, int],
) -> None:
"""
Annotate the image with the face and mouth center.
Annotate the image with the face bounding box and facial landmarks.
Parameters
----------
image_bgr: The OpenCV image to annotate.
img_mouth_points: A list of (u,v) coordinates of the facial landmarks of the mouth.
img_face_points: A list of (u,v) coordinates of the facial landmarks.
face_bbox: The bounding box of the largest face detected in the image,
in the form (x, y, w, h).
"""
Expand All @@ -577,7 +696,7 @@ def annotate_image(

# Annotate the image with the face.
cv2.rectangle(image_bgr, (x, y), (x + w, y + h), (255, 255, 255), 2)
for landmark_x, landmark_y in img_mouth_points:
for landmark_x, landmark_y in img_face_points:
cv2.circle(
image_bgr,
(int(landmark_x), int(landmark_y)),
@@ -84,7 +84,11 @@ def generate_launch_description():
(
"~/aligned_depth",
PythonExpression(
expression=["'", prefix, "/camera/aligned_depth_to_color/image_raw'"]
expression=[
"'",
prefix,
"/camera/aligned_depth_to_color/image_raw'",
]
),
),
]
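
# The reformatted expression is behaviorally identical: e.g., with a
# hypothetical prefix "/local", it still evaluates to
# '/local/camera/aligned_depth_to_color/image_raw'.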
