diff --git a/demo_final/KalmanFilter1D.py b/demo_final/KalmanFilter1D.py
new file mode 100755
index 0000000..0ecbf93
--- /dev/null
+++ b/demo_final/KalmanFilter1D.py
@@ -0,0 +1,39 @@
+#!/usr/bin/env python3
+
+# --------------------------------------------------------
+# Copyright (C) 2020 NVIDIA Corporation. All rights reserved.
+# NVIDIA Source Code License (1-Way Commercial)
+# Code written by Pavlo Molchanov, Shalini De Mello.
+# --------------------------------------------------------
+
+import numpy as np
+
+class Kalman1D(object):
+
+    def __init__(self, R=0.001**2, sz=100):
+        self.Q = 1e-5  # process variance
+        # allocate space for arrays
+        self.xhat = np.zeros(sz, dtype=complex)       # a posteriori estimate of x
+        self.P = np.zeros(sz, dtype=complex)          # a posteriori error estimate
+        self.xhatminus = np.zeros(sz, dtype=complex)  # a priori estimate of x
+        self.Pminus = np.zeros(sz, dtype=complex)     # a priori error estimate
+        self.K = np.zeros(sz, dtype=complex)          # gain or blending factor
+        self.R = R  # estimate of measurement variance, change to see effect
+        self.sz = sz
+        # initial guesses
+        self.xhat[0] = 0.0
+        self.P[0] = 1.0
+        self.k = 1
+
+    def update(self, val):
+        k = self.k % self.sz
+        km = (self.k - 1) % self.sz
+        self.xhatminus[k] = self.xhat[km]
+        self.Pminus[k] = self.P[km] + self.Q
+
+        # measurement update
+        self.K[k] = self.Pminus[k] / (self.Pminus[k] + self.R)
+        self.xhat[k] = self.xhatminus[k] + self.K[k] * (val - self.xhatminus[k])
+        self.P[k] = (1 - self.K[k]) * self.Pminus[k]
+        self.k = self.k + 1
+        return self.xhat[k]
diff --git a/demo_final/camera.py b/demo_final/camera.py
new file mode 100644
index 0000000..14022a9
--- /dev/null
+++ b/demo_final/camera.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+
+# --------------------------------------------------------
+# Copyright (C) 2020 NVIDIA Corporation. All rights reserved.
+# NVIDIA Source Code License (1-Way Commercial)
+# Code written by Shalini De Mello, Seonwook Park.
+# --------------------------------------------------------
+
+import cv2
+import numpy as np
+import pickle
+
+def cam_calibrate(cam_idx, cap, cam_calib):
+
+    # termination criteria
+    criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 30, 0.001)
+
+    # prepare object points, like (0,0,0), (1,0,0), (2,0,0) ....,(8,5,0)
+    pts = np.zeros((6 * 9, 3), np.float32)
+    pts[:, :2] = np.mgrid[0:9, 0:6].T.reshape(-1, 2)
+
+    # capture calibration frames
+    obj_points = []  # 3d points in real world space
+    img_points = []  # 2d points in image plane.
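+    # interactively grab chessboard views: each frame the user chooses to save
+    # below contributes one (obj_points, img_points) pair to the calibration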
+
+    frames = []
+    while True:
+        ret, frame = cap.read()
+
+        if ret:
+            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
+            ret, corners = cv2.findChessboardCorners(gray, (9, 6), None)
+            if ret:
+                cv2.cornerSubPix(gray, corners, (11, 11), (-1, -1), criteria)
+                # Draw and display the corners
+                frame_copy = frame.copy()
+                cv2.drawChessboardCorners(frame_copy, (9, 6), corners, ret)
+                cv2.imshow('points', frame_copy)
+
+                key_press = cv2.waitKey(0) & 0xFF  # s to save, c to continue, q to quit
+                if key_press == ord('s'):
+                    img_points.append(corners)
+                    obj_points.append(pts)
+                    frames.append(frame)
+                elif key_press == ord('c'):
+                    continue
+                elif key_press == ord('q'):
+                    cv2.destroyAllWindows()
+                    break
+
+    # compute calibration matrices
+    ret, mtx, dist, rvecs, tvecs = cv2.calibrateCamera(obj_points, img_points, frames[0].shape[1::-1], None, None)  # image size as (width, height)
+
+    # check
+    error = 0.0
+    for i in range(len(frames)):
+        proj_imgpoints, _ = cv2.projectPoints(obj_points[i], rvecs[i], tvecs[i], mtx, dist)
+        error += (cv2.norm(img_points[i], proj_imgpoints, cv2.NORM_L2) / len(proj_imgpoints))
+    print("Camera calibrated successfully, total re-projection error: %f" % (error / len(frames)))
+
+    cam_calib['mtx'] = mtx
+    cam_calib['dist'] = dist
+    print("Camera parameters:")
+    print(cam_calib)
+
+    pickle.dump(cam_calib, open("calib_cam%d.pkl" % (cam_idx), "wb"))
diff --git a/demo_final/face.py b/demo_final/face.py
new file mode 100644
index 0000000..fb53683
--- /dev/null
+++ b/demo_final/face.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+
+# --------------------------------------------------------
+# Copyright (C) 2020 NVIDIA Corporation. All rights reserved.
+# NVIDIA Source Code License (1-Way Commercial)
+# Code written by Seonwook Park, Shalini De Mello.
+# --------------------------------------------------------
+
+import sys
+import cv2
+
+sys.path.append("ext/mtcnn-pytorch/")
+from src import detect_faces, show_bboxes
+from PIL import Image
+
+class face:
+
+    def detect(frame, scale = 1.0, use_max='SIZE'):
+
+        # detect face
+        frame_small = cv2.resize(frame, (0, 0), fx=scale, fy=scale)
+        frame_rgb = cv2.cvtColor(frame_small, cv2.COLOR_BGR2RGB)
+        pil_im = Image.fromarray(frame_rgb)
+        bounding_boxes, landmarks = detect_faces(pil_im, min_face_size=30.0)
+        dets = [x[:4] for x in bounding_boxes]
+        scores = [x[4] for x in bounding_boxes]
+
+        face_location = []
+        if len(dets) > 0:
+            max = 0
+            max_id = -1
+            for i, d in enumerate(dets):
+                if use_max == 'SCORE':
+                    property = scores[i]
+                elif use_max == 'SIZE':
+                    property = abs(dets[i][2] - dets[i][0]) * abs(dets[i][3] - dets[i][1])
+                if max < property:
+                    max = property
+                    max_id = i
+            if use_max == 'SCORE':
+                if max > -0.5:
+                    face_location = dets[max_id]
+            else:
+                face_location = dets[max_id]
+            face_location = face_location * (1/scale)
+
+        return face_location
+
diff --git a/demo_final/frame_processor.py b/demo_final/frame_processor.py
new file mode 100644
index 0000000..0430309
--- /dev/null
+++ b/demo_final/frame_processor.py
@@ -0,0 +1,257 @@
+#!/usr/bin/env python3
+
+# --------------------------------------------------------
+# Copyright (C) 2020 NVIDIA Corporation. All rights reserved.
+# NVIDIA Source Code License (1-Way Commercial) +# Code written by Shalini De Mello, Seonwook Park +# -------------------------------------------------------- + +import cv2 +from subprocess import call +import numpy as np +from os import path +import pickle +import sys +import os +import torch + +sys.path.append("ext/eth") +from undistorter import Undistorter +from KalmanFilter1D import Kalman1D + +from face import face +from landmarks import landmarks +from head import PnPHeadPoseEstimator +from normalization import normalize + +class frame_processer: + + def __init__(self, cam_calib): + + self.cam_calib = cam_calib + + ####################################################### + #### prepare Kalman filters, R can change behaviour of Kalman filter + #### play with it to get better smoothing, larger R - more smoothing and larger delay + ####################################################### + self.kalman_filters = list() + for point in range(2): + # initialize kalman filters for different coordinates + # will be used for face detection over a single object + self.kalman_filters.append(Kalman1D(sz=100, R=0.01 ** 2)) + + self.kalman_filters_landm = list() + for point in range(68): + # initialize Kalman filters for different coordinates + # will be used to smooth landmarks over the face for a single face tracking + self.kalman_filters_landm.append(Kalman1D(sz=100, R=0.005 ** 2)) + + # initialize Kalman filter for the on-screen gaze point-of regard + self.kalman_filter_gaze = list() + self.kalman_filter_gaze.append(Kalman1D(sz=100, R=0.01 ** 2)) + + self.undistorter = Undistorter(self.cam_calib['mtx'], self.cam_calib['dist']) + self.landmarks_detector = landmarks() + self.head_pose_estimator = PnPHeadPoseEstimator() + + + def process(self, subject, cap, mon, device, gaze_network, por_available=False, show=False): + + g_t = None + data = {'image_a': [], 'gaze_a': [], 'head_a': [], 'R_gaze_a': [], 'R_head_a': []} + if por_available: + f = open('./%s_calib_target.pkl' % subject, 'rb') + targets = pickle.load(f) + + frames_read = 0 + ret, img = cap.read() + while ret: + img = self.undistorter.apply(img) + if por_available: + g_t = targets[frames_read] + frames_read += 1 + + # detect face + face_location = face.detect(img, scale=0.25, use_max='SIZE') + + if len(face_location) > 0: + # use kalman filter to smooth bounding box position + # assume work with complex numbers: + output_tracked = self.kalman_filters[0].update(face_location[0] + 1j * face_location[1]) + face_location[0], face_location[1] = np.real(output_tracked), np.imag(output_tracked) + output_tracked = self.kalman_filters[1].update(face_location[2] + 1j * face_location[3]) + face_location[2], face_location[3] = np.real(output_tracked), np.imag(output_tracked) + + # detect facial points + pts = self.landmarks_detector.detect(face_location, img) + # run Kalman filter on landmarks to smooth them + for i in range(68): + kalman_filters_landm_complex = self.kalman_filters_landm[i].update(pts[i, 0] + 1j * pts[i, 1]) + pts[i, 0], pts[i, 1] = np.real(kalman_filters_landm_complex), np.imag(kalman_filters_landm_complex) + + # compute head pose + fx, _, cx, _, fy, cy, _, _, _ = self.cam_calib['mtx'].flatten() + camera_parameters = np.asarray([fx, fy, cx, cy]) + rvec, tvec = self.head_pose_estimator.fit_func(pts, camera_parameters) + + ######### GAZE PART ######### + + # create normalized eye patch and gaze and head pose value, + # if the ground truth point of regard is given + head_pose = (rvec, tvec) + por = None + if por_available: + por = 
np.zeros((3, 1)) + por[0] = g_t[0] + por[1] = g_t[1] + entry = { + 'full_frame': img, + '3d_gaze_target': por, + 'camera_parameters': camera_parameters, + 'full_frame_size': (img.shape[0], img.shape[1]), + 'face_bounding_box': (int(face_location[0]), int(face_location[1]), + int(face_location[2] - face_location[0]), + int(face_location[3] - face_location[1])) + } + [patch, h_n, g_n, inverse_M, gaze_cam_origin, gaze_cam_target] = normalize(entry, head_pose) + # cv2.imshow('raw patch', patch) + + def preprocess_image(image): + ycrcb = cv2.cvtColor(image, cv2.COLOR_RGB2YCrCb) + ycrcb[:, :, 0] = cv2.equalizeHist(ycrcb[:, :, 0]) + image = cv2.cvtColor(ycrcb, cv2.COLOR_YCrCb2RGB) + # cv2.imshow('processed patch', image) + + image = np.transpose(image, [2, 0, 1]) # CxHxW + image = 2.0 * image / 255.0 - 1 + return image + + # estimate the PoR using the gaze network + processed_patch = preprocess_image(patch) + processed_patch = processed_patch[np.newaxis, :, :, :] + + # Functions to calculate relative rotation matrices for gaze dir. and head pose + def R_x(theta): + sin_ = np.sin(theta) + cos_ = np.cos(theta) + return np.array([ + [1., 0., 0.], + [0., cos_, -sin_], + [0., sin_, cos_] + ]).astype(np.float32) + + def R_y(phi): + sin_ = np.sin(phi) + cos_ = np.cos(phi) + return np.array([ + [cos_, 0., sin_], + [0., 1., 0.], + [-sin_, 0., cos_] + ]).astype(np.float32) + + def calculate_rotation_matrix(e): + return np.matmul(R_y(e[1]), R_x(e[0])) + + def pitchyaw_to_vector(pitchyaw): + + vector = np.zeros((3, 1)) + vector[0, 0] = np.cos(pitchyaw[0]) * np.sin(pitchyaw[1]) + vector[1, 0] = np.sin(pitchyaw[0]) + vector[2, 0] = np.cos(pitchyaw[0]) * np.cos(pitchyaw[1]) + return vector + + # compute the ground truth POR if the + # ground truth is available + R_head_a = calculate_rotation_matrix(h_n) + R_gaze_a = np.zeros((1, 3, 3)) + if type(g_n) is np.ndarray: + R_gaze_a = calculate_rotation_matrix(g_n) + + # verify that g_n can be transformed back + # to the screen's pixel location shown + # during calibration + gaze_n_vector = pitchyaw_to_vector(g_n) + gaze_n_forward = -gaze_n_vector + g_cam_forward = inverse_M * gaze_n_forward + + # compute the POR on z=0 plane + d = -gaze_cam_origin[2] / g_cam_forward[2] + por_cam_x = gaze_cam_origin[0] + d * g_cam_forward[0] + por_cam_y = gaze_cam_origin[1] + d * g_cam_forward[1] + por_cam_z = 0.0 + + x_pixel_gt, y_pixel_gt = mon.camera_to_monitor(por_cam_x, por_cam_y) + # verified for correctness of calibration targets + + input_dict = { + 'image_a': processed_patch, + 'gaze_a': g_n, + 'head_a': h_n, + 'R_gaze_a': R_gaze_a, + 'R_head_a': R_head_a, + } + if por_available: + data['image_a'].append(processed_patch) + data['gaze_a'].append(g_n) + data['head_a'].append(h_n) + data['R_gaze_a'].append(R_gaze_a) + data['R_head_a'].append(R_head_a) + + if show: + + # compute eye gaze and point of regard + for k, v in input_dict.items(): + input_dict[k] = torch.FloatTensor(v).to(device).detach() + + gaze_network.eval() + output_dict = gaze_network(input_dict) + output = output_dict['gaze_a_hat'] + g_cnn = output.data.cpu().numpy() + g_cnn = g_cnn.reshape(3, 1) + g_cnn /= np.linalg.norm(g_cnn) + + # compute the POR on z=0 plane + g_n_forward = -g_cnn + g_cam_forward = inverse_M * g_n_forward + g_cam_forward = g_cam_forward / np.linalg.norm(g_cam_forward) + + d = -gaze_cam_origin[2] / g_cam_forward[2] + por_cam_x = gaze_cam_origin[0] + d * g_cam_forward[0] + por_cam_y = gaze_cam_origin[1] + d * g_cam_forward[1] + por_cam_z = 0.0 + + x_pixel_hat, y_pixel_hat = 
mon.camera_to_monitor(por_cam_x, por_cam_y) + + output_tracked = self.kalman_filter_gaze[0].update(x_pixel_hat + 1j * y_pixel_hat) + x_pixel_hat, y_pixel_hat = np.ceil(np.real(output_tracked)), np.ceil(np.imag(output_tracked)) + + # show point of regard on screen + display = np.ones((mon.h_pixels, mon.w_pixels, 3), np.float32) + h, w, c = patch.shape + display[0:h, int(mon.w_pixels/2 - w/2):int(mon.w_pixels/2 + w/2), :] = 1.0 * patch / 255.0 + font = cv2.FONT_HERSHEY_SIMPLEX + if type(g_n) is np.ndarray: + cv2.putText(display, '.', (x_pixel_gt, y_pixel_gt), font, 0.5, (0, 0, 0), 10, cv2.LINE_AA) + cv2.putText(display, '.', (int(x_pixel_hat), int(y_pixel_hat)), font, 0.5, (0, 0, 255), 10, cv2.LINE_AA) + cv2.namedWindow("por", cv2.WINDOW_NORMAL) + cv2.setWindowProperty("por", cv2.WND_PROP_FULLSCREEN, cv2.WINDOW_FULLSCREEN) + cv2.imshow('por', display) + + # also show the face: + cv2.rectangle(img, (int(face_location[0]), int(face_location[1])), + (int(face_location[2]), int(face_location[3])), (255, 0, 0), 2) + self.landmarks_detector.plot_markers(img, pts) + self.head_pose_estimator.drawPose(img, rvec, tvec, self.cam_calib['mtx'], np.zeros((1, 4))) + cv2.imshow('image', img) + + if cv2.waitKey(1) & 0xFF == ord('q'): + cv2.destroyAllWindows() + cap.release() + break + + # read the next frame + ret, img = cap.read() + + return data + + diff --git a/demo_final/head.py b/demo_final/head.py new file mode 100644 index 0000000..9c30980 --- /dev/null +++ b/demo_final/head.py @@ -0,0 +1,175 @@ +""" +Copyright 2019 ETH Zurich, Seonwook Park + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" + +# -------------------------------------------------------- +# Copyright (C) 2020 NVIDIA Corporation. All rights reserved. +# NVIDIA Source Code License (1-Way Commercial) +# Code written by Shalini De Mello, Seonwook Park. 
+# -------------------------------------------------------- + +import os +import cv2 +import eos +import numpy as np + +class EosHeadPoseEstimator(object): + + def __init__(self): + cwd = os.path.dirname(__file__) + base_dir = cwd + '/ext/eos' + + model = eos.morphablemodel.load_model(base_dir + '/share/sfm_shape_3448.bin') + self.blendshapes = eos.morphablemodel.load_blendshapes( + base_dir + '/share/expression_blendshapes_3448.bin') + self.morphablemodel_with_expressions = eos.morphablemodel.MorphableModel( + model.get_shape_model(), self.blendshapes, + eos.morphablemodel.PcaModel(), + model.get_texture_coordinates(), + ) + self.landmark_mapper = eos.core.LandmarkMapper( + base_dir + '/share/ibug_to_sfm.txt') + self.edge_topology = eos.morphablemodel.load_edge_topology( + base_dir + '/share/sfm_3448_edge_topology.json') + self.contour_landmarks = eos.fitting.ContourLandmarks.load( + base_dir + '/share/ibug_to_sfm.txt') + self.model_contour = eos.fitting.ModelContour.load( + base_dir + '/share/sfm_model_contours.json') + + def fit_func(self, landmarks, image_size): + image_w, image_h = image_size + return eos.fitting.fit_shape_and_pose( + self.morphablemodel_with_expressions, landmarks_to_eos(landmarks), + self.landmark_mapper, image_w, image_h, self.edge_topology, + self.contour_landmarks, self.model_contour, + ) + + +def landmarks_to_eos(landmarks): + out = [] + for i, (x, y) in enumerate(landmarks[:68, :]): + out.append(eos.core.Landmark(str(i + 1), [x, y])) + return out + + +class PnPHeadPoseEstimator(object): + ibug_ids_to_use = sorted([ + 28, 29, 30, 31, # nose ridge + 32, 33, 34, 35, 36, # nose base + 37, 40, # left-eye corners + 43, 46, # right-eye corners + ]) + + def __init__(self): + # Load and extract vertex positions for selected landmarks + cwd = os.path.dirname(__file__) + base_dir = cwd + '/ext/eos' + self.model = eos.morphablemodel.load_model( + base_dir + '/share/sfm_shape_3448.bin') + self.shape_model = self.model.get_shape_model() + self.landmarks_mapper = eos.core.LandmarkMapper( + base_dir + '/share/ibug_to_sfm.txt') + self.sfm_points_ibug_subset = np.array([ + self.shape_model.get_mean_at_point( + int(self.landmarks_mapper.convert(str(d))) + ) + for d in range(1, 69) + if self.landmarks_mapper.convert(str(d)) is not None + ]) + + self.sfm_points_for_pnp = np.array([ + self.shape_model.get_mean_at_point( + int(self.landmarks_mapper.convert(str(d))) + ) + for d in self.ibug_ids_to_use + ]) + + # Rotate face around + rotate_mat = np.asarray([[1, 0, 0], [0, -1, 0], [0, 0, -1]], dtype=np.float64) + self.sfm_points_ibug_subset = np.matmul(self.sfm_points_ibug_subset.reshape(-1, 3), rotate_mat) + self.sfm_points_for_pnp = np.matmul(self.sfm_points_for_pnp.reshape(-1, 3), rotate_mat) + + # Center on mean point between eye corners + between_eye_point = np.mean(self.sfm_points_for_pnp[-4:, :], axis=0) + self.sfm_points_ibug_subset -= between_eye_point.reshape(1, 3) + self.sfm_points_for_pnp -= between_eye_point.reshape(1, 3) + + # # Visualize selected vertices as scatter plot + # print(self.sfm_points_for_pnp) + # import matplotlib.pyplot as plt + # from mpl_toolkits.mplot3d import Axes3D + # fig = plt.figure(figsize=(8,8)) + # ax = fig.add_subplot(111, projection='3d') + # ax.scatter( + # self.sfm_points_for_pnp[:, 0], + # self.sfm_points_for_pnp[:, 1], + # self.sfm_points_for_pnp[:, 2], + # ) + # ax.set_xlabel('x') + # ax.set_ylabel('y') + # ax.set_zlabel('z') + # plt.show(block=True) + + def fit_func(self, landmarks, camera_parameters): + landmarks = np.array([ + 
landmarks[i - 1, :] + for i in self.ibug_ids_to_use + ], dtype=np.float64) + fx, fy, cx, cy = camera_parameters + + # Initial fit + camera_matrix = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]], dtype=np.float64) + success, rvec, tvec, inliers = cv2.solvePnPRansac(self.sfm_points_for_pnp, landmarks, + camera_matrix, None, flags=cv2.SOLVEPNP_EPNP) + + # Second fit for higher accuracy + success, rvec, tvec = cv2.solvePnP(self.sfm_points_for_pnp, landmarks, camera_matrix, None, + rvec=rvec, tvec=tvec, useExtrinsicGuess=True, flags=cv2.SOLVEPNP_ITERATIVE) + + return rvec, tvec + + def project_model(self, rvec, tvec, camera_parameters): + fx, fy, cx, cy = camera_parameters + camera_matrix = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]], dtype=np.float64) + points, _ = cv2.projectPoints(self.sfm_points_ibug_subset, rvec, tvec, camera_matrix, None) + return points + + + def drawPose(self, img, r, t, cam, dist): + + modelAxes = np.array([ + np.array([0., -20., 0.]).reshape(1, 3), + np.array([50., -20., 0.]).reshape(1, 3), + np.array([0., -70., 0.]).reshape(1, 3), + np.array([0., -20., -50.]).reshape(1, 3) + ]) + + projAxes, jac = cv2.projectPoints(modelAxes, r, t, cam, dist) + + cv2.line(img, (int(projAxes[0, 0, 0]), int(projAxes[0, 0, 1])), + (int(projAxes[1, 0, 0]), int(projAxes[1, 0, 1])), + (0, 255, 255), 2) + cv2.line(img, (int(projAxes[0, 0, 0]), int(projAxes[0, 0, 1])), + (int(projAxes[2, 0, 0]), int(projAxes[2, 0, 1])), + (255, 0, 255), 2) + cv2.line(img, (int(projAxes[0, 0, 0]), int(projAxes[0, 0, 1])), + (int(projAxes[3, 0, 0]), int(projAxes[3, 0, 1])), + (255, 255, 0), 2) diff --git a/demo_final/landmarks.py b/demo_final/landmarks.py new file mode 100644 index 0000000..82291ee --- /dev/null +++ b/demo_final/landmarks.py @@ -0,0 +1,197 @@ +#!/usr/bin/env python3 + +# -------------------------------------------------------- +# Copyright (C) 2020 NVIDIA Corporation. All rights reserved. +# NVIDIA Source Code License (1-Way Commercial) +# Code written by Shalini De Mello, Seonwook Park. 
+# -------------------------------------------------------- + +import sys +import cv2 +import numpy as np +from collections import OrderedDict + +import torch +import torch.nn as nn +import torch.backends.cudnn as cudnn + +sys.path.append("ext/HRNet-Facial-Landmark-Detection") +from lib.config import config +import lib.models as models +from lib.datasets import get_dataset +from lib.core import evaluation +from lib.utils import transforms + +from face import face + +class landmarks: + + def __init__(self, config=config): + + config.defrost() + config.merge_from_file("ext/HRNet-Facial-Landmark-Detection/experiments/wflw/face_alignment_wflw_hrnet_w18.yaml") + config.freeze() + + cudnn.benchmark = config.CUDNN.BENCHMARK + cudnn.determinstic = config.CUDNN.DETERMINISTIC + cudnn.enabled = config.CUDNN.ENABLED + + config.defrost() + config.MODEL.INIT_WEIGHTS = False + config.freeze() + + self.model = models.get_face_alignment_net(config) + state_dict = torch.load("ext/HRNet-Facial-Landmark-Detection/hrnetv2_pretrained/HR18-WFLW.pth") + self.model.load_state_dict(state_dict, strict=False) + + gpus = list(config.GPUS) + self.model = nn.DataParallel(self.model, device_ids=gpus).cuda() + + def map_to_300vw(self): + + DLIB_68_PTS_MODEL_IDX = { + "jaw": list(range(0, 17)), + "left_eyebrow": list(range(17, 22)), + "right_eyebrow": list(range(22, 27)), + "nose": list(range(27, 36)), + "left_eye": list(range(36, 42)), + "right_eye": list(range(42, 48)), + "left_eye_poly": list(range(36, 42)), + "right_eye_poly": list(range(42, 48)), + "mouth": list(range(48, 68)), + "eyes": list(range(36, 42)) + list(range(42, 48)), + "eyebrows": list(range(17, 22)) + list(range(22, 27)), + "eyes_and_eyebrows": list(range(17, 22)) + list(range(22, 27)) + list(range(36, 42)) + list(range(42, 48)), + } + + WFLW_98_PTS_MODEL_IDX = { + "jaw": list(range(0, 33)), + "left_eyebrow": list(range(33, 42)), + "right_eyebrow": list(range(42, 51)), + "nose": list(range(51, 60)), + "left_eye": list(range(60, 68)) + [96], + "right_eye": list(range(68, 76)) + [97], + "left_eye_poly": list(range(60, 68)), + "right_eye_poly": list(range(68, 76)), + "mouth": list(range(76, 96)), + "eyes": list(range(60, 68)) + [96] + list(range(68, 76)) + [97], + "eyebrows": list(range(33, 42)) + list(range(42, 51)), + "eyes_and_eyebrows": list(range(33, 42)) + list(range(42, 51)) + list(range(60, 68)) + [96] + list( + range(68, 76)) + [97], + } + + DLIB_68_TO_WFLW_98_IDX_MAPPING = OrderedDict() + DLIB_68_TO_WFLW_98_IDX_MAPPING.update(dict(zip(range(0, 17), range(0, 34, 2)))) # jaw | 17 pts + DLIB_68_TO_WFLW_98_IDX_MAPPING.update( + dict(zip(range(17, 22), range(33, 38)))) # left upper eyebrow points | 5 pts + DLIB_68_TO_WFLW_98_IDX_MAPPING.update( + dict(zip(range(22, 27), range(42, 47)))) # right upper eyebrow points | 5 pts + DLIB_68_TO_WFLW_98_IDX_MAPPING.update(dict(zip(range(27, 36), range(51, 60)))) # nose points | 9 pts + DLIB_68_TO_WFLW_98_IDX_MAPPING.update({36: 60}) # left eye points | 6 pts + DLIB_68_TO_WFLW_98_IDX_MAPPING.update({37: 61}) + DLIB_68_TO_WFLW_98_IDX_MAPPING.update({38: 63}) + DLIB_68_TO_WFLW_98_IDX_MAPPING.update({39: 64}) + DLIB_68_TO_WFLW_98_IDX_MAPPING.update({40: 65}) + DLIB_68_TO_WFLW_98_IDX_MAPPING.update({41: 67}) + DLIB_68_TO_WFLW_98_IDX_MAPPING.update({42: 68}) # right eye | 6 pts + DLIB_68_TO_WFLW_98_IDX_MAPPING.update({43: 69}) + DLIB_68_TO_WFLW_98_IDX_MAPPING.update({44: 71}) + DLIB_68_TO_WFLW_98_IDX_MAPPING.update({45: 72}) + DLIB_68_TO_WFLW_98_IDX_MAPPING.update({46: 73}) + DLIB_68_TO_WFLW_98_IDX_MAPPING.update({47: 
75}) + DLIB_68_TO_WFLW_98_IDX_MAPPING.update(dict(zip(range(48, 68), range(76, 96)))) # mouth points | 20 pts + + WFLW_98_TO_DLIB_68_IDX_MAPPING = {k: v for k, v in DLIB_68_TO_WFLW_98_IDX_MAPPING.items()} + + return list(WFLW_98_TO_DLIB_68_IDX_MAPPING.values()) + + def detect(self, face_location, frame): + + x_min = face_location[0] + y_min = face_location[1] + x_max = face_location[2] + y_max = face_location[3] + + w = x_max - x_min + h = y_max - y_min + scale = max(w, h) / 200 + scale *= 1.25 + + center_w = (x_min + x_max) / 2 + center_h = (y_min + y_max) / 2 + center = torch.Tensor([center_w, center_h]) + + frame_rgb = np.array(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB), dtype=np.float32) + img = transforms.crop(frame_rgb, center, scale, [256, 256], rot=0) + + img = img.astype(np.float32) + img_mean = np.array([0.485, 0.456, 0.406], dtype=np.float32) + img_std = np.array([0.229, 0.224, 0.225], dtype=np.float32) + img = (img / 255.0 - img_mean) / img_std + img = img.transpose([2, 0, 1]) + img = np.expand_dims(img, axis=0) + img = torch.Tensor(img) + + self.model.eval() + output = self.model(img) + score_map = output.data.cpu() + center = np.expand_dims(np.array(center, dtype=np.float32), axis=0) + scale = np.expand_dims(np.array(scale, dtype=np.float32), axis=0) + preds = evaluation.decode_preds(score_map, center, scale, [64, 64]) + preds = np.squeeze(preds.numpy(), axis=0) + + # get the 68 300 VW points: + idx_300vw = self.map_to_300vw() + preds = preds[idx_300vw, :] + + return preds + + def plot_markers(self, img, markers, color=(0, 0, 255), radius=3, drawline=False): + # plot all 68 pts on the face image + N = markers.shape[0] + # if N >= 68: + # last_point = 68 + for i in range(0, N): + x = markers[i, 0] + y = markers[i, 1] + # cv2.circle(img, (x, y), radius, color) + font = cv2.FONT_HERSHEY_SIMPLEX + cv2.putText(img, str(i), (x, y), font, 0.3, (255, 0, 0), 1, cv2.LINE_AA) + + if drawline: + def draw_line(start, end): + for i in range(start, end): + x1 = markers[i, 0] + y1 = markers[i, 1] + x2 = markers[i + 1, 0] + y2 = markers[i + 1, 1] + cv2.line(img, (x1, y1), (x2, y2), color) + + draw_line(0, 16) + draw_line(17, 21) + draw_line(22, 26) + draw_line(27, 35) + draw_line(36, 41) + draw_line(42, 47) + draw_line(48, 67) + + return img + + +# landmarks_detector = landmarks() +# +# img = cv2.imread('test2.jpg') +# #img = cv2.resize(img, None, fx=0.5, fy=0.5) +# +# # detect the largest face: +# face_location = face.detect(img, use_max='SIZE') +# +# # detect facial points +# pts = landmarks_detector.detect(face_location, img) +# +# # display +# cv2.rectangle(img, (int(face_location[0]), int(face_location[1])), (int(face_location[2]), int(face_location[3])), (255, 0, 0), 2) +# landmarks_detector.plot_markers(img, pts, drawline=True) +# cv2.imwrite('test_out.png',img) +# cv2.imshow('test', img) +cv2.waitKey(0) diff --git a/demo_final/monitor.py b/demo_final/monitor.py new file mode 100644 index 0000000..fd1f737 --- /dev/null +++ b/demo_final/monitor.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 + +# -------------------------------------------------------- +# Copyright (C) 2020 NVIDIA Corporation. All rights reserved. +# NVIDIA Source Code License (1-Way Commercial) +# Code written by Shalini De Mello, Seonwook Park. 
+# --------------------------------------------------------
+
+import gi.repository
+gi.require_version('Gdk', '3.0')
+from gi.repository import Gdk
+import numpy as np
+
+class monitor:
+
+    def __init__(self):
+        display = Gdk.Display.get_default()
+        screen = display.get_default_screen()
+        default_screen = screen.get_default()
+        num = default_screen.get_number()
+
+        self.h_mm = default_screen.get_monitor_height_mm(num)
+        self.w_mm = default_screen.get_monitor_width_mm(num)
+
+        self.h_pixels = default_screen.get_height()
+        self.w_pixels = default_screen.get_width()
+
+    def monitor_to_camera(self, x_pixel, y_pixel):
+
+        # assumes a built-in laptop camera, centered and located 10 mm above the display
+        # update this function for your camera and monitor using: https://github.com/computer-vision/takahashi2012cvpr
+        x_cam_mm = ((int(self.w_pixels/2) - x_pixel)/self.w_pixels) * self.w_mm
+        y_cam_mm = 10.0 + (y_pixel/self.h_pixels) * self.h_mm
+        z_cam_mm = 0.0
+
+        return x_cam_mm, y_cam_mm, z_cam_mm
+
+    def camera_to_monitor(self, x_cam_mm, y_cam_mm):
+        # assumes a built-in laptop camera, centered and located 10 mm above the display
+        # update this function for your camera and monitor using: https://github.com/computer-vision/takahashi2012cvpr
+        x_mon_pixel = np.ceil(int(self.w_pixels/2) - x_cam_mm * self.w_pixels / self.w_mm)
+        y_mon_pixel = np.ceil((y_cam_mm - 10.0) * self.h_pixels / self.h_mm)
+
+        return x_mon_pixel, y_mon_pixel
diff --git a/demo_final/normalization.py b/demo_final/normalization.py
new file mode 100644
index 0000000..d7b42c0
--- /dev/null
+++ b/demo_final/normalization.py
@@ -0,0 +1,159 @@
+"""
+Copyright 2019 ETH Zurich, Seonwook Park
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+
+#!/usr/bin/env python3
+
+# --------------------------------------------------------
+# Copyright (C) 2020 NVIDIA Corporation. All rights reserved.
+# NVIDIA Source Code License (1-Way Commercial)
+# Code written by Seonwook Park, Shalini De Mello.
+# -------------------------------------------------------- + +import cv2 +import numpy as np + +from head import PnPHeadPoseEstimator +head_pose_estimator = PnPHeadPoseEstimator() + +def common_pre(entry, head_pose): + + rvec, tvec = head_pose + if rvec is None or tvec is None: + raise ValueError('rvec or tvec is None') + + # Calculate rotation matrix and euler angles + rvec = rvec.reshape(3, 1) + tvec = tvec.reshape(3, 1) + rotate_mat, _ = cv2.Rodrigues(rvec) + + # Reconstruct frame + full_frame = cv2.cvtColor(entry['full_frame'], cv2.COLOR_BGR2RGB) + + # Form camera matrix + fx, fy, cx, cy = entry['camera_parameters'] + camera_matrix = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]], + dtype=np.float64) + + # Get camera parameters + normalized_parameters = { + 'focal_length': 1300, + 'distance': 600, + 'size': (256, 64), + } + n_f = normalized_parameters['focal_length'] + n_d = normalized_parameters['distance'] + ow, oh = normalized_parameters['size'] + norm_camera_matrix = np.array([[n_f, 0, 0.5*ow], [0, n_f, 0.5*oh], [0, 0, 1]], + dtype=np.float64) + + # Compute gaze-origin (g_o) + landmarks_3d = np.matmul(rotate_mat, head_pose_estimator.sfm_points_for_pnp.T).T + tvec.T + g_o = np.mean(landmarks_3d[10:12, :], axis=0) + g_o = g_o.reshape(3, 1) + + g_t = g = None + if entry['3d_gaze_target'] is not None: + g_t = entry['3d_gaze_target'].reshape(3, 1) + g = g_t - g_o + g /= np.linalg.norm(g) + + return [full_frame, rvec, tvec, rotate_mat, camera_matrix, n_f, n_d, + norm_camera_matrix, ow, oh, landmarks_3d, g_o, g_t, g] + + +def normalize(entry, head_pose): + [full_frame, rvec, tvec, rotate_mat, camera_matrix, n_f, n_d, norm_camera_matrix, + ow, oh, landmarks_3d, g_o, g_t, g] = common_pre(entry, head_pose) + + # Code below is an adaptation of code by Xucong Zhang + # https://www.mpi-inf.mpg.de/departments/computer-vision-and-multimodal-computing/research/gaze-based-human-computer-interaction/revisiting-data-normalization-for-appearance-based-gaze-estimation/ + + distance = np.linalg.norm(g_o) + z_scale = n_d / distance + S = np.eye(3, dtype=np.float64) + S[2, 2] = z_scale + + hRx = rotate_mat[:, 0] + forward = (g_o / np.linalg.norm(g_o)).reshape(3) + down = np.cross(forward, hRx) + down /= np.linalg.norm(down) + right = np.cross(down, forward) + right /= np.linalg.norm(right) + R = np.c_[right, down, forward].T # rotation matrix R + + W = np.dot(np.dot(norm_camera_matrix, S), + np.dot(R, np.linalg.inv(camera_matrix))) # transformation matrix + patch = cv2.warpPerspective(full_frame, W, (ow, oh)) # image normalization + + R = np.asmatrix(R) + + # Correct head pose + head_mat = R * rotate_mat + n_h = np.array([np.arcsin(head_mat[1, 2]), np.arctan2(head_mat[0, 2], head_mat[2, 2])]) + + # Correct head pose + n_g = [] + if g is not None: + # Correct gaze + n_g = correctGaze(R, g) + + # if mode == 'face': + # to_visualize = cv2.equalizeHist(cv2.cvtColor(patch, cv2.COLOR_RGB2GRAY)) + # to_visualize = draw_gaze(to_visualize, (0.5 * ow, 0.24 * oh), n_g, length=80.0, thickness=1) + # to_visualize = draw_gaze(to_visualize, (0.5 * ow, 0.5 * oh), n_h, length=80.0, thickness=3, + # color=(0, 0, 0)) + # to_visualize = draw_gaze(to_visualize, (0.5 * ow, 0.5 * oh), n_h, length=80.0, thickness=1, + # color=(255, 255, 255)) + # cv2.imshow('zhang', to_visualize) + # cv2.waitKey(1) + + return patch, n_h, n_g, np.transpose(R), g_o, g_t + + +def correctGaze(R, g): + n_g = R * g + n_g /= np.linalg.norm(n_g) + n_g = vector_to_pitchyaw(-n_g.T).flatten() + return n_g + + +def vector_to_pitchyaw(vectors): + """Convert 
given gaze vectors to yaw (theta) and pitch (phi) angles.""" + n = vectors.shape[0] + out = np.empty((n, 2)) + vectors = np.divide(vectors, np.linalg.norm(vectors, axis=1).reshape(n, 1)) + out[:, 0] = np.arcsin(vectors[:, 1]) # theta + out[:, 1] = np.arctan2(vectors[:, 0], vectors[:, 2]) # phi + return out + + +def draw_gaze(image_in, eye_pos, pitchyaw, length=40.0, thickness=2, color=(0, 0, 255)): + """Draw gaze angle on given image with a given eye positions.""" + image_out = image_in + if len(image_out.shape) == 2 or image_out.shape[2] == 1: + image_out = cv2.cvtColor(image_out, cv2.COLOR_GRAY2BGR) + dx = -length * np.sin(pitchyaw[1]) + dy = -length * np.sin(pitchyaw[0]) + cv2.arrowedLine(image_out, tuple(np.round(eye_pos).astype(np.int32)), + tuple(np.round([eye_pos[0] + dx, eye_pos[1] + dy]).astype(int)), color, + thickness, cv2.LINE_AA, tipLength=0.2) + return image_out \ No newline at end of file diff --git a/demo_final/pattern.png b/demo_final/pattern.png new file mode 100644 index 0000000..5227c91 Binary files /dev/null and b/demo_final/pattern.png differ diff --git a/demo_final/person_calibration.py b/demo_final/person_calibration.py new file mode 100644 index 0000000..5ed7f53 --- /dev/null +++ b/demo_final/person_calibration.py @@ -0,0 +1,258 @@ +#!/usr/bin/env python3 + +# -------------------------------------------------------- +# Copyright (C) 2020 NVIDIA Corporation. All rights reserved. +# NVIDIA Source Code License (1-Way Commercial) +# Code written by Shalini De Mello, Seonwook Park. +# -------------------------------------------------------- + +import cv2 +import numpy as np +import random +import threading +import pickle +import sys + +import torch +sys.path.append("../src") +from losses import GazeAngularLoss + +directions = ['l', 'r', 'u', 'd'] +keys = {'u': 82, + 'd': 84, + 'l': 81, + 'r': 83} + +global THREAD_RUNNING +global frames + +def create_image(mon, direction, i, color, target='E', grid=True, total=9): + + h = mon.h_pixels + w = mon.w_pixels + if grid: + if total == 9: + row = i % 3 + col = int(i / 3) + x = int((0.02 + 0.48 * row) * w) + y = int((0.02 + 0.48 * col) * h) + elif total == 16: + row = i % 4 + col = int(i / 4) + x = int((0.05 + 0.3 * row) * w) + y = int((0.05 + 0.3 * col) * h) + else: + x = int(random.uniform(0, 1) * w) + y = int(random.uniform(0, 1) * h) + + # compute the ground truth point of regard + x_cam, y_cam, z_cam = mon.monitor_to_camera(x, y) + g_t = (x_cam, y_cam) + + font = cv2.FONT_HERSHEY_SIMPLEX + img = np.ones((h, w, 3), np.float32) + if direction == 'r' or direction == 'l': + if direction == 'r': + cv2.putText(img, target, (x, y), font, 0.5, color, 2, cv2.LINE_AA) + elif direction == 'l': + cv2.putText(img, target, (w - x, y), font, 0.5, color, 2, cv2.LINE_AA) + img = cv2.flip(img, 1) + elif direction == 'u' or direction == 'd': + imgT = np.ones((w, h, 3), np.float32) + if direction == 'd': + cv2.putText(imgT, target, (y, x), font, 0.5, color, 2, cv2.LINE_AA) + elif direction == 'u': + cv2.putText(imgT, target, (h - y, x), font, 0.5, color, 2, cv2.LINE_AA) + imgT = cv2.flip(imgT, 1) + img = imgT.transpose((1, 0, 2)) + + return img, g_t + + +def grab_img(cap): + global THREAD_RUNNING + global frames + while THREAD_RUNNING: + _, frame = cap.read() + frames.append(frame) + + +def collect_data(cap, mon, calib_points=9, rand_points=5): + global THREAD_RUNNING + global frames + + cv2.namedWindow("image", cv2.WINDOW_NORMAL) + cv2.setWindowProperty("image", cv2.WND_PROP_FULLSCREEN, cv2.WINDOW_FULLSCREEN) + + calib_data = {'frames': [], 
'g_t': []} + + i = 0 + while i < calib_points: + + # Start the sub-thread, which is responsible for grabbing images + frames = [] + THREAD_RUNNING = True + th = threading.Thread(target=grab_img, args=(cap,)) + th.start() + direction = random.choice(directions) + img, g_t = create_image(mon, direction, i, (0, 0, 0), grid=True, total=calib_points) + cv2.imshow('image', img) + key_press = cv2.waitKey(0) + if key_press == keys[direction]: + THREAD_RUNNING = False + th.join() + calib_data['frames'].append(frames) + calib_data['g_t'].append(g_t) + i += 1 + elif key_press & 0xFF == ord('q'): + cv2.destroyAllWindows() + break + else: + THREAD_RUNNING = False + th.join() + + i = 0 + while i < rand_points: + + # Start the sub-thread, which is responsible for grabbing images + frames = [] + THREAD_RUNNING = True + th = threading.Thread(target=grab_img, args=(cap,)) + th.start() + direction = random.choice(directions) + img, g_t = create_image(mon, direction, i, (0, 0, 0), grid=False, total=rand_points) + cv2.imshow('image', img) + key_press = cv2.waitKey(0) + if key_press == keys[direction]: + THREAD_RUNNING = False + th.join() + calib_data['frames'].append(frames) + calib_data['g_t'].append(g_t) + i += 1 + elif key_press & 0xFF == ord('q'): + cv2.destroyAllWindows() + break + else: + THREAD_RUNNING = False + th.join() + cv2.destroyAllWindows() + + return calib_data + + +def fine_tune(subject, data, frame_processor, mon, device, gaze_network, k, steps=1000, lr=1e-4, show=False): + + # collect person calibration data + fourcc = cv2.VideoWriter_fourcc(*'XVID') + out = cv2.VideoWriter('%s_calib.avi' % subject, fourcc, 30.0, (640, 480)) + target = [] + for index, frames in enumerate(data['frames']): + n = 0 + for i in range(len(frames) - 10, len(frames)): + frame = frames[i] + g_t = data['g_t'][index] + target.append(g_t) + out.write(frame) + + # # show + # cv2.putText(frame, str(n),(20,20), cv2.FONT_HERSHEY_SIMPLEX, 1, (200,0,0), 3, cv2.LINE_AA) + # cv2.imshow('img', frame) + # cv2.waitKey(30) + + n += 1 + cv2.destroyAllWindows() + out.release() + fout = open('%s_calib_target.pkl' % subject, 'wb') + pickle.dump(target, fout) + fout.close() + + vid_cap = cv2.VideoCapture('%s_calib.avi' % subject) + data = frame_processor.process(subject, vid_cap, mon, device, gaze_network, por_available=True, show=show) + vid_cap.release() + + n = len(data['image_a']) + assert n==130, "Face not detected correctly. Collect calibration data again." 
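+    # n == 130 comes from run_demo.py's collect_data(calib_points=9, rand_points=4):
+    # 13 targets x the last 10 frames kept per target; adjust this check if those values change.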
+ _, c, h, w = data['image_a'][0].shape + img = np.zeros((n, c, h, w)) + gaze_a = np.zeros((n, 2)) + head_a = np.zeros((n, 2)) + R_gaze_a = np.zeros((n, 3, 3)) + R_head_a = np.zeros((n, 3, 3)) + for i in range(n): + img[i, :, :, :] = data['image_a'][i] + gaze_a[i, :] = data['gaze_a'][i] + head_a[i, :] = data['head_a'][i] + R_gaze_a[i, :, :] = data['R_gaze_a'][i] + R_head_a[i, :, :] = data['R_head_a'][i] + + # create data subsets + train_indices = [] + for i in range(0, k*10, 10): + train_indices.append(random.sample(range(i, i + 10), 3)) + train_indices = sum(train_indices, []) + + valid_indices = [] + for i in range(k*10, n, 10): + valid_indices.append(random.sample(range(i, i + 10), 1)) + valid_indices = sum(valid_indices, []) + + input_dict_train = { + 'image_a': img[train_indices, :, :, :], + 'gaze_a': gaze_a[train_indices, :], + 'head_a': head_a[train_indices, :], + 'R_gaze_a': R_gaze_a[train_indices, :, :], + 'R_head_a': R_head_a[train_indices, :, :], + } + + input_dict_valid = { + 'image_a': img[valid_indices, :, :, :], + 'gaze_a': gaze_a[valid_indices, :], + 'head_a': head_a[valid_indices, :], + 'R_gaze_a': R_gaze_a[valid_indices, :, :], + 'R_head_a': R_head_a[valid_indices, :, :], + } + + for d in (input_dict_train, input_dict_valid): + for k, v in d.items(): + d[k] = torch.FloatTensor(v).to(device).detach() + + ############# + # Finetuning + ################# + + loss = GazeAngularLoss() + optimizer = torch.optim.SGD( + [p for n, p in gaze_network.named_parameters() if n.startswith('gaze')], + lr=lr, + ) + + gaze_network.eval() + output_dict = gaze_network(input_dict_valid) + valid_loss = loss(input_dict_valid, output_dict).cpu() + print('%04d> , Validation: %.2f' % (0, valid_loss.item())) + + for i in range(steps): + # zero the parameter gradient + gaze_network.train() + optimizer.zero_grad() + + # forward + backward + optimize + output_dict = gaze_network(input_dict_train) + train_loss = loss(input_dict_train, output_dict) + train_loss.backward() + optimizer.step() + + if i % 100 == 99: + gaze_network.eval() + output_dict = gaze_network(input_dict_valid) + valid_loss = loss(input_dict_valid, output_dict).cpu() + print('%04d> Train: %.2f, Validation: %.2f' % + (i+1, train_loss.item(), valid_loss.item())) + torch.save(gaze_network.state_dict(), '%s_gaze_network.pth.tar' % subject) + torch.cuda.empty_cache() + + # vid_cap = cv2.VideoCapture('%s_calib.avi' % subject) + # data = frame_processor.process(subject, vid_cap, mon, device, gaze_network, por_available=True, show=show) + # vid_cap.release() + + return gaze_network \ No newline at end of file diff --git a/demo_final/requirements.txt b/demo_final/requirements.txt new file mode 100644 index 0000000..6e11671 --- /dev/null +++ b/demo_final/requirements.txt @@ -0,0 +1,17 @@ +apex +h5py +imageio +moviepy +numpy +opencv_python +torch +torchvision +tqdm +yacs +pandas==0.24.2 +scipy==1.0.0 +hdf5storage +pgi +vext +vext.gi + diff --git a/demo_final/run_demo.py b/demo_final/run_demo.py new file mode 100644 index 0000000..b2d4a14 --- /dev/null +++ b/demo_final/run_demo.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python3 + +# -------------------------------------------------------- +# Copyright (C) 2020 NVIDIA Corporation. All rights reserved. +# NVIDIA Source Code License (1-Way Commercial) +# Code written by Shalini De Mello, Seonwook Park. 
+# -------------------------------------------------------- + +import time +import cv2 +import numpy as np +from os import path +from subprocess import call +import pickle +import sys +import torch +import os + +import warnings +warnings.filterwarnings("ignore") + +from monitor import monitor +from camera import cam_calibrate +from person_calibration import collect_data, fine_tune +from frame_processor import frame_processer + +################################# +# Start camera +################################# + +cam_idx = 0 + +# adjust these for your camera to get the best accuracy +call('v4l2-ctl -d /dev/video%d -c brightness=100' % cam_idx, shell=True) +call('v4l2-ctl -d /dev/video%d -c contrast=50' % cam_idx, shell=True) +call('v4l2-ctl -d /dev/video%d -c sharpness=100' % cam_idx, shell=True) + +cam_cap = cv2.VideoCapture(cam_idx) +cam_cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640) +cam_cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480) + +# calibrate camera +cam_calib = {'mtx': np.eye(3), 'dist': np.zeros((1, 5))} +if path.exists("calib_cam%d.pkl" % (cam_idx)): + cam_calib = pickle.load(open("calib_cam%d.pkl" % (cam_idx), "rb")) +else: + print("Calibrate camera once. Print pattern.png, paste on a clipboard, show to camera and capture non-blurry images in which points are detected well.") + print("Press s to save frame, c to continue, q to quit") + cam_calibrate(cam_idx, cam_cap, cam_calib) + +################################# +# Load gaze network +################################# +ted_parameters_path = 'weights_ted.pth.tar' #'../src/outputs_of_full_train_test_and_plot/checkpoints/at_step_0057101.pth.tar' +maml_parameters_path = 'weights_maml' #'../src/outputs_of_full_train_test_and_plot/Zg_OLR1e-03_IN5_ILR1e-05_Net64' +k = 9 + +# Set device +device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + +# Create network +sys.path.append("../src") +from models import DTED +gaze_network = DTED( + growth_rate=32, + z_dim_app=64, + z_dim_gaze=2, + z_dim_head=16, + decoder_input_c=32, + normalize_3d_codes=True, + normalize_3d_codes_axis=1, + backprop_gaze_to_encoder=False, +).to(device) + +################################# + +# Load T-ED weights if available +assert os.path.isfile(ted_parameters_path) +print('> Loading: %s' % ted_parameters_path) +ted_weights = torch.load(ted_parameters_path) +if torch.cuda.device_count() == 1: + if next(iter(ted_weights.keys())).startswith('module.'): + ted_weights = dict([(k[7:], v) for k, v in ted_weights.items()]) + +##################################### + +# Load MAML MLP weights if available +full_maml_parameters_path = maml_parameters_path +'/%02d.pth.tar' % k #maml_parameters_path +'/MAML_%02d/meta_learned_parameters.pth.tar' % k +assert os.path.isfile(full_maml_parameters_path) +print('> Loading: %s' % full_maml_parameters_path) +maml_weights = torch.load(full_maml_parameters_path) +ted_weights.update({ # rename to fit + 'gaze1.weight': maml_weights['layer01.weights'], + 'gaze1.bias': maml_weights['layer01.bias'], + 'gaze2.weight': maml_weights['layer02.weights'], + 'gaze2.bias': maml_weights['layer02.bias'], +}) +gaze_network.load_state_dict(ted_weights) + +################################# +# Personalize gaze network +################################# + +# Initialize monitor and frame processor +mon = monitor() +frame_processor = frame_processer(cam_calib) + +# collect person calibration data and fine- +# tune gaze network +subject = input('Enter subject name: ') +data = collect_data(cam_cap, mon, calib_points=9, rand_points=4) +# adjust steps and lr 
for best results +# To debug calibration, set show=True +gaze_network = fine_tune(subject, data, frame_processor, mon, device, gaze_network, k, steps=1000, lr=1e-4, show=False) + +################################# +# Run on live webcam feed and +# show point of regard on screen +################################# +data = frame_processor.process(subject, cam_cap, mon, device, gaze_network, show=True) diff --git a/demo_final/undistorter.py b/demo_final/undistorter.py new file mode 100644 index 0000000..38b3f63 --- /dev/null +++ b/demo_final/undistorter.py @@ -0,0 +1,51 @@ +""" +Copyright 2019 ETH Zurich, Seonwook Park + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" + +#!/usr/bin/env python3 + +# -------------------------------------------------------- +# Copyright (C) 2020 NVIDIA Corporation. All rights reserved. +# NVIDIA Source Code License (1-Way Commercial) +# Code written by Seonwook Park, Shalini De Mello. +# -------------------------------------------------------- + +import cv2 as cv +import numpy as np + +class Undistorter(object): + + def __init__(self, camera_matrix, distortion_coefficients, output_size=None, + new_camera_matrix=None): + self.camera_matrix = camera_matrix + self.distortion_coefficients = distortion_coefficients + self.output_size = output_size + self.new_camera_matrix = new_camera_matrix + self.undistort_maps = None + + def apply(self, image_original): + if self.undistort_maps is None: + h, w, _ = image_original.shape + self.undistort_maps = cv.initUndistortRectifyMap( + self.camera_matrix, self.distortion_coefficients, np.eye(3), self.new_camera_matrix, + (w, h) if self.output_size is None else self.output_size, cv.CV_32FC1) + return cv.remap(image_original, self.undistort_maps[0], self.undistort_maps[1], + cv.INTER_LINEAR)
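Usage note (illustrative sketch, not part of the committed files): throughout frame_processor.py, 2D quantities (the face bounding-box corners, the detected landmarks, and the on-screen gaze point) are smoothed with the 1D Kalman filter by packing x and y into a single complex sample. A minimal example of that pattern, assuming KalmanFilter1D.py is importable from the working directory:

    import numpy as np
    from KalmanFilter1D import Kalman1D

    kf = Kalman1D(sz=100, R=0.01 ** 2)  # larger R gives more smoothing and more delay
    for x, y in [(320.0, 240.0), (322.4, 238.1), (319.7, 241.3)]:
        smoothed = kf.update(x + 1j * y)                 # pack the 2D point as x + iy
        x_s, y_s = np.real(smoothed), np.imag(smoothed)  # unpack the filtered point
        print('smoothed: (%.1f, %.1f)' % (x_s, y_s))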
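Similarly, the point-of-regard computation that appears twice in frame_processor.process() is a ray/plane intersection: the gaze ray starts at gaze_cam_origin (camera coordinates, in mm) and points along g_cam_forward, the screen is modelled as the camera's z = 0 plane (see monitor.monitor_to_camera), so the ray parameter is d = -origin_z / forward_z. A small sketch of that computation (the helper name por_on_screen is ours, not part of the demo code):

    import numpy as np

    def por_on_screen(gaze_cam_origin, g_cam_forward, mon):
        o = np.asarray(gaze_cam_origin, dtype=float).reshape(3)
        f = np.asarray(g_cam_forward, dtype=float).reshape(3)
        d = -o[2] / f[2]                      # distance along the ray to the z = 0 plane
        por_cam_x = o[0] + d * f[0]           # intersection point in mm ...
        por_cam_y = o[1] + d * f[1]
        return mon.camera_to_monitor(por_cam_x, por_cam_y)   # ... mapped to screen pixels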