bugfix in video demo and feature in visualization #5

Open · wants to merge 10 commits into master
2 changes: 2 additions & 0 deletions src/__init__.py
@@ -5,3 +5,5 @@
# Written by Joao Carreira, Pulkit Agrawal and Katerina Fragkiadki
# --------------------------------------------------------
from . import config
import _init_paths

151 changes: 139 additions & 12 deletions src/pose_video_demo.py
@@ -1,4 +1,5 @@
#!/usr/bin/env python2
# vim: set shiftwidth=1
"""
/************************************************************************
Copyright (c) 2016, Stefan Helmert
@@ -7,17 +8,79 @@

************************************************************************/
"""
import _init_paths
import cv2
import test_demo as td
import scipy.misc as scm
import numpy as np
import csv
import time, os, sys
import argparse
try:
import _init_py_faster_rcnn_paths
import detectcore
except ImportError:
detectcore = None
print('No person detector found! Person detection is not usable; please specify the coordinates of the humans to be analyzed (--bodyPts).')
import collections
import copy
class rectangle_c:
def __init__(self):
self.x_center = 0
self.y_center = 0
self.x_range = 0
self.y_range = 0

def posevideo(input_video_name, output_video_name=None, output_csv_name=None, isGPU=True, deviceId=0, bodyPt=[600, 400]):
def humdet(frame, threshold=0.5):
global gnet
cls_vec, dets_vec = detectcore.detect_object(gnet, frame)
dets = dets_vec[cls_vec.index("person")]
detection_vec = []
# keep only detections whose confidence score reaches the threshold
inds = np.where(dets[:, -1] >= threshold)[0]
for i in inds:
bbox = dets[i, :4]
detection = rectangle_c()
detection.x_center = (bbox[0] + bbox[2])/2
detection.y_center = (bbox[1] + bbox[3])/2
detection.x_range = (bbox[2] - bbox[0])
detection.y_range = (bbox[3] - bbox[1])
detection_vec.append(detection)
return detection_vec



def sameorder(objs, objs_old):
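""" Reorder detections so each person keeps the index it had in the previous frame (greedy nearest-center matching). """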
objs_new = copy.deepcopy(objs_old)
while len(objs_new) < len(objs):
objs_new.append(rectangle_c())
objs_set = np.zeros(len(objs_new))
while np.sum(objs_set) < len(objs) and np.sum(objs_set) < len(objs_set):
for i, obj in enumerate(objs):
dist_min = float('inf')
idx = i
for j, obj_old in enumerate(objs_old):
if 0 == objs_set[j]:
dist = np.sqrt(np.power(obj_old.x_center - obj.x_center, 2) + np.power(obj_old.y_center - obj.y_center, 2))
if dist < dist_min:
dist_min = dist
idx = j
objs_new[idx] = copy.deepcopy(obj)
objs_set[idx] = 1
return objs_new
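
To illustrate the matching behaviour, here is an editorial sketch (not part of the diff; it assumes rectangle_c and sameorder above are importable):

    # Two people tracked in the previous frame; the detector returns them
    # in swapped order in the current frame.
    def make_rect(x, y):
        r = rectangle_c()
        r.x_center, r.y_center = x, y
        return r

    prev = [make_rect(100, 100), make_rect(300, 100)]  # person 0 left, person 1 right
    curr = [make_rect(304, 98), make_rect(97, 103)]    # detections arrive right-first
    ordered = sameorder(curr, prev)
    print([(r.x_center, r.y_center) for r in ordered])
    # -> [(97, 103), (304, 98)]: each person keeps its previous index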

def posevideo(input_video_name, output_video_name=None, output_csv_name=None, isGPU=True, deviceId=0, bodyPt=[600, 400], iterations=4, fixedScale=False, scaleIdx=0, fbfactor=0.0, thresh=0.9, detinterv=10, bodyPts=[600, 400], maxhumans=4):
""" processing the video """
# Find OpenCV version
global gnet
(major_ver, minor_ver, subminor_ver) = (cv2.__version__).split('.')

ief = td.PoseIEF(isGPU=isGPU, deviceId=deviceId)
@@ -27,29 +90,70 @@ def posevideo(input_video_name, output_video_name=None, output_csv_name=None, is
if(output_csv_name is not None and '' != output_csv_name):
pose_csv_file = open(output_csv_name, 'w')
pose_csv = csv.writer(pose_csv_file)
pose_csv.writerows([['x_rft', 'y_rft', 'x_rkn', 'y_rkn', 'x_rhp', 'y_rhp', 'x_lhp', 'y_lhp', 'x_lkn', 'y_lkn', 'x_lft', 'y_lft', 'x_plv', 'y_plv', 'x_trx', 'y_trx', 'x_un', 'y_un', 'x_hd', 'y_hd', 'x_rhn', 'y_rhn', 'x_rlb', 'y_rlb', 'x_rsh', 'y_rsh', 'x_lsh', 'y_lsh', 'x_llb', 'y_llb', 'x_lhn', 'y_lhn', 'x_hum', 'y_hum']])
pose_csv.writerows([['no_frm', 'no_prs', 'x_rft', 'y_rft', 'x_rkn', 'y_rkn', 'x_rhp', 'y_rhp', 'x_lhp', 'y_lhp', 'x_lkn', 'y_lkn', 'x_lft', 'y_lft', 'x_plv', 'y_plv', 'x_trx', 'y_trx', 'x_un', 'y_un', 'x_hd', 'y_hd', 'x_rhn', 'y_rhn', 'x_rlb', 'y_rlb', 'x_rsh', 'y_rsh', 'x_lsh', 'y_lsh', 'x_llb', 'y_llb', 'x_lhn', 'y_lhn', 'x_hum', 'y_hum']])
cnt = 0

humans_old = []
currPoses = []
read_bodyPts = True
while(True):
ret, frame = cap.read()
if ret is False:
break
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
if(output_video_name is not None and '' != output_video_name):
if(False == outv.isOpened()):
if(major_ver<3):
if int(major_ver) < 3:
fps = cap.get(cv2.cv.CV_CAP_PROP_FPS)
outv.open(output_video_name, cv2.cv.CV_FOURCC('A', 'P', '4', '1'), fps, (np.size(frame, 1), np.size(frame, 0)), True) #, frame.shape, True)
else:
fps = cap.get(cv2.CAP_PROP_FPS)
outv.open(output_video_name, cv2.VideoWriter_fourcc('A', 'P', '4', '1'), fps, (np.size(frame, 1), np.size(frame, 0)), True) #, frame.shape, True)
pose,_ = ief.predict(frame, bodyPt)
poses = []
if 0 < detinterv:
if 0 == cnt % detinterv:
humans = humdet(frame, thresh)
else:
humans = []
if read_bodyPts:
human = rectangle_c()
for i, val in enumerate(bodyPts):
if 0 == i % 2:
human.x_center = val
else:
human.y_center = val
humans.append(copy.deepcopy(human))
read_bodyPts = False

humans = sameorder(humans, humans_old)
humans = humans[0:maxhumans]
humans_old = humans
cnt += 1
print('Frame number: '+str(cnt))
if(output_csv_name is not None and '' != output_csv_name):
pose_arr = np.append(pose,[])
pose_csv.writerows([pose_arr])
for i, human in enumerate(humans):
bodyPt = [human.x_center, human.y_center]
try:
currPose = currPoses[i]
pose, currPose = ief.predict(frame, bodyPt, False, iterations, fixedScale, scaleIdx, True, currPose, fbfactor)
except IndexError:
# no stored pose for this person yet - start from the seed pose
pose, currPose = ief.predict(frame, bodyPt, False, iterations, fixedScale, scaleIdx)
humans_old[i].x_center = pose[7][0]
humans_old[i].y_center = pose[7][1]
try:
currPoses[i] = currPose
except IndexError:
currPoses.append(currPose)
poses.append(pose)
if(output_csv_name is not None and '' != output_csv_name):
pose_arr = np.append([cnt ,i], pose)
pose_csv.writerows([pose_arr])
frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
if(output_video_name is not None and '' != output_video_name):
frame = td.vis.plot_pose_stickmodel_cv2mat(frame, pose.squeeze().transpose((1,0)))
for i, pose in enumerate(poses):
frame = td.vis.plot_pose_stickmodel_cv2mat(frame, pose.squeeze().transpose((1,0)), 2, False)
cv2.putText(frame, str(i), (int(humans[i].x_center), int(humans[i].y_center)), cv2.FONT_HERSHEY_PLAIN, 1, (255, 0, 0))
cv2.imshow('stickmodel', frame)
cv2.waitKey(1)
outv.write(frame)
if(output_video_name is not None and '' != output_video_name):
outv.release()
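
For orientation, a minimal editorial sketch (not part of the diff) of how the CSV written above can be read back; it assumes the default output name demo_PoseIEF.csv and the header layout defined earlier (frame number, person index, then 17 x/y joint pairs):

    import csv

    with open('demo_PoseIEF.csv') as f:
        reader = csv.reader(f)
        header = next(reader)  # ['no_frm', 'no_prs', 'x_rft', 'y_rft', ...]
        for row in reader:
            frame_no, person_no = int(float(row[0])), int(float(row[1]))
            joints = [(float(x), float(y)) for x, y in zip(row[2::2], row[3::2])]
            print(frame_no, person_no, joints[6])  # joints[6] is the pelvis (plv)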
@@ -62,21 +166,44 @@ def parse_args():
parser.add_argument('--isGPU', dest='isGPU', help='Boolean value that specifies if a GPU should be used for detection - isGPU=False means the network runs on CPU', default=True, type=bool)
parser.add_argument('--deviceId', dest='deviceId', help='Natural value that specifies the number of the GPU which should be used. It starts with 0.', default='0', type=int)
parser.add_argument('--input_video', dest='input_video_name', help='The name of the video which should be analyzed.', default='video/demo.avi', type=str)
default_output_name = (parser.parse_args().input_video_name).rsplit('.', 1)[0]
parser.add_argument('--output_video', dest='output_video_name', help='The name of the video to be newly created containing the stick model.', default=default_output_name+'_PoseIEF.avi', type=str)
parser.add_argument('--output_csv', dest='output_csv_name', help='The name of the csv file to be newly created containing the joint positions.', default=default_output_name+'_PoseIEF.csv', type=str)
parser.add_argument('--output_video', dest='output_video_name', help='The name of the video to be newly created containing the stick model.', default='?', type=str)
parser.add_argument('--output_csv', dest='output_csv_name', help='The name of the csv file to be newly created containing the joint positions.', default='?', type=str)
parser.add_argument('--x_bodyPt', dest='x_bodyPt', help='Natural value that represents the x-coordinate of the pointer telling which human should be analyzed.', default=600, type=int)
parser.add_argument('--y_bodyPt', dest='y_bodyPt', help='Natural value that represents the y-coordinate of the pointer telling which human should be analyzed.', default=400, type=int)
parser.add_argument('--iterations', dest='iterations', help='Natural value that specifies how many IEF iterations should be run per image.', default=4, type=int)
parser.add_argument('--scaleIdx', dest='scaleIdx', help='Natural value that specifies the IEF scaleIdx if fixed scale flag is set.', default=0, type=int)
parser.add_argument('--fbfactor', dest='fbfactor', help='Fractional value between 0.0 and 1.0 that specifies the weight given to the pose from the previous frame when initializing the current frame.', default=0.0, type=float)
parser.add_argument('--fixedScale', dest='fixedScale', help='Boolean value that deactivates the autoscale network netScale. The scale index scaleIdx then has to be specified manually (default=0).', default=False, type=bool)
parser.add_argument('--bodyPts', dest='bodyPts', help='Pixel coordinates of starting points for pose detection, one pair per person. Order: x_person_1 y_person_1 x_person_2 ...', default=[600, 400], type=int, nargs='+')
# arguments for human detection
parser.add_argument('--thresh', dest='thresh', help='Fractional value between 0.0 and 1.0 that sets the confidence threshold for human detection.', default=0.9, type=float)
parser.add_argument('--detinterv', dest='detinterv', help='Natural value that specifies how many frames pass between human detection runs.', default=10, type=int)
parser.add_argument('--net', dest='net', help='The name of network used for human detection.', default='vgg16', type=str)
parser.add_argument('--maxhumans', dest='maxhumans', help='Natural value that specifies the maximum number of tracked humans.', default=4, type=int)


if len(sys.argv) == 1:
parser.print_help()
sys.exit(1)

args = parser.parse_args()
default_output_name = args.input_video_name.rsplit('.', 1)[0]
if '?' == args.output_video_name:
args.output_video_name = default_output_name+'_PoseIEF.avi'
if '?' == args.output_csv_name:
args.output_csv_name = default_output_name+'_PoseIEF.csv'

return args

if __name__ == '__main__':
global gnet
args = parse_args()
print('Called with args:')
print(args)
posevideo(args.input_video_name, args.output_video_name, args.output_csv_name, isGPU=args.isGPU, deviceId=args.deviceId, bodyPt=[args.x_bodyPt, args.y_bodyPt])
frargs = collections.namedtuple('args', 'demo_net cpu_mode gpu_id')
frargs.cpu_mode = not args.isGPU
frargs.demo_net = args.net
frargs.gpu_id = args.deviceId
if detectcore is not None:
gnet = detectcore.init(frargs)
posevideo(args.input_video_name, args.output_video_name, args.output_csv_name, isGPU=args.isGPU, deviceId=args.deviceId, bodyPt=[args.x_bodyPt, args.y_bodyPt], iterations=args.iterations, fixedScale=args.fixedScale, scaleIdx=args.scaleIdx, fbfactor=args.fbfactor, thresh=args.thresh, detinterv=args.detinterv, bodyPts=args.bodyPts, maxhumans=args.maxhumans)
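
For reference, an invocation of the multi-person demo might look like the following (an editorial example; the video path and coordinates are illustrative only):

    python2 src/pose_video_demo.py --input_video video/demo.avi --bodyPts 600 400 900 380 --detinterv 10 --maxhumans 2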

127 changes: 87 additions & 40 deletions src/test_demo.py
@@ -22,6 +22,8 @@
import scipy.io as sio
import scipy.misc as scm
import pdb
import time
import cv2

LIST_SCALES = cfg.SCALE_LAMBDA

@@ -51,6 +53,7 @@ def get_pose_net(isGPU=True, deviceId=0):
metaData = pickle.load(open(metaFile, 'r'))
return net, metaData


##
# Predicting Poses
class PoseIEF(object):
@@ -68,73 +71,117 @@ def __init__(self, netScale=None, netPose=None, metaPose=None, cropSz=256, poseI
self.cropSz_ = cropSz
self.poseImSz_ = poseImSz

##
#Predict pose
def predict(self, imName='./test_images/mpii-test-079555750.jpg',
bodyPt=(249,249), returnIm=False):
'''
imName : image file name for which the pose needs to be predicted
bodyPt : A point on the body of the person (torso) for whom the pose
is to be predicted
returnIm: If True, return the image also
'''
cropSz, poseImSz = self.cropSz_, self.poseImSz_
#Read the image
if(isinstance(imName, str)):
im = scm.imread(imName)
else:
im = imName


def calc_scaleIdx_from_bbox(self, width, height):
cropSz = self.cropSz_
hscale = float(cropSz) / height
wscale = float(cropSz) / width
for i,s in enumerate(LIST_SCALES):
if s < hscale or s < wscale:
return i
return len(LIST_SCALES) - 1
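
As a hypothetical walk-through (the actual values of LIST_SCALES come from cfg.SCALE_LAMBDA and are not shown in this diff): with cropSz = 256 and LIST_SCALES = [2.0, 1.0, 0.5], a detection box of width 200 and height 400 gives hscale = 0.64 and wscale = 1.28; the first scale below 1.28 is 1.0, so index 1 is returned and LIST_SCALES[1] is used for the crop.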

def proc_fixedScale(self, im, cropSz, poseImSz, bodyPt, scaleIdx):
scale = LIST_SCALES[scaleIdx]
imScale, scs, crpPos = imu.centered_crop(cropSz, copy.deepcopy(im), bodyPt, scale, returnScale=True)
oScale = np.array(scs).reshape(1,2)
oPos = np.array(crpPos).reshape(1,2)
xSt, ySt = (cropSz - poseImSz)/2, (cropSz - poseImSz)/2
xEn, yEn = xSt + poseImSz, ySt + poseImSz
imScale = imScale[ySt:yEn, xSt:xEn,:].reshape((1,poseImSz,poseImSz,3))
return imScale, xSt, ySt, oPos, oScale, scaleIdx



def proc_netScale(self, im, cropSz, poseImSz, bodyPt):
#Crop the image at different scales
t = time.time()
imData = np.zeros((len(LIST_SCALES), cropSz, cropSz, 3))
scData = np.zeros((len(LIST_SCALES), 2))
posData = np.zeros((len(LIST_SCALES), 2))
for i,s in enumerate(LIST_SCALES):
imData[i], scs, crpPos = imu.centered_crop(cropSz, copy.deepcopy(im), bodyPt, s,
returnScale=True)
imData[i], scs, crpPos = imu.centered_crop(cropSz, copy.deepcopy(im), bodyPt, s, returnScale=True)
scData[i] = np.array(scs).reshape(1,2)
posData[i] = np.array(crpPos).reshape(1,2)

print('crop time: {:.3f}s'.format(time.time() - t))
#Use the scale net to find the best scale
scaleOp = self.netScale_.forward(blobs=['fc-op'], data=imData)
t = time.time()
scaleOp = self.netScale_.forward(blobs=['fc-op'], data=imData)
print('netScale time: {:.3f}s'.format(time.time() - t))
scaleIdx = scaleOp['fc-op'].squeeze().argmax()
scale = LIST_SCALES[scaleIdx]
#Scale to use to return the image in the original space
oScale = scData[scaleIdx]
#Original location of the cropped image
oPos = posData[scaleIdx]

#Prepare image for pose prediction
imScale = imData[scaleIdx]
print('chosen scaleIdx: {} of {} scales'.format(scaleIdx, len(imData)))
xSt, ySt = (cropSz - poseImSz)/2, (cropSz - poseImSz)/2
xEn, yEn = xSt + poseImSz, ySt + poseImSz
imScale = imScale[ySt:yEn, xSt:xEn,:].reshape((1,poseImSz,poseImSz,3))
return imScale, xSt, ySt, oPos, oScale, scaleIdx

#Seed pose
currPose = np.zeros((1,17,2,1)).astype(np.float32)
for i in range(16):
currPose[0,i,0] = copy.deepcopy(self.seedPose_[0,i] - xSt)
currPose[0,i,1] = copy.deepcopy(self.seedPose_[1,i] - ySt)
#The marking point is the center of the image
currPose[0, 16, 0] = poseImSz / 2
currPose[0, 16, 1] = poseImSz / 2

def proc_netPose(self, imScale, currPose):
t = time.time()
#Dummy labels
labels = np.zeros((1,16,2,1)).astype(np.float32)
poseOp = self.netPose_.forward(blobs=['cls3_fc'], image=imScale, kp_pos=copy.deepcopy(currPose), label=labels)
print('netPose time: {:.3f}s'.format(time.time() - t))
kPred = copy.deepcopy(poseOp['cls3_fc'].squeeze())
for i in range(16):
dx, dy = kPred[i], kPred[16 + i]
#print(dx, dy)
currPose[0,i,0] = currPose[0,i,0] + self.mxStepSz_ * dx
currPose[0,i,1] = currPose[0,i,1] + self.mxStepSz_ * dy
return currPose
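
proc_netPose applies one round of Iterative Error Feedback: the pose network predicts a correction per joint, and each joint moves by at most mxStepSz_ pixels. A minimal editorial sketch of that update rule (not part of the diff), with the network replaced by a random stand-in and an assumed step bound of 20 pixels:

    import numpy as np

    def ief_step(pose, correction, max_step):
        # one IEF update: each joint moves by at most max_step pixels
        return pose + max_step * correction

    pose = np.full((16, 2), 112.0)                  # all joints at the crop centre
    correction = np.random.uniform(-1, 1, (16, 2))  # stands in for the net output
    for _ in range(4):                              # mirrors noIterations=4
        pose = ief_step(pose, correction, max_step=20.0)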
##
#Predict pose
def predict(self, imName='./test_images/mpii-test-079555750.jpg', bodyPt=(249,249), returnIm=False, noIterations=4, fixedScale=False, scaleIdx=None, initialPose=False, currPose=None, loopfactor=1.0):
'''
imName : image file name (or an already-loaded image array) for which the pose needs to be predicted
bodyPt : A point on the body of the person (torso) for whom the pose
is to be predicted
returnIm: If True, return the image also
noIterations: number of IEF refinement iterations to run
fixedScale/scaleIdx: if fixedScale is True, skip the scale network and use LIST_SCALES[scaleIdx]
initialPose/currPose/loopfactor: if initialPose is True, blend currPose (e.g. from the previous frame) with the seed pose, weighted by loopfactor
'''
tt = time.time()
cropSz, poseImSz = self.cropSz_, self.poseImSz_
#Read the image
if(isinstance(imName, str)):
im = scm.imread(imName)
else:
im = imName

if True == fixedScale:
imScale, xSt, ySt, oPos, oScale, scaleIdx = self.proc_fixedScale(im, cropSz, poseImSz, bodyPt, scaleIdx)
else:
imScale, xSt, ySt, oPos, oScale, scaleIdx = self.proc_netScale(im, cropSz, poseImSz, bodyPt)

#Seed pose
currPose_ = np.zeros((1,17,2,1)).astype(np.float32)
for i in range(16):
currPose_[0,i,0] = copy.deepcopy(self.seedPose_[0,i] - xSt)
currPose_[0,i,1] = copy.deepcopy(self.seedPose_[1,i] - ySt)
#The marking point is the center of the image
currPose_[0, 16, 0] = poseImSz / 2
currPose_[0, 16, 1] = poseImSz / 2

if False == initialPose:
currPose = currPose_
#cv2.imshow('imScale', imScale[0])
#cv2.waitKey(1)
currPose = np.add(np.multiply(currPose, loopfactor), np.multiply(currPose_, 1.0-loopfactor))
#Predict Pose
for step in range(4):
poseOp = self.netPose_.forward(blobs=['cls3_fc'], image=imScale,
kp_pos=copy.deepcopy(currPose), label=labels)
kPred = copy.deepcopy(poseOp['cls3_fc'].squeeze())
for i in range(16):
dx, dy = kPred[i], kPred[16 + i]
currPose[0,i,0] = currPose[0,i,0] + self.mxStepSz_ * dx
currPose[0,i,1] = currPose[0,i,1] + self.mxStepSz_ * dy

for step in range(noIterations):
currPose = self.proc_netPose(imScale, currPose)
#Convert the pose to the original image coordinates
origPose = (currPose.squeeze() + np.array([xSt, ySt]).reshape(1,2)) * oScale + oPos


print('predict time: {:.3f}s'.format(time.time() - tt))
if returnIm:
#return origPose, copy.deepcopy(currPose), imScale[0]
return origPose, im