-
Notifications
You must be signed in to change notification settings - Fork 14
/
cc-tag-commercials
167 lines (125 loc) · 5.17 KB
/
cc-tag-commercials
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
#!/bin/bash
#
# /usr/local/bin/cc-tag-commercials
#
# Convert caption style information from CCExtractor to segment tags for commercials
#
# Called by the recording scripts
#
# Written by FFS on 14 Oct 2013
#
# Dependencies: GNU sed (gsed on OSX); sponge (moreutils) only if the disabled uniq step at the end is re-enabled
#
# Changelog:
#
# 2016-09-07 Strip stray spaces before caption style tag
# 2015-04-12 Corner case -- tag ends in commercial
# 2014-01-05 Use the start of the next story rather than the end of the POP to define the commercial block
#
#------------------------------------------------------------------------------------
# Help screen
#
# When the first argument is -h, --help or help, print the usage text and exit.
# The script name is taken from $0 with a quoted parameter expansion, so the
# help screen also works when the script lives under a path containing spaces.
case "$1" in
  -h|--help|help)
    SELF="${0##*/}"
    printf '\n\tSyntax: %s <filename>\n' "$SELF"
    printf '\n\t %s 2008-01-30_1830_KNBC_NBC_Nightly_News.txt\n' "$SELF"
    printf '\n\tConvert caption style information to segment tags for commercials\n'
    printf '\tand add story start tags before triple chevrons (>>>).\n'
    printf '\n\tThe original file is given the extension cs (caption styles).\n\n'
    exit
    ;;
esac
# Tool names per platform: on OSX use the g-prefixed GNU Core Utilities
# (MacPorts coreutils); everywhere else the plain names are used
# (on garuda, GNU is default).
case "$(uname)" in
  Darwin)
    DAT="gdate"
    SED="gsed"
    STAT="gstat"
    SEQ="gseq"
    MV="gmv"
    CP="gcp"
    TEE="gtee"
    ;;
  *)
    DAT="date"
    SED="sed"
    STAT="stat"
    SEQ="seq"
    MV="mv"
    CP="cp"
    TEE="tee"
    ;;
esac
# Check for flags and assign variables
if [[ -n "$1" ]]; then
  FIL="$1"
else
  printf '\nUsage: %s -h\n\n' "${0##*/}"
  exit
fi

# Verify presence
if [[ ! -f "$FIL" ]]; then
  printf '\n\tNot seeing the file %s\n' "$FIL"
  exit
fi

# Verify extension -- compare only the text after the LAST dot, so that paths
# such as ./show.txt or names with additional dots are still accepted
if [[ "${FIL##*.}" != "txt" ]]; then
  printf '\n\tThe %s script processes .txt files.\n\n' "${0##*/}"
  exit
fi

# Skip files that lack caption styles (no |RU2|, |RU3|, |RU4| or |POP| field)
if ! grep -Eq '\|(RU2|RU3|RU4|POP)\|' -- "$FIL"; then
  printf '\tNot seeing caption styles in %s\n' "$FIL"
  exit
fi

# Welcome
printf '\tCreating commercial and story tags for %s\t' "$FIL"

# File length (number of newline-terminated lines)
NLIN="$(wc -l < "$FIL")"

# Internal field separator (saved so it can be restored at the end)
OFS=$IFS

# Strip extension
FIL=${FIL%.*}

# Remove extra space after regular caption style (affects The_OReilly_Factor and AlJazeera)
if ! "$SED" -i -e 's/RU2|[ ]*/RU2|/' -e 's/RU3|[ ]*/RU3|/' -e 's/RU4|[ ]*/RU4|/' -- "$FIL.txt"; then
  printf '\n\tUnable to clean up caption styles in %s\n' "$FIL.txt" >&2
  exit 1
fi

# Keep the cleaned input as .cs (caption styles); the .txt file is rebuilt from it.
# Stop if the rename fails, otherwise the rebuild would append to the input file.
if ! mv -- "$FIL.txt" "$FIL.cs"; then
  printf '\n\tUnable to rename %s to %s\n' "$FIL.txt" "$FIL.cs" >&2
  exit 1
fi
# Examine the file a line at a time
#
# Reads $FIL.cs once, top to bottom, and rebuilds $FIL.txt from it:
#   - lines starting with END or with a capital letter (header) are copied as is
#   - every other line is split on '|' and written back without its fourth
#     field (the caption style), i.e. as field0|field1|field2|field4
#   - each run of POP lines gets a "Type=Commercial" tag inserted before its
#     first line, and the first line after the run gets a "Type=Story start" tag
# Story start tags at triple chevrons (>>>) are not generated here.
#
# Lines are read verbatim (no backslash processing, no whitespace trimming) and
# fields are never subject to word splitting or filename expansion, so caption
# text such as "*" or "C:\new" is copied unchanged.

# Insert a tag line before the first line of $FIL.txt that begins with the
# given start|end timestamp pair.
#   $1 - start timestamp    $2 - end timestamp    $3 - complete tag line
# The timestamps are used as a sed regular expression, as before.
insert_tag_before() {
  "$SED" -i "1,/^$1|$2/ {/^$1|$2/i\\
$3
}" "$FIL.txt"
}

# State of the commercial block currently being collected:
#   SAD / SEAD - start and end time of its first POP line
#   EAD        - end time of its most recent POP line
# Reset explicitly so values inherited from the environment cannot leak in.
SAD="" SEAD="" EAD=""

# The "|| [[ -n ... ]]" part keeps a final line that lacks a trailing newline
while IFS= read -r LIN || [[ -n "$LIN" ]]; do

  # At the end of the file
  if [[ "${LIN:0:3}" == "END" ]]; then
    # If we end in the middle of a commercial (corner case), there is no
    # following story line, so use the end time of the last POP line
    if [[ -n "$SAD" && -n "$EAD" ]]; then
      insert_tag_before "$SAD" "$SEAD" "$SAD|$EAD|SEG_00|Type=Commercial"
      SAD="" EAD=""
      printf '.'
    fi
    # Write the END line
    printf '%s\n' "$LIN" >> "$FIL.txt"
    continue
  fi

  # Keep the lines that start with a letter (header)
  if [[ "${LIN:0:1}" =~ [A-Z] ]]; then
    printf '%s\n' "$LIN" >> "$FIL.txt"
    continue
  fi

  # Capture the field values in each line (for all other lines) in an array;
  # IFS is changed for this read only
  IFS='|' read -r -a FLD <<< "$LIN"

  # Rewrite the line without the caption style field
  printf '%s\n' "${FLD[0]}|${FLD[1]}|${FLD[2]}|${FLD[4]}" >> "$FIL.txt"

  if [[ "${FLD[3]}" == "POP" ]]; then
    # Remember the start and end time of the first line of the commercial
    if [[ -z "$SAD" ]]; then
      SAD="${FLD[0]}" SEAD="${FLD[1]}"
    fi
    # Store the successive end times
    EAD="${FLD[1]}"
  elif [[ -n "$EAD" ]]; then
    # First non-POP line after a commercial: the block runs from the start of
    # its first POP line to the start time of this line
    insert_tag_before "$SAD" "$SEAD" "$SAD|${FLD[0]}|SEG_00|Type=Commercial"
    SAD="" EAD=""
    printf '.'
    # At the same time, insert a single story start tag at the end of the commercial
    insert_tag_before "${FLD[0]}" "${FLD[1]}" "${FLD[0]}|${FLD[1]}|SEG_00|Type=Story start"
  fi

done < "$FIL.cs"

# The internal field separator is only overridden for individual read calls
# above, so there is nothing to restore here.

# Remove duplicate lines (SEG lines after POP and before >>>)
#uniq $FIL.txt | sponge $FIL.txt

# Receipt
printf '\tdone\n'
# EOF