-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
955 lines (806 loc) · 38.8 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
#requires oscar.mp4 video file in same folder
#transcript text file "oscar4.txt"
#might have to be in a folder name called oscar
#change certain character variables
import imageio
imageio.plugins.ffmpeg.download()
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
import time
from time import strftime,localtime
from postprocess_and_fuse_subs import compileSubs
import pickle
import os
#adjust sleeping time as needed - ES
#adjust switches as needed
sleepingTime = 400
#___SWITCHES(defaults)___#
#ES: cut transcript into snippets based on the transcript's timestamps (must be set to True for other processes to run)
snipTranscript = True
#ES: cut video into snippets based on the transcript's timestamps (must be set to True for other processes to run)
snipVideos = True
#ES: upload video snippets
uploadVideos = True
#ES: if the video/caption upload process was terminated unexpectedly before and you want to continue where you left off (uploadVideos must still be set to True):
resumeUploads = False
#ES: upload snippet transcripts (.txt)
uploadTranscripts = True
#ES: download snippet subtitle files (.vtt)
downloadCaptions = True
#ES: delete uploaded video snippets from your Youtube account once subtitle processing is complete
deleteVideos = False
#ES: upload the full video and compiled transcript to your Youtube account once complete
uploadFull = False
#ES: combine vtt snippets that were downloaded from Youtube into a total subtitle file.
combineSubtitles = True
#ES: the following switches control how subtitles are altered when concatenating snippets (i.e. when combineSubtitles = True)
#ES A feature created by RG that has yet to be explored...
placeBasedTimestamping = False
#ES: resample subtitles to prevent cut-up phrases, lone-word subtitles, and improve the subtitle structure overall (can lead to short, choppy, fast subtitles that are hard to read)
resampleSubtitles = False
#ES: IF you enabled 'resampleSubtitles' (above), you have the option to make subtitle entries full sentences (not recommended, since some timestamp/subtitle units can end up being excessively large)
fullSentenceSubtitles = False
#ES: IF you enabled 'resampleSubtitles' (above), you have the option to remove subtitle entries which may be a single word (and put them in an adjacent subtitle (verify))
removeLoneWords = False
#____________#
#ES: USER INTERVIEW SECTION
def verify_y_n(a):
    """Return a normalised 'y' or 'n', re-prompting until one is given.

    `a` is the user's first reply. It is lower-cased and stripped of
    surrounding whitespace; while the result is neither 'y' nor 'n' the
    user is asked again on the console.
    """
    choice = a.lower().strip()
    while choice not in ('y', 'n'):
        choice = raw_input("Please answer 'y' or 'n': ").lower().strip()
    return choice
def verify_y_n_none(a):
    """Return a normalised 'y', 'n' or '' (blank), re-prompting otherwise.

    `a` is the user's first reply. It is lower-cased and stripped of
    surrounding whitespace; a blank reply is accepted as-is so that the
    caller can apply its own default.
    """
    choice = a.lower().strip()
    while choice not in ('y', 'n', ''):
        choice = raw_input("Please answer 'y' or 'n', or leave the answer blank by hitting 'Enter': ").lower().strip()
    return choice
# Intro banner shown before the interactive questions: what the tool does,
# how to abort it, and the list of pipeline steps that can be switched on/off.
# The sleeps between messages presumably just pace the console output.
print "\n\n"
print "This application creates subtitles for a video for which you have an associated transcript. Make sure you have gone over README.md before proceeding."
time.sleep(1)
print "You may terminate the application at any point by pressing Ctrl+C (Cmd+C on Mac)."
time.sleep(1)
print "\n"
print "This tool:\n- snips your transcript (.txt) into text snippets based on its timestamps,\n- snips the associated video accordingly into video snippets,\n- uploads these video snippets to Youtube as private videos only visible to your account,\n- uploads the text snippets to Youtube as transcript files for these video snippets,\n- allows Youtube to sync the video and text snippets\n- downloads the text snippets as subtitle files (.vtt),\n- stitches these subtitle files together into a single subtitle file for your video.\n\nYou may switch these processes 'on' or 'off' depending on which steps you would like to run. If this is your first time running the tool, simply leave the following answers blank. For more advanced users or users who have already used this tool, please select which processes you would like to run: \n\n"
time.sleep(2)
def _ask_switch(prompt, default):
    """Ask one yes/no question and return the resulting switch value.

    `prompt` is shown verbatim through raw_input and the reply is validated
    by verify_y_n_none. 'y' yields True, 'n' yields False and a blank reply
    yields `default` (the value advertised in parentheses at the end of
    each prompt).
    """
    answer = verify_y_n_none(raw_input(prompt))
    if answer == '':
        return default
    return answer == 'y'


# Questions 1-7 set the pipeline switches declared at the top of the file.
snipVideos = _ask_switch("\n1/7 Will you be cutting your video into video snippets (y) ", True)
uploadVideos = _ask_switch("\n2/7 Will you be uploading video snippets to Youtube for syncing? (y) ", True)
if uploadVideos:
    # Uploading implies snipping. Previously only an explicit 'y' forced this,
    # while a blank reply (which also means "yes") left snipVideos untouched.
    snipVideos = True
resumeUploads = _ask_switch("\n3/7 Will you be resuming video uploads from a previously-initiated process? (n) ", False)
uploadTranscripts = _ask_switch("\n4/7 Will you be uploading text snippets for syncing with your video snippets? (y) ", True)
downloadCaptions = _ask_switch("\n5/7 Will you be downloading the generated subtitle snippets from Youtube? (y) ", True)
deleteVideos = _ask_switch("\n6/7 Would you like your uploaded video snippets to be deleted from Youtube once subtitles have been successfully generated? (n) ", False)
combineSubtitles = _ask_switch("\n7/7 Will you be combining the downloaded subtitle snippets into a single subtitle file for your video? (y) ", True)

# The 7.x follow-ups only apply when subtitles are being combined; otherwise
# the defaults set at the top of the file stay in effect.
# NOTE(review): nesting reconstructed from the question numbering (7.1.x under
# 7.1, 7.2 under 7) because the original indentation was lost - confirm.
if combineSubtitles:
    resampleSubtitles = _ask_switch("\n7.1 Would you like to reorganize subtitles according to punctuation? (Experimental; can lead to short, choppy, fast subtitles that are hard to read) (n) ", False)
    if resampleSubtitles:
        fullSentenceSubtitles = _ask_switch("\n7.1.1 Would you like to reorganize subtitles to prioritize keeping full sentences intact? (Experimental; this feature is not recommended since subtitle units tend to become excessively long) (n) ", False)
        removeLoneWords = _ask_switch("\n7.1.2 Would you like to reorganize subtitles to remove lone words? (Experimental) (n) ", False)
    placeBasedTimestamping = _ask_switch("\n7.2 Would you like to reorganize subtitles according to the presence of place names? (Experimental) (n) ", False)
# Ask for the working folder and the input files, checking that each exists
# (os.stat raises when the path is missing) before going any further.
print "\n"
folderName = raw_input("Enter the name of the folder containing your transcript and/or video and/or subtitle files\n(this folder must be located inside the 'files' folder): ")
try:
    # Only used as an existence probe; the size itself is not used afterwards.
    verifyExistence = os.stat(folderName).st_size
except Exception as e:
    print e
    print "The folder named '" + folderName + "' does not exist in the current directory. Please see README.md for instructions."
    print "exiting application..."
    time.sleep(2)
    exit()
print "\n"
# Any step other than "combine only" needs both the transcript and the video.
# NOTE(review): uploadVideos is not part of this condition - confirm it is
# always accompanied by snipVideos when it is enabled.
if snipVideos == True or uploadTranscripts == True or resumeUploads == True or downloadCaptions == True or deleteVideos == True:
    combine_only = False
    fileName = raw_input("Enter the file name of your transcript (excluding the \".txt\" extention): ")
    try:
        verifyExistence = os.stat(folderName + '/' + fileName + '.txt').st_size
    except Exception as e:
        print e
        print "The file named '" + fileName + ".txt' does not exist in the folder '" + folderName + "'. Please see README.md for instructions."
        print "exiting application..."
        time.sleep(2)
        exit()
    print "\n"
    originalVideo = raw_input("Enter the file name of your video (this time including the file's extention): ")
    try:
        verifyExistence = os.stat(folderName + '/' + originalVideo).st_size
    except Exception as e:
        print e
        print "The file named '" + originalVideo + "' does not exist in the folder '" + folderName + "'. Please see README.md for instructions."
        print "exiting application..."
        time.sleep(2)
        exit()
    print "\n"
    # Video size in units of 1,000,000 bytes, shown to the user as the disk
    # space the snippets will need.
    videoSize = os.stat(folderName + '/' + originalVideo).st_size/1000000
    answer = raw_input("If this is your first time running this tool on the files you have indicated, you will temporarily require " + str(videoSize) + " Mb available space on your hard drive to run this program. Continue? (y/n) ")
    answer = verify_y_n(answer)
    if answer == "n":
        print "Please make sure you have the available space on your hard drive, and then restart the program."
        print "exiting application..."
        time.sleep(2)
        exit()
    print "\n"
elif combineSubtitles == True:
    #in this case, the user has chosen to only combine subtitles. the switch combine_only allows some different functionality down the road
    # Note: originalVideo and videoSize are not defined on this path.
    combine_only = True
    fileName = raw_input("In order to accurately combine subtitle files, you will need to create a list of timestamps demarcating the length of each video to which your subtitle files are associated. These values will be used as offsets for accurately combining your subtitle files.\nEach timestamp should be written as follows [HH:MM:SS.00], followed by a newline.\n\nPlease enter the file name of your timestamp list (excluding the \".txt\" extention): ")
else:
    print "You have not chosen any options for running this application. Exiting..."
    exit()
# Ask for the language code until a non-empty code has been entered AND
# confirmed. A blank confirmation counts as "yes"; anything else (including
# 'n') asks for the language code again.
while True:
    language = raw_input("Enter the language code of your video and transcript or the intended language code of your subtitles (e.g. en, fr, es, etc.):\n(You can refer to the second column in http://www.loc.gov/standards/iso639-2/php/code_list.php for the appropriate two-letter 'ISO 639-1' language code.)\n").strip()
    if language != '':
        verifyLanguage = raw_input("\nYou have entered '" + language + "' as the language code for your transcript and video files. Youtube will use this code for processing your files. Continue? (y/n) ")
        # The previous test, `... == '' or 'y'`, was always true because the
        # bare string 'y' is truthy, so the user could never decline.
        if verifyLanguage.lower().strip() in ('', 'y'):
            break
#if combineSubtitles == True:
# Optional speaker labels as they appear in the transcript; either answer may
# be left blank. How the names are used is not visible in this part of the file.
print "\n\n"
print "\n6.3 If your transcript has speaker names (e.g. the interviewer or interviewee's names) that precede their discourse (e.g. \"Emmanuel: Hi, I'd like to ask you a few questions...\"), please input them. If this does not apply to your transcript, simply leave the following two answers blank by pressing the 'Enter' key."
time.sleep(1)
interviewer = raw_input("\n6.3.1 Please input your interviewer's name as it appears in the transcript: ")
interviewee = raw_input("\n6.3.2 Please input your interviewee's name as it appears in the transcript: ")
print "\n"
#____________#
# let rodolphe know if there is a problem with playlist id, might need to create a playlist in youtube online and copy url id to script
#playlistID = "PLSbFnWujSxCZxm7tYAGNeG9l5s19m4T65"
#language = 'fr'
#change these variables according to what story you want to process - ES
#interviewer = "C.V."
#interviewee = "V.S."
#where the video and txt files are stored
#folderName = 'venant'
#fileName refers to the name of the input .txt file (excluding .txt)
#fileName = 'venant'
#originalVideo refers to the name of the video file including its ext
#originalVideo = "venant.mp4"
#interviewer = "E.H."
#interviewee = "E.M."
#fileName = 'Frederic'
#originalVideo = "Frederic.mov"
#interviewer = "M.M."
#interviewee = "B.K."
#fileName = 'Berthe'
#originalVideo = "DD2FD4AE-FEE4-4DF3-9AF7-A4D6BF453B49.flv"
#interviewer = "S.G."
#interviewee = "O.G."
#folderName = 'oscar'
#fileName = 'oscar'
#originalVideo = "Oscar.mp4"
### START BOILERPLATE CODE
# Sample Python code for user authorization
import httplib2
import os
import sys
import httplib
import random
from apiclient.discovery import build
from apiclient.errors import HttpError
from apiclient.http import MediaFileUpload
from oauth2client.client import flow_from_clientsecrets
from oauth2client.file import Storage
from oauth2client.tools import argparser, run_flow
# The CLIENT_SECRETS_FILE variable specifies the name of a file that contains
# the OAuth 2.0 information for this application, including its client_id and
# client_secret.
"""
To create a client secrets file:
Google APIs dashboard --> create a new project
on the resulting dashboard, choose "enable APIs and get credentials like keys"
search for the YouTube API
click "YouTube Data API v3" and ENABLE it
click "create credentials"
create an "OAuth client ID"
"""
#CLIENT_SECRETS_FILE = "client_secret.json"
#api key: (redacted) - never commit API keys to source control; revoke the key that was published here and keep the new one in a local, untracked file
#client id is in client_id.json
CLIENT_SECRETS_FILE = "client_id.json"
# This OAuth 2.0 access scope allows for full read/write access to the
# authenticated user's account and requires requests to use an SSL connection.
YOUTUBE_READ_WRITE_SSL_SCOPE = "https://www.googleapis.com/auth/youtube.force-ssl"
API_SERVICE_NAME = "youtube"
API_VERSION = "v3"
# This variable defines a message to display if the CLIENT_SECRETS_FILE is
# missing.
MISSING_CLIENT_SECRETS_MESSAGE = "WARNING: Please configure OAuth 2.0"
# Authorize the request and store authorization credentials.
def get_authenticated_service(args):
    """Build and return an authorised YouTube API client.

    The OAuth flow is prepared from CLIENT_SECRETS_FILE. Credentials are read
    from the local "youtube-api-snippets-oauth2.json" store; when none are
    stored, or the stored ones are invalid, run_flow is invoked with `args`
    to obtain new ones.
    """
    oauth_flow = flow_from_clientsecrets(
        CLIENT_SECRETS_FILE,
        scope=YOUTUBE_READ_WRITE_SSL_SCOPE,
        message=MISSING_CLIENT_SECRETS_MESSAGE)
    token_store = Storage("youtube-api-snippets-oauth2.json")
    creds = token_store.get()
    needs_login = creds is None or creds.invalid
    if needs_login:
        creds = run_flow(oauth_flow, token_store, args)
    authorised_http = creds.authorize(httplib2.Http())
    return build(API_SERVICE_NAME, API_VERSION, http=authorised_http)
# Explicitly tell the underlying HTTP transport library not to retry, since
# we are handling retry logic ourselves.
httplib2.RETRIES = 1
# Maximum number of times to retry before giving up.
MAX_RETRIES = 10
# Always retry when these exceptions are raised.
RETRIABLE_EXCEPTIONS = (httplib2.HttpLib2Error, IOError, httplib.NotConnected,
httplib.IncompleteRead, httplib.ImproperConnectionState,
httplib.CannotSendRequest, httplib.CannotSendHeader,
httplib.ResponseNotReady, httplib.BadStatusLine)
# Always retry when an apiclient.errors.HttpError with one of these status
# codes is raised.
RETRIABLE_STATUS_CODES = [500, 502, 503, 504]
# This method implements an exponential backoff strategy to resume a
# failed upload.
def resumable_upload(request, resource, method):
    """Drive a resumable upload to completion, backing off on retriable errors.

    Args:
        request: object whose next_chunk() returns a (status, response) pair;
            response stays None until the upload has finished.
        resource: not used by this function; kept for interface compatibility.
        method: name of the API method; only 'insert' triggers the
            "successfully uploaded" message.

    Returns:
        The 'id' entry of the final response, or None when the response
        carries no id.

    Raises:
        HttpError: for HTTP statuses outside RETRIABLE_STATUS_CODES.
        SystemExit: once more than MAX_RETRIES retriable failures occurred.
    """
    response = None
    retry = 0
    while response is None:
        # Reset on every attempt: a failure from an earlier attempt must not
        # trigger another back-off once a later attempt has succeeded.
        error = None
        try:
            print("Uploading file...")
            _status, response = request.next_chunk()
            if response is not None:
                if method == 'insert' and 'id' in response:
                    print("Video id '%s' was successfully uploaded." % response['id'])
                else:
                    print(response)
        except HttpError as e:
            if e.resp.status in RETRIABLE_STATUS_CODES:
                error = "A retriable HTTP error %d occurred:\n%s" % (e.resp.status, e.content)
            else:
                raise
        except RETRIABLE_EXCEPTIONS as e:
            error = "A retriable error occurred: %s" % e
        if error is not None:
            print(error)
            retry += 1
            if retry > MAX_RETRIES:
                sys.exit("No longer attempting to retry.")
            # Exponential back-off with jitter: wait a random time in
            # [0, 2**retry) seconds before the next attempt.
            max_sleep = 2 ** retry
            sleep_seconds = random.random() * max_sleep
            print("Sleeping %f seconds and then retrying..." % sleep_seconds)
            time.sleep(sleep_seconds)
    # .get() so that responses without an id yield None instead of KeyError.
    return response.get('id')
# Every step that talks to the YouTube API needs the authenticated client.
# uploadVideos was missing from this list, so a run that only uploaded videos
# reached videos_insert() with `service` undefined.
if uploadVideos or uploadTranscripts or resumeUploads or downloadCaptions or deleteVideos:
    args = argparser.parse_args()
    service = get_authenticated_service(args)
def print_results(results):
    """Print `results` to standard output."""
    print(results)
# Build a resource based on a list of properties given as key-value pairs.
# Leave properties with empty values out of the inserted resource.
def build_resource(properties):
    """Turn a flat {"a.b.c": value} mapping into a nested resource dict.

    Each key is split on '.'; every part but the last names a nested dict
    (created on first use) and the last part names the property to set.
    A part ending in "[]" has that suffix removed and, when it is the last
    part, its value is split on ',' into a list. Properties whose value is
    empty/falsy are left out, although the dicts leading to them are still
    created.
    """
    resource = {}
    for dotted_name in properties:
        parts = dotted_name.split('.')
        last_index = len(parts) - 1
        node = resource
        for depth, key in enumerate(parts):
            as_list = key.endswith('[]')
            if as_list:
                key = key[:-2]
            if depth != last_index:
                # Descend, creating the intermediate object when missing.
                node = node.setdefault(key, {})
                continue
            value = properties[dotted_name]
            if value:
                node[key] = value.split(',') if as_list else value
    return resource
# Remove keyword arguments that are not set
def remove_empty_kwargs(**kwargs):
    """Return a dict of the keyword arguments whose value is truthy.

    Arguments that are None, '', 0, False or an empty container are dropped.
    Uses .items() rather than the Python-2-only .iteritems(), so the function
    behaves the same on Python 2 and Python 3.
    """
    return {key: value for key, value in kwargs.items() if value}
### END BOILERPLATE CODE
# Sample python code for videos.insert
def videos_insert(properties, media_file, **kwargs):
    """Upload `media_file` through the API's videos().insert and return its id.

    `properties` is a flat "a.b" -> value mapping that build_resource turns
    into the request body; keyword arguments with empty values are dropped
    before being passed on. The upload is sent as a single resumable chunk
    (chunksize=-1) and driven by resumable_upload, whose return value is
    returned unchanged.
    """
    body = build_resource(properties)
    params = remove_empty_kwargs(**kwargs)
    insert = service.videos().insert
    upload = MediaFileUpload(media_file, chunksize=-1, resumable=True)
    request = insert(body=body, media_body=upload, **params)
    return resumable_upload(request, 'video', 'insert')
def hms_to_s(time):
    """Convert an "H:M:S --> H:M:S" cue range into [start, end] seconds.

    Args:
        time: the cue range, as a UTF-8 byte string or as already-decoded
            text. Previously decoded text raised TypeError because it was
            decoded unconditionally.

    Returns:
        A two-item list of floats. The seconds field is truncated to whole
        seconds (its fractional part is discarded), as before.
    """
    cue = time.decode("UTF-8") if isinstance(time, bytes) else time
    ends = cue.split(" --> ")

    def _to_seconds(stamp):
        # Same arithmetic as the original: only the hours term is a float,
        # minutes and seconds are truncated to integers.
        fields = stamp.split(":")
        return float(int(fields[0]) * 3600) + int(float(fields[1]) * 60) + int(float(fields[2]))

    return [_to_seconds(ends[0]), _to_seconds(ends[1])]
def s_to_hms(seconds):
    """Format a number of seconds as "H:M:S".

    Each field is truncated to an integer and written without zero padding,
    e.g. 3661 -> "1:1:1".
    """
    minutes, sec = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return ":".join(str(int(field)) for field in (hours, minutes, sec))
#ES: load <folderName>/<fileName>.txt as a list of lines
# The file used to be read twice, with the first result immediately
# overwritten; it is now read once.
try:
    with open(folderName + "/" + fileName + ".txt") as f:
        text = f.readlines()
except IOError as e:
    print("No text file found because you are not running the entire pipeline. Creating dummy file 'delete me.txt' to finish pipeline.")
    # Keep the original side effect: leave an empty placeholder file behind.
    # Reading that empty file back always produced an empty list of lines.
    with open(folderName + "/" + "delete me.txt", "w+"):
        pass
    text = []
#ES: strip leading/trailing whitespace (including the newline) from each line
text = [x.strip() for x in text]
#list of [start, end] pairs in seconds, one per timestamp found
splits = []
#list of cut-up texts
texts = [""]
#end time of the previous snippet, i.e. start time of the next one
t0 = 0
#index of the text chunk currently being filled
c = 0
#ES: several print commands were added for guidance. they can be removed.
#ES: a list of the transcript's timestamps
t_list = []
#ES: PREPARE INPUT TEXT FOR PROCESSING
# Walk the transcript line by line. Non-timestamp lines are appended to the
# current chunk texts[c]; each timestamp line closes that chunk, writes it to
# "<folderName>/<fileName>_<c>.txt" and records a [start, end] pair (seconds)
# in `splits` plus the parsed time in `t_list`.
# NOTE(review): indentation was reconstructed from the statement order because
# the original whitespace was lost - confirm the nesting below.
if snipTranscript == True:
    for t in text:
        # put back the newline that was stripped when the file was loaded
        t += "\n"
        #ES: a line counts as a timestamp when only digits remain once '[', ']', ':', '.' and the newline are removed
        #ES: any other non-blank line is transcript text
        if not t.replace('[','').replace(']','').replace(':','').replace('.','').replace('\n','').isdigit() and t != "\n":
            #ES: add t to position c of texts
            texts[c] += t#.encode('utf8')
            #ES: this will aggregate phrases (t) into one list item (a text) until a timestamp is reached
        #ES: if t is a timestamp (digits only after removing the punctuation, and it contains '[')
        if t != "" and t.replace('[','').replace(']','').replace(':','').replace('.','').replace('\n','').isdigit() and "[" in t:
            #increase pos on texts by 1
            c += 1
            with open(folderName + "/" + fileName + "_" + str(c) + ".txt", 'w') as thefile:
                try:
                    #ES: write the previous position of c in texts (a chunk of text prior to timestamp) to thefile
                    thefile.write("%s\n" % texts[c-1])
                    # start a new, empty chunk for the text that follows
                    texts.append("")
                    texts[c] = ""
                    t = t.replace('[','').replace(']','').replace('\n','')
                    t = unicode(t, "UTF-8")
                    #split the timestamps at : (into 3)
                    t = t.split(":")
                    if len(t) > 3 or len(t) < 3:
                        print "\nOne of your timestamps (",':'.join(t) ,") isn't formatted correctly. Consult README.md for guidelines on proper timestamp formatting."
                        print "\nexiting application..."
                        time.sleep(2)
                        exit()
                    # NOTE(review): this two-field (MM:SS) branch cannot run,
                    # because any length other than 3 exits just above.
                    if len(t) == 2:
                        if combine_only == True:
                            t1 = int(t[0])*60 + int(t[1])
                            splits.append([t0,t0+t1])
                            t_list.append(t1)
                            t0 = t0 + t1
                        else:
                            t1 = int(t[0])*60 + int(t[1])
                            splits.append([t0,t1])
                            t_list.append(t1)
                            t0 = t1
                    elif len(t) == 3:
                        #if we are only combining subtitle files, and we are using a .txt file with a list of video lengths, then we need to make this into a list of cumulative times so that the rest of the pipeline can run
                        if combine_only == True:
                            # timestamps are durations here: accumulate them
                            t1 = int(t[0])*3600 + int(t[1])*60 + float(t[2])
                            splits.append([t0,t0+t1])
                            t_list.append(t1)
                            t0 = t0 + t1
                        else:
                            # timestamps are absolute positions in the video
                            t1 = int(t[0])*3600 + int(t[1])*60 + float(t[2])
                            splits.append([t0,t1])
                            t_list.append(t1)
                            t0 = t1
                except ValueError as e:
                    # A field that cannot be parsed as a number is reported,
                    # but processing continues with the next line.
                    print e
                    print "\n One of your timestamps isn't formatted correctly. Consult README.md for guidelines on proper timestamp formatting."
    print "\nVerifying if timestamps are in ascending order..."
    # sp1 holds the previous [start, end] pair; it is only indexed once num > 0
    sp1 = 0
    num = 0
    for sp in splits:
        if num > 0:
            # each snippet must end strictly later than the one before it
            if sp[1] <= sp1[1]:
                print "\nThere is a problem with one of your timestamps:"
                print "Timestamp number #",str(num+2)," (equivalent to ",str(sp[1])," seconds) should be a larger number than the timestamp that comes before it (",str(sp1[1])," seconds), but it is smaller."
                print "Please make sure your timestamps are in ascending order and that there are no mistakes (see README.md) and restart the program."
                exit()
        sp1 = sp
        num+=1
    print "\nThe document named '" + fileName + ".txt' was cut into " + str(len(splits)) + " text snippets based on it containing " + str(len(splits)) + " timestamps formatted like such '[HH:MM:SS.00]'."
else:
    print "Please set the variable 'snipTranscript' to True so that the code can properly run."
    exit()
#ES print texts[c]
#print "splits: " + str(splits)
#for i in splits:
# print s_to_hms(i[0]),"->",s_to_hms(i[1])
#time.sleep(60)
#print splits,splits[len(splits)-1][1]
#splits.append([splits[len(splits)-1][1],7200])
#print splits
#print "Wait"
#time.sleep(30)
c = 0
#print splits
videoids = []
#videoids = [u'jDAZHgL-nG4', u'cMNTnd8pApk', u's5hLO6T_BhY', u'gOAoCh5Mecc', u'p0PX5s6k5DU', u'hSmPkLqOt0M', u'2Ik7_biRs9g', u'G64A_hpNWfI', u'ZzVVEcGekv0', u'ZxKJhN3JFfI', u'TsDnqWmpvrw', u'Kvem1XnPHF0', u'VwqhkmbiLh0', u'V1sv1MYLdC0']
#videoids = [u'cj62vgUfnik', u'5k9WCcWCLiU', u'MexTd0EGfRc', u'hWY_30yHOec', u'GrMtKARI9kQ', u'YDHnQAE7U0w', u'yc4IXkGHuXs', u'ZauR51lBjQo', u'kisoEOTjmVI', u'V9XdpjtUU4Q', u'eOdKfhePfTs', u'AAQ9YuybUxM', u'3BaTzSSL4_c', u'OriOoB5yF0s', u'91qOFKithgE', u'WQJQkGEwG-Q', u'n4eW0T6Oek0', u'2dRf-EbKYHA', u'RUgi4NfoPEw', u'n40bGD_9eZI', u'OWWAQTGKyMI', u'8a2De6Gzfek', u'VQJgxR3iAoA', u'UEzrAMq6fGc', u'PXCHMF-Z7X4', u'SU_Rbp9V_Zo', u'VLhSxDh9gI0', u'80rY1RlbVQw', u'1yumt5fRBF4', u'u5qAHXhhJoo', u'G3gO6DW-wrM', u'qAU_8DNEqP8', u'fbGaOVHXkvY', u'_Knl1rP8Z9w', u'O6f8ZWjSgiw', u'uXY-00DuLjY', u'WpreZ_gbEyw']
#with open(folderName + "/" + 'videoids.pkl', 'wb') as f:
# pickle.dump(videoids, f)
# When resuming, reload the ids of the snippets uploaded by a previous run
# from <folderName>/videoids.pkl.
# NOTE: pickle.load executes whatever the file contains; only load a
# videoids.pkl that this tool wrote itself.
if resumeUploads == True:
    print "\nResuming video uploads...\n"
    time.sleep(1)
    try:
        with open(folderName + "/" + 'videoids.pkl', 'rb') as f:videoids = pickle.load(f)
    except Exception as e:
        print e
        print "\nThe program is unable to resume uploads because there are no uploads to resume or your 'videoids.pkl' file has gone missing. The program will restart by uploading all videos. You may need to remove any previously-uploaded videos if the videos you are uploading are identical. If so, do this manually on youtube.com and then restart the program."
        # NOTE(review): placed inside the except branch to match the message
        # above ("will restart by uploading all videos"); the original
        # indentation was lost - confirm.
        uploadVideos = True
# set to True further down once something was uploaded that needs processing time
wait = False
def yes_or_no(question):
    """Ask `question` on the console until it is answered with yes or no.

    Returns True when the reply starts with 'y' or is blank (Enter means
    "yes"). A reply starting with 'n' terminates the program. Anything else
    repeats the question.

    A blank reply used to raise IndexError because the first character was
    read with reply[0]; slicing with reply[:1] is safe on an empty string.
    """
    while True:
        reply = str(raw_input(question + ' (y/n): ')).lower().strip()
        first = reply[:1]
        if first in ('y', ''):
            return True
        if first == 'n':
            sys.exit()
# Snip-only path: videos are cut locally but not uploaded.
# NOTE(review): indentation reconstructed; the original whitespace was lost.
if uploadVideos == False and snipVideos == True:
    #ES: warn the user as to how many video snippets will be created before starting.
    question = "\nThere were " + str(len(splits)) + " timestamps detected in " + fileName + ". " + str(len(splits)) + " video snippets will created. Continue?"
    print "\nIf all input was correct, the program will begin snipping"
    yes_or_no(question)
    print "\n1. Slicing into " + str(len(splits)) + " parts"
    time.sleep(1)
    for s in splits:
        c += 1
        # snippets whose number is covered by an already-known video id are skipped
        if c > len(videoids):
            # cut [s[0], s[1]] (seconds) out of the original video into "<fileName>_<c>.mp4"
            ffmpeg_extract_subclip(folderName + "/" + originalVideo, s[0], s[1], targetname=folderName + "/" + fileName + "_" + str(c) +".mp4")
            media_file = folderName + '/' + fileName + "_" + str(c) + ".mp4"
            if not os.path.exists(media_file):
                exit('Please specify a valid file location.')
    # NOTE(review): the program exits here even when later steps (e.g.
    # downloadCaptions) were switched on - confirm this is intended.
    print "\nSnipping completed. No further options were selected. Exiting..."
    exit()
#ES: UPLOADS THE VIDEOS
if uploadVideos == True:
#ES: the following is called when videos are being uploaded (uploadVideos = True) to warn the user as to how many videos will be uploaded.
question = "\nThere were " + str(len(splits)) + " timestamps detected in " + fileName + ". " + str(len(splits)) + " video snippets will therefore be uploaded to YouTube for processing. YouTube allows a maximum of 100 video uploads per 24h using the current API credentials. Continue?"
print "\nIf all input was correct, the program will begin snipping and uploading content to Youtube for processing. This may take between 20 minutes and several hours, depending on the size of your video file (" + str(videoSize) + " Mb)."
yes_or_no(question)
print "\n1. Slicing into " + str(len(splits)) + " parts & uploading videos..."
time.sleep(1)
if len(videoids) > 0:
print "(However, it looks like ",len(videoids)," video snippets were already uploaded to Youtube. Now trying to resume uploading the remaining snippets...)"
time.sleep(1)
for s in splits:
c += 1
if c > len(videoids):
ffmpeg_extract_subclip(folderName + "/" + originalVideo, s[0], s[1], targetname=folderName + "/" + fileName + "_" + str(c) +".mp4")
media_file = folderName + '/' + fileName + "_" + str(c) + ".mp4"
if not os.path.exists(media_file):
exit('Please specify a valid file location.')
vid = videos_insert(
{'snippet.categoryId': '22',
'snippet.defaultLanguage': language,
'snippet.defaultAudioLanguage': language,
'snippet.description': 'Description of uploaded video.',
'snippet.tags[]': '',
'snippet.title': media_file,
'status.embeddable': '',
'status.license': '',
'status.privacyStatus': 'unlisted',
'status.publicStatsViewable': ''},
media_file,
part='snippet,status')
videoids.append(vid)
print videoids
#c += 1
wait = True
with open(folderName + "/" + 'videoids.pkl', 'wb') as f:
pickle.dump(videoids, f)
else:
if resumeUploads == True or deleteVideos == True or uploadTranscripts == True:
with open(folderName + "/" + 'videoids.pkl', 'rb') as f:
videoids = pickle.load(f)
print "\nThe video IDs are composed of the following: " + str(videoids)
#print videoids
# Write the current `videoids` list to <folderName>/videoids.pkl whenever
# uploads are being resumed, videos are to be deleted, or transcripts are to
# be uploaded.
# NOTE(review): indentation is missing in this copy of the file, so it cannot
# be told whether this `if` is top-level or belongs to the `else:` branch just
# above it -- confirm against the original layout.
if resumeUploads == True or deleteVideos == True or uploadTranscripts == True:
with open(folderName + "/" + 'videoids.pkl', 'wb') as f:
pickle.dump(videoids, f)
# `wait` is set to True by the upload loop above once a snippet has been
# uploaded. In that case announce the pause and sleep for `sleepingTime`
# seconds; the message reports the same duration as sleepingTime/60 minutes.
if wait == True:
print "\nWaiting for videos to be processed. It is",strftime("%H:%M:%S", localtime()),". Script will resume in " + str(sleepingTime/60) + " minutes..."
time.sleep(sleepingTime)
#search_response = service.search().list(
# q="Anita",
# part="id",
# type="video",
# fields="items/id"
#).execute()
#
#videos = []
#
#for search_result in search_response.get("items", []):
# videos.append("%s" % (search_result["id"]["videoId"]))
#
#print "Videos:\n", "\n".join(videos), "\n"
#ES: I don't think this function is ever called...
# Call the API's captions.insert method to upload a caption track in draft status.
def upload_caption(youtube, video_id, language, name, file):
    """Upload a caption track in draft status and return its ID.

    Calls ``youtube.captions().insert(...)`` with ``part="snippet"`` and a
    snippet carrying the video ID, language, track name and ``isDraft=True``,
    then executes the request.

    Args:
        youtube: API client object exposing ``captions().insert(...)``.
        video_id: ID of the video the caption track is attached to.
        language: Language value placed in the snippet.
        name: Name given to the caption track.
        file: Value passed through unchanged as ``media_body``
            (presumably the path of the caption file -- confirm with callers).

    Returns:
        The ``"id"`` field of the API response.  Earlier versions read this
        field (plus name, language and status) into unused locals and
        returned ``None``; callers that ignored the result are unaffected.

    Raises:
        KeyError: If the API response carries no ``"id"`` field.
    """
    insert_result = youtube.captions().insert(
        part="snippet",
        body={
            "snippet": {
                "videoId": video_id,
                "language": language,
                "name": name,
                "isDraft": True,
            },
        },
        media_body=file,
    ).execute()
    # Only the track ID is useful to a caller; the other response fields were
    # previously copied into locals that shadowed `id` and the parameters.
    return insert_result["id"]
# Transcript stage.
# `c` is a 1-based counter used to build the per-snippet file names and to
# index `videoids` (as videoids[c-1]).
c = 1
# Caption-track IDs collected from the API responses, in upload order.
captionsids = []
# Reset here and set to True below; the only later check of it
# (`#if wait == True:`) is commented out.
wait = False
# When transcript upload is enabled, each entry of `splits` gets
# <folderName>/<fileName>_<c>.txt inserted as a draft caption track on video
# videoids[c-1]; the returned IDs are pickled to <folderName>/captionsids.pkl
# and the script then sleeps for 2 * sleepingTime seconds.
# NOTE(review): indentation is missing in this copy of the file, so it cannot
# be told whether `wait = True` and the pickle dump run on every iteration or
# once after the loop -- confirm against the original layout.
if uploadTranscripts == True:
#print splits,videoids
#uploads transcripts
print "\nUploading transcripts..."
for s in splits:
print c,s
# Used only as the caption track's `name` below. The upload step earlier in
# the file writes snippets as ".mp4", so this ".flv" string is a label, not a
# path that is opened here.
media_file = folderName + '/' + fileName + "_" + str(c) + ".flv"
# Transcript text file handed to the API as `media_body`.
caption_file = folderName + '/' + fileName + "_" + str(c) + ".txt"
#print s,media_file,caption_file,videoids[c-1]
a = service.captions().insert(
part="snippet",
body=dict(
snippet=dict(
# Indexing raises IndexError if `videoids` has fewer entries than `splits`.
videoId=videoids[c-1],
language=language,
name=media_file,
isDraft=True,
# NOTE(review): presumably asks YouTube to time the plain transcript against
# the audio -- confirm against the captions API reference.
sync=True
)
),
media_body=caption_file
).execute()
captionsids.append(a['id'])
c += 1
#print a
wait = True
with open(folderName + "/" + 'captionsids.pkl', 'wb') as f:
pickle.dump(captionsids, f)
# The pause is twice the one used after video uploads; the message reports it
# as 2 * sleepingTime / 60 minutes.
print "Waiting for transcripts to be processed into captions. It is",strftime("%H:%M:%S", localtime()),". Script will resume in " + str(2 * sleepingTime / 60) + " minutes..."
time.sleep(2 * sleepingTime)
else:
# No upload in this run: if captions are to be downloaded, reload the
# caption IDs from <folderName>/captionsids.pkl.
if downloadCaptions == True:
with open(folderName + "/" + 'captionsids.pkl', 'rb') as f:
captionsids = pickle.load(f)
#if wait == True:
# Download stage: for every entry of `splits`, fetch caption track
# captionsids[c-1] in 'vtt' format and write it to
# <folderName>/<fileName>_<cc>.vtt, where `cc` is the 1-based counter `c`
# with a leading zero when c < 10.
# NOTE(review): indentation is missing in this copy of the file, so it cannot
# be told whether time.sleep(3) runs after every download or once after the
# loop -- confirm against the original layout.
if downloadCaptions == True:
print "\nDownloading captions..."
c = 1
# Only referenced by the commented-out retry loop below.
waitLonger = True
for s in splits:
# Indexing raises IndexError if `captionsids` has fewer entries than `splits`.
print c,s,captionsids[c-1]
# Accumulator whose only consumer is the commented-out write further down.
sub_txt = ""
# while waitLonger == True:
# try:
subtitle = service.captions().download(id=captionsids[c-1],tfmt='vtt').execute()
# waitLonger = False
# except:
# waitLonger = True
# print "Waiting for transcripts " + str(c) + " " + captionsids[c-1] + " to be processed into captions. It is",strftime("%H:%M:%S", localtime()),". Script will resume in " + str(2) + " minutes..."
# time.sleep(120)
sub_txt += subtitle
# Two-character counter for the output file name ("01" ... "09", then "10", ...).
cc = ""
if c < 10:
cc = "0" + str(c)
else:
cc = str(c)
#print subtitle
print cc
with open(folderName + "/" + fileName + "_" + str(cc) + ".vtt", 'w') as thefile:
#thefile.write(sub_txt)
thefile.write(subtitle)
# NOTE(review): prints the downloaded text for snippet 31 only; this looks
# like leftover debugging output -- confirm before relying on it.
if cc == "31":
print subtitle
c += 1
time.sleep(3)
#deletes videos from youtube -ES
# For every entry of `splits`, print the 1-based counter and the matching
# video ID, then issue a videos().delete request for videoids[c-1].
# Indexing raises IndexError if `videoids` has fewer entries than `splits`.
# NOTE(review): indentation is missing in this copy of the file, so it cannot
# be told whether time.sleep(10) runs after every deletion or once after the
# loop -- confirm against the original layout.
if deleteVideos == True:
print "\nDeleting videos...\n"
c = 1
for s in splits:
print c,videoids[c-1]
service.videos().delete(
id=videoids[c-1]
).execute()
c += 1
time.sleep(10)
# Combine stage: hand the folder, the base file name and the subtitle options
# to compileSubs() (defined elsewhere) and keep its return value in
# `compiledSubs`.
# NOTE(review): `compiledSubs` is only referenced by the commented-out write
# below, while the later upload step reads <folderName>/<fileName>.srt from
# disk; presumably compileSubs() writes that file itself -- confirm.
# NOTE(review): indentation is missing in this copy of the file, so it cannot
# be told whether time.sleep(10) is part of this `if` -- confirm.
if combineSubtitles == True:
#compiles them all
print "\nCombining subtitle snippets ..."
#ES: this is a feature that needs exploration so as to make sure that place names are never split between 2 timestamps, at the least.
#place-based time stamping can be set to True or False (make a variable for this)
# The literal False is passed positionally between `interviewee` and
# `language`; its meaning is defined by compileSubs' signature, which is not
# visible here.
compiledSubs = compileSubs(folderName,fileName,[['_high-frequency-timestamps',0,placeBasedTimestamping]],t_list,interviewer,interviewee,False,language,resampleSubtitles,fullSentenceSubtitles,removeLoneWords)
time.sleep(10)
#thefile = open(folderName + "/" + fileName + ".srt", 'w')
#thefile.write(compiledSubs)
# Final stage: upload the complete video as an unlisted item, add it to a
# playlist, wait for processing, then attach the compiled .srt file as a
# draft caption track.
# The parenthesised single-argument prints below produce the same output as
# the print statements used elsewhere in this file.
if uploadFull == True:
    print("\nUploading full video...")
    full_video_properties = {
        'snippet.categoryId': '22',
        'snippet.defaultLanguage': language,
        'snippet.description': 'Description of uploaded video.',
        'snippet.tags[]': '',
        'snippet.title': fileName,
        'status.embeddable': '',
        'status.license': '',
        'status.privacyStatus': 'unlisted',
        'status.publicStatsViewable': '',
    }
    vid = videos_insert(
        full_video_properties,
        folderName + "/" + originalVideo,
        part='snippet,status')

    # place video in custom playlist
    def playlist_items_insert(properties, **kwargs):
        """Insert a playlist item built from `properties` and print the API response.

        `properties` is turned into the request body by build_resource();
        `kwargs` are filtered through remove_empty_kwargs() before being
        passed to playlistItems().insert().  Both helpers, and
        print_results(), are defined elsewhere in this file.
        """
        item_body = build_resource(properties)
        api_kwargs = remove_empty_kwargs(**kwargs)
        response = service.playlistItems().insert(
            body=item_body,
            **api_kwargs
        ).execute()
        print_results(response)

    # NOTE(review): the target playlist is not sent with the request -- the
    # line "'snippet.playlistId': playlistID," was commented out in the
    # original.  Confirm that the API accepts an item without it.
    playlist_item_properties = {
        'snippet.resourceId.kind': 'youtube#video',
        'snippet.resourceId.videoId': vid,
        'snippet.position': '',
    }
    playlist_items_insert(
        playlist_item_properties,
        part='snippet',
        onBehalfOfContentOwner='')

    # Announce the pause, then sleep for `sleepingTime` seconds.
    current_time = strftime("%H:%M:%S", localtime())
    print("Waiting for full video to be processed. It is " + current_time
          + " . Script will resume in " + str(sleepingTime/60) + " minutes...")
    time.sleep(sleepingTime)

    # Kept as a module-level name exactly as before (it shadows the builtin).
    id = vid
    print("\nUploading compiled subtitles...")
    caption_file = folderName + '/' + fileName + ".srt"
    caption_snippet = {
        'videoId': id,
        'language': language,
        'name': originalVideo,
        'isDraft': True,
        'sync': False,
    }
    service.captions().insert(
        part="snippet",
        body={'snippet': caption_snippet},
        media_body=caption_file
    ).execute()
    print("\nFull video is soon available on your Youtube channel for you to check and adjust captions.")