-
Notifications
You must be signed in to change notification settings - Fork 1
/
vocal_extractor.py
146 lines (119 loc) · 5.81 KB
/
vocal_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
from pytube import YouTube
import subprocess
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import timedelta
from moviepy.video.io.VideoFileClip import VideoFileClip
from subprocess import Popen, PIPE, STDOUT
def download_video(source):
    """
    Download the YouTube video behind ``source`` into the current directory.

    The first stream that pytube lists for the video is fetched and saved
    under the base name ``video``.

    Args:
        source(str): YouTube link of the video to download.
    """
    # NOTE(review): the rest of this module reads "video.mp4"; whether pytube
    # appends the ".mp4" extension to filename='video' depends on the pytube
    # version in use - confirm against the installed release.
    first_stream = YouTube(source).streams.first()
    first_stream.download(filename='video')
def video_attrs():
    """
    Measure video.mp4 and work out how it has to be split for Spleeter.

    According to the original author, the Spleeter library can only process
    audio up to 10 minutes, so the function calculates the duration of the
    video and the number of 10 minute parts to be processed separately later.

    Returns:
        nr_of_parts (int): number of 10 minute parts of the video, at least 1.
            A duration that is an exact multiple of 10 minutes does not
            produce an additional zero-length part.
        duration_of_video (str): the duration truncated to whole seconds,
            formatted as 'MM:SS' for videos shorter than one hour and as
            'HH:MM:SS' otherwise, for use in a command line call.
    """
    clip = VideoFileClip("video.mp4")
    try:
        seconds = int(clip.duration)
    finally:
        # Close the clip so its underlying reader is not left open.
        clip.close()

    # Build the timestamp from integer arithmetic instead of slicing
    # str(timedelta), which breaks once the duration reaches 24 hours
    # ("1 day, 0:00:00").
    hours, remainder = divmod(seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    if hours:
        duration_of_video = '{:02d}:{:02d}:{:02d}'.format(hours, minutes, secs)
    else:
        duration_of_video = '{:02d}:{:02d}'.format(minutes, secs)

    # Ceiling division: 600 s is exactly one part, 601 s needs two.
    part_length = 600
    nr_of_parts = max(1, (seconds + part_length - 1) // part_length)
    return nr_of_parts, duration_of_video
def cutting_points(nr_of_parts, duration_of_video):
    """
    Define the cutting points of the video for each 10 minute part.

    The three returned lists always have the same length (``nr_of_parts``),
    so they can be iterated in lockstep. Timestamps use 'MM:SS' when there
    are fewer than 6 parts (video shorter than one hour) and 'HH:MM:SS'
    otherwise. The last ending time is ``duration_of_video`` passed through
    unchanged.

    Args:
        nr_of_parts (int): # of 10 minute parts of the video
        duration_of_video (str): The duration of the video in mm:ss or
                                 hh:mm:ss format

    Returns:
        beginning(list): Beginning time of each 10 minute part
        end(list): Ending time of each 10 minute part
        parts(list): Filename of each 10 minute part.
        All three lists are empty when nr_of_parts is not positive.

    Examples:
        >>> nr_of_parts = 4
        >>> duration_of_video = '00:38:26'
        >>> cutting_points(nr_of_parts, duration_of_video)
        (['00:00', '10:00', '20:00', '30:00'], ['10:00', '20:00', '30:00', '00:38:26'], ['part_1.mp4', 'part_2.mp4', 'part_3.mp4', 'part_4.mp4'])
    """
    if nr_of_parts <= 0:
        return [], [], []

    # Six 10 minute parts fill an hour, so from 6 parts on an hour field is
    # needed in the timestamps.
    with_hours = nr_of_parts >= 6

    def stamp(part_index):
        # Timestamp of the boundary that precedes part number `part_index`.
        hours, minutes = divmod(part_index * 10, 60)
        if with_hours:
            return '{:02d}:{:02d}:00'.format(hours, minutes)
        return '{:02d}:00'.format(minutes)

    beginning = [stamp(i) for i in range(nr_of_parts)]
    # Every part ends where the next one begins; the last ends with the video.
    end = [stamp(i) for i in range(1, nr_of_parts)]
    end.append(duration_of_video)
    parts = ['part_' + str(i + 1) + '.mp4' for i in range(nr_of_parts)]
    return beginning, end, parts
def extract_vocals(link, filename='merged_video'):
    """
    Download a YouTube video and produce a copy that keeps only the vocals.

    After downloading the video and defining the cutting points, the function
    processes each 10 minute part of the video in turn.

    cmds (run per part):
        1) Creates a copy of the video by cutting it according to the defined
           beginning and end points.
        2) Converts the 10 minute part of the video to silent.
        3) Creates an audio file from the 10 minute part.
        4) Splits the audio file with Spleeter (2 stems) into vocals.wav and
           accompaniment.wav. vocals.wav includes only the human voice while
           all the other audio is stored in the accompaniment.wav file.
        5) Output is the silent video part with the vocals reunited.

    Lastly, the function combines the vocal-only parts into one video and
    saves it with the filename defined as the second argument. The stdout and
    stderr of every per-part command are printed; exit codes are not checked.

    Intermediate files (part_N.mp4, silent_video_N.mp4, audio_N.wav,
    myoutputN/, vocals_N_video.mp4, files.txt) are written to the current
    directory and are not removed.

    Args:
        link(str): YouTube link of the video to extract the vocals from.
        filename(str): Optional argument for the name of the vocal-only video.
                       Default is merged_video(.mp4)
    """
    download_video(link)
    nr_of_parts, duration_of_video = video_attrs()
    beginning, end, parts = cutting_points(nr_of_parts, duration_of_video)

    vocal_videos = []
    # zip() stops at the shortest list, so a part that lacks a beginning or
    # an end is skipped instead of raising IndexError.
    for number, (part_file, start, stop) in enumerate(zip(parts, beginning, end), start=1):
        silent_video = "silent_video_" + str(number) + ".mp4"
        audio_file = "audio_" + str(number) + ".wav"
        output_dir = "myoutput" + str(number)
        vocal_video = "vocals_" + str(number) + "_video.mp4"

        cmds = [
            ["ffmpeg", "-i", "video.mp4", "-ss", start, "-to", stop, "-c", "copy", part_file, '-y'],
            ["ffmpeg", "-i", part_file, "-codec", "copy", "-an", silent_video, '-y'],
            ["ffmpeg", "-i", part_file, "-vn", "-f", "wav", audio_file, '-y'],
            ["python", "-m", "spleeter", "separate", "-i", audio_file, "-p", "spleeter:2stems",
             "-o", output_dir],
            ["ffmpeg", "-i", silent_video, "-i",
             output_dir + "/audio_" + str(number) + "/vocals.wav",
             "-c:v", "copy", "-c:a", "aac", vocal_video, '-y'],
        ]
        for cmd in tqdm(cmds):
            p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            # Tool output is not guaranteed to be valid UTF-8; replace
            # undecodable bytes rather than aborting the whole run.
            print(p.stdout.decode("utf-8", errors="replace"))
            print(p.stderr.decode("utf-8", errors="replace"))
        vocal_videos.append(vocal_video)

    # List file consumed by ffmpeg's concat demuxer: one "file '<name>'" per line.
    with open('files.txt', 'w') as f:
        f.write('\n'.join("file '" + name + "'" for name in vocal_videos))

    cmd = ["ffmpeg", "-f", "concat", "-safe", "0", "-i", "files.txt", "-c", "copy", filename + ".mp4", '-y']
    subprocess.run(cmd, stderr=subprocess.STDOUT)
    print('Done!')
if __name__ == '__main__':
    # Script entry point: run the whole pipeline on a sample video and write
    # the vocal-only result to video_output.mp4.
    link = "https://www.youtube.com/watch?v=z0GtmPnqAd8"
    filename = "video_output"
    extract_vocals(link, filename=filename)