-
Notifications
You must be signed in to change notification settings - Fork 1
/
onset_detection.py
89 lines (74 loc) · 3.22 KB
/
onset_detection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#!/usr/bin/python3
import librosa
import librosa.display
import sys
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import find_peaks_cwt
# Fixed delay constant: 40 ms, expressed in seconds.
# NOTE(review): not referenced anywhere in the visible part of this file —
# confirm whether another module still imports it.
FIXED_DELAY = 0.04 # 40 ms in s
def main():
    """Run onset detection on one audio file, print the onset times and plot them.

    The audio path is taken from the command line when exactly one argument
    is supplied; otherwise the bundled test recording is used.
    """
    # If this ever needs to be more versatile, switch to the argparse library.
    fallback_path = "./static/test_data/kismet-tv_on_0-24/kismet-tv_on_0-24_without_effects.mp3"
    cli_args = sys.argv[1:]
    audio_path = cli_args[0] if len(cli_args) == 1 else fallback_path

    samples, rate = librosa.load(audio_path)
    detector = OnsetDetect(samples, rate)
    print(detector.get_times())
    detector.visualize()
class OnsetDetect(object):
    """Detect onsets in an audio signal and expose them as times/clips.

    Detection runs once, when the object is constructed, and the resulting
    onset times are cached on the instance.

    Right now this just uses the librosa implementation. In the future we
    can use this paper to implement our own, if we think it will gather
    better results:
    http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.67.5843&rep=rep1&type=pdf
    """

    def __init__(self, audio, sr):
        """
        Store the signal and immediately run onset detection on it.

        :param audio: audio samples to analyse (e.g. the first value
            returned by ``librosa.load``)
        :param sr: sample rate of ``audio`` in Hz
        """
        self.input_audio = audio
        self.sample_rate = sr
        self.od_result = self._run_detection()

    def _run_detection(self):
        """
        Perform the actual detection.

        Builds an onset-strength envelope (aggregated with ``np.mean``) and
        lets librosa pick onsets from it with backtracking enabled.

        :return: onset times in seconds, as returned by
            ``librosa.onset.onset_detect(units='time')``
        """
        onset_env = librosa.onset.onset_strength(y=self.input_audio,
                                                 sr=self.sample_rate,
                                                 aggregate=np.mean)
        return librosa.onset.onset_detect(onset_envelope=onset_env,
                                          units='time',
                                          sr=self.sample_rate,
                                          backtrack=True)

    def get_times(self):
        """Return the onset detection peaks as a 1-d array (in seconds)."""
        return self.od_result

    def get_onset_clips(self, duration):
        """
        Get audio clips of length ``duration`` (seconds) following detected onsets.

        A clip that would start at or beyond the end of the audio is empty;
        such clips are skipped and the matching onset is dropped from the
        stored onset times, so ``get_times()`` stays aligned one-to-one with
        the returned clips. A clip that runs past the end of the audio is
        kept but is shorter than ``duration``.

        :param duration: length of clip in seconds
        :return: list of audio clips (slices of the input audio)
        """
        onset_times = np.asanyarray(self.od_result)
        # Same arithmetic as librosa.time_to_samples (time * sr, truncated to
        # int), done inline so it does not depend on that function's
        # signature, where `sr` became keyword-only in newer librosa releases.
        clip_len = int(duration * self.sample_rate)
        starts = (onset_times * self.sample_rate).astype(int)

        clips = []
        # Record which onsets survive instead of mutating od_result while
        # iterating: it is a NumPy array (no .remove()), and removal has to
        # be by position, not by value.
        keep = np.ones(len(starts), dtype=bool)
        for idx, start in enumerate(starts):
            clip = self.input_audio[start: start + clip_len]
            if len(clip) > 0:
                clips.append(clip)
            else:
                keep[idx] = False
        self.od_result = onset_times[keep]
        return clips

    def visualize(self):
        """Show the detected events in a pyplot window.

        The top panel is a log-frequency spectrogram of the input; the
        bottom panel (sharing the time axis) plots the onset-strength
        envelope with a dashed red line at every detected onset.
        """
        # `y` must be passed by keyword: it is keyword-only in newer librosa.
        o_env = librosa.onset.onset_strength(y=self.input_audio, sr=self.sample_rate)
        times = librosa.frames_to_time(np.arange(len(o_env)), sr=self.sample_rate)
        event_frames = librosa.core.time_to_frames(self.get_times(), sr=self.sample_rate)
        D = librosa.stft(self.input_audio)

        plt.figure()
        ax1 = plt.subplot(2, 1, 1)
        # The STFT is complex; take the magnitude explicitly (librosa would
        # otherwise warn and discard the phase itself). Pass the real sample
        # rate so the time axis matches the onset plot sharing it below.
        librosa.display.specshow(librosa.amplitude_to_db(np.abs(D), ref=np.max),
                                 sr=self.sample_rate,
                                 x_axis='time', y_axis='log')
        plt.title('Power spectrogram')

        plt.subplot(2, 1, 2, sharex=ax1)
        plt.plot(times, o_env, label='Onset strength')
        plt.vlines(times[event_frames], 0, o_env.max(), color='r', alpha=0.9,
                   linestyle='--', label='Onsets')
        plt.axis('tight')
        plt.legend(frameon=True, framealpha=0.75)
        plt.show()
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()