import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
import sklearn
import os
import csv
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras import models  #tf.keras ships with TF2; on TF1 use the standalone keras package
from tensorflow.keras import layers
#Note: this script targets an older librosa API (< 0.8): waveplot, librosa.output.write_wav
#and feature.rmse were renamed or removed in later releases
""" TODO: PLot Acoustic signal of an Audio file, Extract its features """
def f():
plt.show()
"""1. Loading Audio file"""
#audio signal is a frequencies data
audio_file = 'T08-violin.wav'
audio_path = '/home/sumanthmeenan/Desktop/projects/music genre classification/T08-violin.wav'
#output - timeseries as numpy array,sampling rate (default = 22KHZ)
# input signal(audio time series)
#sample rate is the number of samples of audio carried per second, measured in Hz or kHz.
#we're loading input audio signal
time_series, sampling_rate = librosa.load(audio_path)
print('length of time-series array:', len(time_series))
print('Default sampling rate:', sampling_rate)
print(type(time_series), type(sampling_rate))
print(time_series)
print(time_series.shape, sampling_rate)
#we can override the default sampling rate (sr=None keeps the file's native rate)
librosa.load(audio_path, sr = 44100)
librosa.load(audio_path, sr = None)
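#Quick illustrative check (not in the original): resampling changes the number of
#samples but not the duration in seconds.
ts_44k, sr_44k = librosa.load(audio_path, sr=44100)
print('duration at 22050 Hz vs 44100 Hz:',
      len(time_series) / sampling_rate, len(ts_44k) / sr_44k)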
"""2. Playing Audio only in jupiter notebook"""
# from Ipython.display import display
# import IPython.display.Audio as ipd
# ipd.Audio(audio_path)
"""3. Visualise the audio"""
#1. waveform
# timeseries numpy array is plotted
#plot of amplitude of waveform
plt.figure(figsize= (14, 5))
librosa.display.waveplot(time_series, sr=sampling_rate)
f()
#2. spectrogram - a visual representation of the spectrum of frequencies of a sound over time
#frequency vs time, computed with the STFT (short-time Fourier transform)
X = librosa.stft(time_series)
Xdb = librosa.amplitude_to_db(abs(X))
plt.figure(figsize = (14, 5))
librosa.display.specshow(Xdb, sr = sampling_rate, x_axis = 'time', y_axis= 'hz')
f()
#convert the frequency axis to a logarithmic scale
librosa.display.specshow(Xdb, sr=sampling_rate, x_axis='time', y_axis='log')
f()
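#For reference (a sketch assuming librosa's defaults n_fft=2048, hop_length=512):
#the STFT has 1 + n_fft//2 frequency bins and about 1 + len(y)//hop_length frames.
print('STFT shape:', X.shape)
print('expected shape:', (1 + 2048 // 2, 1 + len(time_series) // 512))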
"""3. writing audio file - numpy array to audio file"""
librosa.output.write_wav('sample.wav', time_series, sampling_rate)
"""4. creating audio signal"""
sampling_rate1 = 22050
Time = 5.0
t = np.linspace(0, Time, int(Time*sampling_rate1), endpoint=False)
sine_wave = 0.5*np.sin(2*np.pi*220*t)
#a sine wave is a simple sound/audio signal
#saving the audio (note: write the waveform, not the time axis)
librosa.output.write_wav('tone_220.wav', sine_wave, sampling_rate1)
"""5. feature extraction"""
#1. zero-crossing rate
time_series, sampling_rate = librosa.load('/home/sumanthmeenan/Desktop/projects/music genre classification/T08-violin.wav')
librosa.display.waveplot(time_series, sr=sampling_rate)
n0 = 9000
n1 = 9100
plt.figure(figsize=(14, 5))
plt.plot(time_series[n0:n1])
plt.grid()
f()
zero_crossings = librosa.zero_crossings(time_series[n0:n1], pad=False)
print('number of times the signal crosses zero:', sum(zero_crossings))
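#Sanity check (illustrative): the pure 220 Hz tone generated above crosses zero
#about twice per cycle, so expect roughly 2 * 220 * Time crossings in total.
sine_crossings = librosa.zero_crossings(sine_wave, pad=False)
print('sine-wave crossings:', sum(sine_crossings), '~ expected:', int(2 * 220 * Time))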
#2. Spectral Centroid
#the "centre of mass" of the sound: the magnitude-weighted mean of the frequencies present in it
#librosa.feature.spectral_centroid computes the spectral centroid for each frame in a signal:
spectral_centroids = librosa.feature.spectral_centroid(time_series, sr=sampling_rate)
spectral_centroids.shape
centroid_frequencies = spectral_centroids[0]
centroid_frequencies.shape
print('spectral centroid frequencies:',centroid_frequencies)
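#Cross-check (a minimal sketch, assuming the default magnitude-weighted definition):
#the centroid of a frame is the magnitude-weighted mean of its frequency bins.
S_mag = np.abs(librosa.stft(time_series))
freqs = librosa.fft_frequencies(sr=sampling_rate)
manual_centroid = (freqs[:, None] * S_mag).sum(axis=0) / S_mag.sum(axis=0)
print('manual vs librosa centroid (frame 0):', manual_centroid[0], centroid_frequencies[0])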
#Computing the time variable for visualization
frames = range(len(centroid_frequencies))
print('No. of frames:', len(frames))
t = librosa.frames_to_time(frames)
t.shape
plt.hist(t)
f()
def normalization(data, axis=0):
    return sklearn.preprocessing.minmax_scale(data, axis=axis)
#plot spectral centroid along waveform
norm_centroid_freq = normalization(centroid_frequencies)
print('normalised centroid frequencies:', norm_centroid_freq)
librosa.display.waveplot(time_series, sr = sampling_rate)
plt.plot(t, norm_centroid_freq, color = 'r')
plt.title('spectral centroid along waveform')
#the first dimensions of t and norm_centroid_freq should match
f()
#3. spectral rolloff
#the frequency below which x% of the total spectral energy lies
#calculate the roll-off frequency for each frame in the signal
"""The roll-off frequency is defined for each frame as the center frequency
for a spectrogram bin such that at least roll_percent (0.85 by default)
of the energy of the spectrum in this frame is contained in this bin and the bins below"""
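#A minimal sketch of that definition (assuming the default roll_percent = 0.85):
#per frame, find the lowest bin whose cumulative magnitude reaches 85% of the total.
S_mag = np.abs(librosa.stft(time_series))
cum_mag = np.cumsum(S_mag, axis=0)
rolloff_bins = np.argmax(cum_mag >= 0.85 * cum_mag[-1], axis=0)
manual_rolloff = librosa.fft_frequencies(sr=sampling_rate)[rolloff_bins]
print('manual roll-off of frame 0:', manual_rolloff[0])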
spectral_rolloff = librosa.feature.spectral_rolloff(time_series+0.01 , sr = sampling_rate)
spectral_rolloff.shape
spectral_rolloff[0].shape
#shapes: (1, 775) vs (775,)
norm_spectral_rolloff = normalization(spectral_rolloff[0])
librosa.display.waveplot(time_series, sr = sampling_rate)
plt.plot(t, norm_spectral_rolloff, color = 'r')
plt.title('spectral roll-off')
f()
"""4. Mel frequency cepstral coefficients (MFCCs)
Tells overall shape of a spectral envelope. it models the characteristics of a human voice
librosa mfcc feature cal mfccs across an audio signal"""
time_series, sampling_rate = librosa.load('/home/sumanthmeenan/Desktop/projects/music genre classification/simple-loop.wav.crdownload')
librosa.display.waveplot(time_series, sr = sampling_rate)
f()
mfcc = librosa.feature.mfcc(time_series, sr = sampling_rate)
print('Shape Of MFCCs:',mfcc.shape)
print('mfcc calculated {} MFCCS over {} frames'.format(mfcc.shape[0], mfcc.shape[1]))
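#The number of coefficients is configurable via n_mfcc (20 by default);
#13 is a common choice in speech work:
mfcc13 = librosa.feature.mfcc(time_series, sr=sampling_rate, n_mfcc=13)
print('13-coefficient MFCC shape:', mfcc13.shape)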
#display the MFCCs
librosa.display.specshow(mfcc, sr = sampling_rate, x_axis='time')
f()
#standardise MFCC values to make each coefficient dimension zero mean and unit variance
scaled_mfcc = sklearn.preprocessing.scale(mfcc, axis = 1)
scaled_mfcc
scaled_mfcc.shape
scaled_mfcc[0].shape
norm_mfcc = normalization(mfcc, axis = 1)
norm_mfcc
print(mfcc.mean(axis = 0))
print(mfcc.mean(axis = 0).shape)
print(mfcc.mean(axis = 1))
print(mfcc.mean(axis = 1).shape)
print(scaled_mfcc.mean(axis = 1))
print(norm_mfcc.mean(axis = 1))
print(mfcc.var(axis = 1))
print(scaled_mfcc.var(axis = 1))
print(norm_mfcc.var(axis = 1))
"""DOUBT - NORM_MFCC MEAN = 0 AND SCALED_MFCC VAR = 1 """
librosa.display.specshow(scaled_mfcc, sr = sampling_rate, x_axis='time')
plt.title('scaled_mfcc')
f()
librosa.display.specshow(norm_mfcc, sr = sampling_rate, x_axis='time')
plt.title('norm_mfcc')
f()
librosa.display.specshow(mfcc, sr = sampling_rate, x_axis='time')
plt.title('mfcc')
f()
#5. Chroma frequencies
#chroma features project the entire spectrum onto 12 bins representing the 12 distinct semitones (or chroma)
time_series, sampling_rate = librosa.load('/home/sumanthmeenan/Desktop/projects/music genre classification/Grand Piano.wav')
hop_length = 512
chromagram = librosa.feature.chroma_stft(time_series, sr = sampling_rate, hop_length = hop_length)
chromagram.shape
np.mean(chromagram, axis = 1).shape  #try axis = 0 vs axis = 1
"""chromagram shape: np.ndarray [shape=(n_chroma, t)]
Normalized energy for each chroma bin at each frame."""
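#The 12 chroma rows correspond to the pitch classes C, C#, D, ..., B
#(row 0 aggregates the energy of every C across octaves):
pitch_classes = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
print('strongest pitch class overall:', pitch_classes[np.argmax(chromagram.mean(axis=1))])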
plt.figure(figsize=(15,5))
librosa.display.specshow(chromagram, sr = sampling_rate, x_axis='time',y_axis='chroma', hop_length=hop_length, cmap = 'viridis')
plt.title('chromagram')
f()
"""Use an energy (magnitude) spectrum instead of power spectrogram"""
S = np.abs(librosa.stft(time_series))
chroma = librosa.feature.chroma_stft(S=S, sr=sampling_rate)
plt.figure(figsize=(15,5))
librosa.display.specshow(chroma, sr = sampling_rate, x_axis='time',y_axis='chroma', hop_length=hop_length, cmap = 'coolwarm')
plt.title('Energy spectrum - chroma1')
f()
"""Use a pre-computed power spectrogram with a larger frame"""
S = np.abs(librosa.stft(time_series, n_fft=4096))**2
chroma = librosa.feature.chroma_stft(S=S, sr=sampling_rate)
plt.figure(figsize=(15,5))
librosa.display.specshow(chroma, sr = sampling_rate, x_axis='time',y_axis='chroma', hop_length=hop_length, cmap = 'coolwarm')
plt.title('Energy spectrum - chroma2')
f()
#6. spectral bandwidth - the weighted spread of frequencies around the spectral centroid
time_series, sampling_rate = librosa.load('/home/sumanthmeenan/Desktop/projects/music genre classification/Grand Piano.wav')
spectral_bandwidth = librosa.feature.spectral_bandwidth(time_series, sr=sampling_rate)
"""convert .au to .wav 4 compatability with pythons wave module 4 reading audio files.
we need ffmpeg,pydub"""
from pydub import AudioSegment
genres = os.listdir('/home/sumanthmeenan/Desktop/projects/music genre classification/genres')
for i in genres:
    au_files = os.listdir('/home/sumanthmeenan/Desktop/projects/music genre classification/genres/' + i + '/au')
    for j in au_files:
        sound = AudioSegment.from_file('/home/sumanthmeenan/Desktop/projects/music genre classification/genres/' + i + '/au/' + j, "au")
        sound.export('/home/sumanthmeenan/Desktop/projects/music genre classification/genres/' + i + '/wav/' + str(j[:-3]) + '.wav', format="wav")
sound = AudioSegment.from_file('/home/sumanthmeenan/Desktop/projects/music genre classification/genres/disco/disco.00082.au', "au")
sound.export('/home/sumanthmeenan/Desktop/projects/music genre classification/got.wav', format="wav")
"""Save spectrogram of every Audio File"""
cmap = plt.get_cmap('inferno')
plt.figure(figsize = (10,10))
genres = os.listdir('/home/sumanthmeenan/Desktop/projects/music genre classification/genres')
for i in genres:
    au_files = os.listdir('/home/sumanthmeenan/Desktop/projects/music genre classification/genres/' + i + '/au')
    for j in au_files:
        time_series, sampling_rate = librosa.load('/home/sumanthmeenan/Desktop/projects/music genre classification/genres/' + i + '/au/' + j)
        plt.specgram(time_series, NFFT = 2048, Fs = 2, Fc = 0, noverlap=128, cmap = cmap, sides='default',
                     mode='default', scale='dB')
        plt.savefig('/home/sumanthmeenan/Desktop/projects/music genre classification/genres/' + i + '/img/' + str(j[:-3]) + '.jpg')
"""Extract 5 features(MFCC, SPECTRAL CENTROID, SPECTRAL ROLL-OFF,
Chroma Frequencies, Zero crossing rate) from each spectrogram and store it in a CSV file"""
#feature names
features = 'filename zero_crossing_rate spectral_centroid spectral_rolloff chroma rmse spectral_bandwidth '
for i in range(1, 21):
features += 'mfcc'+str(i) + " " #f-strings
features += 'label'
features = features.split()
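#Sanity check: 1 filename + 6 summary features + 20 MFCCs + 1 label = 28 columns
print('number of CSV columns:', len(features))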
""" Writing Data to CSV file """
file1 = open('/home/sumanthmeenan/Desktop/projects/music genre classification/created_data.csv', 'w')
with file1:
writer = csv.writer(file1)
writer.writerow(features)
genres = os.listdir('/home/sumanthmeenan/Desktop/projects/music genre classification/genres')
for i in genres:
    au_files = os.listdir('/home/sumanthmeenan/Desktop/projects/music genre classification/genres/' + i + '/au')
    for j in au_files:
        x = []
        time_series, sampling_rate = librosa.load('/home/sumanthmeenan/Desktop/projects/music genre classification/genres/' + i + '/au/' + j)
        x.append(j)
        zero_crossing_rate = librosa.zero_crossings(time_series, pad=False)
        x.append(np.mean(zero_crossing_rate))
        spectral_centroid = librosa.feature.spectral_centroid(time_series, sr=sampling_rate)
        x.append(np.mean(spectral_centroid))
        spectral_rolloff = librosa.feature.spectral_rolloff(time_series+0.01, sr = sampling_rate)
        x.append(np.mean(spectral_rolloff))
        chroma = librosa.feature.chroma_stft(time_series, sr = sampling_rate, hop_length = 512)
        x.append(np.mean(chroma))
        rmse = librosa.feature.rmse(time_series)  #renamed librosa.feature.rms in librosa >= 0.7
        x.append(np.mean(rmse))
        spectral_bandwidth = librosa.feature.spectral_bandwidth(time_series, sr=sampling_rate)
        x.append(np.mean(spectral_bandwidth))
        mfcc = librosa.feature.mfcc(time_series, sr=sampling_rate)
        #append the mean of each of the 20 MFCC coefficients
        for k in mfcc:
            x.append(np.mean(k))
        x.append(i)  #genre label
        print(x)
        file = open('/home/sumanthmeenan/Desktop/projects/music genre classification/created_data.csv', 'a', newline='')
        with file:
            writer = csv.writer(file)
            writer.writerow(x)
data = pd.read_csv('/home/sumanthmeenan/Desktop/projects/music genre classification/created_data.csv')
data.head()
data.shape
data1 = data.drop(data.columns[0], axis = 1)  #drop the filename column
#Label Encoding
label_encoder = preprocessing.LabelEncoder()  #renamed to avoid shadowing the class name
label_encoder.fit(data['label'])
list(label_encoder.classes_)
labels = label_encoder.transform(data['label'])
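#The fitted encoder exposes the index-to-genre mapping, handy for decoding predictions later:
print('label mapping:', dict(enumerate(label_encoder.classes_)))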
#Feature Scaling
feature_scaler = StandardScaler()
features = feature_scaler.fit_transform(np.array(data1.iloc[:, :-1], dtype = float))
#Train-Test-Split
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size = 0.3)
print('shape of x_train is:', x_train.shape)
print('shape of y_train is:', y_train.shape)
print('shape of x_test is:', x_test.shape)
print('shape of y_test is:', y_test.shape)
#Applying classification Algorithms to Data
#Initialize a NN
model = models.Sequential()
#1st hidden layer has 256 neurons, input layer has 26 neurons (26 features in the data)
model.add(layers.Dense(256, activation='relu', input_shape=(x_train.shape[1],)))
#2nd hidden layer has 128 neurons
model.add(layers.Dense(128, activation='relu'))
#3rd hidden layer has 64 neurons
model.add(layers.Dense(64, activation='relu'))
#output layer has 10 neurons (one per genre)
model.add(layers.Dense(10, activation='softmax'))
#loss = 'sparse_categorical_crossentropy' for integer-encoded multiclass labels
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
history = model.fit(x_train,
                    y_train,
                    epochs=20,
                    batch_size=128)
test_loss, test_accuracy = model.evaluate(x_test, y_test)
print('Accuracy of test data is:', test_accuracy)
""" SPlit DATA into TRAINING + VALIDATION + TESTING """
x_val = x_train[:200]
x_train1 = x_train[200:]
y_val = y_train[:200]
y_train1 = y_train[200:]
#x_train1, y_train1 - training data
#x_val, y_val - validation data
#x_test, y_test - testing data
#note: fit() continues training the already-trained model's weights
history = model.fit(x_train1,
                    y_train1,
                    epochs=20,
                    batch_size=128)
test_loss, test_accuracy = model.evaluate(x_val, y_val)
#if validation accuracy lags far behind training accuracy, the model memorised
#rather than generalised - overfitting
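#One common remedy (a sketch, not part of the original pipeline): add Dropout
#layers and pass validation_data to fit() so overfitting is visible per epoch.
model2 = models.Sequential([
    layers.Dense(256, activation='relu', input_shape=(x_train.shape[1],)),
    layers.Dropout(0.3),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(64, activation='relu'),
    layers.Dense(10, activation='softmax'),
])
model2.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
               metrics=['accuracy'])
model2.fit(x_train1, y_train1, epochs=20, batch_size=128,
           validation_data=(x_val, y_val))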
y_pred = model.predict(x_test)
y_pred[0]
print('predicted genre is:', np.argmax(y_pred[0]))
x_test[0]
print('Actual genre is:', y_test[0])
y_pred[20]
print('predicted genre is:', np.argmax(y_pred[20]))
x_test[20]
print('Actual genre is:', y_test[20])
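#Decoding the predicted index back to a genre name via the fitted encoder:
print('predicted genre name:', label_encoder.inverse_transform([np.argmax(y_pred[20])])[0])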