From 0a0df7e5fb9fc6da63eccbcb66850f8e3cc071f0 Mon Sep 17 00:00:00 2001 From: "Iskren.Stanislavov" Date: Mon, 20 Apr 2015 17:55:06 +0300 Subject: [PATCH 1/2] port branch --- INSTALLATION.md | 1 + dejavu/__init__.py | 75 ++++++++++++++++++++++++++++++-- dejavu/decoder.py | 10 +++++ requirements.txt | 1 + setup.py | 2 +- test_fingerprint_by_splitting.py | 48 ++++++++++++++++++++ 6 files changed, 132 insertions(+), 5 deletions(-) create mode 100644 test_fingerprint_by_splitting.py diff --git a/INSTALLATION.md b/INSTALLATION.md index 34161ac2..82c99e1f 100644 --- a/INSTALLATION.md +++ b/INSTALLATION.md @@ -21,6 +21,7 @@ Install the dependencies: sudo yum install numpy scipy python-matplotlib ffmpeg portaudio-devel pip install PyAudio pip install pydub + pip install audioread Now setup virtualenv ([howto?](http://www.pythoncentral.io/how-to-install-virtualenv-python/)): diff --git a/dejavu/__init__.py b/dejavu/__init__.py index 4f6e6e8a..c3ce582a 100755 --- a/dejavu/__init__.py +++ b/dejavu/__init__.py @@ -6,6 +6,24 @@ import traceback import sys +import shutil +import subprocess +import os.path +from dejavu.decoder import get_duration + +def assure_path_exists(path): + if not os.path.isdir(path): + os.makedirs(path) + +class SplitError(Exception): + def __init__(self, file_path, output_file, error_code): + Exception.__init__(self) + self.file_path = file_path + self.error_code = error_code + self.output_file = output_file + + def __str__(self): + return "Spliting of file({0}) failed to ({1}). ffmpeg returned error code: {2}".format(self.file_path, self.output_file, self.error_code) class Dejavu(object): @@ -16,6 +34,11 @@ class Dejavu(object): OFFSET = 'offset' OFFSET_SECS = 'offset_seconds' + SPLIT_DIR = "split_dir" + SLICE_LIMIT_WHEN_SPLITTING = 3 # in minutes + LIMIT_CPU_CORES_FOR_SPLITS = 3 + OVERWRITE_TEMP_FILES_WHEN_SPLITING = 1 + def __init__(self, config): super(Dejavu, self).__init__() @@ -42,7 +65,7 @@ def get_fingerprinted_songs(self): song_hash = song[Database.FIELD_FILE_SHA1] self.songhashes_set.add(song_hash) - def fingerprint_directory(self, path, extensions, nprocesses=None): + def fingerprint_directory(self, path, extensions, nprocesses=None, treat_as_split=False, song_name_for_the_split=""): # Try to use the maximum amount of processes if not given. try: nprocesses = nprocesses or multiprocessing.cpu_count() @@ -57,7 +80,7 @@ def fingerprint_directory(self, path, extensions, nprocesses=None): for filename, _ in decoder.find_files(path, extensions): # don't refingerprint already fingerprinted files - if decoder.unique_hash(filename) in self.songhashes_set: + if decoder.path_to_songname(filename) in self.songnames_set: print "%s already fingerprinted, continuing..." % filename continue @@ -70,6 +93,8 @@ def fingerprint_directory(self, path, extensions, nprocesses=None): # Send off our tasks iterator = pool.imap_unordered(_fingerprint_worker, worker_input) + if treat_as_split and song_name_for_the_split: + sid = self.db.insert_song(song_name_for_the_split, file_hash) # Loop till we have all of them while True: @@ -84,8 +109,8 @@ def fingerprint_directory(self, path, extensions, nprocesses=None): # Print traceback because we can't reraise it here traceback.print_exc(file=sys.stdout) else: - sid = self.db.insert_song(song_name, file_hash) - + if not treat_as_split: + sid = self.db.insert_song(song_name, file_hash) self.db.insert_hashes(sid, hashes) self.db.set_song_fingerprinted(sid) self.get_fingerprinted_songs() @@ -112,6 +137,48 @@ def fingerprint_file(self, filepath, song_name=None): self.db.set_song_fingerprinted(sid) self.get_fingerprinted_songs() + def fingerprint_with_duration_check(self, input_file, song_name=None): + duration = get_duration(input_file) + split_length = self.SLICE_LIMIT_WHEN_SPLITTING * 60 + if duration < split_length: + return self.fingerprint_file(input_file) + songname, extension = os.path.splitext(os.path.basename(input_file)) + song_name = song_name or songname + # don't refingerprint already fingerprinted files + if song_name in self.songnames_set: + print "%s already fingerprinted, continuing..." % song_name + return + file_directory = os.path.dirname(input_file) + output_path = os.path.join(file_directory, self.SPLIT_DIR, song_name) + assure_path_exists(output_path) + start_offset = 0 + end_offset = split_length + retcode = 0 + sid = self.db.insert_song(song_name) + while start_offset < duration: + output_file = os.path.join(output_path, "start_sec{0}_end_sec{1}{2}".format(start_offset, end_offset, extension)) + convertion_command = [ 'ffmpeg', + '-i', input_file, + "-acodec", "copy", #fastest convertion possible 1:1 copy + ["-n","-y"][self.OVERWRITE_TEMP_FILES_WHEN_SPLITING], # always overwrite existing files + "-vn", # Drop any video streams if there are any + '-ss', str(start_offset), + '-t', str(split_length), + output_file] + retcode = subprocess.call(convertion_command, stderr=open(os.devnull)) + if retcode != 0: + raise SplitError(input_file, output_file, retcode) + start_offset += split_length + end_offset += split_length + end_offset = min(end_offset, duration) + + self.db.set_song_fingerprinted(sid) + self.get_fingerprinted_songs() + self.fingerprint_directory(output_path, [extension], + nprocesses=self.LIMIT_CPU_CORES_FOR_SPLITS, + treat_as_split=True, song_name_for_the_split=song_name) + shutil.rmtree(output_path) + def find_matches(self, samples, Fs=fingerprint.DEFAULT_FS): hashes = fingerprint.fingerprint(samples, Fs=Fs) return self.db.return_matches(hashes) diff --git a/dejavu/decoder.py b/dejavu/decoder.py index 04aa39f4..a0df001c 100755 --- a/dejavu/decoder.py +++ b/dejavu/decoder.py @@ -6,6 +6,16 @@ import wavio from hashlib import sha1 +# https://github.com/sampsyo/audioread +import audioread + +def get_duration(file_path): + duration = 0 + with audioread.audio_open(file_path) as f: + duration = f.duration + f.close() + return duration + def unique_hash(filepath, blocksize=2**20): """ Small function to generate a hash to uniquely generate a file. Inspired by MD5 version here: diff --git a/requirements.txt b/requirements.txt index 9478f734..a0e3a808 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,7 @@ ### BEGIN ### pydub>=0.9.4 PyAudio>=0.2.7 +audioread>=1.2.1 numpy>=1.8.2 scipy>=0.12.1 matplotlib>=1.3.1 diff --git a/setup.py b/setup.py index 8312d1d5..01240099 100644 --- a/setup.py +++ b/setup.py @@ -42,7 +42,7 @@ def parse_requirements(requirements): author_email='will.drevo@gmail.com', maintainer="Will Drevo", maintainer_email="will.drevo@gmail.com", - url='http://github.com/tuxdna/dejavu', + url='http://github.com/worldveil/dejavu', license='MIT License', include_package_data=True, packages=find_packages(), diff --git a/test_fingerprint_by_splitting.py b/test_fingerprint_by_splitting.py new file mode 100644 index 00000000..f825364d --- /dev/null +++ b/test_fingerprint_by_splitting.py @@ -0,0 +1,48 @@ +from dejavu import Dejavu +import warnings +import json +import os, subprocess +warnings.filterwarnings("ignore") + +# load config from a JSON file (or anything outputting a python dictionary) +with open("dejavu.cnf.SAMPLE") as f: + config = json.load(f) + +class ConcatError(Exception): + def __init__(self, list_file, output_file, error_code): + Exception.__init__(self) + self.list_file = list_file + self.error_code = error_code + self.output_file = output_file + + def __str__(self): + return "Problem with list file({0}). Failed to create({1}). ffmpeg returned error code: {2}".format(self.list_file, self.output_file, self.error_code) + + +if __name__ == '__main__': + ''' + Concatenates ./mp3/*.mp3 + Test fingerprinting the long concatenated file + ''' + list_file = "mp3/concatenation_list.txt" + long_song = "mp3/concatenated.mp3" + + concat_mp3_file_for_test = "ffmpeg -f concat -i {0} -y -c copy {1}".format(list_file, long_song) + retcode = subprocess.call(concat_mp3_file_for_test, stderr=open(os.devnull)) + if retcode != 0: + raise ConcatError(list_file, long_song, retcode) + + # create a Dejavu instance + djv = Dejavu(config) + + try: + djv.fingerprint_file(long_song) + except Exception as err: + err = str(err) or "Memory Error" # Memory Errors does not have a string representation (as tested in Windows) + print "Exception raised during common fingerprint_file():({0}) so will split the file".format(err) + else: + raise "This file was successfully ingerprinted and splitting was not needed" + + djv.fingerprint_with_duration_check(long_song, song_name="Concatenates12345") + + From d15a90ab42fcac408d9affc14bb63586112b655e Mon Sep 17 00:00:00 2001 From: "Iskren.Stanislavov" Date: Wed, 5 Aug 2015 20:19:11 +0300 Subject: [PATCH 2/2] fix to match with the master branch --- dejavu/__init__.py | 16 +++++++++------- test_fingerprint_by_splitting.py | 2 +- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/dejavu/__init__.py b/dejavu/__init__.py index c3ce582a..84116ecc 100755 --- a/dejavu/__init__.py +++ b/dejavu/__init__.py @@ -65,7 +65,7 @@ def get_fingerprinted_songs(self): song_hash = song[Database.FIELD_FILE_SHA1] self.songhashes_set.add(song_hash) - def fingerprint_directory(self, path, extensions, nprocesses=None, treat_as_split=False, song_name_for_the_split=""): + def fingerprint_directory(self, path, extensions, nprocesses=None, treat_as_split=False, song_splitted_sid=""): # Try to use the maximum amount of processes if not given. try: nprocesses = nprocesses or multiprocessing.cpu_count() @@ -80,7 +80,7 @@ def fingerprint_directory(self, path, extensions, nprocesses=None, treat_as_spli for filename, _ in decoder.find_files(path, extensions): # don't refingerprint already fingerprinted files - if decoder.path_to_songname(filename) in self.songnames_set: + if decoder.path_to_songname(filename) in self.songhashes_set: print "%s already fingerprinted, continuing..." % filename continue @@ -93,8 +93,6 @@ def fingerprint_directory(self, path, extensions, nprocesses=None, treat_as_spli # Send off our tasks iterator = pool.imap_unordered(_fingerprint_worker, worker_input) - if treat_as_split and song_name_for_the_split: - sid = self.db.insert_song(song_name_for_the_split, file_hash) # Loop till we have all of them while True: @@ -111,6 +109,8 @@ def fingerprint_directory(self, path, extensions, nprocesses=None, treat_as_spli else: if not treat_as_split: sid = self.db.insert_song(song_name, file_hash) + else: + sid = song_splitted_sid self.db.insert_hashes(sid, hashes) self.db.set_song_fingerprinted(sid) self.get_fingerprinted_songs() @@ -145,7 +145,7 @@ def fingerprint_with_duration_check(self, input_file, song_name=None): songname, extension = os.path.splitext(os.path.basename(input_file)) song_name = song_name or songname # don't refingerprint already fingerprinted files - if song_name in self.songnames_set: + if song_name in self.songhashes_set: print "%s already fingerprinted, continuing..." % song_name return file_directory = os.path.dirname(input_file) @@ -154,7 +154,9 @@ def fingerprint_with_duration_check(self, input_file, song_name=None): start_offset = 0 end_offset = split_length retcode = 0 - sid = self.db.insert_song(song_name) + song_hash = decoder.unique_hash(input_file) + + sid = self.db.insert_song(song_name, song_hash) while start_offset < duration: output_file = os.path.join(output_path, "start_sec{0}_end_sec{1}{2}".format(start_offset, end_offset, extension)) convertion_command = [ 'ffmpeg', @@ -176,7 +178,7 @@ def fingerprint_with_duration_check(self, input_file, song_name=None): self.get_fingerprinted_songs() self.fingerprint_directory(output_path, [extension], nprocesses=self.LIMIT_CPU_CORES_FOR_SPLITS, - treat_as_split=True, song_name_for_the_split=song_name) + treat_as_split=True, song_splitted_sid=sid) shutil.rmtree(output_path) def find_matches(self, samples, Fs=fingerprint.DEFAULT_FS): diff --git a/test_fingerprint_by_splitting.py b/test_fingerprint_by_splitting.py index f825364d..19a5f307 100644 --- a/test_fingerprint_by_splitting.py +++ b/test_fingerprint_by_splitting.py @@ -43,6 +43,6 @@ def __str__(self): else: raise "This file was successfully ingerprinted and splitting was not needed" - djv.fingerprint_with_duration_check(long_song, song_name="Concatenates12345") + djv.fingerprint_with_duration_check(long_song, song_name="Concatenates_test")