diff --git a/INSTALLATION.md b/INSTALLATION.md
index 34161ac2..82c99e1f 100644
--- a/INSTALLATION.md
+++ b/INSTALLATION.md
@@ -21,6 +21,7 @@ Install the dependencies:
     sudo yum install numpy scipy python-matplotlib ffmpeg portaudio-devel
     pip install PyAudio
     pip install pydub
+    pip install audioread
 
 Now setup virtualenv ([howto?](http://www.pythoncentral.io/how-to-install-virtualenv-python/)):
 
diff --git a/dejavu/__init__.py b/dejavu/__init__.py
index 4f6e6e8a..84116ecc 100755
--- a/dejavu/__init__.py
+++ b/dejavu/__init__.py
@@ -6,6 +6,34 @@
 import traceback
 import sys
 
+import shutil
+import subprocess
+import os.path
+from dejavu.decoder import get_duration
+
+
+def assure_path_exists(path):
+    """Create directory `path`, including missing parents, if it does not exist yet."""
+    try:
+        os.makedirs(path)
+    except OSError:
+        # Already there (possibly created concurrently) is fine; anything else is not.
+        if not os.path.isdir(path):
+            raise
+
+
+class SplitError(Exception):
+    """Raised when ffmpeg fails to write one slice of a file being split."""
+
+    def __init__(self, file_path, output_file, error_code):
+        Exception.__init__(self)
+        self.file_path = file_path
+        self.error_code = error_code
+        self.output_file = output_file
+
+    def __str__(self):
+        return "Spliting of file({0}) failed to ({1}). ffmpeg returned error code: {2}".format(self.file_path, self.output_file, self.error_code)
+
 
 class Dejavu(object):
 
@@ -16,6 +44,12 @@ class Dejavu(object):
     OFFSET = 'offset'
     OFFSET_SECS = 'offset_seconds'
 
+    # Settings used when a long file is cut into slices before fingerprinting.
+    SPLIT_DIR = "split_dir"  # sub-directory, next to the input file, that holds the temporary slices
+    SLICE_LIMIT_WHEN_SPLITTING = 3  # in minutes
+    LIMIT_CPU_CORES_FOR_SPLITS = 3  # nprocesses passed to fingerprint_directory() for the slices
+    OVERWRITE_TEMP_FILES_WHEN_SPLITING = 1  # index into ["-n", "-y"]: 1 overwrites slices, 0 never does
+
     def __init__(self, config):
         super(Dejavu, self).__init__()
 
@@ -42,7 +76,13 @@ def get_fingerprinted_songs(self):
             song_hash = song[Database.FIELD_FILE_SHA1]
             self.songhashes_set.add(song_hash)
 
-    def fingerprint_directory(self, path, extensions, nprocesses=None):
+    def fingerprint_directory(self, path, extensions, nprocesses=None, treat_as_split=False, song_splitted_sid=""):
+        """Fingerprint the files that decoder.find_files() yields for `path` and `extensions`.
+
+        With `treat_as_split` set, the files are slices of a single song: no
+        song row is inserted for them and their hashes are stored under the
+        existing song id `song_splitted_sid`.
+        """
         # Try to use the maximum amount of processes if not given.
         try:
             nprocesses = nprocesses or multiprocessing.cpu_count()
@@ -84,8 +124,11 @@ def fingerprint_directory(self, path, extensions, nprocesses=None):
                 # Print traceback because we can't reraise it here
                 traceback.print_exc(file=sys.stdout)
             else:
-                sid = self.db.insert_song(song_name, file_hash)
-
+                if treat_as_split:
+                    # Slices share the song row created by the caller.
+                    sid = song_splitted_sid
+                else:
+                    sid = self.db.insert_song(song_name, file_hash)
                 self.db.insert_hashes(sid, hashes)
                 self.db.set_song_fingerprinted(sid)
                 self.get_fingerprinted_songs()
@@ -112,6 +155,73 @@ def fingerprint_file(self, filepath, song_name=None):
             self.db.set_song_fingerprinted(sid)
             self.get_fingerprinted_songs()
 
+    def fingerprint_with_duration_check(self, input_file, song_name=None):
+        """Fingerprint `input_file`, splitting it into slices first when it is long.
+
+        Files shorter than SLICE_LIMIT_WHEN_SPLITTING minutes are handed to
+        fingerprint_file() unchanged. Longer files are cut with ffmpeg (stream
+        copy, no re-encoding) into slices of that length below
+        dirname(input_file)/SPLIT_DIR/song_name/. The slices are then
+        fingerprinted via fingerprint_directory() under one song id, and the
+        temporary slice directory is removed again, even on failure.
+
+        `song_name` defaults to the file name without its extension.
+
+        Raises SplitError when ffmpeg exits with a non-zero return code.
+
+        NOTE(review): each slice is fingerprinted as a separate file, so its
+        hash offsets are presumably relative to the slice start rather than to
+        the whole song -- confirm that matching copes with this.
+        """
+        duration = get_duration(input_file)
+        split_length = self.SLICE_LIMIT_WHEN_SPLITTING * 60
+        if duration < split_length:
+            return self.fingerprint_file(input_file, song_name=song_name)
+
+        songname, extension = os.path.splitext(os.path.basename(input_file))
+        song_name = song_name or songname
+
+        # don't refingerprint already fingerprinted files; songhashes_set holds
+        # file hashes (FIELD_FILE_SHA1), so the hash is compared, not the name
+        song_hash = decoder.unique_hash(input_file)
+        if song_hash in self.songhashes_set:
+            print "%s already fingerprinted, continuing..." % song_name
+            return
+
+        file_directory = os.path.dirname(input_file)
+        output_path = os.path.join(file_directory, self.SPLIT_DIR, song_name)
+        assure_path_exists(output_path)
+        # "-y" overwrites existing slice files, "-n" refuses to
+        overwrite_flag = ["-n", "-y"][self.OVERWRITE_TEMP_FILES_WHEN_SPLITING]
+        try:
+            start_offset = 0
+            # ffmpeg is chatty on stderr; discard it and rely on the return code
+            with open(os.devnull, "w") as devnull:
+                while start_offset < duration:
+                    end_offset = min(start_offset + split_length, duration)
+                    output_file = os.path.join(output_path, "start_sec{0}_end_sec{1}{2}".format(start_offset, end_offset, extension))
+                    convertion_command = ['ffmpeg',
+                                          '-i', input_file,
+                                          "-acodec", "copy",  # fastest convertion possible 1:1 copy
+                                          overwrite_flag,
+                                          "-vn",  # Drop any video streams if there are any
+                                          '-ss', str(start_offset),
+                                          '-t', str(split_length),
+                                          output_file]
+                    retcode = subprocess.call(convertion_command, stderr=devnull)
+                    if retcode != 0:
+                        raise SplitError(input_file, output_file, retcode)
+                    start_offset += split_length
+
+            # Insert the song only once every slice exists, so a failed split
+            # does not leave an empty song row behind.
+            sid = self.db.insert_song(song_name, song_hash)
+            self.fingerprint_directory(output_path, [extension],
+                                       nprocesses=self.LIMIT_CPU_CORES_FOR_SPLITS,
+                                       treat_as_split=True, song_splitted_sid=sid)
+        finally:
+            shutil.rmtree(output_path)
+
     def find_matches(self, samples, Fs=fingerprint.DEFAULT_FS):
         hashes = fingerprint.fingerprint(samples, Fs=Fs)
         return self.db.return_matches(hashes)
diff --git a/dejavu/decoder.py b/dejavu/decoder.py
index 04aa39f4..a0df001c 100755
--- a/dejavu/decoder.py
+++ b/dejavu/decoder.py
@@ -6,6 +6,15 @@
 import wavio
 from hashlib import sha1
 
+# https://github.com/sampsyo/audioread
+import audioread
+
+def get_duration(file_path):
+    """Return the duration of the audio file at `file_path`, in seconds."""
+    # the context manager closes the file; no explicit close() is needed
+    with audioread.audio_open(file_path) as f:
+        return f.duration
+
 def unique_hash(filepath, blocksize=2**20):
     """ Small function to generate a hash to uniquely generate a file.
     Inspired by MD5 version here:
diff --git a/requirements.txt b/requirements.txt
index 9478f734..a0e3a808 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,6 +3,7 @@
 ### BEGIN ###
 pydub>=0.9.4
 PyAudio>=0.2.7
+audioread>=1.2.1
 numpy>=1.8.2
 scipy>=0.12.1
 matplotlib>=1.3.1
diff --git a/setup.py b/setup.py
index 8312d1d5..01240099 100644
--- a/setup.py
+++ b/setup.py
@@ -42,7 +42,7 @@ def parse_requirements(requirements):
     author_email='will.drevo@gmail.com',
     maintainer="Will Drevo",
     maintainer_email="will.drevo@gmail.com",
-    url='http://github.com/tuxdna/dejavu',
+    url='http://github.com/worldveil/dejavu',
     license='MIT License',
     include_package_data=True,
     packages=find_packages(),
diff --git a/test_fingerprint_by_splitting.py b/test_fingerprint_by_splitting.py
new file mode 100644
index 00000000..19a5f307
--- /dev/null
+++ b/test_fingerprint_by_splitting.py
@@ -0,0 +1,52 @@
+from dejavu import Dejavu
+import warnings
+import json
+import os
+import subprocess
+warnings.filterwarnings("ignore")
+
+# load config from a JSON file (or anything outputting a python dictionary)
+with open("dejavu.cnf.SAMPLE") as f:
+    config = json.load(f)
+
+
+class ConcatError(Exception):
+    """Raised when ffmpeg fails to concatenate the files named in the list file."""
+
+    def __init__(self, list_file, output_file, error_code):
+        Exception.__init__(self)
+        self.list_file = list_file
+        self.error_code = error_code
+        self.output_file = output_file
+
+    def __str__(self):
+        return "Problem with list file({0}). Failed to create({1}). ffmpeg returned error code: {2}".format(self.list_file, self.output_file, self.error_code)
+
+
+if __name__ == '__main__':
+    # Concatenates ./mp3/*.mp3
+    # Test fingerprinting the long concatenated file
+    list_file = "mp3/concatenation_list.txt"
+    long_song = "mp3/concatenated.mp3"
+
+    # Pass argv as a list: without shell=True a single command string is
+    # looked up as one program name on POSIX and the call fails.
+    concat_mp3_file_for_test = ["ffmpeg", "-f", "concat", "-i", list_file, "-y", "-c", "copy", long_song]
+    with open(os.devnull, "w") as devnull:
+        retcode = subprocess.call(concat_mp3_file_for_test, stderr=devnull)
+    if retcode != 0:
+        raise ConcatError(list_file, long_song, retcode)
+
+    # create a Dejavu instance
+    djv = Dejavu(config)
+
+    try:
+        djv.fingerprint_file(long_song)
+    except Exception as err:
+        err = str(err) or "Memory Error"  # Memory Errors does not have a string representation (as tested in Windows)
+        print "Exception raised during common fingerprint_file():({0}) so will split the file".format(err)
+    else:
+        # Raising a plain string is a TypeError; raise a real exception instead.
+        raise RuntimeError("This file was successfully fingerprinted and splitting was not needed")
+
+    djv.fingerprint_with_duration_check(long_song, song_name="Concatenates_test")