worldveil · thesunlover · Apr 20, 2015 · Aug 5, 2015
diff --git a/INSTALLATION.md b/INSTALLATION.md
@@ -21,6 +21,7 @@ Install the dependencies:
     sudo yum install numpy scipy python-matplotlib ffmpeg portaudio-devel
     pip install PyAudio
     pip install pydub
+    pip install audioread
 
 Now setup virtualenv ([howto?](http://www.pythoncentral.io/how-to-install-virtualenv-python/)):
 

diff --git a/dejavu/__init__.py b/dejavu/__init__.py
@@ -6,6 +6,24 @@
 import traceback
 import sys
 
+import shutil
+import subprocess
+import os.path
+from dejavu.decoder import get_duration
+
+def assure_path_exists(path):
+    """Create the directory `path`, including missing parents, if needed.
+
+    Does nothing when `path` is already a directory.  Raises OSError when
+    the directory cannot be created (for example when `path` exists but
+    is not a directory).
+    """
+    try:
+        os.makedirs(path)
+    except OSError:
+        # makedirs also fails when the leaf already exists.  Checking after
+        # the call, instead of before it, avoids a race with another
+        # process that creates the directory in between.
+        if not os.path.isdir(path):
+            raise
+
+class SplitError(Exception):
+    """Raised when ffmpeg fails to cut a slice out of an audio file.
+
+    Attributes:
+        file_path   -- the file that was being split
+        output_file -- the slice ffmpeg was asked to write
+        error_code  -- the exit code returned by ffmpeg
+    """
+
+    def __init__(self, file_path, output_file, error_code):
+        # Pass the values on to Exception as well, so that err.args is
+        # populated and the exception can be pickled and unpickled.
+        super(SplitError, self).__init__(file_path, output_file, error_code)
+        self.file_path = file_path
+        self.error_code = error_code
+        self.output_file = output_file
+
+    def __str__(self):
+        return "Spliting of file({0}) failed to ({1}). ffmpeg returned error code: {2}".format(self.file_path, self.output_file, self.error_code)
 
 class Dejavu(object):
 
@@ -16,6 +34,11 @@ class Dejavu(object):
     OFFSET = 'offset'
     OFFSET_SECS = 'offset_seconds'
 
+    # Settings used by fingerprint_with_duration_check() when it splits long files.
+    # Name of the sub-directory, created next to the input file, that holds the temporary slices.
+    SPLIT_DIR = "split_dir"
+    # Files at least this long are split; this is also the length of each slice.
+    SLICE_LIMIT_WHEN_SPLITTING = 3 # in minutes
+    # Passed as `nprocesses` to fingerprint_directory() when fingerprinting the slices.
+    LIMIT_CPU_CORES_FOR_SPLITS = 3
+    # 1 selects ffmpeg's "-y" (overwrite existing slices), 0 selects "-n" (never overwrite).
+    OVERWRITE_TEMP_FILES_WHEN_SPLITING = 1
+
     def __init__(self, config):
         super(Dejavu, self).__init__()
 
@@ -42,7 +65,7 @@ def get_fingerprinted_songs(self):
             song_hash = song[Database.FIELD_FILE_SHA1]
             self.songhashes_set.add(song_hash)
 
-    def fingerprint_directory(self, path, extensions, nprocesses=None):
+    def fingerprint_directory(self, path, extensions, nprocesses=None, treat_as_split=False, song_splitted_sid=""):
         # Try to use the maximum amount of processes if not given.
         try:
             nprocesses = nprocesses or multiprocessing.cpu_count()
@@ -57,7 +80,7 @@ def fingerprint_directory(self, path, extensions, nprocesses=None):
         for filename, _ in decoder.find_files(path, extensions):
 
             # don't refingerprint already fingerprinted files
-            if decoder.unique_hash(filename) in self.songhashes_set:
+            if decoder.path_to_songname(filename) in self.songhashes_set:
                 print "%s already fingerprinted, continuing..." % filename
                 continue
 
@@ -84,8 +107,10 @@ def fingerprint_directory(self, path, extensions, nprocesses=None):
                 # Print traceback because we can't reraise it here
                 traceback.print_exc(file=sys.stdout)
             else:
-                sid = self.db.insert_song(song_name, file_hash)
-
+                if not treat_as_split:
+                    sid = self.db.insert_song(song_name, file_hash)
+                else:
+                    sid = song_splitted_sid
                 self.db.insert_hashes(sid, hashes)
                 self.db.set_song_fingerprinted(sid)
                 self.get_fingerprinted_songs()
@@ -112,6 +137,50 @@ def fingerprint_file(self, filepath, song_name=None):
             self.db.set_song_fingerprinted(sid)
             self.get_fingerprinted_songs()
 
+    def fingerprint_with_duration_check(self, input_file, song_name=None):
+        """Fingerprint `input_file`, splitting it into slices first if it is long.
+
+        Files shorter than SLICE_LIMIT_WHEN_SPLITTING minutes are handed
+        straight to fingerprint_file().  Longer files are cut with ffmpeg
+        (audio stream copy, video dropped) into consecutive slices of that
+        length inside <directory of input_file>/SPLIT_DIR/<song_name>/.  The
+        slices are then fingerprinted with fingerprint_directory() under a
+        single song id, and the temporary directory is removed afterwards.
+
+        :param input_file: path of the audio file to fingerprint.
+        :param song_name: name to store the song under; defaults to the
+            file name without its extension.
+        :raises SplitError: if ffmpeg exits with a non-zero code for a slice.
+        """
+        duration = get_duration(input_file)
+        split_length = self.SLICE_LIMIT_WHEN_SPLITTING * 60  # minutes -> seconds
+        if duration < split_length:
+            # Forward song_name so short files are stored under the
+            # requested name as well.
+            return self.fingerprint_file(input_file, song_name=song_name)
+
+        base_name, extension = os.path.splitext(os.path.basename(input_file))
+        song_name = song_name or base_name
+
+        # don't refingerprint already fingerprinted files; songhashes_set
+        # holds file hashes, so the hash (not the song name) is looked up
+        song_hash = decoder.unique_hash(input_file)
+        if song_hash in self.songhashes_set:
+            print "%s already fingerprinted, continuing..." % song_name
+            return
+
+        output_path = os.path.join(os.path.dirname(input_file), self.SPLIT_DIR, song_name)
+        assure_path_exists(output_path)
+        # "-y" overwrites existing slices, "-n" refuses to overwrite them
+        overwrite_flag = "-y" if self.OVERWRITE_TEMP_FILES_WHEN_SPLITING else "-n"
+
+        try:
+            sid = self.db.insert_song(song_name, song_hash)
+
+            # ffmpeg's stderr output is discarded; one writable handle is
+            # shared by all slices and closed when the loop is done.
+            with open(os.devnull, 'w') as devnull:
+                start_offset = 0
+                end_offset = split_length
+                while start_offset < duration:
+                    output_file = os.path.join(output_path, "start_sec{0}_end_sec{1}{2}".format(start_offset, end_offset, extension))
+                    convertion_command = [ 'ffmpeg',
+                                            '-i', input_file,
+                                            "-acodec", "copy",  # fastest conversion possible: 1:1 copy
+                                            overwrite_flag,
+                                            "-vn",  # Drop any video streams if there are any
+                                            '-ss', str(start_offset),
+                                            '-t', str(split_length),
+                                            output_file]
+                    retcode = subprocess.call(convertion_command, stderr=devnull)
+                    if retcode != 0:
+                        raise SplitError(input_file, output_file, retcode)
+                    start_offset += split_length
+                    # end_offset only names the slice; cap it at the file length
+                    end_offset = min(end_offset + split_length, duration)
+
+            self.db.set_song_fingerprinted(sid)
+            self.get_fingerprinted_songs()
+            self.fingerprint_directory(output_path, [extension],
+                nprocesses=self.LIMIT_CPU_CORES_FOR_SPLITS,
+                treat_as_split=True, song_splitted_sid=sid)
+        finally:
+            # Remove the temporary slices even when splitting or
+            # fingerprinting failed.
+            shutil.rmtree(output_path)
+
     def find_matches(self, samples, Fs=fingerprint.DEFAULT_FS):
+        """Fingerprint `samples` with fingerprint.fingerprint() and return
+        what self.db.return_matches() yields for the resulting hashes."""
         hashes = fingerprint.fingerprint(samples, Fs=Fs)
         return self.db.return_matches(hashes)

diff --git a/dejavu/decoder.py b/dejavu/decoder.py
@@ -6,6 +6,16 @@
 import wavio
 from hashlib import sha1
 
+# https://github.com/sampsyo/audioread
+import audioread
+
+def get_duration(file_path):
+    """Return the duration of the audio file at `file_path`, as reported
+    by audioread (callers compare it against a length in seconds)."""
+    # The with-statement closes the file on exit, so no explicit close()
+    # call is needed.
+    with audioread.audio_open(file_path) as f:
+        return f.duration
+
 def unique_hash(filepath, blocksize=2**20):
     """ Small function to generate a hash to uniquely generate
     a file. Inspired by MD5 version here:

diff --git a/requirements.txt b/requirements.txt
@@ -3,6 +3,7 @@
 ### BEGIN ###
 pydub>=0.9.4
 PyAudio>=0.2.7
+audioread>=1.2.1
 numpy>=1.8.2
 scipy>=0.12.1
 matplotlib>=1.3.1

diff --git a/setup.py b/setup.py
@@ -42,7 +42,7 @@ def parse_requirements(requirements):
     author_email='[email protected]',
     maintainer="Will Drevo",
     maintainer_email="[email protected]",
-    url='http://github.com/tuxdna/dejavu',
+    url='http://github.com/worldveil/dejavu',
     license='MIT License',
     include_package_data=True,
     packages=find_packages(),

diff --git a/test_fingerprint_by_splitting.py b/test_fingerprint_by_splitting.py
@@ -0,0 +1,48 @@
+from dejavu import Dejavu
+import warnings
+import json
+import os, subprocess
+warnings.filterwarnings("ignore")
+
+# Dejavu is configured from a python dictionary; here it is read from the sample JSON file.
+with open("dejavu.cnf.SAMPLE") as config_file:
+    config = json.load(config_file)
+
+class ConcatError(Exception):
+    """Raised when ffmpeg fails to concatenate the files named in a list file.
+
+    Attributes:
+        list_file   -- the concatenation list given to ffmpeg
+        output_file -- the file ffmpeg was asked to create
+        error_code  -- the exit code returned by ffmpeg
+    """
+
+    def __init__(self, list_file, output_file, error_code):
+        # Pass the values on to Exception as well, so that err.args is
+        # populated and the exception can be pickled and unpickled.
+        super(ConcatError, self).__init__(list_file, output_file, error_code)
+        self.list_file = list_file
+        self.error_code = error_code
+        self.output_file = output_file
+
+    def __str__(self):
+        return "Problem with list file({0}). Failed to create({1}). ffmpeg returned error code: {2}".format(self.list_file, self.output_file, self.error_code)
+
+
+if __name__ == '__main__':
+    # Concatenates ./mp3/*.mp3
+    # Test fingerprinting the long concatenated file
+    list_file = "mp3/concatenation_list.txt"
+    long_song = "mp3/concatenated.mp3"
+
+    # The command is an argument list: without shell=True a single command
+    # string would be taken as the program name on POSIX systems.
+    concat_mp3_file_for_test = ["ffmpeg", "-f", "concat", "-i", list_file, "-y", "-c", "copy", long_song]
+    # Discard ffmpeg's stderr output; the handle is writable and closed afterwards.
+    with open(os.devnull, "w") as devnull:
+        retcode = subprocess.call(concat_mp3_file_for_test, stderr=devnull)
+    if retcode != 0:
+        raise ConcatError(list_file, long_song, retcode)
+
+    # create a Dejavu instance
+    djv = Dejavu(config)
+
+    # The plain fingerprint_file() call is expected to fail on the long
+    # file; if it succeeds, the splitting code path is not being tested.
+    try:
+        djv.fingerprint_file(long_song)
+    except Exception as err:
+        err = str(err) or "Memory Error" # a MemoryError has no string representation (as tested on Windows)
+        print "Exception raised during common fingerprint_file():({0}) so will split the file".format(err)
+    else:
+        # Strings cannot be raised; use a real exception type.
+        raise RuntimeError("This file was successfully fingerprinted and splitting was not needed")
+
+    djv.fingerprint_with_duration_check(long_song, song_name="Concatenates_test")
+
+