From 0a0df7e5fb9fc6da63eccbcb66850f8e3cc071f0 Mon Sep 17 00:00:00 2001
From: "Iskren.Stanislavov" <iskren.jgd@gmail.com>
Date: Mon, 20 Apr 2015 17:55:06 +0300
Subject: [PATCH 1/2] port branch

---
 INSTALLATION.md                  |  1 +
 dejavu/__init__.py               | 75 ++++++++++++++++++++++++++++++--
 dejavu/decoder.py                | 10 +++++
 requirements.txt                 |  1 +
 setup.py                         |  2 +-
 test_fingerprint_by_splitting.py | 48 ++++++++++++++++++++
 6 files changed, 132 insertions(+), 5 deletions(-)
 create mode 100644 test_fingerprint_by_splitting.py

diff --git a/INSTALLATION.md b/INSTALLATION.md
index 34161ac2..82c99e1f 100644
--- a/INSTALLATION.md
+++ b/INSTALLATION.md
@@ -21,6 +21,7 @@ Install the dependencies:
     sudo yum install numpy scipy python-matplotlib ffmpeg portaudio-devel
     pip install PyAudio
     pip install pydub
+    pip install audioread
     
 Now setup virtualenv ([howto?](http://www.pythoncentral.io/how-to-install-virtualenv-python/)):
 
diff --git a/dejavu/__init__.py b/dejavu/__init__.py
index 4f6e6e8a..c3ce582a 100755
--- a/dejavu/__init__.py
+++ b/dejavu/__init__.py
@@ -6,6 +6,24 @@
 import traceback
 import sys
 
+import shutil
+import subprocess
+import os.path
+from dejavu.decoder import get_duration
+
+def assure_path_exists(path):  # Create directory `path` unless it already is a directory.
+    if not os.path.isdir(path):
+        os.makedirs(path)
+
+class SplitError(Exception):  # Raised when ffmpeg exits non-zero while cutting a slice out of a file.
+    def __init__(self, file_path, output_file, error_code):
+        Exception.__init__(self, file_path, output_file, error_code)  # populate .args so the error pickles and reprs usefully
+        self.file_path = file_path      # input file that was being split
+        self.error_code = error_code    # exit status returned by ffmpeg
+        self.output_file = output_file  # slice that ffmpeg was asked to write
+
+    def __str__(self):
+        return "Splitting of file({0}) failed to ({1}). ffmpeg returned error code: {2}".format(self.file_path, self.output_file, self.error_code)
 
 class Dejavu(object):
 
@@ -16,6 +34,11 @@ class Dejavu(object):
     OFFSET = 'offset'
     OFFSET_SECS = 'offset_seconds'
 
+    SPLIT_DIR = "split_dir"
+    SLICE_LIMIT_WHEN_SPLITTING = 3 # in minutes
+    LIMIT_CPU_CORES_FOR_SPLITS = 3
+    OVERWRITE_TEMP_FILES_WHEN_SPLITING = 1
+
     def __init__(self, config):
         super(Dejavu, self).__init__()
 
@@ -42,7 +65,7 @@ def get_fingerprinted_songs(self):
             song_hash = song[Database.FIELD_FILE_SHA1]
             self.songhashes_set.add(song_hash)
 
-    def fingerprint_directory(self, path, extensions, nprocesses=None):
+    def fingerprint_directory(self, path, extensions, nprocesses=None, treat_as_split=False, song_name_for_the_split=""):
         # Try to use the maximum amount of processes if not given.
         try:
             nprocesses = nprocesses or multiprocessing.cpu_count()
@@ -57,7 +80,7 @@ def fingerprint_directory(self, path, extensions, nprocesses=None):
         for filename, _ in decoder.find_files(path, extensions):
 
             # don't refingerprint already fingerprinted files
-            if decoder.unique_hash(filename) in self.songhashes_set:
+            if decoder.path_to_songname(filename) in self.songnames_set:
                 print "%s already fingerprinted, continuing..." % filename
                 continue
 
@@ -70,6 +93,8 @@ def fingerprint_directory(self, path, extensions, nprocesses=None):
         # Send off our tasks
         iterator = pool.imap_unordered(_fingerprint_worker,
                                        worker_input)
+        if treat_as_split and song_name_for_the_split:
+            sid = self.db.insert_song(song_name_for_the_split, file_hash)
 
         # Loop till we have all of them
         while True:
@@ -84,8 +109,8 @@ def fingerprint_directory(self, path, extensions, nprocesses=None):
                 # Print traceback because we can't reraise it here
                 traceback.print_exc(file=sys.stdout)
             else:
-                sid = self.db.insert_song(song_name, file_hash)
-
+                if not treat_as_split:
+                    sid = self.db.insert_song(song_name, file_hash)
                 self.db.insert_hashes(sid, hashes)
                 self.db.set_song_fingerprinted(sid)
                 self.get_fingerprinted_songs()
@@ -112,6 +137,48 @@ def fingerprint_file(self, filepath, song_name=None):
             self.db.set_song_fingerprinted(sid)
             self.get_fingerprinted_songs()
 
+    def fingerprint_with_duration_check(self, input_file, song_name=None):  # fingerprint input_file, slicing long files with ffmpeg first
+        duration = get_duration(input_file)  # seconds
+        split_length = self.SLICE_LIMIT_WHEN_SPLITTING * 60  # slice length, minutes -> seconds
+        if duration < split_length:  # short enough to fingerprint in one piece
+            return self.fingerprint_file(input_file, song_name)  # keep the caller-supplied name
+        songname, extension = os.path.splitext(os.path.basename(input_file))
+        song_name = song_name or songname
+        # don't refingerprint already fingerprinted files
+        if song_name in self.songnames_set:
+            print "%s already fingerprinted, continuing..." % song_name
+            return
+        file_directory = os.path.dirname(input_file)
+        output_path = os.path.join(file_directory, self.SPLIT_DIR, song_name)  # slices go to <input dir>/split_dir/<song_name>
+        assure_path_exists(output_path)
+        start_offset = 0
+        end_offset = split_length
+        retcode = 0
+        sid = self.db.insert_song(song_name)
+        while start_offset < duration:
+            output_file = os.path.join(output_path, "start_sec{0}_end_sec{1}{2}".format(start_offset, end_offset, extension))
+            convertion_command = [ 'ffmpeg',
+                                    '-i', input_file,
+                                    "-acodec", "copy",  # copy the audio stream as-is instead of re-encoding
+                                    ["-n","-y"][self.OVERWRITE_TEMP_FILES_WHEN_SPLITING],  # 1 selects -y (overwrite), 0 selects -n (never overwrite)
+                                    "-vn",  # Drop any video streams if there are any
+                                    '-ss', str(start_offset),
+                                    '-t', str(split_length),
+                                    output_file]
+            with open(os.devnull, 'w') as devnull:  # writable sink for ffmpeg's stderr, closed after each call
+                retcode = subprocess.call(convertion_command, stderr=devnull)
+            if retcode != 0:
+                raise SplitError(input_file, output_file, retcode)
+            start_offset += split_length
+            end_offset = min(end_offset + split_length, duration)  # end_offset only labels the slice file name
+
+        self.db.set_song_fingerprinted(sid)  # NOTE(review): song is marked fingerprinted before the slices' hashes are inserted below
+        self.get_fingerprinted_songs()
+        self.fingerprint_directory(output_path, [extension],
+            nprocesses=self.LIMIT_CPU_CORES_FOR_SPLITS,
+            treat_as_split=True, song_name_for_the_split=song_name)
+        shutil.rmtree(output_path)
+
     def find_matches(self, samples, Fs=fingerprint.DEFAULT_FS):
         hashes = fingerprint.fingerprint(samples, Fs=Fs)
         return self.db.return_matches(hashes)
diff --git a/dejavu/decoder.py b/dejavu/decoder.py
index 04aa39f4..a0df001c 100755
--- a/dejavu/decoder.py
+++ b/dejavu/decoder.py
@@ -6,6 +6,16 @@
 import wavio
 from hashlib import sha1
 
+# https://github.com/sampsyo/audioread
+import audioread
+
+def get_duration(file_path):
+    """Return the duration of the audio file at file_path, as reported
+    by audioread (seconds)."""
+    # Leaving the with-block closes the file; no explicit close() needed.
+    with audioread.audio_open(file_path) as f:
+        return f.duration
+
 def unique_hash(filepath, blocksize=2**20):
     """ Small function to generate a hash to uniquely generate
     a file. Inspired by MD5 version here:
diff --git a/requirements.txt b/requirements.txt
index 9478f734..a0e3a808 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,6 +3,7 @@
 ### BEGIN ###
 pydub>=0.9.4
 PyAudio>=0.2.7
+audioread>=1.2.1
 numpy>=1.8.2
 scipy>=0.12.1
 matplotlib>=1.3.1
diff --git a/setup.py b/setup.py
index 8312d1d5..01240099 100644
--- a/setup.py
+++ b/setup.py
@@ -42,7 +42,7 @@ def parse_requirements(requirements):
     author_email='will.drevo@gmail.com',
     maintainer="Will Drevo",
     maintainer_email="will.drevo@gmail.com",
-    url='http://github.com/tuxdna/dejavu',
+    url='http://github.com/worldveil/dejavu',
     license='MIT License',
     include_package_data=True,
     packages=find_packages(),
diff --git a/test_fingerprint_by_splitting.py b/test_fingerprint_by_splitting.py
new file mode 100644
index 00000000..f825364d
--- /dev/null
+++ b/test_fingerprint_by_splitting.py
@@ -0,0 +1,48 @@
+from dejavu import Dejavu
+import warnings
+import json
+import os, subprocess
+warnings.filterwarnings("ignore")
+
+# load config from a JSON file (or anything outputting a python dictionary)
+with open("dejavu.cnf.SAMPLE") as f:
+    config = json.load(f)
+
+class ConcatError(Exception):  # Raised when the ffmpeg concat run exits non-zero.
+    def __init__(self, list_file, output_file, error_code):
+        Exception.__init__(self, list_file, output_file, error_code)  # populate .args so the error pickles and reprs usefully
+        self.list_file = list_file      # concat list passed to ffmpeg with -i
+        self.error_code = error_code    # exit status returned by ffmpeg
+        self.output_file = output_file  # concatenated file ffmpeg was asked to write
+
+    def __str__(self):
+        return "Problem with list file({0}). Failed to create({1}). ffmpeg returned error code: {2}".format(self.list_file, self.output_file, self.error_code)
+
+
+if __name__ == '__main__':
+    # Concatenate the files listed in mp3/concatenation_list.txt into one
+    # long mp3, expect plain fingerprint_file() to fail on it, then
+    # fingerprint it through the splitting path.
+    list_file = "mp3/concatenation_list.txt"
+    long_song = "mp3/concatenated.mp3"
+    # Argument list: a command string without shell=True is not split on POSIX.
+    concat_mp3_file_for_test = ["ffmpeg", "-f", "concat", "-i", list_file, "-y", "-c", "copy", long_song]
+    with open(os.devnull, "w") as devnull:  # writable sink for ffmpeg's stderr
+        retcode = subprocess.call(concat_mp3_file_for_test, stderr=devnull)
+    if retcode != 0:
+        raise ConcatError(list_file, long_song, retcode)
+
+    # create a Dejavu instance
+    djv = Dejavu(config)
+
+    try:
+        djv.fingerprint_file(long_song)
+    except Exception as err:
+        err = str(err) or "Memory Error" # Memory Errors does not have a string representation (as tested in Windows)
+        print "Exception raised during common fingerprint_file():({0}) so will split the file".format(err)
+    else:
+        raise "This file was successfully ingerprinted and splitting was not needed"
+
+    djv.fingerprint_with_duration_check(long_song, song_name="Concatenates12345")
+
+

From d15a90ab42fcac408d9affc14bb63586112b655e Mon Sep 17 00:00:00 2001
From: "Iskren.Stanislavov" <iskren.jgd@gmail.com>
Date: Wed, 5 Aug 2015 20:19:11 +0300
Subject: [PATCH 2/2] fix to match with the master branch

---
 dejavu/__init__.py               | 16 +++++++++-------
 test_fingerprint_by_splitting.py |  2 +-
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/dejavu/__init__.py b/dejavu/__init__.py
index c3ce582a..84116ecc 100755
--- a/dejavu/__init__.py
+++ b/dejavu/__init__.py
@@ -65,7 +65,7 @@ def get_fingerprinted_songs(self):
             song_hash = song[Database.FIELD_FILE_SHA1]
             self.songhashes_set.add(song_hash)
 
-    def fingerprint_directory(self, path, extensions, nprocesses=None, treat_as_split=False, song_name_for_the_split=""):
+    def fingerprint_directory(self, path, extensions, nprocesses=None, treat_as_split=False, song_splitted_sid=""):
         # Try to use the maximum amount of processes if not given.
         try:
             nprocesses = nprocesses or multiprocessing.cpu_count()
@@ -80,7 +80,7 @@ def fingerprint_directory(self, path, extensions, nprocesses=None, treat_as_spli
         for filename, _ in decoder.find_files(path, extensions):
 
             # don't refingerprint already fingerprinted files
-            if decoder.path_to_songname(filename) in self.songnames_set:
+            if decoder.unique_hash(filename) in self.songhashes_set:  # songhashes_set holds file SHA1s, so compare by hash
                 print "%s already fingerprinted, continuing..." % filename
                 continue
 
@@ -93,8 +93,6 @@ def fingerprint_directory(self, path, extensions, nprocesses=None, treat_as_spli
         # Send off our tasks
         iterator = pool.imap_unordered(_fingerprint_worker,
                                        worker_input)
-        if treat_as_split and song_name_for_the_split:
-            sid = self.db.insert_song(song_name_for_the_split, file_hash)
 
         # Loop till we have all of them
         while True:
@@ -111,6 +109,8 @@ def fingerprint_directory(self, path, extensions, nprocesses=None, treat_as_spli
             else:
                 if not treat_as_split:
                     sid = self.db.insert_song(song_name, file_hash)
+                else:
+                    sid = song_splitted_sid
                 self.db.insert_hashes(sid, hashes)
                 self.db.set_song_fingerprinted(sid)
                 self.get_fingerprinted_songs()
@@ -145,7 +145,7 @@ def fingerprint_with_duration_check(self, input_file, song_name=None):
         songname, extension = os.path.splitext(os.path.basename(input_file))
         song_name = song_name or songname
         # don't refingerprint already fingerprinted files
-        if song_name in self.songnames_set:
+        if decoder.unique_hash(input_file) in self.songhashes_set:  # songhashes_set holds file SHA1s, not song names
             print "%s already fingerprinted, continuing..." % song_name
             return
         file_directory = os.path.dirname(input_file)
@@ -154,7 +154,9 @@ def fingerprint_with_duration_check(self, input_file, song_name=None):
         start_offset = 0
         end_offset = split_length
         retcode = 0
-        sid = self.db.insert_song(song_name)
+        song_hash = decoder.unique_hash(input_file)
+
+        sid = self.db.insert_song(song_name, song_hash)
         while start_offset < duration:
             output_file = os.path.join(output_path, "start_sec{0}_end_sec{1}{2}".format(start_offset, end_offset, extension))
             convertion_command = [ 'ffmpeg',
@@ -176,7 +178,7 @@ def fingerprint_with_duration_check(self, input_file, song_name=None):
         self.get_fingerprinted_songs()
         self.fingerprint_directory(output_path, [extension],
             nprocesses=self.LIMIT_CPU_CORES_FOR_SPLITS,
-            treat_as_split=True, song_name_for_the_split=song_name)
+            treat_as_split=True, song_splitted_sid=sid)
         shutil.rmtree(output_path)
 
     def find_matches(self, samples, Fs=fingerprint.DEFAULT_FS):
diff --git a/test_fingerprint_by_splitting.py b/test_fingerprint_by_splitting.py
index f825364d..19a5f307 100644
--- a/test_fingerprint_by_splitting.py
+++ b/test_fingerprint_by_splitting.py
@@ -43,6 +43,6 @@ def __str__(self):
     else:
-        raise "This file was successfully ingerprinted and splitting was not needed"
+        raise AssertionError("This file was successfully fingerprinted and splitting was not needed")
 
-    djv.fingerprint_with_duration_check(long_song, song_name="Concatenates12345")
+    djv.fingerprint_with_duration_check(long_song, song_name="Concatenates_test")