worldveil · JPery · Apr 3, 2018 · Jul 2, 2018 · Jul 9, 2018 · Oct 22, 2018
diff --git a/.gitignore b/.gitignore
@@ -5,3 +5,6 @@ mp3
 *.mp3
 .DS_Store
 *.cnf
+build/
+.idea/
+PyDejavu.egg-info/
diff --git a/dejavu.cnf.SAMPLE b/dejavu.cnf.SAMPLE
@@ -2,7 +2,7 @@
     "database": {
         "host": "127.0.0.1",
         "user": "root",
-        "passwd": "", 
+        "passwd": "",
         "db": "dejavu"
     }
 }
diff --git a/dejavu/__init__.py b/dejavu/__init__.py
@@ -1,10 +1,9 @@
 from dejavu.database import get_database, Database
 import dejavu.decoder as decoder
-import fingerprint
+import dejavu.fingerprint as fingerprint
 import multiprocessing
 import os
-import traceback
-import sys
+import logging
 
 
 class Dejavu(object):
@@ -15,6 +14,8 @@ class Dejavu(object):
     MATCH_TIME = 'match_time'
     OFFSET = 'offset'
     OFFSET_SECS = 'offset_seconds'
+    AUDIO_LENGTH = 'audio_length'
+    RELATIVE_CONFIDENCE = 'relative_confidence'
 
     def __init__(self, config):
         super(Dejavu, self).__init__()
@@ -58,7 +59,7 @@ def fingerprint_directory(self, path, extensions, nprocesses=None):
 
             # don't refingerprint already fingerprinted files
             if decoder.unique_hash(filename) in self.songhashes_set:
-                print "%s already fingerprinted, continuing..." % filename
+                logging.getLogger('dejavu').warning("%s already fingerprinted, continuing...", filename)
                 continue
 
             filenames_to_fingerprint.append(filename)
@@ -74,22 +75,21 @@ def fingerprint_directory(self, path, extensions, nprocesses=None):
         # Loop till we have all of them
         while True:
             try:
-                song_name, hashes, file_hash = iterator.next()
+                song_name, hashes, file_hash, audio_length = iterator.next()
             except multiprocessing.TimeoutError:
                 continue
             except StopIteration:
                 break
             except:
-                print("Failed fingerprinting")
-                # Print traceback because we can't reraise it here
-                traceback.print_exc(file=sys.stdout)
+                logging.getLogger('dejavu').exception("Failed fingerprinting")
             else:
-                sid = self.db.insert_song(song_name, file_hash)
+                logging.getLogger('dejavu').debug("Inserting " + song_name + " in database")
+                sid = self.db.insert_song(song_name, file_hash, audio_length)
 
-                self.db.insert_hashes(sid, hashes)
+                self.db.insert_hashes(sid, set([(x[0], int(x[1])) for x in hashes]))
                 self.db.set_song_fingerprinted(sid)
                 self.get_fingerprinted_songs()
-
+                logging.getLogger('dejavu').info(song_name + " inserted in database")
         pool.close()
         pool.join()
 
@@ -99,24 +99,31 @@ def fingerprint_file(self, filepath, song_name=None):
         song_name = song_name or songname
         # don't refingerprint already fingerprinted files
         if song_hash in self.songhashes_set:
-            print "%s already fingerprinted, continuing..." % song_name
+            logging.getLogger('dejavu').warning("%s already fingerprinted, continuing...", song_name)
         else:
-            song_name, hashes, file_hash = _fingerprint_worker(
+            song_name, hashes, file_hash, audio_length = _fingerprint_worker(
                 filepath,
                 self.limit,
                 song_name=song_name
             )
-            sid = self.db.insert_song(song_name, file_hash)
+            logging.getLogger('dejavu').debug("Inserting " + song_name + " in database")
+            sid = self.db.insert_song(song_name, file_hash, audio_length)
 
-            self.db.insert_hashes(sid, hashes)
+            self.db.insert_hashes(sid, set([(x[0], int(x[1])) for x in hashes]))
             self.db.set_song_fingerprinted(sid)
             self.get_fingerprinted_songs()
+            logging.getLogger('dejavu').info(song_name + " inserted in database")
 
     def find_matches(self, samples, Fs=fingerprint.DEFAULT_FS):
         hashes = fingerprint.fingerprint(samples, Fs=Fs)
-        return self.db.return_matches(hashes)
-
-    def align_matches(self, matches):
+        mapper = {}
+        total_hashes = 0
+        for hash, offset in hashes:
+            mapper[hash.upper()[:fingerprint.FINGERPRINT_REDUCTION]] = offset
+            total_hashes += 1
+        return (self.db.return_matches(mapper), total_hashes)
+
+    def align_matches(self, matches, total_hashes):
         """
             Finds hash matches that align in time with other matches and finds
             consensus about which hashes are "true" signal from the audio.
@@ -157,6 +164,8 @@ def align_matches(self, matches):
             Dejavu.SONG_ID : song_id,
             Dejavu.SONG_NAME : songname,
             Dejavu.CONFIDENCE : largest_count,
+            Dejavu.AUDIO_LENGTH : song.get(Database.AUDIO_LENGTH, None),
+            Dejavu.RELATIVE_CONFIDENCE : (largest_count*100)/float(total_hashes),
             Dejavu.OFFSET : int(largest),
             Dejavu.OFFSET_SECS : nseconds,
             Database.FIELD_FILE_SHA1 : song.get(Database.FIELD_FILE_SHA1, None),}
@@ -177,21 +186,17 @@ def _fingerprint_worker(filename, limit=None, song_name=None):
 
     songname, extension = os.path.splitext(os.path.basename(filename))
     song_name = song_name or songname
-    channels, Fs, file_hash = decoder.read(filename, limit)
+    channels, Fs, file_hash, audio_length = decoder.read(filename, limit)
     result = set()
     channel_amount = len(channels)
 
     for channeln, channel in enumerate(channels):
-        # TODO: Remove prints or change them into optional logging.
-        print("Fingerprinting channel %d/%d for %s" % (channeln + 1,
-                                                       channel_amount,
-                                                       filename))
+        logging.getLogger('dejavu').info("Fingerprinting channel %d/%d for %s" % (channeln + 1, channel_amount, filename))
         hashes = fingerprint.fingerprint(channel, Fs=Fs)
-        print("Finished channel %d/%d for %s" % (channeln + 1, channel_amount,
-                                                 filename))
+        logging.getLogger('dejavu').debug("Finished channel %d/%d for %s" % (channeln + 1, channel_amount, filename))
         result |= set(hashes)
 
-    return song_name, result, file_hash
+    return song_name, result, file_hash, audio_length
 
 
 def chunkify(lst, n):

diff --git a/dejavu/database.py b/dejavu/database.py
@@ -10,6 +10,7 @@ class Database(object):
     FIELD_SONGNAME = 'song_name'
     FIELD_OFFSET = 'offset'
     FIELD_HASH = 'hash'
+    AUDIO_LENGTH = 'audio_length'
 
     # Name of your Database subclass, this is used in configuration
     # to refer to your class

diff --git a/dejavu/database_sql.py b/dejavu/database_sql.py
@@ -1,12 +1,22 @@
 from __future__ import absolute_import
-from itertools import izip_longest
-import Queue
 
-import MySQLdb as mysql
-from MySQLdb.cursors import DictCursor
+try:
+    import queue
+    from itertools import zip_longest
+except ImportError:
+    import Queue as queue
+    from itertools import izip_longest as zip_longest
+import math
+
+import pymysql as mysql
+from pymysql.cursors import DictCursor
 
 from dejavu.database import Database
+from dejavu.fingerprint import FINGERPRINT_REDUCTION
+
+from multiprocessing import cpu_count
 
+from itertools import chain
 
 class SQLDatabase(Database):
     """
@@ -56,14 +66,14 @@ class SQLDatabase(Database):
     # creates
     CREATE_FINGERPRINTS_TABLE = """
         CREATE TABLE IF NOT EXISTS `%s` (
-             `%s` binary(10) not null,
+             `%s` binary (%s) not null,
              `%s` mediumint unsigned not null,
              `%s` int unsigned not null,
          INDEX (%s),
          UNIQUE KEY `unique_constraint` (%s, %s, %s),
          FOREIGN KEY (%s) REFERENCES %s(%s) ON DELETE CASCADE
     ) ENGINE=INNODB;""" % (
-        FINGERPRINTS_TABLENAME, Database.FIELD_HASH,
+        FINGERPRINTS_TABLENAME, Database.FIELD_HASH, int(math.ceil(FINGERPRINT_REDUCTION / 2.)),
         Database.FIELD_SONG_ID, Database.FIELD_OFFSET, Database.FIELD_HASH,
         Database.FIELD_SONG_ID, Database.FIELD_OFFSET, Database.FIELD_HASH,
         Database.FIELD_SONG_ID, SONGS_TABLENAME, Database.FIELD_SONG_ID
@@ -75,11 +85,12 @@ class SQLDatabase(Database):
             `%s` varchar(250) not null,
             `%s` tinyint default 0,
             `%s` binary(20) not null,
+            `%s` float,
         PRIMARY KEY (`%s`),
         UNIQUE KEY `%s` (`%s`)
     ) ENGINE=INNODB;""" % (
         SONGS_TABLENAME, Database.FIELD_SONG_ID, Database.FIELD_SONGNAME, FIELD_FINGERPRINTED,
-        Database.FIELD_FILE_SHA1,
+        Database.FIELD_FILE_SHA1, Database.AUDIO_LENGTH,
         Database.FIELD_SONG_ID, Database.FIELD_SONG_ID, Database.FIELD_SONG_ID,
     )
 
@@ -89,8 +100,8 @@ class SQLDatabase(Database):
             (UNHEX(%%s), %%s, %%s);
     """ % (FINGERPRINTS_TABLENAME, Database.FIELD_HASH, Database.FIELD_SONG_ID, Database.FIELD_OFFSET)
 
-    INSERT_SONG = "INSERT INTO %s (%s, %s) values (%%s, UNHEX(%%s));" % (
-        SONGS_TABLENAME, Database.FIELD_SONGNAME, Database.FIELD_FILE_SHA1)
+    INSERT_SONG = "INSERT INTO %s (%s, %s, %s) values (%%s, UNHEX(%%s), %%s);" % (
+        SONGS_TABLENAME, Database.FIELD_SONGNAME, Database.FIELD_FILE_SHA1, Database.AUDIO_LENGTH)
 
     # selects
     SELECT = """
@@ -107,8 +118,8 @@ class SQLDatabase(Database):
     """ % (Database.FIELD_SONG_ID, Database.FIELD_OFFSET, FINGERPRINTS_TABLENAME)
 
     SELECT_SONG = """
-        SELECT %s, HEX(%s) as %s FROM %s WHERE %s = %%s;
-    """ % (Database.FIELD_SONGNAME, Database.FIELD_FILE_SHA1, Database.FIELD_FILE_SHA1, SONGS_TABLENAME, Database.FIELD_SONG_ID)
+        SELECT %s, HEX(%s) as %s, %s FROM %s WHERE %s = %%s;
+    """ % (Database.FIELD_SONGNAME, Database.FIELD_FILE_SHA1, Database.FIELD_FILE_SHA1, Database.AUDIO_LENGTH, SONGS_TABLENAME, Database.FIELD_SONG_ID)
 
     SELECT_NUM_FINGERPRINTS = """
         SELECT COUNT(*) as n FROM %s
@@ -234,12 +245,12 @@ def insert(self, hash, sid, offset):
         with self.cursor() as cur:
             cur.execute(self.INSERT_FINGERPRINT, (hash, sid, offset))
 
-    def insert_song(self, songname, file_hash):
+    def insert_song(self, songname, file_hash, audio_length):
         """
         Inserts song in the database and returns the ID of the inserted record.
         """
         with self.cursor() as cur:
-            cur.execute(self.INSERT_SONG, (songname, file_hash))
+            cur.execute(self.INSERT_SONG, (songname, file_hash, audio_length))
             return cur.lastrowid
 
     def query(self, hash):
@@ -272,34 +283,39 @@ def insert_hashes(self, sid, hashes):
         for hash, offset in hashes:
             values.append((hash, sid, offset))
 
+        base_query = "INSERT IGNORE INTO %s (%s, %s, %s) values " % (self.FINGERPRINTS_TABLENAME, Database.FIELD_HASH, Database.FIELD_SONG_ID, Database.FIELD_OFFSET)
         with self.cursor() as cur:
+            values.sort(key=lambda tup: tup[0])
+            cur.execute("START TRANSACTION;")
             for split_values in grouper(values, 1000):
-                cur.executemany(self.INSERT_FINGERPRINT, split_values)
+                split_values = list(split_values)
+                values2tuple = tuple(chain.from_iterable(split_values))
+                query = base_query + ', '.join(['(UNHEX(%s), %s, %s)'] * len(split_values))
+                query += ";"
+                cur.execute(query, values2tuple)
+            cur.execute("COMMIT;")
+
+
 
-    def return_matches(self, hashes):
+    def return_matches(self, mapper):
         """
         Return the (song_id, offset_diff) tuples associated with
         a list of (sha1, sample_offset) values.
         """
         # Create a dictionary of hash => offset pairs for later lookups
-        mapper = {}
-        for hash, offset in hashes:
-            mapper[hash.upper()] = offset
 
         # Get an iteratable of all the hashes we need
-        values = mapper.keys()
+        values = list(mapper.keys())
 
         with self.cursor() as cur:
-            for split_values in grouper(values, 1000):
-                # Create our IN part of the query
-                query = self.SELECT_MULTIPLE
-                query = query % ', '.join(['UNHEX(%s)'] * len(split_values))
+            # Create our IN part of the query
+            query = self.SELECT_MULTIPLE
+            query = query % ', '.join(['UNHEX(%s)'] * len(values))
 
-                cur.execute(query, split_values)
+            cur.execute(query, values)
 
-                for hash, sid, offset in cur:
-                    # (sid, db_offset - song_sampled_offset)
-                    yield (sid, offset - mapper[hash])
+            for hash, sid, offset in cur:
+                # (sid, db_offset - song_sampled_offset)
+                yield (sid, offset - mapper[hash])
 
     def __getstate__(self):
         return (self._options,)
@@ -312,7 +328,7 @@ def __setstate__(self, state):
 def grouper(iterable, n, fillvalue=None):
     args = [iter(iterable)] * n
     return (filter(None, values) for values
-            in izip_longest(fillvalue=fillvalue, *args))
+            in zip_longest(fillvalue=fillvalue, *args))
 
 
 def cursor_factory(**factory_options):
@@ -333,14 +349,14 @@ class Cursor(object):
         cur.execute(query)
     ```
     """
-    _cache = Queue.Queue(maxsize=5)
+    _cache = queue.Queue(maxsize=cpu_count())
 
     def __init__(self, cursor_type=mysql.cursors.Cursor, **options):
         super(Cursor, self).__init__()
 
         try:
             conn = self._cache.get_nowait()
-        except Queue.Empty:
+        except queue.Empty:
             conn = mysql.connect(**options)
         else:
             # Ping the connection before using it from the cache.
@@ -352,7 +368,7 @@ def __init__(self, cursor_type=mysql.cursors.Cursor, **options):
 
     @classmethod
     def clear_cache(cls):
-        cls._cache = Queue.Queue(maxsize=5)
+        cls._cache = queue.Queue(maxsize=cpu_count())
 
     def __enter__(self):
         self.cursor = self.conn.cursor(self.cursor_type)
@@ -369,5 +385,5 @@ def __exit__(self, extype, exvalue, traceback):
         # Put it back on the queue
         try:
             self._cache.put_nowait(self.conn)
-        except Queue.Full:
+        except queue.Full:
             self.conn.close()
diff --git a/dejavu/decoder.py b/dejavu/decoder.py
@@ -3,8 +3,12 @@
 import numpy as np
 from pydub import AudioSegment
 from pydub.utils import audioop
-import wavio
+import dejavu.wavio as wavio
 from hashlib import sha1
+from sys import version
+
+if int(version[0]) > 2:
+    xrange = range
 
 def unique_hash(filepath, blocksize=2**20):
     """ Small function to generate a hash to uniquely generate
@@ -72,8 +76,7 @@ def read(filename, limit=None):
         channels = []
         for chn in audiofile:
             channels.append(chn)
-
-    return channels, audiofile.frame_rate, unique_hash(filename)
+    return channels, audiofile.frame_rate, unique_hash(filename), float(len(audiofile))/1000.0
 
 
 def path_to_songname(path):
-Original file line number
+Diff line change
@@ Expand Up / @@ -5,3 +5,6 @@ mp3 @@
     *.mp3
     .DS_Store
     *.cnf
+    build/
+    .idea/
+    PyDejavu.egg-info/