Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Split fingerprinting #87

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions INSTALLATION.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ Install the dependencies:
sudo yum install numpy scipy python-matplotlib ffmpeg portaudio-devel
pip install PyAudio
pip install pydub
pip install audioread

Now setup virtualenv ([howto?](http://www.pythoncentral.io/how-to-install-virtualenv-python/)):

Expand Down
77 changes: 73 additions & 4 deletions dejavu/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,24 @@
import traceback
import sys

import shutil
import subprocess
import os.path
from dejavu.decoder import get_duration

def assure_path_exists(path):
    """Create directory `path` (including missing parents) if it is absent.

    Calling it for a directory that already exists is a no-op, even when
    another process creates the directory between our attempt and the check.

    Raises:
        OSError: if the directory could not be created, e.g. because `path`
            exists but is not a directory.
    """
    try:
        os.makedirs(path)
    except OSError:
        # Try first, inspect afterwards: a prior isdir() check would race with
        # other processes creating the same directory.
        if not os.path.isdir(path):
            raise

class SplitError(Exception):
    """Raised when ffmpeg fails to cut a slice out of an audio file.

    Attributes:
        file_path: path of the input file that was being split.
        output_file: path of the slice ffmpeg was asked to write.
        error_code: the non-zero exit status returned by ffmpeg.
    """

    def __init__(self, file_path, output_file, error_code):
        # Forward the arguments to Exception so `args` is populated; with an
        # empty `args` the exception cannot be re-created when unpickled
        # (for instance when it crosses a process boundary).
        Exception.__init__(self, file_path, output_file, error_code)
        self.file_path = file_path
        self.error_code = error_code
        self.output_file = output_file

    def __str__(self):
        return "Spliting of file({0}) failed to ({1}). ffmpeg returned error code: {2}".format(
            self.file_path, self.output_file, self.error_code)

class Dejavu(object):

Expand All @@ -16,6 +34,11 @@ class Dejavu(object):
OFFSET = 'offset'
OFFSET_SECS = 'offset_seconds'

SPLIT_DIR = "split_dir"
SLICE_LIMIT_WHEN_SPLITTING = 3 # in minutes
LIMIT_CPU_CORES_FOR_SPLITS = 3
OVERWRITE_TEMP_FILES_WHEN_SPLITING = 1

def __init__(self, config):
super(Dejavu, self).__init__()

Expand All @@ -42,7 +65,7 @@ def get_fingerprinted_songs(self):
song_hash = song[Database.FIELD_FILE_SHA1]
self.songhashes_set.add(song_hash)

def fingerprint_directory(self, path, extensions, nprocesses=None):
def fingerprint_directory(self, path, extensions, nprocesses=None, treat_as_split=False, song_splitted_sid=""):
# Try to use the maximum amount of processes if not given.
try:
nprocesses = nprocesses or multiprocessing.cpu_count()
Expand All @@ -57,7 +80,7 @@ def fingerprint_directory(self, path, extensions, nprocesses=None):
for filename, _ in decoder.find_files(path, extensions):

# don't refingerprint already fingerprinted files
if decoder.unique_hash(filename) in self.songhashes_set:
if decoder.path_to_songname(filename) in self.songhashes_set:
print "%s already fingerprinted, continuing..." % filename
continue

Expand All @@ -84,8 +107,10 @@ def fingerprint_directory(self, path, extensions, nprocesses=None):
# Print traceback because we can't reraise it here
traceback.print_exc(file=sys.stdout)
else:
sid = self.db.insert_song(song_name, file_hash)

if not treat_as_split:
sid = self.db.insert_song(song_name, file_hash)
else:
sid = song_splitted_sid
self.db.insert_hashes(sid, hashes)
self.db.set_song_fingerprinted(sid)
self.get_fingerprinted_songs()
Expand All @@ -112,6 +137,50 @@ def fingerprint_file(self, filepath, song_name=None):
self.db.set_song_fingerprinted(sid)
self.get_fingerprinted_songs()

def fingerprint_with_duration_check(self, input_file, song_name=None):
    """Fingerprint `input_file`, cutting it into slices first when it is long.

    Files shorter than SLICE_LIMIT_WHEN_SPLITTING minutes are passed straight
    to fingerprint_file(). Longer files are cut with ffmpeg into slices of
    that length under "<directory of input_file>/<SPLIT_DIR>/<song_name>";
    the slices are fingerprinted through fingerprint_directory() under a
    single song id, and the temporary slice directory is removed afterwards,
    whether or not the operation succeeded.

    Args:
        input_file: path of the audio file to fingerprint.
        song_name: name to store the song under; defaults to the base name
            of `input_file` without its extension.

    Raises:
        SplitError: if ffmpeg exits with a non-zero status for any slice.
    """
    duration = get_duration(input_file)
    split_length = self.SLICE_LIMIT_WHEN_SPLITTING * 60  # minutes -> seconds
    if duration < split_length:
        # Short enough to handle in one piece; keep the caller's song name.
        return self.fingerprint_file(input_file, song_name)

    base_name, extension = os.path.splitext(os.path.basename(input_file))
    song_name = song_name or base_name

    # don't refingerprint already fingerprinted files
    # (songhashes_set stores file hashes, so compare the hash, not the name)
    song_hash = decoder.unique_hash(input_file)
    if song_hash in self.songhashes_set:
        print("%s already fingerprinted, continuing..." % song_name)
        return

    output_path = os.path.join(os.path.dirname(input_file), self.SPLIT_DIR, song_name)
    assure_path_exists(output_path)
    # "-y" overwrites existing slice files, "-n" refuses to overwrite them
    overwrite_flag = "-y" if self.OVERWRITE_TEMP_FILES_WHEN_SPLITING else "-n"

    try:
        sid = self.db.insert_song(song_name, song_hash)

        # One write-mode handle for all ffmpeg runs, closed when we are done.
        with open(os.devnull, "w") as devnull:
            start_offset = 0
            while start_offset < duration:
                # The last slice ends at the end of the file.
                end_offset = min(start_offset + split_length, duration)
                output_file = os.path.join(
                    output_path,
                    "start_sec{0}_end_sec{1}{2}".format(start_offset, end_offset, extension))
                conversion_command = [
                    'ffmpeg',
                    '-i', input_file,
                    "-acodec", "copy",  # fastest conversion possible: 1:1 copy
                    overwrite_flag,
                    "-vn",  # Drop any video streams if there are any
                    '-ss', str(start_offset),
                    '-t', str(split_length),
                    output_file,
                ]
                retcode = subprocess.call(conversion_command, stderr=devnull)
                if retcode != 0:
                    raise SplitError(input_file, output_file, retcode)
                start_offset += split_length

        # NOTE(review): the song is flagged as fingerprinted before its slices
        # are processed; order kept as in the original -- confirm it is intended.
        self.db.set_song_fingerprinted(sid)
        self.get_fingerprinted_songs()
        self.fingerprint_directory(output_path, [extension],
                                   nprocesses=self.LIMIT_CPU_CORES_FOR_SPLITS,
                                   treat_as_split=True, song_splitted_sid=sid)
    finally:
        # Always clean up the temporary slices, including after a failure.
        shutil.rmtree(output_path)

def find_matches(self, samples, Fs=fingerprint.DEFAULT_FS):
hashes = fingerprint.fingerprint(samples, Fs=Fs)
return self.db.return_matches(hashes)
Expand Down
10 changes: 10 additions & 0 deletions dejavu/decoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,16 @@
import wavio
from hashlib import sha1

# https://github.com/sampsyo/audioread
import audioread

def get_duration(file_path):
    """Return the duration of the audio file at `file_path`.

    The value is the `duration` attribute reported by audioread
    (presumably in seconds; the caller compares it against a limit expressed
    in seconds -- confirm against the audioread documentation).
    """
    # The `with` block closes the file on exit, so no explicit close() call
    # is needed (the original closed it twice).
    with audioread.audio_open(file_path) as audio_file:
        return audio_file.duration

def unique_hash(filepath, blocksize=2**20):
""" Small function to generate a hash to uniquely generate
a file. Inspired by MD5 version here:
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
### BEGIN ###
pydub>=0.9.4
PyAudio>=0.2.7
audioread>=1.2.1
numpy>=1.8.2
scipy>=0.12.1
matplotlib>=1.3.1
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def parse_requirements(requirements):
author_email='[email protected]',
maintainer="Will Drevo",
maintainer_email="[email protected]",
url='http://github.com/tuxdna/dejavu',
url='http://github.com/worldveil/dejavu',
license='MIT License',
include_package_data=True,
packages=find_packages(),
Expand Down
48 changes: 48 additions & 0 deletions test_fingerprint_by_splitting.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from dejavu import Dejavu
import warnings
import json
import os, subprocess
warnings.filterwarnings("ignore")

# Read the sample configuration. Dejavu only needs a python dictionary, so
# any source producing one could replace this JSON file.
with open("dejavu.cnf.SAMPLE") as f:
    config = json.loads(f.read())

class ConcatError(Exception):
    """Raised when ffmpeg fails to build the concatenated test file.

    Attributes:
        list_file: path of the ffmpeg concat list that was used as input.
        output_file: path of the file ffmpeg was asked to create.
        error_code: the non-zero exit status returned by ffmpeg.
    """

    def __init__(self, list_file, output_file, error_code):
        # Forward the arguments to Exception so `args` is populated; with an
        # empty `args` the exception cannot be re-created when unpickled.
        Exception.__init__(self, list_file, output_file, error_code)
        self.list_file = list_file
        self.error_code = error_code
        self.output_file = output_file

    def __str__(self):
        return "Problem with list file({0}). Failed to create({1}). ffmpeg returned error code: {2}".format(
            self.list_file, self.output_file, self.error_code)


if __name__ == '__main__':
    # Concatenates ./mp3/*.mp3 (as listed in the concat list file), checks
    # that fingerprinting the long concatenated file in one piece fails, then
    # fingerprints it by splitting.
    list_file = "mp3/concatenation_list.txt"
    long_song = "mp3/concatenated.mp3"

    # Pass an argument list: without shell=True a single command string is
    # taken as the program name on POSIX and the call fails.
    concat_mp3_file_for_test = ["ffmpeg", "-f", "concat", "-i", list_file,
                                "-y", "-c", "copy", long_song]
    # Open the null device for writing and close it once ffmpeg is done.
    with open(os.devnull, "w") as devnull:
        retcode = subprocess.call(concat_mp3_file_for_test, stderr=devnull)
    if retcode != 0:
        raise ConcatError(list_file, long_song, retcode)

    # create a Dejavu instance
    djv = Dejavu(config)

    try:
        djv.fingerprint_file(long_song)
    except Exception as err:
        # Memory errors have no string representation (as tested on Windows)
        err = str(err) or "Memory Error"
        print("Exception raised during common fingerprint_file():({0}) so will split the file".format(err))
    else:
        # Raising a plain string is a TypeError; raise a real exception.
        raise RuntimeError("This file was successfully fingerprinted and splitting was not needed")

    djv.fingerprint_with_duration_check(long_song, song_name="Concatenates_test")