-
Notifications
You must be signed in to change notification settings - Fork 1
/
utility.py
138 lines (112 loc) · 4.63 KB
/
utility.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import re
import unidecode
import string
import time
from difflib import SequenceMatcher
import requests.exceptions
# Matching thresholds. Both are *maximum* allowed distances, where the
# distance is 1 - similarity ratio (0 == identical strings). A value closer
# to 0 therefore means the strings must match more closely to be accepted.
REQUIRED_ARTIST_SCORE = 0.2
REQUIRED_SONG_SCORE = 0.3

EXCLUDED_GENIUS_TERMS = ["Songs That Reference Drugs"]

# Regex patterns for boilerplate text that should be stripped from lyrics.
# "$BAND$" and "$SONG$" are placeholders that clean_paragraphs substitutes
# with the artist and song names before the pattern is applied.
EXTRANEOUS_TEXT = [
    # TODO[reece]: Do not match to end of line for the translations substitution
    #   Either use a list of languages to match with
    #   Or find or make a pull request to remove translation info from the HTML
    "Translations.+\n",
    r"\[[a-zA-Z]+\]\n",
    "[0-9]+Embed",
    "EmbedShare URLCopyEmbedCopy",
    "Embed$",
    "You might also like",
    r"See $BAND$ Live",
    r"Get tickets as low as \$[0-9]+",
    r"$SONG$ Lyrics",
    "[0-9]+ Contributors?",
]
def clean_paragraphs(paragraphs, artist_name, song_name):
    """Remove extraneous lines of text from paragraphs.

    Every pattern in EXTRANEOUS_TEXT is applied case-insensitively after its
    ``$BAND$`` / ``$SONG$`` placeholders have been replaced with the artist
    and song names. Paragraphs are passed through ``unidecode`` before
    matching, so the returned text is the transliterated form.

    Args:
        paragraphs: Iterable of paragraph strings.
        artist_name: Artist name substituted for ``$BAND$``.
        song_name: Song name substituted for ``$SONG$``.

    Returns:
        A list with one cleaned string per input paragraph, in input order.
    """
    # Transliterate first, escape second: unidecode may turn a non-ASCII
    # character into a regex metacharacter, and that character must still be
    # escaped before the name is spliced into a pattern.
    band = re.escape(unidecode.unidecode(artist_name))
    title = re.escape(unidecode.unidecode(song_name))

    # The substituted patterns do not depend on the paragraph: build them once.
    patterns = [
        re.compile(
            template.replace("$BAND$", band).replace("$SONG$", title),
            flags=re.IGNORECASE,
        )
        for template in EXTRANEOUS_TEXT
    ]

    cleaned = []
    for paragraph in paragraphs:
        text = unidecode.unidecode(paragraph)
        for pattern in patterns:
            text = pattern.sub("", text)
        cleaned.append(text)
    return cleaned
def distance(a: str, b: str):
    """Return how dissimilar two strings are.

    The result is 1 minus the SequenceMatcher similarity ratio, so identical
    strings give 0 and strings with nothing in common give 1.
    """
    similarity = SequenceMatcher(a=a, b=b).ratio()
    return 1 - similarity
def remove_extra(name):
    """Strip parenthesised text and any hyphen suffix from a song name.

    Everything from the first "(" to the last ")" is dropped, then everything
    from the first remaining "-" to the end of the string. If that leaves an
    empty string, the original name is returned unchanged.
    """
    without_parens = re.sub(r"\([\w\W]*\)", "", name)
    without_suffix = re.sub(r"-[\S\s]*", "", without_parens)
    # If the entire name was inside e.g. parens, keep it rather than return "".
    return without_suffix or name
# Characters kept by clean(); built once instead of once per character.
_CLEAN_ALLOWED_CHARS = frozenset(string.ascii_letters + string.digits + " ")


def clean(name):
    """Remove potential discrepancies from the string.

    Steps, in order: drop parenthesised text and hyphen suffixes via
    remove_extra, pass the result through unidecode, keep only ASCII letters,
    digits and spaces, then lower-case and strip surrounding whitespace.

    Args:
        name: The song or artist name to normalize.

    Returns:
        The normalized name; may be an empty string.
    """
    name = remove_extra(name)
    name = unidecode.unidecode(name)  # Remove diacritics
    name = "".join(char for char in name if char in _CLEAN_ALLOWED_CHARS)
    return name.lower().strip()
def match(song: tuple[str, str], other: tuple[str, str], log=None):
    """
    Determine whether a song matches the result.

    Both names are normalized with clean() before comparison. The artists
    must be within REQUIRED_ARTIST_SCORE of each other. The song titles match
    when they are within REQUIRED_SONG_SCORE, or when one non-empty cleaned
    title is contained in the other.

    song: (song_name, artist_name)
    other: (song_name, artist_name)
    log: optional callable taking one message string; called with the reason
        whenever the pair is rejected.

    Returns True on a match, False otherwise.
    Raises ValueError if song or other is not a list or tuple.
    """
    if not isinstance(song, (list, tuple)):
        raise ValueError("Song must be a tuple")
    if not isinstance(other, (list, tuple)):
        raise ValueError("Other must be a tuple")

    # Compare artists first; a wrong artist rules the result out outright.
    artist_name = clean(song[1])
    other_artist = clean(other[1])
    artist_dist = distance(artist_name, other_artist)
    if artist_dist > REQUIRED_ARTIST_SCORE:
        if log:
            # The distance exceeded the threshold, hence ">".
            log(f"{artist_name} != {other_artist}: {artist_dist} > {REQUIRED_ARTIST_SCORE}")
        return False

    song_name = clean(song[0])
    other_name = clean(other[0])
    song_dist = distance(song_name, other_name)
    if (
        song_dist <= REQUIRED_SONG_SCORE
        # Guard against "" which is a substring of every string.
        or (song_name and song_name in other_name)
        or (other_name and other_name in song_name)
    ):
        return True

    if log:
        log(f"{song_name} does not match {other_name}: {song_dist} > {REQUIRED_SONG_SCORE}")
    return False
def get_genius_song(song_name, artist_name, genius, log=None):
    """Get the corresponding song from Genius.

    The search is made up to twice: first with the title as given and, if
    that yields no matching result, once more with the title normalized by
    clean(). Each search is retried on timeouts with a growing sleep.

    Args:
        song_name: Title to search for.
        artist_name: Artist to search for.
        genius: Object providing ``search_song(title, artist)``; the result
            is expected to expose ``title`` and ``artist`` attributes.
        log: Optional callable taking one message string.

    Returns:
        The matching song object, or None when neither search produced a
        result accepted by match().
    """
    song_search = song_name
    song = None
    # attempt 0 uses the title as is, attempt 1 uses the cleaned title.
    # The retry loop below must use its own variable: sharing one name made
    # the attempt check always true, so the cleaned retry never ran.
    for attempt in range(2):
        song = None
        for retry in range(1, 8):
            # Try several more times if there are timeouts, but back off
            try:
                song = genius.search_song(song_search, artist_name)
            except requests.exceptions.Timeout:
                print("Timeout from genius, sleeping")
                time.sleep(retry**2 / 10)
            except Exception as exc:
                # Best effort: any other failure is treated as "not found",
                # but is reported rather than silently discarded.
                if log:
                    log(f"Genius search failed for '{song_search}' by '{artist_name}': {exc}")
                break
            else:
                break
        else:
            print("Too many timeouts, skipping song")

        found = song is not None and match(
            (song_search, artist_name), (song.title, song.artist)
        )
        if found:
            # Only announce success when the cleaned retry was needed.
            if attempt and log:
                log(f"Found match for '{song_search}' by '{artist_name}'")
            break

        if attempt:
            if log:
                log(f"Song '{song_search}' by '{artist_name}' not found on Genius")
            return None

        if log:
            log(f"Song '{song_search}' by '{artist_name}' not found on Genius trying cleaning")
        song_search = clean(song_search)
    return song