-
-
Notifications
You must be signed in to change notification settings - Fork 2
/
main.py
392 lines (309 loc) · 13.9 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
import pinboard
import requests
import typer
from bs4 import BeautifulSoup
from environs import Env
from pathlib import Path
from rich import print
from stop_words import safe_get_stop_words
from titlecase import titlecase
from unidecode import unidecode
from yarl import URL
env = Env()

# Required credentials, read from the environment; Env.str raises if unset.
GITHUB_TOKEN = env.str("GITHUB_TOKEN")
GITHUB_USERNAME = env.str("GITHUB_USERNAME")
PINBOARD_TOKEN = env.str("PINBOARD_TOKEN")

# Words that should never become bookmark tags: a project-specific ignore
# list plus English stop words (from a local file and the stop_words package).
# Set comprehensions replace the redundant set([...]) wrappers.
IGNORE_WORDS = {word.lower() for word in Path("IGNORE_WORDS.txt").read_text().split()}
STOP_WORDS = {word.lower() for word in Path("STOP_WORDS.txt").read_text().split()}
STOP_WORDS.update(safe_get_stop_words("english"))
IGNORE_TAGS = IGNORE_WORDS | STOP_WORDS
def get_dev_to_info_for_url(url):
    """Scrape a dev.to article page for its tags (best effort).

    Returns a dict of the form ``{"tags": [...]}`` with the leading ``#``
    stripped from each tag, or ``{}`` on any network/HTTP/parse failure so
    callers can treat missing metadata as "no extra info".
    """
    try:
        req = requests.get(url, timeout=1.0)
        # Treat HTTP error pages as failures instead of scraping them.
        req.raise_for_status()
        soup = BeautifulSoup(req.text, "html.parser")
        return {
            "tags": [
                tag.text.lstrip("#") for tag in soup.find_all("a", {"class": "tag"})
            ]
        }
    except Exception as e:
        # Deliberate best-effort: log and fall back to "no data" rather than crash.
        print(e)
        return {}
def get_github_info_for_url(url):
    """Fetch repository metadata from the GitHub API for a github.com URL.

    Expects URLs of the form ``https://github.com/<owner>/<repo>[/...]``.
    Returns the repository JSON payload, or ``{}`` on any failure.
    """
    bits = url.replace("https://github.com/", "").split("/")
    owner, repo = bits[0], bits[1]
    api_url = "https://api.github.com/repos/{owner}/{repo}".format(
        owner=owner, repo=repo
    )
    try:
        # The request itself can time out or fail DNS, so it belongs inside
        # the try block too -- previously only .json() was guarded, letting
        # network errors escape despite the "return {} on failure" contract.
        req = requests.get(
            api_url,
            auth=(GITHUB_USERNAME, GITHUB_TOKEN),
            # mercy-preview exposes the "topics" field used by callers.
            headers={"Accept": "application/vnd.github.mercy-preview+json"},
            timeout=1.0,
        )
        return req.json()
    except Exception as e:
        print(e)
        return {}
def normalize_tags(original_tags, ignore_meta_tags=False):
    """Return a cleaned set of tags: lowercased, ASCII-folded, non-empty,
    optionally stripped of "meta" tags (those containing ``:``), and with
    every word in IGNORE_TAGS removed."""
    cleaned = {unidecode(tag.lower()) for tag in original_tags if tag}
    if ignore_meta_tags:
        cleaned = {tag for tag in cleaned if ":" not in tag}
    return cleaned - IGNORE_TAGS
class Bookmarks(object):
    """Maintenance helpers for a Pinboard account.

    Operates on batches of bookmarks fetched via the Pinboard API:
    tag cleanup/augmentation (``fix_tags``), description titlecasing
    (``fix_titlecase``), and repair of malformed saves where the real URL
    ended up in the tags (``remove_dupes``).
    """

    def __init__(self, *, pinboard_token, count=20, start=0, verbose: bool = False):
        # count/start define the default batch window for get_bookmarks().
        self.pinboard_token = pinboard_token
        self.count = count
        self.pinboard = pinboard.Pinboard(pinboard_token)
        self.start = start
        # When False, most per-link diagnostics are suppressed.
        self.verbose = verbose

    """
    TODO:
    Implement a clean() and clean_fieldname() approach to help normalize
    our bookmark model.
    - Store the initial values.
    - Run the clean script.
    - clean
    - clean_fieldname
    - Mark bookmark as modified.
    - If the link changed, delete the old, and replace the url.
    - Save bookmark.
    """

    def get_bookmarks(self, start=None, count=None):
        """Fetch a batch of bookmarks from Pinboard.

        NOTE(review): ``start or self.start`` means an explicit 0 falls back
        to the instance default -- confirm that is intended.
        """
        return self.pinboard.posts.all(
            start=start or self.start, results=count or self.count
        )

    def fix_tags(self, start=None, count=None):
        """Normalize and augment tags (and fill blank descriptions/excerpts)
        for a batch of bookmarks, saving only links that actually changed.

        Each link is processed independently; any per-link failure is caught
        and (optionally) logged so the batch keeps going.
        """
        links = self.get_bookmarks(start=start, count=count)
        for link in links:
            # Set when any field changed, to force a save even if tags didn't.
            dirty = False
            try:
                description = unidecode(link.description)
                titlecase_description = titlecase(description)
                # Also ASCII-folds link.extended in place before comparison.
                extended = link.extended = unidecode(link.extended)
                url = URL(link.url)
                """
                TODO: Add better support for common websites like:
                - dev.to
                - github.com
                - medium.com
                Possible features:
                - more accurate tags
                - check for meta descriptions
                """
                if url.host == "github.com":
                    # Pull repo metadata; topics become candidate tags and
                    # full_name/description replace blank bookmark fields.
                    github = get_github_info_for_url(link.url)
                    github_tags = set(github.get("topics", []))
                    description = github.get("full_name")
                    titlecase_description = titlecase(description)
                    github_description = github.get("description")
                    extended = (
                        "> {0}".format(github_description)
                        if github_description
                        else link.extended
                    )
                    # Github projects should be visible...
                    if not link.shared:
                        link.shared = True
                        dirty = True
                    if len(link.description) == 0 or link.description == "github.com":
                        link.description = titlecase_description
                        dirty = True
                    if len(link.extended) == 0:
                        link.extended = extended
                        dirty = True
                # dev.to articles should be shared by default...
                elif url.host == "dev.to":
                    # NOTE(review): variable is named github_tags but holds
                    # dev.to tags here -- it feeds the same tag union below.
                    devto_data = get_dev_to_info_for_url(link.url)
                    github_tags = set(devto_data.get("tags", []))
                    if not link.shared:
                        link.shared = True
                        dirty = True
                    if "- DEV" in link.description:
                        # Strip the "- DEV Community"-style title suffix.
                        link.description = (link.description.split("- DEV")[0]).strip()
                        dirty = True
                    if not github_tags.issubset(set(link.tags)):
                        dirty = True
                else:
                    github_tags = set([])
                # A one-word description is treated as effectively blank;
                # try to recover a title from the page itself.
                if len(description.split(" ")) == 1 and url.host != "github.com":
                    if self.verbose:
                        print("[red]description is blank[/red]")
                    try:
                        doc = requests.get(link.url, timeout=1.0)
                        soup = BeautifulSoup(doc.text, "html.parser")
                        description = soup.find("title").text
                        link.description = description
                        dirty = True
                    except (Exception, requests.exceptions.Timeout) as e:
                        if self.verbose:
                            print(f"[red]{e}[/red]")
                if len(link.extended) == 0:
                    # No excerpt: try the page's meta description tags.
                    if self.verbose:
                        print("[red]extended is blank[/red]")
                    try:
                        doc = requests.get(link.url, timeout=1.0)
                        soup = BeautifulSoup(doc.text, "html.parser")
                        try:
                            content = ""
                            if soup.find("meta", {"name": "description"}):
                                content = soup.find(
                                    "meta", {"name": "description"}
                                ).get("content")
                            # NOTE(review): this repeats the exact same
                            # {"name": "description"} lookup and reads the
                            # "value" attribute; when the tag exists but has
                            # no "value", this overwrites the content found
                            # above with None. Likely a different meta tag
                            # was intended -- confirm and fix.
                            if soup.find("meta", {"name": "description"}):
                                content = soup.find(
                                    "meta", {"name": "description"}
                                ).get("value")
                            # og:description, when present, wins.
                            if soup.find("meta", {"property": "og:description"}):
                                content = soup.find(
                                    "meta", {"property": "og:description"}
                                ).get("content")
                            if content:
                                # TODO: Split this out by the first paragraph
                                link.extended = f"> {content.strip()}"
                                typer.echo(link.extended)
                                dirty = True
                        except AttributeError as e:
                            if self.verbose:
                                print(e)
                            # try:
                            #     content = soup.find('meta', {'property': 'og:description'}).get('content')
                            #     link.extended = f'> {content}'
                            #     typer.echo(link.extended)
                            #     dirty = True
                            # except AttributeError:
                            #     pass
                            pass
                    except (Exception, requests.exceptions.Timeout) as e:
                        if self.verbose:
                            print(f"[red]{e}[/red]")
                        # link.extended = titlecase_description
                        # dirty = True
                # Sets: union my normalized tags with Pinboard's suggestions
                # and the site-specific tags gathered above.
                tags = set(normalize_tags(link.tags))
                suggested = self.pinboard.posts.suggest(url=link.url)
                popular, recommended = suggested
                popular = normalize_tags(popular.get("popular"), ignore_meta_tags=True)
                recommended = normalize_tags(
                    recommended.get("recommended"), ignore_meta_tags=True
                )
                new_tags = list(tags | popular | recommended | github_tags)
                # Save when the tag set changed size or any field was touched.
                if len(new_tags) != len(tags) or dirty:
                    if self.verbose:
                        typer.echo("saving... {}".format(link.url))
                        typer.echo("description: {}".format(titlecase_description))
                        if extended:
                            typer.echo("extended: {}".format(extended))
                        typer.echo("my tags: {}".format(tags))
                        typer.echo("updating to: {}".format(new_tags))
                    try:
                        link.tags = new_tags
                        link.save()
                    except UnicodeEncodeError:
                        # Retry with the ASCII-folded description/extended.
                        try:
                            link.description = description
                            link.extended = extended
                            link.save()
                        except Exception as e:
                            if self.verbose:
                                typer.echo("=" * 100)
                                typer.echo(e)
                                typer.echo(type(e))
                                typer.echo("=" * 100)
                    except Exception as e:
                        if self.verbose:
                            typer.echo("=" * 100)
                            typer.echo(e)
                            typer.echo(type(e))
                            typer.echo("=" * 100)
            except Exception as e:
                # Per-link catch-all: keep processing the rest of the batch.
                if self.verbose:
                    typer.echo("=" * 100)
                    typer.echo(e)
                    typer.echo(type(e))
                    typer.echo("=" * 100)

    def fix_titlecase(self, start=None, count=None):
        """Rewrite bookmark descriptions in titlecase where they differ.

        Falls back to ASCII-folded fields on UnicodeEncodeError from the
        Pinboard client; failures are logged (when verbose) and skipped.
        """
        links = self.get_bookmarks(start=start, count=count)
        for link in links:
            description = unidecode(link.description)
            titlecase_description = titlecase(description)
            extended = unidecode(link.extended)
            if description != titlecase_description:
                # Show before/after for the change being applied.
                typer.echo("description: {}".format(description))
                typer.echo("description: {}".format(titlecase_description))
                try:
                    link.description = titlecase_description
                    link.save()
                except UnicodeEncodeError:
                    # Retry with the folded extended text as well.
                    try:
                        link.description = titlecase_description
                        link.extended = extended
                        link.save()
                    except UnicodeEncodeError:
                        if self.verbose:
                            typer.echo("*" * 60)
                            typer.echo(
                                "description: {}".format(unidecode(link.description))
                            )
                            typer.echo("extended: {}".format(unidecode(link.extended)))
                            typer.echo("url: {}".format(link.url))
                            typer.echo(
                                "tags: {}".format(set(normalize_tags(link.tags)))
                            )
                            typer.echo("*" * 60)
                except Exception as e:
                    if self.verbose:
                        typer.echo("=" * 100)
                        typer.echo(e)
                        typer.echo(type(e))
                        typer.echo(link.url)
                        typer.echo("=" * 100)

    def remove_dupes(self, start=None, count=None):
        """Repair malformed bookmarks whose real URL ended up in the tags.

        When a tag looks like a URL, the bookmark is deleted and re-added
        (privately) under that URL, keeping the original description.
        """
        links = self.get_bookmarks(start=start, count=count)
        for link in links:
            tags = link.tags
            # Keep only URL-looking tags. NOTE(review): the `len(tags) and`
            # guard is redundant -- an empty list never iterates.
            tags = [
                tag for tag in tags if len(tags) and tag.startswith(("http", "https"))
            ]
            tag = tags[0] if len(tags) else ""
            # Exclude bare "http"/"https" tags that aren't actual URLs.
            if tag.startswith(("http", "https")) and tag not in ["http", "https"]:
                if self.verbose:
                    typer.echo("description: {}".format(unidecode(link.description)))
                    typer.echo("extended: {}".format(unidecode(link.extended)))
                    typer.echo("url: {}".format(link.url))
                    typer.echo("tags: {}".format(tags))
                    typer.echo("tag: {}".format(tag))
                # Undo a punycode-mangled "https://" prefix seen in bad saves.
                if tag.startswith("http://xn--%20https:-dk9c//"):
                    tag = tag.replace("http://xn--%20https:-dk9c//", "https://")
                new_description = link.description
                new_url = tag
                link.delete()
                # NOTE(review): redundant -- new_url was already de-mangled
                # above before being assigned from tag.
                if new_url.startswith("http://xn--%20https:-dk9c//"):
                    new_url = new_url.replace("http://xn--%20https:-dk9c//", "https://")
                self.pinboard.posts.add(
                    url=new_url, description=unidecode(new_description), private=True
                )
                if self.verbose:
                    typer.echo("---")
# CLI API: Typer application exposing the bookmark-maintenance commands below.
app = typer.Typer()
@app.command("fix_tags")
def fix_tags(count: int = 10, start: int = 0, verbose: bool = False):
    """Normalize and augment tags on a batch of recent Pinboard bookmarks."""
    print("[green]fix_tags()...[/green]")
    # Pass verbose through; previously the flag was accepted but ignored.
    bookmarks = Bookmarks(pinboard_token=PINBOARD_TOKEN, verbose=verbose)
    bookmarks.fix_tags(start=start, count=count)
@app.command("fix_titlecase")
def fix_titlecase(count: int = 10, start: int = 0, verbose: bool = False):
    """Titlecase descriptions on a batch of recent Pinboard bookmarks."""
    print("[green]fix_titlecase()...[/green]")
    # Pass verbose through; previously the flag was accepted but ignored.
    bookmarks = Bookmarks(pinboard_token=PINBOARD_TOKEN, verbose=verbose)
    bookmarks.fix_titlecase(start=start, count=count)
@app.command("remove_dupes")
def remove_dupes(count: int = 10, start: int = 0, verbose: bool = False):
    """Repair bookmarks whose real URL was saved into the tags."""
    print("[green]remove_dupes()...[/green]")
    # Pass verbose through; previously the flag was accepted but ignored.
    bookmarks = Bookmarks(pinboard_token=PINBOARD_TOKEN, verbose=verbose)
    bookmarks.remove_dupes(start=start, count=count)
# Run the Typer CLI when executed as a script.
if __name__ == "__main__":
    app()