forked from shawwn/scrap
-
Notifications
You must be signed in to change notification settings - Fork 0
/
4chan-to-text
executable file
·125 lines (105 loc) · 2.87 KB
/
4chan-to-text
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/env python3
#==============================================================================
# Functionality
#==============================================================================
import pdb
import sys
import os
import re
# Utility functions, classes, etc. go here.
def asserting(cond):
    """Assert *cond*, dropping into the debugger first when it is falsy.

    When *cond* is falsy, ``pdb.set_trace()`` is called so the failure can be
    inspected interactively; the ``assert`` statement then runs regardless.
    Returns None when *cond* is truthy.
    """
    failed = not cond
    if failed:
        # Pause in pdb before the assertion fires.
        pdb.set_trace()
    assert cond
def has_stdin():
    """Report whether standard input is something other than a terminal.

    Returns:
        bool: True when ``sys.stdin`` exists, is open, and is not a TTY
        (for example a pipe or a redirected file); False otherwise.
    """
    stream = sys.stdin
    if stream is None:
        # The interpreter can start with no stdin attached at all.
        return False
    try:
        return not stream.isatty()
    except ValueError:
        # isatty() raises ValueError on a closed stream: nothing to read.
        return False
def reg(pat, flags=0):
    """Compile *pat* as a regular expression with ``re.VERBOSE`` always on.

    Args:
        pat: the pattern source.
        flags: extra ``re`` flags, OR-ed together with ``re.VERBOSE``.

    Returns:
        The compiled pattern object.
    """
    combined_flags = flags | re.VERBOSE
    return re.compile(pat, combined_flags)
#==============================================================================
# Cmdline
#==============================================================================
import argparse
# Command-line parser. Only -v/--verbose is declared here; main() parses with
# parse_known_args() and keeps whatever is left over as the list of URLs.
parser = argparse.ArgumentParser(
    description="""
TODO
""",
    formatter_class=argparse.RawTextHelpFormatter,
)
parser.add_argument(
    '-v', '--verbose',
    help="verbose output",
    action="store_true",
)

# Parsed arguments namespace; stays None until main() fills it in.
args = None
#==============================================================================
# Main
#==============================================================================
import sys
import json
import html2text
import ftfy
import requests
from pprint import pprint as pp
# Shared HTML-to-text converter used by process() for post comments.
h = html2text.HTML2Text()
# Render emphasis with '*' markers.
h.emphasis_mark = '*'
# body_width 0: html2text's setting for "do not hard-wrap output lines".
h.body_width = 0
def _render_post(post):
    """Return the printable text for one post dict, ending in a newline."""
    com = post.get('com')
    if com:
        # Comment bodies arrive as HTML: convert to text, then repair mojibake.
        com = ftfy.fix_text(h.handle(com))
    else:
        # Missing, null, or empty comment: nothing to convert.
        com = ''

    board = '/' + post['board'] + '/ ' if 'board' in post else ''

    if post.get('filename'):
        # Attachment line: original file name plus extension (if provided).
        filename = post['filename'] + post.get('ext', '') + '\n'
    else:
        filename = ''

    layout = "{board}{name} {now} No.{no} ▶\n{filename}{com}"
    rendered = layout.format(
        board=board,
        name=post.get('name', 'Anonymous'),
        now=post['now'],
        no=post['no'],
        filename=filename,
        com=com,
    )
    return rendered.strip() + '\n'


def _emit(node):
    """Print *node* if it is a post; recurse into its 'posts' list otherwise."""
    if node is None:
        return
    if 'posts' in node:
        # Thread object: walk its posts directly, without re-serialising them.
        for item in node['posts']:
            _emit(item)
        return
    print(_render_post(node))


def process(line):
    """Parse one JSON document and print every post it contains as text.

    Args:
        line: a JSON string holding either a single post object or an object
            whose 'posts' key lists post objects. Blank strings and JSON
            ``null`` are ignored.

    Each post is printed as a header line (board, name, date, post number),
    an optional attachment file name, and the comment converted from HTML to
    text, followed by a blank line.

    Raises:
        json.JSONDecodeError: if *line* is non-blank but not valid JSON.
        KeyError: if a post has no 'now' or 'no' field.
    """
    if not line or not line.strip():
        # Blank lines (e.g. stray newlines on stdin) carry no document.
        return
    _emit(json.loads(line))
# Seconds to wait for the server before giving up on a request.
_REQUEST_TIMEOUT = 30


def _as_json_url(url):
    """Return *url* without its #anchor and with exactly one '.json' suffix."""
    base = url.split('#', 1)[0]       # get rid of any anchor
    base = base.split('.json', 1)[0]  # get rid of any .json extension
    return base + '.json'


def run():
    """Fetch and print each URL from the command line, then any piped stdin.

    Every entry in ``args.args`` is normalised to its ``.json`` form,
    downloaded, and handed to ``process``. Afterwards, if standard input is
    not a terminal, each line read from it is passed to ``process`` as well.

    Raises:
        requests.HTTPError: if a server answers with an error status.
        requests.RequestException: on connection failures or timeouts.
    """
    if args.verbose:
        print(args)
    for url in args.args:
        # A timeout keeps an unresponsive server from hanging the script.
        response = requests.get(_as_json_url(url), timeout=_REQUEST_TIMEOUT)
        response.raise_for_status()
        process(response.text)
    if has_stdin():
        for line in sys.stdin:
            process(line)
def main():
    """Script entry point: parse the command line once, then call ``run``.

    Arguments the parser does not recognise are stored on ``args.args`` and
    treated by ``run`` as URLs. If ``args`` was already populated, it is
    reused as-is.

    Returns:
        Whatever ``run`` returns, or None if output was cut short because the
        reader closed the pipe.

    Only ``BrokenPipeError`` is handled here. Other I/O errors, including
    ``requests`` exceptions (which derive from ``IOError``), propagate so that
    download failures are reported instead of being silently discarded.
    """
    global args
    if not args:
        args, leftovers = parser.parse_known_args()
        args.args = leftovers
    try:
        return run()
    except BrokenPipeError:
        # The consumer of our output went away (e.g. `... | head`).
        # http://stackoverflow.com/questions/15793886/how-to-avoid-a-broken-pipe-error-when-printing-a-large-amount-of-formatted-data
        for stream in (sys.stdout, sys.stderr):
            try:
                stream.close()
            except OSError:
                # Best effort: flushing a dead pipe may fail again.
                pass


if __name__ == "__main__":
    main()