fetch.py (forked from irfancharania/fb-feed-gen)
from bs4 import BeautifulSoup
from bs4 import SoupStrainer
from dateutil.parser import parse
import re
import urllib.parse

import bleach
import requests

# mobile user agent allows us to get the mobile (mbasic) version of pages
user_agent_mobile = 'Mozilla/5.0 (Linux; Android 7.0; SM-G610F Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.111 Mobile Safari/537.36'
user_agent_desktop = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'

base_url = 'https://mbasic.facebook.com/'
max_title_length = 100


def get_remote_data(url, ismobile=True, referer=None):
    ''' fetch website data as mobile or desktop browser '''
    user_agent = user_agent_mobile if ismobile else user_agent_desktop
    headers = {'User-Agent': user_agent}
    if referer:
        headers['Referer'] = referer
    r = requests.get(url, headers=headers)
    return r.content


def is_valid_username(username):
    ''' validate username '''
    expr = r'^(?:pages\/)?(?P<display>[\w\-\.]{3,50})(\/\d{3,50})?$'
    result = re.match(expr, username)
    display = result.group('display') if result else None
    return (result, display)


def strip_invalid_html(content):
    ''' strips invalid tags/attributes '''
    allowed_tags = ['a', 'abbr', 'acronym', 'address', 'b', 'br', 'div', 'dl', 'dt',
                    'em', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img',
                    'li', 'ol', 'p', 'pre', 'q', 's', 'small', 'strike', 'strong',
                    'span', 'sub', 'sup', 'table', 'tbody', 'td', 'tfoot', 'th',
                    'thead', 'tr', 'tt', 'u', 'ul']
    allowed_attrs = {
        'a': ['href', 'target', 'title'],
        'img': ['src', 'alt', 'width', 'height'],
    }

    cleaned = bleach.clean(content,
                           tags=allowed_tags,
                           attributes=allowed_attrs,
                           strip=True)

    # handle malformed html after running through bleach
    tree = BeautifulSoup(cleaned, "lxml")
    return tree.html


def sub_video_link(m):
    ''' rewrite a matched video_redirect url to its direct source '''
    expr = r'\&\;source.+$'
    orig = m.group(1)
    unquoted = urllib.parse.unquote(orig)
    new = re.sub(expr, '" target', unquoted)
    return new


def fix_video_redirect_link(content):
    ''' replace video redirects with direct link '''
    expr = r'\/video_redirect\/\?src=(.+)\"\starget'
    result = re.sub(expr, sub_video_link, content)
    return result


def sub_leaving_link(m):
    ''' rewrite a matched outbound-link wrapper url to its direct target '''
    expr = r'\&\;h.+$'
    orig = m.group(1)
    unquoted = urllib.parse.unquote(orig)
    new = re.sub(expr, '" target', unquoted)
    return new


def fix_leaving_link(content):
    ''' replace leaving-facebook links with direct link '''
    expr = r'https:\/\/lm\.facebook\.com\/l\.php\?u\=([a-zA-Z0-9\=\%\&\;\.\-\_]+)\"\s'
    result = re.sub(expr, sub_leaving_link, content)
    return result


def fix_article_links(content):
    ''' rewrite links inside an article body '''
    # fix video links
    v_fix = fix_video_redirect_link(content)
    # fix leaving links
    l_fix = fix_leaving_link(v_fix)
    # convert relative links to absolute
    a_fix = l_fix.replace('href="/', 'href="{0}'.format(base_url))
    return a_fix


def fix_guid_url(url):
    ''' add base + strip extra parameters '''
    expr = r'([&\?]?(?:type|refid|source)=\d+&?.+$)'
    stripped = re.sub(expr, '', url)
    guid = urllib.parse.urljoin(base_url, stripped)
    return guid


def build_site_url(username):
    return urllib.parse.urljoin(base_url, username)


def build_title(entry):
    ''' build title from entry '''
    if not entry:
        return 'Title not found'

    text = entry.get_text().strip()
    if text:
        if len(text) > max_title_length:
            last_word = text.rfind(' ', 0, max_title_length)
            text = text[:last_word] + '...'
        return text
    else:
        return entry


def build_article(text, extra):
    ''' fix up article content '''
    content = str(text) + ' ' + str(extra)
    return strip_invalid_html(fix_article_links(content))


def extract_items(username, contents):
    ''' extract posts from page '''
    print('Extracting posts from page')

    main_content = SoupStrainer('div', {'id': 'recent'})
    soup = BeautifulSoup(contents, "lxml", parse_only=main_content)

    items = []
    if soup.div:
        for item in soup.div.div.div.children:
            item_link = item.find('a', text='Full Story')
            if not item_link:
                continue  # ignore if no permalink found

            url = fix_guid_url(item_link['href'])
            date = parse(item.find('abbr').text.strip(), fuzzy=True)

            article_byline = ''
            article_text = ''
            article_extra = ''
            article_author = username

            if item.div.div:
                article_byline = item.div.div.get_text()
                article_author = item.div.div.find('h3').a.get_text(strip=True)

                # add photos/videos
                if item.div.div.next_sibling:
                    article_text = item.div.div.next_sibling
                    if item.div.div.next_sibling.next_sibling:
                        article_extra = item.div.div.next_sibling.next_sibling

            # cleanup article
            article = build_article(article_text, article_extra)

            article_title = article_byline
            if not article_title or article_title == article_author:
                article_title = build_title(article_text)

            items.append({
                'url': url,
                'title': article_title,
                'article': article,
                'date': date,
                'author': article_author
            })

        print('{0} posts found'.format(len(items)))
        return items

    # else
    return None
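

# --- Usage sketch (illustrative; not part of the original module) ---
# A minimal example of how these helpers might fit together: validate a
# username, build the mbasic page url, fetch it as a mobile browser, and
# extract the posts. The page name 'examplepage' is a placeholder, not a
# real page from the source.
if __name__ == '__main__':
    matched, display = is_valid_username('examplepage')
    if matched:
        page_url = build_site_url(display)
        html = get_remote_data(page_url, ismobile=True)
        posts = extract_items(display, html)
        if posts:
            for post in posts:
                print(post['date'], post['title'], post['url'])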