-
Notifications
You must be signed in to change notification settings - Fork 0
/
colectorTuitsHttps.py
68 lines (52 loc) · 2.87 KB
/
colectorTuitsHttps.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
"""
El inicio del texto del tweet es:
class="css-901oao r-18jsvk2 r-1qd0xha r-a023e6 r-16dba41 r-rjixqe r-bcqeeo r-bnwqim r-qvutc0"><span class="css-901oao css-16my406 r-poiln3 r-bcqeeo r-qvutc0">
El final del texto del tweet es:
<span class="css-901oao css-16my406 r-4qtqp9 r-ip8ujx r-sjv1od r-zw8f10 r-bnwqim r-h9hxbl">
Espacios:
</span><span class="css-901oao css-16my406 r-poiln3 r-b88u0q r-bcqeeo r-qvutc0">
"""
import re
inicio_texto = 'class="css-901oao r-18jsvk2 r-1qd0xha r-a023e6 r-16dba41 r-rjixqe r-bcqeeo r-bnwqim r-qvutc0"><span class="css-901oao css-16my406 r-poiln3 r-bcqeeo r-qvutc0">'
fin_texto = '<span class="css-901oao css-16my406 r-4qtqp9 r-ip8ujx r-sjv1od r-zw8f10 r-bnwqim r-h9hxbl">'
caracter_espacio = '</span><span class="css-901oao css-16my406 r-poiln3 r-b88u0q r-bcqeeo r-qvutc0">'
caracter_espacio2 = '</span><span class="css-901oao css-16my406 r-poiln3 r-bcqeeo r-qvutc0">'
def remove_paterns(line):
line = re.sub('src="([./\- A-z, áéíóú0-9])*"', " ", line)
#line = re.sub('<(["./\- A-z, áéíóú0-9])*>', " ", line)
line = re.sub('<div class="[./\- A-z, áéíóú0-9]*">', " ", line)
line = re.sub('<div lang="[./\- A-z, áéíóú0-9]*" dir="[./\- A-z, áéíóú0-9]*" ', " ", line)
line = re.sub('<[./\- A-z, áéíóú0-9]*="[./\- A-z, áéíóú0-9]*" ', " ", line)
line = re.sub('[./\- A-z, áéíóú0-9]*="[./\- A-z, áéíóú0-9]*" ', " ", line)
line = re.sub('</[./\- A-z, áéíóú0-9]*>', " ", line)
line = re.sub('class ="[./\- A-z, áéíóú0-9]*" >', " ", line)
line = re.sub('[A-záéíóú0-9]*="[./\- A-z, áéíóú0-9]*"', " ", line)
return line
def get_tweets_from_html(name_html_file, name_dest_file, mode="w+"):
fd = open(name_html_file)
fd_salida = open(name_dest_file, mode)
is_in_text = False
text_tweet = []
for line in fd.readlines():
line_origin = line
if is_in_text or line.__contains__(inicio_texto):
index_init = 0
if line.__contains__(inicio_texto):
index_init = line.index(inicio_texto)
line = line[index_init+inicio_texto.__len__():]
if line.__contains__(fin_texto):
index_end = line.index(fin_texto)
line = line[:index_end-fin_texto.__len__()]
is_in_text = True
line = line.replace(inicio_texto, "").replace(caracter_espacio, " ").replace(caracter_espacio2, " ")\
.replace("class="," ").replace("<", " <")
line = remove_paterns(line)
if line.__contains__("<"):
end_line=line.index("<")
line = line[:end_line-1]
text_tweet.append(line.replace("\n", ""))
if is_in_text and line_origin.__contains__(fin_texto):
is_in_text = False
print(text_tweet)
fd_salida.write(" ".join(text_tweet) + "\n")
text_tweet = []