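"""BasicProxyScraper.py

A small PySimpleGUI tool that scrapes IP:PORT proxies from a user-supplied
list of URLs, optionally verifies them with ProxyChecker, displays the
results in a table, and saves them as .txt or .json.
"""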
import concurrent.futures
import json
import re
import threading

import requests
import PySimpleGUI as sg
from bs4 import BeautifulSoup
from proxy_checker import ProxyChecker
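# Third-party dependencies (assumed PyPI package names):
#   pip install PySimpleGUI requests beautifulsoup4 proxy-checker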
# Shared state: raw scraped proxies and the rows shown in the UI table
uncheckedProxies = []
table_content = []
# Serializes the given results and writes them to json_save.json
def JSON_Convert(lists, separators):
    JSON_Output = json.dumps(lists, separators=separators)
    with open('json_save.json', 'w') as output:
        output.write(JSON_Output)
# Writes the given proxies to text_save.txt, one per line
def Text_Convert(proxies):
    with open("text_save.txt", "w") as output:
        for i in proxies:
            output.write(i + "\n")
# Function for scraping proxies from the provided URLs
def proxyScraper():
    URLx = []
    # Opens and reads the user's URL addresses
    with open(values["-IMPORT-"], 'r') as f:
        URLList = f.readlines()
    # Looping through the list of URLs provided
    for ress in URLList:
        URLx.append(ress.strip())
    # Grabbing the HTML and parsing it
    for res in URLx:
        html = requests.get(str(res))
        soup = BeautifulSoup(html.content, "html.parser")
        # Finding the proxies with a regex (a raw string avoids invalid-escape warnings)
        prox = re.findall(r"[0-9]+(?:\.[0-9]+){3}:[0-9]+", str(soup))
        # Looping through the found proxies
        for proxy in prox:
            # Appending only proxies that are not already in the list
            if proxy not in uncheckedProxies:
                uncheckedProxies.append(proxy)
    print("Proxies Scraped.")
# Initializing ProxyChecker
checker = ProxyChecker()
# Lists used to store values for the save functions
checked_Dict = []
proxies_list = []
def ProxyCheck(ip):
    # Using the input IP and checking it with ProxyChecker
    groupAnswer = checker.check_proxy(ip)
    # Checking whether there was a result (check_proxy returns False on failure)
    if groupAnswer:
        # Appending to the lists above; used by the save functions
        proxies_list.append(ip)
        checked_Dict.append(groupAnswer)
        # Converting the ProxyChecker result to a list; needed
        # to output a row to the UI table
        convert = [ip]
        for i in groupAnswer.values():
            convert.append(i)
        # Appending the new row to table_content; this is the values
        # argument of the PySimpleGUI table
        table_content.append(convert)
        # Updating the table with the new table_content
        window["-TABLE-"].Update(values=table_content)
# Function for multithreading using concurrent.futures.ThreadPoolExecutor
def runThread():
    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=int(workerCountInput)) as executor:
            # Running the ProxyCheck function with the unchecked proxies as its argument
            executor.map(ProxyCheck, uncheckedProxies)
    except Exception:
        print("Proxy Checker initiation failed! Please check that you have entered a thread count.")
# Right side of the UI: Start button, thread-count input, Import URLs button,
# save buttons, quality-check checkbox
right_layout = [
    # Start button
    [sg.Button("Start", visible=True)],
    # Import URLs button
    [sg.FileBrowse("Import URLs", key="-IMPORT-")],
    # Thread count input
    [sg.Text("Thread Count:"),
     sg.Input(key="-THREAD-INPUT-", size=(5, 5))],
    # Invisible button that would let the Enter key submit the form
    # sg.Button("Submit", visible=False, bind_return_key=True)],
    [sg.Checkbox("Quality Check", default=True, change_submits=True, enable_events=True, key="-Checkbox-")],
    [sg.Button("Save as .txt", key="-SAVE-TEXT-")],
    [sg.Button("Save JSON", key="-SAVE-JSON-")]
]
layout = [[
    sg.Frame("Information Station", [[
        sg.Table(values=table_content,
                 headings=["IP:PORT", "PROTOCOL", "ANONYMITY", "TIMEOUT", "COUNTRY", "COUNTRY_CODE"],
                 expand_x=True,
                 key="-TABLE-"),
        sg.Column(right_layout, element_justification="center", expand_x=True),
    ]]),
]]
window = sg.Window("Basic Proxy Scraper", layout, finalize=True)
#window["-THREAD-INPUT-"].bind("<Return>", "_Enter")
while True:
    event, values = window.read()
    # Breaking out first: values is None once the window has been closed,
    # so reading from it before this check would raise a TypeError
    if event == sg.WIN_CLOSED:
        break
    file_path = values["-IMPORT-"]
    workerCountInput = window["-THREAD-INPUT-"].get()
    if event == "Start":
        proxyScraper()
        # Checking if the user wants the proxies checked. If so, threads start
        if values["-Checkbox-"] == True:
            # Using threading to run the runThread function; needed to avoid
            # blocking the main thread, which PySimpleGUI reserves for the UI
            threading.Thread(target=runThread, daemon=True).start()
    # If the save-as-txt button is clicked, saves a .txt file using
    # the Text_Convert() function
    if values["-Checkbox-"] == True:
        if event == "-SAVE-TEXT-":
            Text_Convert(proxies_list)
    elif values["-Checkbox-"] == False:
        if event == "-SAVE-TEXT-":
            Text_Convert(uncheckedProxies)
    # If the save-JSON button is clicked, saves a .json file using
    # the JSON_Convert() function
    if values["-Checkbox-"] == True:
        if event == "-SAVE-JSON-":
            JSON_Convert(checked_Dict, separators=(",", ":"))
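    # A plausible symmetric branch (an assumption, mirroring the .txt save
    # path): with the quality check off there are no checked results, so the
    # raw scraped list is saved instead.
    elif values["-Checkbox-"] == False:
        if event == "-SAVE-JSON-":
            JSON_Convert(uncheckedProxies, separators=(",", ":"))

# Releasing the UI resources once the event loop exits
window.close()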