-
Notifications
You must be signed in to change notification settings - Fork 3
/
extract_urls.py
171 lines (151 loc) · 7.97 KB
/
extract_urls.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
import os
import sys
import json
import re
# This works for files of the type:
# - upgrade.jsonlz4-YYYYMMDDHHMMSS
# - previous.jsonlz4
# They first need to be decompressed using lz4json: https://unix.stackexchange.com/a/338880
# Count size, and optionally print structure
def count_size(obj, level=0, verbose=False):
    """
    Recursively sum the sizes (in bytes, as reported by sys.getsizeof) of the leaves of a nested structure.
    Lists and dicts are walked but their own container size is not added; for dicts only the values are
    visited, so the keys do not contribute to the result.
    :param obj: list, dict, or any leaf value (e.g. what json.load returns)
    :param level: current nesting depth, only used to indent the verbose output
    :param verbose: print the structure (index/key and type of every item, and the sizes) while walking
    :return: total size in bytes of all leaves found under obj
    """
    if isinstance(obj, list):
        children = (("i=", index, item) for index, item in enumerate(obj))
    elif isinstance(obj, dict):
        children = (("key=", name, item) for name, item in obj.items())
    else:
        children = None

    if children is None:
        # Leaf: only leaves are measured, containers are accounted for through their items
        total = sys.getsizeof(obj)
    else:
        if verbose:
            print()
        total = 0
        prefix = "\t" * level
        for tag, name, item in children:
            if verbose:
                print(prefix, tag, name, ", type=", type(item), end='')
            total += count_size(item, level + 1, verbose)

    if verbose:
        print(", size=", total)
    return total
# Count total size and key-wise size, and optionally print structure
def count_size_per_key(obj, level=0, verbose=False):
    """
    Recursively measure a nested structure and break the size down per dict key.
    Leaves are measured with sys.getsizeof; lists and dicts are walked but their own container size is
    not added. The size of everything stored under a dict key is credited to that key name, and sizes
    credited to the same key name at different places in the structure are added together.
    :param obj: list, dict, or any leaf value (e.g. what json.load returns)
    :param level: current nesting depth, only used to indent the verbose output
    :param verbose: print the structure (index/key and type of every item, and the sizes) while walking.
                    It is forwarded to the recursive calls so that the whole tree is printed.
    :return: dict with the overall size under "total" plus one entry per key name encountered
    """
    # NOTE: "total" is also the name of the grand-total entry, so a key literally named "total" in the
    # data would be merged into it.
    count_dict = {"total": 0}
    is_dict = isinstance(obj, dict)
    if is_dict or isinstance(obj, list):
        if verbose:
            print()
        label = "key=" if is_dict else "i="
        children = obj.items() if is_dict else enumerate(obj)
        for name, child in children:
            if verbose:
                print("\t" * level, label, name, ", type=", type(child), end='')
            cdict = count_size_per_key(child, level + 1, verbose)
            # Merge the child's breakdown (including its "total") into ours
            for k, size in cdict.items():
                count_dict[k] = count_dict.get(k, 0) + size
            if is_dict:
                # Everything found under this key is credited to the key itself
                count_dict[name] = count_dict.get(name, 0) + cdict["total"]
    else:  # Do not count the sizes of dict and lists as we already count the size of their items
        count_dict["total"] += sys.getsizeof(obj)
    if verbose:
        print(", size=", count_dict["total"])
    return count_dict
def _entry_urls(entries, only_last_entry):
    """Return the urls of a tab's entries, or only the url of the last entry if only_last_entry is set."""
    kept = entries[-1:] if only_last_entry else entries
    return [entry["url"] for entry in kept]


def _nested_session(tab):
    """Return the dict stored at tab["formdata"]["id"]["sessionData"], or None if there is no such dict."""
    formdata = tab.get("formdata")
    fields = formdata.get("id") if isinstance(formdata, dict) else None
    session = fields.get("sessionData") if isinstance(fields, dict) else None
    return session if isinstance(session, dict) else None


def print_session_info(A, tablevel=0, verbose=True, exploreNestedSessions=False, exploreClosedTabs=True,
                       only_last_entry=True, printOpenTabs=True, printClosedTabs=False):
    """
    Recursive function that takes a firefox session, prints its (potentially nested) structure, and returns urls found
    This doesn't explore closed windows.
    :param A: dict with a "windows" list; each window has a "tabs" list and optionally a "_closedTabs" list.
              "_closedWindows" is only used for the printed summary and may be missing.
    :param tablevel: used in printing to indicate nesting
    :param verbose: print the structure
    :param exploreNestedSessions: Firefox will nest sessions in tab entry (don't know why) so set to True to explore them
    :param exploreClosedTabs: Whether to explore "closed" tabs. Exploring means storing their urls.
    :param only_last_entry: allow to only get the last page that I visited in a particular tab
    :param printOpenTabs: print "opened" tabs and their urls
    :param printClosedTabs: print "closed" tabs and their urls
    :return: (opened_urls, all_urls). opened_urls holds the urls of open tabs, including the open tabs of
             sessions nested in open tabs. all_urls holds those same urls plus, when exploreClosedTabs is
             set, the urls of closed tabs (and of sessions nested in them). Every url is stored once per
             place it was found.
    """
    opened_urls = []
    all_urls = []
    indent = "||--" * tablevel

    def explore_nested(session, window_index, tab_index):
        # Recurse one nesting level deeper with the same options
        if verbose:
            print(indent, "New session in window", window_index, ", tab", tab_index, ":")
        return print_session_info(session, tablevel + 1, verbose,
                                  exploreNestedSessions, exploreClosedTabs,
                                  only_last_entry, printOpenTabs, printClosedTabs)

    if verbose:
        print(indent, "The session contains:")
        print(indent, "- %d windows" % len(A["windows"]))
        print(indent, "- %d closed windows" % len(A.get("_closedWindows", [])))
        print(indent)

    for i, w in enumerate(A["windows"]):
        closed_tabs = w.get("_closedTabs", [])
        if verbose:
            print(indent, "Window %d: %d open tabs, %d closed tabs" % (i, len(w["tabs"]), len(closed_tabs)))

        show = verbose and printOpenTabs
        if show:
            print(indent, "Open tabs:")
        for j, t in enumerate(w["tabs"]):
            urls = _entry_urls(t["entries"], only_last_entry)
            if show:
                print(indent, "\t- tab %d : %d entries" % (j, len(t["entries"])))
                for url in urls:
                    print(indent, "\t\t", url)
            opened_urls += urls
            all_urls += urls
            nested = _nested_session(t) if exploreNestedSessions else None
            if nested is not None:
                ou, au = explore_nested(nested, i, j)
                opened_urls += ou
                all_urls += au  # au already contains everything in ou

        if not exploreClosedTabs:
            continue

        show = verbose and printClosedTabs
        if show:
            print(indent, "Closed tabs:")
        for j, t in enumerate(closed_tabs):
            state = t["state"]
            urls = _entry_urls(state["entries"], only_last_entry)
            if show:
                print(indent, "\t- tab %d : %d entries" % (j, len(state["entries"])))
                for url in urls:
                    print(indent, "\t\t", url)
            # Closed tabs are not open: they only count towards all_urls
            all_urls += urls
            if exploreNestedSessions:
                # NOTE(review): the tab data of a closed tab is read from t["state"], so a nested session
                # is presumably stored there too; the closed-tab dict itself is still checked as a
                # fallback. To be confirmed against a real session file.
                nested = _nested_session(state)
                if nested is None:
                    nested = _nested_session(t)
                if nested is not None:
                    _, au = explore_nested(nested, i, j)
                    all_urls += au
    return opened_urls, all_urls
if __name__ == "__main__":
    # Path of the decompressed session file: first command-line argument if given, otherwise the default below
    # f = "session-firefox/sessionstore-backups/upgrade.jsonlz4-20210222142601.txt"
    f = sys.argv[1] if len(sys.argv) > 1 else "session-firefox/sessionstore-backups/previous.jsonlz4.txt"
    # Use a context manager so the file is closed as soon as it is parsed; JSON text is read as UTF-8
    with open(f, encoding="utf-8") as session_file:
        A = json.load(session_file)

    # Print the session structure and collect the urls
    openTabs, allTabs = print_session_info(A, exploreNestedSessions=True, printOpenTabs=True, printClosedTabs=False)
    # for t in sorted(set(openTabs)):
    #     print(t)

    # Find only first level of nested sessionData
    # A = A["windows"][0]["tabs"][0]["formdata"]["id"]["sessionData"]
    # openTabs, allTabs = print_session_info(A, printOpenTabs=True, printClosedTabs=False, exploreNestedSessions=False)

    # # If file is corrupted (non-valid json), then this will find all urls independently of any structure.
    # # This gets all urls, even those of closed tabs
    # txt = open(f).read()
    # url_search = re.findall('"url":"(http.*?)"', txt, re.IGNORECASE)
    # url_list = list(set(url_search))  # Remove duplicates
    #
    # re.findall('"entries".*?"url":"(http.*?)"', txt, re.IGNORECASE)  # Only the urls that have an "entries" tag before
    # re.findall('"url":"(http.*?)".*?"entries"', txt, re.IGNORECASE)  # Only the urls that have an "entries" tag after
    # T = re.sub('"(?:[^"]*_base64|csp|image|value|referrerInfo|structuredCloneState)":".*?",',"",txt)  # Remove long parts (binary data)

    # Get size (in bytes) of the different elements in the file
    # cnts = count_size(A,verbose=True)
    # print(cnts)
    # cntspk = count_size_per_key(A,verbose=True)
    # print(cntspk)

    # Get largest
    # sorted_order = sorted(cntspk.items(), key=lambda x: x[1], reverse=True)
    # [print(x) for x in sorted_order[:20]]