diff --git a/lazynlp/crawl.py b/lazynlp/crawl.py
index df138fc..ed1c149 100644
--- a/lazynlp/crawl.py
+++ b/lazynlp/crawl.py
@@ -3,6 +3,7 @@ import html
 import http
 import os
+
 dir_path = os.path.dirname(os.path.realpath(__file__))
 import re
 import socket
 
@@ -16,83 +17,90 @@ from lazynlp.cleaner import *
 from lazynlp.utils import *
 
+
 def exists(url):
-	request = requests.get(url)
-	return request.status_code == 200
+    request = requests.get(url)
+    return request.status_code == 200
 
+
 def get_gutenberg_link_from_id(book_id):
-	txt_tmpl1 = 'http://www.gutenberg.org/cache/epub/{}/pg{}.txt'
-	txt_tmpl2 = 'http://www.gutenberg.org/files/{}/{}.txt'
-
-	for tmpl in [txt_tmpl1, txt_tmpl2]:
-		link = tmpl.format(book_id, book_id)
-		if exists(link):
-			return link
-
-	txt_tmpl3 = 'http://www.gutenberg.org/files/{}/{}-{}.txt'
-	# idx = [0, 8] + list(range(1, 8)) + list(range(9, 15))
-	for i in [0, 8]:
-		link = txt_tmpl3.format(book_id, book_id, i)
-		if exists(link):
-			return link
-	return None
+    txt_tmpl1 = 'http://www.gutenberg.org/cache/epub/{}/pg{}.txt'
+    txt_tmpl2 = 'http://www.gutenberg.org/files/{}/{}.txt'
+
+    for tmpl in [txt_tmpl1, txt_tmpl2]:
+        link = tmpl.format(book_id, book_id)
+        if exists(link):
+            return link
+
+    txt_tmpl3 = 'http://www.gutenberg.org/files/{}/{}-{}.txt'
+    # idx = [0, 8] + list(range(1, 8)) + list(range(9, 15))
+    for i in [0, 8]:
+        link = txt_tmpl3.format(book_id, book_id, i)
+        if exists(link):
+            return link
+    return None
 
+
 def get_us_gutenberg_links(outfile, max_id=58910):
-	out = open(outfile, 'w')
-	for book_id in range(1, max_id+1):
-		link = get_gutenberg_link_from_id(book_id)
-		if link:
-			out.write(link + '\n')
-		else:
-			print("Can't find link for book id", book_id)
-	out.close()
+    out = open(outfile, 'w')
+    for book_id in range(1, max_id + 1):
+        link = get_gutenberg_link_from_id(book_id)
+        if link:
+            out.write(link + '\n')
+        else:
+            print("Can't find link for book id", book_id)
+    out.close()
 
+
 def get_id_aus(link):
-	id_ = link[link.rfind('/')+1:link.rfind('.')]
-	if id_[-1] == 'h':
-		return id_[:-1]
-	return id_
+    id_ = link[link.rfind('/') + 1:link.rfind('.')]
+    if id_[-1] == 'h':
+        return id_[:-1]
+    return id_
 
+
 def get_aus_gutenberg_links(outfile, catalog_file='https://www.gutenberg.org/dirs/GUTINDEX.AUS'):
-	req = urllib.request.Request(catalog_file)
-	response = urllib.request.urlopen(req)
-	page = response.read()
-	page = page.decode('utf-8')
-	html_links = re.findall(r'http://gutenberg.net.au/ebooks[01][0-9]/[0-9]{7}[h]?.html', page)
-	txt_links = re.findall(r'http://gutenberg.net.au/ebooks[01][0-9]/[0-9]{7}.txt', page)
-	seen_ids = set()
-	with open(outfile, 'w') as out:
-		for link in txt_links:
-			out.write(link + '\n')
-			seen_ids.add(get_id_aus(link))
-
-		for link in html_links:
-			book_id = get_id_aus(link)
-			if not book_id in seen_ids:
-				out.write(link + '\n')
-				seen_ids.add(book_id)
+    req = urllib.request.Request(catalog_file)
+    response = urllib.request.urlopen(req)
+    page = response.read()
+    page = page.decode('utf-8')
+    html_links = re.findall(r'http://gutenberg.net.au/ebooks[01][0-9]/[0-9]{7}[h]?.html', page)
+    txt_links = re.findall(r'http://gutenberg.net.au/ebooks[01][0-9]/[0-9]{7}.txt', page)
+    seen_ids = set()
+    with open(outfile, 'w') as out:
+        for link in txt_links:
+            out.write(link + '\n')
+            seen_ids.add(get_id_aus(link))
+
+        for link in html_links:
+            book_id = get_id_aus(link)
+            if not book_id in seen_ids:
+                out.write(link + '\n')
+                seen_ids.add(book_id)
 
+
 def to_skip(link, extensions=None, domains=None):
-	""" domains can be:
+    """ domains can be:
 			- just the name (as in: google)
 			- main domain (as in: google.com)
 			- subdomain (as in: news.google.com)
 	"""
-	for ext in extensions:
-		if link.endswith(ext):
-			return True
-	raw_url = get_raw_url(link)
-	subdomain, domain, suffix = tldextract.extract(link)
-	if domain in domains:
-		return True
-	if '.'.join([domain, suffix]) in domains:
-		return True
-	if '.'.join([subdomain, domain, suffix]) in domains:
-		return True
-	return False
-
-def download_page(link, context=None, timeout=None):
-	"""
+    for ext in extensions:
+        if link.endswith(ext):
+            return True
+    raw_url = get_raw_url(link)
+    subdomain, domain, suffix = tldextract.extract(link)
+    if domain in domains:
+        return True
+    if '.'.join([domain, suffix]) in domains:
+        return True
+    if '.'.join([subdomain, domain, suffix]) in domains:
+        return True
+    return False
+
+
+def download_page(link, context=None, headers=None, timeout=None):
+    """
 	Return code, page
 	0: successfully read (write to index)
 	1: bad_url (write to bad_url)
@@ -101,64 +109,68 @@ def download_page(link, context=None, timeout=None):
 	When code is not 0, return ''
 	"""
-	try:
-		req = urllib.request.Request(link)
-	except ValueError as e:
-		print(link, "doesn't exist.")
-		return 1, ''
-	except ConnectionResetError as e:
-		print('ConnectionResetError', link)
-		return 3, ''
-
-	try:
-		if not timeout is None:
-			response = urllib.request.urlopen(req, context=context, timeout=timeout)
-		else:
-			response = urllib.request.urlopen(req, context=context)
-	except UnicodeError as e:
-		print('UnicodeError for', link)
-		return 2, ''
-	except (urllib.error.HTTPError) as e:
-		print('Error {} for {}'.format(e.code, link))
-		return 1, ''
-	except urllib.error.URLError as e:
-		print('URLError for', link)
-		return 1, ''
-	except http.client.HTTPException as e:
-		print('HTTPException', link)
-		return 1, ''
-	except http.client.RemoteDisconnected as e:
-		print('RemoteDisconnected', link)
-		return 1, ''
-	except (ConnectionError, socket.timeout) as e:
-		print('ConnectionError or Timeout', link)
-		return 3, ''
-
-	try:
-		page = response.read()
-	except http.client.HTTPException as e:
-		print('HTTPException', link)
-		return 1, ''
-	except (ConnectionError, socket.timeout) as e:
-		print('ConnectionError or Timeout', link)
-		return 3, ''
-	return 0, page
+    headers = headers if headers else {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"}
+    try:
+        req = urllib.request.Request(link, headers=headers)
+    except ValueError as e:
+        print(link, "doesn't exist.")
+        return 1, ''
+    except ConnectionResetError as e:
+        print('ConnectionResetError', link)
+        return 3, ''
+
+    try:
+        if not timeout is None:
+            response = urllib.request.urlopen(req, context=context, timeout=timeout)
+            print(response)
+        else:
+            response = urllib.request.urlopen(req, context=context)
+            print(response)
+    except UnicodeError as e:
+        print('UnicodeError for', link)
+        return 2, ''
+    except (urllib.error.HTTPError) as e:
+        print('Error {} for {}'.format(e.code, link))
+        print(e)
+        return 1, ''
+    except urllib.error.URLError as e:
+        print('URLError for', link)
+        return 1, ''
+    except http.client.HTTPException as e:
+        print('HTTPException', link)
+        return 1, ''
+    except http.client.RemoteDisconnected as e:
+        print('RemoteDisconnected', link)
+        return 1, ''
+    except (ConnectionError, socket.timeout) as e:
+        print('ConnectionError or Timeout', link)
+        return 3, ''
+
+    try:
+        page = response.read()
+    except http.client.HTTPException as e:
+        print('HTTPException', link)
+        return 1, ''
+    except (ConnectionError, socket.timeout) as e:
+        print('ConnectionError or Timeout', link)
+        return 3, ''
+    return 0, page
 
 def get_current_idx(index_file, links):
-	lines = open(index_file, 'r').readlines()
-	idx = len(lines)
-	if idx > 0:
-		last_seen = lines[-1].strip()
-		while True:
-			link = links.readline().strip()
-			if link == last_seen:
-				break
-	return idx, links
+    lines = open(index_file, 'r').readlines()
+    idx = len(lines)
+    if idx > 0:
+        last_seen = lines[-1].strip()
+        while True:
+            link = links.readline().strip()
+            if link == last_seen:
+                break
+    return idx, links
 
 def download_pages(link_file, folder, timeout=30, default_skip=True, extensions=[], domains=[]):
-	"""
+    """
 	link_file:
 			file contains links to webpages to crawl. Each line contains one URL.
 	folder:
@@ -190,70 +202,70 @@ def download_pages(link_file, folder, timeout=30, default_skip=True, extensions=
 	non_ascii.urls contains the URLs that haven't been downloaded because of bad encoding issues.
 	empty.urls contains the URLs that have empty textual content.
 	"""
-	index_file = os.path.join(folder, 'index.urls')
-	idx = 0
-	links = open(link_file, 'r')
+    index_file = os.path.join(folder, 'index.urls')
+    idx = 0
+    links = open(link_file, 'r')
 
-	if os.path.isdir(folder) and os.path.exists(index_file):
-		""" If index file exists, we've downloaded from this list of URLs before,
+    if os.path.isdir(folder) and os.path.exists(index_file):
+        """ If index file exists, we've downloaded from this list of URLs before,
 		continue from where it left off the last time """
-		idx, links = get_current_idx(index_file, links)
-		print(idx)
-	else:
-		os.makedirs(folder, exist_ok=True)
-
-	index = open(os.path.join(folder, 'index.urls'), 'a')
-	skipped_urls = open(os.path.join(folder, 'skip.urls'), 'a')
-	bad_connection_urls = open(os.path.join(folder, 'connection.urls'), 'a')
-	bad_urls = open(os.path.join(folder, 'bad.urls'), 'a')
-	non_ascii_urls = open(os.path.join(folder, 'non_ascii.urls'), 'a')
-	empty_urls = open(os.path.join(folder, 'empty.urls'), 'a')
-
-	ctx = ssl.create_default_context()
-	ctx.check_hostname = False
-	ctx.verify_mode = ssl.CERT_NONE
-
-	hashed = hashlib.sha1()
-
-	if default_skip:
-		ext_lines = open(os.path.join(dir_path, 'exclude_extensions.txt'), 'r').readlines()
-		extensions.extend([line.strip() for line in ext_lines])
-		domain_lines = open(os.path.join(dir_path, 'exclude_domains.txt'), 'r').readlines()
-		domains.extend([line.strip() for line in domain_lines])
-
-	for link in links:
-		link = link.strip()
-		if to_skip(link, extensions, domains):
-			skipped_urls.write(link + '\n')
-			print('Skip', link)
-			continue
-
-		code, page = download_page(link, ctx, timeout)
-		if code == 1:
-			bad_urls.write(link + '\n')
-		elif code == 2:
-			non_ascii_urls.write(link + '\n')
-		elif code == 3:
-			bad_connection_urls.write(link + '\n')
-		if code > 0:
-			continue
-
-		txt = clean_page(page)
-
-		if not txt:
-			print('Empty page', link)
-			empty_urls.write(link + '\n')
-			continue
-
-		print(idx, link)
-		hashed.update(str(time.time()).encode())
-		name = hashed.hexdigest()
-		with open(os.path.join(folder, '{}_{}.txt'.format(idx, name)), 'w') as out:
-			out.write(link + '\n' + txt)
-
-		print(find_unprintable(txt))
-		index.write('{}\n'.format(link))
-		idx += 1
-
-	links.close()
+        idx, links = get_current_idx(index_file, links)
+        print(idx)
+    else:
+        os.makedirs(folder, exist_ok=True)
+
+    index = open(os.path.join(folder, 'index.urls'), 'a')
+    skipped_urls = open(os.path.join(folder, 'skip.urls'), 'a')
+    bad_connection_urls = open(os.path.join(folder, 'connection.urls'), 'a')
+    bad_urls = open(os.path.join(folder, 'bad.urls'), 'a')
+    non_ascii_urls = open(os.path.join(folder, 'non_ascii.urls'), 'a')
+    empty_urls = open(os.path.join(folder, 'empty.urls'), 'a')
+
+    ctx = ssl.create_default_context()
+    ctx.check_hostname = False
+    ctx.verify_mode = ssl.CERT_NONE
+
+    hashed = hashlib.sha1()
+
+    if default_skip:
+        ext_lines = open(os.path.join(dir_path, 'exclude_extensions.txt'), 'r').readlines()
+        extensions.extend([line.strip() for line in ext_lines])
+        domain_lines = open(os.path.join(dir_path, 'exclude_domains.txt'), 'r').readlines()
+        domains.extend([line.strip() for line in domain_lines])
+
+    for link in links:
+        link = link.strip()
+        if to_skip(link, extensions, domains):
+            skipped_urls.write(link + '\n')
+            print('Skip', link)
+            continue
+
+        code, page = download_page(link, ctx, timeout=timeout)
+        if code == 1:
+            bad_urls.write(link + '\n')
+        elif code == 2:
+            non_ascii_urls.write(link + '\n')
+        elif code == 3:
+            bad_connection_urls.write(link + '\n')
+        if code > 0:
+            continue
+
+        txt = clean_page(page)
+
+        if not txt:
+            print('Empty page', link)
+            empty_urls.write(link + '\n')
+            continue
+
+        print(idx, link)
+        hashed.update(str(time.time()).encode())
+        name = hashed.hexdigest()
+        with open(os.path.join(folder, '{}_{}.txt'.format(idx, name)), 'w') as out:
+            out.write(link + '\n' + txt)
+
+        print(find_unprintable(txt))
+        index.write('{}\n'.format(link))
+        idx += 1
+
+    links.close()
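
Usage sketch (not part of the patch): since the new signature puts headers third, timeout has to be passed as a keyword, which is why the call inside download_pages above uses timeout=timeout. A minimal illustration of calling the patched download_page directly, assuming the lazynlp package is installed; the URL and User-Agent string are placeholders only.

import ssl

from lazynlp.crawl import download_page

# Unverified SSL context, mirroring what download_pages builds internally.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

custom_headers = {"User-Agent": "Mozilla/5.0 (compatible; example-crawler)"}  # placeholder UA
code, page = download_page("http://example.com", context=ctx,
                           headers=custom_headers, timeout=30)
if code == 0:
    print(len(page), "bytes downloaded")  # page holds the raw bytes from response.read()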