Commit 5b0f636: Rewrite allowing download of whitepapers
Ric Harvey committed Jan 16, 2017
1 parent 0e6262f
Showing 3 changed files with 120 additions and 84 deletions.
README.md
@@ -1,40 +1,47 @@
-# getAWSdocs
+# getAWSdocs.py

-### About

-One thing that strikes me as odd with Amazon and the documentation on AWS is that there is no download all button, to make it easy to get all the documentation in one go. After creating a simple bash script that kept breaking and needed updating, I decided to rewrite in python to make it a little easier to maintain.
+## About
+One thing that strikes me as odd with Amazon and the documentation on AWS is that there is no download all button, to make it easy to get all the documentation in one go. After creating a simple bash script that kept breaking and needed updating, I decided to rewrite in python to make it a little easier to maintain. This is the second rewrite, which now additionally allows you to pull the whitepapers.

I hope some of you find this useful.

-### Requirements

+## Requirements
Make sure all these python modules are installed:

-+ BeautifulSoup
-+ urllib3+
-+ urlparse3
+- argparse
+- beautifulsoup4
+- urllib3+
+- urlparse3

example:

-```bash
+```bash
sudo pip install -r requirements.txt
```

-### Usage

+## Usage
To get all documents:

```bash
-./getAWSdocs.py
-```
+./getAWSdocs.py -d
+```

Downloading all the docs (290 at the time of writing) can take a long time, ~20 mins.

+To get all whitepapers:

+```
+./getAWSdocs.py -w
+```

Files that exist on disk will not be re-downloaded (so by default only new sections/files are downloaded). To override this default and force re-download of files that exist on disk, use

```bash
-./getAWSdocs.py --force
+./getAWSdocs.py -d -f
```

+__Note:__ You can use a combination of -d and -w to download all documents at once.

That's it!

-[@ric_harvey](https://twitter.com/ric_harvey)
-[@paulwakeford] (https://twitter.com/paulwakeford)
+Built by Ric: [@ric_harvey](https://twitter.com/ric_harvey)
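As the note in the updated README says, the new flags can be combined in a single run; an illustrative invocation using only the options defined by this commit's argparse setup would be:

```bash
# fetch both the documentation set and the whitepapers,
# re-downloading files that already exist on disk
./getAWSdocs.py -d -w -f
```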
getAWSdocs.py
@@ -1,73 +1,101 @@
#!/usr/bin/python

from bs4 import BeautifulSoup
-import urllib
-import urlparse
-import posixpath
-import os
-import sys
+import urllib, urlparse, os, argparse

+def get_options():
+    parser = argparse.ArgumentParser(description='AWS Documentation Downloader')
+    parser.add_argument('-d','--documentation', help='Download the Documentation', action='store_true', required=False)
+    parser.add_argument('-w','--whitepapers', help='Download White Papers', action='store_true', required=False)
+    parser.add_argument('-f','--force', help='Overwrite old files', action='store_true', required=False)
+    args = vars(parser.parse_args())
+    return (args)

-# Build a list of the amazon service sections
-def get_services():
-    html_page = urllib.urlopen("http://aws.amazon.com/documentation/")
-    # Parse the HTML page
-    soup = BeautifulSoup(html_page, 'html.parser')
-    urls = []
-    services = []
-    # Get the A tag from the parsed page
-    for link in soup.findAll('a'):
-        try:
-            url = link.get('href')
-            # ignore links to self
-            if url.startswith("/documentation/"):
-                #print link.get('href')
-                if not (url.endswith("/documentation/") or url.startswith("/documentation/?nc") ):
-                    services.append(link.get('href'))
-                    directory = "." + link.get('href')
-                    if not os.path.exists(directory):
-                        os.makedirs(directory)
-        except: continue
-    return services
+def list_pdfs(start_page):
+    html_page = urllib.urlopen(start_page)
+    # Parse the HTML page
+    soup = BeautifulSoup(html_page, 'html.parser')
+    pdfs = set()
+    print "Generating PDF list (this may take some time)"
+    for link in soup.findAll('a'):
+        try:
+            uri = link.get('href')
+            # Allow whitepapers to be returned
+            if "whitepapers" in start_page:
+                if uri.endswith("pdf"):
+                    if "whitepapers" in uri:
+                        pdfs.add(uri)
+            # Allow all documents to be returned
+            if "documentation" in start_page:
+                if uri.startswith("/documentation/"):
+                    if not (uri.endswith("/documentation/") or uri.endswith("/kindle/") or uri.startswith("/documentation/?nc") ):
+                        #print uri
+                        base_url = "http://aws.amazon.com"
+                        url = base_url + uri
+                        #print url
+                        # Parse the HTML sub page
+                        html_page_doc = urllib.urlopen(url)
+                        soup_doc = BeautifulSoup(html_page_doc, 'html.parser')
+                        # Get the A tag from the parsed page
+                        for link in soup_doc.findAll('a'):
+                            try:
+                                sub_url = link.get('href')
+                                if sub_url.endswith("pdf"):
+                                    #print sub_url
+                                    pdfs.add(sub_url)
+                            except:
+                                continue
+        except:
+            continue
+    return pdfs

+def save_pdf(full_dir,filename,i):
+    if not os.path.exists(full_dir):
+        os.makedirs(full_dir)
+    # Open the URL and retrieve data
+    file_loc = full_dir + filename
+    if not os.path.exists(file_loc) or force == True:
+        if i.startswith("//"):
+            i = "http:" + i
+        print "Downloading : " + i
+        web = urllib.urlopen(i)
+        print "Saving to : " + file_loc
+        # Save Data to disk
+        output = open(file_loc,'wb')
+        output.write(web.read())
+        output.close()
+    else:
+        print "Skipping " + i + " - file exists or is a dated API document, use './getAWSdocs.py --force' to force override"

-# Download the PDFs that exist on the service pages
-def get_pdfs(services):
-    base_url = "http://aws.amazon.com"
-    for uri in services:
-        # Construct the ful URL for the service page
-        url = base_url + uri
-        print "\nDownloading PDF's for : " + url + "\n"
-        # Parse the HTML page
-        html_page_doc = urllib.urlopen(url)
-        soup_doc = BeautifulSoup(html_page_doc, 'html.parser')
-        # Get the A tag from the parsed page
-        for link in soup_doc.findAll('a'):
-            pdf = link.get('href')
-            # Check link is a PDF file
-            try:
-                check = pdf.endswith("pdf")
-            except: continue
-            # Now download if the link is a PDF file
-            if check == True:
-                # We need to work out the file name for saving
-                path = urlparse.urlsplit(pdf).path
-                filename = "." + uri + posixpath.basename(path)
-                # Nasty. AWS have uploaded ALL API versions as PDFs, not just the latest.
-                # They are all named as <service><docname><date>.pdf so we are
-                # just checking the last character before the dot and skipping download
-                # if it is a digit.
-                if (not (os.path.isfile(filename) or filename[len(filename) - 5 ].isdigit()) or (len(sys.argv) > 1 and sys.argv[1] == "--force")):
-                    print "Downloading : " + pdf
-                    # Open the URL and retrieve data
-                    try:
-                        web = urllib.urlopen(pdf)
-                        print "Saving to : " + filename
-                        # Save Data to disk
-                        output = open(filename,'wb')
-                        output.write(web.read())
-                        output.close()
-                    except: continue
-                else:
-                    print "Skipping " + filename + " - file exists or is a dated API document, use './getAWSdocs.py --force' to force override"
+def get_pdfs(pdf_list, force):
+    for i in pdf_list:
+        doc = i.split('/')
+        doc_location = doc[3]
+        if "whitepapers" in doc_location:
+            filename = urlparse.urlsplit(i).path.split('/')[-1]
+            full_dir = "whitepapers/"
+        else:
+            filename = urlparse.urlsplit(i).path.split('/')[-1]
+            full_dir = "documentation/"
+            directory = urlparse.urlsplit(i).path.split('/')[:-1]
+            for path in directory:
+                if path != "":
+                    full_dir = full_dir + path + "/"
+        try:
+            save_pdf(full_dir,filename,i)
+        except:
+            continue

-services_list = get_services()
-get_pdfs(services_list)
+# Main
+args = get_options()
+force = args['force']
+if args['documentation']:
+    print "Downloading Docs"
+    pdf_list = list_pdfs("https://aws.amazon.com/documentation/")
+    get_pdfs(pdf_list, force)
+if args['whitepapers']:
+    print "Downloading Whitepapers"
+    pdf_list = list_pdfs("http://aws.amazon.com/whitepapers/")
+    get_pdfs(pdf_list, force)
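To illustrate what the path handling in the new get_pdfs() does, here is a small standalone sketch (Python 2, mirroring the logic above; the PDF URL is a hypothetical example of the kind of link list_pdfs() collects):

```python
import urlparse

# hypothetical documentation PDF URL of the kind list_pdfs() collects
i = "http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-ug.pdf"

doc_location = i.split('/')[3]                         # "AWSEC2", so not a whitepaper
filename = urlparse.urlsplit(i).path.split('/')[-1]    # "ec2-ug.pdf"

full_dir = "documentation/"
# append the URL path (minus the filename) under the documentation/ prefix
for path in urlparse.urlsplit(i).path.split('/')[:-1]:
    if path != "":
        full_dir = full_dir + path + "/"

print full_dir + filename   # documentation/AWSEC2/latest/UserGuide/ec2-ug.pdf
```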
requirements.txt
@@ -1,3 +1,4 @@
-BeautifulSoup
+argparse
+beautifulsoup4
urllib3
urlparse3