
Commit

Rewrite allowing download of whitepapers
Ric Harvey committed Jan 16, 2017
1 parent 0e6262f commit 5b0f636
Showing 3 changed files with 120 additions and 84 deletions.
41 changes: 24 additions & 17 deletions README.md
@@ -1,40 +1,47 @@
# getAWSdocs
# getAWSdocs.py

### About

One thing that has always struck me as odd about AWS is that there is no "download all" button for the documentation, so there is no easy way to grab it all in one go. After writing a simple bash script that kept breaking and needing updates, I decided to rewrite it in Python to make it a little easier to maintain.
## About
One thing that has always struck me as odd about AWS is that there is no "download all" button for the documentation, so there is no easy way to grab it all in one go. After writing a simple bash script that kept breaking and needing updates, I decided to rewrite it in Python to make it a little easier to maintain. This is the second rewrite, which now also lets you pull the whitepapers.

I hope some of you find this useful.

### Requirements

## Requirements
Make sure all these Python modules are installed:

+ BeautifulSoup
+ urllib3+
+ urlparse3
- argparse
- beautifulsoup4
- urllib3+
- urlparse3

example:

```bash
sudo pip install -r requirements.txt
```

### Usage

## Usage
To get all documents:

```bash
./getAWSdocs.py
```

```bash
./getAWSdocs.py -d
```

Downloading all the docs (290 at the time of writing) can take a long time, roughly 20 minutes.

To get all whitepapers:

```
./getAWSdocs.py -w
```

Files that already exist on disk are not re-downloaded, so by default only new sections/files are fetched. To force a re-download of existing files, use:

```bash
./getAWSdocs.py --force
./getAWSdocs.py -d -f
```

__Note:__ You can combine -d and -w to download the documentation and the whitepapers in one run, as shown below.
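For example, to grab the documentation and the whitepapers together, overwriting anything already on disk:

```bash
./getAWSdocs.py -d -w -f
```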

That's it!

[@ric_harvey](https://twitter.com/ric_harvey)
[@paulwakeford](https://twitter.com/paulwakeford)
Built by Ric: [@ric_harvey](https://twitter.com/ric_harvey)
160 changes: 94 additions & 66 deletions getAWSdocs.py
@@ -1,73 +1,101 @@
#!/usr/bin/python

from bs4 import BeautifulSoup
import urllib
import urlparse
import posixpath
import os
import sys
import urllib, urlparse, os, argparse
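# urllib and urlparse here are the Python 2 standard-library modules;
# BeautifulSoup is provided by the beautifulsoup4 package in requirements.txt.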

def get_options():
    parser = argparse.ArgumentParser(description='AWS Documentation Downloader')
    parser.add_argument('-d', '--documentation', help='Download the Documentation', action='store_true', required=False)
    parser.add_argument('-w', '--whitepapers', help='Download White Papers', action='store_true', required=False)
    parser.add_argument('-f', '--force', help='Overwrite old files', action='store_true', required=False)
    args = vars(parser.parse_args())
    return args
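# get_options() hands back a plain dict, e.g.
# {'documentation': True, 'whitepapers': False, 'force': False},
# which the main block at the bottom of the file looks up by flag name.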

# Build a list of the amazon service sections
def get_services():
    html_page = urllib.urlopen("http://aws.amazon.com/documentation/")
    # Parse the HTML page
    soup = BeautifulSoup(html_page, 'html.parser')
    urls = []
    services = []
    # Get the A tag from the parsed page
    for link in soup.findAll('a'):
        try:
            url = link.get('href')
            # ignore links to self
            if url.startswith("/documentation/"):
                #print link.get('href')
                if not (url.endswith("/documentation/") or url.startswith("/documentation/?nc")):
                    services.append(link.get('href'))
                    directory = "." + link.get('href')
                    if not os.path.exists(directory):
                        os.makedirs(directory)
        except:
            continue
    return services

def list_pdfs(start_page):
    html_page = urllib.urlopen(start_page)
    # Parse the HTML page
    soup = BeautifulSoup(html_page, 'html.parser')
    pdfs = set()
    print "Generating PDF list (this may take some time)"
    for link in soup.findAll('a'):
        try:
            uri = link.get('href')
            # Allow whitepapers to be returned
            if "whitepapers" in start_page:
                if uri.endswith("pdf"):
                    if "whitepapers" in uri:
                        pdfs.add(uri)
            # Allow all documents to be returned
            if "documentation" in start_page:
                if uri.startswith("/documentation/"):
                    if not (uri.endswith("/documentation/") or uri.endswith("/kindle/") or uri.startswith("/documentation/?nc")):
                        #print uri
                        base_url = "http://aws.amazon.com"
                        url = base_url + uri
                        #print url
                        # Parse the HTML sub page
                        html_page_doc = urllib.urlopen(url)
                        soup_doc = BeautifulSoup(html_page_doc, 'html.parser')
                        # Get the A tag from the parsed page
                        for link in soup_doc.findAll('a'):
                            try:
                                sub_url = link.get('href')
                                if sub_url.endswith("pdf"):
                                    #print sub_url
                                    pdfs.add(sub_url)
                            except:
                                continue
        except:
            continue
    return pdfs
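# list_pdfs() returns a set of PDF link targets exactly as they appear on the
# pages; any scheme-relative links (starting with "//") are prefixed with
# "http:" by save_pdf() below before downloading.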

def save_pdf(full_dir, filename, i):
    if not os.path.exists(full_dir):
        os.makedirs(full_dir)
    # Open the URL and retrieve data
    file_loc = full_dir + filename
    # 'force' is read from the module-level variable set in the main block below
    if not os.path.exists(file_loc) or force == True:
        if i.startswith("//"):
            i = "http:" + i
        print "Downloading : " + i
        web = urllib.urlopen(i)
        print "Saving to : " + file_loc
        # Save Data to disk
        output = open(file_loc, 'wb')
        output.write(web.read())
        output.close()
    else:
        print "Skipping " + i + " - file exists, use -f/--force to re-download"


# Download the PDFs that exist on the service pages
def get_pdfs(services):
    base_url = "http://aws.amazon.com"
    for uri in services:
        # Construct the full URL for the service page
        url = base_url + uri
        print "\nDownloading PDFs for : " + url + "\n"
        # Parse the HTML page
        html_page_doc = urllib.urlopen(url)
        soup_doc = BeautifulSoup(html_page_doc, 'html.parser')
        # Get the A tag from the parsed page
        for link in soup_doc.findAll('a'):
            pdf = link.get('href')
            # Check link is a PDF file
            try:
                check = pdf.endswith("pdf")
            except:
                continue
            # Now download if the link is a PDF file
            if check == True:
                # We need to work out the file name for saving
                path = urlparse.urlsplit(pdf).path
                filename = "." + uri + posixpath.basename(path)
                # Nasty. AWS have uploaded ALL API versions as PDFs, not just the latest.
                # They are all named as <service><docname><date>.pdf so we are
                # just checking the last character before the dot and skipping download
                # if it is a digit.
                if (not (os.path.isfile(filename) or filename[len(filename) - 5].isdigit()) or (len(sys.argv) > 1 and sys.argv[1] == "--force")):
                    print "Downloading : " + pdf
                    # Open the URL and retrieve data
                    try:
                        web = urllib.urlopen(pdf)
                        print "Saving to : " + filename
                        # Save Data to disk
                        output = open(filename, 'wb')
                        output.write(web.read())
                        output.close()
                    except:
                        continue
                else:
                    print "Skipping " + filename + " - file exists or is a dated API document, use './getAWSdocs.py --force' to force override"

def get_pdfs(pdf_list, force):
    for i in pdf_list:
        doc = i.split('/')
        doc_location = doc[3]
        if "whitepapers" in doc_location:
            filename = urlparse.urlsplit(i).path.split('/')[-1]
            full_dir = "whitepapers/"
        else:
            filename = urlparse.urlsplit(i).path.split('/')[-1]
            full_dir = "documentation/"
            directory = urlparse.urlsplit(i).path.split('/')[:-1]
            for path in directory:
                if path != "":
                    full_dir = full_dir + path + "/"
        try:
            save_pdf(full_dir, filename, i)
        except:
            continue
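# A failure on any single PDF (bad link, HTTP error, filesystem problem) is
# swallowed by the bare except above, so the rest of the list keeps downloading.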

services_list = get_services()
get_pdfs(services_list)

# Main
args = get_options()
force = args['force']
if args['documentation']:
    print "Downloading Docs"
    pdf_list = list_pdfs("https://aws.amazon.com/documentation/")
    get_pdfs(pdf_list, force)
if args['whitepapers']:
    print "Downloading Whitepapers"
    pdf_list = list_pdfs("http://aws.amazon.com/whitepapers/")
    get_pdfs(pdf_list, force)
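# Running the script with no flags parses the arguments and exits without
# downloading anything; pass -d and/or -w to actually fetch files.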
3 changes: 2 additions & 1 deletion requirements.txt
@@ -1,3 +1,4 @@
BeautifulSoup
argparse
beautifulsoup4
urllib3
urlparse3
