
Commit

Rewrite allowing download of whitepapers
Ric Harvey committed Jan 16, 2017
1 parent 0e6262f commit 5b0f636
Showing 3 changed files with 120 additions and 84 deletions.
41 changes: 24 additions & 17 deletions README.md
@@ -1,40 +1,47 @@
# getAWSdocs
# getAWSdocs.py

### About

One thing that has always struck me as odd about AWS is that there is no "download all" button for the documentation, so there is no easy way to grab it all in one go. After writing a simple bash script that kept breaking and needing updates, I decided to rewrite it in Python to make it a little easier to maintain.
## About
One thing that has always struck me as odd about AWS is that there is no "download all" button for the documentation, so there is no easy way to grab it all in one go. After writing a simple bash script that kept breaking and needing updates, I decided to rewrite it in Python to make it a little easier to maintain. This is the second rewrite, which now also lets you pull the whitepapers.

I hope some of you find this useful.

### Requirements

## Requirements
Make sure all these Python modules are installed:

+ BeautifulSoup
+ urllib3+
+ urlparse3
- argparse
- beautifulsoup4
- urllib3+
- urlparse3

example:

```bash
sudo pip install -r requirements.txt
```

### Usage

## Usage
To get all documents:

```bash
./getAWSdocs.py
```

```bash
./getAWSdocs.py -d
```

Downloading all the docs (290 at the time of writing) can take a long time, roughly 20 minutes.

To get all whitepapers:

```
./getAWSdocs.py -w
```

Files that already exist on disk are not re-downloaded, so by default only new sections/files are fetched. To force a re-download of existing files, use:

```bash
./getAWSdocs.py --force
./getAWSdocs.py -d -f
```

__Note:__ You can combine -d and -w to download the documentation and the whitepapers in one run, as shown below.
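For example, to grab the documentation and the whitepapers together, overwriting anything already on disk:

```bash
./getAWSdocs.py -d -w -f
```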

That's it!

[@ric_harvey](https://twitter.com/ric_harvey)
[@paulwakeford](https://twitter.com/paulwakeford)
Built by Ric: [@ric_harvey](https://twitter.com/ric_harvey)
160 changes: 94 additions & 66 deletions getAWSdocs.py
@@ -1,73 +1,101 @@
#!/usr/bin/python

from bs4 import BeautifulSoup
import urllib
import urlparse
import posixpath
import os
import sys
import urllib, urlparse, os, argparse
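# urllib and urlparse here are the Python 2 standard-library modules;
# BeautifulSoup is provided by the beautifulsoup4 package in requirements.txt.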

def get_options():
    parser = argparse.ArgumentParser(description='AWS Documentation Downloader')
    parser.add_argument('-d', '--documentation', help='Download the Documentation', action='store_true', required=False)
    parser.add_argument('-w', '--whitepapers', help='Download White Papers', action='store_true', required=False)
    parser.add_argument('-f', '--force', help='Overwrite old files', action='store_true', required=False)
    args = vars(parser.parse_args())
    return args
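# get_options() hands back a plain dict, e.g.
# {'documentation': True, 'whitepapers': False, 'force': False},
# which the main block at the bottom of the file looks up by flag name.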

# Build a list of the amazon service sections
def get_services():
    html_page = urllib.urlopen("http://aws.amazon.com/documentation/")
    # Parse the HTML page
    soup = BeautifulSoup(html_page, 'html.parser')
    urls = []
    services = []
    # Get the A tag from the parsed page
    for link in soup.findAll('a'):
        try:
            url = link.get('href')
            # ignore links to self
            if url.startswith("/documentation/"):
                #print link.get('href')
                if not (url.endswith("/documentation/") or url.startswith("/documentation/?nc")):
                    services.append(link.get('href'))
                    directory = "." + link.get('href')
                    if not os.path.exists(directory):
                        os.makedirs(directory)
        except:
            continue
    return services

def list_pdfs(start_page):
    html_page = urllib.urlopen(start_page)
    # Parse the HTML page
    soup = BeautifulSoup(html_page, 'html.parser')
    pdfs = set()
    print "Generating PDF list (this may take some time)"
    for link in soup.findAll('a'):
        try:
            uri = link.get('href')
            # Allow whitepapers to be returned
            if "whitepapers" in start_page:
                if uri.endswith("pdf"):
                    if "whitepapers" in uri:
                        pdfs.add(uri)
            # Allow all documents to be returned
            if "documentation" in start_page:
                if uri.startswith("/documentation/"):
                    if not (uri.endswith("/documentation/") or uri.endswith("/kindle/") or uri.startswith("/documentation/?nc")):
                        #print uri
                        base_url = "http://aws.amazon.com"
                        url = base_url + uri
                        #print url
                        # Parse the HTML sub page
                        html_page_doc = urllib.urlopen(url)
                        soup_doc = BeautifulSoup(html_page_doc, 'html.parser')
                        # Get the A tag from the parsed page
                        for link in soup_doc.findAll('a'):
                            try:
                                sub_url = link.get('href')
                                if sub_url.endswith("pdf"):
                                    #print sub_url
                                    pdfs.add(sub_url)
                            except:
                                continue
        except:
            continue
    return pdfs
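# list_pdfs() returns a set of PDF link targets exactly as they appear on the
# pages; any scheme-relative links (starting with "//") are prefixed with
# "http:" by save_pdf() below before downloading.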

def save_pdf(full_dir, filename, i):
    if not os.path.exists(full_dir):
        os.makedirs(full_dir)
    # Open the URL and retrieve data
    file_loc = full_dir + filename
    # 'force' is read from the module-level variable set in the main block below
    if not os.path.exists(file_loc) or force == True:
        if i.startswith("//"):
            i = "http:" + i
        print "Downloading : " + i
        web = urllib.urlopen(i)
        print "Saving to : " + file_loc
        # Save Data to disk
        output = open(file_loc, 'wb')
        output.write(web.read())
        output.close()
    else:
        print "Skipping " + i + " - file exists, use -f/--force to re-download"


# Download the PDFs that exist on the service pages
def get_pdfs(services):
    base_url = "http://aws.amazon.com"
    for uri in services:
        # Construct the full URL for the service page
        url = base_url + uri
        print "\nDownloading PDFs for : " + url + "\n"
        # Parse the HTML page
        html_page_doc = urllib.urlopen(url)
        soup_doc = BeautifulSoup(html_page_doc, 'html.parser')
        # Get the A tag from the parsed page
        for link in soup_doc.findAll('a'):
            pdf = link.get('href')
            # Check link is a PDF file
            try:
                check = pdf.endswith("pdf")
            except:
                continue
            # Now download if the link is a PDF file
            if check == True:
                # We need to work out the file name for saving
                path = urlparse.urlsplit(pdf).path
                filename = "." + uri + posixpath.basename(path)
                # Nasty. AWS have uploaded ALL API versions as PDFs, not just the latest.
                # They are all named as <service><docname><date>.pdf so we are
                # just checking the last character before the dot and skipping download
                # if it is a digit.
                if (not (os.path.isfile(filename) or filename[len(filename) - 5].isdigit()) or (len(sys.argv) > 1 and sys.argv[1] == "--force")):
                    print "Downloading : " + pdf
                    # Open the URL and retrieve data
                    try:
                        web = urllib.urlopen(pdf)
                        print "Saving to : " + filename
                        # Save Data to disk
                        output = open(filename, 'wb')
                        output.write(web.read())
                        output.close()
                    except:
                        continue
                else:
                    print "Skipping " + filename + " - file exists or is a dated API document, use './getAWSdocs.py --force' to force override"

def get_pdfs(pdf_list, force):
    for i in pdf_list:
        doc = i.split('/')
        doc_location = doc[3]
        if "whitepapers" in doc_location:
            filename = urlparse.urlsplit(i).path.split('/')[-1]
            full_dir = "whitepapers/"
        else:
            filename = urlparse.urlsplit(i).path.split('/')[-1]
            full_dir = "documentation/"
            directory = urlparse.urlsplit(i).path.split('/')[:-1]
            for path in directory:
                if path != "":
                    full_dir = full_dir + path + "/"
        try:
            save_pdf(full_dir, filename, i)
        except:
            continue
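# A failure on any single PDF (bad link, HTTP error, filesystem problem) is
# swallowed by the bare except above, so the rest of the list keeps downloading.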

services_list = get_services()
get_pdfs(services_list)

# Main
args = get_options()
force = args['force']
if args['documentation']:
    print "Downloading Docs"
    pdf_list = list_pdfs("https://aws.amazon.com/documentation/")
    get_pdfs(pdf_list, force)
if args['whitepapers']:
    print "Downloading Whitepapers"
    pdf_list = list_pdfs("http://aws.amazon.com/whitepapers/")
    get_pdfs(pdf_list, force)
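# Running the script with no flags parses the arguments and exits without
# downloading anything; pass -d and/or -w to actually fetch files.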
3 changes: 2 additions & 1 deletion requirements.txt
@@ -1,3 +1,4 @@
BeautifulSoup
argparse
beautifulsoup4
urllib3
urlparse3
