From 53cf4f252750afa3a61087744688e77ebb512e76 Mon Sep 17 00:00:00 2001
From: Emmanuel Schmitt
Date: Sun, 29 Dec 2019 23:06:08 +0100
Subject: [PATCH 1/3] Support of Dec. 2019 AWS Doc API changes

---
 .gitignore    |   1 +
 getAWSdocs.py | 299 +++++++++++++++++++++++++++++---------------------
 2 files changed, 173 insertions(+), 127 deletions(-)

diff --git a/.gitignore b/.gitignore
index cc049a7..6128292 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 .DS_Store
 documentation/*
 whitepapers/*
+builderslibrary/*
diff --git a/getAWSdocs.py b/getAWSdocs.py
index 13eee3e..6cd9e88 100755
--- a/getAWSdocs.py
+++ b/getAWSdocs.py
@@ -1,158 +1,203 @@
 #!/usr/bin/env python3
 from bs4 import BeautifulSoup
-import os, argparse
+import os
+import argparse
 from urllib.parse import urlparse, urlsplit
 from urllib.request import urlopen
 import json
+
 def get_options():
-  parser = argparse.ArgumentParser(description='AWS Documentation Downloader')
-  parser.add_argument('-d','--documentation', help='Download the Documentation', action='store_true', required=False)
-  parser.add_argument('-w','--whitepapers', help='Download White Papers', action='store_true', required=False)
-  parser.add_argument('-f','--force', help='Overwrite old files', action='store_true', required=False)
-  args = vars(parser.parse_args())
-  return (args)
+    parser = argparse.ArgumentParser(
+        description='AWS Documentation Downloader')
+    parser.add_argument('-d', '--documentation',
+                        help='Download the Documentation', action='store_true', required=False)
+    parser.add_argument('-w', '--whitepapers', help='Download White Papers',
+                        action='store_true', required=False)
+    parser.add_argument('-b', '--builderlibrary', help='Download Documents in Builder Library',
+                        action='store_true', required=False)
+    parser.add_argument('-f', '--force', help='Overwrite old files',
+                        action='store_true', required=False)
+    args = vars(parser.parse_args())
+    return (args)

 # Build a list of the amazon PDF's
-def list_whitepaper_pdfs(start_page):
-  html_page = urlopen(start_page)
-  # Parse the HTML page
-  soup = BeautifulSoup(html_page, 'html.parser')
-  pdfs = set()
-  print("Generating PDF list (this may take some time)")
-  for link in soup.findAll('a'):
-    try:
-      uri = link.get('href')
-      print('URI: ', uri)
-      # Allow whitepapers to be returned
-      if "whitepapers" in start_page:
-        if uri.endswith("pdf"):
-          if "whitepapers" in uri or "enterprise-marketing" in uri:
-            pdfs.add(uri)
-    except:
-      continue
-  return pdfs
+# update based on new Whitepaper page - DEC. 2019
+def list_whitepaper_pdfs():
+    # Max paging in json response for whitepaper
+    PAGE_SIZE_CONST = 15
+    # Parse the JSON Response
+    responseAsJson = json.loads(urlopen(
+        "https://aws.amazon.com/api/dirs/items/search?item.directoryId=whitepapers&sort_by=item.additionalFields.sortDate&sort_order=desc&size="+str(PAGE_SIZE_CONST)+"&item.locale=en_US&tags.id=whitepapers%23content-type%23whitepaper").read().decode('UTF-8'))
+    # Retreiving metadata json to get the lenght of the witepapers list metadata.count
+    maxNumberofDocuments = responseAsJson['metadata']['totalHits']
+    print("Number of Whitepapers to be retrieved: " + str(maxNumberofDocuments))
+    maxPage = maxNumberofDocuments // PAGE_SIZE_CONST + 1
+    print("Number of iterations :"+ str(maxPage))
+    pdfs = set()
+    currentPage = 0
+    print("Generating PDF list (this may take some time)")
+    while currentPage < maxPage:
+        responseAsJson = json.loads(urlopen(
+            "https://aws.amazon.com/api/dirs/items/search?item.directoryId=whitepapers&sort_by=item.additionalFields.sortDate&sort_order=desc&size="+str(PAGE_SIZE_CONST)+"&item.locale=en_US&tags.id=whitepapers%23content-type%23whitepaper&page="+str(currentPage)).read().decode('UTF-8'))
+        for item in responseAsJson['items']:
+            print("URL to be added to pdf list: "+item['item']['additionalFields']['primaryURL'])
+            pdfs.add(item['item']['additionalFields']['primaryURL'])
+        currentPage += 1
+    return pdfs
+
+# Build a list of the amazon builder library PDF's
+def list_builderlibrary_pdfs():
+    # Max paging in json response for whitepaper
+    PAGE_SIZE_CONST = 15
+    # Parse the JSON Response
+    responseAsJson = json.loads(urlopen(
+        "https://aws.amazon.com/api/dirs/items/search?item.directoryId=amazon-redwood&sort_by=item.additionalFields.customSort&sort_order=asc&size="+str(PAGE_SIZE_CONST)+"&item.locale=en_US").read().decode('UTF-8'))
+    # Retreiving metadata json to get the lenght of the witepapers list metadata.count
+    maxNumberofDocuments = responseAsJson['metadata']['totalHits']
+    print("Number of Whitepapers to be retrieved: " + str(maxNumberofDocuments))
+    maxPage = maxNumberofDocuments // PAGE_SIZE_CONST + 1
+    print("Number of iterations :"+ str(maxPage))
+    pdfs = set()
+    currentPage = 0
+    print("Generating PDF list (this may take some time)")
+    while currentPage < maxPage:
+        responseAsJson = json.loads(urlopen(
+            "https://aws.amazon.com/api/dirs/items/search?item.directoryId=amazon-redwood&sort_by=item.additionalFields.customSort&sort_order=asc&size="+str(PAGE_SIZE_CONST)+"&item.locale=en_US&page="+str(currentPage)).read().decode('UTF-8'))
+        for item in responseAsJson['items']:
+            print("URL to be added to pdf list: "+item['item']['additionalFields']['downloadUrl'])
+            pdfs.add(item['item']['additionalFields']['downloadUrl'])
+        currentPage += 1
+    return pdfs
+
+
+
 def find_pdfs_in_html(url):
-  html_page_doc = urlopen(url)
-  soup_doc = BeautifulSoup(html_page_doc, 'html.parser')
-  # Get the A tag from the parsed page
-  pdfs = set()
-  for link in soup_doc.findAll('a'):
-    try:
-      sub_url = link.get('href')
-      if sub_url.endswith("pdf"):
-        pdfs.add(sub_url)
-    except:
-      continue
-  return pdfs
+    html_page_doc = urlopen(url)
+    soup_doc = BeautifulSoup(html_page_doc, 'html.parser')
+    # Get the A tag from the parsed page
+    pdfs = set()
+    for link in soup_doc.findAll('a'):
+        try:
+            sub_url = link.get('href')
+            if sub_url.endswith("pdf"):
+                pdfs.add(sub_url)
+        except:
+            continue
+    return pdfs

 def list_docs_pdfs(start_page):
-  locale_path = "en_us/"
-  base_url = "http://docs.aws.amazon.com"
-
-  page = urlopen(start_page)
-  soup = BeautifulSoup(page, "xml")
-  pdfs = set()
-  print("Generating PDF list (this may take some time)")
-
-  for link in soup.findAll('service'):
-    try:
-      uri = link.get('href')
-      print('URI: ', uri)
-      # if service uri is .html then parse as HTML
-      if '.html' in uri:
-        url = base_url + uri
-        pdfs = pdfs.union(find_pdfs_in_html(url))
-        continue
-
-      # if service uri ends with "/" find and parse xml landing page
-      if not uri.startswith('http'):
-        url = base_url + uri.split("?")[0] + locale_path + "landing-page.xml"
-
-      # Fetch the XML sub page (this is where the links to the pdf's live)
-      sub_page_doc = urlopen(url)
-      soup_doc = BeautifulSoup(sub_page_doc, 'xml')
-
-      # Get the "tile" tag from the parsed page
-      for sublink in soup_doc.findAll('tile'):
-        try:
-          sub_url = sublink.get('href')
-          directory = base_url + "/".join(urlsplit(sub_url).path.split('/')[:-1])
+    locale_path = "en_us/"
+    base_url = "http://docs.aws.amazon.com"

-          guide_info_url = directory + "/meta-inf/guide-info.json"
-          print("Guide info url:", guide_info_url)
-          guide_info_doc = urlopen(guide_info_url).read()
-          guide_info = json.loads(guide_info_doc)
+    page = urlopen(start_page)
+    soup = BeautifulSoup(page, "xml")
+    pdfs = set()
+    print("Generating PDF list (this may take some time)")

-          if "pdf" in guide_info:
-            pdf_url = directory + "/" + guide_info["pdf"]
-            pdfs.add(pdf_url)
+    for link in soup.findAll('service'):
+        try:
+            uri = link.get('href')
+            print('URI: ', uri)
+            # if service uri is .html then parse as HTML
+            if '.html' in uri:
+                url = base_url + uri
+                pdfs = pdfs.union(find_pdfs_in_html(url))
+                continue
+
+            # if service uri ends with "/" find and parse xml landing page
+            if not uri.startswith('http'):
+                url = base_url + \
+                    uri.split("?")[0] + locale_path + "landing-page.xml"
+
+            # Fetch the XML sub page (this is where the links to the pdf's live)
+            sub_page_doc = urlopen(url)
+            soup_doc = BeautifulSoup(sub_page_doc, 'xml')
+
+            # Get the "tile" tag from the parsed page
+            for sublink in soup_doc.findAll('tile'):
+                try:
+                    sub_url = sublink.get('href')
+                    directory = base_url + \
+                        "/".join(urlsplit(sub_url).path.split('/')[:-1])
+
+                    guide_info_url = directory + "/meta-inf/guide-info.json"
+                    print("Guide info url:", guide_info_url)
+                    guide_info_doc = urlopen(guide_info_url).read()
+                    guide_info = json.loads(guide_info_doc)
+
+                    if "pdf" in guide_info:
+                        pdf_url = directory + "/" + guide_info["pdf"]
+                        pdfs.add(pdf_url)
+                except:
+                    continue
         except:
-          continue
-    except:
-      continue
-  return pdfs
-
-
-def save_pdf(full_dir,filename,i):
-  if not os.path.exists(full_dir):
-    os.makedirs(full_dir)
-  # Open the URL and retrieve data
-  file_loc = full_dir + filename
-  if not os.path.exists(file_loc) or force == True:
-    if i.startswith("//"):
-      i = "http:" + i
-    print("Downloading : " + i)
-    web = urlopen(i)
-    print("Saving to : " + file_loc)
-    # Save Data to disk
-    output = open(file_loc,'wb')
-    output.write(web.read())
-    output.close()
-  else:
-    print("Skipping " + i + " - file exists or is a dated API document, use './getAWSdocs.py --force' to force override")
+            continue
+    return pdfs
+
+
+def save_pdf(full_dir, filename, i):
+    if not os.path.exists(full_dir):
+        os.makedirs(full_dir)
+    # Open the URL and retrieve data
+    file_loc = full_dir + filename
+    if not os.path.exists(file_loc) or force == True:
+        if i.startswith("//"):
+            i = "http:" + i
+        print("Downloading : " + i)
+        web = urlopen(i)
+        print("Saving to : " + file_loc)
+        # Save Data to disk
+        output = open(file_loc, 'wb')
+        output.write(web.read())
+        output.close()
+    else:
+        print("Skipping " + i + " - file exists or is a dated API document, use './getAWSdocs.py --force' to force override")

 def get_pdfs(pdf_list, force):
-  for i in pdf_list:
-    doc = i.split('/')
-    doc_location = doc[3]
-    filename = urlsplit(i).path.split('/')[-1]
-    # Set download dir for whitepapers
-    if "whitepapers" in doc_location:
-      full_dir = "whitepapers/"
-    else:
-      # Set download dir and sub directories for documentation
-      full_dir = "documentation/"
-      directory = urlsplit(i).path.split('/')[:-1]
-      for path in directory:
-        if path != "":
-          full_dir = full_dir + path + "/"
-    try:
-      save_pdf(full_dir,filename,i)
-    except:
-      continue
+    for i in pdf_list:
+        doc = i.split('/')
+        doc_location = doc[3]
+        filename = urlsplit(i).path.split('/')[-1]
+        # Set download dir for whitepapers
+        if "whitepapers" in doc_location:
+            full_dir = "whitepapers/"
+        if "builderslibrary" in doc_location:
+            full_dir = "builderslibrary/"
+        else:
+            # Set download dir and sub directories for documentation
+            full_dir = "documentation/"
+            directory = urlsplit(i).path.split('/')[:-1]
+            for path in directory:
+                if path != "":
+                    full_dir = full_dir + path + "/"
+        try:
+            save_pdf(full_dir, filename, i)
+        except:
+            continue
+

 # Main
 args = get_options()
 # allow user to overwrite files
 force = args['force']
 if args['documentation']:
-  print("Downloading Docs")
-  pdf_list = list_docs_pdfs("https://docs.aws.amazon.com/en_us/main-landing-page.xml")
-  get_pdfs(pdf_list, force)
+    print("Downloading Docs")
+    pdf_list = list_docs_pdfs(
+        "https://docs.aws.amazon.com/en_us/main-landing-page.xml")
+    get_pdfs(pdf_list, force)
 if args['whitepapers']:
-  print("Downloading Whitepapaers")
-  pdf_list = list_whitepaper_pdfs("http://aws.amazon.com/whitepapers/")
-  get_pdfs(pdf_list, force)
-  print("Downloading SAP Whitepapaers")
-  pdf_list = list_whitepaper_pdfs("https://aws.amazon.com/sap/whitepapers/")
-  get_pdfs(pdf_list, force)
-
+    print("Downloading Whitepapers")
+    pdf_list = list_whitepaper_pdfs()
+    get_pdfs(pdf_list, force)
+if args['builderlibrary']:
+    print("Downloading Builder Lib document")
+    pdf_list = list_builderlibrary_pdfs()
+    get_pdfs(pdf_list, force)
 for p in pdf_list:
-  print(p)
+    print(p)

From 3e0c23f0e856b749e2fd1526df59ad5e1c43b842 Mon Sep 17 00:00:00 2001
From: Emmanuel Schmitt
Date: Tue, 15 Dec 2020 23:06:18 +0100
Subject: [PATCH 2/3] change documentation api on whitepaper & builder library

---
 getAWSdocs.py | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/getAWSdocs.py b/getAWSdocs.py
index 6cd9e88..f1aab56 100755
--- a/getAWSdocs.py
+++ b/getAWSdocs.py
@@ -29,7 +29,7 @@ def list_whitepaper_pdfs():
     PAGE_SIZE_CONST = 15
     # Parse the JSON Response
     responseAsJson = json.loads(urlopen(
-        "https://aws.amazon.com/api/dirs/items/search?item.directoryId=whitepapers&sort_by=item.additionalFields.sortDate&sort_order=desc&size="+str(PAGE_SIZE_CONST)+"&item.locale=en_US&tags.id=whitepapers%23content-type%23whitepaper").read().decode('UTF-8'))
+        "https://aws.amazon.com/api/dirs/items/search?item.directoryId=whitepapers&sort_by=item.additionalFields.sortDate&sort_order=desc&size="+str(PAGE_SIZE_CONST)+"&item.locale=en_US&tags.id=GLOBAL%23content-type%23whitepaper").read().decode('UTF-8'))
     # Retreiving metadata json to get the lenght of the witepapers list metadata.count
     maxNumberofDocuments = responseAsJson['metadata']['totalHits']
     print("Number of Whitepapers to be retrieved: " + str(maxNumberofDocuments))
@@ -40,7 +40,7 @@ def list_whitepaper_pdfs():
print("Generating PDF list (this may take some time)") while currentPage < maxPage: responseAsJson = json.loads(urlopen( - "https://aws.amazon.com/api/dirs/items/search?item.directoryId=whitepapers&sort_by=item.additionalFields.sortDate&sort_order=desc&size="+str(PAGE_SIZE_CONST)+"&item.locale=en_US&tags.id=whitepapers%23content-type%23whitepaper&page="+str(currentPage)).read().decode('UTF-8')) + "https://aws.amazon.com/api/dirs/items/search?item.directoryId=whitepapers&sort_by=item.additionalFields.sortDate&sort_order=desc&size="+str(PAGE_SIZE_CONST)+"&item.locale=en_US&tags.id=GLOBAL%23content-type%23whitepaper&page="+str(currentPage)).read().decode('UTF-8')) for item in responseAsJson['items']: print("URL to be added to pdf list: "+item['item']['additionalFields']['primaryURL']) pdfs.add(item['item']['additionalFields']['primaryURL']) @@ -49,14 +49,14 @@ def list_whitepaper_pdfs(): # Build a list of the amazon builder library PDF's def list_builderlibrary_pdfs(): - # Max paging in json response for whitepaper + # Max paging in json response for builderlib whitepaper PAGE_SIZE_CONST = 15 # Parse the JSON Response responseAsJson = json.loads(urlopen( "https://aws.amazon.com/api/dirs/items/search?item.directoryId=amazon-redwood&sort_by=item.additionalFields.customSort&sort_order=asc&size="+str(PAGE_SIZE_CONST)+"&item.locale=en_US").read().decode('UTF-8')) - # Retreiving metadata json to get the lenght of the witepapers list metadata.count + # Retreiving metadata json to get the length of the witepapers list metadata.count maxNumberofDocuments = responseAsJson['metadata']['totalHits'] - print("Number of Whitepapers to be retrieved: " + str(maxNumberofDocuments)) + print("Number of builderlib papers to be retrieved: " + str(maxNumberofDocuments)) maxPage = maxNumberofDocuments // PAGE_SIZE_CONST + 1 print("Number of iterations :"+ str(maxPage)) pdfs = set() @@ -67,13 +67,14 @@ def list_builderlibrary_pdfs(): "https://aws.amazon.com/api/dirs/items/search?item.directoryId=amazon-redwood&sort_by=item.additionalFields.customSort&sort_order=asc&size="+str(PAGE_SIZE_CONST)+"&item.locale=en_US&page="+str(currentPage)).read().decode('UTF-8')) for item in responseAsJson['items']: print("URL to be added to pdf list: "+item['item']['additionalFields']['downloadUrl']) - pdfs.add(item['item']['additionalFields']['downloadUrl']) - currentPage += 1 + try: + downloadUrl=item['item']['additionalFields']['downloadUrl'] + except KeyError: + print("Document with no downloadUrl") + pdfs.add(downloadUrl) + currentPage += 1 return pdfs - - - def find_pdfs_in_html(url): html_page_doc = urlopen(url) soup_doc = BeautifulSoup(html_page_doc, 'html.parser') @@ -183,6 +184,7 @@ def get_pdfs(pdf_list, force): # Main args = get_options() +pdf_list = set() # allow user to overwrite files force = args['force'] if args['documentation']: @@ -196,7 +198,7 @@ def get_pdfs(pdf_list, force): pdf_list = list_whitepaper_pdfs() get_pdfs(pdf_list, force) if args['builderlibrary']: - print("Downloading Builder Lib document") + print("Downloading Builder Lib documents") pdf_list = list_builderlibrary_pdfs() get_pdfs(pdf_list, force) for p in pdf_list: From d37c7bf28226f557d4d966e49ded33bd0bfca7b6 Mon Sep 17 00:00:00 2001 From: Emmanuel Schmitt Date: Sun, 31 Jan 2021 19:31:47 +0100 Subject: [PATCH 3/3] remove virtual env from commit --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 6128292..3a5b62f 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ documentation/* whitepapers/* 
 builderslibrary/*
+env
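
Note on the API these patches build on: both list_whitepaper_pdfs() and list_builderlibrary_pdfs() page through the same aws.amazon.com/api/dirs/items/search endpoint, first reading metadata.totalHits to size the loop and then requesting page=0..N. A minimal standalone sketch of that pattern follows; the endpoint, query parameters and JSON fields are taken from the patches above, while the helper name fetch_directory_urls and its defaults are illustrative only (the Builders' Library call in the patches sorts by customSort ascending rather than sortDate descending).

    #!/usr/bin/env python3
    # Sketch of the paginated directory-search call used in the patch series.
    # Endpoint, parameters and JSON field names come from the patches above;
    # the helper name and its defaults are assumptions for illustration.
    import json
    from urllib.request import urlopen

    PAGE_SIZE = 15  # page size used in the patches (PAGE_SIZE_CONST)

    def fetch_directory_urls(directory_id, extra_query="", url_field="primaryURL"):
        base = ("https://aws.amazon.com/api/dirs/items/search"
                "?item.directoryId=" + directory_id +
                "&sort_by=item.additionalFields.sortDate&sort_order=desc"
                "&size=" + str(PAGE_SIZE) + "&item.locale=en_US" + extra_query)
        # First request only reads metadata.totalHits to work out the page count
        first = json.loads(urlopen(base).read().decode('UTF-8'))
        total = first['metadata']['totalHits']
        pages = total // PAGE_SIZE + 1
        urls = set()
        for page in range(pages):
            resp = json.loads(urlopen(base + "&page=" + str(page)).read().decode('UTF-8'))
            for item in resp['items']:
                fields = item['item']['additionalFields']
                if url_field in fields:  # some Builders' Library items have no downloadUrl
                    urls.add(fields[url_field])
        return urls

    # Example: the whitepaper listing, filtered the same way as PATCH 2/3
    if __name__ == '__main__':
        pdfs = fetch_directory_urls(
            "whitepapers", "&tags.id=GLOBAL%23content-type%23whitepaper")
        print(len(pdfs), "whitepaper URLs found")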