-
Notifications
You must be signed in to change notification settings - Fork 0
/
DownloadRam.py
88 lines (68 loc) · 2.66 KB
/
DownloadRam.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
"""
Author: Dan Stone
Date: 4/13/2019
This script loads a list of animal names from a text file, downloads Wikipedia
data for each animal (ram or others), and saves the results as JSON files in the data dir.
Run DownloadImages.py after this to download the images into the image dir.
"""
import os
import requests
from pathlib2 import Path
import wikipedia
import json
from util import load_json, save_json
# JSON file whose keys define which fields are kept when writing animal records.
SCHEMA_FILE = Path.cwd() / 'Data' / 'animals.json'
# Loaded once at import time via the project-local util.load_json helper.
SCHEMA = load_json(SCHEMA_FILE)
# Input list: presumably one animal name per line — see main(), which splits on lines.
ANIMALS_LIST_FILE = Path.cwd() / 'Data' / 'animal_list1.txt'
def main():
    """Download Wikipedia data for each animal in ANIMALS_LIST_FILE.

    For every animal name (one per line) that does not already have a
    JSON file under Data/animals/, fetch its Wikipedia page, resolve a
    primary image URL, and write the record via write_animal().
    Animals that cannot be resolved are skipped (best-effort), but each
    skip is reported so failures are not silently invisible.
    """
    with open(ANIMALS_LIST_FILE, 'r') as fp:
        animals = fp.read().splitlines()
    for animal in animals:
        file = Path.cwd() / f'Data/animals/{animal}.json'
        if file.is_file():  # already downloaded on a previous run — skip
            continue
        try:
            wiki = get_wiki_page(animal)
            if wiki:
                img_url = get_wiki_image(wiki.images, wiki.title)
                # only save the animal if a primary image was found
                if img_url:
                    write_animal(file, CommonName=animal, BriefSummary=wiki.summary, WikiContents=wiki.content,
                                 ImageURL=img_url, Source=str(wiki.url))
        except Exception as err:
            # Best-effort: some animals cannot be found on Wikipedia.
            # Report which one was skipped instead of swallowing silently.
            print(f"skipping {animal}: {err}")
def get_wiki_image(image_urls, name) -> str:
    """Return the URL of the page's main/primary image, or None if unavailable.

    Queries the MediaWiki pageimages API directly because the page's full
    image list often contains odd pictures (skeletons, anatomical details)
    rather than the lead photo.

    Args:
        image_urls: unused; kept for backward compatibility with callers.
        name: Wikipedia page title to query.
    """
    del image_urls  # intentionally unused
    # Let requests build and URL-encode the query string; titles with
    # spaces or special characters would otherwise produce a bad URL.
    endpoint = 'https://en.wikipedia.org/w/api.php'
    params = {
        'action': 'query',
        'prop': 'pageimages',
        'format': 'json',
        'piprop': 'original',
        'titles': name,
    }
    try:
        # timeout so a stalled connection cannot hang the whole run
        response = requests.get(endpoint, params=params, timeout=10)
        results = response.json()
        # The API keys pages by page id; take the first (only) page entry.
        return list(results["query"]["pages"].values())[0]["original"]["source"]
    except (requests.RequestException, KeyError, IndexError, ValueError):
        print("img not found: ", f"{endpoint}?titles={name}")
        return None
def get_wiki_page(name):
    """Look up the Wikipedia page for *name*.

    On a disambiguation hit, retry with the first candidate entry whose
    text mentions 'species' (animals usually disambiguate to a species
    page). Returns None when no page can be resolved.
    """
    try:
        return wikipedia.page(name)
    except wikipedia.exceptions.DisambiguationError as err:
        # The exception message lists the candidate titles, one per line.
        candidates = [line for line in str(err).splitlines() if 'species' in line]
        if candidates:
            return get_wiki_page(candidates[0])
        return None
    except wikipedia.exceptions.PageError:
        return None
def write_animal(file, **kwargs):
    """Write an animal record to *file* as pretty-printed JSON.

    Args:
        file: destination path for the JSON file.
        **kwargs: candidate fields; any key not present in the module-level
            SCHEMA is silently dropped so output always matches the schema.
    """
    # Membership test on the dict itself — no need for .keys().
    record = {k: v for k, v in kwargs.items() if k in SCHEMA}
    print(record)  # progress output: show each record as it is saved
    with open(file, 'w') as fp:
        json.dump(record, fp, indent=4)
# Script entry point: only run when executed directly, not when imported.
if __name__ == '__main__':
    main()