From be71350cb37b1c1b72b858eaed37d4f3247f58b2 Mon Sep 17 00:00:00 2001
From: Asabeneh
Date: Thu, 8 Jul 2021 23:04:20 +0300
Subject: [PATCH] scrapping

---
 22_Day_Web_scraping/22_web_scraping.md | 23 ++++----
 22_Day_Web_scraping/scrapped_data.json | 73 ++++++++++++++++++++++++++
 2 files changed, 84 insertions(+), 12 deletions(-)
 create mode 100644 22_Day_Web_scraping/scrapped_data.json

diff --git a/22_Day_Web_scraping/22_web_scraping.md b/22_Day_Web_scraping/22_web_scraping.md
index 31ce3976..5c6a69bc 100644
--- a/22_Day_Web_scraping/22_web_scraping.md
+++ b/22_Day_Web_scraping/22_web_scraping.md
@@ -9,10 +9,8 @@ Author: Asabeneh Yetayeh<br>
- First Edition: Nov 22 - Dec 22, 2019
+ Second Edition: July, 2021<br>
-
-
 [<< Day 21](../21_Day_Classes_and_objects/21_classes_and_objects.md) | [Day 23 >>](../23_Day_Virtual_environment/23_virtual_environment.md)
@@ -36,28 +34,28 @@ Web scraping is the process of extracting and collecting data from websites and

 In this section, we will use beautifulsoup and requests package to scrape data. The package version we are using is beautifulsoup 4.

-To start scraping websites you need _requests_, _beautifoulSoup4_ and _website_.
+To start scraping websites, you need _requests_, _beautifulSoup4_ and a _website_.

 ```sh
 pip install requests
 pip install beautifulsoup4
 ```

-To scrape data from websites, basic understanding of HTML tags and css selectors is needed. We target content from a website using HTML tags, classes or/and ids.
-Let's import the requests and BeautifulSoup module
+To scrape data from websites, a basic understanding of HTML tags and CSS selectors is needed. We target content from a website using HTML tags, classes and/or ids.
+Let us import the requests and BeautifulSoup modules

 ```py
 import requests
 from bs4 import BeautifulSoup
 ```

-Let's declare url variable for the website which we are going to scrape.
+Let us declare a url variable for the website we are going to scrape.

 ```py
 import requests
 from bs4 import BeautifulSoup
-url = 'http://mlr.cs.umass.edu/ml/datasets.html'
+url = 'https://archive.ics.uci.edu/ml/datasets.php'
 # Lets use the requests get method to fetch the data from url
@@ -76,7 +74,7 @@ Using beautifulSoup to parse content from the page
 ```py
 import requests
 from bs4 import BeautifulSoup
-url = 'http://mlr.cs.umass.edu/ml/datasets.html'
+url = 'https://archive.ics.uci.edu/ml/datasets.php'
 response = requests.get(url)
 content = response.content # we get all the content from the website
@@ -97,12 +95,13 @@ for td in table.find('tr').find_all('td'):
 If you run this code, you can see that the extraction is half done. You can continue doing it because it is part of exercise 1. For reference check the [beautifulsoup documentation](https://www.crummy.com/software/BeautifulSoup/bs4/doc/#quick-start)

-🌕 You are so special, you are progressing everyday.You are left with only eight days to your way to greatness. Now do some exercises for your brain and for your muscle.
+🌕 You are so special, you are progressing every day. You are only eight days away from greatness. Now do some exercises for your brain and your muscles.

 ## 💻 Exercises: Day 22

-1. Extract the table in this url (http://mlr.cs.umass.edu/ml/datasets.html) and change it to a json file
-2. Scrape the presidents table and store the data as json(https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States)
+1. Scrape the following website and store the data as a json file (url = 'http://www.bu.edu/president/boston-university-facts-stats/').
+2. Extract the table in this url (https://archive.ics.uci.edu/ml/datasets.php) and change it to a json file.
+3. Scrape the presidents table and store the data as json (https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States). The table is not well structured, so the scraping may take a long time.

 🎉 CONGRATULATIONS ! 🎉
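
The lesson above leaves the table extraction half done, and the second exercise asks for that same table as a json file. A minimal sketch of one way to finish it is shown here; it is not part of the patch, and both the `soup.find('table')` selector and the `datasets.json` output name are assumptions rather than anything specified in the lesson.

```py
# Sketch only: finish the row extraction and store the result as json.
# Assumes the datasets listing is the first <table> on the page.
import json

import requests
from bs4 import BeautifulSoup

url = 'https://archive.ics.uci.edu/ml/datasets.php'

response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

table = soup.find('table')  # assumed selector; refine it if the page layout differs
rows = []
for tr in table.find_all('tr'):
    cells = [td.get_text(strip=True) for td in tr.find_all('td')]
    if cells:  # skip rows that contain no <td> cells (e.g. header rows)
        rows.append(cells)

with open('datasets.json', 'w') as f:  # hypothetical output file name
    json.dump(rows, f, indent=4)
```
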
diff --git a/22_Day_Web_scraping/scrapped_data.json b/22_Day_Web_scraping/scrapped_data.json
new file mode 100644
index 00000000..5a1f226e
--- /dev/null
+++ b/22_Day_Web_scraping/scrapped_data.json
@@ -0,0 +1,73 @@
+[
+  {
+    "category": "Community",
+    "Student Body": "34,589",
+    "Living Alumni": "398,195",
+    "Total Employees": "10,517",
+    "Faculty": "4,171",
+    "Nondegree Students": "2,008",
+    "Graduate & Professional Students": "15,645",
+    "Undergraduate Students": "16,936"
+  },
+  {
+    "category": "Campus",
+    "Classrooms": "834",
+    "Buildings": "370",
+    "Laboratories": "1,681",
+    "Libraries": "21",
+    "Campus Area (acres)": "169"
+  },
+  {
+    "category": "Academics",
+    "Study Abroad Programs": "70+",
+    "Average Class Size": "27",
+    "Faculty": "4,171",
+    "Student/Faculty Ratio": "10:1",
+    "Schools and Colleges": "17",
+    "Programs of Study": "300+"
+  },
+  {
+    "category": "Grant & Contract Awards",
+    "Research Awards": "$574.1M",
+    "BMC Clinical Research Grants": "$88.0M"
+  },
+  {
+    "category": "Undergraduate Financial Aid & Scholarships",
+    "Average Total Need-Based Financial Aid": "$46,252",
+    "Average Need-Based Grant/Scholarship": "$40,969",
+    "Grants & Scholarships (need-based)": "$275.6M",
+    "Grants & Scholarships (non-need-based)": "$28.7M"
+  },
+  {
+    "category": "Student Life",
+    "Community Service Hours": "1.6M+",
+    "Alternative Service Breaks Participants": "300+",
+    "BU on Social": "new accounts daily",
+    "Cultural & Religious Organizations": "60+",
+    "Community Service & Justice Organizations": "80+",
+    "Academic & Professional Organizations": "120+",
+    "Art & Performance Organizations": "60+",
+    "Student Organizations": "450+",
+    "First-Year Student Outreach Project Volunteers": "800+"
+  },
+  {
+    "category": "Research",
+    "Faculty Publications": "6,000+",
+    "Student UROP Participants": "450+",
+    "Centers & Institutes": "130+"
+  },
+  {
+    "category": "International Community",
+    "Global Initiatives": "300+",
+    "Cultural Student Groups": "40+",
+    "Alumni Countries": "180+",
+    "International Students": "11,000+"
+  },
+  {
+    "category": "Athletics",
+    "Intramural Sports & Tournaments": "15+",
+    "Club and Intramural Sports Participants": "7,000+",
+    "Club Sports Teams": "50",
+    "Varsity Sports": "24"
+  }
+]
\ No newline at end of file
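
For completeness, a file with the shape of scrapped_data.json above could be produced from the Boston University facts page (exercise 1) with a short script along these lines. It is only a sketch: the class names used in the selectors ('facts-categories', 'value', 'text') are assumptions about the page markup, not something the patch confirms, so they may need adjusting against the live page.

```py
# Sketch only: scrape the BU facts & stats page and build one record per category,
# mirroring the structure of scrapped_data.json. All class names below are assumptions.
import json

import requests
from bs4 import BeautifulSoup

url = 'http://www.bu.edu/president/boston-university-facts-stats/'

response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

data = []
for section in soup.find_all('section', class_='facts-categories'):  # assumed wrapper class
    heading = section.find('h5')
    record = {'category': heading.get_text(strip=True) if heading else ''}
    for item in section.find_all('li'):
        value = item.find(class_='value')  # assumed class for the figure
        label = item.find(class_='text')   # assumed class for the label
        if value and label:
            record[label.get_text(strip=True)] = value.get_text(strip=True)
    data.append(record)

with open('scrapped_data.json', 'w') as f:
    json.dump(data, f, indent=4)
```

If the selectors do not match anything, inspecting the page with the browser's developer tools will show the actual tags and class names to target.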