From deba553e6bce397171dc7cb3b221145f22988980 Mon Sep 17 00:00:00 2001 From: Erica Nazimowitz Date: Fri, 11 Feb 2022 16:32:11 -0500 Subject: [PATCH 01/17] Create scraper_practice.py Very basic scraper for practice --- scraper_practice.py | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 scraper_practice.py diff --git a/scraper_practice.py b/scraper_practice.py new file mode 100644 index 0000000..6620c6d --- /dev/null +++ b/scraper_practice.py @@ -0,0 +1,45 @@ +# -*- coding: utf-8 -*- +""" +Created on Fri Feb 11 16:02:40 2022 + +@author: nazime +""" + +import requests +from bs4 import BeautifulSoup + +URL = "https://realpython.github.io/fake-jobs/" +page = requests.get(URL) + +soup = BeautifulSoup(page.content, "html.parser") +results = soup.find(id="ResultsContainer") +print(results.prettify()) +job_elements = results.find_all("div", class_="card-content") +for job_element in job_elements: + title_element = job_element.find("h2", class_="title") + company_element = job_element.find("h3", class_="company") + location_element = job_element.find("p", class_="location") + print(title_element.text.strip()) + print(company_element.text.strip()) + print(location_element.text.strip()) + print() +python_jobs = results.find_all("h2", string="Python") +python_jobs = results.find_all( + "h2", string=lambda text: "python" in text.lower() +) +python_job_elements = [ + h2_element.parent.parent.parent for h2_element in python_jobs +] + +for job_element in python_job_elements: + title_element = job_element.find("h2", class_="title") + company_element = job_element.find("h3", class_="company") + location_element = job_element.find("p", class_="location") + print(title_element.text.strip()) + print(company_element.text.strip()) + print(location_element.text.strip()) + print() + links = job_element.find_all("a") + for link in links: + link_url = link["href"] + print(f"Apply here: {link_url}\n") \ No newline at end of file From 0841b7b05510c532448d9bb9297093285a8ec5e4 Mon Sep 17 00:00:00 2001 From: Erica Nazimowitz Date: Fri, 11 Feb 2022 16:42:53 -0500 Subject: [PATCH 02/17] More scraper practice This one reads a bunch of inspirational quotes and creates a table to be saved into a .csv file --- scraper_practice2.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 scraper_practice2.py diff --git a/scraper_practice2.py b/scraper_practice2.py new file mode 100644 index 0000000..c38d359 --- /dev/null +++ b/scraper_practice2.py @@ -0,0 +1,34 @@ +# -*- coding: utf-8 -*- +""" +Created on Fri Feb 11 16:36:13 2022 + +@author: nazime +""" + +import requests +from bs4 import BeautifulSoup +import csv + +URL = "http://www.values.com/inspirational-quotes/" +r = requests.get(URL) +soup = BeautifulSoup(r.content, 'html5lib') +quotes=[] # a list to store quotes + +table = soup.find('div', attrs = {'id':'all_quotes'}) + +for row in table.findAll('div', + attrs = {'class':'col-6 col-lg-3 text-center margin-30px-bottom sm-margin-30px-top'}): + quote = {} + quote['theme'] = row.h5.text + quote['url'] = row.a['href'] + quote['img'] = row.img['src'] + quote['lines'] = row.img['alt'].split(" #")[0] + quote['author'] = row.img['alt'].split(" #")[1] + quotes.append(quote) + +filename = 'inspirational_quotes.csv' +with open(filename, 'w', newline='') as f: + w = csv.DictWriter(f,['theme','url','img','lines','author']) + w.writeheader() + for quote in quotes: + w.writerow(quote) \ No newline at end of file From 
1077ca5bc966d1741c431e0d2781a877b71c34c8 Mon Sep 17 00:00:00 2001 From: Erica Nazimowitz Date: Thu, 17 Feb 2022 19:25:08 -0500 Subject: [PATCH 03/17] Create github-actions-demo.yml --- workflows/github-actions-demo.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 workflows/github-actions-demo.yml diff --git a/workflows/github-actions-demo.yml b/workflows/github-actions-demo.yml new file mode 100644 index 0000000..2af4ce3 --- /dev/null +++ b/workflows/github-actions-demo.yml @@ -0,0 +1,17 @@ +name: GitHub Actions Demo +on: [push] +jobs: + Explore-GitHub-Actions: + runs-on: ubuntu-latest + steps: + - run: echo "πŸŽ‰ The job was automatically triggered by a ${{ github.event_name }} event." + - run: echo "🐧 This job is now running on a ${{ runner.os }} server hosted by GitHub!" + - run: echo "πŸ”Ž The name of your branch is ${{ github.ref }} and your repository is ${{ github.repository }}." + - name: Check out repository code + uses: actions/checkout@v2 + - run: echo "πŸ’‘ The ${{ github.repository }} repository has been cloned to the runner." + - run: echo "πŸ–₯️ The workflow is now ready to test your code on the runner." + - name: List files in the repository + run: | + ls ${{ github.workspace }} + - run: echo "🍏 This job's status is ${{ job.status }}." \ No newline at end of file From 2a6fbf8e4a1372953f6ad8bd471425ff99aa9fea Mon Sep 17 00:00:00 2001 From: Erica Nazimowitz <98502343+nazime1@users.noreply.github.com> Date: Fri, 18 Feb 2022 00:31:20 -0500 Subject: [PATCH 04/17] GitHub Python auto-run action test (#16) * Move documentation into its own directory * Update documentation, fixes #8. * Fix misaligned table for the section properties * Reorganize scraper repository (#12) * Delete old scrapers * Move documentation into a separate directory * Remove old json files from gitignore * Add some basic goals to the README * Basic start of Clojure course catalog scraper * Clean up Clojure, it was a fun experiment while it lasted * Merge sis.rpi.edu/course_scrapper.py into lily-rewrite * Rename course scraper script * Delete data/ directory * Create python-app.yml Testing GitHub Actions with auto-executing Python files as soon as they're published to the repository Co-authored-by: Lily Ritter Co-authored-by: Trevor Brunette Co-authored-by: Lily Ritter <70179370+lilyritt@users.noreply.github.com> --- .github/workflows/python-app.yml | 47 ++++ .gitignore | 7 +- README.md | 14 +- baccalaureate_scraper.py | 153 ----------- catalog.rpi.edu/calendar_scrapper.py | 120 --------- catalog.rpi.edu/degrees_scrapper.py | 74 ------ data_json_doc.md | 100 -------- docs/format.md | 119 +++++++++ minor_scraper.py | 119 --------- pathway_scraper.py | 90 ------- quacs/quacs_scrapper.py | 238 ------------------ quacs/quacs_scrapping.txt | 135 ---------- .../course_scrapper.py => src/course.py | 0 13 files changed, 182 insertions(+), 1034 deletions(-) create mode 100644 .github/workflows/python-app.yml delete mode 100644 baccalaureate_scraper.py delete mode 100644 catalog.rpi.edu/calendar_scrapper.py delete mode 100644 catalog.rpi.edu/degrees_scrapper.py delete mode 100644 data_json_doc.md create mode 100644 docs/format.md delete mode 100644 minor_scraper.py delete mode 100644 pathway_scraper.py delete mode 100644 quacs/quacs_scrapper.py delete mode 100644 quacs/quacs_scrapping.txt rename sis.rpi.edu/course_scrapper.py => src/course.py (100%) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml new file mode 100644 index 0000000..b0e3cbc --- /dev/null +++ 
b/.github/workflows/python-app.yml @@ -0,0 +1,47 @@ +# This workflow will install Python dependencies, run tests and lint with a single version of Python +# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions + +name: auto scraper + +on: + pull_request: + branches: [ main ] + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - name: checkout repo content + uses: actions/checkout@v2 + - name: setup python + uses: actions/setup-python@v2 + with: + python-version: "3.10" + - name: install python packages + run: | + python -m pip install --upgrade pip + pip install flake8 pytest + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: execute py script + run: python scraper_practice2.py + + - name: commit files + run: | + git config --local user.email "action@github.com" + git config --local user.name "GitHub Action + git add -A + git commit -m "update data" -a + + - name: push changes + uses: ad-m/github-push-action@v0.6.0 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + branch: main diff --git a/.gitignore b/.gitignore index 4fd488f..9e698ae 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,4 @@ -fall_calendar.json -out.json -spring_calendar.json .idea/ -.vscode/ \ No newline at end of file +.vscode/ +.cpcache/ +.nrepl-port \ No newline at end of file diff --git a/README.md b/README.md index a146d63..642e05d 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,14 @@ -# Scraper +# CRISIS Scrapers + This repository contains the web scraper utilities to scrape the data (course, major, minor, HASS, etc.) from various RPI webpages and compile it into a useful format. + +## Scrapers + +- [ ] Individual courses +- [ ] Major +- [ ] Minor +- [ ] HASS Pathways + +## Infrastructure + +- [ ] GitHub action or similar to run scrapers and commit to [the data repository](https://github.com/rpi-crisis/data). 
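+
+## Example
+
+All of the scrapers here follow the same basic shape: fetch a catalog page with `requests`, parse it with BeautifulSoup, and dump the result to JSON. A minimal sketch of that pattern is below; the URL and the blanket `td` selector are placeholders for illustration, not a working scraper.
+
+```python
+import json
+
+import requests
+from bs4 import BeautifulSoup
+
+# Illustrative catalog page; each real scraper targets its own catoid/navoid.
+URL = "http://catalog.rpi.edu/content.php?catoid=22&navoid=525"
+
+soup = BeautifulSoup(requests.get(URL).text, "html.parser")
+
+# Placeholder extraction: the real scrapers walk the tables and lists
+# specific to each page instead of grabbing every cell.
+rows = [td.text.strip() for td in soup.find_all("td") if td.text.strip()]
+
+with open("out.json", "w") as f:
+    json.dump(rows, f)
+```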
diff --git a/baccalaureate_scraper.py b/baccalaureate_scraper.py deleted file mode 100644 index 91dcf38..0000000 --- a/baccalaureate_scraper.py +++ /dev/null @@ -1,153 +0,0 @@ -from selenium import webdriver -from bs4 import BeautifulSoup -import json - -op = webdriver.ChromeOptions() -op.add_argument('headless') - -PATH = "C:\Program Files (x86)\chromedriver.exe" #path on your machine -driver = webdriver.Chrome(PATH, options=op) - -url = "http://catalog.rpi.edu/content.php?catoid=22&navoid=542" - -driver.get(url) - -source_first = driver.page_source -soup_first = BeautifulSoup(source_first, "html.parser") - -# the data is arranged in a table of ul elements -# the first ul element is the list of baccalaureate programs - -table = soup_first.find('ul', {'class': 'program-list'}) - -# find all the list items in the ul -table_li = table.find_all("li") - -program_names = [] -program_links = [] -for li in table_li: #grab the text and the link in the li element - program_names.append(li.find('a').text.strip()) # must strip trailing spaces - program_links.append(li.find('a')['href']) - -dictionary = {} #our json object -bug_programs = ["Physician-Scientist", "Program for Graduates of Naval Nuclear Power Training Command’s Nuclear Power School"] -for program in program_names: - #print(program) - - if program in program_names: - i = 1 - years = {} - driver.find_element_by_link_text(program).click() - #now we are in the link for each program - source_link = driver.page_source - soup_link = BeautifulSoup(source_link, "html.parser") - - #print(soup_link.find_all()) - #there are two tables in the html, we need to find the second one - td = soup_link.find('td', {'class': 'block_content'}) - table = td.find('table', {'class': 'table_default'}) - tbody = table.find('tbody') - #we need to get the second table row, first one is a header - tr = tbody.find_all('tr')[3] - print(len(tr)) - #tr = tr[1]; # the second tr - #there is one td in the tr that holds all the data - #analog core class is the year - #custom leftpad 20 has the contents of the of that year - #within the leftpad20 class are two analog core classes that have each semester - td_new = tr.find('td', {'colspan': '4'}) - if (not td_new): - tr = tbody.find_all('tr')[4] - td_new = tr.find('td', {'colspan': '4'}) - - div = td_new.find('div', class_='custom_leftpad_20') - if(div): - num = 4 - if(program == "Architecture"): - num = 5 - leftpad20 = div.find_all('div', class_='custom_leftpad_20')[:num]#first 4 years, can be adjusted - - for term in leftpad20: - - terms_dict = {} - #there will be two analog core - analog_core = term.find_all('div', class_='acalog-core') #there should be 2, fall and spring - - #print(year_num) - for sem in analog_core: - year_num = str(i) - term_text = "" - if sem.find('h3'): - term_text = sem.find('h3').text #fall or spring - #print(sem.find('h3').text) - #within the analog core there are two ul - ul = sem.find_all('ul') - #the first ul has all the non hyperlinked classes ex: hass elective or free elective or math option - nonlinked_text = [] - terms_dict[term_text]= [] - if(len(ul) > 0): - nonlinked = ul[0] - nonlinked_text = [] - li = nonlinked.find_all('li') - for l in li: - nonlinked_text.append(l.text) - ##terms_dict[term_text].append(nonlinked_text) - linked_text = [] - if(len(ul) > 1): - linked = ul[1] - li_linked = linked.find_all('li') - for l in li_linked: - linked_text.append(l.text) - all_classes = [] - if(len(ul) == 0): - p = sem.find('p') - all_classes.append(p.text) - for l in nonlinked_text: - 
all_classes.append(l) - for l in linked_text: - all_classes.append(l) - parsed_classes = [] - for k in range(len(all_classes)): - #do some parsing of the string - all_classes[k] = all_classes[k].replace("\u00a0", " ") - all_classes[k] = all_classes[k].replace("\n\t", " ") - l = [char for char in all_classes[k] if char.isalnum() or char == " " or char == "-" or char == ":" or char == ","] - l = "".join(l) - s = l.find("See footnote") - if( s < 0): - s = l.find("see footnote") - if (s < 0): - s = l.find("See Footnote") - o = l.find("or") - if(s != 0 and s!= 1 and o != 0 and l != " "): - if(s > 0): - l = l[:s] - l = l.replace(" ", " ") - parsed_classes.append(l.strip()) - terms_dict[term_text].append(parsed_classes) - #print(all_classes) - ##terms_dict[term_text].append(linked_text) - years[year_num] = terms_dict - i+=1 - else: #special case for nuc e program - all_text = table.find_all('p') - years = {} - text_elements = [] - for p in all_text[1:]: - p_text = p.text - p_text.replace("\u2019", "'") - if(p_text != "\u00a0"): - text_elements.append(p_text) - years["info"] = text_elements - dictionary[program] = years - #the second ul has all the required classes hyperlinked - driver.back() -driver.close() -#print(dictionary) - #driver.close() -json_obj = json.dumps(dictionary, indent=4) -with open("baccalaureate.json", "w") as outfile: - outfile.write(json_obj) - - - diff --git a/catalog.rpi.edu/calendar_scrapper.py b/catalog.rpi.edu/calendar_scrapper.py deleted file mode 100644 index 84a315b..0000000 --- a/catalog.rpi.edu/calendar_scrapper.py +++ /dev/null @@ -1,120 +0,0 @@ -""" -Note: This is currently a work in-progress. The main functionality -works. However, there are some inconsistencies not yet accounted for. - -Notice in the HTML of the table on the catalog website, there are some -dates/events in divs as well as ps. In addition, there may be multiple -within a div section. - -This portion will need to be resolved before the json is fully functional. 
-""" - - -import requests as rq, json -from bs4 import BeautifulSoup as bs - -# parse spring and fall data -def parse_table_rows(out, rows): - date = "" - - for row in rows: - # make sure row has contents - if len(row)==0: continue - # bold means it's a date - # if (row.findAll("em", recursive=True)): continue - if (row.findAll("strong", recursive=False)): - try: - data = row.text.strip() - # check if date/information are in same cell - data = data.split('\r') - if len(data) == 2: - date = data[0].strip() - # remove non-printable codes - date = date.encode("ascii","ignore") - date = date.decode() - # check if date already exists - if date not in out.keys(): - out[date] = [] - # gather the information for given date - information = data[1].strip() - # remove non-printable codes - information = information.encode("ascii","ignore") - information = information.decode() - # add gathered information to the list at that date - out[date].append(information) - else: - # there is not both date/information in the cell - date = data[0] - if date not in out.keys(): - out[date] = [] - except: - continue - # the text is not bold - this cell contains information about a date - else: - data = row.text.strip() - # remove non-printable codes - data = data.encode("ascii","ignore") - data = data.decode() - if date not in out.keys(): - # there will always be a date before first information - # the date will be the last cell date if not in same cell - out[date] = [] - out[date].append(data) - -# remove null items from each list -def check_data(out): - for value in out.keys(): - out[value] = list(filter(('').__ne__, out[value])) - - -calendar_website = "http://catalog.rpi.edu/content.php?catoid=22&navoid=528" -calendar_html = rq.get(calendar_website).text -soup = bs(calendar_html, "html.parser") - -# Find the correct table -tables = soup.find_all("td",{ - "style": "vertical-align:top; width:48%" -}) # get the two tables for fall and spring - -# first table found will be fall -table_fall = tables[0] -# second table found will be spring -table_spring = tables[1] - -# get data from inside the tables -rows_fall = table_fall.findAll(["p","div"], recursive = False) -rows_spring = table_spring.findAll(["p","div"], recursive = False) - -# initialize dictionaries -calendar_fall = {} -calendar_spring = {} - -# parse data for fall and spring -parse_table_rows(calendar_fall, rows_fall) -parse_table_rows(calendar_spring, rows_spring) - -# check the data before saving -check_data(calendar_fall) -check_data(calendar_spring) - -out = open("fall_calendar.json","w") -json.dump(calendar_fall, out) -out.close() - -out = open("spring_calendar.json", "w") -json.dump(calendar_spring, out) -out.close() - -# for testing purposes -if __name__ == "__main__": - # print contents of fall calendar - for date in calendar_fall: - value = date + ':' - try: print(value, calendar_fall[date]) - except: continue - - # print contents of spring calendar - for date in calendar_spring: - value = date + ':' - try: print(value, calendar_spring[date]) - except: continue diff --git a/catalog.rpi.edu/degrees_scrapper.py b/catalog.rpi.edu/degrees_scrapper.py deleted file mode 100644 index fee8e8c..0000000 --- a/catalog.rpi.edu/degrees_scrapper.py +++ /dev/null @@ -1,74 +0,0 @@ -import requests as rq, json, re -from bs4 import BeautifulSoup as bs - - -def parse_table_rows(out, rows): - school = "" - - for row in rows: - # There are multiple cells (sometimes) inside each row. 
- cells = row.findAll("td", recursive=False) - - # Rows with 5 cells are for individual degree rows. - if len(cells) == 5: - # Some rows are blank and only for spacing, so if this fails it's assmued - # it doesn't have data. - try: - degree = cells[0].span.text.strip() - types = cells[2].span.text.split(',') - types = [type_.strip() for type_ in types] - - # HEGIS (Higher Education General Information Survey) codes are used in - # NY to organize degrees by numbers. - hegis = cells[4].span.text.strip() - - out[school].append({ - "degree": degree, - "offered": types, - "hegis": hegis - }) - except: continue - else: - # Rows without 5 cells are for school headings. - - # There are sometimes multiple spans in a header, and only one has a - # tag with the school. Those without a strong tab aren't headers - # and are just there for correct spacing. - try: - strong = next(c.strong for c in cells if c.strong) - except: continue - - # Some schools (like school of engineering) have other tags like in - # there text, which shouldn't be taken out (using recursive=False). - school = strong.span.find(text=True, recursive=False).text.strip() - - # Schools need to be initialized otherwises errors occur. - out[school] = [] - - -################################################################################ - - -# Get HTML from website. -degrees_website = "http://catalog.rpi.edu/content.php?catoid=22&navoid=525" -degrees_html = rq.get(degrees_website).text -soup = bs(degrees_html, 'html.parser') - -# Find the table with all of the data in it. -table = soup.find_all("table", { - "border": "0", - "cellpadding": "0", - "cellspacing": "0", - "style": "border-collapse:collapse; height:2027px; width:863px" -})[0] # There is only one table exactly like this one. - -# The first row is just the table header. -table_rows = table.tbody.find_all("tr", recursive=False)[1:] - -# Parse the data. -data = {} -parse_table_rows(data, table_rows) - -# Output to file. -out = open("out.json", "w") -json.dump(data, out) \ No newline at end of file diff --git a/data_json_doc.md b/data_json_doc.md deleted file mode 100644 index 1f01ed0..0000000 --- a/data_json_doc.md +++ /dev/null @@ -1,100 +0,0 @@ -# Data Documentation - -The data for CRISIS are stored as a dictionary, where each key is a combination of a course's department and course number, in the form: `'DEPT-CRSE'`. A keys corresponding value is an object which holds several properties about the course. - -## Properties - -| Property | Description | Example | -|-|-|-| -| `"title"` | Course name. | `"CALCULUS I"` | -| `"crns"` | List of CRNs from this course's sections. | `[61326, 60309,...]` | -| `"department"` | Department code. | `"MATH"` | -| `"id_num"` | Course code. | `1010` | -| `"credits"` | Number of credits for the course. May be a range of values. | `"4.0"`, or `"1.0-4.0"` | -| `"ci"` | Whether this course is **c**ommunication **i**ntensive. | `false` | -| `"description"` | Course description. | `"Functions, limits, continuity, derivatives, implicit..."` | -| `"offered"` | When the course can be taken. | `"Fall and spring terms annually."` | -| `"prereq"` | List of prerequisite courses. | `["MATH 1000", "MATH 1001"]` | -| `"coreq"` | List of corequisite courses. | `["MATH 1002", "MATH 1003"]` | -| `"cross"` | List of cross-listed classes. | `["CSCI 0123", "PSYC 4567"]` | -| `"required_by"` | Contains course requirements for major, minor, etc. -| |`"minor"` List of minors which require this course. 
| `["MATH", "GSAS"]` | -| |`"major"` List of majors which require this course. | `["CSCI", "COGS"]` | -| | `"hass"` List of HASS pathways that require this course. | `["Artificial Intelligence", "Mind, Brain, and Intelligence"]` | -| `"transfer"` | A list of courses which can be redeemed when transferring to RPI. | | -| | `"school"` Name of the college. | `"Univ Texas Austin"` | -| | `"location"` Location of the college. | `"Texas"` | -| | `"title"` Title of course at the college. | `"INTRODUCTION TO PSYCHOLOGY"` | -| | `"id"` Course ID or code. | `"PSY 301"` | - -## Example Template -``` -"MATH-1010": { - "title": "CALCULUS I", - "department": "MATH", - "id_num": 1010, - "credits": "4.0", - "ci": false, - "description": "Functions, limits, continuity, derivatives, implicit differentiation, related rates, maxima and minima, elementary transcendental functions, introduction to definite integral with applications to area and volumes of revolution.", - "offered": "Fall and spring terms annually.", - "prereq": [ - "MATH 1000", - "MATH 1001" - ], - "coreq": [ - "MATH 1002", - "MATH 1003" - ], - "cross": [ - "CSCI 0123", - "PSYC 4567" - ], - "required-by": { - "major": [ - "MATH", - "GSAS" - ], - "minor": [ - "CSCI", - "COGS" - ], - "hass": [] - }, - "transfer": [ - { - "school": "Univ of Connecticut", - "location": "Connecticut", - "title": "CALCULUS I", - "id": "MATH 1131Q" - }, - { - "school": "Univ Of New Haven", - "location": "Connecticut", - "title": "CALCULUS I", - "id": "M 117" - } - ] -} -``` - -## Blank Template -``` -"NULL-0000": { - "title": "", - "department": "", - "id_num": 0000, - "credits": "0.0", - "ci": false, - "description": "", - "offered": "", - "prereq": [], - "coreq": [], - "cross": [], - "required-by": { - "major": [], - "minor": [], - "hass": [] - }, - "transfer": [] -} -``` diff --git a/docs/format.md b/docs/format.md new file mode 100644 index 0000000..54ff0a3 --- /dev/null +++ b/docs/format.md @@ -0,0 +1,119 @@ +# Data Documentation + +The data for CRISIS are stored as a json array, where each element is +an object with the following form: + +## Course Properties + +| Property | Description | Example | +|-----------------|-------------------------------------------------------------|-------------------------------------------------------------| +| `"title"` | Course name. | `"CALCULUS I"` | +| `"department"` | Department code. | `"MATH"` | +| `"id"` | Course code. | `1010` | +| `"credits"` | Number of credits for the course. May be a range of values. | `"4.0"`, or `"1.0-4.0"` | +| `"ci"` | Whether this course is **c**ommunication **i**ntensive. | `false` | +| `"description"` | Course description. | `"Functions, limits, continuity, derivatives, implicit..."` | +| `"offered"` | When the course can be taken. | `"Fall and spring terms annually."` | +| `"prereq"` | (TODO) Free-form string of prerequisite courses. | `"MATH 1000 and MATH 1001"` | +| `"coreq"` | (TODO) Free-form string of corequisite courses. | `"MATH 1002 and MATH 1003"` | +| `"cross"` | (TODO) Free-form string of cross-listed courses. | `"CSCI 0123 and PSYC 4567"` | +| `"sections"` | A list of section objects | See below | + +## Section Properties + +| Property | Description | Example | +|---------------|-----------------------------------------------------|-----------| +| `"crn"` | CRN of the section. | `"50039"` | +| `"section"` | Section number. | `"01"` | +| `"capacity"` | Total capacity of the section. | `30` | +| `"enrolled"` | Number of current students enrolled in the section. 
| `25` |
+| `"remaining"` | `capacity - enrolled` | `5` |
+| `"meetings"` | List of meeting objects | See below |
+
+## Meeting Properties
+
+| Property | Description | Example |
+|-----------------|----------------------------------------|-----------------------------------------|
+| `"time"` | Time table of the meeting. | `"2:00 pm - 3:20 pm"` |
+| `"days"` | Which days the meetings take place on. | `"MR"` |
+| `"location"` | Room number or online status. | `"Darrin Communications Center 308"` |
+| `"type"` | Lecture/recitation/exam etc. | `"Lecture"` |
+| `"instructors"` | Instructor(s) for the meeting. | `"Wesley D Turner, Shianne M. Hulbert"` |
+
+## Example Template
+
+```json
+[
+  {
+    "department": "CSCI",
+    "id": 1100,
+    "name": "COMPUTER SCIENCE I",
+    "description": "--description--",
+    "prereqs": "--prerequisites--",
+    "coreqs": "--corequisites--",
+    "crosslistings": "--crosslistings--",
+    "sections": [
+      {
+        "crn": "50039",
+        "section": "01",
+        "meetings": [
+          {
+            "time": "2:00 pm - 3:20 pm",
+            "days": "MR",
+            "location": "Online ",
+            "type": "Lecture",
+            "instructors": "Wesley D Turner, Shianne M. Hulbert"
+          },
+          {
+            "time": "10:00 am - 11:50 am",
+            "days": "T",
+            "location": "Low Center for Industrial Inn. 3116",
+            "type": "Lecture",
+            "instructors": "Wesley D Turner, Shianne M. Hulbert"
+          },
+          {
+            "time": "6:00 pm - 7:50 pm",
+            "days": "R",
+            "location": "Darrin Communications Center 308",
+            "type": "Lecture",
+            "instructors": "Wesley D Turner, Shianne M. Hulbert"
+          }
+        ],
+        "capacity": 30,
+        "enrolled": 25,
+        "remaining": 5
+      },
+      {
+        "crn": "50373",
+        "section": "02",
+        "meetings": [
+          {
+            "time": "2:00 pm - 3:20 pm",
+            "days": "MR",
+            "location": "Online ",
+            "type": "Lecture",
+            "instructors": "Wesley D Turner, Shianne M. Hulbert"
+          },
+          {
+            "time": "10:00 am - 11:50 am",
+            "days": "T",
+            "location": "Carnegie Building 208",
+            "type": "Lecture",
+            "instructors": "Wesley D Turner, Shianne M. Hulbert"
+          },
+          {
+            "time": "6:00 pm - 7:50 pm",
+            "days": "R",
+            "location": "Darrin Communications Center 308",
+            "type": "Lecture",
+            "instructors": "Wesley D Turner, Shianne M.
Hulbert" + } + ], + "capacity": 36, + "enrolled": 29, + "remaining": 7 + } + ] + } +] +``` diff --git a/minor_scraper.py b/minor_scraper.py deleted file mode 100644 index 0edd48e..0000000 --- a/minor_scraper.py +++ /dev/null @@ -1,119 +0,0 @@ -from selenium import webdriver -from bs4 import BeautifulSoup -import json - -op = webdriver.ChromeOptions() -op.add_argument('headless') - -PATH = "C:\Program Files (x86)\chromedriver.exe" #path on your machine -driver = webdriver.Chrome(PATH,options=op) - -url = "http://catalog.rpi.edu/content.php?catoid=22&navoid=542" - -driver.get(url) - -source_first = driver.page_source -soup_first = BeautifulSoup(source_first, "html.parser") - -table = soup_first.find_all('ul', {'class': 'program-list'})[6] -#this is the ul for the pathways - -table_li = table.find_all('li') - -program_names = [] -program_links = [] -for li in table_li: #grab the text and the link in the li element - program_names.append(li.find('a').text.strip()) # must strip trailing spaces - program_links.append(li.find('a')['href']) - -bug_programs = [] -dictionary = {} -for i in range(len(program_names)): - program = program_names[i] - if program not in bug_programs: - print(program) - dictionary[program] = {} - #click on the link for the minor - driver.find_element_by_link_text(program).click() - - source_link = driver.page_source - soup_link = BeautifulSoup(source_link, "html.parser") - - table = soup_link.find('table', {'class': 'table_default'}) - #all the categories are stored in acalog core divs - divs = table.find_all('div', {'class': 'acalog-core'}) - #now we are in the right program - - for div in divs: - #each div contains a different category of information - classes = [] - #the div always has text but some dont have h2 tags - div_text = div.text - if(div.find('h2')): - div_text = div.find('h2').text - #parse the div string - div_text = div_text.replace('\ua00a0', '') - div_text = div_text.replace('\u200b', '') - div_text = div_text.replace('\n', '') - div_text = div_text.replace('\u2013', '') - #find the list of classes - class_list = div.find('ul') - if(class_list): #the list exists - for li in class_list.find_all('li'): - #parse the string - text = li.text - text = text.replace('\u00a0', '') - text = text.replace('\u2013', ' -') - text = text.replace('\n', '') - if text != "": - classes.append(text) - elif(len(div.find_all('p')) > 0): - #if theres no list than theres a paragraph - p_list = div.find_all('p') - for p in p_list: - #parse the paragraph - text = p.text - text = text.replace('\u00a0', '') - text = text.replace('\u200b', '') - text = text.replace('\n', '') - if text != "": - classes.append(text) - elif(div.find('ol')): #there could be an ordered list - o_list = div.find('ol') - for li in o_list.find_all('li'): - text = li.text - text = text.replace('\u00a0', '') - text = text.replace('\u2013', ' -') - text = text.replace('\n', '') - classes.append(text) - if(len(classes) > 0): - dictionary[program][div_text] = classes - if(len(divs) == 0): #some minors do not have any divs and are just text - #we need to move to the right table - tables = soup_link.find_all('table', {'class': 'table_default'}) - table = tables[len(tables)-2] - classes = [] - p_list = table.find_all('p') - ul = table.find('ul') - if(p_list) : - for p in p_list: - text = p.text - text = text.replace('\u00a0', '') - text = text.replace('\u200b', '') - text = text.replace('\n', '') - classes.append(text) - if(ul): - for li in ul.find_all('li'): - text = li.text - text = text.replace('\u00a0', '') - 
text = text.replace('\u2013', ' -') - text = text.replace('\n', '') - classes.append("- " + text) - if (len(classes) > 0): - dictionary[program]["Information"] = classes[1:] - driver.back() - -json_obj = json.dumps(dictionary, indent=4) -with open("minors.json", "w") as outfile: - outfile.write(json_obj) -driver.close() \ No newline at end of file diff --git a/pathway_scraper.py b/pathway_scraper.py deleted file mode 100644 index ad08500..0000000 --- a/pathway_scraper.py +++ /dev/null @@ -1,90 +0,0 @@ -from selenium import webdriver -from bs4 import BeautifulSoup -import json - -op = webdriver.ChromeOptions() -op.add_argument('headless') - -PATH = "C:\Program Files (x86)\chromedriver.exe" #path on your machine -driver = webdriver.Chrome(PATH,options=op) - -url = "http://catalog.rpi.edu/content.php?catoid=22&navoid=542" - -driver.get(url) - -source_first = driver.page_source -soup_first = BeautifulSoup(source_first, "html.parser") - -table = soup_first.find_all('ul', {'class': 'program-list'})[4] -#this is the ul for the pathways - -table_li = table.find_all('li') - -program_names = [] -program_links = [] -for li in table_li: #grab the text and the link in the li element - program_names.append(li.find('a').text.strip()) # must strip trailing spaces - program_links.append(li.find('a')['href']) - -dictionary = {} -bug_programs = [] -for i in range(len(program_names)): - program = program_names[i] - if program not in bug_programs: - print(program) - dictionary[program] = {} - # a few of the integrative pathways also have links before - # so if you click Economics it actually clicks the wrong link - duplicate_programs = ["Economics", "Electronic Arts", "Philosophy","Science, Technology, and Society", - "Design, Innovation, and Society"] - if(program in duplicate_programs): - driver.find_elements_by_link_text(program)[1].click() - else: - driver.find_element_by_link_text(program).click() - source_link = driver.page_source - soup_link = BeautifulSoup(source_link, "html.parser") - - table = soup_link.find('table', {'class': 'table_default'}) - #all the categories are stored in acalog core divs - divs = table.find_all('div', {'class': 'acalog-core'}) - #now we are in the right program - - for div in divs: - #each div contains a different category of information - classes = [] - #the div always has text but some dont have h2 tags - div_text = div.text - if(div.find('h2')): - div_text = div.find('h2').text - #parse the div string - div_text = div_text.replace('\ua00a0', '') - div_text = div_text.replace('\u200b', '') - div_text = div_text.replace('\n', '') - div_text = div_text.replace('\u2013', '') - #find the list of classes - class_list = div.find('ul') - if(class_list): #the list exists - for li in class_list.find_all('li'): - #parse the string - text = li.text - text = text.replace('\u00a0', '') - text = text.replace('\u2013', ' -') - text = text.replace('\n', '') - classes.append(text) - else: - #if theres no list than theres a paragraph - p_list = div.find_all('p') - for p in p_list: - #parse the paragraph - text = p.text - text = text.replace('\u00a0', '') - text = text.replace('\u200b', '') - text = text.replace('\n', '') - classes.append(text) - dictionary[program][div_text] = classes - driver.back() - -json_obj = json.dumps(dictionary, indent=4) -with open("pathways.json", "w") as outfile: - outfile.write(json_obj) -driver.close() diff --git a/quacs/quacs_scrapper.py b/quacs/quacs_scrapper.py deleted file mode 100644 index 493403e..0000000 --- a/quacs/quacs_scrapper.py +++ /dev/null @@ 
-1,238 +0,0 @@ -import requests as rq -import json -import re - -# Gets QuACS data and parses into a format that CRISIS can use. - -# NOTE: Some classes will ONLY have title, department, id_num, and desciprtion -# because courses.json and catalog.json don't always have the same classes. - - -# Helper splice function -def splice(string, index, char): - return string[:index] + char + string[(index + 1):] - - -# courses.json ################################################################# - - -# Takes each of the courses in courses.json and sorts them instead by class. -def parse_majors(out, data): - # Go and parse through each major. - for major in data: - parse_courses(out, major["courses"]) - - -# Helper function to sort courses found within a specific major. -def parse_courses(out, courses): - for course in courses: - credits, CRNs = parse_sections(course) - # IDs are represented as four digit numbers, so '12' should be instead be - # '0012'. - id = "{:04d}".format(course["crse"]) - # This will make it very simple to sort by class. - name = course["subj"] + "-" + id - - out[name] = { - "title": course["title"], - "crns": CRNs, - "credits": credits, - "department": course["subj"], - "id_num": id, - # This will be filled out by transfer.json. - "transfer": [], - # It is assumed false, and hass_pathways.json will correct all that are. - "ci": "false", - # This will be further by other sources. - "required_by": { - "major": [], - "minor": [], - "hass": [] - } - } - - -# Helper function to find the CRNs and credits a course. -def parse_sections(course): - CRNs = [] - # Credits should be the same for each class, so it doesn't matter which class - # to get the credits from. - credMax = course["sections"][0]["credMax"] - credMin = course["sections"][0]["credMax"] - - credits = "" - # If classes can be taken for different amounts of credits, represent credits - # as a range. - if credMax != credMin: - credits = str(credMin) + '-' + str(credMax) - else: - credits = str(credMax) - - # Get CRNs - for section in course["sections"]: - CRNs.append(section["crn"]) - - return credits, CRNs - - -# catalog.json ################################################################# - - -# Takes takes the descriptions of classes and puts them with the result. -def parse_descriptions(out, data): - for key, value in data.items(): - # Some classes exist in catalog.json but not courses.json, - if key in out: - out[key]['description'] = value['description'] - else: - # in which case all available data from catalog.json will be used to repr- - # esent the class. - out[key] = { - "department": value['subj'], - "id_num": value['crse'], - "title": value['name'], - "description": value['description'], - # This will be filled out by transfer.json. - "transfer": [], - # It is assumed false, and hass_pathways.json will correct all that are. - "ci": "false", - # This will be further by other sources. - "required_by": { - "major": [], - "minor": [], - "hass": [] - } - } - - -# prerequisites.json ########################################################### - - -# Given a CRN, it finds its respective class -def get_class_from_crn(CRN, data): - for key, value in data.items(): - if ('crns' in value) and \ - (int(CRN) in value['crns']): - return key - return None - - -# NOTE: This function flattens the requirement tree, only giving you a set of -# all classes present in it. 
-def get_prereqs(data): - switch = data["type"] - if switch == "course": - return [data["course"]] - elif (switch == "or") or (switch == "and"): - result = [] - for x in data["nested"]: result += get_prereqs(x) - return [*set(result)] - else: print("Error: cannot parse type: " + switch) - - -# Adds prerequisite data into the CRISIS data -def parse_prereqs(out, data): - for key, value in data.items(): - # If a CRN doesn't have any prerequisites, why go through the trouble to get - # its class? - if value and ("prerequisites" in value): - course = get_class_from_crn(key, out) - # If a CRN doesn't have a defined class, forget about it. - if course: - out[course]["prereq"] = get_prereqs(value["prerequisites"]) - - -# transfer.json ################################################################ - - -# Adds transfer courses to the data. -def parse_tranfers(out, data): - for key, courses in data.items(): - class_name = splice(key, 4, '-') - - # Classes wich were not already documented are ignored. - if class_name not in out: continue - - for course in courses: - for item in course["transfer"]: - transfer = { - "school": course["school_name"], - "location": course["location"] - } - - # Some transfer courses don't have name or id specified. - if "name" in item: transfer["title"] = item["name"] - if "id" in item: transfer["id"] = item["id"] - - out[class_name]["transfer"].append(transfer) - - -# hass_pathways.json ########################################################### - - -# Parses each required class string in hass_pathways.json -# NOTE: this is not perfect, as certain expressions like "IHSS 19XX" are not -# understood. -def parse_required_courses(out, string, hass): - regex = r"[A-Z]{4} \d{4}" - matches = re.findall(regex, string) - - for match in matches: - # We want 'XXXX-YYYY' not 'XXXX YYYY' - match = splice(match, 4, '-') - - if match in out: - out[match]["required_by"]["hass"].append(hass) - - -# Parses the pathways in hass_pathways.json -# NOTE: this does not take into account sets of classes, where only some -# combination needs to be taken, but not each one specifically. 
-def parse_pathways(out, data): - for hass, desc in data.items(): - if "required" not in desc: continue - - for required_class in desc["required"]: - parse_required_courses(out, required_class, hass) - - -################################################################################ - - -# Get courses.json -courses = "https://raw.githubusercontent.com/quacs/quacs-data/master/semester_data/202201/courses.json" -courses_json = rq.get(courses).text -courses_data = json.loads(courses_json) - -# Get catalog.json -catalog = "https://raw.githubusercontent.com/quacs/quacs-data/master/semester_data/202201/catalog.json" -catalog_json = rq.get(catalog).text -catalog_data = json.loads(catalog_json) - -# Get prerequisites.json -prereqs = "https://raw.githubusercontent.com/quacs/quacs-data/master/semester_data/202201/prerequisites.json" -prereqs_json = rq.get(prereqs).text -prereqs_data = json.loads(prereqs_json) - -# Get transfer.json -transfer = "https://raw.githubusercontent.com/quacs/quacs-data/master/transfer.json" -transfer_json = rq.get(transfer).text -transfer_data = json.loads(transfer_json) - -# Get hass_pathways.json -pathways = "https://raw.githubusercontent.com/quacs/quacs-data/master/hass_pathways.json" -pathways_json = rq.get(pathways).text -pathways_data = json.loads(pathways_json) - - -data = {} -# Parse data -parse_majors(data, courses_data) -parse_descriptions(data, catalog_data) -parse_prereqs(data, prereqs_data) -parse_tranfers(data, transfer_data) -parse_pathways(data, pathways_data) - - -with open("out.json", "w") as out: - json.dump(data, out) \ No newline at end of file diff --git a/quacs/quacs_scrapping.txt b/quacs/quacs_scrapping.txt deleted file mode 100644 index 4a1480f..0000000 --- a/quacs/quacs_scrapping.txt +++ /dev/null @@ -1,135 +0,0 @@ -How to scrape from https://github.com/quacs/quacs-data: - -//=| ./semester_data/`YEAR`__/... |=// - -//==>> catalog.json - -{ - `name`: { - "subj": `SUBJECT` -> "department" - "crse": `COURSE ID` -> "id_num" - "name": `COURSE NAME` -> "title" - "description": `COURSE DESCRIPTION` -> "description" - } - ... -} - -//==>> courses.json - -[ - { - "name": PROGRAM NAME -> - "code": PROGRAM CODE - "courses": [ - { - "crse": COURSE ID - "sections": [ - "crn": CRN CODE - { - "credMin": MINIMUM CREDIT ALLOWED -\ - "credMax": MAXIMUM CREDIT ALLOWED -+-> "credits" - } - ... - ] - } - ... - ] - } - ... -] - -//==>> prerequisites.json - -{ - `CLASS CRN`: { - "prerequisites": { - "type": "course" || "or" || "and" - "nested": [] || "course": SUBJECT-ID - } - "field_of_study": { - "must_be": { - FULL NAME - } - } - "restrictions": { - "level": { - "must_be": [ - "Graduate" - ] - } - "major": { - "must_be": [ - FULL NAME - ] - } - "classification": { - "must_be": { - "Freshman" || - "Sophomore" || - "Junior" || - "Senior" || - "Graduate Student" || - } - "may_not_be": { - "Freshman" || - "Sophomore" || - "Junior" || - "Senior" || - "Graduate Student" - } - } - } - } - ... -} - -//==>> hass_pathways.json - -{ - `PATHWAY NAME`: { - "desription" - - } - ... -} - -How to store the data that we get: -[ - { - "title": - - "crns": [`CRN`...] - "department": - - "id_num": - - "credits": `cred` || 'credMin'-'credMax' - "ci": false, - "description": - - "offered": - - "prereq": [ - "`SUBJECT` `ID`" - ... - ], - "coreq": [ - "`SUBJECT` `ID`" - ... - ], - "cross": [ - "`SUBJECT` `ID`" - ... - ], - "required-by": { - "major": [`SUBJECT`...] - "minor": [`SUBJECT`...] - "hass": [`SUBJECT`...] 
- }, - "transfer": [ - { - "school": - - "location": - - "title": - - "id": - - } - ... - ] - } - ... -] \ No newline at end of file diff --git a/sis.rpi.edu/course_scrapper.py b/src/course.py similarity index 100% rename from sis.rpi.edu/course_scrapper.py rename to src/course.py From 243b695a2d6e5875d4720b3727822b55e3c474ce Mon Sep 17 00:00:00 2001 From: Erica Nazimowitz <98502343+nazime1@users.noreply.github.com> Date: Fri, 18 Feb 2022 16:37:22 -0500 Subject: [PATCH 05/17] Delete scraper_practice2.py --- scraper_practice2.py | 34 ---------------------------------- 1 file changed, 34 deletions(-) delete mode 100644 scraper_practice2.py diff --git a/scraper_practice2.py b/scraper_practice2.py deleted file mode 100644 index c38d359..0000000 --- a/scraper_practice2.py +++ /dev/null @@ -1,34 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Fri Feb 11 16:36:13 2022 - -@author: nazime -""" - -import requests -from bs4 import BeautifulSoup -import csv - -URL = "http://www.values.com/inspirational-quotes/" -r = requests.get(URL) -soup = BeautifulSoup(r.content, 'html5lib') -quotes=[] # a list to store quotes - -table = soup.find('div', attrs = {'id':'all_quotes'}) - -for row in table.findAll('div', - attrs = {'class':'col-6 col-lg-3 text-center margin-30px-bottom sm-margin-30px-top'}): - quote = {} - quote['theme'] = row.h5.text - quote['url'] = row.a['href'] - quote['img'] = row.img['src'] - quote['lines'] = row.img['alt'].split(" #")[0] - quote['author'] = row.img['alt'].split(" #")[1] - quotes.append(quote) - -filename = 'inspirational_quotes.csv' -with open(filename, 'w', newline='') as f: - w = csv.DictWriter(f,['theme','url','img','lines','author']) - w.writeheader() - for quote in quotes: - w.writerow(quote) \ No newline at end of file From bd7751da6388c69e008b5a61585aedb5d476a959 Mon Sep 17 00:00:00 2001 From: Erica Nazimowitz <98502343+nazime1@users.noreply.github.com> Date: Fri, 18 Feb 2022 16:37:32 -0500 Subject: [PATCH 06/17] Delete scraper_practice.py --- scraper_practice.py | 45 --------------------------------------------- 1 file changed, 45 deletions(-) delete mode 100644 scraper_practice.py diff --git a/scraper_practice.py b/scraper_practice.py deleted file mode 100644 index 6620c6d..0000000 --- a/scraper_practice.py +++ /dev/null @@ -1,45 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Fri Feb 11 16:02:40 2022 - -@author: nazime -""" - -import requests -from bs4 import BeautifulSoup - -URL = "https://realpython.github.io/fake-jobs/" -page = requests.get(URL) - -soup = BeautifulSoup(page.content, "html.parser") -results = soup.find(id="ResultsContainer") -print(results.prettify()) -job_elements = results.find_all("div", class_="card-content") -for job_element in job_elements: - title_element = job_element.find("h2", class_="title") - company_element = job_element.find("h3", class_="company") - location_element = job_element.find("p", class_="location") - print(title_element.text.strip()) - print(company_element.text.strip()) - print(location_element.text.strip()) - print() -python_jobs = results.find_all("h2", string="Python") -python_jobs = results.find_all( - "h2", string=lambda text: "python" in text.lower() -) -python_job_elements = [ - h2_element.parent.parent.parent for h2_element in python_jobs -] - -for job_element in python_job_elements: - title_element = job_element.find("h2", class_="title") - company_element = job_element.find("h3", class_="company") - location_element = job_element.find("p", class_="location") - print(title_element.text.strip()) - 
print(company_element.text.strip()) - print(location_element.text.strip()) - print() - links = job_element.find_all("a") - for link in links: - link_url = link["href"] - print(f"Apply here: {link_url}\n") \ No newline at end of file From 0d1e382ab73b129f9dcd29c8824500582adb53a3 Mon Sep 17 00:00:00 2001 From: Erica Nazimowitz <98502343+nazime1@users.noreply.github.com> Date: Fri, 18 Feb 2022 16:38:18 -0500 Subject: [PATCH 07/17] Update python-app.yml --- .github/workflows/python-app.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index b0e3cbc..4cc4b55 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -31,7 +31,7 @@ jobs: # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - name: execute py script - run: python scraper_practice2.py + run: for f in *.py; do python "$f"; done - name: commit files run: | From ac00d07ee3bc8401e95d53fd0d8ffc5014fdec84 Mon Sep 17 00:00:00 2001 From: Erica Nazimowitz <98502343+nazime1@users.noreply.github.com> Date: Fri, 25 Feb 2022 17:17:37 -0500 Subject: [PATCH 08/17] Update python-app.yml Should have fixed the error with not scraping in the wrong place, also added working directory in order to upload to the data repository --- .github/workflows/python-app.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 4cc4b55..4058beb 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -31,9 +31,10 @@ jobs: # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - name: execute py script - run: for f in *.py; do python "$f"; done + run: for f in src/*.py; do python "$f"; done - name: commit files + working-directory: ./rpi-crisis/data run: | git config --local user.email "action@github.com" git config --local user.name "GitHub Action From b2fbd96904fed414b0d99771bc53c6ea6dc2bfa7 Mon Sep 17 00:00:00 2001 From: Erica Nazimowitz <98502343+nazime1@users.noreply.github.com> Date: Fri, 25 Feb 2022 17:20:02 -0500 Subject: [PATCH 09/17] Delete workflows directory --- workflows/github-actions-demo.yml | 17 ----------------- 1 file changed, 17 deletions(-) delete mode 100644 workflows/github-actions-demo.yml diff --git a/workflows/github-actions-demo.yml b/workflows/github-actions-demo.yml deleted file mode 100644 index 2af4ce3..0000000 --- a/workflows/github-actions-demo.yml +++ /dev/null @@ -1,17 +0,0 @@ -name: GitHub Actions Demo -on: [push] -jobs: - Explore-GitHub-Actions: - runs-on: ubuntu-latest - steps: - - run: echo "πŸŽ‰ The job was automatically triggered by a ${{ github.event_name }} event." - - run: echo "🐧 This job is now running on a ${{ runner.os }} server hosted by GitHub!" - - run: echo "πŸ”Ž The name of your branch is ${{ github.ref }} and your repository is ${{ github.repository }}." - - name: Check out repository code - uses: actions/checkout@v2 - - run: echo "πŸ’‘ The ${{ github.repository }} repository has been cloned to the runner." - - run: echo "πŸ–₯️ The workflow is now ready to test your code on the runner." - - name: List files in the repository - run: | - ls ${{ github.workspace }} - - run: echo "🍏 This job's status is ${{ job.status }}." 
\ No newline at end of file From e0782fa74376c4de7023916f06c6fff109028fc9 Mon Sep 17 00:00:00 2001 From: Trevor Brunette Date: Mon, 28 Feb 2022 14:49:13 -0500 Subject: [PATCH 10/17] Add pip installs for requests and bs4 --- .github/workflows/python-app.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 4058beb..b30a0cc 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -24,7 +24,7 @@ jobs: python -m pip install --upgrade pip pip install flake8 pytest if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - - name: Lint with flake8 + - name: Lint with flake8 requests bs4 run: | # stop the build if there are Python syntax errors or undefined names flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics @@ -37,7 +37,7 @@ jobs: working-directory: ./rpi-crisis/data run: | git config --local user.email "action@github.com" - git config --local user.name "GitHub Action + git config --local user.name "GitHub Action" git add -A git commit -m "update data" -a From f7169b7bac2f22e6e976a20c5d290fc278087488 Mon Sep 17 00:00:00 2001 From: Trevor Brunette Date: Mon, 28 Feb 2022 14:52:53 -0500 Subject: [PATCH 11/17] Update python-app.yml --- .github/workflows/python-app.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index b30a0cc..a23dabe 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -22,9 +22,9 @@ jobs: - name: install python packages run: | python -m pip install --upgrade pip - pip install flake8 pytest + pip install flake8 pytest requests bs4 if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - - name: Lint with flake8 requests bs4 + - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names flake8 . 
--count --select=E9,F63,F7,F82 --show-source --statistics From 2782fa4f99d735e5096cc0e55f2fd0a467db81db Mon Sep 17 00:00:00 2001 From: Trevor Brunette Date: Mon, 28 Feb 2022 14:58:11 -0500 Subject: [PATCH 12/17] Add pip install html5lib --- .github/workflows/python-app.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index a23dabe..ffa4e2a 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -22,7 +22,7 @@ jobs: - name: install python packages run: | python -m pip install --upgrade pip - pip install flake8 pytest requests bs4 + pip install flake8 pytest requests bs4 html5lib if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - name: Lint with flake8 run: | From 12f37303c421f97aa6f6a07d50ac09fb4590c5a0 Mon Sep 17 00:00:00 2001 From: Erica Nazimowitz <98502343+nazime1@users.noreply.github.com> Date: Tue, 1 Mar 2022 16:36:22 -0500 Subject: [PATCH 13/17] Update python-app.yml Trying again with pushing to data repository --- .github/workflows/python-app.yml | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index ffa4e2a..d73cf60 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -1,5 +1,5 @@ -# This workflow will install Python dependencies, run tests and lint with a single version of Python -# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions +# This workflow will install Python dependencies, run all scraper files and then +# push the files to the data directory. name: auto scraper @@ -33,16 +33,17 @@ jobs: - name: execute py script run: for f in src/*.py; do python "$f"; done - - name: commit files - working-directory: ./rpi-crisis/data - run: | - git config --local user.email "action@github.com" - git config --local user.name "GitHub Action" - git add -A - git commit -m "update data" -a - - - name: push changes - uses: ad-m/github-push-action@v0.6.0 + - name: push to data directory + id: push_directory + uses: cpina/github-action-push-to-another-repository@main + env: + API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }} with: - github_token: ${{ secrets.GITHUB_TOKEN }} - branch: main + source-directory: rpi-crisis/scraper + destination-github-username: 'nazime1' + destination-repository-name: 'rpi-crisis/data' + user-email: nazime@rpi.edu + commit-message: Data scraped from the various scrapers + target-branch: main + - name: Test get variable exported by push-to-data-directory + run: echo $DESTINATION_CLONED_DIRECTORY From 40f415418e33f4478c6503b0478ebf0add07653b Mon Sep 17 00:00:00 2001 From: Erica Nazimowitz <98502343+nazime1@users.noreply.github.com> Date: Fri, 18 Mar 2022 16:20:04 -0400 Subject: [PATCH 14/17] Update python-app.yml --- .github/workflows/python-app.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index d73cf60..323e80a 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -40,8 +40,8 @@ jobs: API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }} with: source-directory: rpi-crisis/scraper - destination-github-username: 'nazime1' - destination-repository-name: 'rpi-crisis/data' + destination-github-username: 'rpi-crisis' + destination-repository-name: 'data' user-email: nazime@rpi.edu 
commit-message: Data scraped from the various scrapers target-branch: main From fa99d17ac61ebeed0e4cbcc8adcb69fad3ad341d Mon Sep 17 00:00:00 2001 From: Erica Nazimowitz <98502343+nazime1@users.noreply.github.com> Date: Fri, 18 Mar 2022 17:19:55 -0400 Subject: [PATCH 15/17] Update python-app.yml --- .github/workflows/python-app.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 323e80a..0f7c428 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -39,7 +39,7 @@ jobs: env: API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }} with: - source-directory: rpi-crisis/scraper + source-directory: src destination-github-username: 'rpi-crisis' destination-repository-name: 'data' user-email: nazime@rpi.edu From d169ca6fd79f09c05f4608680b67b3d2d5bebec7 Mon Sep 17 00:00:00 2001 From: Erica Nazimowitz <98502343+nazime1@users.noreply.github.com> Date: Tue, 22 Mar 2022 16:44:30 -0400 Subject: [PATCH 16/17] Update python-app.yml --- .github/workflows/python-app.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 0f7c428..33947a1 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -42,7 +42,7 @@ jobs: source-directory: src destination-github-username: 'rpi-crisis' destination-repository-name: 'data' - user-email: nazime@rpi.edu + user-email: bot@rpicrisis.org commit-message: Data scraped from the various scrapers target-branch: main - name: Test get variable exported by push-to-data-directory From 7e8309d591ce236bf54382b45b38533678a6d121 Mon Sep 17 00:00:00 2001 From: Erica Nazimowitz Date: Fri, 25 Mar 2022 15:55:32 -0400 Subject: [PATCH 17/17] Data scraped --- catalog.rpi.edu/fall_calendar.json | 1 + catalog.rpi.edu/out.json | 1 + catalog.rpi.edu/spring_calendar.json | 1 + 3 files changed, 3 insertions(+) create mode 100644 catalog.rpi.edu/fall_calendar.json create mode 100644 catalog.rpi.edu/out.json create mode 100644 catalog.rpi.edu/spring_calendar.json diff --git a/catalog.rpi.edu/fall_calendar.json b/catalog.rpi.edu/fall_calendar.json new file mode 100644 index 0000000..87adb6a --- /dev/null +++ b/catalog.rpi.edu/fall_calendar.json @@ -0,0 +1 @@ +{"August 6": ["Fall tuition and fees due."], "August 27": ["Official date of August graduation; diplomas mailed to students after final clearance is completed in September. Degree recipients may take part in the May 2022Commencement ceremony.", "Residence dining halls open with dinner. Residence halls and apartments open for upperclass and new graduate students."], "August 30": ["Fall 2021 semester classes begin."], "September 6": ["Labor Day - no classes."], "September 7": ["Classes resume. Follow a Monday schedule."], "September 13": ["Last day for graduate and undergraduate students to add courses, change sections or to put courses on audit. 
From 7e8309d591ce236bf54382b45b38533678a6d121 Mon Sep 17 00:00:00 2001
From: Erica Nazimowitz
Date: Fri, 25 Mar 2022 15:55:32 -0400
Subject: [PATCH 17/17] Data scraped

---
 catalog.rpi.edu/fall_calendar.json   | 1 +
 catalog.rpi.edu/out.json             | 1 +
 catalog.rpi.edu/spring_calendar.json | 1 +
 3 files changed, 3 insertions(+)
 create mode 100644 catalog.rpi.edu/fall_calendar.json
 create mode 100644 catalog.rpi.edu/out.json
 create mode 100644 catalog.rpi.edu/spring_calendar.json

diff --git a/catalog.rpi.edu/fall_calendar.json b/catalog.rpi.edu/fall_calendar.json
new file mode 100644
index 0000000..87adb6a
--- /dev/null
+++ b/catalog.rpi.edu/fall_calendar.json
@@ -0,0 +1 @@
+{"August 6": ["Fall tuition and fees due."], "August 27": ["Official date of August graduation; diplomas mailed to students after final clearance is completed in September. Degree recipients may take part in the May 2022 Commencement ceremony.", "Residence dining halls open with dinner. Residence halls and apartments open for upperclass and new graduate students."], "August 30": ["Fall 2021 semester classes begin."], "September 6": ["Labor Day - no classes."], "September 7": ["Classes resume. Follow a Monday schedule."], "September 13": ["Last day for graduate and undergraduate students to add courses, change sections or to put courses on audit. Deadline for completion of NE/I grade expectations related to Spring 2021 courses."], "September 17": ["Nomination of Masters Thesis Committee forms due to the Office of Graduate Education for December graduates."], "October 8": ["Last day to file an online degree application via SIS for December 31, 2021 graduation."], "October 8 - October 9": ["Reunion and Homecoming 2021."], "October 11": ["Columbus Day - no classes."], "October 12": ["Classes resume."], "October 22": ["Last day for undergraduate and graduate students to drop a course."], "October 25 - November 5": ["Consultation weeks. Advisement for Spring 2022 registration. Students should consult with their faculty advisers."], "October 29": ["Doctoral dissertations due to advisers."], "November 5": ["Masters thesis and Engineering projects due to advisers."], "November 8 - November 22": ["Pre-registration for the Spring 2022 semester opens for currently enrolled students."], "November 12": ["Last day for undergraduates to add or remove Pass/No Credit designation."], "November 15": ["Masters theses due in the Office of Graduate Education. Last day to defend doctoral dissertations."], "November 24 - November 26": ["Thanksgiving break - no classes. Dining halls closed."], "November 28": ["Dining halls reopen for dinner."], "November 29": ["Classes resume. Doctoral dissertations due in the Office of Graduate Education."], "December 10": ["Last day of classes. Deadline for completion of NE/I grade expectations related to Summer 2021 courses."], "December 11 - December 14": ["Reading/Study days. Instructors can schedule no exams nor require any student work expectations on these days."], "December 15 - December 21": ["Final Examinations."], "December 15": ["Registration add/drop reopens for the Spring 2022 term."], "December 25 - January 1": ["Holiday winter break, Institute is closed."], "December 31": ["Official date of December graduation; diplomas mailed to students after final clearance is completed in January. Degree recipients may take part in the May 2022 Commencement ceremony."]}
\ No newline at end of file
diff --git a/catalog.rpi.edu/out.json b/catalog.rpi.edu/out.json
new file mode 100644
index 0000000..102942c
--- /dev/null
+++ b/catalog.rpi.edu/out.json
@@ -0,0 +1 @@
+{"School of Architecture": [{"degree": "Architecture", "offered": ["B.Arch.", "M.Arch.", "M.S."], "hegis": "0202"}, {"degree": "Architectural Sciences", "offered": ["M.S.", "Ph.D."], "hegis": "0202"}, {"degree": "Building Sciences", "offered": ["B.S."], "hegis": "0202"}, {"degree": "Lighting", "offered": ["M.S."], "hegis": "0299"}], "School of Engineering": [{"degree": "Aeronautical Engineering", "offered": ["B.S.", "M.Eng.", "M.S.", "D.Eng.", "Ph.D."], "hegis": "0902"}, {"degree": "Biomedical Engineering", "offered": ["B.S.", "M.Eng.", "M.S.", "D.Eng.", "Ph.D."], "hegis": "0905"}, {"degree": "Chemical Engineering", "offered": ["B.S.", "M.Eng.", "M.S.", "D.Eng.", "Ph.D."], "hegis": "0906"}, {"degree": "Civil Engineering", "offered": ["B.S.", "M.Eng.", "M.S.", "D.Eng.", "Ph.D."], "hegis": "0908"}, {"degree": "Computer and Systems Engineering", "offered": ["B.S.", "M.Eng.", "M.S.", "D.Eng.", "Ph.D."], "hegis": "0999"}, {"degree": "Decision Sciences and Engineering Systems", "offered": ["Ph.D."], "hegis": "0913"}, {"degree": "Electrical Engineering", "offered": ["B.S.", "M.Eng.", "M.S.", "D.Eng.", "Ph.D."], "hegis": "0909"}, {"degree": "Engineering Physics", "offered": ["M.S.", "D.Eng.", "Ph.D."], "hegis": "0919"}, {"degree": "Engineering Science", "offered": ["B.S.", "M.S.", "Ph.D."], "hegis": "0901"}, {"degree": "Environmental Engineering", "offered": ["B.S.", "M.Eng.", "M.S.", "D.Eng.", "Ph.D."], "hegis": "0922"}, {"degree": "Industrial and Management Engineering", "offered": ["B.S.", "M.Eng.", "M.S."], "hegis": "0913"}, {"degree": "Materials Engineering", "offered": ["B.S.", "M.Eng.", "M.S.", "D.Eng.", "Ph.D."], "hegis": "0915"}, {"degree": "Mechanical Engineering", "offered": ["B.S.", "M.Eng.", "M.S.", "D.Eng.", "Ph.D."], "hegis": "0910"}, {"degree": "Nuclear Engineering", "offered": ["B.S.", "M.Eng.", "M.S.", "D.Eng."], "hegis": "0920"}, {"degree": "Nuclear Engineering and Science", "offered": ["Ph.D."], "hegis": "0920"}, {"degree": "Systems Engineering and Technology Management", "offered": ["M.E."], "hegis": "0913"}, {"degree": "Transportation Engineering", "offered": ["M.Eng.", "M.S.", "D.Eng.", "Ph.D."], "hegis": "0908"}], "School of Humanities, Arts, and Social Sciences": [{"degree": "Cognitive Science", "offered": ["B.S.", "M.S.", "Ph.D."], "hegis": "0499"}, {"degree": "Communication, Media, and Design", "offered": ["B.S."], "hegis": "0601"}, {"degree": "Communication and Rhetoric", "offered": ["M.S."], "hegis": "0601"}, {"degree": "Communication and Rhetoric", "offered": ["Ph.D."], "hegis": "0602"}, {"degree": "Design, Innovation, and Society", "offered": ["B.S."], "hegis": "4903"}, {"degree": "Ecological Economics", "offered": ["Ph.D."], "hegis": "0517"}, {"degree": "Ecological Economics, Values, and Policy", "offered": ["M.S."], "hegis": "2299"}, {"degree": "Economics", "offered": ["B.S.", "M.S."], "hegis": "2204"}, {"degree": "Electronic Arts", "offered": ["B.S.", "M.F.A.", "Ph.D."], "hegis": "1099"}, {"degree": "Electronic Media, Arts, and Communication", "offered": ["B.S."], "hegis": "0605"}, {"degree": "Games and Simulation Arts and Sciences", "offered": ["B.S."], "hegis": "2299"}, {"degree": "Human-Computer Interaction", "offered": ["M.S."], "hegis": "0799"}, {"degree": "Philosophy", "offered": ["B.S."], "hegis": "1509"}, {"degree": "Psychological Science", "offered": ["B.S."], "hegis": "2001"}, {"degree": "Science, Technology, and Society", "offered": ["B.S."], "hegis": "4903"}, {"degree": "Science and Technology Studies", "offered": ["M.S.", "Ph.D."], "hegis": "4903"}, {"degree": "Sustainability Studies", "offered": ["B.S."], "hegis": "4903"}, {"degree": "Technical Communication", "offered": ["M.S."], "hegis": "0601"}], "School of Management": [{"degree": "Business Analytics", "offered": ["B.S.", "M.S."], "hegis": "0506"}, {"degree": "Business and Management", "offered": ["B.S."], "hegis": "0506"}, {"degree": "Quantitative Finance and Risk Analytics", "offered": ["M.S."], "hegis": "0504"}, {"degree": "Management", "offered": ["M.S.", "MBA", "Ph.D."], "hegis": "0506"}, {"degree": "Supply Chain Management", "offered": ["M.S."], "hegis": "0506"}, {"degree": "Technology, Commercialization and Entrepreneurship", "offered": ["M.S."], "hegis": "5004"}], "School of Science": [{"degree": "Applied Physics", "offered": ["B.S."], "hegis": "1902"}, {"degree": "Applied Science", "offered": ["M.S."], "hegis": "4902"}, {"degree": "Astronomy", "offered": ["M.S."], "hegis": "1911"}, {"degree": "Biology", "offered": ["B.S.", "M.S.", "Ph.D."], "hegis": "0401"}, {"degree": "Biochemistry and Biophysics", "offered": ["B.S.", "M.S.", "Ph.D."], "hegis": "0499"}, {"degree": "Biological Neuroscience", "offered": ["B.S."], "hegis": "0425"}, {"degree": "Chemistry", "offered": ["B.S.", "M.S.", "Ph.D."], "hegis": "1905"}, {"degree": "Computer Science", "offered": ["B.S.", "M.S.", "Ph.D."], "hegis": "0701"}, {"degree": "Environmental Science", "offered": ["B.S."], "hegis": "1999"}, {"degree": "Geology", "offered": ["B.S.", "M.S.", "Ph.D."], "hegis": "1914"}, {"degree": "Hydrogeology", "offered": ["B.S.", "M.S."], "hegis": "1914"}, {"degree": "Interdisciplinary Science", "offered": ["B.S."], "hegis": "4902"}, {"degree": "Applied Mathematics", "offered": ["M.S."], "hegis": "1703"}, {"degree": "Mathematics", "offered": ["B.S.", "M.S.", "Ph.D."], "hegis": "1701"}, {"degree": "Multidisciplinary Science", "offered": ["M.S.", "Ph.D."], "hegis": "4902"}, {"degree": "Physics", "offered": ["B.S.", "M.S.", "Ph.D."], "hegis": "1902"}], "Information Technology and Web Science": [{"degree": "Information Technology", "offered": ["M.S."], "hegis": "0702"}, {"degree": "Information Technology and Web Science", "offered": ["B.S."], "hegis": "0702"}]}
\ No newline at end of file
diff --git a/catalog.rpi.edu/spring_calendar.json b/catalog.rpi.edu/spring_calendar.json
new file mode 100644
index 0000000..71b1e5b
--- /dev/null
+++ b/catalog.rpi.edu/spring_calendar.json
@@ -0,0 +1 @@
+{"January 7": ["Spring 2022 tuition and fees due."], "January 9": ["Residence dining halls open with dinner."], "January 10": ["Spring 2022 semester classes begin."], "January 17": ["Martin Luther King Jr. Day - no classes."], "January 21": ["Last day for graduate and undergraduate students to add courses, change sections, or to put courses on audit."], "February 4": ["Nomination of Masters Thesis Committee forms due to the Office of Graduate Education for May 2022 graduates."], "February 21": ["Presidents Day - no classes."], "February 22": ["Classes resume. Follow a Monday schedule."], "February 28": ["Doctoral dissertations due to advisers. Masters theses and Engineering projects due to advisers."], "February 28 - March 18": ["Consultation weeks. Advisement for Fall 2022 registration. Students should consult with their faculty advisers."], "March 4": ["Last day to file a Degree Application online via SIS for May 2022 graduation.", "Last day for undergraduate and graduate students to drop a course. Resident dining halls close after dinner."], "March 7": ["Registration begins for The Arch summer classes."], "March 7 - March 11": ["Spring Break."], "March 13": ["Resident dining halls reopen for dinner."], "March 14": ["Classes resume."], "March 21 - March 25": ["Grand Marshal Week (Student Government Elections)."], "March 21 - April 4": ["Pre-registration for the Fall 2022 semester opens for all currently enrolled students."], "March 23": ["GM week events - no classes."], "March 25": ["Last day to defend doctoral dissertations. Masters theses due in the Office of Graduate Education."], "April 8": ["Doctoral dissertations due in the Office of Graduate Education."], "April 15": ["Last day for undergraduates to add or remove Pass/No Credit designation."], "April 22": ["Summer Arch tuition and fees due."], "April 27": ["Last day of classes. Deadline for completion of NE/I grade expectations related to Fall 2021 courses."], "April 28 - May 1": ["Reading/Study days. Instructors can schedule no exams nor require any student work expectations on these days."], "May 2 - May 6": ["Final Examinations."], "May 6": ["All resident dining halls close after dinner. Residence halls and apartments close at noon for all students not participating in Commencement."], "May 20": ["ROTC Commissioning Ceremony."], "May 21": ["Commencement."], "May 23": ["Summer I and II classes begin (including The Arch classes)."], "May 30": ["Memorial Day - no classes."]}
\ No newline at end of file
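The three JSON files committed above are single-line json.dump output: the calendars map a bare date string (or a "start - end" range) to a list of event strings, and out.json maps each school to a list of degree records. A minimal sketch of consuming them downstream, assuming the catalog.rpi.edu/ layout from this patch and attaching the year 2021 to the fall calendar's keys (the keys themselves store no year):

    # Minimal consumption sketch; paths and the assumed year are
    # illustrative assumptions, not part of the scraper itself.
    import json
    from datetime import datetime

    with open("catalog.rpi.edu/fall_calendar.json", encoding="utf-8") as f:
        fall = json.load(f)  # "Month day" (or "start - end" range) -> list of events

    for key, events in fall.items():
        start = key.split(" - ")[0].strip()  # for ranges, parse only the start date
        try:
            date = datetime.strptime(f"{start} 2021", "%B %d %Y").date()
        except ValueError:
            continue  # skip any key that is not a plain "Month day" string
        for event in events:
            print(date.isoformat(), event)

    # spring_calendar.json has the same shape; out.json maps each school to
    # a list of {"degree", "offered", "hegis"} records.
    with open("catalog.rpi.edu/out.json", encoding="utf-8") as f:
        degrees = json.load(f)

    for school, programs in degrees.items():
        for p in programs:
            print(f"{school}: {p['degree']} ({', '.join(p['offered'])}), HEGIS {p['hegis']}")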