Commit 94f8719

Merge pull request #2 from wasedatime/feature/lambda

Feature/lambda

AustinZhu authored Nov 16, 2020
2 parents 5e86553 + c94b7d3 commit 94f8719
Showing 7 changed files with 55 additions and 26 deletions.
4 changes: 4 additions & 0 deletions requirements.txt
@@ -0,0 +1,4 @@
+# dependencies
+aiohttp
+boto3
+lxml
3 changes: 2 additions & 1 deletion scraper/__init__.py
@@ -22,6 +22,7 @@
 SOFTWARE.
 """

-__all__ = ["SyllabusCrawler"]
+__all__ = ["SyllabusCrawler", "upload_to_s3"]

 from scraper.crawler import SyllabusCrawler
+from scraper.s3util import upload_to_s3
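
With upload_to_s3 re-exported from the package root, callers can pull both entry points from one place; a minimal sketch of the widened surface (this is exactly what syllabus_scraper.py below relies on):

from scraper import SyllabusCrawler, upload_to_s3
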
2 changes: 1 addition & 1 deletion scraper/const.py
@@ -1,6 +1,6 @@
 import random

-dept_name_map = {
+school_name_map = {
     "PSE": {"jp": "政経", "en": "Schl Political Sci/Econo", "param": "111973"},
     "LAW": {"jp": "法学", "en": "Schl Law", "param": "121973"},
     "EDU": {"jp": "教育", "en": "Schl Edu", "param": "151949"},
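
The rename from dept_name_map to school_name_map doesn't change the map's shape; lookups stay keyed by the school abbreviation, as in this sketch (values taken from the hunk above):

param = school_name_map["PSE"]["param"]  # -> "111973", passed as p_gakubu in build_url
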
19 changes: 10 additions & 9 deletions scraper/crawler.py
@@ -4,22 +4,23 @@
 from lxml import html

 from scraper import hybrid, thread_only
-from scraper.const import query, header, level_enum_map, type_enum_map, dept_name_map
+from scraper.const import query, header, level_enum_map, type_enum_map, school_name_map
 from scraper.utils import build_url, parse_period, to_half_width, parse_min_year, \
-    get_eval_criteria, to_enum, scrape_info, parse_term, parse_location, merge_period_location, scrape_text, parse_lang
+    get_eval_criteria, to_enum, scrape_info, parse_term, parse_location, merge_period_location, scrape_text, parse_lang, \
+    parse_credit


 class SyllabusCrawler:
-    def __init__(self, dept, task=None, engine="thread-only", worker=8):
+    def __init__(self, school, task=None, engine="thread-only", worker=8):
         """
-        :param dept: department name
+        :param school: department name
         :param task: tasks to execute
         :param engine: "thread-only" | "hybrid",
         :param worker: num of worker threads
         """
-        if dept not in dept_name_map.keys():
+        if school not in school_name_map.keys():
             raise ValueError
-        self.dept = dept
+        self.school = school
         self.task = task
         self.engine = engine
         self.worker = worker
@@ -44,7 +45,7 @@ def get_max_page(self):
         Get the max page number for a department
         :return: int
         """
-        url = build_url(self.dept, 1, 'en')
+        url = build_url(self.school, 1, 'en')
         body = requests.urlopen(url).read()
         try:
             last = html.fromstring(body).xpath(query["page_num"])[-1]
@@ -58,7 +59,7 @@ def scrape_catalog(self, page):
         :param page: page number (starts from 1)
         :return: list of course ids
         """
-        req = requests.Request(url=build_url(self.dept, page + 1, 'en'), headers=header)
+        req = requests.Request(url=build_url(self.school, page + 1, 'en'), headers=header)
         resp = requests.urlopen(req).read()
         clist = html.fromstring(resp).xpath(query["course_list"])
         return [re.search(r"\w{28}", clist[i].xpath(query["course_id"])[0]).group(0) for i in range(1, len(clist))]
@@ -113,7 +114,7 @@ def scrape_course(self, course_id):
             "i": merge_period_location(periods, locations),
             "j": scrape_info(info_en, 'min_year', parse_min_year),
             "k": scrape_info(info_en, 'category', to_half_width),
-            "l": scrape_info(info_en, 'credit', None),
+            "l": scrape_info(info_en, 'credit', parse_credit),
             "m": scrape_info(info_en, 'level', to_enum(level_enum_map)),
             "n": get_eval_criteria(parsed_en),
             "o": scrape_info(info_en, 'code', None),
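
With the constructor keyword renamed from dept to school, existing callers need updating. A minimal local sketch, assuming execute() (invoked by the Lambda handler further down) yields the course-info dicts that upload_to_s3 consumes:

from scraper import SyllabusCrawler

crawler = SyllabusCrawler(school="PSE", engine="thread-only", worker=8)
courses = crawler.execute()    # iterable of course-info dicts, see scrape_course above
SyllabusCrawler(school="XYZ")  # an unknown abbreviation now fails fast with ValueError
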
19 changes: 11 additions & 8 deletions scraper/s3util.py
@@ -1,13 +1,15 @@
-import boto3
-import os
 import json
+import os
+
+import boto3
+from botocore.config import Config


-def upload_to_s3(syllabus, dept):
+def upload_to_s3(syllabus, school):
     """
     Upload the syllabus info of the department to s3
     :param syllabus: iterator of course info
-    :param dept: abbr of the department. e.g. "PSE"
+    :param school: abbr of the department. e.g. "PSE"
     :return: dict :=
         {
             'Expiration': 'string',
@@ -21,12 +23,13 @@ def upload_to_s3(syllabus, dept):
             'RequestCharged': 'requester'
         }
     """
-    s3 = boto3.resource('s3', region_name="ap-northeast-1")
-    syllabus_object = s3.Object(os.getenv('BUCKET_NAME'), os.getenv('OBJECT_PATH') + dept + '.json')
+    s3 = boto3.resource('s3', region_name="ap-northeast-1", verify=False, config=Config(signature_version='s3v4'))
+    syllabus_object = s3.Object(os.getenv('BUCKET_NAME'), os.getenv('OBJECT_PATH') + school + '.json')
+    body = bytes(json.dumps(list(syllabus)).encode('UTF-8'))
     resp = syllabus_object.put(
         ACL='public-read',
-        Body=bytes(json.dumps(syllabus).encode('UTF-8')),
-        ContentType='application/json',
+        Body=body,
+        ContentType='application/json; charset=utf-8',
         CacheControl='max-age=86400, must-revalidate'
     )
     return resp
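
Two behavioral notes on this hunk: json.dumps(list(syllabus)) materializes the iterator before serializing (json.dumps on a bare generator raises TypeError), and verify=False disables TLS certificate verification on the S3 client, which deserves a second look before production use. A usage sketch with hypothetical environment values:

import os

from scraper import SyllabusCrawler, upload_to_s3

os.environ["BUCKET_NAME"] = "my-syllabus-bucket"  # hypothetical bucket name
os.environ["OBJECT_PATH"] = "syllabus/"           # hypothetical key prefix

courses = SyllabusCrawler(school="PSE", worker=8).execute()
resp = upload_to_s3(courses, "PSE")               # puts syllabus/PSE.json
print(resp)                                       # response dict per the docstring above
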
18 changes: 11 additions & 7 deletions scraper/utils.py
@@ -1,12 +1,16 @@
 import datetime
+import logging
 import re

 import itertools
 import unicodedata

-from scraper.const import location_name_map, dept_name_map, query, eval_type_map, weekday_enum_map, term_enum_map, \
+from scraper.const import location_name_map, school_name_map, query, eval_type_map, weekday_enum_map, term_enum_map, \
     lang_enum_map

+logger = logging.getLogger()
+logger.setLevel(logging.WARNING)
+

 def scrape_info(parsed, key, fn):
     """
@@ -32,7 +36,7 @@ def build_url(dept=None, page=1, lang="en", course_id=None):
     """
     if course_id:
         return f"https://www.wsl.waseda.jp/syllabus/JAA104.php?pKey={course_id}&pLng={lang}"
-    param = dept_name_map[dept]["param"]
+    param = school_name_map[dept]["param"]
     year = datetime.datetime.now().year
     return f"https://www.wsl.waseda.jp/syllabus/JAA103.php?pYear={year}&p_gakubu={param}&p_page={page}&p_number=100" \
            f"&pLng={lang} "
@@ -75,7 +79,7 @@ def get_eval_criteria(parsed):
         try:
             percent = int(percent)
         except ValueError:
-            print(percent)
+            logger.warning(f"Unable to parse percent: {percent}")
         criteria = to_half_width(elem[2].text)
         evals.append({
             "t": to_enum(eval_type_map)(kind),
@@ -159,7 +163,7 @@ def rename_location(loc):
     elif loc in location_name_map.keys():
         return location_name_map[loc]
     else:
-        print(loc)
+        logger.warning(f"Unable to parse location: {loc}")
         return to_half_width(loc)


@@ -205,7 +209,7 @@ def parse_term(schedule):
     try:
         (term, _) = schedule.split(u'\xa0'u'\xa0', 1)
     except ValueError:
-        print(schedule)
+        logger.warning(f"Unable to parse term: {schedule}")
         return "undecided"
     return to_enum(term_enum_map)(term)

@@ -220,7 +224,7 @@ def parse_period(schedule):
     try:
         (_, occ) = schedule.split(u'\xa0'u'\xa0', 1)
     except ValueError:
-        print(schedule)
+        logger.warning(f"Unable to parse period: {schedule}")
         return []
     if occ == "othersothers":
         return [{"d": -1, "p": -1}]
@@ -259,7 +263,7 @@ def map_to_int(data):
         try:
             return enum_map[data]
         except KeyError:
-            print(data)
+            logger.warning(f"Unable to map '{data}' to integer")
             return -1

     return map_to_int
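
The net effect in this file: stray print() debugging becomes module-level warning logs, and unmappable values degrade to sentinels instead of crashing. A sketch of the to_enum closure's fallback, with a toy map standing in for the real enum tables:

weekday = to_enum({"Mon.": 1, "Tues.": 2})  # hypothetical enum map
weekday("Mon.")   # -> 1
weekday("Sun.")   # logs "Unable to map 'Sun.' to integer" and returns -1
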
16 changes: 16 additions & 0 deletions syllabus_scraper.py
@@ -1,8 +1,24 @@
+import logging
+
 from scraper import SyllabusCrawler, upload_to_s3

+logger = logging.getLogger()
+logger.setLevel(logging.INFO)
+
+
 def handler(event, context):
     """
+    Lambda function handler
     :param event:
     :param context:
     :return:
     """
+    schools = event["schools"]
+    for school in schools:
+        logger.info(f"Started scraping school: {school}")
+        syllabus_info = SyllabusCrawler(school=school, worker=32).execute()
+        logger.info(f"Finished scraping school: {school}")
+        logger.info(f"Uploading {school}.json to S3 ")
+        upload_to_s3(syllabus_info, school)
+        logger.info(f"Successfully uploaded {school}.json")
+    return None
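
The handler expects the invocation event to carry a "schools" list of abbreviations (keys of school_name_map in scraper/const.py). A local smoke-test sketch with a hypothetical event and no real Lambda context:

from syllabus_scraper import handler

event = {"schools": ["PSE", "LAW", "EDU"]}
handler(event, None)  # scrapes each school, then uploads <school>.json to S3
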
