From b55cdfbb16a64c6e0e4c619d7daa468de2ba0a2c Mon Sep 17 00:00:00 2001
From: Drew Volz
Date: Sat, 17 Oct 2020 15:16:17 -0600
Subject: [PATCH] [WIP] Parse jobs from wordpress feed

* RSS no longer redirects the job url correctly
* Switch url to wp-json feed
* Switch to parsing dom from response (1 less network call)
---
 .../ccci-carleton-college/v1/jobs/index.js | 46 +++++++------------
 1 file changed, 16 insertions(+), 30 deletions(-)

diff --git a/modules/node_modules/@frogpond/ccci-carleton-college/v1/jobs/index.js b/modules/node_modules/@frogpond/ccci-carleton-college/v1/jobs/index.js
index 809cb939..ceb45930 100644
--- a/modules/node_modules/@frogpond/ccci-carleton-college/v1/jobs/index.js
+++ b/modules/node_modules/@frogpond/ccci-carleton-college/v1/jobs/index.js
@@ -1,16 +1,13 @@
-import {get, ONE_DAY, ONE_HOUR} from '@frogpond/ccc-lib'
+import {ONE_HOUR} from '@frogpond/ccc-lib'
+import {cachedWpJsonFeed} from '@frogpond/ccci-carleton-college/v1/news'
 import mem from 'mem'
 import _jsdom from 'jsdom'
-import url from 'url'
-import qs from 'querystring'
 import getUrls from 'get-urls'
 import pMap from 'p-map'
 
 const {JSDOM} = _jsdom
-const GET_ONE_DAY = mem(get, {maxAge: ONE_DAY})
-const GET_TWO_DAYS = mem(get, {maxAge: ONE_DAY * 2})
-
-const jobsUrl = 'https://apps.carleton.edu/campus/sfs/employment/feeds/jobs'
+const jobsUrl =
+	'https://www.carleton.edu/student-employment/post-jobs/wp-json/wp/v2/posts'
 
 const BOOLEAN_KEYS = [
 	'Position available during term',
@@ -19,26 +16,20 @@ const BOOLEAN_KEYS = [
 
 const PARAGRAPHICAL_KEYS = ['Description']
 
-export async function fetchJob(link) {
-	let {job_id: id} = qs.parse(url.parse(link).query)
-
-	link = link.replace(/^http:/, 'https:')
-	const page = await GET_TWO_DAYS(link)
-	const dom = new JSDOM(page.body)
-
-	const jobs = dom.window.document.querySelector('#jobs')
-	const title = jobs.querySelector('h3')
+export function fetchJob(resp) {
+	const dom = new JSDOM(resp.content, {contentType: 'text/html'})
 
-	let titleText = title.textContent.trim()
+	let titleText = resp.title.trim()
 	const offCampus = /^Off Campus/.test(titleText)
 	if (offCampus) {
 		titleText = titleText.replace(/^Off Campus: +/, '')
 	}
 
-	const details = jobs.querySelectorAll('ul:first-of-type > li')
+	const details = dom.window.document.querySelectorAll('p')
+
 	const detailMap = [...details].reduce((coll, listEl) => {
 		let [key, ...value] = listEl.childNodes
-		key = key.textContent.replace(/:$/, '')
+		key = key ? key.textContent.replace(/:$/, '') : key
 
 		if (BOOLEAN_KEYS.includes(key)) {
 			value = true
@@ -46,16 +37,15 @@ export async function fetchJob(link) {
 			let paragraphs = [...listEl.querySelectorAll('p')]
 			let content = paragraphs.length ? paragraphs : value
 			value = content
-				.map(el => el.textContent)
+				.map((el) => el.textContent)
 				.join('\n\n')
 				.trim()
 		} else {
 			value = value
-				.map(el => el.textContent)
+				.map((el) => el.textContent)
 				.join(' ')
 				.trim()
 		}
-
 		coll.set(key, value)
 
 		return coll
@@ -65,25 +55,21 @@ export async function fetchJob(link) {
 	const links = Array.from(getUrls(description))
 
 	return {
-		id: id,
+		// id: id,
 		title: titleText,
 		offCampus: offCampus,
 		department: detailMap.get('Department or Office'),
 		dateOpen: detailMap.get('Date Open') || 'Unknown',
 		duringTerm: Boolean(detailMap.get('Position available during term')),
 		duringBreak: Boolean(detailMap.get('Position available during break')),
-		description: detailMap.get('Description') || '',
+		description: description,
 		links: links,
 	}
 }
 
 async function _getAllJobs() {
-	let resp = await GET_ONE_DAY(jobsUrl)
-	let dom = new JSDOM(resp.body, {contentType: 'text/xml'})
-	let jobLinks = Array.from(
-		dom.window.document.querySelectorAll('rss channel item link'),
-	).map(link => link.textContent.trim())
-	return pMap(jobLinks, fetchJob, {concurrency: 4})
+	let resp = await cachedWpJsonFeed(jobsUrl)
+	return pMap(resp, fetchJob, {concurrency: 4})
 }
 
 export const getJobs = mem(_getAllJobs, {maxAge: ONE_HOUR})
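Note on the resp objects that fetchJob(resp) now consumes: the new code reads resp.title and resp.content, while the WordPress REST endpoint (wp-json/wp/v2/posts) returns post objects whose HTML lives under title.rendered and content.rendered. The cachedWpJsonFeed helper is imported from the news module and is not shown in this patch, so the sketch below is only an assumption of how such a helper could map the feed into that shape; the wpJsonFeed name and the ONE_HOUR cache window are illustrative, not taken from this diff.

// Illustrative sketch only -- the real cachedWpJsonFeed lives in
// @frogpond/ccci-carleton-college/v1/news and may differ.
import {get, ONE_HOUR} from '@frogpond/ccc-lib'
import mem from 'mem'

async function wpJsonFeed(feedUrl) {
	// GET .../wp-json/wp/v2/posts returns a JSON array of post objects;
	// the rendered HTML sits under title.rendered and content.rendered.
	let resp = await get(feedUrl)
	let posts =
		typeof resp.body === 'string' ? JSON.parse(resp.body) : resp.body
	return posts.map((post) => ({
		title: post.title.rendered,
		content: post.content.rendered,
	}))
}

// cache the parsed feed so repeated calls reuse one network request
export const cachedWpJsonFeed = mem(wpJsonFeed, {maxAge: ONE_HOUR})

With objects of that shape, _getAllJobs() can hand each post straight to fetchJob via pMap, which is the "1 less network call" the commit message refers to: the individual job pages no longer need to be fetched and re-parsed one by one.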