Skip to content

Commit

Permalink
Merge pull request #306 from ndaidong/7.2.3
Browse files Browse the repository at this point in the history
v7.2.3
  • Loading branch information
ndaidong authored Sep 23, 2022
2 parents 22f4dab + 226ea8e commit c938584
Show file tree
Hide file tree
Showing 14 changed files with 156 additions and 137 deletions.
36 changes: 18 additions & 18 deletions dist/article-parser.esm.js

Large diffs are not rendered by default.

58 changes: 29 additions & 29 deletions dist/cjs/article-parser.js

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion dist/cjs/package.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"name": "article-parser",
"version": "7.2.2",
"version": "7.2.3",
"main": "./article-parser.js"
}
2 changes: 2 additions & 0 deletions eval.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@ import { extract } from './src/main.js'

const extractFromUrl = async (url) => {
try {
console.time('extraction')
const art = await extract(url)
console.log(art)
console.timeEnd('extraction')
} catch (err) {
console.trace(err)
}
Expand Down
4 changes: 2 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"version": "7.2.2",
"version": "7.2.3",
"name": "article-parser",
"description": "To extract main article from given URL",
"homepage": "https://demos.pwshub.com/article-parser",
Expand All @@ -26,7 +26,7 @@
},
"scripts": {
"lint": "standard .",
"pretest": "npm run lint",

"test": "NODE_ENV=test NODE_OPTIONS=--experimental-vm-modules jest --coverage=true",
"build": "node build",
"eval": "node eval",
Expand Down
7 changes: 1 addition & 6 deletions src/main.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,19 +10,14 @@ import {
import retrieve from './utils/retrieve.js'
import parseFromHtml from './utils/parseFromHtml.js'
import { isValid as isValidUrl } from './utils/linker.js'
import { isValid as isHTMLString } from './utils/html.js'

export const extract = async (input, parserOptions = {}, fetchOptions = {}) => {
if (!isString(input)) {
throw new Error('Input must be a string')
}

if (isHTMLString(input)) {
return parseFromHtml(input, null, parserOptions)
}

if (!isValidUrl(input)) {
throw new Error('Input must be a valid URL')
return parseFromHtml(input, null, parserOptions)
}
const html = await retrieve(input, fetchOptions)
if (!html) {
Expand Down
19 changes: 8 additions & 11 deletions src/utils/extractWithReadability.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,12 @@

import { Readability } from '@mozilla/readability'
import { DOMParser } from 'linkedom'
import { isString } from 'bellajs'

import { isValid as isHTMLString } from './html.js'

/**
* @param html {string}
* @param inputUrl {string}
* @returns {string|null}
*/
export default (html, inputUrl = '') => {
if (!isHTMLString(html)) return null
if (!isString(html)) {
return null
}
const doc = new DOMParser().parseFromString(html, 'text/html')
const base = doc.createElement('base')
base.setAttribute('href', inputUrl)
Expand All @@ -22,9 +18,10 @@ export default (html, inputUrl = '') => {
}

/**
 * Extract only the article title from an HTML string using Readability.
 *
 * @param html {string}
 * @returns {string|null} the title found by Readability, or null when the
 *   input is not a string or the title comes back empty
 */
export function extractTitleWithReadability (html) {
  // Single type guard: anything that is not a string cannot be parsed.
  if (!isString(html)) {
    return null
  }
  const doc = new DOMParser().parseFromString(html, 'text/html')
  const reader = new Readability(doc)
  // Normalise a falsy (e.g. empty) title to null so callers get one
  // consistent "no title" value.
  // noinspection JSUnresolvedFunction
  return reader._getArticleTitle() || null
}
45 changes: 29 additions & 16 deletions src/utils/extractWithReadability.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,35 @@ import { isString } from 'bellajs'

import extractWithReadability, { extractTitleWithReadability } from './extractWithReadability.js'

test('test extractWithReadability from good html content', async () => {
const html = readFileSync('./test-data/regular-article.html', 'utf8')
const result = extractWithReadability(html, 'https://foo.bar')
expect(isString(result)).toBe(true)
expect(result.length > 200).toBe(true)
expect(result).toEqual(expect.stringContaining('<img src="https://foo.bar/orange.png">'))
})
describe('test extractWithReadability()', () => {
test('extract from good html content', async () => {
const html = readFileSync('./test-data/regular-article.html', 'utf8')
const result = extractWithReadability(html, 'https://foo.bar')
expect(isString(result)).toBe(true)
expect(result.length > 200).toBe(true)
expect(result).toEqual(expect.stringContaining('<img src="https://foo.bar/orange.png">'))
})

test('test extractWithReadability from bad html content', async () => {
expect(extractWithReadability(null)).toBe(null)
expect(extractWithReadability({})).toBe(null)
expect(extractWithReadability('<div></span>')).toBe(null)
})
test('extract from bad html content', async () => {
expect(extractWithReadability(null)).toBe(null)
expect(extractWithReadability({})).toBe(null)
expect(extractWithReadability('<div></span>')).toBe(null)
})

test('extract title only', async () => {
const html = readFileSync('./test-data/regular-article.html', 'utf8')
const result = extractTitleWithReadability(html)
expect(result).toBe('Article title here - ArticleParser')
})

test('extract title from page without title', async () => {
const html = readFileSync('./test-data/html-no-title.html', 'utf8')
const result = extractTitleWithReadability(html)
expect(result).toBe(null)
})

test('test extractTitleWithReadability', async () => {
const html = readFileSync('./test-data/regular-article.html', 'utf8')
const result = extractTitleWithReadability(html)
expect(result).toBe('Article title here - ArticleParser')
test('extract title from non-string', async () => {
const result = extractTitleWithReadability({})
expect(result).toBe(null)
})
})
9 changes: 0 additions & 9 deletions src/utils/html.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,22 +5,13 @@ import sanitize from 'sanitize-html'

import { getSanitizeHtmlOptions } from '../config.js'

/**
 * Heuristic check for whether a value looks like HTML markup.
 *
 * The pattern (case-insensitive) accepts either:
 *  - a tag opening with letters where, further along, a space is followed
 *    later by `/>` (or `/ >`), or where the text right after `<` starts
 *    with br, hr, input or wbr; or
 *  - an element whose opening tag name is matched later by `</name>`.
 * `.` does not cross line breaks, so each alternative must match within a
 * single line.
 *
 * @param str {string} value to inspect; non-strings are coerced by the regex engine
 * @returns {boolean} true when the pattern matches somewhere in the input
 */
export const isValid = (str = '') => {
  const htmlLike = /<(?=.*? .*?\/ ?>|br|hr|input|!--|wbr)[a-z]+.*?>|<([a-z]+).*?<\/\1>/i
  const found = htmlLike.exec(str)
  return found !== null
}

/**
 * Pass HTML through sanitize-html with both allow-lists disabled.
 *
 * NOTE(review): per the sanitize-html documentation, `false` for
 * `allowedTags` / `allowedAttributes` means "allow everything" — confirm
 * against the installed version.
 *
 * @param html {string}
 * @returns {string} whatever sanitize-html returns for the given input
 */
export const purify = (html) => {
  const permissiveOptions = { allowedTags: false, allowedAttributes: false }
  return sanitize(html, permissiveOptions)
}

/**
* @param inputHtml {string}
* @returns cleanHtml {string}
*/
export const cleanify = (inputHtml) => {
const doc = new DOMParser().parseFromString(inputHtml, 'text/html')
const html = doc.documentElement.innerHTML
Expand Down
37 changes: 0 additions & 37 deletions src/utils/html.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,46 +6,9 @@ import { readFileSync } from 'fs'
import { isString } from 'bellajs'

import {
isValid as isHTMLString,
cleanify
} from './html.js'

// Suite for the HTML-detection helper imported as isHTMLString.
describe('test isValid() method', () => {
  // A non-string value must be rejected.
  test('validate bad input', () => {
    expect(isHTMLString({})).toBe(false)
  })

  // Plain text without any tags is not HTML.
  test('validate regular string', () => {
    expect(isHTMLString('This is just a string, not HTML')).toBe(false)
  })

  // Mismatched opening/closing tags are rejected.
  test('validate bad-format HTML', () => {
    expect(isHTMLString('<div class="welcome">Hello world</span>')).toBe(false)
  })

  // Properly paired tags are accepted.
  test('validate well-format HTML', () => {
    expect(isHTMLString('<div class="welcome">Hello <b>world</b><hr></div>')).toBe(true)
  })

  // Every sample page under ./test-data must be recognised as HTML.
  test('validate example HTML page', () => {
    const samplePages = [
      'regular-article.html',
      'html-no-title.html',
      'html-article-no-source.html',
      'html-too-short-article.html'
    ]
    for (const page of samplePages) {
      const markup = readFileSync(`./test-data/${page}`, 'utf8')
      expect(isHTMLString(markup)).toBe(true)
    }
  })
})

describe('test cleanify() method', () => {
test('check if unwanted elements/attributes removed', () => {
const html = readFileSync('./test-data/regular-article.html', 'utf8')
Expand Down
8 changes: 4 additions & 4 deletions src/utils/parseFromHtml.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,15 @@ describe('test parseFromHtml()', () => {
const cases = [
{
input: {
desc: 'a bad input',
html: {}
desc: 'a webpage with no title',
html: readFileSync('./test-data/html-no-title.html', 'utf8')
},
expectation: null
},
{
input: {
desc: 'a webpage with no title',
html: readFileSync('./test-data/html-no-title.html', 'utf8')
desc: 'a webpage without link',
html: readFileSync('./test-data/html-no-link.html', 'utf8')
},
expectation: null
},
Expand Down
2 changes: 1 addition & 1 deletion test-data/html-article-no-source.html
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,6 @@
To be more specific, those turtles are nothing more than fishes. A grape can hardly be considered a shrewd goldfish without also being an owl. Some unbiased goats are thought of simply as tangerines.

Shouting with happiness, a courageous elephant is a duck of the mind? Some posit the upbeat hippopotamus to be less than enchanting. It's an undeniable fact, really; authors often misinterpret the grape as an endurable rabbit, when in actuality it feels more like a tough dolphin. We know that a cherry can hardly be considered a responsible apricot without also being a nectarine.
</article>s
</article>
</body>
</html>
37 changes: 37 additions & 0 deletions test-data/html-no-link.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
<!doctype html>
<html>
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Article title here - ArticleParser</title>
<meta name="author" content="Alice">
<meta name="description" content="Few words about this article">
<link rel="stylesheet" href="/path/to/cssfile.css">
<link rel="alternate" title="ArticleParser" type="application/atom+xml" href="https://somewhere.com/atom.xml">
<link rel="manifest" href="/manifest.json">
</head>
<body>
<header>Page header here</header>
<main>
<section>
<nav>Navigation here</nav>
</section>
<section>
<h1>Article title here</h1>
<article>
<div class="contentdetail">Few can name a <a href="https://otherwhere.com/descriptions/rational-peach">rational peach</a> that isn't a conscientious goldfish! One cannot separate snakes from plucky pomegranates? Draped neatly on a hanger, the melons could be said to resemble knowledgeable pigs. Some posit the enchanting tiger to be less than confident. The literature would have us believe that an impartial turtle is not but a hippopotamus. Unfortunately, that is wrong; on the contrary, those cows are nothing more than pandas! The chicken is a shark; A turtle can hardly be considered a kind horse without also being a pomegranate. Zebras are witty persimmons.</div>
<p class="contentdetail">
Those cheetahs are nothing more than dogs. A <a href="/dict/watermelon">watermelon</a> is an exuberant kangaroo. An octopus is the tangerine of a grapes? The cherry is a shark. Recent controversy aside, they were lost without the cheerful plum that composed their fox. As far as we can estimate, one cannot separate camels from dynamic hamsters. Those tigers are nothing more than cows! A cow is a squirrel from the right perspective. Their banana was, in this moment, a helpful bear.</p>
<p>The first fair dog is, in its own way, a lemon.</p>
<address>4746 Kelly Drive, West Virginia</address>
<img src="./orange.png" style="border: solid 1px #000">
</article>
</section>
<section class="sidebar-widget">
<widget>Some widget here</widget>
<widget>Some widget here</widget>
</section>
</main>
<footer>Page footer here</footer>
</body>
</html>
27 changes: 24 additions & 3 deletions test-data/html-no-title.html
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,29 @@
<head>
<meta charset="utf-8">
<meta name="viewport"content="width=device-width,initial-scale=1">
<title>TechNews</title>
</head>
<body>
</body>
<body>
<header>Page header here</header>
<main>
<section>
<nav>Navigation here</nav>
</section>
<section>
<h1>Article title here</h1>
<article>
<div class="contentdetail">Few can name a <a href="https://otherwhere.com/descriptions/rational-peach">rational peach</a> that isn't a conscientious goldfish! One cannot separate snakes from plucky pomegranates? Draped neatly on a hanger, the melons could be said to resemble knowledgeable pigs. Some posit the enchanting tiger to be less than confident. The literature would have us believe that an impartial turtle is not but a hippopotamus. Unfortunately, that is wrong; on the contrary, those cows are nothing more than pandas! The chicken is a shark; A turtle can hardly be considered a kind horse without also being a pomegranate. Zebras are witty persimmons.</div>
<p class="contentdetail">
Those cheetahs are nothing more than dogs. A <a href="/dict/watermelon">watermelon</a> is an exuberant kangaroo. An octopus is the tangerine of a grapes? The cherry is a shark. Recent controversy aside, they were lost without the cheerful plum that composed their fox. As far as we can estimate, one cannot separate camels from dynamic hamsters. Those tigers are nothing more than cows! A cow is a squirrel from the right perspective. Their banana was, in this moment, a helpful bear.</p>
<p>The first fair dog is, in its own way, a lemon.</p>
<address>4746 Kelly Drive, West Virginia</address>
<img src="./orange.png" style="border: solid 1px #000">
</article>
</section>
<section class="sidebar-widget">
<widget>Some widget here</widget>
<widget>Some widget here</widget>
</section>
</main>
<footer>Page footer here</footer>
</body>
</html>

0 comments on commit c938584

Please sign in to comment.