Merge pull request #306 from ndaidong/7.2.3

v7.2.3
extractus · Sep 23, 2022 · c938584 · c938584
2 parents 22f4dab + 226ea8e
commit c938584
Show file tree

Hide file tree

Showing 14 changed files with 156 additions and 137 deletions.
diff --git a/dist/article-parser.esm.js b/dist/article-parser.esm.js
diff --git a/dist/cjs/article-parser.js b/dist/cjs/article-parser.js
diff --git a/dist/cjs/package.json b/dist/cjs/package.json
@@ -1,5 +1,5 @@
 {
   "name": "article-parser",
-  "version": "7.2.2",
+  "version": "7.2.3",
   "main": "./article-parser.js"
 }
diff --git a/eval.js b/eval.js
@@ -7,8 +7,10 @@ import { extract } from './src/main.js'
 
 const extractFromUrl = async (url) => {
   try {
+    console.time('extraction')
     const art = await extract(url)
     console.log(art)
+    console.timeEnd('extraction')
   } catch (err) {
     console.trace(err)
   }

diff --git a/package.json b/package.json
@@ -1,5 +1,5 @@
 {
-  "version": "7.2.2",
+  "version": "7.2.3",
   "name": "article-parser",
   "description": "To extract main article from given URL",
   "homepage": "https://demos.pwshub.com/article-parser",
@@ -26,7 +26,7 @@
   },
   "scripts": {
     "lint": "standard .",
-    "pretest": "npm run lint",
+
     "test": "NODE_ENV=test NODE_OPTIONS=--experimental-vm-modules jest --coverage=true",
     "build": "node build",
     "eval": "node eval",

diff --git a/src/main.js b/src/main.js
@@ -10,19 +10,14 @@ import {
 import retrieve from './utils/retrieve.js'
 import parseFromHtml from './utils/parseFromHtml.js'
 import { isValid as isValidUrl } from './utils/linker.js'
-import { isValid as isHTMLString } from './utils/html.js'
 
 export const extract = async (input, parserOptions = {}, fetchOptions = {}) => {
   if (!isString(input)) {
     throw new Error('Input must be a string')
   }
 
-  if (isHTMLString(input)) {
-    return parseFromHtml(input, null, parserOptions)
-  }
-
   if (!isValidUrl(input)) {
-    throw new Error('Input must be a valid URL')
+    return parseFromHtml(input, null, parserOptions)
   }
   const html = await retrieve(input, fetchOptions)
   if (!html) {

diff --git a/src/utils/extractWithReadability.js b/src/utils/extractWithReadability.js
@@ -2,16 +2,12 @@
 
 import { Readability } from '@mozilla/readability'
 import { DOMParser } from 'linkedom'
+import { isString } from 'bellajs'
 
-import { isValid as isHTMLString } from './html.js'
-
-/**
- * @param html {string}
- * @param inputUrl {string}
- * @returns {string|null}
- */
 export default (html, inputUrl = '') => {
-  if (!isHTMLString(html)) return null
+  if (!isString(html)) {
+    return null
+  }
   const doc = new DOMParser().parseFromString(html, 'text/html')
   const base = doc.createElement('base')
   base.setAttribute('href', inputUrl)
@@ -22,9 +18,10 @@ export default (html, inputUrl = '') => {
 }
 
 export function extractTitleWithReadability (html) {
-  if (!isHTMLString(html)) return null
+  if (!isString(html)) {
+    return null
+  }
   const doc = new DOMParser().parseFromString(html, 'text/html')
   const reader = new Readability(doc)
-  // noinspection JSUnresolvedFunction
-  return reader._getArticleTitle()
+  return reader._getArticleTitle() || null
 }
diff --git a/src/utils/extractWithReadability.test.js b/src/utils/extractWithReadability.test.js
@@ -7,22 +7,35 @@ import { isString } from 'bellajs'
 
 import extractWithReadability, { extractTitleWithReadability } from './extractWithReadability.js'
 
-test('test extractWithReadability from good html content', async () => {
-  const html = readFileSync('./test-data/regular-article.html', 'utf8')
-  const result = extractWithReadability(html, 'https://foo.bar')
-  expect(isString(result)).toBe(true)
-  expect(result.length > 200).toBe(true)
-  expect(result).toEqual(expect.stringContaining('<img src="https://foo.bar/orange.png">'))
-})
+describe('test extractWithReadability()', () => {
+  test('extract from good html content', async () => {
+    const html = readFileSync('./test-data/regular-article.html', 'utf8')
+    const result = extractWithReadability(html, 'https://foo.bar')
+    expect(isString(result)).toBe(true)
+    expect(result.length > 200).toBe(true)
+    expect(result).toEqual(expect.stringContaining('<img src="https://foo.bar/orange.png">'))
+  })
 
-test('test extractWithReadability from bad html content', async () => {
-  expect(extractWithReadability(null)).toBe(null)
-  expect(extractWithReadability({})).toBe(null)
-  expect(extractWithReadability('<div></span>')).toBe(null)
-})
+  test('extract from bad html content', async () => {
+    expect(extractWithReadability(null)).toBe(null)
+    expect(extractWithReadability({})).toBe(null)
+    expect(extractWithReadability('<div></span>')).toBe(null)
+  })
+
+  test('extract title only', async () => {
+    const html = readFileSync('./test-data/regular-article.html', 'utf8')
+    const result = extractTitleWithReadability(html)
+    expect(result).toBe('Article title here - ArticleParser')
+  })
+
+  test('extract title from page without title', async () => {
+    const html = readFileSync('./test-data/html-no-title.html', 'utf8')
+    const result = extractTitleWithReadability(html)
+    expect(result).toBe(null)
+  })
 
-test('test extractTitleWithReadability', async () => {
-  const html = readFileSync('./test-data/regular-article.html', 'utf8')
-  const result = extractTitleWithReadability(html)
-  expect(result).toBe('Article title here - ArticleParser')
+  test('extract title from non-string', async () => {
+    const result = extractTitleWithReadability({})
+    expect(result).toBe(null)
+  })
 })
diff --git a/src/utils/html.js b/src/utils/html.js
@@ -5,22 +5,13 @@ import sanitize from 'sanitize-html'
 
 import { getSanitizeHtmlOptions } from '../config.js'
 
-export const isValid = (str = '') => {
-  const reg = /<(?=.*? .*?\/ ?>|br|hr|input|!--|wbr)[a-z]+.*?>|<([a-z]+).*?<\/\1>/i
-  return reg.test(str)
-}
-
 export const purify = html => {
   return sanitize(html, {
     allowedTags: false,
     allowedAttributes: false
   })
 }
 
-/**
- * @param inputHtml {string}
- * @returns cleanHtml {string}
- */
 export const cleanify = (inputHtml) => {
   const doc = new DOMParser().parseFromString(inputHtml, 'text/html')
   const html = doc.documentElement.innerHTML

diff --git a/src/utils/html.test.js b/src/utils/html.test.js
@@ -6,46 +6,9 @@ import { readFileSync } from 'fs'
 import { isString } from 'bellajs'
 
 import {
-  isValid as isHTMLString,
   cleanify
 } from './html.js'
 
-describe('test isValid() method', () => {
-  test('validate bad input', () => {
-    const result = isHTMLString({})
-    expect(result).toBe(false)
-  })
-
-  test('validate regular string', () => {
-    const result = isHTMLString('This is just a string, not HTML')
-    expect(result).toBe(false)
-  })
-
-  test('validate bad-format HTML', () => {
-    const result = isHTMLString('<div class="welcome">Hello world</span>')
-    expect(result).toBe(false)
-  })
-
-  test('validate well-format HTML', () => {
-    const result = isHTMLString('<div class="welcome">Hello <b>world</b><hr></div>')
-    expect(result).toBe(true)
-  })
-
-  test('validate example HTML page', () => {
-    const files = [
-      'regular-article.html',
-      'html-no-title.html',
-      'html-article-no-source.html',
-      'html-too-short-article.html'
-    ]
-    files.forEach((file) => {
-      const html = readFileSync(`./test-data/${file}`, 'utf8')
-      const result = isHTMLString(html)
-      expect(result).toBe(true)
-    })
-  })
-})
-
 describe('test cleanify() method', () => {
   test('check if unwanted elements/attributes removed', () => {
     const html = readFileSync('./test-data/regular-article.html', 'utf8')

diff --git a/src/utils/parseFromHtml.test.js b/src/utils/parseFromHtml.test.js
@@ -13,15 +13,15 @@ describe('test parseFromHtml()', () => {
   const cases = [
     {
       input: {
-        desc: 'a bad input',
-        html: {}
+        desc: 'a webpage with no title',
+        html: readFileSync('./test-data/html-no-title.html', 'utf8')
       },
       expectation: null
     },
     {
       input: {
-        desc: 'a webpage with no title',
-        html: readFileSync('./test-data/html-no-title.html', 'utf8')
+        desc: 'a webpage without link',
+        html: readFileSync('./test-data/html-no-link.html', 'utf8')
       },
       expectation: null
     },

diff --git a/test-data/html-article-no-source.html b/test-data/html-article-no-source.html
@@ -17,6 +17,6 @@
 To be more specific, those turtles are nothing more than fishes. A grape can hardly be considered a shrewd goldfish without also being an owl. Some unbiased goats are thought of simply as tangerines.
 
 Shouting with happiness, a courageous elephant is a duck of the mind? Some posit the upbeat hippopotamus to be less than enchanting. It's an undeniable fact, really; authors often misinterpret the grape as an endurable rabbit, when in actuality it feels more like a tough dolphin. We know that a cherry can hardly be considered a responsible apricot without also being a nectarine.
-		</article>s
+		</article>
 	</body>
 </html>
diff --git a/test-data/html-no-link.html b/test-data/html-no-link.html
@@ -0,0 +1,37 @@
+<!doctype html>
+<html>
+  <head>
+    <meta charset="utf-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1">
+    <title>Article title here - ArticleParser</title>
+    <meta name="author" content="Alice">
+    <meta name="description" content="Few words about this article">
+    <link rel="stylesheet" href="/path/to/cssfile.css">
+    <link rel="alternate" title="ArticleParser" type="application/atom+xml" href="https://somewhere.com/atom.xml">
+    <link rel="manifest" href="/manifest.json">
+  </head>
+  <body>
+    <header>Page header here</header>
+    <main>
+      <section>
+        <nav>Navigation here</nav>
+      </section>
+      <section>
+        <h1>Article title here</h1>
+        <article>
+          <div class="contentdetail">Few can name a <a href="https://otherwhere.com/descriptions/rational-peach">rational peach</a> that isn't a conscientious goldfish! One cannot separate snakes from plucky pomegranates? Draped neatly on a hanger, the melons could be said to resemble knowledgeable pigs. Some posit the enchanting tiger to be less than confident. The literature would have us believe that an impartial turtle is not but a hippopotamus. Unfortunately, that is wrong; on the contrary, those cows are nothing more than pandas! The chicken is a shark; A turtle can hardly be considered a kind horse without also being a pomegranate. Zebras are witty persimmons.</div>
+          <p class="contentdetail">
+            Those cheetahs are nothing more than dogs. A <a href="/dict/watermelon">watermelon</a> is an exuberant kangaroo. An octopus is the tangerine of a grapes? The cherry is a shark. Recent controversy aside, they were lost without the cheerful plum that composed their fox. As far as we can estimate, one cannot separate camels from dynamic hamsters. Those tigers are nothing more than cows! A cow is a squirrel from the right perspective. Their banana was, in this moment, a helpful bear.</p>
+          <p>The first fair dog is, in its own way, a lemon.</p>
+          <address>4746 Kelly Drive, West Virginia</address>
+          <img src="./orange.png" style="border: solid 1px #000">
+        </article>
+      </section>
+      <section class="sidebar-widget">
+        <widget>Some widget here</widget>
+        <widget>Some widget here</widget>
+      </section>
+    </main>
+    <footer>Page footer here</footer>
+  </body>
+</html>
diff --git a/test-data/html-no-title.html b/test-data/html-no-title.html
@@ -3,8 +3,29 @@
 	<head>
 		<meta charset="utf-8">
 		<meta name="viewport"content="width=device-width,initial-scale=1">
-		<title>TechNews</title>
 	</head>
-	<body>
-	</body>
+  <body>
+    <header>Page header here</header>
+    <main>
+      <section>
+        <nav>Navigation here</nav>
+      </section>
+      <section>
+        <h1>Article title here</h1>
+        <article>
+          <div class="contentdetail">Few can name a <a href="https://otherwhere.com/descriptions/rational-peach">rational peach</a> that isn't a conscientious goldfish! One cannot separate snakes from plucky pomegranates? Draped neatly on a hanger, the melons could be said to resemble knowledgeable pigs. Some posit the enchanting tiger to be less than confident. The literature would have us believe that an impartial turtle is not but a hippopotamus. Unfortunately, that is wrong; on the contrary, those cows are nothing more than pandas! The chicken is a shark; A turtle can hardly be considered a kind horse without also being a pomegranate. Zebras are witty persimmons.</div>
+          <p class="contentdetail">
+            Those cheetahs are nothing more than dogs. A <a href="/dict/watermelon">watermelon</a> is an exuberant kangaroo. An octopus is the tangerine of a grapes? The cherry is a shark. Recent controversy aside, they were lost without the cheerful plum that composed their fox. As far as we can estimate, one cannot separate camels from dynamic hamsters. Those tigers are nothing more than cows! A cow is a squirrel from the right perspective. Their banana was, in this moment, a helpful bear.</p>
+          <p>The first fair dog is, in its own way, a lemon.</p>
+          <address>4746 Kelly Drive, West Virginia</address>
+          <img src="./orange.png" style="border: solid 1px #000">
+        </article>
+      </section>
+      <section class="sidebar-widget">
+        <widget>Some widget here</widget>
+        <widget>Some widget here</widget>
+      </section>
+    </main>
+    <footer>Page footer here</footer>
+  </body>
 </html>