Merge pull request #308 from ndaidong/7.2.4

v7.2.4
extractus · Sep 24, 2022 · 0e80ba3 · 0e80ba3
2 parents 4c6abe1 + 3e86b80
commit 0e80ba3
Show file tree

Hide file tree

Showing 11 changed files with 94 additions and 67 deletions.
diff --git a/.gitignore b/.gitignore
@@ -15,3 +15,5 @@ coverage
 yarn.lock
 coverage.lcov
 pnpm-lock.yaml
+
+evaluation
diff --git a/README.md b/README.md
@@ -177,6 +177,10 @@ extract(url, null, {
 
 Passing requests through a proxy is useful when running `article-parser` in the browser. See [examples/browser-article-parser](https://github.com/ndaidong/article-parser/tree/main/examples/browser-article-parser) as a reference example.
 
+For more information about proxy authentication, please refer to [HTTP authentication](https://developer.mozilla.org/en-US/docs/Web/HTTP/Authentication).
+
+For deeper customization, consider using a [Proxy](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Proxy) to replace the behavior of `fetch` with your own handlers.
+
 ---
 
 ### Transformations

diff --git a/dist/article-parser.esm.js b/dist/article-parser.esm.js
diff --git a/dist/cjs/article-parser.js b/dist/cjs/article-parser.js
diff --git a/dist/cjs/package.json b/dist/cjs/package.json
@@ -1,5 +1,5 @@
 {
   "name": "article-parser",
-  "version": "7.2.3",
+  "version": "7.2.4",
   "main": "./article-parser.js"
 }
diff --git a/eval.js b/eval.js
@@ -1,15 +1,24 @@
 // eval.js
 
-import { readFileSync, existsSync } from 'fs'
+import { execSync } from 'child_process'
+import { readFileSync, writeFileSync, existsSync } from 'fs'
+
+import { slugify } from 'bellajs'
 
 import { isValid as isValidUrl } from './src/utils/linker.js'
 import { extract } from './src/main.js'
 
+if (!existsSync('evaluation')) {
+  execSync('mkdir evaluation')
+}
+
 const extractFromUrl = async (url) => {
   try {
     console.time('extraction')
     const art = await extract(url)
     console.log(art)
+    const slug = slugify(art.title)
+    writeFileSync(`evaluation/${slug}.html`, art.content, 'utf8')
     console.timeEnd('extraction')
   } catch (err) {
     console.trace(err)

diff --git a/examples/browser-article-parser/README.md b/examples/browser-article-parser/README.md
@@ -23,8 +23,6 @@ Basically `article-parser` only works at server side.
 However, there are some noble publishers that enable `Access-Control-Allow-Origin` on their services.
 For example with articles from [bitcoin.com](https://news.bitcoin.com/the-future-of-nft-is-evt-the-new-game-changer-token/), [CNBC](https://www.cnbc.com/2022/09/21/what-another-major-rate-hike-by-the-federal-reserve-means-to-you.html) or [Decrypt](https://decrypt.co/110356/cardano-blockchain-moves-forward-with-vasil-upgrade) we can extract from browser.
 
-Another ideal environment to run `article-parser` directly is browser extensions.
-
 With the remaining cases, we need a proxy layer to bypass CORS policy.
 
 ---
diff --git a/index.js b/index.js
diff --git a/package.json b/package.json
@@ -1,5 +1,5 @@
 {
-  "version": "7.2.3",
+  "version": "7.2.4",
   "name": "article-parser",
   "description": "To extract main article from given URL",
   "homepage": "https://demos.pwshub.com/article-parser",
@@ -26,7 +26,6 @@
   },
   "scripts": {
     "lint": "standard .",
-
     "test": "NODE_ENV=test NODE_OPTIONS=--experimental-vm-modules jest --coverage=true",
     "build": "node build",
     "eval": "node eval",

diff --git a/reset.js b/reset.js
@@ -11,7 +11,7 @@ import {
 import { execSync } from 'child_process'
 
 const dirs = [
-  'dist',
+  'evaluation',
   'docs',
   '.nyc_output',
   'coverage',

diff --git a/src/utils/html.js b/src/utils/html.js
@@ -2,19 +2,37 @@
 
 import { DOMParser } from 'linkedom'
 import sanitize from 'sanitize-html'
+import { pipe } from 'bellajs'
 
 import { getSanitizeHtmlOptions } from '../config.js'
 
-export const purify = html => {
+export const purify = (html) => {
   return sanitize(html, {
     allowedTags: false,
     allowedAttributes: false
   })
 }
 
+const WS_REGEXP = /^[\s\f\n\r\t\u1680\u180e\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000\ufeff\x09\x0a\x0b\x0c\x0d\x20\xa0]+$/ // eslint-disable-line
+
+const stripMultiLinebreaks = (str) => {
+  return str.replace(/(\r\n|\n|\u2424){2,}/g, '\n').split('\n').map((line) => {
+    return WS_REGEXP.test(line) ? line.trim() : line
+  }).filter((line) => {
+    return line.length > 0
+  }).join('\n')
+}
+
+const stripMultispaces = (str) => {
+  return str.replace(WS_REGEXP, ' ').replace(/  +/g, ' ').trim()
+}
+
 export const cleanify = (inputHtml) => {
   const doc = new DOMParser().parseFromString(inputHtml, 'text/html')
   const html = doc.documentElement.innerHTML
-  const cleanHtml = sanitize(html, getSanitizeHtmlOptions())
-  return cleanHtml.replace(/[\r\n]/gm, '').replace(/  +/g, ' ').trim()
+  return pipe(
+    input => sanitize(input, getSanitizeHtmlOptions()),
+    input => stripMultiLinebreaks(input),
+    input => stripMultispaces(input)
+  )(html)
 }