Skip to content

Commit

Permalink
Merge pull request #292 from ndaidong/7.1.0
Browse files Browse the repository at this point in the history
v7.1.0 - To work with `bun` and `deno`
  • Loading branch information
ndaidong authored Sep 17, 2022
2 parents da8df03 + b5875c7 commit 2ab8a99
Show file tree
Hide file tree
Showing 14 changed files with 147 additions and 311 deletions.
32 changes: 0 additions & 32 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -288,12 +288,8 @@ In addition, this lib provides some methods to customize default settings. Don't
- getParserOptions()
- setParserOptions(Object parserOptions)
- getRequestOptions()
- setRequestOptions(Object requestOptions)
- getSanitizeHtmlOptions()
- setSanitizeHtmlOptions(Object sanitizeHtmlOptions)
- getHtmlCrushOptions()
- setHtmlCrushOptions(Object htmlCrushOptions)
Here are default properties/values:
Expand All @@ -312,23 +308,6 @@ Here are default properties/values:
Read [string-comparison](https://www.npmjs.com/package/string-comparison) docs for more info about `urlsCompareAlgorithm`.
#### Object `requestOptions`:
```js
{
headers: {
'user-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101 Firefox/102.0',
accept: 'text/html; charset=utf-8'
},
responseType: 'text',
responseEncoding: 'utf8',
timeout: 6e4,
maxRedirects: 3
}
```
Read [axios' request config](https://axios-http.com/docs/req_config) for more info.

#### Object `sanitizeHtmlOptions`:
```js
Expand Down Expand Up @@ -370,17 +349,6 @@ Read [axios' request config](https://axios-http.com/docs/req_config) for more in
Read [sanitize-html](https://www.npmjs.com/package/sanitize-html#what-are-the-default-options) docs for more info.
#### Object `htmlCrushOptions`:

```js
{
removeLineBreaks: true,
removeHTMLComments: 2
}
```

For more options, please refer [html-crush](https://www.codsen.com/os/html-crush/) docs.

## Test
Expand Down
99 changes: 49 additions & 50 deletions dist/article-parser.browser.js

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions dist/article-parser.browser.js.map

Large diffs are not rendered by default.

156 changes: 69 additions & 87 deletions dist/cjs/article-parser.js

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions dist/cjs/article-parser.js.map

Large diffs are not rendered by default.

12 changes: 0 additions & 12 deletions dist/cjs/index.d.ts
Original file line number Diff line number Diff line change
@@ -1,12 +1,8 @@
// Type definitions

import {AxiosRequestConfig} from "axios";
import {IOptions as SanitizeOptions} from "sanitize-html";
import {defaults} from "html-crush";
import "urlpattern-polyfill";

type HtmlCrushOptions = Partial<typeof defaults>

/**
* @example
* {
Expand Down Expand Up @@ -37,20 +33,12 @@ export function removeTransformations(options: Array<URLPatternInit>): Number;

export function setParserOptions(options: ParserOptions): void;

export function setRequestOptions(options: AxiosRequestConfig): void;

export function setSanitizeHtmlOptions(options: SanitizeOptions): void;

export function setHtmlCrushOptions(options: HtmlCrushOptions): void;

export function getParserOptions(): ParserOptions;

export function getRequestOptions(): AxiosRequestConfig;

export function getSanitizeHtmlOptions(): SanitizeOptions;

export function getHtmlCrushOptions(): HtmlCrushOptions;

export interface ParserOptions {
/**
* For estimating "time to read".
Expand Down
2 changes: 1 addition & 1 deletion dist/cjs/package.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"name": "article-parser-cjs",
"version": "7.0.3",
"version": "7.1.0",
"main": "./article-parser.js"
}
12 changes: 0 additions & 12 deletions index.d.ts
Original file line number Diff line number Diff line change
@@ -1,12 +1,8 @@
// Type definitions

import {AxiosRequestConfig} from "axios";
import {IOptions as SanitizeOptions} from "sanitize-html";
import {defaults} from "html-crush";
import "urlpattern-polyfill";

type HtmlCrushOptions = Partial<typeof defaults>

/**
* @example
* {
Expand Down Expand Up @@ -37,20 +33,12 @@ export function removeTransformations(options: Array<URLPatternInit>): Number;

export function setParserOptions(options: ParserOptions): void;

export function setRequestOptions(options: AxiosRequestConfig): void;

export function setSanitizeHtmlOptions(options: SanitizeOptions): void;

export function setHtmlCrushOptions(options: HtmlCrushOptions): void;

export function getParserOptions(): ParserOptions;

export function getRequestOptions(): AxiosRequestConfig;

export function getSanitizeHtmlOptions(): SanitizeOptions;

export function getHtmlCrushOptions(): HtmlCrushOptions;

export interface ParserOptions {
/**
* For estimating "time to read".
Expand Down
4 changes: 2 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"version": "7.0.3",
"version": "7.1.0",
"name": "article-parser",
"description": "To extract main article from given URL",
"homepage": "https://ndaidong.github.io/article-parser-demo/",
Expand Down Expand Up @@ -33,8 +33,8 @@
},
"dependencies": {
"@mozilla/readability": "^0.4.2",
"axios": "^0.27.2",
"bellajs": "^11.0.7",
"cross-fetch": "^3.1.5",
"html-crush": "^5.1.6",
"linkedom": "^0.14.14",
"sanitize-html": "^2.7.2",
Expand Down
46 changes: 1 addition & 45 deletions src/config.js
Original file line number Diff line number Diff line change
@@ -1,18 +1,6 @@
// configs

import { clone, copies } from 'bellajs'

const requestOptions = {
headers: {
'user-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:104.0) Gecko/20100101 Firefox/104.0',
accept: 'text/html; charset=utf-8',
'accept-encoding': 'deflate,zlib,gzip'
},
responseType: 'text',
responseEncoding: 'utf8',
timeout: 6e4, // 1 minute
maxRedirects: 3
}
import { clone } from 'bellajs'

const sanitizeHtmlOptions = {
allowedTags: [
Expand Down Expand Up @@ -60,14 +48,6 @@ const sanitizeHtmlOptions = {
]
}

/**
* @type {HtmlCrushOptions}
*/
const htmlCrushOptions = {
removeHTMLComments: 2,
removeLineBreaks: true
}

const parserOptions = {
wordsPerMinute: 300, // to estimate "time to read"
urlsCompareAlgorithm: 'levenshtein', // to find the best url from list
Expand All @@ -77,33 +57,17 @@ const parserOptions = {
}

const state = {
requestOptions,
sanitizeHtmlOptions,
htmlCrushOptions,
parserOptions
}

/**
* @returns {RequestOptions}
*/
export const getRequestOptions = () => {
return clone(state.requestOptions)
}

/**
* @returns {SanitizeOptions}
*/
export const getSanitizeHtmlOptions = () => {
return clone(state.sanitizeHtmlOptions)
}

/**
* @returns {HtmlCrushOptions}
*/
export const getHtmlCrushOptions = () => {
return clone(state.htmlCrushOptions)
}

/**
* @returns {ParserOptions}
*/
Expand All @@ -119,14 +83,6 @@ export const setParserOptions = (opts = {}) => {
})
}

export const setRequestOptions = (opts = {}) => {
copies(opts, state.requestOptions)
}

export const setHtmlCrushOptions = (opts = {}) => {
copies(opts, state.htmlCrushOptions)
}

export const setSanitizeHtmlOptions = (opts = {}) => {
Object.keys(opts).forEach((key) => {
state.sanitizeHtmlOptions[key] = clone(opts[key])
Expand Down
41 changes: 1 addition & 40 deletions src/config.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,37 +2,12 @@
/* eslint-env jest */

import {
setRequestOptions,
getRequestOptions,
setParserOptions,
getParserOptions,
setSanitizeHtmlOptions,
getSanitizeHtmlOptions,
getHtmlCrushOptions,
setHtmlCrushOptions
getSanitizeHtmlOptions
} from './config.js'

test('Testing setRequestOptions/getRequestOptions methods', () => {
setRequestOptions({
headers: {
authorization: 'bearer <token>'
},
timeout: 20,
somethingElse: 1000
})

const actual = getRequestOptions()
const expectedHeader = {
authorization: 'bearer <token>',
'user-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:104.0) Gecko/20100101 Firefox/104.0',
accept: 'text/html; charset=utf-8',
'accept-encoding': 'deflate,zlib,gzip'
}

expect(actual.headers).toEqual(expectedHeader)
expect(actual.timeout).toEqual(20)
})

test('Testing setParserOptions/getParserOptions methods', () => {
const expectedWPM = 400
const expectedAlgorithm = 'levenshtein'
Expand Down Expand Up @@ -73,17 +48,3 @@ test('Testing setSanitizeHtmlOptions/getSanitizeHtmlOptions methods', () => {

expect(getSanitizeHtmlOptions().allowedTags).toEqual([])
})

test('Testing setHtmlCrushOptions/getHtmlCrushOptions methods', () => {
const removeHTMLComments = 4
const removeLineBreaks = true

setHtmlCrushOptions({
removeHTMLComments
})

const actual = getHtmlCrushOptions()

expect(actual.removeHTMLComments).toEqual(removeHTMLComments)
expect(actual.removeLineBreaks).toEqual(removeLineBreaks)
})
8 changes: 0 additions & 8 deletions src/main.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,8 @@ import {
extract,
getParserOptions,
setParserOptions,
getRequestOptions,
setRequestOptions,
getSanitizeHtmlOptions,
setSanitizeHtmlOptions,
getHtmlCrushOptions,
setHtmlCrushOptions,
addTransformations,
removeTransformations
} from './main'
Expand All @@ -32,12 +28,8 @@ describe('check all exported methods', () => {
extract,
getParserOptions,
setParserOptions,
getRequestOptions,
setRequestOptions,
getSanitizeHtmlOptions,
setSanitizeHtmlOptions,
getHtmlCrushOptions,
setHtmlCrushOptions,
addTransformations,
removeTransformations
]
Expand Down
9 changes: 7 additions & 2 deletions src/utils/html.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import { DOMParser } from 'linkedom'
import sanitize from 'sanitize-html'
import { crush } from 'html-crush'

import { getHtmlCrushOptions, getSanitizeHtmlOptions } from '../config.js'
import { getSanitizeHtmlOptions } from '../config.js'

export const isValid = (str = '') => {
const reg = /<(?=.*? .*?\/ ?>|br|hr|input|!--|wbr)[a-z]+.*?>|<([a-z]+).*?<\/\1>/i
Expand All @@ -18,6 +18,11 @@ export const cleanify = html => {
})
}

const htmlCrushOptions = {
removeHTMLComments: 2,
removeLineBreaks: true
}

/**
* @param inputHtml {string}
* @returns cleanHtml {string}
Expand All @@ -27,7 +32,7 @@ export const cleanAndMinify = (inputHtml) => {

const html = doc.documentElement.innerHTML

const crushed = crush(html, getHtmlCrushOptions())
const crushed = crush(html, htmlCrushOptions)

const cleanHtml = sanitize(crushed.result, getSanitizeHtmlOptions())

Expand Down
21 changes: 9 additions & 12 deletions src/utils/retrieve.js
Original file line number Diff line number Diff line change
@@ -1,19 +1,16 @@
// utils -> retrieve

import axios from 'axios'

import { getRequestOptions } from '../config.js'
import fetch from 'cross-fetch'

export default async (url) => {
try {
const res = await axios.get(url, getRequestOptions())

const contentType = res.headers['content-type'] || ''
if (!contentType || !contentType.includes('text/html')) {
throw new Error(`Content type must be "text/html", not "${contentType}"`)
const res = await fetch(url, {
headers: {
'user-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:104.0) Gecko/20100101 Firefox/104.0'
}
return res.data
} catch (err) {
throw new Error(`${err.name}: ${err.message}`)
})
const contentType = res.headers.get('content-type') || ''
if (!contentType || !contentType.includes('text/')) {
throw new Error(`Content type must be "text/html", not "${contentType}"`)
}
return res.text()
}

0 comments on commit 2ab8a99

Please sign in to comment.