Merge pull request #933 from spencermountain/dev

Dev
spencermountain · Jul 2, 2022 · 08f2a85 · 08f2a85
2 parents d123804 + f2ac30a
commit 08f2a85
Show file tree

Hide file tree

Showing 143 changed files with 2,245 additions and 1,412 deletions.
diff --git a/API.md b/API.md
@@ -146,7 +146,7 @@
 * .uncache()
 
 ### Lookup
-* nlp.compile()
+* nlp.buildTrie()
 * .lookup()
 
 ### Typeahead

diff --git a/README.md b/README.md
@@ -512,6 +512,7 @@ _(match methods use the [match-syntax](https://docs.compromise.cool/compromise-m
 - **[.growRight('')](https://observablehq.com/@spencermountain/compromise-match)** - add any matching terms immediately after each match
 - **[.growLeft('')](https://observablehq.com/@spencermountain/compromise-match)** - add any matching terms immediately before each match
 - **[.grow('')](https://observablehq.com/@spencermountain/compromise-match)** - add any matching terms before or after each match
+- **[.sweep(net)](https://observablehq.com/@spencermountain/compromise-sweep)** - apply a series of match objects to the document
 - **[.splitOn('')](https://observablehq.com/@spencermountain/compromise-split)** - return a Document with three parts for every match ('splitOn')
 - **[.splitBefore('')](https://observablehq.com/@spencermountain/compromise-split)** - partition a phrase before each matching segment
 - **[.splitAfter('')](https://observablehq.com/@spencermountain/compromise-split)** - partition a phrase after each matching segment
@@ -573,16 +574,23 @@ _(match methods use the [match-syntax](https://docs.compromise.cool/compromise-m
 
 _(these methods are on the main `nlp` object)_
 
-- **[nlp.tokenize()](https://observablehq.com/@spencermountain/compromise-tokenization)** - parse text without running POS-tagging
-- **[nlp.plugin()](https://observablehq.com/@spencermountain/compromise-constructor-methods)** - mix in a compromise-plugin
-- **[nlp.parseMatch()](https://observablehq.com/@spencermountain/compromise-constructor-methods)** - pre-parse any match statements for faster lookups
+- **[nlp.tokenize(str)](https://observablehq.com/@spencermountain/compromise-tokenization)** - parse text without running POS-tagging
+- **[nlp.lazy(str, match)](https://observablehq.com/@spencermountain/compromise-performance)** - scan through a text with minimal analysis
+- **[nlp.plugin({})](https://observablehq.com/@spencermountain/compromise-constructor-methods)** - mix in a compromise-plugin
+- **[nlp.parseMatch(str)](https://observablehq.com/@spencermountain/compromise-constructor-methods)** - pre-parse any match statements into json
 - **[nlp.world()](https://observablehq.com/@spencermountain/compromise-constructor-methods)** - grab or change library internals
 - **[nlp.model()](https://observablehq.com/@spencermountain/compromise-constructor-methods)** - grab all current linguistic data
 - **[nlp.methods()](https://observablehq.com/@spencermountain/compromise-constructor-methods)** - grab or change internal methods
 - **[nlp.hooks()](https://observablehq.com/@spencermountain/compromise-constructor-methods)** - see which compute methods run automatically
-- **[nlp.verbose()](https://observablehq.com/@spencermountain/compromise-constructor-methods)** - log our decision-making for debugging
+- **[nlp.verbose(mode)](https://observablehq.com/@spencermountain/compromise-constructor-methods)** - log our decision-making for debugging
 - **[nlp.version](https://observablehq.com/@spencermountain/compromise-constructor-methods)** - current semver version of the library
 
+- **[nlp.addWords(obj)](https://observablehq.com/@spencermountain/compromise-plugin)** - add new words to the lexicon
+- **[nlp.addTags(obj)](https://observablehq.com/@spencermountain/compromise-plugin)** - add new tags to the tagSet
+- **[nlp.typeahead(arr)](https://observablehq.com/@spencermountain/compromise-typeahead)** - add words to the auto-fill dictionary
+- **[nlp.buildTrie(arr)](https://observablehq.com/@spencermountain/compromise-lookup)** - compile a list of words into a fast lookup form
+- **[nlp.buildNet(arr)](https://observablehq.com/@spencermountain/compromise-sweep)** - compile a list of matches into a fast match form
+
 <!-- spacer -->
 <img height="30px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/>
 

diff --git a/builds/compromise.js b/builds/compromise.js
diff --git a/builds/one/compromise-one.cjs b/builds/one/compromise-one.cjs
diff --git a/builds/one/compromise-one.mjs b/builds/one/compromise-one.mjs
diff --git a/builds/three/compromise-three.cjs b/builds/three/compromise-three.cjs
diff --git a/builds/three/compromise-three.mjs b/builds/three/compromise-three.mjs
diff --git a/builds/two/compromise-two.cjs b/builds/two/compromise-two.cjs
diff --git a/builds/two/compromise-two.mjs b/builds/two/compromise-two.mjs
diff --git a/changelog.md b/changelog.md
@@ -6,9 +6,22 @@ compromise uses semver, and pushes to npm and github frequently
 
 While all _Major_ releases should be reviewed, our only _large_ releases are **v6** in 2016, **v12** in 2019, and **v14** in 2022. Others have been mostly incremental.
 
-<!-- #### [Unreleased]
+<!-- #### 14.4.0 [Unreleased]
+
 -->
 
+#### 14.4.0 [July 2022]
+- **[change]** - root matches like '{walk}' now work without first calling .compute('root')
+- **[change]** - split numbers+units '12km' as contraction - #919
+- **[new]** - `.lazy(txt, match)` fast-scan method [1](https://observablehq.com/@spencermountain/compromise-performance)
+- **[fix]** - support apostrophes in lexicon #932
+- **[fix]** - support unTag property in sweep
+- **[change]** - keep sentence caches, when still valid
+- **[change]** - alias nlp.compile() to .buildTrie()
+- **[fix]** - tagging fixes
+- **[update]** - dependencies
+  _plugin-releases_: dates, speed, de-compromise
+
 #### 14.3.1 [June 2022]
 - **[fix]** - missed caches in .sweep()
 - **[new]** - .out('hash') and `.json({hash:true})`

diff --git a/data/lexicon/index.js b/data/lexicon/index.js
@@ -60,6 +60,7 @@ import personNoun from './switches/person-noun.js'
 import personDate from './switches/person-date.js'
 import personVerb from './switches/person-verb.js'
 import personPlace from './switches/person-place.js'
+import unitNoun from './switches/unit-noun.js'
 
 //add-in the generic, flat word-lists
 const data = [
@@ -123,6 +124,7 @@ const data = [
   [personPlace, 'Person|Place'],
   [personDate, 'Person|Date'],
   [personVerb, 'Person|Verb'],
+  [unitNoun, 'Unit|Noun'],
 ]
 for (let i = 0; i < data.length; i++) {
   const list = data[i][0]

diff --git a/data/lexicon/misc.js b/data/lexicon/misc.js
@@ -55,6 +55,7 @@ export default {
 
   //misc
   records: 'Plural',
+  feet: 'Plural',
   'a few': 'Value',
   'ones': 'Plural', //those ones
 

diff --git a/data/lexicon/nouns/singulars.js b/data/lexicon/nouns/singulars.js
@@ -400,6 +400,7 @@ export default [
   'tv',
 
   'stone',
+  'man',
   'tributary',
 ]
 

diff --git a/data/lexicon/numbers/units.js b/data/lexicon/numbers/units.js
@@ -1,216 +1,118 @@
+// ambiguous units are in ../switches/unit-noun.js
 // units that are also abbreviations are in ../abbrev/units.js
 export default [
   '°c',
   'celsius',
   '°f',
   'fahrenheit',
-  // 'kelvin',
   'kelvins',
   '°n',
-  'newton',
-  'newtons',
   'm³',
-  'cubic meter',
-  'cubic meters',
-  'm3',
   'dm³',
-  'cubic decimeter',
-  'cubic decimeters',
-  'dm3',
   'cm³',
-  'cubic centimeter',
-  'cubic centimeters',
-  'cm3',
   'litre',
   'litres',
   'liter',
   'liters',
-  // 'dl',
   'deciliter',
   'deciliters',
-  // 'cl',
   'centiliter',
   'centiliters',
-  // 'ml',
   'milliliter',
   'milliliters',
   'in³',
-  'cubic inch',
-  'cubic inchs',
-  'in3',
   'ft³',
-  'cubic foot',
-  'cubic foots',
-  'ft3',
   'yd³',
-  'cubic yard',
-  'cubic yards',
-  'yd3',
-  // 'gal',
   'gallon',
   'gallons',
   'bbl',
-  // 'pt',
   'pint',
   'pints',
-  // 'qt',
   'quart',
   'quarts',
-  // 'tbl',
-  'tablespoon',
-  'tablespoons',
-  // 'tsp',
-  'teaspoon',
-  'teaspoons',
-  // 'tbsp',
-  'cup', //ambig
-  'cups',
-  // 'fl oz',
+  'fl oz',
   'fluid ounce',
   'fluid ounces',
-  // 'km',
   'kilometer',
   'kilometers',
   'meter',
   'meters',
-  // 'dm',
   'decimeter',
   'decimeters',
-  // 'cm',
   'centimeter',
   'centimeters',
-  // 'mm',
   'millimeter',
   'millimeters',
-  // 'mi',
   'mile',
-  // 'miles',
-  // 'ft', //ambiguous
-  // 'yd',
-  'yard',
-  'yards',
   'tonne',
   'tonnes',
-  // 'kg',
+  'kilo',
+  'kilos',
   'kilogram',
   'kilograms',
-  // 'hg',
   'hectogram',
   'hectograms',
   'gram',
   'grams',
-  // 'dg',
   'decigram',
   'decigrams',
-  // 'cg',
   'centigram',
   'centigrams',
-  // 'mg',
   'milligram',
   'milligrams',
-  // 'µg',
+  'µg',
   'microgram',
   'micrograms',
   'carat',
   'carats',
-  'grain',
-  'grains',
-  // 'oz',
   'ounce',
   'ounces',
-  // 'lb',
-  'pound', //ambig
-  'pounds',
   'ton',
   'km²',
-  'square kilometer',
-  'square kilometers',
-  'km2',
   'm²',
-  'square meter',
-  'square meters',
-  'm2',
   'dm²',
-  'square decimeter',
-  'square decimeters',
-  'dm2',
   'cm²',
-  'square centimeter',
-  'square centimeters',
-  'cm2',
   'mm²',
-  'square millimeter',
-  'square millimeters',
-  'mm2',
   'hectare',
   'hectares',
   'mile²',
-  'square mile',
-  'square miles',
-  'mile2',
   'in²',
-  'square inch',
-  'square inchs',
-  'in2',
   'yd²',
-  'square yard',
-  'square yards',
-  'yd2',
   'ft²',
-  'square foot',
-  'square foots',
-  // 'sq ft',
-  'square feet',
-  'square feets',
-  'ft2',
+  'sq ft',
   'acre',
   'acres',
-  // 'hz',
   'hertz',
   'hertzs',
   'km/h',
-  'kilometer per hour',
-  'kilometers per hour',
-  // 'kmph',
-  // 'mps',
-  'meter per second',
-  'meters per second',
+  'kmph',
   'm/s',
-  // 'mph',
-  'mile per hour',
-  'miles per hour',
-  'miles an hour',
   'mi/h',
   'knot',
   'knots',
   'byte',
   'bytes',
-  // 'kb',
   'kilobyte',
   'kilobytes',
-  // 'mb',
   'megabyte',
   'megabytes',
-  // 'gb',//ambig
   'gigabyte',
   'gigabytes',
-  // 'tb',
   'terabyte',
   'terabytes',
   'petabyte',
   'petabytes',
-  // 'eb',
   'exabyte',
   'exabytes',
-  // 'zb',
   'zettabyte',
   'zettabytes',
-  // 'yb',
   'yottabyte',
   'yottabytes',
+  'kbps',
+  'bbps',
+  'gbps',
   'joule',
   'joules',
-  // 'pa',
   'pascals',
   'watt',
   'watts',
@@ -221,9 +123,7 @@ export default [
   'farad',
   'farads',
   'ohms',
-  // 'lx',
   'lux',
-  // 'lm',
   'lumen',
   'lumens',
   'µs',
@@ -238,5 +138,6 @@ export default [
   'attosecond',
   'attoseconds',
   'percent',
+  'year old',
   'years old',
 ]
diff --git a/data/lexicon/switches/adj-noun.js b/data/lexicon/switches/adj-noun.js
@@ -93,6 +93,7 @@ export default [
   'subject',
   'subordinate',
   'superior',
+  'swell',
   'taboo',
   'tan',
   'teen',

diff --git a/data/lexicon/switches/noun-gerund.js b/data/lexicon/switches/noun-gerund.js
@@ -58,7 +58,6 @@ export default [
   'debating',
   'directing',
   'doubting',
-  'downloading',
   'drawing',
   'dressing',
   'drinking',

diff --git a/data/lexicon/switches/noun-verb.js b/data/lexicon/switches/noun-verb.js
@@ -168,6 +168,7 @@ export default [
   'dispute',
   'divorce',
   'document',
+  'download',
   'dodge',
   'doubt',
   'draft',
@@ -635,6 +636,7 @@ export default [
   'twist',
   'type',
   'upgrade',
+  'upload',
   'usher',
   'vacuum',
   'value',

diff --git a/data/lexicon/switches/person-verb.js b/data/lexicon/switches/person-verb.js
@@ -15,6 +15,7 @@ export default [
   'chuck',
   'mack',
   'grant',//'sung'
+  'chase'
   // 'will',
   //  may
 ]