forked from MihaiValentin/lunr-languages
-
Notifications
You must be signed in to change notification settings - Fork 0
/
lunr.th.js
99 lines (91 loc) · 3 KB
/
lunr.th.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
/*!
* Lunr languages, `Thai` language
* https://github.com/MihaiValentin/lunr-languages
*
* Copyright 2017, Keerati Thiwanruk
* http://www.mozilla.org/MPL/
*/
/*!
* based on
* Snowball JavaScript Library v0.3
* http://code.google.com/p/urim/
* http://snowball.tartarus.org/
*
* Copyright 2010, Oleg Mazko
* http://www.mozilla.org/MPL/
*/
/**
* export the module via AMD, CommonJS or as a browser global
* Export code from https://github.com/umdjs/umd/blob/master/returnExports.js
*/
;
(function(root, factory) {
if (typeof define === 'function' && define.amd) {
// AMD. Register as an anonymous module.
define(factory)
} else if (typeof exports === 'object') {
/**
* Node. Does not work with strict CommonJS, but
* only CommonJS-like environments that support module.exports,
* like Node.
*/
module.exports = factory()
} else {
// Browser globals (root is window)
factory()(root.lunr);
}
}(this, function() {
/**
* Just return a value to define the module export.
* This example returns an object, but the module
* can return a function as the exported value.
*/
return function(lunr) {
/* Lunr itself has to be loaded before this language module can attach to it. */
if (typeof lunr === 'undefined') {
  throw new Error('Lunr is not present. Please include / require Lunr before this script.');
}
/* The shared stemmer support helpers have to be loaded as well. */
if (typeof lunr.stemmerSupport === 'undefined') {
  throw new Error('Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.');
}
/*
 * Thai, like Japanese, is tokenized without relying on spaces between words.
 * The tokenizer is therefore installed with the same logic as the Japanese module,
 * which differs between Lunr versions; this flag records whether the major version is 2.
 */
var isLunr2 = lunr.version[0] === "2";
/* register specific locale function */
/**
 * Locale function for Thai.
 *
 * Expects `this` to expose a `pipeline` with `reset()` and `add()`. The pipeline
 * is cleared and rebuilt with the Thai trimmer only (no stop word filter is
 * added), then the Thai tokenizer is installed wherever the running Lunr
 * version keeps its tokenizer.
 */
lunr.th = function() {
  this.pipeline.reset();
  this.pipeline.add(lunr.th.trimmer);

  var thaiTokenizer = lunr.th.tokenizer;

  // Lunr 2.0.0: the tokenizer is a property of the object this function is applied to.
  if (isLunr2) {
    this.tokenizer = thaiTokenizer;
    return;
  }

  // Lunr 0.6.0: the tokenizer is a global on the lunr namespace.
  if (lunr.tokenizer) {
    lunr.tokenizer = thaiTokenizer;
  }
  // Lunr 0.7.0 -> 1.0.0: the tokenizer is stored as `tokenizerFn`.
  if (this.tokenizerFn) {
    this.tokenizerFn = thaiTokenizer;
  }
};
/* lunr trimmer function */
// NOTE(review): the value below already carries its own square brackets. If
// generateTrimmer wraps its argument in a character class itself, these
// brackets change the resulting pattern - confirm against the trimmer support
// code before touching this string, since indexed terms depend on it.
lunr.th.wordCharacters = "[\u0e00-\u0e7f]";
lunr.th.trimmer = lunr.trimmerSupport.generateTrimmer(lunr.th.wordCharacters);
lunr.Pipeline.registerFunction(lunr.th.trimmer, 'trimmer-th');

/* throw error if the wordcut segmenter is not yet included */
if ('undefined' === typeof lunr.wordcut) {
  throw new Error('Lunr wordcut is not present. Please include / require the wordcut segmenter before this script.');
}
/* word segmenter used by the Thai tokenizer; init() must run once before cut() is used */
var segmenter = lunr.wordcut;
segmenter.init();
/**
 * Tokenizer for Thai text.
 *
 * @param {*} obj - Value to tokenize. A missing, `null` or `undefined` value
 *   yields no tokens; an array is treated as an already tokenized list; any
 *   other value is converted with `toString()`, stripped of leading
 *   whitespace and split into words by the wordcut segmenter.
 * @returns {Array} The tokens: `lunr.Token` instances when running under
 *   Lunr 2, plain values otherwise.
 */
lunr.th.tokenizer = function(obj) {
  // `== null` matches both null and undefined.
  if (!arguments.length || obj == null) return [];

  // Lunr 2 expects lunr.Token instances from a tokenizer; older versions work
  // on plain strings. Both the array and the string path go through this so
  // the return type is consistent.
  var toToken = function(term) {
    return isLunr2 ? new lunr.Token(term) : term;
  };

  if (Array.isArray(obj)) {
    return obj.map(function(t) {
      return toToken(t);
    });
  }

  var str = obj.toString().replace(/^\s+/, '');

  // The segmenter output is split on '|' to obtain the individual words.
  // Empty segments (e.g. from an empty input string) are dropped here: as
  // plain strings the pipeline would discard them anyway, but wrapped in a
  // Token they would no longer compare equal to ''.
  return segmenter.cut(str).split('|')
    .filter(function(segment) {
      return segment !== '';
    })
    .map(function(segment) {
      return toToken(segment);
    });
};
};
}))