-
Notifications
You must be signed in to change notification settings - Fork 0
/
senters.py
120 lines (81 loc) · 2.6 KB
/
senters.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import logging
import pathlib
import bunkai
import hasami
import kuzukiri
import pysbd
import rhoknp
import sengiri
import spacy
from bunkai.cli import setup
# Directory containing this module; the Bunkai wrapper below keeps its
# "bunkai_model" directory underneath it.
here = pathlib.Path(__file__).parent
class Senter:
    """Common interface of the sentence splitters defined in this module.

    An instance is called with a text and returns a list of sentence
    strings; ``name`` gives a label for the splitter. Both members raise
    ``NotImplementedError`` here and are overridden by the subclasses.
    """

    def __call__(self, text: str) -> list[str]:
        """Split *text* into sentences (must be overridden)."""
        raise NotImplementedError

    @property
    def name(self) -> str:
        """Label of the splitter (must be overridden)."""
        raise NotImplementedError
class Rhoknp(Senter):
    """Sentence splitter that delegates to rhoknp's ``RegexSenter``.

    https://github.com/ku-nlp/rhoknp.
    """

    def __init__(self) -> None:
        # One RegexSenter is created up front and reused for every call.
        self.senter = rhoknp.RegexSenter()

    def __call__(self, text: str) -> list[str]:
        """Run the senter on *text* and return the text of each sentence."""
        document = self.senter(text)
        return [sentence.text for sentence in document.sentences]

    @property
    def name(self) -> str:
        """Return the label ``"rhoknp"``."""
        return "rhoknp"
class Sengiri(Senter):
    """Sentence splitter that delegates to ``sengiri.tokenize``.

    https://github.com/ikegami-yukino/sengiri.
    """

    def __call__(self, text: str) -> list[str]:
        """Return whatever ``sengiri.tokenize`` produces for *text*."""
        sentences = sengiri.tokenize(text)
        return sentences

    @property
    def name(self) -> str:
        """Return the label ``"sengiri"``."""
        return "sengiri"
class Hasami(Senter):
    """Sentence splitter that delegates to ``hasami.segment_sentences``.

    https://github.com/mkartawijaya/hasami.
    """

    def __call__(self, text: str) -> list[str]:
        """Return the result of ``hasami.segment_sentences`` for *text*."""
        return hasami.segment_sentences(text)

    @property
    def name(self) -> str:
        """Return the label ``"hasami"``."""
        return "hasami"
class Bunkai(Senter):
    """Sentence splitter that delegates to ``bunkai.Bunkai``.

    https://github.com/megagonlabs/bunkai.
    """

    def __init__(self) -> None:
        model_path = here / "bunkai_model"
        # When the model directory is missing, let bunkai's CLI ``setup``
        # populate it (presumably fetches the model files -- confirm).
        if not model_path.exists():
            setup(model_path, None)
        self.bunkai = bunkai.Bunkai(path_model=model_path)
        # Disable verbose logging
        bunkai_logger = logging.getLogger("bunkai")
        bunkai_logger.setLevel(logging.ERROR)

    def __call__(self, text: str) -> list[str]:
        """Collect the items yielded by the bunkai splitter into a list."""
        sentences = self.bunkai(text)
        return list(sentences)

    @property
    def name(self) -> str:
        """Return the label ``"bunkai"``."""
        return "bunkai"
class Pysbd(Senter):
    """Sentence splitter that delegates to a pySBD ``Segmenter``.

    https://github.com/nipunsadvilkar/pySBD.
    """

    def __init__(self) -> None:
        # The segmenter is configured for Japanese ("ja") once and reused.
        self.seg = pysbd.Segmenter(language="ja")

    def __call__(self, text: str) -> list[str]:
        """Return the result of the segmenter's ``segment`` for *text*."""
        return self.seg.segment(text)

    @property
    def name(self) -> str:
        """Return the label ``"pysbd"``."""
        return "pysbd"
class Kuzukiri(Senter):
    """Sentence splitter that delegates to a ``kuzukiri.Segmenter``.

    https://github.com/alinear-corp/kuzukiri.
    """

    def __init__(self) -> None:
        # A single Segmenter is built here and shared by all calls.
        self.segmenter = kuzukiri.Segmenter()

    def __call__(self, text: str) -> list[str]:
        """Return the result of the segmenter's ``split`` for *text*."""
        pieces = self.segmenter.split(text)
        return pieces

    @property
    def name(self) -> str:
        """Return the label ``"kuzukiri"``."""
        return "kuzukiri"
class Ginza(Senter):
    """Sentence splitter that uses the spaCy pipeline ``ja_ginza_electra``.

    https://github.com/megagonlabs/ginza.
    """

    def __init__(self) -> None:
        # Load the pipeline once; every call reuses it.
        self.nlp = spacy.load("ja_ginza_electra")

    def __call__(self, text: str) -> list[str]:
        """Process *text* and return the text of each span in ``sents``."""
        doc = self.nlp(text)
        return [span.text for span in doc.sents]

    @property
    def name(self) -> str:
        """Return the label ``"ginza"``."""
        return "ginza"