-
Notifications
You must be signed in to change notification settings - Fork 8
/
case.py
170 lines (122 loc) · 4.57 KB
/
case.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
"""Helpers for recasing.
This module contains the TokenCase enum (representing the outcome variable)
and associated helper functions for case-folding (i.e., applying a casing
to an arbitrary Unicode string)."""
import enum
import unicodedata
from typing import Dict, List, Optional, Tuple
# Casing features at the Unicode character level.
@enum.unique
class CharCase(enum.IntEnum):
    """Character-level casing classes.

    Members are integer-valued and print as their bare member name.
    """

    # "Don't care": any character that is neither 'Ll' nor 'Lu'.
    DC = 0
    # Unicode general category 'Ll'.
    LOWER = 1
    # Unicode general category 'Lu'.
    UPPER = 2

    def __str__(self) -> str:
        # Print just the member name, e.g. "LOWER".
        return self.name
class UnknownCharCaseError(ValueError):
    """Raised when a value is not a recognized CharCase."""
def get_cc(nunichar: str) -> CharCase:
    """Classifies a single Unicode character by its casing.

    Args:
        nunichar: The Unicode character to classify.

    Returns:
        `CharCase.LOWER` for general category 'Ll', `CharCase.UPPER` for
        'Lu', and `CharCase.DC` for every other category.
    """
    category = unicodedata.category(nunichar)
    if category == "Lu":
        return CharCase.UPPER
    if category == "Ll":
        return CharCase.LOWER
    # Everything else, including the remaining letter categories, is
    # treated as "don't care".
    return CharCase.DC
def apply_cc(nunichar: str, cc: CharCase) -> str:
    """Recases one Unicode character according to a CharCase.

    The character's existing casing is ignored for `LOWER` and `UPPER`;
    for `DC` the character is passed through untouched.

    Args:
        nunichar: The Unicode character to recase.
        cc: The CharCase to impose on the character.

    Returns:
        The result of `str.lower()` for `LOWER`, of `str.upper()` for
        `UPPER`, or the input itself for `DC`.

    Raises:
        UnknownCharCaseError: If `cc` matches none of the CharCase members.
    """
    if cc == CharCase.LOWER:
        return nunichar.lower()
    if cc == CharCase.UPPER:
        return nunichar.upper()
    if cc == CharCase.DC:
        return nunichar
    # Nothing matched, so the caller passed something that is not a CharCase.
    raise UnknownCharCaseError(cc)
# Casing features at the word ("token") level.
@enum.unique
class TokenCase(enum.IntEnum):
    """Token-level casing classes.

    Members are integer-valued and print as their bare member name. The
    patterns noted on each member are nominal descriptions of the class.
    """

    # [DC]+
    DC = 0
    # [Ll] ([Ll] | [DC])*
    LOWER = 1
    # [Lu] ([Lu] | [DC])*, except where title takes precedence.
    UPPER = 2
    # [Lu] ([Ll] | [DC])*
    TITLE = 3
    # Anything not covered by the classes above.
    MIXED = 4

    def __str__(self) -> str:
        # Print just the member name, e.g. "TITLE".
        return self.name
class UnknownTokenCaseError(ValueError):
    """Raised when a value is not a recognized TokenCase."""
# Type definitions for mixed-base patterns.
# A per-character casing pattern: one CharCase for each character of a token.
ObligatoryPattern = List[CharCase]
# A pattern, or None where no per-character pattern applies; get_tc returns
# None for every TokenCase other than MIXED.
Pattern = Optional[ObligatoryPattern]
# Maps a string key to its pattern.
# NOTE(review): not used in this file; presumably keyed by the token whose
# mixed-case pattern is stored. Confirm against callers.
MixedPatternTable = Dict[str, ObligatoryPattern]
def get_tc(nunistr: str) -> Tuple[TokenCase, Pattern]:
    """Classifies a Unicode string by its token-level casing.

    Args:
        nunistr: The Unicode string to classify.

    Returns:
        A `(TokenCase, Pattern)` tuple. The pattern is None unless the
        token is `MIXED`, in which case it is a list holding one CharCase
        per character of the input.
    """
    if nunistr.islower():
        return (TokenCase.LOWER, None)
    # Title is tested ahead of upper so that a token satisfying both, such
    # as "A", is reported as titlecase rather than uppercase.
    if nunistr.istitle():
        return (TokenCase.TITLE, None)
    if nunistr.isupper():
        return (TokenCase.UPPER, None)
    char_cases = [get_cc(ch) for ch in nunistr]
    if any(cc != CharCase.DC for cc in char_cases):
        return (TokenCase.MIXED, char_cases)
    # No 'Ll' or 'Lu' characters were found (this includes the empty string).
    return (TokenCase.DC, None)
def apply_tc(nunistr: str, tc: TokenCase, pattern: Pattern = None) -> str:
    """Recases a Unicode string according to a TokenCase.

    For `DC` the input is returned untouched. For `LOWER`, `UPPER` and
    `TITLE` the corresponding `str` method is applied to the whole string.
    For `MIXED` each character is recased by the matching entry of
    `pattern`.

    Args:
        nunistr: The Unicode string to recase.
        tc: The TokenCase to impose.
        pattern: For `MIXED` only, one CharCase per character of `nunistr`;
            ignored for every other TokenCase.

    Returns:
        The recased string.

    Raises:
        UnknownTokenCaseError: If `tc` matches none of the TokenCase members.
        AssertionError: If `tc` is `MIXED` and `pattern` is empty or differs
            in length from `nunistr` (only while assertions are enabled).
    """
    if tc == TokenCase.DC:
        return nunistr
    if tc == TokenCase.LOWER:
        return nunistr.lower()
    if tc == TokenCase.UPPER:
        return nunistr.upper()
    if tc == TokenCase.TITLE:
        return nunistr.title()
    if tc != TokenCase.MIXED:
        raise UnknownTokenCaseError(tc)
    # From here on the token is MIXED; without a pattern, fall back to
    # lowercase.
    if pattern is None:
        return nunistr.lower()
    assert pattern
    assert len(nunistr) == len(pattern)
    recased = [apply_cc(ch, cc) for ch, cc in zip(nunistr, pattern)]
    return "".join(recased)