-
Notifications
You must be signed in to change notification settings - Fork 0
/
mediawiki-furigana.py
172 lines (158 loc) · 5.98 KB
/
mediawiki-furigana.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
# -*- coding:utf-8 -*-
'''
This is a script for creating Japanese lyric furigana in MoegirlPedia
(https://zh.moegirl.org.cn/) or any Mediawiki site you want.
This script is written by a Python beginner.
You have to modify the prefix and suffix to suit your own need.
In regular MediaWiki you may want to use {{ruby|word|read}}
IMPORTANT NOTICE:
ALL THE FURIGANA GENERATED BY THE SCRIPT NEED MANUAL CHECKING.
Since the context info was lost because of the program design,
MeCab sometimes cannot generate appropriate furigana.
But it is quite a relief compared to manually adding furigana.
You can copy the generated lyrics from the console.
'''
'''
ranges = [
{"from": ord(u"\u3300"), "to": ord(u"\u33ff")}, # compatibility ideographs
{"from": ord(u"\ufe30"), "to": ord(u"\ufe4f")}, # compatibility ideographs
{"from": ord(u"\uf900"), "to": ord(u"\ufaff")}, # compatibility ideographs
{"from": ord(u"\U0002F800"), "to": ord(u"\U0002fa1f")}, # compatibility ideographs
#{'from': ord(u'\u3040'), 'to': ord(u'\u309f')}, # Japanese Hiragana
#{"from": ord(u"\u30a0"), "to": ord(u"\u30ff")}, # Japanese Katakana
{"from": ord(u"\u2e80"), "to": ord(u"\u2eff")}, # cjk radicals supplement
{"from": ord(u"\u4e00"), "to": ord(u"\u9fff")},
{"from": ord(u"\u3400"), "to": ord(u"\u4dbf")},
{"from": ord(u"\U00020000"), "to": ord(u"\U0002a6df")},
{"from": ord(u"\U0002a700"), "to": ord(u"\U0002b73f")},
{"from": ord(u"\U0002b740"), "to": ord(u"\U0002b81f")},
{"from": ord(u"\U0002b820"), "to": ord(u"\U0002ceaf")} # included as of Unicode 8.0
]
#Complementary unicode sets
'''
ranges = [{"from": ord(u"\u4e00"), "to": ord(u"\u9fff")} ]
def is_cjk(char):
return any([range["from"] <= ord(char) <= range["to"] for range in ranges])
def cjk_substrings(string):
i = 0
while i<len(string):
if is_cjk(string[i]):
start = i
while is_cjk(string[i]): i += 1
yield string[start:i]
i += 1
def k2h(input_str_k):
input_str_k = str(input_str_k
.replace ( 'ア','あ' )
.replace ( 'イ','い' )
.replace ( 'ウ','う' )
.replace ( 'エ','え' )
.replace ( 'オ','お' )
.replace ( 'カ','か' )
.replace ( 'キ','き' )
.replace ( 'ク','く' )
.replace ( 'ケ','け' )
.replace ( 'コ','こ' )
.replace ( 'サ','さ' )
.replace ( 'シ','し' )
.replace ( 'ス','す' )
.replace ( 'セ','せ' )
.replace ( 'ソ','そ' )
.replace ( 'タ','た' )
.replace ( 'チ','ち' )
.replace ( 'ツ','つ' )
.replace ( 'テ','て' )
.replace ( 'ト','と' )
.replace ( 'ナ','な' )
.replace ( 'ニ','に' )
.replace ( 'ヌ','ぬ' )
.replace ( 'ネ','ね' )
.replace ( 'ノ','の' )
.replace ( 'ハ','は' )
.replace ( 'ヒ','ひ' )
.replace ( 'フ','ふ' )
.replace ( 'ヘ','へ' )
.replace ( 'ホ','ほ' )
.replace ( 'マ','ま' )
.replace ( 'ミ','み' )
.replace ( 'ム','む' )
.replace ( 'メ','め' )
.replace ( 'モ','も' )
.replace ( 'ヤ','や' )
.replace ( 'ユ','ゆ' )
.replace ( 'ヨ','よ' )
.replace ( 'ラ','ら' )
.replace ( 'リ','り' )
.replace ( 'ル','る' )
.replace ( 'レ','れ' )
.replace ( 'ロ','ろ' )
.replace ( 'ワ','わ' )
.replace ( 'ヲ','を' )
.replace ( 'ン','ん' )
.replace ( 'ガ','が' )
.replace ( 'ギ','ぎ' )
.replace ( 'グ','ぐ' )
.replace ( 'ゲ','げ' )
.replace ( 'ゴ','ご' )
.replace ( 'ザ','ざ' )
.replace ( 'ジ','じ' )
.replace ( 'ズ','ず' )
.replace ( 'ゼ','ぜ' )
.replace ( 'ゾ','ぞ' )
.replace ( 'ダ','だ' )
.replace ( 'ヂ','ぢ' )
.replace ( 'ヅ','づ' )
.replace ( 'デ','で' )
.replace ( 'ド','ど' )
.replace ( 'バ','ば' )
.replace ( 'ビ','び' )
.replace ( 'ブ','ぶ' )
.replace ( 'ベ','べ' )
.replace ( 'ボ','ぼ' )
.replace ( 'パ','ぱ' )
.replace ( 'ピ','ぴ' )
.replace ( 'プ','ぷ' )
.replace ( 'ペ','ぺ' )
.replace ( 'ポ','ぽ' )
.replace ( 'ァ','ぁ' )
.replace ( 'ィ','ぃ' )
.replace ( 'ゥ','ぅ' )
.replace ( 'ェ','ぇ' )
.replace ( 'ォ','ぉ' )
.replace ( 'ャ','ゃ' )
.replace ( 'ュ','ゅ' )
.replace ( 'ョ','ょ' )
.replace ( 'ッ','っ' )
.replace ( 'ヮ','ゎ' )
)
return input_str_k
#A dumb replace function.
import MeCab
mecab = MeCab.Tagger ("-Oyomi")
yomi = ""
x = "{{Photrans|" #prefix
z = "}}" #suffix
c = "|" #separater
txt = open("test.txt","rb") #filename
for line in txt :
line = line.decode("utf-8")
string = line
for sub in cjk_substrings(string):
string = string.replace(sub, x + sub + z)
if x+x+x+x or z+z+z+z in string: #Idk why I have to write like this
string = string.replace(x+x+x+x, x) #but it just works.
string = string.replace(z+z+z+z, z) #Sometimes it will have excessive
if x+x+x or z+z+z in string: #generated things, remove them manually.
string = string.replace(x+x+x, x)
string = string.replace(z+z+z, x)
if x+x or z+z in string:
string = string.replace(x+x, x)
string = string.replace(z+z, z)
yomi = mecab.parse(sub)
yomi = yomi.join(yomi.split())
yomi = k2h(yomi)
string = string.replace(x+sub+z, x+sub+c+yomi+z)
print(string,end="")
txt.close()
#Make sure to check the generated lyric with furigana.
#At least 20% of the furigana is wrong and need manual correction!