-
Notifications
You must be signed in to change notification settings - Fork 0
/
generate.py
110 lines (90 loc) · 3.88 KB
/
generate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import json
import os
from pathlib import Path
import re
# Lookup table from Chinese numeral characters to their integer values.
cn2num = {
    '零': 0, '一': 1, '二': 2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9,
    '十': 10, '百': 100, '千': 1000, '万': 10000
}


def chinese_to_number(chinese: str) -> int:
    """Convert a Chinese numeral such as "一百零八" into an integer.

    The string is read left to right. A digit (零-九) is held until the next
    unit arrives; 十/百/千 multiply the held digit (or 1 when no digit
    precedes them, so "十五" is 15) and add it to the current section; 万
    multiplies the whole section accumulated so far, so "二十万" is 200000.

    Args:
        chinese: Numeral text made of the characters in ``cn2num``.
            Characters that are not in the table are ignored.

    Returns:
        The integer value, or 0 for an empty string or one that contains no
        numeral characters. Values are handled correctly up to a single 万
        section (below 10**8).

    >>> chinese_to_number('一百二十')
    120
    >>> chinese_to_number('十万')
    100000
    """
    total = 0    # value of the sections already closed by 万
    section = 0  # running value of the current (below-万) section
    digit = 0    # digit read but not yet multiplied by a unit
    for char in chinese:
        value = cn2num.get(char)
        if value is None:
            continue
        if value < 10:
            digit = value
        elif value < 10000:
            # A bare unit ("十", "百") implicitly means one of that unit.
            section += (digit or 1) * value
            digit = 0
        else:
            # 万 scales everything accumulated since the previous 万.
            total += (section + digit or 1) * value
            section = digit = 0
    return total + section + digit
def extract_number_from_filename(filename):
    """Return the chapter number encoded in a "第…回" file name.

    The shortest text found between the first "第" and the following "回" is
    handed to ``chinese_to_number``. A name without such a marker yields 0,
    which makes it sort ahead of every numbered chapter.
    """
    found = re.search(r'第(.+?)回', filename)
    if found is None:
        return 0
    return chinese_to_number(found.group(1))
def _load_chapters(directory):
    """Parse every chapter file found under *directory*, in chapter order.

    The tree is traversed with ``os.walk``; inside each folder the files are
    ordered by the number parsed from their "第…回" name, and each one is
    loaded as JSON.

    Returns:
        A list holding one parsed JSON document per file, in output order.
    """
    chapters = []
    for root, _dirs, files in os.walk(directory):
        # File names carry Chinese numerals (第一回, 第一百回, ...), so they
        # must be ordered by numeric value rather than lexicographically.
        for name in sorted(files, key=extract_number_from_filename):
            print(name)
            with open(Path(root) / name, 'r', encoding='utf-8') as src:
                chapters.append(json.load(src))
    return chapters


def _write_html(chapters, output_file):
    """Write the two-column HTML edition: original text left, translation right.

    For every chapter, element 2 of the first record becomes the heading; each
    later record becomes one table row with element 1 in the left cell and
    element 2 in the right cell.
    """
    # Declare the encoding so browsers do not have to guess how to decode
    # the UTF-8 Chinese text.
    parts = ['<html><head><meta charset="utf-8"></head><body><h1>水浒传</h1>']
    for paragraphs in chapters:
        if not paragraphs:
            # No title record means no <table> was opened; skipping avoids
            # emitting an unmatched </table>.
            continue
        title, *rows = paragraphs
        parts.append("<h2>" + title[2] + "</h2><table>")
        # NOTE(review): text is inserted without HTML escaping, as before;
        # confirm the JSON never contains markup-significant characters.
        for row in rows:
            parts.append("<tr><td><p>" + row[1] + "<br><br></p></td>")
            # Both cells are closed before the row ends.
            parts.append("<td><p>" + row[2] + "<br><br></p></td></tr>")
        parts.append("</table>")
    parts.append("</body></html>")

    output_path = Path(output_file)
    # Create the target folder on first run instead of failing on open().
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text("".join(parts), encoding='utf-8')


def _write_markdown(chapters, output_file):
    """Write the Markdown edition, which contains only element 2 of each record.

    The first record of a chapter is emitted as a level-2 heading, every
    other record as a plain paragraph.
    """
    parts = ["# 水浒传" + "\n\n"]
    for paragraphs in chapters:
        for index, paragraph in enumerate(paragraphs):
            heading = "## " if index == 0 else ""
            parts.append(heading + paragraph[2] + "\n\n")

    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text("".join(parts), encoding='utf-8')


def main():
    """Build the HTML and Markdown editions from the chapter JSON files."""
    directory = "小学生/水浒传"
    # Read and parse the inputs once; both outputs are rendered from the
    # same in-memory chapters.
    chapters = _load_chapters(directory)
    _write_html(chapters, "成品/小学生版水浒传.html")
    _write_markdown(chapters, "小学生版水浒传.md")


if __name__ == "__main__":
    main()