-
Notifications
You must be signed in to change notification settings - Fork 6
/
pipeline_poc.py
88 lines (61 loc) · 2.44 KB
/
pipeline_poc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
"""
本代码用于演示我们是如何从数据源进行一步步操作从而得到数据集
出于简洁明了的目的,不会针对各个环节的效率进行优化
实际实践所用代码可以参照 https://wiki.mnbvc.org/doku.php/pxyl 中留档的代码
"""
import argparse
import json
import subprocess
import sys
import logging
from pathlib import Path
# Log everything from DEBUG upward as "<timestamp> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)

# Directory that contains this file.
ROOT_DIR = Path(__file__).parent
def run_script(script, args):
    """
    Run ``script`` with the current Python interpreter and wait for it to end.

    Args:
        script: Path of the Python script to execute.
        args: Iterable of command-line arguments forwarded to the script.

    Returns:
        The ``subprocess.CompletedProcess`` describing the finished run.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero
            status, so that a failed stage is never mistaken for a
            successful one by the caller.
    """
    # Unpacking accepts any iterable of arguments (list, tuple, ...), not only
    # a list as the original list concatenation did.
    command = [sys.executable, str(script), *args]
    # check=True turns a non-zero exit status into an exception instead of
    # silently discarding it.
    return subprocess.run(command, check=True)
def load_config(config_file: "str | Path"):
    """
    Read the JSON configuration file and return its parsed content.

    Args:
        config_file: Path of the JSON file to read (string or ``Path``).

    Returns:
        The deserialized JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    with open(config_file, 'r', encoding='utf-8') as file:
        return json.load(file)
def download(script, args) -> None:
    """
    Download the raw data from the data source.

    Research showed that https://documents.un.org/prod/ods.nsf/home.xsp
    hosts files in doc format.

    The work is delegated to ``run_script(script, args)``.
    """
    run_script(script, args)
def format_conversion(script, args) -> None:
    """
    Convert the data fetched by download into the format needed for alignment.

    May involve the following functions: doc2docx, wpf2docx, docx2txt.

    The work is delegated to ``run_script(script, args)``.
    """
    run_script(script, args)
def align(script, args) -> None:
    """
    Text alignment; this step needs to run translation first and then align.

    NOTE: this is currently a no-op. The call that would run the alignment
    script is commented out below, so ``script`` and ``args`` are unused.
    """
    # run_script(script, args)
def translate() -> None:
    """
    Text translation.

    Placeholder: the function has no implementation yet.
    """
def main(config_file: Path):
    """
    Run the download -> format conversion -> alignment pipeline for one site.

    The site identifier is taken from the command line. Arguments that
    argparse does not recognise are forwarded unchanged to every stage.

    Args:
        config_file: Path of the JSON configuration. It is iterated as a
            sequence of entries, each with an ``identifier`` and a ``script``
            mapping holding the keys ``download``, ``format_conversion`` and
            ``alignment``.

    Exits with status 1 when no entry matches the identifier or when the
    matching entry lacks one of the stage keys.
    """
    # Parse the command line first so that --help and usage errors do not
    # depend on the configuration file being readable.
    parser = argparse.ArgumentParser(description='总线脚本')
    parser.add_argument('identifier', help='网站标识符,例如 "us_embassy"')
    args, unknown_args = parser.parse_known_args()

    config = load_config(config_file)
    # .get() keeps one malformed entry from breaking the lookup of the others.
    script = next(
        (item.get('script') for item in config if item.get('identifier') == args.identifier),
        None,
    )
    if not script:
        print(f"未找到标识符为 '{args.identifier}' 的配置。", file=sys.stderr)
        sys.exit(1)

    # Validate every stage key up front, so a configuration mistake is
    # reported before any stage has run rather than halfway through.
    missing = [key for key in ("download", "format_conversion", "alignment") if key not in script]
    if missing:
        logging.error(
            "Configuration for '%s' is missing script entries: %s",
            args.identifier,
            ", ".join(missing),
        )
        sys.exit(1)

    logging.info(f"Downloading '{args.identifier}' data is processing...")
    download(script["download"], unknown_args)
    logging.info(f"Downloading '{args.identifier}' data is successed!")

    logging.info(f"'{args.identifier}' format converting...")
    format_conversion(script["format_conversion"], unknown_args)
    logging.info(f"'{args.identifier}' format converted!")

    logging.info(f"'{args.identifier}' is aligning...")
    align(script["alignment"], unknown_args)
    logging.info(f"'{args.identifier}' is align completed!")

    # Diagnostic output goes through logging instead of a bare print.
    logging.debug("Script configuration used: %s", script)
if __name__ == '__main__':
    # The configuration file sits in the same directory as this script.
    main(ROOT_DIR / 'config.json')