-
Notifications
You must be signed in to change notification settings - Fork 5
/
docx2markdown2.py
59 lines (45 loc) · 2.13 KB
/
docx2markdown2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import os
import concurrent.futures
import pypandoc
def convert_docx_to_markdown(docx_file, markdown_folder, image_folder):
try:
# 构建Markdown文件名
docx_filename = os.path.basename(docx_file)
markdown_filename = os.path.splitext(docx_filename)[0] + ".md"
markdown_file = os.path.join(markdown_folder, markdown_filename)
# 创建图片文件夹
os.makedirs(image_folder, exist_ok=True)
# 配置Pandoc选项,将图片保存到指定的文件夹中
pandoc_options = [
f"--extract-media={image_folder}",
"--wrap=none",
]
# 使用pypandoc将.docx文件转换为Markdown,并使用外部图片链接
output = pypandoc.convert_file(docx_file, 'md', extra_args=pandoc_options)
# 将转换后的Markdown写入.md文件
with open(markdown_file, 'w', encoding='utf-8') as file:
file.write(output)
print(f"转换完成:{docx_file} -> {markdown_file}")
except Exception as e:
print(f"转换错误:{docx_file} -> {e}")
def convert_docx_folder(docx_folder, markdown_folder, image_folder):
# 获取.docx文件列表
docx_files = [file for file in os.listdir(docx_folder) if file.endswith(".docx")]
total_files = len(docx_files)
completed_files = 0
with concurrent.futures.ThreadPoolExecutor() as executor:
# 提交任务给线程池进行并行处理
futures = []
for docx_file in docx_files:
docx_path = os.path.join(docx_folder, docx_file)
future = executor.submit(convert_docx_to_markdown, docx_path, markdown_folder, image_folder)
futures.append(future)
# 获取任务的进度
for future in concurrent.futures.as_completed(futures):
completed_files += 1
print(f"进度:{completed_files}/{total_files}")
# 调用函数进行转换
docx_folder = r'/www/dataset/MNBVC/docx_math'
markdown_folder = r'/www/dataset/MNBVC/clear_data' # 存放.md文件的文件夹路径
image_folder ="/www/dataset/MNBVC/image_folder"
convert_docx_folder(docx_folder, markdown_folder, image_folder)