zbparse/flask_app/main/docx截取docx.py
2024-09-03 09:36:18 +08:00

51 lines
1.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from docx import Document
import re
import os
def copy_docx(source_path):
    """Copy one section of a .docx file into a new "<name>_invalid" document.

    The source paragraphs are scanned for the LAST paragraph whose text
    matches the start marker (a "第X章 招标公告" chapter heading, or "第一卷").
    Starting from that paragraph, paragraphs are copied run by run into a
    fresh document, up to and including the first paragraph whose text
    matches the end marker (a "第X章 合同" chapter heading, or "清标报告"
    preceded by an ASCII or full-width colon).

    Only the items exposed by ``doc.paragraphs`` are copied. For each run the
    text, bold, italic, underline, explicit RGB colour and font size are
    carried over; the paragraph style is passed through to the new paragraph.

    The result is written next to the source file, with "_invalid" inserted
    before the extension. If no start marker is found, nothing is copied and
    the saved document is empty.

    Args:
        source_path: Path of the .docx file to read.

    Returns:
        None. The output is the file written to disk.
    """
    doc = Document(source_path)  # source document
    new_doc = Document()  # destination document

    # Build "<source folder>/<source name>_invalid<ext>".
    output_folder = os.path.dirname(source_path)
    file_name_without_ext, file_ext = os.path.splitext(os.path.basename(source_path))
    destination_path = os.path.join(
        output_folder, file_name_without_ext + "_invalid" + file_ext
    )

    begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷')
    # The colon before "清标报告" may be typed as ASCII ':' or full-width ':';
    # accept both instead of listing the same alternative twice.
    end_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*合同|[::]清标报告')

    # Read the paragraph list once and reuse it for both passes.
    paragraphs = doc.paragraphs

    # Pass 1: remember the index of the last paragraph matching the start
    # marker (earlier matches are deliberately overridden by later ones).
    last_begin_index = -1
    for index, paragraph in enumerate(paragraphs):
        if begin_pattern.search(paragraph.text):
            last_begin_index = index

    # Pass 2: copy from the start marker through the end marker (inclusive).
    if last_begin_index != -1:
        for paragraph in paragraphs[last_begin_index:]:
            new_para = new_doc.add_paragraph(style=paragraph.style)
            for run in paragraph.runs:
                new_run = new_para.add_run(run.text)
                new_run.bold = run.bold
                new_run.italic = run.italic
                new_run.underline = run.underline
                # Only copy a colour when the source run has an explicit RGB
                # value; otherwise leave the new run's colour untouched.
                color = run.font.color
                rgb = color.rgb if color is not None else None
                if rgb is not None:
                    new_run.font.color.rgb = rgb
                new_run.font.size = run.font.size
            # The paragraph containing the end marker is copied before stopping.
            if end_pattern.search(paragraph.text):
                break

    new_doc.save(destination_path)  # write the extracted section to disk
# Script entry point: run copy_docx on a hard-coded local test file.
if __name__ == '__main__':
    copy_docx("C:\\Users\\Administrator\\Desktop\\货物标\\output\\zbtest13.docx")