zbparse/flask_app/main/docx截取docx.py

54 lines
2.0 KiB
Python
Raw Normal View History

2024-09-03 09:36:18 +08:00
from docx import Document
import re
import os
def copy_docx(source_path):
    """Copy a section of a .docx file into a new document saved beside it.

    The section starts at the LAST paragraph matching the start marker
    (a "第X章 招标公告" chapter heading, or "第一卷") and runs up to and
    including the first following paragraph matching the end marker
    (a "第X章 合同" chapter heading, or "清标报告" preceded by a colon).

    Only paragraphs are copied: each paragraph's style, and each run's text,
    bold, italic, underline, RGB color and font size.

    Args:
        source_path: Path of the source .docx file.

    Returns:
        Path of the saved document: same folder and extension as the source,
        with "_invalid" appended to the file name. The document is saved even
        when no start marker is found; in that case nothing is copied into it.
    """
    doc = Document(source_path)  # open the source document

    # Build the destination path: <folder>/<name>_invalid<ext>
    output_folder = os.path.dirname(source_path)
    file_name_without_ext, file_ext = os.path.splitext(os.path.basename(source_path))
    destination_path = os.path.join(output_folder, file_name_without_ext + "_invalid" + file_ext)

    # Section markers. The end marker accepts both the ASCII and the
    # full-width colon; the original listed one alternative twice.
    begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷')
    end_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*合同|[::]清标报告')

    # Read the paragraph list once and reuse it for both passes.
    paragraphs = doc.paragraphs

    # Find the last paragraph that matches the start marker.
    last_begin_index = -1
    for index, paragraph in enumerate(paragraphs):
        if begin_pattern.search(paragraph.text):
            last_begin_index = index

    new_doc = Document()  # create the destination document

    # Copy from the last start marker through the first end marker (inclusive).
    if last_begin_index != -1:
        for paragraph in paragraphs[last_begin_index:]:
            new_para = new_doc.add_paragraph(style=paragraph.style)
            for run in paragraph.runs:
                new_run = new_para.add_run(run.text)
                new_run.bold = run.bold
                new_run.italic = run.italic
                new_run.underline = run.underline
                # Copy the color only when the source run has an RGB value,
                # so runs without one keep their inherited color.
                color = run.font.color
                rgb = color.rgb if color else None
                if rgb is not None:
                    new_run.font.color.rgb = rgb
                new_run.font.size = run.font.size
            # The end-marker paragraph itself has been copied; stop after it.
            if end_pattern.search(paragraph.text):
                break

    new_doc.save(destination_path)  # save the new document
    print("docx截取docx成功")
    return destination_path
2024-09-03 09:36:18 +08:00
# Script entry point: run the extraction on a sample file and print the
# path of the document that was written.
if __name__ == '__main__':
    source_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\ztbfile.docx"
    print(copy_docx(source_path))