2024-09-03 09:36:18 +08:00
|
|
|
|
from docx import Document
|
|
|
|
|
import re
|
|
|
|
|
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def copy_docx(source_path):
    """Copy a marked section of a .docx file into a new "<name>_invalid" file.

    The section starts at the LAST paragraph whose text matches the begin
    pattern and ends with the first paragraph, at or after that start, whose
    text matches the end pattern. Both boundary paragraphs are included. If
    no end marker is found, everything up to the final paragraph is copied.

    Only entries of ``doc.paragraphs`` are considered. For each run the text,
    bold / italic / underline flags, RGB colour and font size are carried
    over; each new paragraph is created with the source paragraph's style.

    If no paragraph matches the begin pattern, nothing is copied and the
    freshly created document is still saved and reported as a success.

    Args:
        source_path: Path of the .docx file to read.

    Returns:
        The path of the saved document: same directory and extension as
        ``source_path``, with "_invalid" appended to the base file name.
    """
    doc = Document(source_path)  # open the source document

    # Build the output path next to the source file: "<stem>_invalid<ext>".
    output_folder = os.path.dirname(source_path)
    original_file_name = os.path.basename(source_path)
    file_name_without_ext, file_ext = os.path.splitext(original_file_name)
    modified_file_name = file_name_without_ext + "_invalid" + file_ext
    destination_path = os.path.join(output_folder, modified_file_name)

    new_doc = Document()  # create the new, empty document

    # Section markers: a chapter heading followed by the begin/end keyword,
    # or one of the literal alternatives.
    begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷')
    # NOTE(review): the last two alternatives look identical as written; one
    # may have been meant to use a full-width colon - confirm against the
    # original file before relying on it.
    end_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*合同|:清标报告|:清标报告')

    # Read the paragraph list once and reuse it for both passes.
    paragraphs = doc.paragraphs

    # Find the index of the last paragraph that matches the begin pattern.
    last_begin_index = -1
    for index, paragraph in enumerate(paragraphs):
        if begin_pattern.search(paragraph.text):
            last_begin_index = index

    # Copy from that paragraph onwards, stopping after the end marker.
    if last_begin_index != -1:
        for paragraph in paragraphs[last_begin_index:]:
            new_para = new_doc.add_paragraph(style=paragraph.style)
            for run in paragraph.runs:
                new_run = new_para.add_run(run.text)
                new_run.bold = run.bold
                new_run.italic = run.italic
                new_run.underline = run.underline
                # ``run.font.color`` is always an object, so test the RGB
                # value itself to decide whether there is a colour to copy.
                source_rgb = run.font.color.rgb
                if source_rgb is not None:
                    new_run.font.color.rgb = source_rgb
                new_run.font.size = run.font.size

            # The end-marker paragraph has already been copied above, so the
            # extracted section includes it.
            if end_pattern.search(paragraph.text):
                break

    new_doc.save(destination_path)  # save the new document
    print("docx截取docx成功!")
    return destination_path
|
2024-09-03 09:36:18 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: run the extraction on a hard-coded file and print the
# path of the document that was written.
if __name__ == '__main__':
    docx_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\ztbfile.docx"
    print(copy_docx(docx_path))
|