2024-09-03 09:36:18 +08:00
|
|
|
|
import os
|
2025-02-06 14:39:58 +08:00
|
|
|
|
import copy
|
|
|
|
|
import regex
|
|
|
|
|
from docx import Document
|
|
|
|
|
from docx.text.paragraph import Paragraph
|
|
|
|
|
from docx.table import Table
|
2024-09-03 09:36:18 +08:00
|
|
|
|
|
2025-02-06 14:39:58 +08:00
|
|
|
|
def iter_block_items(parent):
    """
    Yield the block-level children of *parent* in document order.

    Every ``<w:p>`` child is wrapped in a ``Paragraph`` and every ``<w:tbl>``
    child in a ``Table``; children with any other tag are skipped.

    Args:
        parent: A python-docx ``Document``, or another block container
            (for example a table ``_Cell``) whose own XML element directly
            holds the block items.

    Yields:
        ``Paragraph`` or ``Table`` proxy objects created with *parent* as
        their parent.
    """
    # A Document exposes its root element through ``element`` and keeps its
    # blocks under ``body``.  Containers without ``element`` are assumed to
    # hold their blocks directly under the private ``_element`` attribute
    # (NOTE(review): confirm against the installed python-docx version).
    root = getattr(parent, 'element', None)
    if root is None:
        root = parent._element
    parent_elm = getattr(root, 'body', root)

    for child in parent_elm.iterchildren():
        tag = child.tag
        if not isinstance(tag, str):
            # Comments / processing instructions carry a non-string tag and
            # can never be a paragraph or a table.
            continue
        tag = tag.lower()
        if tag.endswith('}p'):
            yield Paragraph(child, parent)
        elif tag.endswith('}tbl'):
            yield Table(child, parent)
|
2024-09-03 09:36:18 +08:00
|
|
|
|
|
2025-02-06 14:39:58 +08:00
|
|
|
|
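# Not used for now (presumably refers to copy_docx below): the workflow would be
# PDF file -> add a marker after every page -> convert to DOCX, and the cost of
# that final conversion step cannot be avoided.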
|
2024-09-03 09:36:18 +08:00
|
|
|
|
def copy_docx(source_path):
    """
    Extract a contiguous range of blocks from a .docx file into a new file.

    Range selection (only paragraphs are tested against the patterns;
    tables never match):
      - start: the first paragraph, in document order, whose text matches
        ``begin_pattern``; defaults to the first block if none matches.
      - end: the last paragraph whose text matches ``end_pattern`` (found by
        scanning backwards); defaults to the last block if none matches.
      - if the resulting start lies after the end, the whole document is
        used instead.

    The start block, the end block and every paragraph/table between them
    are deep-copied at the XML level into a fresh document.  Only the block
    XML itself is copied; anything the blocks reference by ID from other
    package parts is not transferred by this function, so formatting is
    preserved on a best-effort basis.

    Args:
        source_path: Path of the source .docx file.

    Returns:
        The path of the saved file: same folder and extension as
        *source_path*, with ``_invalid`` appended to the base name.
    """
    # Open the source document and prepare the output path / new document.
    doc = Document(source_path)
    output_folder, original_file_name = os.path.split(source_path)
    file_name_without_ext, file_ext = os.path.splitext(original_file_name)
    modified_file_name = file_name_without_ext + "_invalid" + file_ext
    destination_path = os.path.join(output_folder, modified_file_name)
    new_doc = Document()

    # Patterns rely on the third-party ``regex`` module (variable-length
    # lookbehind is used in begin_pattern).
    begin_pattern = regex.compile(
        r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'
        r'(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\))]?\s*$',
        regex.MULTILINE
    )
    end_pattern = regex.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标|应答).*?(?:(?:文件|书)(?:的)?(?:组成|构成|编制)|格式).*|'
        r'第[一二三四五六七八九十]+(?:章|部分).*?合同|'
        r'[::]清标报告|'
        r'\s*(投标文件|响应文件|响应性文件|应答文件)(?:的)?格式(?:及相关附件)?\s*$',
        regex.MULTILINE
    )

    # All block-level items (paragraphs and tables) in document order.
    block_items = list(iter_block_items(doc))
    last_index = len(block_items) - 1

    # First paragraph matching begin_pattern, scanning forwards.
    begin_index = next(
        (idx for idx, block in enumerate(block_items)
         if isinstance(block, Paragraph) and begin_pattern.search(block.text)),
        None,
    )

    # Last paragraph matching end_pattern, scanning backwards.
    end_index = next(
        (idx for idx in range(last_index, -1, -1)
         if isinstance(block_items[idx], Paragraph)
         and end_pattern.search(block_items[idx].text)),
        None,
    )

    # Fall back to the document boundaries for whichever side did not match.
    if begin_index is None:
        begin_index = 0
    if end_index is None:
        end_index = last_index

    # An inverted range (start after end) also falls back to the whole
    # document.
    if begin_index > end_index:
        begin_index, end_index = 0, last_index

    # Copy the selected blocks into the new document.  If the new body
    # already holds a <w:sectPr>, the copies are inserted before it so that
    # the section properties stay the last child of the body.
    body = new_doc._body._element
    sect_pr = next(
        (child for child in body.iterchildren()
         if isinstance(child.tag, str) and child.tag.endswith('}sectPr')),
        None,
    )
    for block in block_items[begin_index:end_index + 1]:
        clone = copy.deepcopy(block._element)
        if sect_pr is not None:
            sect_pr.addprevious(clone)
        else:
            body.append(clone)

    new_doc.save(destination_path)
    print("docx截取成功,已保存到:", destination_path)
    return destination_path
|
2024-09-03 09:36:18 +08:00
|
|
|
|
|
2025-02-06 14:39:58 +08:00
|
|
|
|
# Manual smoke test: run the extraction on a local sample file and print the
# path returned by copy_docx.
if __name__ == '__main__':
    sample_docx = r"C:\Users\Administrator\Desktop\货物标\zbfilesdocx\6_2定版视频会议磋商文件.docx"
    saved_to = copy_docx(sample_docx)
    print(saved_to)
|