109 lines
4.6 KiB
Python
109 lines
4.6 KiB
Python
import os
|
||
import copy
|
||
import regex
|
||
from docx import Document
|
||
from docx.text.paragraph import Paragraph
|
||
from docx.table import Table
|
||
|
||
def iter_block_items(parent):
    """
    Yield the block-level items of ``parent`` in document order.

    Every direct ``w:p`` child is wrapped as a ``Paragraph`` and every direct
    ``w:tbl`` child as a ``Table``; all other children (for example section
    properties) are skipped.

    Args:
        parent: A python-docx ``Document`` or another block container such
            as a table ``_Cell``. A document is recognised by its root
            element exposing ``body``; for any other parent the parent's own
            element is used as the container.
            NOTE(review): cells are assumed to keep that element in
            ``_element`` -- confirm against the installed python-docx.

    Yields:
        ``Paragraph`` and ``Table`` objects, each created with ``parent`` as
        its parent.
    """
    # Document-like parents expose ``element``; cell-like parents may only
    # carry the private ``_element``.
    root = getattr(parent, "element", None)
    if root is None:
        root = parent._element

    # A document root holds its content under ``body``; a cell element is
    # itself the container of paragraphs and tables.
    container = getattr(root, "body", None)
    if container is None:
        container = root

    for child in container.iterchildren():
        tag = child.tag
        if not isinstance(tag, str):
            # lxml gives comments / processing instructions a non-string
            # tag; they can never be block items.
            continue
        tag = tag.lower()
        if tag.endswith('}p'):
            yield Paragraph(child, parent)
        elif tag.endswith('}tbl'):
            yield Table(child, parent)
|
||
|
||
# Not used for now: the intended pipeline is PDF -> add a marker after every
# page -> convert to docx, and the cost of that last conversion step cannot
# be avoided.
def copy_docx(source_path):
    """
    Copy a slice of a .docx file into a new document saved next to it.

    The slice is chosen over the top-level blocks (paragraphs and tables):

    - it starts at the first paragraph, in document order, whose text matches
      ``begin_pattern`` (announcement / invitation headings);
    - it ends at the last paragraph whose text matches ``end_pattern``
      (found by scanning backwards from the end of the document);
    - if no start is found the slice starts at the first block, and if no
      end is found it runs to the last block;
    - if the start found lies after the end found, the whole document is
      used instead.

    Both boundary blocks are included, together with every paragraph and
    table between them. Each block's XML is deep-copied, so formatting stored
    on the blocks themselves is kept.
    NOTE(review): only the block XML is copied into a blank ``Document()``;
    anything the XML merely references (images, numbering, custom styles) is
    not transferred by this function.

    Args:
        source_path: Path of the .docx file to read.

    Returns:
        The path of the written file: same directory as ``source_path``,
        with ``_invalid`` inserted before the file extension.
    """
    doc = Document(source_path)

    # Output goes beside the source as "<name>_invalid<ext>".
    output_folder, original_file_name = os.path.split(source_path)
    file_name_without_ext, file_ext = os.path.splitext(original_file_name)
    destination_path = os.path.join(
        output_folder, file_name_without_ext + "_invalid" + file_ext
    )
    new_doc = Document()

    # Patterns use the third-party ``regex`` module because the start pattern
    # relies on variable-width look-behinds.
    begin_pattern = regex.compile(
        r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'
        r'(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\))]?\s*$',
        regex.MULTILINE
    )
    end_pattern = regex.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标|应答).*?(?:(?:文件|书)(?:的)?(?:组成|构成|编制)|格式).*|'
        r'第[一二三四五六七八九十]+(?:章|部分).*?合同|'
        r'[::]清标报告|'
        r'\s*(投标文件|响应文件|响应性文件|应答文件)(?:的)?格式(?:及相关附件)?\s*$',
        regex.MULTILINE
    )

    # Materialise the top-level blocks so they can be indexed and sliced.
    block_items = list(iter_block_items(doc))
    last_index = len(block_items) - 1

    # First paragraph (front to back) matching the start pattern.
    begin_index = next(
        (
            idx
            for idx, block in enumerate(block_items)
            if isinstance(block, Paragraph) and begin_pattern.search(block.text)
        ),
        None,
    )

    # First paragraph (back to front) matching the end pattern.
    end_index = next(
        (
            idx
            for idx in range(last_index, -1, -1)
            if isinstance(block_items[idx], Paragraph)
            and end_pattern.search(block_items[idx].text)
        ),
        None,
    )

    # Missing boundaries default to the document edges.
    if begin_index is None:
        begin_index = 0
    if end_index is None:
        end_index = last_index

    # An inverted range means the matches are unusable: keep everything.
    if begin_index > end_index:
        begin_index = 0
        end_index = last_index

    # A fresh Document() body normally already holds a trailing w:sectPr,
    # which WordprocessingML expects to remain the last child of the body.
    # Insert the copies in front of it instead of appending after it.
    new_body = new_doc._body._element
    sect_pr = next(
        (
            child
            for child in new_body.iterchildren()
            if isinstance(child.tag, str) and child.tag.endswith('}sectPr')
        ),
        None,
    )

    for block in block_items[begin_index : end_index + 1]:
        # Deep-copy so the source document's tree is left untouched.
        clone = copy.deepcopy(block._element)
        if sect_pr is not None:
            # Each clone lands directly before sectPr, i.e. after the clones
            # inserted earlier, so document order is preserved.
            sect_pr.addprevious(clone)
        else:
            new_body.append(clone)

    new_doc.save(destination_path)
    print("docx截取成功,已保存到:", destination_path)
    return destination_path
|
||
|
||
# Manual test run: slice one sample file and print the path of the result.
if __name__ == '__main__':
    sample_docx = r"C:\Users\Administrator\Desktop\货物标\zbfilesdocx\6_2定版视频会议磋商文件.docx"
    print(copy_docx(sample_docx))
|