# zbparse/flask_app/general/docx截取docx.py

import os
import copy
import regex
from docx import Document
from docx.text.paragraph import Paragraph
from docx.table import Table

def iter_block_items(parent):
    """
    Yield the block-level items of ``parent`` in document order.

    Each direct ``w:p`` child is yielded as a ``Paragraph`` and each direct
    ``w:tbl`` child as a ``Table``; every other child (section properties,
    XML comments, ...) is skipped.

    Args:
        parent: A python-docx ``Document`` (its ``element.body`` is walked)
            or a block container such as a table ``_Cell`` (its
            ``_element`` is walked directly).

    Yields:
        ``Paragraph`` or ``Table`` proxies whose parent is ``parent``.
    """
    # A Document exposes its root through ``element``; containers such as
    # _Cell only carry the private ``_element``.
    element = getattr(parent, 'element', None)
    if element is None:
        element = parent._element

    # Only the document root has a ``body`` child to descend into; for any
    # other container the element itself holds the block items.
    # Compare with ``is None``: element truthiness depends on child count.
    body = getattr(element, 'body', None)
    container = element if body is None else body

    for child in container.iterchildren():
        tag = child.tag
        # XML comments / processing instructions have a non-string tag and
        # are never block items.
        if not isinstance(tag, str):
            continue
        tag = tag.lower()
        if tag.endswith('}p'):
            yield Paragraph(child, parent)
        elif tag.endswith('}tbl'):
            yield Table(child, parent)
#暂用不上，因为需要pdf文件->每页后面打标记->转docx。后面这一步钱无法省。
def copy_docx(source_path):
    """
    Copy a slice of the block-level content of a .docx into a new file.

    The slice runs
      - from the first paragraph (in document order) matching the begin
        pattern,
      - to the last paragraph (searching backwards) matching the end
        pattern, inclusive.
    If no paragraph matches the begin pattern the slice starts at the first
    block; if none matches the end pattern it ends at the last block. If the
    begin match lies after the end match, the whole document is used.
    Every block in the range (paragraphs and tables) is deep-copied at the
    XML level into a new document.

    NOTE: the new document is created from python-docx's default template,
    so only the block XML is carried over; style definitions, numbering and
    image relationships of the source are presumably not preserved.

    Args:
        source_path: Path of the source .docx file.

    Returns:
        The path of the written file: same folder as ``source_path``, with
        ``_invalid`` inserted before the file extension.
    """
    doc = Document(source_path)
    output_folder = os.path.dirname(source_path)
    file_name_without_ext, file_ext = os.path.splitext(os.path.basename(source_path))
    destination_path = os.path.join(
        output_folder, file_name_without_ext + "_invalid" + file_ext
    )
    new_doc = Document()

    # The variable-width lookbehinds below need the third-party ``regex``
    # module; the stdlib ``re`` rejects them.
    # Begin: text ending in an announcement / invitation keyword (optionally
    # followed by a closing parenthesis) that is not directly preceded by a
    # referring word (对应/根据/按照/见/与/同) or a quotation mark.
    begin_pattern = regex.compile(
        r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'
        r'(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\)）]?\s*$',
        regex.MULTILINE
    )
    # End: a "第X章/部分" heading about the composition/format of the
    # bid/response file, a "第X章/部分 ... 合同" heading, "：清标报告", or a
    # line ending in "投标文件格式" and its variants.
    end_pattern = regex.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标|应答).*?(?:(?:文件|书)(?:的)?(?:组成|构成|编制)|格式).*|'
        r'第[一二三四五六七八九十]+(?:章|部分).*?合同|'
        r'[：:]清标报告|'
        r'\s*(投标文件|响应文件|响应性文件|应答文件)(?:的)?格式(?:及相关附件)?\s*$',
        regex.MULTILINE
    )

    block_items = list(iter_block_items(doc))
    last_index = len(block_items) - 1

    begin_index = _find_matching_paragraph(block_items, begin_pattern)
    end_index = _find_matching_paragraph(block_items, end_pattern, reverse=True)

    # Missing boundaries default to the document edges.
    if begin_index is None:
        begin_index = 0
    if end_index is None:
        end_index = last_index
    # An inverted range means the matches are unusable: take everything.
    # (For an empty document this leaves the empty range 0..-1.)
    if begin_index > end_index:
        begin_index, end_index = 0, last_index

    body_elm = new_doc._body._element
    # The body must end with its w:sectPr; a plain append() would place the
    # copied blocks after it, so locate it and insert before it instead.
    sect_pr = next(
        (
            child for child in body_elm.iterchildren()
            if isinstance(child.tag, str) and child.tag.endswith('}sectPr')
        ),
        None,
    )
    for block in block_items[begin_index:end_index + 1]:
        # Deep-copy so the source tree is left untouched.
        clone = copy.deepcopy(block._element)
        if sect_pr is not None:
            sect_pr.addprevious(clone)
        else:
            body_elm.append(clone)

    new_doc.save(destination_path)
    print("docx截取成功，已保存到：", destination_path)
    return destination_path


def _find_matching_paragraph(blocks, pattern, reverse=False):
    """Return the index of the first Paragraph in ``blocks`` whose text matches
    ``pattern``, scanning backwards when ``reverse`` is true; None if no match."""
    if reverse:
        indices = range(len(blocks) - 1, -1, -1)
    else:
        indices = range(len(blocks))
    for idx in indices:
        block = blocks[idx]
        # Tables are never boundary candidates; only paragraph text is tested.
        if isinstance(block, Paragraph) and pattern.search(block.text):
            return idx
    return None

# Manual test run against a sample file on the developer's machine.
if __name__ == '__main__':
    sample_docx = r"C:\Users\Administrator\Desktop\货物标\zbfilesdocx\6_2定版视频会议磋商文件.docx"
    # copy_docx prints its own status line; also echo the returned path.
    print(copy_docx(sample_docx))