# zbparse/flask_app/main/docx截取docx.py

from docx import Document
import re
import os


# Heading that opens the section to extract; the LAST matching paragraph is used.
_BEGIN_PATTERN = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷')
# Heading that closes the section; the matching paragraph itself is still copied.
_END_PATTERN = re.compile(r'第[一二三四五六七八九十]+章\s*合同|：清标报告|:清标报告')


def _copy_run(run, target_paragraph):
    """Append a copy of *run* (text and basic character formatting) to *target_paragraph*.

    Copies the text, bold, italic, underline, font size and RGB color.
    Returns the newly created run.
    """
    new_run = target_paragraph.add_run(run.text)
    new_run.bold = run.bold
    new_run.italic = run.italic
    new_run.underline = run.underline
    new_run.font.size = run.font.size

    # The color object is truthy even when no color is set, so a plain
    # ``if run.font.color:`` never filters anything; check the RGB value.
    color = run.font.color
    rgb = color.rgb if color is not None else None
    if rgb is not None:
        new_run.font.color.rgb = rgb
    return new_run


def copy_docx(source_path):
    """Extract one section of a .docx file into a sibling ``*_invalid`` file.

    The paragraphs of ``source_path`` are scanned for the begin heading
    (``_BEGIN_PATTERN``). Starting at the last paragraph that matches it,
    paragraphs are copied into a new document up to and including the first
    paragraph that matches the end heading (``_END_PATTERN``), or to the end
    of the document when no end heading follows.

    The result is saved next to the source file under the same name with
    ``_invalid`` inserted before the extension. When no begin heading is
    found, an empty document is still saved.

    Only the paragraphs exposed by ``doc.paragraphs`` are considered. Each
    copied paragraph keeps its paragraph style, and each run keeps its text,
    bold, italic, underline, font size and RGB color.

    Args:
        source_path: Path of the .docx file to read.
    """
    doc = Document(source_path)  # open the source document

    # Build "<folder>/<name>_invalid<ext>" from the source path.
    output_folder = os.path.dirname(source_path)
    file_stem, file_ext = os.path.splitext(os.path.basename(source_path))
    destination_path = os.path.join(output_folder, file_stem + "_invalid" + file_ext)

    new_doc = Document()  # create the output document

    # Read the paragraph list once and reuse it for both passes.
    paragraphs = doc.paragraphs

    # Pass 1: locate the last paragraph that matches the begin heading.
    last_begin_index = -1
    for index, paragraph in enumerate(paragraphs):
        if _BEGIN_PATTERN.search(paragraph.text):
            last_begin_index = index

    # Pass 2: copy from that paragraph through the first end heading.
    if last_begin_index != -1:
        for paragraph in paragraphs[last_begin_index:]:
            new_para = new_doc.add_paragraph(style=paragraph.style)
            for run in paragraph.runs:
                _copy_run(run, new_para)

            # The end-heading paragraph has already been copied; stop after it.
            if _END_PATTERN.search(paragraph.text):
                break

    new_doc.save(destination_path)  # save the output document


# Script entry point: process the .docx given as the first command-line
# argument, or fall back to the hard-coded sample file when none is given.
if __name__ == '__main__':
    import sys

    default_source_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output\\zbtest13.docx"
    source_path = sys.argv[1] if len(sys.argv) > 1 else default_source_path
    copy_docx(source_path)