# zbparse/flask_app/main/test.py

import re

from docx import Document
def preprocess_paragraphs(paragraphs):
    """Strip paragraph texts, drop blank ones, and re-join split numbered clauses.

    A blank paragraph that has both a previous and a next paragraph is
    treated as an accidental break inside a clause when all of the following
    hold for its neighbours:

    * the previous paragraph does not end with one of ``，`` ``,`` ``。``
      ``!`` ``?``;
    * the previous paragraph starts with a number such as ``1`` or ``1.2.3``
      and is longer than 30 characters;
    * the next paragraph does not start with a number followed by ``、``.

    In that case the next paragraph's text is appended to the most recently
    emitted entry and the next paragraph is not emitted on its own.

    Args:
        paragraphs: Iterable of objects exposing a ``text`` string attribute.

    Returns:
        A list of stripped, non-empty paragraph texts with the merges applied.
    """
    # Compiled once instead of on every blank paragraph.
    numbered_start = re.compile(r'^(\d+(\s*\.\s*\d+)*)\s*')
    numbered_item = re.compile(r'^(\d+(\s*\.\s*\d+)*)\s*、')
    sentence_endings = ('，', ',', '。', '!', '?')

    # Read and strip every paragraph text a single time up front.
    texts = [paragraph.text.strip() for paragraph in paragraphs]
    total = len(texts)

    processed = []
    index = 0
    while index < total:
        current_text = texts[index]

        # Non-blank paragraph: emit it as-is.
        if current_text:
            processed.append(current_text)
            index += 1
            continue

        # Blank paragraph: a merge needs both a previous and a next paragraph.
        if 0 < index < total - 1:
            prev_text = texts[index - 1]
            next_text = texts[index + 1]
            should_merge = (
                not prev_text.endswith(sentence_endings)
                and numbered_start.match(prev_text)
                and not numbered_item.match(next_text)
                and len(prev_text) > 30
            )
            if should_merge:
                if processed:
                    # Append to the last emitted entry rather than rebuilding
                    # it from prev_text: when the previous paragraph was
                    # itself merged a moment ago, the last entry already
                    # holds earlier text that must not be overwritten.
                    processed[-1] += next_text
                else:
                    processed.append(prev_text + next_text)
                # Skip the next paragraph; it has been consumed by the merge.
                index += 2
                continue

        # Blank paragraph without a merge is simply dropped.
        index += 1
    return processed

# Hard-coded path of the .docx file to load.
doc_path = 'D:\\flask_project\\flask_app\\static\\output\\015d997e-c32c-49d1-a611-a2e817ace6a1\\ztbfile.docx'
doc = Document(doc_path)

# Run the paragraph preprocessing on the loaded document, then print every
# resulting entry underneath a separator line.
processed_paragraphs = preprocess_paragraphs(doc.paragraphs)
for entry in processed_paragraphs:
    print("___________________________", entry, sep="\n")