2024-10-15 20:57:58 +08:00

52 lines
2.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
from docx import Document
# Punctuation that marks the previous paragraph as already complete, so it
# must not be glued to the paragraph after the blank line.
# NOTE(review): the tuple this replaces contained empty strings; an empty
# suffix makes str.endswith() return True for every string, which disabled
# merging entirely. The missing entries are assumed to have been full-width
# punctuation (e.g. the CJK full stop and semicolon) -- confirm the exact set.
_SENTENCE_ENDINGS = ('。', ',', ',', ';', ';', '!', '!', '?', '?')

# Leading number such as "3", "3.1" or "3 . 1 . 2", plus optional whitespace.
_LEADING_NUMBER = re.compile(r'^(\d+(\s*\.\s*\d+)*)\s*')

# Same leading number immediately followed by the enumeration comma "、".
_LEADING_LIST_INDEX = re.compile(r'^(\d+(\s*\.\s*\d+)*)\s*、')


def preprocess_paragraphs(
    paragraphs,
    *,
    sentence_endings=_SENTENCE_ENDINGS,
    min_prev_length=30,
):
    """Return stripped paragraph texts, re-joining text split by a blank paragraph.

    Blank paragraphs (empty after ``strip()``) are never emitted. When a blank
    paragraph sits between two others, the following paragraph is appended to
    the preceding one if all of these hold:

    * the preceding text does not end with one of ``sentence_endings``;
    * the preceding text starts with a number (``"3"``, ``"3.1"``, ...);
    * the following text does not start with a number followed by ``"、"``;
    * the preceding text is longer than ``min_prev_length`` characters.

    Args:
        paragraphs: Sequence of objects exposing a string ``text`` attribute.
        sentence_endings: Suffixes that prevent a merge. Empty strings are
            ignored because every string "ends with" the empty string.
        min_prev_length: The preceding text must be strictly longer than this
            for a merge to happen.

    Returns:
        A list of stripped, non-blank paragraph strings with merges applied.
    """
    # Drop empty suffixes: endswith('') is always True and would veto every merge.
    endings = tuple(mark for mark in sentence_endings if mark)

    processed = []
    total = len(paragraphs)
    index = 0
    while index < total:
        current_text = paragraphs[index].text.strip()

        # Non-blank paragraph: keep it as is.
        if current_text:
            processed.append(current_text)
            index += 1
            continue

        # Blank paragraph: a merge needs both a previous and a next paragraph.
        if 0 < index < total - 1:
            prev_text = paragraphs[index - 1].text.strip()
            next_text = paragraphs[index + 1].text.strip()
            should_merge = (
                not prev_text.endswith(endings)
                and _LEADING_NUMBER.match(prev_text) is not None
                and _LEADING_LIST_INDEX.match(next_text) is None
                and len(prev_text) > min_prev_length
            )
            if should_merge:
                if processed:
                    # processed[-1] already ends with prev_text (it may itself
                    # be the result of an earlier merge), so extend it rather
                    # than overwrite it and lose the earlier part.
                    processed[-1] += next_text
                else:
                    processed.append(prev_text + next_text)
                # Skip the blank paragraph and the one just merged.
                index += 2
                continue

        index += 1
    return processed
# Ad-hoc driver: load one specific .docx file from a hard-coded local path,
# run the paragraph preprocessing on it and dump the result to stdout.
doc_path = 'D:\\flask_project\\flask_app\\static\\output\\015d997e-c32c-49d1-a611-a2e817ace6a1\\ztbfile.docx'
doc = Document(doc_path)

# Every paragraph of the loaded document goes through the preprocessing step.
processed_paragraphs = preprocess_paragraphs(doc.paragraphs)

# Emit a separator line followed by the paragraph text, one pair per entry.
for i in processed_paragraphs:
    print("___________________________", i, sep="\n")