52 lines
2.0 KiB
Python
Raw Normal View History

2024-10-15 20:57:58 +08:00
import re
from docx import Document
# Matches text that begins with a (possibly dotted) number such as "3" or "1 . 2".
_LEADING_NUMBER = re.compile(r'^(\d+(\s*\.\s*\d+)*)\s*')
# Matches text that begins with a number followed by the list separator "、".
_ENUMERATED_ITEM = re.compile(r'^(\d+(\s*\.\s*\d+)*)\s*、')
# Punctuation that marks the previous paragraph as already finished.
_DEFAULT_END_PUNCTUATION = ('。', ',', ',', '!', '!', '?', '?')


def preprocess_paragraphs(paragraphs, end_punctuation=_DEFAULT_END_PUNCTUATION):
    """Strip paragraphs, drop blank ones and re-join text split by a blank line.

    A blank paragraph that sits between two other paragraphs is treated as an
    accidental break, and the following paragraph is appended to the preceding
    one, when all of these hold for the stripped texts:

    * the preceding text is longer than 30 characters,
    * the preceding text does not end with one of ``end_punctuation``,
    * the preceding text starts with a (possibly dotted) number,
    * the following text does not start with a number followed by "、".

    Args:
        paragraphs: Iterable of objects exposing a ``text`` string attribute.
        end_punctuation: Suffixes that prevent a merge. Empty strings are
            ignored because ``str.endswith('')`` is true for every string,
            which would silently disable merging altogether.

    Returns:
        A list of non-empty, stripped paragraph strings with merges applied.
    """
    suffixes = tuple(mark for mark in end_punctuation if mark)
    texts = [paragraph.text.strip() for paragraph in paragraphs]
    total = len(texts)

    processed = []
    index = 0
    while index < total:
        current_text = texts[index]
        if current_text:
            # Non-blank paragraph: keep it as is.
            processed.append(current_text)
            index += 1
            continue

        # Blank paragraph: only consider merging when it has both neighbours.
        if 0 < index < total - 1:
            prev_text = texts[index - 1]
            next_text = texts[index + 1]
            should_merge = (
                len(prev_text) > 30
                and not prev_text.endswith(suffixes)
                and _LEADING_NUMBER.match(prev_text) is not None
                and _ENUMERATED_ITEM.match(next_text) is None
            )
            if should_merge:
                # prev_text is non-empty here, so it is already the tail of
                # processed[-1] (possibly after an earlier merge). Appending
                # instead of overwriting keeps text accumulated by that merge.
                processed[-1] += next_text
                # Skip the blank line and the paragraph that was just merged.
                index += 2
                continue

        index += 1
    return processed
# Location of the .docx file to load.
doc_path = 'D:\\flask_project\\flask_app\\static\\output\\015d997e-c32c-49d1-a611-a2e817ace6a1\\ztbfile.docx'
doc = Document(doc_path)
# Run the paragraph preprocessing over every paragraph of the loaded document.
processed_paragraphs = preprocess_paragraphs(doc.paragraphs)
# Print each resulting paragraph underneath a separator line.
for i in processed_paragraphs:
    print("___________________________", i, sep="\n")
2024-10-10 21:03:02 +08:00