2024-10-15 20:57:58 +08:00
|
|
|
|
import re
|
|
|
|
|
|
|
|
|
|
from docx import Document
|
|
|
|
|
def preprocess_paragraphs(paragraphs):
    """Return stripped paragraph texts, re-joining numbered clauses split by a blank line.

    Each item of ``paragraphs`` only needs a ``.text`` string attribute.
    Blank paragraphs are never emitted.  A blank paragraph that sits between
    two other paragraphs triggers a merge of its neighbours when all of the
    following hold for the stripped texts:

    * the previous text does not end with one of the listed punctuation marks,
    * the previous text starts with a section number such as ``3`` or ``3.1.2``,
    * the previous text is longer than 30 characters,
    * the next text does not start with a list marker such as ``2、``.

    On a merge, the next text is appended to the last emitted entry and the
    next paragraph is skipped.

    Args:
        paragraphs: Indexable sequence of objects exposing ``.text``.

    Returns:
        list[str]: The processed paragraph texts in document order.
    """
    # Section number at the start of a line, e.g. "3" or "3.1.2".
    section_number = re.compile(r'^(\d+(\s*\.\s*\d+)*)\s*')
    # Enumerated list item at the start of a line, e.g. "2、".
    list_item = re.compile(r'^(\d+(\s*\.\s*\d+)*)\s*、')
    # NOTE(review): ',' is listed twice; a full-width comma may have been
    # intended for one of them -- confirm against the original file.
    sentence_endings = (',', ',', '。', '!', '?')

    processed = []
    total = len(paragraphs)
    index = 0
    while index < total:
        current_text = paragraphs[index].text.strip()

        if current_text:
            # Non-blank paragraph: emit it as is (already stripped).
            processed.append(current_text)
            index += 1
            continue

        # Blank paragraph: a merge needs both a previous and a next paragraph.
        if 0 < index < total - 1:
            prev_text = paragraphs[index - 1].text.strip()
            next_text = paragraphs[index + 1].text.strip()

            should_merge = (
                not prev_text.endswith(sentence_endings)
                and section_number.match(prev_text) is not None
                and list_item.match(next_text) is None
                and len(prev_text) > 30
            )
            if should_merge:
                if processed:
                    # Extend the entry that was actually emitted.  It may
                    # already hold text from an earlier merge, so rebuilding
                    # it from prev_text alone would drop that earlier text.
                    processed[-1] += next_text
                else:
                    processed.append(prev_text + next_text)
                # Skip the blank paragraph and the one just merged.
                index += 2
                continue

        # Blank paragraph without a merge: drop it.
        index += 1

    return processed
|
|
|
|
|
|
|
|
|
|
# Script section: open a hard-coded .docx file, preprocess its paragraphs and
# print each resulting text preceded by a separator line.
doc_path = 'D:\\flask_project\\flask_app\\static\\output\\015d997e-c32c-49d1-a611-a2e817ace6a1\\ztbfile.docx'
doc = Document(doc_path)

# `doc` is assumed to be your document object.
processed_paragraphs = preprocess_paragraphs(doc.paragraphs)
for paragraph_text in processed_paragraphs:
    # One call prints the separator line, then the paragraph on its own line.
    print("___________________________", paragraph_text, sep="\n")
|
2024-10-10 21:03:02 +08:00
|
|
|
|
|
|
|
|
|
|