52 lines
2.0 KiB
Python
52 lines
2.0 KiB
Python
import re
|
||
|
||
from docx import Document
|
||
def preprocess_paragraphs(paragraphs):
    """Drop blank paragraphs and re-join numbered clauses split by one.

    Each paragraph's ``text`` is stripped. Non-blank texts are collected in
    order and blank ones are discarded. When a blank paragraph has both a
    predecessor and a successor, and the predecessor

    * does not end with one of the listed punctuation marks,
    * starts with a number such as ``3`` or ``3.1.2``, and
    * is longer than 30 characters,

    while the successor does not start with a ``N、`` style list marker, the
    successor is treated as the continuation of the predecessor and is
    appended to it instead of being emitted as its own entry.

    Args:
        paragraphs: Sequence of objects exposing a ``text`` string attribute
            (the caller in this file passes ``Document.paragraphs``).

    Returns:
        list[str]: The stripped paragraph texts after merging.
    """
    # Compiled once here rather than on every loop iteration.
    numbered_start = re.compile(r'^(\d+(\s*\.\s*\d+)*)\s*')
    list_marker_start = re.compile(r'^(\d+(\s*\.\s*\d+)*)\s*、')
    # NOTE(review): ',' is listed twice; one entry was presumably meant to be
    # the full-width ',' (and '!'/'?' possibly full-width too) - confirm
    # against the documents being processed.
    closing_punctuation = (',', ',', '。', '!', '?')

    processed = []
    total = len(paragraphs)
    index = 0
    while index < total:
        current_text = paragraphs[index].text.strip()

        # Non-blank paragraph: emit it unchanged.
        if current_text:
            processed.append(current_text)
            index += 1
            continue

        # Blank paragraph: only consider a merge when it has both neighbours.
        if 0 < index < total - 1:
            prev_text = paragraphs[index - 1].text.strip()
            next_text = paragraphs[index + 1].text.strip()
            if (
                len(prev_text) > 30
                and not prev_text.endswith(closing_punctuation)
                and numbered_start.match(prev_text)
                and not list_marker_start.match(next_text)
            ):
                if processed:
                    # Extend the entry already emitted for the previous
                    # paragraph. That entry may itself be the result of an
                    # earlier merge, so rebuilding it from prev_text alone
                    # would discard the text merged in before.
                    processed[-1] += next_text
                else:
                    processed.append(prev_text + next_text)
                # The following paragraph has been consumed by the merge.
                index += 2
                continue

        index += 1
    return processed
|
||
|
||
# Absolute, machine-specific location of the Word document to clean up.
doc_path = 'D:\\flask_project\\flask_app\\static\\output\\015d997e-c32c-49d1-a611-a2e817ace6a1\\ztbfile.docx'
doc = Document(doc_path)

# Run the paragraph clean-up over every paragraph of the loaded document.
processed_paragraphs = preprocess_paragraphs(doc.paragraphs)

# Print each resulting paragraph underneath a separator line.
for i in processed_paragraphs:
    print("___________________________", i, sep="\n")
|
||
|
||
|