import re

try:
    from docx import Document
except ImportError:
    # python-docx is only used by main(); preprocess_paragraphs() works on
    # any objects exposing a ``.text`` attribute, so keep the module
    # importable without the package installed.
    Document = None

# Leading section number such as "3", "3.1" or "3 . 1 . 2".
_NUMBERED_PREFIX = re.compile(r'^(\d+(\s*\.\s*\d+)*)\s*')
# Enumerated list marker such as "2、" or "2.1 、".
_ENUMERATED_ITEM = re.compile(r'^(\d+(\s*\.\s*\d+)*)\s*、')
# A paragraph ending with one of these is never merged with what follows.
# NOTE(review): ',' appears twice; one entry was possibly meant to be the
# full-width comma -- confirm against real documents.
_END_PUNCTUATION = (',', ',', '。', '!', '?')
# The previous paragraph must be longer than this to be merged.
_MIN_MERGE_LENGTH = 30

# Default input document used when the module is run as a script.
doc_path = 'D:\\flask_project\\flask_app\\static\\output\\015d997e-c32c-49d1-a611-a2e817ace6a1\\ztbfile.docx'


def _should_merge(prev_text, next_text):
    """Return True when a blank line between the two texts should be dropped."""
    if prev_text.endswith(_END_PUNCTUATION):
        return False
    if len(prev_text) <= _MIN_MERGE_LENGTH:
        return False
    if not _NUMBERED_PREFIX.match(prev_text):
        return False
    # A following "N、" item starts a new entry, not a continuation.
    return not _ENUMERATED_ITEM.match(next_text)


def preprocess_paragraphs(paragraphs):
    """Strip paragraphs, drop blank ones and re-join paragraphs split by a blank.

    A blank paragraph that has both a predecessor and a successor is treated
    as a spurious break, and the successor is appended to the predecessor,
    when all of the following hold for the stripped texts:

    * the predecessor does not end with one of ``_END_PUNCTUATION``;
    * the predecessor starts with a number such as ``3`` or ``3.1.2``;
    * the predecessor is longer than ``_MIN_MERGE_LENGTH`` characters;
    * the successor does not start with an enumerated marker such as ``2、``.

    Args:
        paragraphs: A sequence (supports ``len`` and indexing) of objects
            with a ``.text`` string attribute.

    Returns:
        A list of stripped, non-blank paragraph strings with qualifying
        neighbours concatenated.
    """
    processed = []
    total = len(paragraphs)
    index = 0
    while index < total:
        current_text = paragraphs[index].text.strip()

        if current_text:
            # Non-blank paragraph: keep it as is.
            processed.append(current_text)
            index += 1
            continue

        # Blank paragraph: only consider merging when it has both neighbours.
        if 0 < index < total - 1:
            prev_text = paragraphs[index - 1].text.strip()
            next_text = paragraphs[index + 1].text.strip()
            if _should_merge(prev_text, next_text):
                if processed:
                    # Extend the already-emitted entry rather than rebuilding
                    # it from prev_text: the entry may itself be the result of
                    # an earlier merge, and overwriting it would lose text.
                    processed[-1] += next_text
                else:
                    processed.append(prev_text + next_text)
                # The following paragraph has been consumed; skip it.
                index += 2
                continue

        index += 1
    return processed


def main(path=doc_path):
    """Load the .docx at *path* and print each preprocessed paragraph."""
    if Document is None:
        raise ImportError("python-docx is required to read .docx files")
    doc = Document(path)
    for text in preprocess_paragraphs(doc.paragraphs):
        print("___________________________")
        print(text)


if __name__ == "__main__":
    main()