12.30 Invalid bid items

zy123 2024-12-30 10:11:27 +08:00
parent 70b6a4b999
commit 8bfbfc4263
2 changed files with 193 additions and 19 deletions


@ -74,9 +74,12 @@ def read_tables_from_docx(file_path):
def preprocess_paragraphs(paragraphs):
    processed = []  # initialize the list of processed paragraphs
    index = 0
    flag = False  # initialize the flag
    # define two new regular expression patterns
    pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')
    pattern_parentheses = re.compile(r'^\s*[(]\s*([一二三四五六七八九十]{1,2})\s*[)]\s*')
    # pattern for list items
    list_item_pattern = re.compile(
        r'^\s*('
@ -91,6 +94,14 @@ def preprocess_paragraphs(paragraphs):
        r')'
    )
    # new regexes for paragraphs that start with a numeric section number
    pattern_numeric_header = re.compile(
        r'^(?<![a-zA-Z(])(\d+(?:\.\d+)+)\s*(.*)'  # matches e.g. '12.1 内容'
    )
    pattern_numeric_header_fallback = re.compile(
        r'^(\d+\.)\s*(.+)$'  # matches e.g. '12. 内容'
    )
    # whether the text contains a run of whitespace longer than the given length: excludes tables and fill-in blanks
    def has_long_spaces(text, max_space_count=5):
        return any(len(space) > max_space_count for space in re.findall(r'\s+', text))
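
A quick spot-check of what the two new numeric-header patterns accept (a sketch; the sample strings are made up, not from the repository):

# Hypothetical spot-checks for the numeric-header patterns and has_long_spaces.
import re

pattern_numeric_header = re.compile(r'^(?<![a-zA-Z(])(\d+(?:\.\d+)+)\s*(.*)')
pattern_numeric_header_fallback = re.compile(r'^(\d+\.)\s*(.+)$')

assert pattern_numeric_header.match('12.1 投标人须知')        # dotted multi-level number: matched
assert not pattern_numeric_header.match('12. 内容')           # the primary pattern needs at least one '.digits' group
assert pattern_numeric_header_fallback.match('12. 内容')      # single-level '12.' is caught by the fallback
assert not pattern_numeric_header.match('A12.1 条款')         # letter-prefixed numbers are not section headers here

def has_long_spaces(text, max_space_count=5):
    return any(len(space) > max_space_count for space in re.findall(r'\s+', text))

assert has_long_spaces('名称          金额')   # a long run of spaces, e.g. a flattened table row
assert not has_long_spaces('普通 正文 段落')
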
@ -101,7 +112,10 @@ def preprocess_paragraphs(paragraphs):
    # helper: find the previous non-empty, non-marker paragraph
    def find_prev_text(current_index):
        for i in range(current_index - 1, -1, -1):
            try:
                text = paragraphs[i].text.strip()
            except AttributeError:
                continue  # skip paragraph objects without a text attribute
            if text and not pattern_marker.search(text):
                return text, i
        return '', -1
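
The new try/except guards matter when the paragraph list contains items without a .text attribute. A minimal, self-contained illustration (the inputs are hypothetical, not from the repo):

import re
from types import SimpleNamespace

pattern_marker = re.compile(r'\$\$index_mark_\d+\$\$')
paragraphs = [
    SimpleNamespace(text='1.1 资格要求'),
    object(),                                  # no .text attribute: would raise AttributeError
    SimpleNamespace(text=''),                  # blank paragraph
    SimpleNamespace(text='$$index_mark_3$$'),  # page marker
]

def find_prev_text(current_index):
    for i in range(current_index - 1, -1, -1):
        try:
            text = paragraphs[i].text.strip()
        except AttributeError:
            continue  # tolerate non-paragraph items
        if text and not pattern_marker.search(text):
            return text, i
    return '', -1

print(find_prev_text(3))  # ('1.1 资格要求', 0): the blank line and the bad item are skipped
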
@ -109,13 +123,21 @@ def preprocess_paragraphs(paragraphs):
    # helper: find the next non-empty, non-marker paragraph
    def find_next_text(current_index):
        for i in range(current_index + 1, len(paragraphs)):
            try:
                text = paragraphs[i].text.strip()
            except AttributeError:
                continue  # skip paragraph objects without a text attribute
            if text and not pattern_marker.search(text):
                return text, i
        return '', -1

    while index < len(paragraphs):
        try:
            current_text = paragraphs[index].text.strip()  # strip leading/trailing whitespace from the current paragraph
        except AttributeError:
            # skip paragraph objects without a text attribute
            index += 1
            continue

        # check whether the current paragraph is a page marker
        if pattern_marker.search(current_text):
@ -147,7 +169,11 @@ def preprocess_paragraphs(paragraphs):
            # from the current index downward, skip all consecutive blank paragraphs and markers
            skip_index = index + 1
            while skip_index < len(paragraphs):
                try:
                    skip_text = paragraphs[skip_index].text.strip()
                except AttributeError:
                    skip_index += 1
                    continue  # skip paragraph objects without a text attribute
                if skip_text == '' or pattern_marker.search(skip_text):
                    skip_index += 1
                else:
@ -161,12 +187,155 @@ def preprocess_paragraphs(paragraphs):
            index += 1
            continue

        # check whether the paragraph starts with a numeric section number
        match = pattern_numeric_header.match(current_text)
        if not match:
            match = pattern_numeric_header_fallback.match(current_text)
        if match:
            # the paragraph starts with a numeric section number: append it to processed directly
            processed.append(current_text)
            flag = True  # set the flag, ready to handle the next paragraph
            index += 1
            continue
        else:
            if flag:
                # the paragraph does not start with a numeric section number, and flag is True
                if not list_item_pattern.match(current_text):
                    if processed:
                        # **new logic begins**
                        next_non_empty_text, next_non_empty_index = find_next_text(index)
                        is_next_numbered = False
                        if next_non_empty_text:
                            is_next_numbered = bool(
                                pattern_numeric_header.match(next_non_empty_text) or
                                pattern_numeric_header_fallback.match(next_non_empty_text)
                            )
                        if is_next_numbered:
                            # append the current paragraph to the previous one only if the next paragraph starts with a numeric section number
                            processed[-1] = processed[-1] + ' ' + current_text
                        else:
                            # otherwise do not append; add it as a new paragraph instead
                            processed.append(current_text)
                        # **new logic ends**
                else:
                    # **new handling: paragraphs matching list_item_pattern should also be kept**
                    processed.append(current_text)
                # reset the flag whether or not we appended
                flag = False
                index += 1
                continue
            else:
                # flag is False: append directly to processed
                processed.append(current_text)
                index += 1
                continue

    return processed
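
With preprocess_paragraphs defined as above, a sketch of the new flag-driven merge on mock paragraphs (SimpleNamespace stands in for python-docx Paragraph objects; the strings are invented):

from types import SimpleNamespace

mock = [SimpleNamespace(text=t) for t in [
    '12.1 投标文件逾期送达的',
    '视为无效投标。',                # unnumbered continuation, and the next paragraph is numbered
    '12.2 未按规定密封的',
    '一、其他说明事项详见招标公告',  # matches list_item_pattern, so it is kept as its own paragraph
]]
print(preprocess_paragraphs(mock))
# expected: ['12.1 投标文件逾期送达的 视为无效投标。',
#            '12.2 未按规定密封的',
#            '一、其他说明事项详见招标公告']
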
# Old version
# def preprocess_paragraphs(paragraphs):
#     processed = []  # initialize the list of processed paragraphs
#     index = 0
#     # define two new regular expression patterns
#     pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')
#     pattern_parentheses = re.compile(r'^\s*[(]\s*([一二三四五六七八九十]{1,2})\s*[)]\s*')
#     # pattern for list items
#     list_item_pattern = re.compile(
#         r'^\s*('
#         r'[\(]\d+[\)]|'  # matches: (1) or （1）
#         r'[A-Za-z]\.\s*|'  # matches: A. or b.
#         r'[一二三四五六七八九十]+、|'  # matches: 一、 二、 三、
#         r'第[一二三四五六七八九十百零]+[章节部分节]|'  # matches: 第x章, 第x部分, 第x节
#         r'[A-Za-z]\d+(?:\.\d+)*[\s\.、.)\]?|'  # matches: A1.2 and the like
#         r'\d+(?:\.\d+)+[\s\.、.)\]?(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万])|'  # matches numeric section numbers such as 1.1, 1.1.1
#         r'(?=\d+\s(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万]))|'  # digit followed by a space, where what follows is not one of the listed keywords
#         r'(?=\d+[、.])(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万])'  # digit directly followed by an enumeration comma or period
#         r')'
#     )
#
#     # whether the text contains a run of whitespace longer than the given length: excludes tables and fill-in blanks
#     def has_long_spaces(text, max_space_count=5):
#         return any(len(space) > max_space_count for space in re.findall(r'\s+', text))
#
#     # regex used to detect page markers
#     pattern_marker = re.compile(r'\$\$index_mark_\d+\$\$')
#
#     # helper: find the previous non-empty, non-marker paragraph
#     def find_prev_text(current_index):
#         for i in range(current_index - 1, -1, -1):
#             text = paragraphs[i].text.strip()
#             if text and not pattern_marker.search(text):
#                 return text, i
#         return '', -1
#
#     # helper: find the next non-empty, non-marker paragraph
#     def find_next_text(current_index):
#         for i in range(current_index + 1, len(paragraphs)):
#             text = paragraphs[i].text.strip()
#             if text and not pattern_marker.search(text):
#                 return text, i
#         return '', -1
#
#     while index < len(paragraphs):
#         try:
#             current_text = paragraphs[index].text.strip()  # strip leading/trailing whitespace from the current paragraph
#         except AttributeError:
#             # skip paragraph objects without a text attribute
#             index += 1
#             continue
#
#         # check whether the current paragraph is a page marker
#         if pattern_marker.search(current_text):
#             # dynamically find the previous non-empty paragraph
#             prev_text, prev_index = find_prev_text(index)
#             # dynamically find the next non-empty paragraph
#             next_text, next_index = find_next_text(index)
#
#             # apply the existing merge logic
#             if prev_text and next_text and not has_long_spaces(prev_text) and not has_long_spaces(next_text):
#                 if not prev_text.endswith(('。', '!', '?')):  # '；' and '，' are commented out for now: after a comma the sentence may not be finished
#                     # check whether the next paragraph is a list item
#                     if not list_item_pattern.match(next_text) and len(prev_text) > 30:
#                         # merge the two paragraphs
#                         merged_text = prev_text + ' ' + next_text  # add a space for readability
#                         if prev_index < len(paragraphs):
#                             # remove the previous paragraph from processed
#                             if processed and processed[-1] == prev_text:
#                                 processed.pop()
#                             # append the merged text
#                             processed.append(merged_text)
#
#                         # skip the marker and all surrounding blank paragraphs, up to next_index
#                         index = next_index + 1
#                         continue  # move on to the next iteration
#
#             # if the merge conditions are not met, skip the marker and the blank paragraphs around it:
#             # from the current index downward, skip all consecutive blank paragraphs and markers
#             skip_index = index + 1
#             while skip_index < len(paragraphs):
#                 skip_text = paragraphs[skip_index].text.strip()
#                 if skip_text == '' or pattern_marker.search(skip_text):
#                     skip_index += 1
#                 else:
#                     break
#             index = skip_index
#             continue  # move on to the next iteration
#
#         # check whether the current paragraph matches either exclusion pattern
#         if (pattern_numbered.match(current_text) or pattern_parentheses.match(current_text)) and len(current_text) < 8:
#             # if it matches, skip it and do not add it to the processed list
#             index += 1
#             continue
#
#         # if the paragraph is neither a marker nor an exclusion match, add it to the processed list
#         processed.append(current_text)
#         index += 1
#
#     return processed
# if the current paragraph has a section number, scan downward until a paragraph with the same numbering style is encountered
# if the current paragraph has no section number, scan downward for numbered paragraphs and pull out the run of same-style items
def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
@ -224,7 +393,7 @@ def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
    # for i in range(1, 6):  # same-level handling duplicates matched_patterns = [pattern], so commented out
    #     # copy the parts list to avoid mutating the original
    #     new_parts = parts.copy()
    #     new_parts[-1] = str(int(new_parts[-1]) + i)
    #     # use a different separator
    #     next_pattern = r'^' + r'\s*[.]\s*'.join(new_parts)
@ -262,6 +431,9 @@ def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
        while current_index < len(processed_paragraphs) - 1:
            current_index += 1
            next_text = processed_paragraphs[current_index].strip()
            # handle blank lines
            if not next_text:
                continue  # skip blank lines and move on to the next iteration
            if not found_next_number:
                # revised regex: also supports the 'digits 、' format
                next_section_number = re.match(r'^([A-Za-z0-9]+(?:[.][A-Za-z0-9]+)*)|([(]\s*\d+\s*[)])|(\d+\s*、)',
@ -289,7 +461,9 @@ def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
    processed_paragraphs = preprocess_paragraphs(doc.paragraphs)
    index = 0
    while index < len(processed_paragraphs):
        # print(processed_paragraphs[index].strip())
        index = extract_from_text(processed_paragraphs[index].strip(), index)
        # print("--------------")
        index += 1
    return extracted_paragraphs
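
The driver loop above works because extract_from_text returns the last index it consumed, so the caller resumes scanning right after an extracted block. A minimal sketch of that contract, with a stand-in extractor (extract_block and the sample lines are hypothetical, not the real extractor):

def extract_block(text, i, lines, out):
    # stand-in for extract_from_text: on a hit, consume the block and
    # return the last index used so the caller resumes after it
    if text.startswith('废标'):
        j = i
        while j + 1 < len(lines) and lines[j + 1].startswith('  '):
            j += 1  # consume indented continuation lines
        out.append(lines[i:j + 1])
        return j
    return i

lines = ['前言', '废标条款一', '  细则 a', '  细则 b', '其他']
out, i = [], 0
while i < len(lines):
    i = extract_block(lines[i].strip(), i, lines, out)
    i += 1
print(out)  # [['废标条款一', '  细则 a', '  细则 b']]
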
@ -664,7 +838,7 @@ def combine_find_invalid(invalid_docpath, output_dir):
print("无效标与废标done...") print("无效标与废标done...")
return {"无效标与废标项": combined_dict} return {"无效标与废标项": combined_dict}
#TODO:无效标这里提示词修改一下 #TODO:preprocess测试一下有无问题
if __name__ == '__main__':
    start_time = time.time()
    # truncate_json_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp2\\竞争性谈判文件(3)_tobidders_notice_part1\\truncate_output.json"
@ -672,12 +846,12 @@ if __name__ == '__main__':
    # clause_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\77a48c63-f39f-419b-af2a-7b3dbf41b70b\\clause1.json"
    # doc_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\磋商文件(1).docx"
    # doc_path = r'C:\Users\Administrator\Desktop\new招标文件\tmp\2024-贵州-贵州省罗甸县 2024 年度广州市协作资金龙坪镇、边阳镇产业路硬化建设项目.docx'
    pdf_path = r'C:\Users\Administrator\Desktop\货物标\truncate_all\2-招标文件2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目.pdf'
    output_dir = r"C:\Users\Administrator\Desktop\货物标\truncate_all"
    # invalid_added = insert_mark(pdf_path)
    # invalid_added_docx = pdf2docx(invalid_added)
    invalid_added_docx = r'C:\Users\Administrator\Desktop\货物标\truncate_all\invalid_added.docx'
    results = combine_find_invalid(invalid_added_docx, output_dir)
    end_time = time.time()
    print("Results:", json.dumps(results, ensure_ascii=False, indent=4))