From 8bfbfc4263ea7252c88094a1d07013eb5b5e2641 Mon Sep 17 00:00:00 2001
From: zy123 <646228430@qq.com>
Date: Mon, 30 Dec 2024 10:11:27 +0800
Subject: [PATCH] =?UTF-8?q?12.30=20=E5=BA=9F=E6=A0=87=E9=A1=B9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../投标人须知正文条款提取成json文件.py   |   8 +-
 flask_app/general/无效标和废标公共代码.py | 204 ++++++++++++++++--
 2 files changed, 193 insertions(+), 19 deletions(-)

diff --git a/flask_app/general/投标人须知正文条款提取成json文件.py b/flask_app/general/投标人须知正文条款提取成json文件.py
index 75a61a0..0b9d6b6 100644
--- a/flask_app/general/投标人须知正文条款提取成json文件.py
+++ b/flask_app/general/投标人须知正文条款提取成json文件.py
@@ -113,10 +113,10 @@ def parse_text_by_heading(text):
             current_content.append(line_stripped)
             continue
 
-        # 匹配带数字的标题,例如 '12.1 内容'
-        match = re.match(r'^(?

diff --git a/flask_app/general/无效标和废标公共代码.py b/flask_app/general/无效标和废标公共代码.py
--- a/flask_app/general/无效标和废标公共代码.py
+++ b/flask_app/general/无效标和废标公共代码.py
@@ ... @@ def preprocess_paragraphs(paragraphs):
         return any(len(space) > max_space_count for space in re.findall(r'\s+', text))
@@ -101,7 +112,10 @@ def preprocess_paragraphs(paragraphs):
     # 辅助函数:查找上一个非空且非标记的段落
     def find_prev_text(current_index):
         for i in range(current_index - 1, -1, -1):
-            text = paragraphs[i].text.strip()
+            try:
+                text = paragraphs[i].text.strip()
+            except AttributeError:
+                continue  # 如果段落对象没有 text 属性,跳过
             if text and not pattern_marker.search(text):
                 return text, i
         return '', -1
@@ -109,13 +123,21 @@ def preprocess_paragraphs(paragraphs):
     # 辅助函数:查找下一个非空且非标记的段落
     def find_next_text(current_index):
         for i in range(current_index + 1, len(paragraphs)):
-            text = paragraphs[i].text.strip()
+            try:
+                text = paragraphs[i].text.strip()
+            except AttributeError:
+                continue  # 如果段落对象没有 text 属性,跳过
             if text and not pattern_marker.search(text):
                 return text, i
         return '', -1
 
     while index < len(paragraphs):
-        current_text = paragraphs[index].text.strip()  # 去除当前段落的前后空白
+        try:
+            current_text = paragraphs[index].text.strip()  # 去除当前段落的前后空白
+        except AttributeError:
+            # 如果段落对象没有 text 属性,跳过该段落
+            index += 1
+            continue
 
         # 检查当前段落是否为页面标记
         if pattern_marker.search(current_text):
@@ -147,7 +169,11 @@ def preprocess_paragraphs(paragraphs):
             # 从当前 index 向下,跳过所有连续的空白段落和标记
             skip_index = index + 1
             while skip_index < len(paragraphs):
-                skip_text = paragraphs[skip_index].text.strip()
+                try:
+                    skip_text = paragraphs[skip_index].text.strip()
+                except AttributeError:
+                    skip_index += 1
+                    continue  # 如果段落对象没有 text 属性,跳过
                 if skip_text == '' or pattern_marker.search(skip_text):
                     skip_index += 1
                 else:
@@ -161,12 +187,155 @@ def preprocess_paragraphs(paragraphs):
             index += 1
             continue
 
-        # 如果当前段落不是标记且不匹配排除模式,则添加到处理后的列表中
-        processed.append(current_text)
-        index += 1
+        # 检查是否为以数字序号开头的段落
+        match = pattern_numeric_header.match(current_text)
+        if not match:
+            match = pattern_numeric_header_fallback.match(current_text)
+
+        if match:
+            # 当前段落以数字序号开头,直接添加到 processed
+            processed.append(current_text)
+            flag = True  # 设置标志位,准备处理下一个段落
+            index += 1
+            continue
+        else:
+            if flag:
+                # 当前段落不以数字序号开头,且 flag 为 True
+                if not list_item_pattern.match(current_text):
+                    if processed:
+                        # **新增逻辑开始**
+                        next_non_empty_text, next_non_empty_index = find_next_text(index)
+                        is_next_numbered = False
+                        if next_non_empty_text:
+                            is_next_numbered = bool(
+                                pattern_numeric_header.match(next_non_empty_text) or
+                                pattern_numeric_header_fallback.match(next_non_empty_text)
+                            )
+
+                        if is_next_numbered:
+                            # 只有在下一个段落以数字序号开头时,才将当前段落追加到上一个段落
+                            processed[-1] = processed[-1] + ' ' + current_text
+                        else:
+                            # 否则,不追加,而是作为新的段落添加
+                            processed.append(current_text)
+                        # **新增逻辑结束**
+                else:
+                    # **新增处理:匹配 list_item_pattern 的段落也应被保存**
+                    processed.append(current_text)
+                # 无论是否追加,都将 flag 重置
+                flag = False
+                index += 1
+                continue
+            else:
+                # flag 为 False,直接添加到 processed
+                processed.append(current_text)
+                index += 1
+                continue
 
     return processed
 
+#老版本
+# def preprocess_paragraphs(paragraphs):
+#     processed = []  # 初始化处理后的段落列表
+#     index = 0
+#     # 定义两个新的正则表达式模式
+#     pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')
+#     pattern_parentheses = re.compile(r'^\s*[((]\s*([一二三四五六七八九十]{1,2})\s*[))]\s*')
+#     # 定义列表项的模式
+#     list_item_pattern = re.compile(
+#         r'^\s*('
+#         r'[(\(]\d+[)\)]|'  # 匹配:(1) 或 (1)
+#         r'[A-Za-z]\.\s*|'  # 匹配:A. 或 b.
+#         r'[一二三四五六七八九十]+、|'  # 匹配:一、二、三、
+#         r'第[一二三四五六七八九十百零]+[章节部分节]|'  # 匹配:第x章,第x部分,第x节
+#         r'[A-Za-z]\d+(?:\.\d+)*[\s\.、.)\)]?|'  # 匹配:A1.2 等
+#         r'\d+(?:\.\d+)+[\s\.、.)\)]?(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万])|'  # 匹配:数字序号如1.1 1.1.1
+#         r'(?=\d+\s(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万]))|'  # 数字后空格,空格后非指定关键字
+#         r'(?=\d+[、..])(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万])'  # 数字后直接跟顿号或点号
+#         r')'
+#     )
+#
+#     # 是否存在连续超过指定长度的空白字符序列: 排除遇到表格、填空的情况
+#     def has_long_spaces(text, max_space_count=5):
+#         return any(len(space) > max_space_count for space in re.findall(r'\s+', text))
+#
+#     # 正则表达式用于检测页面标记
+#     pattern_marker = re.compile(r'\$\$index_mark_\d+\$\$')
+#
+#     # 辅助函数:查找上一个非空且非标记的段落
+#     def find_prev_text(current_index):
+#         for i in range(current_index - 1, -1, -1):
+#             text = paragraphs[i].text.strip()
+#             if text and not pattern_marker.search(text):
+#                 return text, i
+#         return '', -1
+#
+#     # 辅助函数:查找下一个非空且非标记的段落
+#     def find_next_text(current_index):
+#         for i in range(current_index + 1, len(paragraphs)):
+#             text = paragraphs[i].text.strip()
+#             if text and not pattern_marker.search(text):
+#                 return text, i
+#         return '', -1
+#
+#     while index < len(paragraphs):
+#         try:
+#             current_text = paragraphs[index].text.strip()  # 去除当前段落的前后空白
+#         except AttributeError:
+#             # 如果段落对象没有 text 属性,跳过该段落
+#             index += 1
+#             continue
+#
+#         # 检查当前段落是否为页面标记
+#         if pattern_marker.search(current_text):
+#             # 动态查找前一个非空段落
+#             prev_text, prev_index = find_prev_text(index)
+#             # 动态查找后一个非空段落
+#             next_text, next_index = find_next_text(index)
+#
+#             # 应用现有的合并逻辑
+#             if prev_text and next_text and not has_long_spaces(prev_text) and not has_long_spaces(next_text):
+#                 if not prev_text.endswith(('。', '!', '?')):  # ',', ',', 先注释了,如果逗号,可能还没结束。
+#                     # 检查后一个段落是否为列表项
+#                     if not list_item_pattern.match(next_text) and len(prev_text) > 30:
+#                         # 合并前后段落
+#                         merged_text = prev_text + ' ' + next_text  # 为了可读性添加空格
+#                         if prev_index < len(paragraphs):
+#                             # 移除 processed 中的前一个段落
+#                             if processed and processed[-1] == prev_text:
+#                                 processed.pop()
+#                             # 添加合并后的文本
+#                             processed.append(merged_text)
+#
+#                             # 跳过标记以及前后所有空白段落,直到 next_index
+#                             index = next_index + 1
+#                             continue  # 继续下一个循环
+#
+#             # 如果不满足合并条件,跳过标记及其周围的空白段落
+#             # 计算下一个需要处理的索引
+#             # 从当前 index 向下,跳过所有连续的空白段落和标记
+#             skip_index = index + 1
+#             while skip_index < len(paragraphs):
+#                 skip_text = paragraphs[skip_index].text.strip()
+#                 if skip_text == '' or pattern_marker.search(skip_text):
+#                     skip_index += 1
+#                 else:
+#                     break
+#             index = skip_index
+#             continue  # 继续下一个循环
+#
+#         # 检查当前段落是否匹配任一排除模式
+#         if (pattern_numbered.match(current_text) or pattern_parentheses.match(current_text)) and len(current_text) < 8:
+#             # 如果匹配,则跳过当前段落,不添加到processed列表中
+#             index += 1
+#             continue
+#
+#         # 如果当前段落不是标记且不匹配排除模式,则添加到处理后的列表中
+#         processed.append(current_text)
+#         index += 1
+#
+#     return processed
+
 #如果当前段落有序号,则向下匹配直接遇到相同的序号样式
 #如果当前段落无序号,则向下匹配序号,把若干同类的序号都摘出来。
 def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
@@ -224,7 +393,7 @@ def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
         # for i in range(1, 6): #同级,与matched_patterns = [pattern]重复了,故注释
         #     # 复制 parts 列表以避免修改原列表
-        #     new_parts = parts.copy()
+        #     new_parts = parts. copy()
         #     new_parts[-1] = str(int(new_parts[-1]) + i)
         #     # 使用不同的分隔符
         #     next_pattern = r'^' + r'\s*[..]\s*'.join(new_parts)
@@ -262,6 +431,9 @@ def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
         while current_index < len(processed_paragraphs) - 1:
             current_index += 1
             next_text = processed_paragraphs[current_index].strip()
+            # 添加对空白行的处理
+            if not next_text:
+                continue  # 跳过空白行,进入下一个循环
             if not found_next_number:
                 # 修改后的正则,支持 '数字 、' 格式
                 next_section_number = re.match(r'^([A-Za-z0-9]+(?:[..][A-Za-z0-9]+)*)|([((]\s*\d+\s*[))])|(\d+\s*、)',
@@ -289,7 +461,9 @@ def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
     processed_paragraphs = preprocess_paragraphs(doc.paragraphs)
     index = 0
     while index < len(processed_paragraphs):
+        # print(processed_paragraphs[index].strip())
         index = extract_from_text(processed_paragraphs[index].strip(), index)
+        # print("--------------")
         index += 1
     return extracted_paragraphs
 
@@ -664,7 +838,7 @@ def combine_find_invalid(invalid_docpath, output_dir):
     print("无效标与废标done...")
     return {"无效标与废标项": combined_dict}
 
-#TODO:无效标这里提示词修改一下
+#TODO:preprocess测试一下有无问题
 if __name__ == '__main__':
     start_time = time.time()
     # truncate_json_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp2\\竞争性谈判文件(3)_tobidders_notice_part1\\truncate_output.json"
@@ -672,12 +846,12 @@ if __name__ == '__main__':
     # clause_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\77a48c63-f39f-419b-af2a-7b3dbf41b70b\\clause1.json"
     # doc_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\磋商文件(1).docx"
     # doc_path = r'C:\Users\Administrator\Desktop\new招标文件\tmp\2024-贵州-贵州省罗甸县 2024 年度广州市协作资金龙坪镇、边阳镇产业路硬化建设项目.docx'
-    pdf_path=r'C:\Users\Administrator\Desktop\fsdownload\8a9ebd69-af0d-4661-a8ce-78136cb6bc4f\ztbfile.pdf'
+    pdf_path = r'C:\Users\Administrator\Desktop\货物标\truncate_all\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目).pdf'
 
-    output_dir = r"D:\flask_project\flask_app\static\output\output1\000aac0d-4aa4-4bc3-a9f9-76ff82ec2470\tmp"
-    # invalid_added=insert_mark(pdf_path)
-    # invalid_added_docx=pdf2docx(invalid_added)
-    invalid_added_docx=r'D:\flask_project\flask_app\static\output\output1\000aac0d-4aa4-4bc3-a9f9-76ff82ec2470\invalid_added.docx'
+    output_dir = r"C:\Users\Administrator\Desktop\货物标\truncate_all"
+    # invalid_added = insert_mark(pdf_path)
+    # invalid_added_docx = pdf2docx(invalid_added)
+    invalid_added_docx=r'C:\Users\Administrator\Desktop\货物标\truncate_all\invalid_added.docx'
     results = combine_find_invalid(invalid_added_docx, output_dir)
     end_time = time.time()
     print("Results:", json.dumps(results, ensure_ascii=False, indent=4))
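
A minimal sketch, not part of the diff above, of how the patched preprocess_paragraphs could be exercised in isolation. It assumes the module flask_app/general/无效标和废标公共代码.py is importable from the project root and that pattern_numeric_header matches plain "1.1"-style headings (its exact definition is added elsewhere in this patch and is not shown here); FakeParagraph is a hypothetical stand-in for python-docx Paragraph objects, of which only the .text attribute is used.

# Sketch only: exercises the new flag/merge logic and the AttributeError guard.
# Assumptions: the import path below resolves, and pattern_numeric_header matches
# "1.1"-style headings. FakeParagraph is hypothetical; real callers pass doc.paragraphs.
from flask_app.general.无效标和废标公共代码 import preprocess_paragraphs


class FakeParagraph:
    """Stand-in for a python-docx Paragraph: only .text is accessed."""
    def __init__(self, text):
        self.text = text


paragraphs = [
    FakeParagraph("1.1 投标人资格要求"),        # numbered heading: kept, sets the flag
    FakeParagraph("续行文字,应并入上一条。"),    # unnumbered follow-up line
    FakeParagraph("1.2 投标文件的组成"),         # next numbered heading
    FakeParagraph("$$index_mark_1$$"),           # page marker handled by the merge/skip logic
    object(),                                    # no .text attribute: skipped by the new AttributeError guard
]

for line in preprocess_paragraphs(paragraphs):
    print(line)
# Expected, if pattern_numeric_header behaves as assumed: the follow-up line is appended
# to "1.1 投标人资格要求" because the next non-empty paragraph is itself numbered.

This is intended as a quick check for the "#TODO:preprocess测试一下有无问题" item; the real entry point in the patch still goes through combine_find_invalid with a converted .docx file.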