11.6修复bug

This commit is contained in:
zy123 2024-11-06 17:29:45 +08:00
parent c0ba74a374
commit 21eeb87903
5 changed files with 134 additions and 72 deletions

View File

@ -34,7 +34,7 @@ def preprocess_data(data):
# 转换结构化的JSON数据 # 转换结构化的JSON数据
#No parent found at level 1 for key '24.2'. Check the data structure. #生成结构化的数据
def transform_json(data): def transform_json(data):
result = {} result = {}
temp = {0: result} # 初始化根字典 temp = {0: result} # 初始化根字典
@ -183,19 +183,35 @@ def process_nested_data(data):
# 到达最内层,处理非字典和非列表的元素(字符串) # 到达最内层,处理非字典和非列表的元素(字符串)
return post_process(data) return post_process(data)
def get_requirements_with_gpt(invalid_path, selection): #生成无结构的数据
def concatenate_keys_values(section_content):
    """
    Flatten one section's key/value pairs into a list of strings.

    Every entry of the mapping becomes a single "key value" string (the key
    and the value joined by one space), in the mapping's iteration order.

    Args:
        section_content (dict): Key/value pairs of a section.

    Returns:
        list of str: One "key value" string per entry.
    """
    return [f"{heading} {body}" for heading, body in section_content.items()]
def get_requirements_with_gpt(merged_baseinfo_path, selection):
""" """
根据 selection 的值选择相应的用户查询并调用大模型获取要求 根据 selection 的值选择相应的用户查询并调用大模型获取要求
Args: Args:
invalid_path (str): 无效文件的路径用于上传 merged_baseinfo_path (str): 无效文件的路径用于上传
selection (int): 选择的类型12 3 selection (int): 选择的类型12 3
Returns: Returns:
dict: 大模型返回的要求结果或错误信息 dict: 大模型返回的要求结果或错误信息
""" """
# 上传文件并获取 file_id # 上传文件并获取 file_id
file_id = upload_file(invalid_path) file_id = upload_file(merged_baseinfo_path)
# 定义 selection 对应的用户查询 # 定义 selection 对应的用户查询
user_queries = { user_queries = {
# 1: """ # 1: """

View File

@ -95,8 +95,8 @@ def extract_text_by_page(file_path):
if __name__ == '__main__': if __name__ == '__main__':
file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf' # file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目_tobidders_notice_part2.pdf' file_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\54d7aa40-8b36-4803-b6f7-278356ff1381\\ztbfile_tobidders_notice_part2.pdf'
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf' # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf'
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf' # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
# file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf" # file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"

View File

@ -2,7 +2,7 @@ import json
import re import re
from functools import cmp_to_key from functools import cmp_to_key
from flask_app.general.投标人须知正文提取指定内容 import process_nested_data, transform_json, get_requirements_with_gpt from flask_app.general.投标人须知正文提取指定内容 import process_nested_data, transform_json, get_requirements_with_gpt,concatenate_keys_values
#提取两个大标题之间的内容 #提取两个大标题之间的内容
@ -103,19 +103,26 @@ def extract_from_notice(merged_baseinfo_path,clause_path, type):
# 提取目标部分 # 提取目标部分
extracted_data = extract_between_sections(data, target_values) # 读取json截取大标题之间的内容 extracted_data = extract_between_sections(data, target_values) # 读取json截取大标题之间的内容
transformed_data = process_with_outer_key(extracted_data) if not extracted_data:
final_result = process_nested_data(transformed_data)
if not final_result:
final_result = get_requirements_with_gpt(merged_baseinfo_path, type) #万一没用正则匹配到,那就调用大模型 final_result = get_requirements_with_gpt(merged_baseinfo_path, type) #万一没用正则匹配到,那就调用大模型
return final_result return final_result
# print(json.dumps(extracted_data,ensure_ascii=False,indent=4))
extracted_data_concatenated = {section: concatenate_keys_values(content)
for section, content in extracted_data.items()}
return extracted_data_concatenated
# transformed_data = process_with_outer_key(extracted_data)
# final_result = process_nested_data(transformed_data)
# return final_result
except Exception as e: except Exception as e:
print(f"Error in extract_from_notice: {e}") print(f"Error in extract_from_notice: {e}")
return DEFAULT_RESULT return DEFAULT_RESULT
if __name__ == "__main__": if __name__ == "__main__":
clause_path = 'D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\clause1.json' clause_path = r'C:\Users\Administrator\Desktop\fsdownload\a110ed59-00e8-47ec-873a-bd4579a6e628\\clause1.json'
merged_baseinfo_path=r"D:\flask_project\flask_app\static\output\output1\648e094b-e677-47ce-9073-09e0c82af210\ztbfile_merged_baseinfo.pdf" merged_baseinfo_path=r"C:\Users\Administrator\Desktop\fsdownload\a110ed59-00e8-47ec-873a-bd4579a6e628\ztbfile_merged_baseinfo.pdf"
# file_path = 'D:\\flask_project\\flask_app\\static\\output\\fee18877-0c60-4c28-911f-9a5f7d1325a7\\clause1.json' # file_path = 'D:\\flask_project\\flask_app\\static\\output\\fee18877-0c60-4c28-911f-9a5f7d1325a7\\clause1.json'
try: try:
res = extract_from_notice(merged_baseinfo_path,clause_path, 1) # 可以改变此处的 type 参数测试不同的场景 res = extract_from_notice(merged_baseinfo_path,clause_path, 1) # 可以改变此处的 type 参数测试不同的场景

View File

@ -105,12 +105,14 @@ def should_add_newline(content, keywords, max_length=20):
content_str = ''.join(content).strip() content_str = ''.join(content).strip()
return any(keyword in content_str for keyword in keywords) or len(content_str) <= max_length return any(keyword in content_str for keyword in keywords) or len(content_str) <= max_length
def handle_content_append(current_content, line_content, append_newline, keywords): def handle_content_append(current_content, line_content, append_newline, keywords,in_special_section):
if append_newline: if append_newline:
if should_add_newline(current_content, keywords): if should_add_newline(current_content, keywords):
current_content.append('\n') # 添加换行符 current_content.append('\n') # 添加换行符
append_newline = False append_newline = False
current_content.append(line_content) current_content.append(line_content)
if in_special_section:
current_content.append('\n')
return append_newline return append_newline
""" """
@ -134,7 +136,11 @@ def parse_text_by_heading(text):
skip_subheadings = False skip_subheadings = False
last_main_number = None last_main_number = None
temp_title = None # 临时存储以点号开头但不带数字的标题 temp_title = None # 临时存储以点号开头但不带数字的标题
pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')
pattern_parentheses = re.compile(r'^\s*[(]\s*([一二三四五六七八九十]{1,2})\s*[)]\s*')
initial_heading_pattern = None
special_section_keywords = ['文件的组成', '文件的构成'] # 定义特殊章节关键词
in_special_section = False # 标志是否在特殊章节中
lines = text.split('\n') lines = text.split('\n')
def check_year_pattern(line): def check_year_pattern(line):
@ -150,11 +156,22 @@ def parse_text_by_heading(text):
current_content.append(line_stripped) current_content.append(line_stripped)
continue continue
# **首先检查是否进入特殊章节**
if any(keyword in line_stripped for keyword in special_section_keywords):
in_special_section = True
# 匹配带数字的标题,例如 '12.1 内容' # 匹配带数字的标题,例如 '12.1 内容'
match = re.match(r'^(?<![a-zA-Z(])(\d+(?:\.\d+)+)\s*(.*)', line_stripped) match = re.match(r'^(?<![a-zA-Z(])(\d+(?:\.\d+)+)\s*(.*)', line_stripped)
if not match: if not match:
match = re.match(r'^(\d+\.)\s*(.+)$', line_stripped) match = re.match(r'^(\d+\.)\s*(.+)$', line_stripped)
# 检查是否退出特殊章节
if in_special_section:
# 如果匹配到了主要标题,且不包含特殊关键词,则退出特殊章节
if match and not any(keyword in line_stripped for keyword in special_section_keywords):
in_special_section = False
# 以下是原有的匹配逻辑
# 匹配以点号开头并带有数字的情况,例如 '.12.1 内容' # 匹配以点号开头并带有数字的情况,例如 '.12.1 内容'
dot_match = re.match(r'^[.](\d+(?:[.]\d+)*)\s*(.+)$', line_stripped) dot_match = re.match(r'^[.](\d+(?:[.]\d+)*)\s*(.+)$', line_stripped)
@ -197,7 +214,7 @@ def parse_text_by_heading(text):
append_newline = len(new_key.rstrip('.').split('.')) <= 2 append_newline = len(new_key.rstrip('.').split('.')) <= 2
last_main_number = new_key.split('.')[0] last_main_number = new_key.split('.')[0]
else: else:
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords) append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section)
elif dot_match: elif dot_match:
# 处理以点号开头并带有数字的情况 # 处理以点号开头并带有数字的情况
@ -212,14 +229,15 @@ def parse_text_by_heading(text):
current_content = [line_content] current_content = [line_content]
append_newline = True append_newline = True
else: else:
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords) append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section)
elif dot_text_match: elif dot_text_match:
# 处理以点号开头但不带数字的情况,存储到临时变量 # 处理以点号开头但不带数字的情况,存储到临时变量
temp_title = dot_text_match.group(1).strip() temp_title = dot_text_match.group(1).strip()
continue # 跳过进一步处理该行 continue # 跳过进一步处理该行
elif pure_number_match: # 处理不带点号的纯数字开头的情况,例如 '27xxxxx' elif pure_number_match:
# 处理不带点号的纯数字开头的情况
new_key_candidate, line_content = pure_number_match.groups() new_key_candidate, line_content = pure_number_match.groups()
new_key_candidate += '.' # 添加点号 new_key_candidate += '.' # 添加点号
line_content = line_content.lstrip('..、,') line_content = line_content.lstrip('..、,')
@ -247,69 +265,86 @@ def parse_text_by_heading(text):
current_content = [line_content] current_content = [line_content]
append_newline = True append_newline = True
last_main_number = new_key_candidate.rstrip('.') last_main_number = new_key_candidate.rstrip('.')
# if current_key is None or (current_key != new_key and ( #不给序号排序
# len(current_content) == 0 or current_content[-1][-1] != '第')):
# if current_key is not None:
# content_string = ''.join(current_content).strip()
# data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
# current_key = new_key
# current_content = [line_content]
# append_newline = len(new_key.rstrip('.').split('.')) <= 2
# last_main_number = new_key.split('.')[0]
else: else:
# 将当前行视为当前标题的内容 # 将当前行视为当前标题的内容
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords) append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section)
else: else:
if not skip_subheadings: # 合并中文标题、字母标题、阿拉伯数字标题的匹配逻辑 if not skip_subheadings and not in_special_section: # 增加 in_special_section 判断
pattern_title = re.compile( numbered_match = pattern_numbered.match(line_stripped)
r'^\s*(?:[(]\s*([一二三四五六七八九十]{1,2})\s*[)]\s*|([一二三四五六七八九十]{1,2})\s*、\s*)') parenthesis_match = pattern_parentheses.match(line_stripped)
chinese_match = pattern_title.match(line_stripped)
letter_match = re.match(r'^([A-Z])\.\s*(.*)$', line_stripped) letter_match = re.match(r'^([A-Z])\.\s*(.*)$', line_stripped)
arabic_match = re.match(r'^(\d+、)\s*(.+)$', line_stripped) arabic_match = re.match(r'^(\d+、)\s*(.+)$', line_stripped)
# 优化处理逻辑,减少冗余 # 判断当前行是否匹配了任何标题模式
if chinese_match or letter_match or arabic_match: if numbered_match or parenthesis_match or letter_match or arabic_match:
# 保存之前的 key 的内容(无论是中文标题还是阿拉伯数字标题) # 如果初始标题模式尚未设置,则记录当前匹配的标题模式
if current_key is not None: if initial_heading_pattern is None:
content_string = ''.join(current_content).strip() if numbered_match:
data[current_key] = data.get(current_key, '') + content_string.replace(' ', '') initial_heading_pattern = 'numbered'
if current_key_chinese is not None: elif parenthesis_match:
data[current_key_chinese] = current_value_chinese initial_heading_pattern = 'parentheses'
current_key_chinese = None elif letter_match:
initial_heading_pattern = 'letter'
elif arabic_match:
initial_heading_pattern = 'arabic'
# 处理中文标题 # 确定当前匹配的标题模式
if chinese_match: if numbered_match:
current_key_chinese = chinese_match.group(1) or chinese_match.group(2) current_heading_pattern = 'numbered'
current_value_chinese = line_stripped[chinese_match.end():].lstrip('..、,').replace(' ', '') elif parenthesis_match:
if current_key_chinese in data: current_heading_pattern = 'parentheses'
handle_content_append(current_content, '\n' + line_stripped, append_newline, keywords)
current_key_chinese = None
# 处理字母标题
elif letter_match: elif letter_match:
letter_key, letter_value = letter_match.groups() current_heading_pattern = 'letter'
letter_to_chinese = {
'A': '', 'B': '', 'C': '', 'D': '', 'E': '',
'F': '', 'G': '', 'H': '', 'I': '', 'J': '',
'K': '十一', 'L': '十二', 'M': '十三', 'N': '十四', 'O': '十五'
}
current_key_chinese = letter_to_chinese.get(letter_key, letter_key)
current_value_chinese = letter_value.lstrip('..、,').replace(' ', '')
if current_key_chinese in data:
handle_content_append(current_content, '\n' + line_stripped, append_newline, keywords)
current_key_chinese = None
# 处理阿拉伯数字标题
elif arabic_match: elif arabic_match:
arabic_key, arabic_value = arabic_match.groups() current_heading_pattern = 'arabic'
current_key = arabic_key.replace('', '.')
current_content = [arabic_value]
append_newline = True
last_main_number = current_key.rstrip('.')
continue
# 如果没有匹配标题,继续处理正文内容 # 如果当前标题模式与初始标题模式一致,创建新的键值对
if line_stripped: if current_heading_pattern == initial_heading_pattern:
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords) # 保存之前的 key 的内容
if current_key is not None:
content_string = ''.join(current_content).strip()
data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
if current_key_chinese is not None:
data[current_key_chinese] = current_value_chinese
current_key_chinese = None
# 处理匹配到的标题
if current_heading_pattern == 'numbered':
current_key_chinese = numbered_match.group(1)
current_value_chinese = line_stripped[numbered_match.end():].lstrip('..、,').replace(' ', '')
elif current_heading_pattern == 'parentheses':
current_key_chinese = parenthesis_match.group(1)
current_value_chinese = line_stripped[parenthesis_match.end():].lstrip('..、,').replace(' ', '')
elif current_heading_pattern == 'letter':
# 字母标题处理
letter_key, letter_value = letter_match.groups()
letter_to_chinese = {
'A': '', 'B': '', 'C': '', 'D': '', 'E': '',
'F': '', 'G': '', 'H': '', 'I': '', 'J': '',
'K': '十一', 'L': '十二', 'M': '十三', 'N': '十四', 'O': '十五'
}
current_key_chinese = letter_to_chinese.get(letter_key, letter_key)
current_value_chinese = letter_value.lstrip('..、,').replace(' ', '')
elif current_heading_pattern == 'arabic':
arabic_key, arabic_value = arabic_match.groups()
current_key = arabic_key.replace('', '.')
current_content = [arabic_value]
append_newline = True
last_main_number = current_key.rstrip('.')
continue
else:
# 当前标题模式与初始模式不一致,将该行视为内容
if line_stripped:
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section)
else:
# 未匹配到任何标题模式,将该行视为内容
if line_stripped:
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section)
else:
# 在特殊章节中,所有内容都作为当前标题的内容
if line_stripped:
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section)
# 最后保存最后一个 key 对应的内容 # 最后保存最后一个 key 对应的内容
if current_key is not None: if current_key is not None:
@ -448,11 +483,12 @@ def process_folder(input_folder, output_folder):
#TODO:'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\广水农商行门禁控制主机及基础验证设备采购项目——磋商文件定稿三次_tobidders_notice_part2.pdf' PYPDF2库读取有遗漏 #TODO:'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\广水农商行门禁控制主机及基础验证设备采购项目——磋商文件定稿三次_tobidders_notice_part2.pdf' PYPDF2库读取有遗漏
#TODO: 投标人须知正文这块,序号可能是乱序的,或许可以删除判断序号大小的逻辑,只要出现在开头的序号就作为新的键 eg:2-招标文件。目前将这种情况当特殊处理 #TODO: 投标人须知正文这块,序号可能是乱序的,或许可以删除判断序号大小的逻辑,只要出现在开头的序号就作为新的键 eg:2-招标文件。目前将这种情况当特殊处理
#TODO:11.6 目前 '文件的组成' 是匹配任意行,可以考虑只匹配'11.文件的组成' 前面有序号的行 a110ed59-00e8-47ec-873a-bd4579a6e628\ztbfile_tobidders_notice_part2.pdf还有问题
if __name__ == "__main__": if __name__ == "__main__":
# file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf' # file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf' file_path=r'C:\Users\Administrator\Desktop\fsdownload\a110ed59-00e8-47ec-873a-bd4579a6e628\ztbfile_tobidders_notice_part2.pdf'
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目_tobidders_notice_part2.pdf' # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目_tobidders_notice_part2.pdf'
output_folder = 'D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\tmp' output_folder = r'C:\Users\Administrator\Desktop\fsdownload\a110ed59-00e8-47ec-873a-bd4579a6e628\\tmp'
try: try:
output_path = convert_clause_to_json(file_path,output_folder,1) output_path = convert_clause_to_json(file_path,output_folder,1)
print(f"Final JSON result saved to: {output_path}") print(f"Final JSON result saved to: {output_path}")

View File

@ -243,6 +243,9 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
#广水市 2022 年义务教育学校多媒体补充采购项目 资格审查有问题 #广水市 2022 年义务教育学校多媒体补充采购项目 资格审查有问题
#TODO: 目前跳转可能有个问题,资格审查那边:既有原来的内容又有跳转后的内容;符合本采购文件第一章第二款要求,并提供合格有效的证明材料<br>1、满足《中华人民共和国政府采购法》第二十二条规定<br>1具有独立承担 #TODO: 目前跳转可能有个问题,资格审查那边:既有原来的内容又有跳转后的内容;符合本采购文件第一章第二款要求,并提供合格有效的证明材料<br>1、满足《中华人民共和国政府采购法》第二十二条规定<br>1具有独立承担
#TODO:资格审查默认加上第一章的内容, 资格审查章节不做跳转逻辑.
#TODO:开评定标那块,不做层次遍历 ing
#TODO:技术要求提取更全面一点 fsdownload\a110ed59-00e8-47ec-873a-bd4579a6e628
#good_list 金额 截取上下文 #good_list 金额 截取上下文
if __name__ == "__main__": if __name__ == "__main__":
# 配置日志器 # 配置日志器