"""Post-processing helpers for extracted technical-parameter requirements.

This is the module ``flask_app/货物标/技术参数要求提取后处理函数.py``.  The
accompanying change to ``flask_app/货物标/技术参数要求提取.py`` moves
``postprocess``, ``detect_depth`` and ``restructure_data`` here, imports
``postprocess`` and ``all_postprocess`` from this module, and makes
``get_technical_requirements`` call ``all_postprocess(final_res)`` where it
previously called ``restructure_data(final_res)``.
"""
import json
from collections import Counter

# Values that mark a key as "no further detail available".
_PLACEHOLDER_VALUES = ('/', '未知', {})


def postprocess(data):
    """Collapse dicts whose values are all placeholders into a list of keys.

    A dict is collapsed when every one of its values is '/', '未知' or ``{}``
    (an empty dict therefore collapses to ``[]``).  Otherwise the dict is
    kept and its dict-valued entries are processed recursively.  The
    top-level mapping itself is never collapsed.

    Args:
        data: Mapping to process; it is not modified.

    Returns:
        A new dict with the same top-level keys.
    """
    def convert_dict(node):
        if all(v in _PLACEHOLDER_VALUES for v in node.values()):
            return list(node.keys())
        return {
            k: convert_dict(v) if isinstance(v, dict) else v
            for k, v in node.items()
        }

    return {
        key: convert_dict(val) if isinstance(val, dict) else val
        for key, val in data.items()
    }


def _strip_prefixes_in_lists(node):
    """Apply remove_common_prefixes to every list reachable through dicts."""
    if isinstance(node, dict):
        return {k: _strip_prefixes_in_lists(v) for k, v in node.items()}
    if isinstance(node, list):
        return remove_common_prefixes(node)
    return node


def all_postprocess(data):
    """Normalise nesting, then strip shared ':' prefixes from each list.

    ``restructure_data`` may return either ``{name: [...]}`` or
    ``{system: {name: [...]}}``.  The lists are therefore located by
    descending through dicts; handing a dict straight to
    ``remove_common_prefixes`` would iterate its keys and throw the nested
    requirement lists away.

    Args:
        data: Mapping whose values are lists or dicts (see
            ``restructure_data``).

    Returns:
        A new dict with the normalised structure and cleaned lists.

    Raises:
        ValueError: Propagated from ``restructure_data`` when a top-level
            value is neither a dict nor a list.
    """
    normalized = restructure_data(data)
    return {
        key: _strip_prefixes_in_lists(value)
        for key, value in normalized.items()
    }


def detect_depth(data):
    """Return the nesting depth of ``data``.

    A dict counts as one level plus the deepest of its values (an empty dict
    has depth 1), a list is a terminal level of depth 1, and anything else
    has depth 0.
    """
    if isinstance(data, dict):
        return 1 + max((detect_depth(v) for v in data.values()), default=0)
    if isinstance(data, list):
        return 1  # Lists are considered the terminal layer
    return 0  # Base case for non-nested elements


def restructure_data(data):
    """Normalise a mapping that mixes 2-layer and 3-layer entries.

    A list value is a 2-layer entry and a dict value is a 3-layer entry.
    When every value is a list, ``data`` itself is returned.  Otherwise a
    new dict is built in which each list value is wrapped as ``{key: value}``
    so that all entries share the 3-layer shape.

    Args:
        data: Mapping whose values are lists or dicts.

    Returns:
        ``data`` itself (only lists present) or a new normalised dict.

    Raises:
        ValueError: If any value is neither a dict nor a list.
    """
    has_two_layers = False
    has_three_layers = False

    for key, value in data.items():
        if isinstance(value, dict):
            has_three_layers = True
        elif isinstance(value, list):
            has_two_layers = True
        else:
            raise ValueError(f"Unexpected data format for key '{key}': {type(value)}")

    # Only 2-layer entries: nothing to normalise.
    if has_two_layers and not has_three_layers:
        return data

    # Mixed or 3-layer only: lift every list into a single-key dict.
    return {
        key: value if isinstance(value, dict) else {key: value}
        for key, value in data.items()
    }


def get_prefixes(s):
    """Return every prefix of ``s`` that ends with ':', shortest first."""
    return [s[:i + 1] for i, ch in enumerate(s) if ch == ':']


def remove_common_prefixes(string_list):
    """Strip from each string its longest ':'-terminated shared prefix.

    A prefix is shared when at least two distinct strings in the list start
    with it.  Each string loses the longest shared prefix that still leaves
    some text behind, so an entry is never reduced to an empty string.
    Entries that are not strings are passed through unchanged.

    Args:
        string_list: Iterable of entries, normally strings.

    Returns:
        A new list with the same length and order as the input.
    """
    entries = list(string_list)
    distinct_texts = {item for item in entries if isinstance(item, str)}

    # The prefixes of one string all differ in length, so counting over the
    # distinct strings gives the number of distinct strings per prefix.
    usage = Counter(
        prefix for text in distinct_texts for prefix in get_prefixes(text)
    )
    shared = {prefix for prefix, count in usage.items() if count >= 2}

    cleaned = []
    for item in entries:
        if not isinstance(item, str):
            cleaned.append(item)
            continue
        # get_prefixes is ordered shortest-first; scan from the end to find
        # the longest usable one.
        longest = next(
            (
                prefix
                for prefix in reversed(get_prefixes(item))
                if prefix in shared and len(prefix) < len(item)
            ),
            "",
        )
        cleaned.append(item[len(longest):])
    return cleaned


if __name__ == "__main__":
    # Sample data
    sample_data = {
        "交通信号机": [
            "★应采用区域控制信号机,并应与广水市交通信号控制系统兼容,信号机能接入已有系统平台,实现联网优化功能。",
            "1、控制功能:(1)区域协调控制:可对单个孤立交叉口、干道多个交叉口和关联性较强的交叉口群进行综合性地信号控制。",
            "1、控制功能:(2)线性协调控制:可对干道多个相邻交叉口进行协调控制。",
            "1、控制功能:(3)多时段控制:可根据交叉口的交通状况,将每天划分为多个不同的时段,每个时段配置不同的控制方案,能设置至少 10个时段、10种以上不同控制方案,能根据不同周日类型对方案进行调整。信号机能够根据内置时钟选择各个时段的控制方案,实现交叉口的合理控制。",
            "2、采集功能:(1)信号机支持接入线圈、地磁、视频、微波、超声波检测器、RFID等多种检测方式。",
            "2、采集功能:(2)信号机支持交通信息采集与统计,并支持交通流量共享。",
            "3、运维功能:(1)信号机能够自动检测地磁故障,若故障,能够自动上传故障信息至监控中心。"
        ],
        "高清视频抓拍像机": [
            "1:摄像机:有效像素:≥900W像素",
            "1:摄像机:最低照度:彩色≤0.001lx",
            "1:摄像机:传感器类型:≥1英寸全局曝光 COMS/GMOS/GS COMS",
            "1:摄像机:电子快门:至少满足 1/25s至 1/100,000s,可调",
            "2:视频图像:视频压缩标准:至少支持 H264、H265等",
            "2:视频图像:视频分辨率:≥4096×2160,向下可设置",
        ],
    }
    # Process the data
    result = all_postprocess(sample_data)
    # Print the processed result
    print(json.dumps(result, ensure_ascii=False, indent=4))