11.21 解决了复杂、多层的采购需求bug

This commit is contained in:
zy123 2024-11-21 19:17:52 +08:00
parent eaf46068c8
commit 9c3b56b2f0
4 changed files with 203 additions and 127 deletions

View File

@ -47,8 +47,8 @@ def get_technical_requirements_main(file_path,file_type,unique_id,output_folder)
else: else:
return final_res return final_res
if __name__ == "__main__": if __name__ == "__main__":
file_path=r"C:\Users\Administrator\Desktop\fsdownload\39b0c3b4-1807-456c-8330-c5c7d1b7a2ca\ztbfile.pdf" file_path=r"C:\Users\Administrator\Desktop\fsdownload\db79e9e0-830e-442c-8cb6-1d036215f8ff\ztbfile.pdf"
file_type=2 file_type=2
output_folder = r"C:\Users\Administrator\Desktop\fsdownload\39b0c3b4-1807-456c-8330-c5c7d1b7a2ca\tmp" output_folder = r"C:\Users\Administrator\Desktop\fsdownload\db79e9e0-830e-442c-8cb6-1d036215f8ff\tmp"
res=get_technical_requirements_main(file_path,file_type,"123",output_folder) res=get_technical_requirements_main(file_path,file_type,"123",output_folder)
print(json.dumps(res,ensure_ascii=False,indent=4)) print(json.dumps(res,ensure_ascii=False,indent=4))

View File

@ -1,38 +1,68 @@
import json import json
import re import re
def get_original_order(data, parent_key=''):
    """Return every key path of a nested dict in depth-first insertion order.

    Each key is reported as a dotted path (``"parent.child"``); a parent's own
    path always precedes the paths of its children.  Only ``dict`` values are
    descended into; lists and scalars are treated as leaves.

    Args:
        data (dict): The (possibly nested) dictionary to walk.
        parent_key (str): Dotted path of ``data`` itself; empty for the root.

    Returns:
        list: Dotted key paths in the order the keys are stored in ``data``.

    Note:
        Paths are joined with ``"."`` without escaping, so a key that itself
        contains a dot yields an ambiguous path.
    """
    order = []
    for key, value in data.items():
        current_key = f"{parent_key}.{key}" if parent_key else key
        order.append(current_key)
        if isinstance(value, dict):
            # Children are listed right after their parent.
            order.extend(get_original_order(value, current_key))
    return order
def reorganize_dict(data, original_order):
    """Rebuild ``data`` so that its keys follow ``original_order``.

    ``original_order`` is a list of dotted key paths (as produced by
    ``get_original_order``) recorded *before* ``data`` was modified.  The
    result contains every key currently present in ``data``:

    * A path that still exists is placed at its recorded position.
    * A path whose key is gone is retried with spaces removed and a trailing
      ``-<digits>`` suffix stripped.  This is the rename that
      ``generate_key_paths`` in this file applies to a lone ``X-1`` key, so
      the renamed key keeps the position of the key it replaced.
    * A path that cannot be resolved at all is skipped without leaving an
      empty placeholder dict behind.
    * Keys of ``data`` that no path accounts for are appended after the
      ordered ones, so nothing is silently dropped.

    Nested dicts are rebuilt as new dict objects; all other values are reused
    by reference.

    Args:
        data (dict): The current (possibly renamed) nested dictionary.
        original_order (list): Dotted key paths in the desired order.

    Returns:
        dict: A new dictionary equal in content to ``data``, with keys ordered
        according to ``original_order``.
    """
    suffix_pattern = re.compile(r'-\d+$')

    def resolve_key(container, key):
        """Return the name under which ``key`` now lives in ``container``, or None."""
        if key in container:
            return key
        renamed = suffix_pattern.sub('', key.replace(' ', ''))
        return renamed if renamed in container else None

    def merge_missing(target, source):
        """Append to ``target`` whatever ``source`` holds that was not placed yet."""
        for key, value in source.items():
            if key not in target:
                target[key] = value
            elif isinstance(value, dict) and isinstance(target[key], dict):
                merge_missing(target[key], value)

    result = {}
    for path in original_order:
        *parents, leaf = path.split('.')
        source = data
        target = result
        reachable = True
        # Walk source and result in lockstep so that result only ever gains
        # keys that really exist in the source.
        for part in parents:
            name = resolve_key(source, part)
            if name is None or not isinstance(source[name], dict):
                reachable = False
                break
            source = source[name]
            child = target.get(name)
            if not isinstance(child, dict):
                child = {}
                target[name] = child
            target = child
        if not reachable:
            continue

        name = resolve_key(source, leaf)
        if name is None:
            # The key was removed outright; there is nothing to place.
            continue
        value = source[name]
        if isinstance(value, dict):
            # Reserve the slot now; children are filled in by their own paths
            # (parents precede children) or by the final merge.
            if not isinstance(target.get(name), dict):
                target[name] = {}
        else:
            target[name] = value

    merge_missing(result, data)
    return result
def generate_key_paths(data, parent_key='', good_list=None, seen=None): def generate_key_paths(data, parent_key='', good_list=None, seen=None):
""" # 保存原始顺序
生成嵌套字典中的键路径并提取最内层的键名 original_order = get_original_order(data)
同时提取特定模式的键 '交换机-1', '交换机-2'的父路径
如果同一层级下只有'交换机-1'但没有'交换机-2'则视为错误输入删除后缀'-1''交换机'加入key_paths
并从data中移除错误的键
参数:
data (dict): 输入的字典数据
parent_key (str): 上级键路径用于递归调用
good_list (list): 用于存储去重后的最内层键名
seen (set): 用于跟踪已添加到 good_list 的元素
返回:
tuple: 包含键路径列表最内层键名列表分组路径列表以及 no_keys_added 的元组
(key_paths, good_list, grouped_paths, no_keys_added)
"""
if good_list is None: if good_list is None:
good_list = [] good_list = []
if seen is None: if seen is None:
seen = set() seen = set()
key_paths = [] key_paths = []
grouped_paths = set() # 使用集合避免重复路径 grouped_paths = set()
no_keys_added = True # 默认假设没有添加任何键 no_keys_added = True
# Step 1: Collect keys that match the pattern # Step 1: Collect keys that match the pattern
pattern = re.compile(r'(.+)-\d+$') # 匹配形如 '交换机-1', '交换机-2' 的键 pattern = re.compile(r'(.+)-\d+$')
prefix_groups = {} prefix_groups = {}
other_keys = [] other_keys = []
for key in list(data.keys()): # 使用 list(data.keys()) 防止修改字典时出错 for key in list(data.keys()):
clean_key = key.replace(" ", "") clean_key = key.replace(" ", "")
match = pattern.match(clean_key) match = pattern.match(clean_key)
if match: if match:
@ -47,101 +77,133 @@ def generate_key_paths(data, parent_key='', good_list=None, seen=None):
for prefix, keys in prefix_groups.items(): for prefix, keys in prefix_groups.items():
current_prefix_path = f"{parent_key}.{prefix}" if parent_key else prefix current_prefix_path = f"{parent_key}.{prefix}" if parent_key else prefix
if len(keys) > 1: if len(keys) > 1:
# 多个键匹配同一前缀:添加到 grouped_paths
grouped_paths.add(current_prefix_path) grouped_paths.add(current_prefix_path)
if prefix not in seen: if prefix not in seen:
good_list.append(prefix) good_list.append(prefix)
seen.add(prefix) seen.add(prefix)
no_keys_added = False no_keys_added = False
else: else:
# 只有一个键匹配:删除后缀并添加到 key_paths同时从 data 中移除该键 old_key = keys[0]
key = keys[0] new_key = prefix
key_path = current_prefix_path # 去掉后缀后,路径为父路径 + 前缀 value = data[old_key]
data[new_key] = value
del data[old_key]
key_path = f"{parent_key}.{new_key}" if parent_key else new_key
key_paths.append(key_path) key_paths.append(key_path)
if prefix not in seen: if prefix not in seen:
good_list.append(prefix) good_list.append(prefix)
seen.add(prefix) seen.add(prefix)
no_keys_added = False no_keys_added = False
# 从 data 中移除错误的键
data.pop(key)
# Step 3: Handle other keys # Step 3: Handle other keys (递归处理其他键)
for key in other_keys: for key in other_keys:
value = data[key] value = data[key]
current_key = f"{parent_key}.{key}" if parent_key else key current_key = f"{parent_key}.{key}" if parent_key else key
if isinstance(value, dict): if isinstance(value, dict):
if value: if value:
# 递归调用,并获取子路径、子 good_list、子分组路径以及子 no_keys_added
sub_key_paths, _, sub_grouped_paths, sub_no_keys_added = generate_key_paths( sub_key_paths, _, sub_grouped_paths, sub_no_keys_added = generate_key_paths(
value, current_key, good_list, seen value, current_key, good_list, seen
) )
key_paths.extend(sub_key_paths) key_paths.extend(sub_key_paths)
grouped_paths.update(sub_grouped_paths) # 合并子分组路径到当前分组路径 grouped_paths.update(sub_grouped_paths)
# 更新 no_keys_added
no_keys_added = no_keys_added and sub_no_keys_added no_keys_added = no_keys_added and sub_no_keys_added
else: else:
# 空字典视为叶子节点
clean_key = key.replace(" ", "") clean_key = key.replace(" ", "")
key_paths.append(current_key.replace(" ", "")) key_paths.append(current_key.replace(" ", ""))
if clean_key not in seen: if clean_key not in seen:
good_list.append(clean_key) # 去掉空格后添加 good_list.append(clean_key)
seen.add(clean_key) seen.add(clean_key)
# 更新 no_keys_added
no_keys_added = False no_keys_added = False
elif isinstance(value, list):
# 列表类型视为叶子节点,无论是否为空
key_paths.append(current_key.replace(" ", ""))
clean_key = key.replace(" ", "")
if clean_key not in seen:
good_list.append(clean_key) # 去掉空格后添加
seen.add(clean_key)
# 更新 no_keys_added
no_keys_added = False
elif value in {"未知", "", "/"}:
# 特定值视为叶子节点
key_paths.append(current_key.replace(" ", ""))
clean_key = key.replace(" ", "")
if clean_key not in seen:
good_list.append(clean_key) # 去掉空格后添加
seen.add(clean_key)
# 更新 no_keys_added
no_keys_added = False
else: else:
# 其他情况视为叶子节点
key_paths.append(current_key.replace(" ", ""))
clean_key = key.replace(" ", "") clean_key = key.replace(" ", "")
key_paths.append(current_key.replace(" ", ""))
if clean_key not in seen: if clean_key not in seen:
good_list.append(clean_key) # 去掉空格后添加 good_list.append(clean_key)
seen.add(clean_key) seen.add(clean_key)
# 更新 no_keys_added
no_keys_added = False no_keys_added = False
# Step 4: 删除 key_paths 中包含在 grouped_paths 中的元素
key_paths = [path for path in key_paths if path not in grouped_paths]
# 根据原始顺序重新组织字典
reorganized_data = reorganize_dict(data, original_order)
data.clear()
data.update(reorganized_data)
return key_paths, good_list, grouped_paths, no_keys_added return key_paths, good_list, grouped_paths, no_keys_added
# 示例使用 # 示例使用
data1 = { data1 = {
"采购需求": { "采购需求": {
"高清数字枪机-1": [], "多媒体会议厅设备": {
"枪机支架-1": [], "LED屏显示设备": {
"高清数字半球机-1": [], "户内全彩LED屏全彩高刷": [],
"网络硬盘录像机-1": [], "发送盒": [],
"监硬控硬盘-1": [], "LED显示屏控制系统": [],
"交换机-1": [], "视频处理器": [],
"交换机-2": [], "智能配电柜": [],
"监视器-1": [], "台式电脑": [],
"电源线-1": [], "控制桌(定制)": []
"网线-1": [], },
"水晶头-1": [], "LED显示屏施工材料、技术服务费、包装费": {
"PVC线槽-1": [], "结构边框": [],
"辅料-1": [], "线材(按需)": [],
"安装调试-1": [], "包装材料": []
"视频图像采集及存储系统": { },
"系统功能": [] "扩声系统": {
"主扩全频专业音箱": [],
"专业功放-1": [],
"专业功放-2": [],
"辅助专业音箱": [],
"壁挂支架": [],
"返听专业音箱": [],
"返听专业功放": [],
"超低频专业音箱": [],
"音箱地插": [],
"调音台": [],
"音频处理器": [],
"抑制器": [],
"无线话筒": [],
"话筒呼叫控制嵌入软件": [],
"天线分配器-1": [],
"话筒天线": [],
"有源监听音箱": [],
"电源时序器": []
},
"辅助材料": {
"机柜": [],
"音频连接线-1": [],
"音频连接线-2": [],
"音频连接线-3": [],
"其它辅材(音箱线、电源线、网线、视频线等)": []
},
"会议座椅": {
"会议座椅": []
},
"电动窗帘": {
"电动窗帘系统": []
}
},
"云平台及备课电脑": {
"备课一体机电脑": [],
"云平台管理软件": [],
"教学互动应用软件": []
},
"办公桌椅": {
"办公桌椅": []
},
"文件柜": {
"文件柜": []
},
"体育运动器材": {
"移动式标准篮球架12座": []
} }
} }
} }
key_paths, good_list, grouped_paths, no_keys_added = generate_key_paths(data1) key_paths, good_list, grouped_paths, no_keys_added = generate_key_paths(data1)
print(json.dumps(data1,ensure_ascii=False,indent=4)) print(json.dumps(data1,ensure_ascii=False,indent=4))
print(key_paths)
print(grouped_paths)

View File

@ -16,62 +16,76 @@ def get_suffix(n):
return suffix return suffix
def extract_matching_keys(data, good_list, special_keys=None, parent_key=''):
    """Flatten a nested dict, keeping only list-valued keys named in ``good_list``.

    The dictionary is walked recursively through ``dict`` values.  A key whose
    value is a ``list`` is kept when it equals an entry of ``good_list``,
    optionally followed by ``-<digits>``.  Kept keys are written into a single
    flat result dict under these names:

    * a key listed in ``special_keys`` that has at least one ancestor is
      stored as the concatenation of all ancestor keys plus the key itself;
    * a non-special key that is kept more than once gets ``-a``, ``-b``, ...
      appended in encounter order so the flat names stay unique;
    * any other key is stored unchanged.

    Duplicates are counted over exactly the keys that are emitted (list-valued
    keys reached through dicts).  A container key that merely shares its name
    with a leaf, e.g. ``{"X": {"X": [...]}}``, therefore does not force a
    suffix onto the single emitted ``"X"``.

    Args:
        data (dict): Nested input dictionary; a non-dict yields ``{}``.
        good_list (list): Key names to keep.
        special_keys (list, optional): Keys renamed with their ancestor path.
        parent_key (str): Prefix treated as the ancestor path of the root.

    Returns:
        dict: Flat mapping from the generated key names to the original lists.
    """
    if special_keys is None:
        special_keys = []

    patterns = [re.compile(r'^' + re.escape(g) + r'(?:-\d+)?$') for g in good_list]

    def is_wanted(key):
        return any(pattern.match(key) for pattern in patterns)

    def letter_suffix(n):
        """Map 1, 2, ..., 26, 27 to 'a', 'b', ..., 'z', 'aa' (bijective base 26)."""
        suffix = ''
        while n > 0:
            n, r = divmod(n - 1, 26)
            suffix = chr(97 + r) + suffix
        return suffix

    def count_emitted(node, counter):
        """Count non-special keys that the collection pass below will emit."""
        if not isinstance(node, dict):
            return counter
        for key, value in node.items():
            if isinstance(value, dict):
                count_emitted(value, counter)
            elif isinstance(value, list) and key not in special_keys and is_wanted(key):
                counter[key] += 1
        return counter

    key_counter = count_emitted(data, defaultdict(int))
    # Number of suffixes already handed out per duplicated key.
    suffix_map = {key: 0 for key, count in key_counter.items() if count > 1}
    filtered_data = {}

    def output_name(key, prefix):
        if key in special_keys and prefix:
            return f"{prefix}{key}"
        if key in suffix_map:
            suffix_map[key] += 1
            return f"{key}-{letter_suffix(suffix_map[key])}"
        return key

    def collect(node, prefix):
        if not isinstance(node, dict):
            return
        for key, value in node.items():
            if isinstance(value, list):
                # Lists are the innermost payload: keep them if the key is wanted.
                if is_wanted(key):
                    filtered_data[output_name(key, prefix)] = value
            elif isinstance(value, dict):
                collect(value, f"{prefix}{key}")

    collect(data, parent_key)
    return filtered_data
@ -116,11 +130,11 @@ def detect_depth(data):
def restructure_data(data): def restructure_data(data):
""" """
Restructure data to normalize levels of nesting. 重构数据以标准化嵌套层级
If depth is 2 throughout, return as-is. 如果整个数据都是2层结构则保持原样返回
If both 2 and 3 levels exist, restructure to align to 3-layer format. 如果同时存在2层和3层结构则将所有数据重构为3层格式
""" """
# Check if data contains mixed 2-layer and 3-layer structures # 检查数据是否包含混合的2层和3层结构
has_two_layers = False has_two_layers = False
has_three_layers = False has_three_layers = False
@ -130,20 +144,20 @@ def restructure_data(data):
elif isinstance(value, list): elif isinstance(value, list):
has_two_layers = True has_two_layers = True
else: else:
raise ValueError(f"Unexpected data format for key '{key}': {type(value)}") raise ValueError(f"'{key}'的数据格式异常: {type(value)}")
# If only 2-layer, return as-is # 如果只有2层结构直接返回原数据
if has_two_layers and not has_three_layers: if has_two_layers and not has_three_layers:
return data return data
# If mixed or only 3-layer, normalize to 3-layer # 如果是混合结构或仅有3层结构统一标准化为3层
structured_data = {} structured_data = {}
for key, value in data.items(): for key, value in data.items():
if isinstance(value, dict): if isinstance(value, dict):
# Already a 3-layer structure, keep as is # 已经是3层结构保持不变
structured_data[key] = value structured_data[key] = value
elif isinstance(value, list): elif isinstance(value, list):
# Convert 2-layer structure to 3-layer # 将2层结构转换为3层结构
structured_data[key] = {key: value} structured_data[key] = {key: value}
return structured_data return structured_data

View File

@ -242,7 +242,7 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
#TODO:写个脚本确保技术参数没有嵌套 #TODO:写个脚本确保技术参数没有嵌套
#TODO: start up 结构优化 #TODO: start up 结构优化
#TODO:C:\Users\Administrator\Desktop\fsdownload\16fd6b4e-3975-4c83-8ba6-1bc9263a6a5b 符合性审查未找到
# 小解析也更新偏离表 # 小解析也更新偏离表
#TODO 体育器材 符合性检查的外键 采购需求 #TODO 体育器材 符合性检查的外键 采购需求