diff --git a/flask_app/general/纯技术参数要求提取.py b/flask_app/general/纯技术参数要求提取.py index 33f4c68..5123512 100644 --- a/flask_app/general/纯技术参数要求提取.py +++ b/flask_app/general/纯技术参数要求提取.py @@ -47,8 +47,8 @@ def get_technical_requirements_main(file_path,file_type,unique_id,output_folder) else: return final_res if __name__ == "__main__": - file_path=r"C:\Users\Administrator\Desktop\fsdownload\39b0c3b4-1807-456c-8330-c5c7d1b7a2ca\ztbfile.pdf" + file_path=r"C:\Users\Administrator\Desktop\fsdownload\db79e9e0-830e-442c-8cb6-1d036215f8ff\ztbfile.pdf" file_type=2 - output_folder = r"C:\Users\Administrator\Desktop\fsdownload\39b0c3b4-1807-456c-8330-c5c7d1b7a2ca\tmp" + output_folder = r"C:\Users\Administrator\Desktop\fsdownload\db79e9e0-830e-442c-8cb6-1d036215f8ff\tmp" res=get_technical_requirements_main(file_path,file_type,"123",output_folder) print(json.dumps(res,ensure_ascii=False,indent=4)) diff --git a/flask_app/testdir/test3.py b/flask_app/testdir/test3.py index 8d60938..5ac3c55 100644 --- a/flask_app/testdir/test3.py +++ b/flask_app/testdir/test3.py @@ -1,38 +1,68 @@ import json import re + +def get_original_order(data, parent_key=''): + """获取原始字典中所有键的顺序""" + order = [] + for key, value in data.items(): + current_key = f"{parent_key}.{key}" if parent_key else key + order.append(current_key) + if isinstance(value, dict): + order.extend(get_original_order(value, current_key)) + return order + + +def reorganize_dict(data, original_order): + """根据原始顺序重新组织字典""" + + def get_dict_by_path(d, path): + parts = path.split('.') + current = d + for part in parts[:-1]: + current = current[part] + return current + + result = {} + # 按原始顺序重建字典结构 + for path in original_order: + parts = path.split('.') + current = result + for i, part in enumerate(parts[:-1]): + if part not in current: + current[part] = {} + current = current[part] + + # 获取原始字典中对应路径的值 + try: + source_dict = get_dict_by_path(data, path) + current[parts[-1]] = source_dict[parts[-1]] + except KeyError: + # 如果键不存在(可能是被重命名的键),跳过 + continue + + return result + + def generate_key_paths(data, parent_key='', good_list=None, seen=None): - """ - 生成嵌套字典中的键路径,并提取最内层的键名。 - 同时,提取特定模式的键(如 '交换机-1', '交换机-2')的父路径。 - 如果同一层级下只有'交换机-1'但没有'交换机-2',则视为错误输入,删除后缀'-1',将'交换机'加入key_paths。 - 并从data中移除错误的键。 + # 保存原始顺序 + original_order = get_original_order(data) - 参数: - data (dict): 输入的字典数据 - parent_key (str): 上级键路径,用于递归调用 - good_list (list): 用于存储去重后的最内层键名 - seen (set): 用于跟踪已添加到 good_list 的元素 - - 返回: - tuple: 包含键路径列表、最内层键名列表、分组路径列表以及 no_keys_added 的元组 - (key_paths, good_list, grouped_paths, no_keys_added) - """ if good_list is None: good_list = [] if seen is None: seen = set() key_paths = [] - grouped_paths = set() # 使用集合避免重复路径 - no_keys_added = True # 默认假设没有添加任何键 + grouped_paths = set() + no_keys_added = True # Step 1: Collect keys that match the pattern - pattern = re.compile(r'(.+)-\d+$') # 匹配形如 '交换机-1', '交换机-2' 的键 + pattern = re.compile(r'(.+)-\d+$') prefix_groups = {} other_keys = [] - for key in list(data.keys()): # 使用 list(data.keys()) 防止修改字典时出错 + for key in list(data.keys()): clean_key = key.replace(" ", "") match = pattern.match(clean_key) if match: @@ -47,101 +77,133 @@ def generate_key_paths(data, parent_key='', good_list=None, seen=None): for prefix, keys in prefix_groups.items(): current_prefix_path = f"{parent_key}.{prefix}" if parent_key else prefix if len(keys) > 1: - # 多个键匹配同一前缀:添加到 grouped_paths grouped_paths.add(current_prefix_path) if prefix not in seen: good_list.append(prefix) seen.add(prefix) no_keys_added = False else: - # 只有一个键匹配:删除后缀并添加到 key_paths,同时从 data 中移除该键 - key = keys[0] - key_path = current_prefix_path # 去掉后缀后,路径为父路径 + 前缀 + old_key = keys[0] + new_key = prefix + value = data[old_key] + data[new_key] = value + del data[old_key] + + key_path = f"{parent_key}.{new_key}" if parent_key else new_key key_paths.append(key_path) if prefix not in seen: good_list.append(prefix) seen.add(prefix) no_keys_added = False - # 从 data 中移除错误的键 - data.pop(key) - # Step 3: Handle other keys + # Step 3: Handle other keys (递归处理其他键) for key in other_keys: value = data[key] current_key = f"{parent_key}.{key}" if parent_key else key if isinstance(value, dict): if value: - # 递归调用,并获取子路径、子 good_list、子分组路径以及子 no_keys_added sub_key_paths, _, sub_grouped_paths, sub_no_keys_added = generate_key_paths( value, current_key, good_list, seen ) key_paths.extend(sub_key_paths) - grouped_paths.update(sub_grouped_paths) # 合并子分组路径到当前分组路径 - # 更新 no_keys_added + grouped_paths.update(sub_grouped_paths) no_keys_added = no_keys_added and sub_no_keys_added else: - # 空字典视为叶子节点 clean_key = key.replace(" ", "") key_paths.append(current_key.replace(" ", "")) if clean_key not in seen: - good_list.append(clean_key) # 去掉空格后添加 + good_list.append(clean_key) seen.add(clean_key) - - # 更新 no_keys_added no_keys_added = False - elif isinstance(value, list): - # 列表类型视为叶子节点,无论是否为空 - key_paths.append(current_key.replace(" ", "")) - clean_key = key.replace(" ", "") - if clean_key not in seen: - good_list.append(clean_key) # 去掉空格后添加 - seen.add(clean_key) - # 更新 no_keys_added - no_keys_added = False - elif value in {"未知", "", "/"}: - # 特定值视为叶子节点 - key_paths.append(current_key.replace(" ", "")) - clean_key = key.replace(" ", "") - if clean_key not in seen: - good_list.append(clean_key) # 去掉空格后添加 - seen.add(clean_key) - # 更新 no_keys_added - no_keys_added = False else: - # 其他情况视为叶子节点 - key_paths.append(current_key.replace(" ", "")) clean_key = key.replace(" ", "") + key_paths.append(current_key.replace(" ", "")) if clean_key not in seen: - good_list.append(clean_key) # 去掉空格后添加 + good_list.append(clean_key) seen.add(clean_key) - # 更新 no_keys_added no_keys_added = False + # Step 4: 删除 key_paths 中包含在 grouped_paths 中的元素 + key_paths = [path for path in key_paths if path not in grouped_paths] + + # 根据原始顺序重新组织字典 + reorganized_data = reorganize_dict(data, original_order) + data.clear() + data.update(reorganized_data) + return key_paths, good_list, grouped_paths, no_keys_added # 示例使用 data1 = { "采购需求": { - "高清数字枪机-1": [], - "枪机支架-1": [], - "高清数字半球机-1": [], - "网络硬盘录像机-1": [], - "监硬控硬盘-1": [], - "交换机-1": [], - "交换机-2": [], - "监视器-1": [], - "电源线-1": [], - "网线-1": [], - "水晶头-1": [], - "PVC线槽-1": [], - "辅料-1": [], - "安装调试-1": [], - "视频图像采集及存储系统": { - "系统功能": [] + "多媒体会议厅设备": { + "LED屏显示设备": { + "户内全彩LED屏(全彩高刷)": [], + "发送盒": [], + "LED显示屏控制系统": [], + "视频处理器": [], + "智能配电柜": [], + "台式电脑": [], + "控制桌(定制)": [] + }, + "LED显示屏施工材料、技术服务费、包装费": { + "结构边框": [], + "线材(按需)": [], + "包装材料": [] + }, + "扩声系统": { + "主扩全频专业音箱": [], + "专业功放-1": [], + "专业功放-2": [], + "辅助专业音箱": [], + "壁挂支架": [], + "返听专业音箱": [], + "返听专业功放": [], + "超低频专业音箱": [], + "音箱地插": [], + "调音台": [], + "音频处理器": [], + "抑制器": [], + "无线话筒": [], + "话筒呼叫控制嵌入软件": [], + "天线分配器-1": [], + "话筒天线": [], + "有源监听音箱": [], + "电源时序器": [] + }, + "辅助材料": { + "机柜": [], + "音频连接线-1": [], + "音频连接线-2": [], + "音频连接线-3": [], + "其它辅材(音箱线、电源线、网线、视频线等)": [] + }, + "会议座椅": { + "会议座椅": [] + }, + "电动窗帘": { + "电动窗帘系统": [] + } + }, + "云平台及备课电脑": { + "备课一体机电脑": [], + "云平台管理软件": [], + "教学互动应用软件": [] + }, + "办公桌椅": { + "办公桌椅": [] + }, + "文件柜": { + "文件柜": [] + }, + "体育运动器材": { + "移动式标准篮球架(12座)": [] } } } key_paths, good_list, grouped_paths, no_keys_added = generate_key_paths(data1) print(json.dumps(data1,ensure_ascii=False,indent=4)) +print(key_paths) +print(grouped_paths) diff --git a/flask_app/货物标/技术参数要求提取后处理函数.py b/flask_app/货物标/技术参数要求提取后处理函数.py index e71cf7d..e1d17a3 100644 --- a/flask_app/货物标/技术参数要求提取后处理函数.py +++ b/flask_app/货物标/技术参数要求提取后处理函数.py @@ -16,62 +16,76 @@ def get_suffix(n): return suffix -def extract_matching_keys(data, good_list, special_keys=None): - """ - 过滤字典中的键值对,保留键名符合good_list中元素的项。 - 对于重复的键名,添加后缀-a, -b, 等以确保唯一性。 - 对于特殊键,使用“父键的键”格式命名。 +def extract_matching_keys(data, good_list, special_keys=None, parent_key=''): + def get_suffix(n): + suffix = '' + while n > 0: + n, r = divmod(n - 1, 26) + suffix = chr(97 + r) + suffix + return suffix + def count_matching_keys(data, patterns, special_keys, counter=None): + """递归统计匹配键的出现次数""" + if counter is None: + counter = defaultdict(int) - 参数: - - data (dict): 输入的嵌套字典。 - - good_list (list): 包含需要匹配的键名的列表。 - - special_keys (list): 需要特殊处理的键名列表。 + if isinstance(data, dict): + for key, value in data.items(): + if isinstance(value, (dict, list)): + count_matching_keys(value, patterns, special_keys, counter) + if key not in special_keys and any(pattern.match(key) for pattern in patterns): + counter[key] += 1 + elif isinstance(data, list): + for item in data: + if isinstance(item, (dict, list)): + count_matching_keys(item, patterns, special_keys, counter) + + return counter + + def process_data(data, patterns, special_keys, key_counter, suffix_map, filtered_data, parent_key): + """递归处理数据并构建结果""" + + def get_suffix_label(key): + suffix_map[key] += 1 + return get_suffix(suffix_map[key]) + + if isinstance(data, dict): + for key, value in data.items(): + if isinstance(value, list): + # 到达最内层列表,处理数据 + if any(pattern.match(key) for pattern in patterns): + new_key = generate_key(key, parent_key, key_counter, suffix_map, special_keys) + filtered_data[new_key] = value + elif isinstance(value, dict): + # 继续递归处理嵌套字典 + process_data(value, patterns, special_keys, key_counter, suffix_map, + filtered_data, key if parent_key == '' else f"{parent_key}的{key}") + + def generate_key(key, parent_key, key_counter, suffix_map, special_keys): + """生成新的键名""" + if key in special_keys and parent_key: + return f"{parent_key}的{key}" + elif key_counter[key] > 1: + suffix = get_suffix(suffix_map[key] + 1) + suffix_map[key] += 1 + return f"{key}-{suffix}" + return key - 返回: - - dict: 筛选后的字典,包含符合条件的键值对。 - """ if special_keys is None: special_keys = [] - # 编译正则表达式模式,匹配good_list中的元素,允许后面跟随“-数字” patterns = [re.compile(r'^' + re.escape(g) + r'(?:-\d+)?$') for g in good_list] - # 第一遍遍历:统计每个键名出现的次数(排除特殊键) - key_counter = defaultdict(int) - for top_key, nested_dict in data.items(): - for inner_key in nested_dict.keys(): - if inner_key in special_keys: - continue - if any(pattern.match(inner_key) for pattern in patterns): - key_counter[inner_key] += 1 + # 先统计所有匹配键的出现次数 + key_counter = count_matching_keys(data, patterns, special_keys) - # 初始化用于跟踪每个重复键当前使用的后缀编号 + # 初始化后缀映射 suffix_map = {key: 0 for key, count in key_counter.items() if count > 1} - def get_suffix_label(key): - """ - 根据当前计数获取字母后缀,并更新suffix_map。 - """ - suffix_map[key] += 1 - return get_suffix(suffix_map[key]) - - # 第二遍遍历:根据统计结果添加后缀或特殊命名 + # 用于存储最终结果 filtered_data = {} - for top_key, nested_dict in data.items(): - for inner_key, values in nested_dict.items(): - if any(pattern.match(inner_key) for pattern in patterns): - if inner_key in special_keys: - # 对于特殊键,使用“父键的键”命名 - new_key = f"{top_key}的{inner_key}" - else: - if key_counter[inner_key] > 1: - # 对于重复键,添加后缀 - suffix = get_suffix_label(inner_key) - new_key = f"{inner_key}-{suffix}" - else: - new_key = inner_key - filtered_data[new_key] = values + # 递归处理数据 + process_data(data, patterns, special_keys, key_counter, suffix_map, filtered_data, parent_key) return filtered_data @@ -116,11 +130,11 @@ def detect_depth(data): def restructure_data(data): """ - Restructure data to normalize levels of nesting. - If depth is 2 throughout, return as-is. - If both 2 and 3 levels exist, restructure to align to 3-layer format. + 重构数据以标准化嵌套层级。 + 如果整个数据都是2层结构,则保持原样返回。 + 如果同时存在2层和3层结构,则将所有数据重构为3层格式。 """ - # Check if data contains mixed 2-layer and 3-layer structures + # 检查数据是否包含混合的2层和3层结构 has_two_layers = False has_three_layers = False @@ -130,20 +144,20 @@ def restructure_data(data): elif isinstance(value, list): has_two_layers = True else: - raise ValueError(f"Unexpected data format for key '{key}': {type(value)}") + raise ValueError(f"键'{key}'的数据格式异常: {type(value)}") - # If only 2-layer, return as-is + # 如果只有2层结构,直接返回原数据 if has_two_layers and not has_three_layers: return data - # If mixed or only 3-layer, normalize to 3-layer + # 如果是混合结构或仅有3层结构,统一标准化为3层 structured_data = {} for key, value in data.items(): if isinstance(value, dict): - # Already a 3-layer structure, keep as is + # 已经是3层结构,保持不变 structured_data[key] = value elif isinstance(value, list): - # Convert 2-layer structure to 3-layer + # 将2层结构转换为3层结构 structured_data[key] = {key: value} return structured_data diff --git a/flask_app/货物标/货物标解析main.py b/flask_app/货物标/货物标解析main.py index eaf9be3..680bae0 100644 --- a/flask_app/货物标/货物标解析main.py +++ b/flask_app/货物标/货物标解析main.py @@ -242,7 +242,7 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id): #TODO:写个脚本确保技术参数没有嵌套 #TODO: start up 结构优化 -#TODO:C:\Users\Administrator\Desktop\fsdownload\16fd6b4e-3975-4c83-8ba6-1bc9263a6a5b 符合性审查未找到 + # 小解析也更新偏离表 #TODO 体育器材 符合性检查的外键 采购需求