import json
import logging
import re
import string

from flask_app.general.format_change import pdf2docx, docx2pdf
from flask_app.general.通义千问long import upload_file
from flask_app.货物标.截取pdf货物标版 import truncate_pdf_main
from flask_app.货物标.技术参数要求提取 import get_technical_requirements


def get_global_logger(unique_id):
    """Return the logger named ``unique_id``, or the root logger when it is None."""
    if unique_id is None:
        return logging.getLogger()  # default (root) logger
    return logging.getLogger(unique_id)


# Rebound by get_technical_requirements_main() on every call; kept at module
# level for any code that reads it from here.
logger = None


def _letter_suffix(count):
    """
    Convert a 1-based counter into a spreadsheet-style letter suffix.

    1 -> '-a', 2 -> '-b', ..., 26 -> '-z', 27 -> '-aa'.
    A count of 0 (or less) yields a bare '-'.
    """
    letters = string.ascii_lowercase
    suffix = ''
    while count > 0:
        # Bijective base-26: shift by one so that 26 maps to 'z', not 'a0'.
        count, remainder = divmod(count - 1, 26)
        suffix = letters[remainder] + suffix
    return '-' + suffix


def extract_matching_keys(data_dict, good_list):
    """
    Collect every entry of a nested structure whose key names a listed good.

    The structure is walked recursively through dicts and lists. A key
    matches when it equals an entry of ``good_list`` exactly, or is such an
    entry followed by ``-<digits>`` (several units of the same device inside
    one system, e.g. two switches). Values of matching keys are still
    descended into, so nested matches are collected as well.

    When the very same key string matches more than once (e.g. the same
    device listed under different systems), each occurrence is stored under
    the key plus a letter suffix ``-a``, ``-b``, ... in traversal order, so
    no occurrence overwrites another. Keys that match only once are stored
    unchanged.

    Args:
        data_dict: Nested dict/list structure to search.
        good_list: Names of the goods to look for. ``None`` is treated as an
            empty list and non-string entries are ignored.

    Returns:
        dict: Flat mapping of (possibly suffixed) matching keys to their
        values, in pre-order traversal order.
    """
    # Compile once up front; `$` (not fullmatch) is kept so matching behaves
    # exactly as before for every string key.
    patterns = [
        re.compile(r'^' + re.escape(name) + r'(?:-\d+)?$')
        for name in (good_list or [])
        if isinstance(name, str)
    ]

    def is_match(key):
        # Non-string keys can never name a good and would make re raise.
        return isinstance(key, str) and any(p.match(key) for p in patterns)

    # Single traversal: remember every hit in the order it is encountered.
    hits = []

    def collect(node):
        if isinstance(node, dict):
            for key, value in node.items():
                if is_match(key):
                    hits.append((key, value))
                collect(value)
        elif isinstance(node, list):
            for item in node:
                collect(item)
        # Scalars carry no keys; nothing to do.

    collect(data_dict)

    # How often each matching key occurs decides whether it needs a suffix.
    totals = {}
    for key, _ in hits:
        totals[key] = totals.get(key, 0) + 1

    result = {}
    used = {}  # per-key counter of suffixes handed out so far
    for key, value in hits:
        if totals[key] > 1:
            used[key] = used.get(key, 0) + 1
            result[f"{key}{_letter_suffix(used[key])}"] = value
        else:
            result[key] = value
    return result


def get_technical_requirements_main(file_path, file_type, unique_id, output_folder):
    """
    Run the technical-requirements extraction pipeline for one tender file.

    Steps: obtain a PDF for the input (converting via ``docx2pdf`` when the
    input is docx/doc), truncate it with ``truncate_pdf_main``, convert the
    truncated file with ``pdf2docx``, upload it with ``upload_file`` and pass
    the resulting id to ``get_technical_requirements``. If the result holds a
    dict under '技术要求', its '货物列表' entry is removed and used to select
    the matching entries via ``extract_matching_keys``.

    Args:
        file_path: Path of the input document.
        file_type: 1 = docx, 2 = pdf, 3 = doc.
        unique_id: Name of the logger to use (None selects the root logger).
        output_folder: Folder handed to ``truncate_pdf_main``.

    Returns:
        dict of matched goods and their requirements when the result has the
        expected shape; otherwise the raw result of
        ``get_technical_requirements``; ``None`` for an unsupported
        ``file_type``.
    """
    global logger
    # Log through a local reference: the module-level name is rebound by
    # every call, so a concurrent call could otherwise redirect our messages.
    log = logger = get_global_logger(unique_id)

    if file_type in (1, 3):  # docx / doc: convert to PDF for later processing
        pdf_path = docx2pdf(file_path)
    elif file_type == 2:  # already a PDF
        pdf_path = file_path
    else:
        log.error("Unsupported file type provided. Preprocessing halted.")
        return None

    # NOTE(review): the meaning of the literal 5 and the exact return shape
    # of truncate_pdf_main are defined elsewhere; only its first element is
    # used here. Guard the indexing so that an empty result falls back to the
    # whole document instead of raising.
    truncated = truncate_pdf_main(pdf_path, output_folder, 5)
    truncate_file = truncated[0] if truncated else None
    if not truncate_file:
        truncate_file = pdf_path  # send the complete file instead

    truncate_file_docx = pdf2docx(truncate_file)
    file_id = upload_file(truncate_file_docx)
    final_res = get_technical_requirements(file_id, pdf_path)

    # Only unpack when the result really has a dict under "技术要求".
    if not (isinstance(final_res, dict) and isinstance(final_res.get('技术要求'), dict)):
        return final_res

    technical_requirements = final_res['技术要求']
    good_list = technical_requirements.pop('货物列表', [])  # [] when absent
    log.info("Collected good_list from the processing function: %s", good_list)
    return extract_matching_keys(technical_requirements, good_list)


if __name__ == "__main__":
    file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\包头市公安支队机动车查验监管系统招标文201907.pdf"
    file_type = 2
    output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\tmp"
    res = get_technical_requirements_main(file_path, file_type, "123", output_folder)
    print(json.dumps(res, ensure_ascii=False, indent=4))