2024-11-06 12:20:24 +08:00
|
|
|
|
import json
|
|
|
|
|
import logging
|
2024-11-17 16:29:02 +08:00
|
|
|
|
import re
|
|
|
|
|
import string
|
2024-11-06 12:20:24 +08:00
|
|
|
|
|
|
|
|
|
from flask_app.general.format_change import pdf2docx, docx2pdf
|
2024-10-27 12:08:54 +08:00
|
|
|
|
from flask_app.general.通义千问long import upload_file
|
|
|
|
|
from flask_app.货物标.截取pdf货物标版 import truncate_pdf_main
|
|
|
|
|
from flask_app.货物标.技术参数要求提取 import get_technical_requirements
|
|
|
|
|
|
2024-11-06 12:20:24 +08:00
|
|
|
|
def get_global_logger(unique_id):
    """
    Return the logger registered under ``unique_id``.

    Args:
        unique_id: Logger name to look up. ``None`` selects the default
            (root) logger.

    Returns:
        logging.Logger: The logger obtained from ``logging.getLogger``.
    """
    return logging.getLogger() if unique_id is None else logging.getLogger(unique_id)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Module-level logger placeholder; get_technical_requirements_main rebinds it
# (via ``global logger``) to the logger returned by get_global_logger on each call.
logger = None
|
2024-10-27 12:08:54 +08:00
|
|
|
|
def extract_matching_keys(data_dict, good_list):
    """
    Collect the entries of a nested structure whose keys name a listed good.

    ``data_dict`` is walked recursively through dicts and lists. A key matches
    when it equals an entry of ``good_list`` exactly, or is such an entry
    followed by ``-<digits>`` (several devices of the same kind inside one
    system, e.g. switches). Every matching key/value pair is copied into a
    flat result dict, in traversal order; the walk continues into matched
    values, so nested matches are collected as well.

    When the very same key occurs more than once (the same kind of device
    under different systems), each occurrence gets a letter suffix
    ``-a``, ``-b``, ..., ``-z``, ``-aa``, ... so that result keys stay unique.

    Args:
        data_dict (dict | list): Nested structure to search.
        good_list (list | None): Names of the goods to look for. ``None`` is
            treated as an empty list and non-string entries are ignored.

    Returns:
        dict: Matching keys (suffixed where duplicated) mapped to their values.
    """
    # The list comes from model output, so it may be missing or contain
    # non-string items; re.escape would raise TypeError on those.
    names = [name for name in (good_list or []) if isinstance(name, str)]
    if not names:
        return {}

    # One combined alternation is equivalent to trying each name in turn and
    # avoids running len(good_list) separate regexes for every key.
    pattern = re.compile(
        r'^(?:' + '|'.join(re.escape(name) for name in names) + r')(?:-\d+)?$'
    )

    def is_match(key):
        # Non-string keys can never name a good; guard so match() cannot raise.
        return isinstance(key, str) and pattern.match(key) is not None

    # (key, value) pairs of every match, in pre-order traversal order.
    matched = []

    def walk(node):
        if isinstance(node, dict):
            for key, value in node.items():
                if is_match(key):
                    matched.append((key, value))
                walk(value)
        elif isinstance(node, list):
            for item in node:
                walk(item)
        # Scalars carry no keys, so there is nothing to do for them.

    walk(data_dict)

    # How often each matching key occurs; only repeated keys get a suffix.
    totals = {}
    for key, _ in matched:
        totals[key] = totals.get(key, 0) + 1

    def letter_suffix(index):
        """Map 1 -> '-a', 2 -> '-b', ..., 26 -> '-z', 27 -> '-aa', ..."""
        letters = string.ascii_lowercase
        suffix = ''
        while index > 0:
            index, remainder = divmod(index - 1, 26)
            suffix = letters[remainder] + suffix
        return '-' + suffix

    result = {}
    used = {}  # number of suffixes already handed out per repeated key
    for key, value in matched:
        if totals[key] > 1:
            used[key] = used.get(key, 0) + 1
            result[key + letter_suffix(used[key])] = value
        else:
            result[key] = value
    return result
|
|
|
|
|
|
2024-11-06 12:20:24 +08:00
|
|
|
|
def get_technical_requirements_main(file_path, file_type, unique_id, output_folder):
    """
    Run the technical-requirements extraction pipeline for one document.

    The document is brought to PDF form, a truncated PDF is requested from
    ``truncate_pdf_main`` (falling back to the whole PDF when that yields
    nothing), the chosen PDF is converted to docx and uploaded, and the
    resulting file id is passed to ``get_technical_requirements``.

    Args:
        file_path: Path of the input document.
        file_type: 1 for docx, 2 for pdf, 3 for doc.
        unique_id: Logger name handed to ``get_global_logger``; the module
            level ``logger`` is rebound to that logger.
        output_folder: Forwarded to ``truncate_pdf_main``.
            NOTE(review): presumably the directory for the truncated PDF —
            confirm against that helper.

    Returns:
        ``None`` for an unsupported ``file_type``. When the extraction result
        is a dict holding a dict under '技术要求', the entries of that dict
        matching its '货物列表' (see ``extract_matching_keys``). Otherwise
        the extraction result unchanged.
    """
    global logger
    logger = get_global_logger(unique_id)

    if file_type == 2:  # already a pdf
        pdf_path = file_path
    elif file_type in (1, 3):  # docx / doc: convert to pdf for later processing
        pdf_path = docx2pdf(file_path)
    else:
        logger.error("Unsupported file type provided. Preprocessing halted.")
        return None

    # Use the first truncated file; if it is empty, send the whole document.
    # NOTE(review): the meaning of the literal 5 is defined by truncate_pdf_main.
    section_pdf = truncate_pdf_main(pdf_path, output_folder, 5)[0] or pdf_path

    # The upload is done with a docx rendition of the selected pdf.
    file_id = upload_file(pdf2docx(section_pdf))
    final_res = get_technical_requirements(file_id, pdf_path)

    # Only unwrap '技术要求' when the result really has the expected shape.
    requirements = final_res.get('技术要求') if isinstance(final_res, dict) else None
    if not isinstance(requirements, dict):
        return final_res

    # '货物列表' is removed from the requirements; defaults to [] when absent.
    good_list = requirements.pop('货物列表', [])
    print(good_list)
    logger.info("Collected good_list from the processing function: %s", good_list)
    return extract_matching_keys(requirements, good_list)
|
|
|
|
|
if __name__ == "__main__":
    # Manual run against a local sample document; the paths are machine specific.
    file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\包头市公安支队机动车查验监管系统招标文201907.pdf"
    output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\tmp"
    file_type = 2  # the sample is a pdf

    # "123" is just the logger name used for this ad-hoc run.
    res = get_technical_requirements_main(file_path, file_type, "123", output_folder)
    print(json.dumps(res, ensure_ascii=False, indent=4))
|