zbparse/flask_app/general/纯技术参数要求提取.py

128 lines
5.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import logging
import re
import string
from flask_app.general.format_change import pdf2docx, docx2pdf
from flask_app.general.通义千问long import upload_file
from flask_app.货物标.截取pdf货物标版 import truncate_pdf_main
from flask_app.货物标.技术参数要求提取 import get_technical_requirements
def get_global_logger(unique_id):
    """
    Return the logger associated with ``unique_id``.

    Args:
        unique_id: Logger name. ``None`` selects the default logger.

    Returns:
        logging.Logger: The default logger when ``unique_id`` is None,
        otherwise the logger registered under that name.
    """
    return logging.getLogger() if unique_id is None else logging.getLogger(unique_id)
# Module-level logger handle; get_technical_requirements_main rebinds it on every call.
logger = None
def extract_matching_keys(data_dict, good_list):
    """
    Recursively collect the entries of ``data_dict`` whose key names a good.

    A key matches when it is an entry of ``good_list``, optionally followed by
    ``-<digits>`` (several devices of the same kind inside one system, e.g. a
    second switch). Nested dicts and lists are searched at every depth, and
    the value of a matching key is itself searched for further matches.

    When the same matching key occurs more than once (the same kind of device
    appearing under different systems), each occurrence is stored under the
    key plus a letter suffix: ``-a``, ``-b``, ... ``-z``, ``-aa``, ... in
    traversal order. Keys that occur once are stored unchanged.

    Args:
        data_dict (dict | list): Nested structure to search.
        good_list (list): Names to look for. ``None`` is treated as empty and
            non-string entries are ignored.

    Returns:
        dict: Every matching key (suffixed when duplicated) mapped to its value.
    """
    # Non-string names cannot be escaped into a pattern, so they are dropped
    # instead of raising; None simply means "nothing to look for".
    names = [g for g in (good_list or []) if isinstance(g, str)]
    patterns = [re.compile(r'^' + re.escape(g) + r'(?:-\d+)?$') for g in names]

    def matches(key):
        # A non-string key can never name a good; checking the type first
        # avoids the TypeError the regex engine raises on such keys.
        return isinstance(key, str) and any(p.match(key) for p in patterns)

    def walk(node):
        """Yield (key, value) for each matching key, parents before children."""
        if isinstance(node, dict):
            for key, value in node.items():
                if matches(key):
                    yield key, value
                yield from walk(value)
        elif isinstance(node, list):
            for item in node:
                yield from walk(item)
        # Scalars hold nothing to descend into.

    def get_suffix(count):
        """
        Turn a 1-based occurrence number into a letter suffix:
        1 -> '-a', 2 -> '-b', ..., 26 -> '-z', 27 -> '-aa'.
        """
        letters = string.ascii_lowercase
        suffix = ''
        while count > 0:
            count, remainder = divmod(count - 1, 26)
            suffix = letters[remainder] + suffix
        return '-' + suffix

    # One traversal collects the matches; the totals decide which keys need a suffix.
    found = list(walk(data_dict))
    totals = {}
    for key, _ in found:
        totals[key] = totals.get(key, 0) + 1

    result = {}
    used = {}  # occurrences already emitted for each duplicated key
    for key, value in found:
        if totals[key] > 1:
            used[key] = used.get(key, 0) + 1
            result[f"{key}{get_suffix(used[key])}"] = value
        else:
            result[key] = value
    return result
def get_technical_requirements_main(file_path, file_type, unique_id, output_folder):
    """
    Extract the technical requirements of each listed good from a tender file.

    The input is converted to PDF when needed, truncated with
    ``truncate_pdf_main``, converted to docx, uploaded, and passed to
    ``get_technical_requirements``. When that result is a dict holding a dict
    under '技术要求', the '货物列表' entry is removed from it and used to pick
    the matching entries via ``extract_matching_keys``.

    Args:
        file_path: Path of the input document.
        file_type: 1 = docx, 2 = pdf, 3 = doc. Any other value is rejected.
        unique_id: Name of the logger to use; None selects the default logger.
        output_folder: Folder passed to ``truncate_pdf_main``.

    Returns:
        None for an unsupported ``file_type``. Otherwise the dict built by
        ``extract_matching_keys``, or the value from
        ``get_technical_requirements`` unchanged when it does not have the
        expected '技术要求' shape.
    """
    global logger
    logger = get_global_logger(unique_id)

    if file_type in (1, 3):  # docx / doc: convert to pdf for the later steps
        pdf_path = docx2pdf(file_path)
    elif file_type == 2:  # already a pdf
        pdf_path = file_path
    else:
        logger.error("Unsupported file type provided. Preprocessing halted.")
        return None

    # Guard the [0] lookup: an empty or missing truncation result should fall
    # through to the whole-file fallback below rather than raise.
    truncated = truncate_pdf_main(pdf_path, output_folder, 5)
    truncate_file = truncated[0] if truncated else None
    if not truncate_file:
        truncate_file = pdf_path  # send the whole document instead

    truncate_file_docx = pdf2docx(truncate_file)
    file_id = upload_file(truncate_file_docx)
    final_res = get_technical_requirements(file_id, pdf_path)

    # Only unpack '技术要求' when the result really has the expected shape.
    if isinstance(final_res, dict) and isinstance(final_res.get('技术要求'), dict):
        technical_requirements = final_res['技术要求']
        # A missing or null '货物列表' is treated as an empty list.
        good_list = technical_requirements.pop('货物列表', []) or []
        logger.info("Collected good_list from the processing function: %s", good_list)
        return extract_matching_keys(technical_requirements, good_list)
    return final_res
if __name__ == "__main__":
    # Manual smoke run against a local sample tender PDF (file_type 2 = pdf).
    sample_pdf = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\包头市公安支队机动车查验监管系统招标文201907.pdf"
    tmp_dir = "C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\tmp"
    extracted = get_technical_requirements_main(sample_pdf, 2, "123", tmp_dir)
    print(json.dumps(extracted, ensure_ascii=False, indent=4))