zbparse/flask_app/general/纯技术参数要求提取.py
2024-11-06 12:20:24 +08:00

80 lines
3.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import logging
from flask_app.general.format_change import pdf2docx, docx2pdf
from flask_app.general.通义千问long import upload_file
from flask_app.货物标.截取pdf货物标版 import truncate_pdf_main
from flask_app.货物标.技术参数要求提取 import get_technical_requirements
def get_global_logger(unique_id):
    """Return the logger that corresponds to ``unique_id``.

    Args:
        unique_id: Name of the logger to fetch, or ``None``.

    Returns:
        The result of ``logging.getLogger()`` when ``unique_id`` is ``None``,
        otherwise the result of ``logging.getLogger(unique_id)``.
    """
    # No id supplied: fall back to the default logger.
    return logging.getLogger() if unique_id is None else logging.getLogger(unique_id)
# Module-level logger handle; starts unset and is rebound by
# get_technical_requirements_main() through `global logger` on every call.
logger = None
def extract_matching_keys(data_dict, good_list):
    """
    Recursively walk ``data_dict`` and collect every key listed in ``good_list``.

    Args:
        data_dict (dict): Nested structure to search. Dicts and lists are
            descended into; any other value is treated as a leaf.
        good_list (list): Keys to look for. ``None`` is treated as "no keys",
            and a bare string is treated as a single key name instead of
            being substring-matched.

    Returns:
        dict: Every matched key mapped to its value. When the same key occurs
        more than once, the value visited last in the traversal wins.
    """
    if good_list is None:
        return {}
    if isinstance(good_list, str):
        # `key in "some string"` is a substring test and would match
        # unrelated keys; treat a lone string as one wanted key instead.
        good_list = [good_list]
    try:
        # Build the lookup once so each membership test is O(1).
        wanted = frozenset(good_list)
    except TypeError:
        # Unhashable entries: keep the original (slower) sequence scan.
        wanted = good_list

    result = {}

    def recurse(node):
        if isinstance(node, dict):
            for key, value in node.items():
                if key in wanted:
                    result[key] = value
                # Keep descending: matches may also be nested below this key.
                recurse(value)
        elif isinstance(node, list):
            for item in node:
                recurse(item)
        # Anything that is neither a dict nor a list is a leaf; nothing to do.

    recurse(data_dict)
    return result
def get_technical_requirements_main(file_path, file_type, unique_id, output_folder):
    """
    Run the technical-requirements extraction pipeline on one document.

    Steps, in order: obtain a PDF path (via ``docx2pdf`` for file types 1
    and 3), call ``truncate_pdf_main`` on it, pass the selected file through
    ``pdf2docx`` and ``upload_file``, then call ``get_technical_requirements``
    with the returned file id and the PDF path.

    Args:
        file_path: Path of the input document.
        file_type: 1 for docx, 2 for pdf, 3 for doc.
        unique_id: Logger name handed to ``get_global_logger``.
        output_folder: Folder argument forwarded to ``truncate_pdf_main``.

    Returns:
        ``None`` when ``file_type`` is not 1, 2 or 3. When the extraction
        result is a dict holding a dict under '技术要求', the subset of that
        dict whose keys are named in its '货物列表' entry. Otherwise the raw
        result of ``get_technical_requirements``.
    """
    global logger
    logger = get_global_logger(unique_id)

    if file_type in (1, 3):  # docx / doc: both are converted with docx2pdf
        pdf_path = docx2pdf(file_path)
    elif file_type == 2:  # already a pdf
        pdf_path = file_path
    else:
        logger.error("Unsupported file type provided. Preprocessing halted.")
        return None

    # NOTE(review): the meaning of selector 5 is defined by truncate_pdf_main — confirm there.
    truncated = truncate_pdf_main(pdf_path, output_folder, 5)
    # Guard the [0] lookup: an empty or None result would otherwise raise
    # before the whole-file fallback below gets a chance to run.
    truncate_file = truncated[0] if truncated else None
    if not truncate_file:
        truncate_file = pdf_path  # fall back to sending the whole file

    truncate_file_docx = pdf2docx(truncate_file)
    file_id = upload_file(truncate_file_docx)
    final_res = get_technical_requirements(file_id, pdf_path)

    # Only unwrap '技术要求' when the result really has the expected shape.
    if (
        isinstance(final_res, dict)
        and '技术要求' in final_res
        and isinstance(final_res['技术要求'], dict)
    ):
        technical_requirements = final_res['技术要求']
        # A missing or None '货物列表' both mean "no goods to select".
        good_list = technical_requirements.pop('货物列表', None) or []
        logger.info("Collected good_list from the processing function: %s", good_list)
        return extract_matching_keys(technical_requirements, good_list)
    return final_res
if __name__ == "__main__":
    # Manual run against a local sample pdf (file type 2); the result is
    # printed as indented JSON with non-ASCII characters left unescaped.
    sample_pdf = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\包头市公安支队机动车查验监管系统招标文201907.pdf"
    work_dir = "C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\tmp"
    extracted = get_technical_requirements_main(sample_pdf, 2, "123", work_dir)
    print(json.dumps(extracted, ensure_ascii=False, indent=4))