zbparse/flask_app/general/纯技术参数要求提取.py

54 lines
2.3 KiB
Python
Raw Normal View History

2024-10-29 20:40:14 +08:00
from flask_app.general.format_change import pdf2docx
2024-10-27 12:08:54 +08:00
from flask_app.general.通义千问long import upload_file
from flask_app.货物标.截取pdf货物标版 import truncate_pdf_main
from flask_app.货物标.技术参数要求提取 import get_technical_requirements
def extract_matching_keys(data_dict, good_list):
    """
    Collect every key of a nested structure that also appears in ``good_list``.

    The structure is walked depth-first: dictionaries are scanned key by key
    (in their iteration order) and lists element by element. A matching key is
    recorded together with its value, and that value is still searched, so
    matches nested inside an already matched value are recorded as well. When
    the same key matches more than once, the value met last replaces the
    earlier one.

    Args:
        data_dict (dict): Nested dictionary (may contain lists) to search.
        good_list (list): Keys to look for.

    Returns:
        dict: Flat mapping of each matched key to its value.
    """

    def iter_matches(node):
        # Yield (key, value) pairs in the order the traversal meets them.
        if isinstance(node, dict):
            for name, child in node.items():
                if name in good_list:
                    yield name, child
                # Keep descending, even below a key that already matched.
                yield from iter_matches(child)
        elif isinstance(node, list):
            for element in node:
                yield from iter_matches(element)
        # Anything that is neither a dict nor a list holds no keys to inspect.

    matched = {}
    for name, child in iter_matches(data_dict):
        matched[name] = child
    return matched
def get_technical_requirements_main(file_path, output_folder):
    """
    Run the technical-requirements extraction pipeline for one PDF file.

    Steps, in order:
      1. Call ``truncate_pdf_main(file_path, output_folder, 5)`` and take the
         first element of its result; if that element is falsy, use the
         original ``file_path`` (i.e. send the whole file).
      2. Convert the chosen PDF with ``pdf2docx`` and upload the result with
         ``upload_file``.
      3. Pass the returned file id to ``get_technical_requirements``.
      4. If the response is a dict whose '技术要求' entry is itself a dict,
         remove '货物列表' from that entry (defaulting to an empty list) and
         return the entries whose keys are named in it, as collected by
         ``extract_matching_keys``. Otherwise return the response unchanged.

    Args:
        file_path: Path of the source PDF.
        output_folder: Folder handed to ``truncate_pdf_main``.

    Returns:
        dict of matched keys and values on the structured path; otherwise
        whatever ``get_technical_requirements`` returned.
    """
    # NOTE(review): the literal 5 is forwarded as-is; presumably it selects
    # which section truncate_pdf_main cuts out - confirm against that helper.
    pdf_to_send = truncate_pdf_main(file_path, output_folder, 5)[0] or file_path

    # The PDF is converted to DOCX before uploading. Uploading the PDF
    # directly (upload_file(pdf_to_send)) is a disabled alternative.
    docx_path = pdf2docx(pdf_to_send)
    uploaded_id = upload_file(docx_path)

    final_res = get_technical_requirements(uploaded_id)

    # Anything that is not the expected nested-dict shape is handed back as-is.
    if not isinstance(final_res, dict):
        return final_res
    requirements = final_res.get('技术要求')
    if not isinstance(requirements, dict):
        return final_res

    # pop() also strips the goods list from the section before it is searched.
    goods = requirements.pop('货物列表', [])
    return extract_matching_keys(requirements, goods)
if __name__ == "__main__":
    # Manual run against hard-coded local Windows paths (machine-specific).
    output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\tmp"
    file_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile.pdf"
    # Print whatever the pipeline returns for quick inspection.
    print(get_technical_requirements_main(file_path, output_folder))