zbparse/flask_app/general/纯技术参数要求提取.py

56 lines
2.4 KiB
Python
Raw Normal View History

2024-11-06 12:20:24 +08:00
import json
import logging
2024-11-17 16:29:02 +08:00
import re
import string
2024-11-06 12:20:24 +08:00
from flask_app.general.format_change import pdf2docx, docx2pdf
2024-10-27 12:08:54 +08:00
from flask_app.general.通义千问long import upload_file
from flask_app.货物标.截取pdf货物标版 import truncate_pdf_main
from flask_app.货物标.技术参数要求提取 import get_technical_requirements
2024-11-18 16:12:11 +08:00
from flask_app.货物标.技术参数要求提取后处理函数 import extract_matching_keys
2024-10-27 12:08:54 +08:00
2024-11-06 12:20:24 +08:00
def get_global_logger(unique_id) -> logging.Logger:
    """Return the logger associated with *unique_id*.

    Args:
        unique_id: Name of the logger to fetch. When ``None`` the default
            (root) logger is returned instead.

    Returns:
        The ``logging.Logger`` registered under ``unique_id``, or the root
        logger when ``unique_id`` is ``None``.
    """
    # logging.getLogger() with no name yields the root logger, which is the
    # fallback the original code asked for explicitly.
    return logging.getLogger() if unique_id is None else logging.getLogger(unique_id)
# Module-level logger handle; get_technical_requirements_main rebinds it on
# every call via get_global_logger(unique_id).
logger = None
def get_technical_requirements_main(file_path, file_type, unique_id, output_folder):
    """Extract the technical-requirement section from a tender document.

    The input is normalised to PDF, truncated with ``truncate_pdf_main``,
    converted to docx, uploaded, and then passed to
    ``get_technical_requirements``.

    Args:
        file_path: Path of the source document.
        file_type: ``1`` for docx, ``2`` for pdf, ``3`` for doc.
        unique_id: Logger name passed to ``get_global_logger``; ``None``
            selects the default logger.
        output_folder: Folder handed to ``truncate_pdf_main`` for its output.

    Returns:
        * ``None`` when ``file_type`` is not one of the supported values.
        * The result of ``extract_matching_keys`` when the model response is
          a dict whose ``'技术要求'`` entry is itself a dict.
        * Otherwise the raw response of ``get_technical_requirements``.
    """
    global logger
    logger = get_global_logger(unique_id)

    if file_type in (1, 3):  # docx / doc: both are converted to PDF first
        pdf_path = docx2pdf(file_path)
    elif file_type == 2:  # already a PDF
        pdf_path = file_path
    else:
        logger.error("Unsupported file type provided. Preprocessing halted.")
        return None

    # NOTE(review): the meaning of the literal 5 is defined by
    # truncate_pdf_main and is not visible here - confirm against that helper.
    truncated_files = truncate_pdf_main(pdf_path, output_folder, 5)
    # Guard the [0] access: an empty/None result must reach the fallback below
    # instead of raising IndexError/TypeError.
    truncate_file = truncated_files[0] if truncated_files else None
    if not truncate_file:
        truncate_file = pdf_path  # fall back to sending the whole document

    truncate_file_docx = pdf2docx(truncate_file)
    file_id = upload_file(truncate_file_docx)
    final_res = get_technical_requirements(file_id, pdf_path)

    # Only unwrap "技术要求" when it is present and is a dict; any other
    # response shape is returned to the caller untouched.
    if not (isinstance(final_res, dict) and isinstance(final_res.get('技术要求'), dict)):
        return final_res

    technical_requirements = final_res['技术要求']
    # '货物列表' is removed from the requirements dict; defaults to [] if absent.
    good_list = technical_requirements.pop('货物列表', [])
    logger.info("Collected good_list from the processing function: %s", good_list)
    return extract_matching_keys(technical_requirements, good_list)
if __name__ == "__main__":
    # Manual run against a sample tender file; the paths below are specific to
    # the original developer's machine.
    file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\包头市公安支队机动车查验监管系统招标文201907.pdf"
    file_type = 2  # pdf
    output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\tmp"
    res = get_technical_requirements_main(file_path, file_type, "123", output_folder)
    # ensure_ascii=False keeps the Chinese keys/values readable in the output.
    print(json.dumps(res, ensure_ascii=False, indent=4))