55 lines
2.4 KiB
Python
55 lines
2.4 KiB
Python
import json
|
|
import logging
|
|
import re
|
|
import string
|
|
|
|
from flask_app.general.format_change import pdf2docx, docx2pdf
|
|
from flask_app.general.通义千问long import upload_file
|
|
from flask_app.货物标.截取pdf货物标版 import truncate_pdf_main
|
|
from flask_app.货物标.技术参数要求提取 import get_technical_requirements
|
|
from flask_app.货物标.技术参数要求提取后处理函数 import extract_matching_keys
|
|
|
|
|
|
def get_global_logger(unique_id):
    """Return the logger associated with ``unique_id``.

    Args:
        unique_id: Name of the logger to look up. ``None`` selects the
            default logger returned by ``logging.getLogger()``.

    Returns:
        The ``logging.Logger`` registered under ``unique_id``, or the default
        logger when ``unique_id`` is ``None``.
    """
    if unique_id is not None:
        return logging.getLogger(unique_id)
    # No identifier supplied: fall back to the default logger.
    return logging.getLogger()
|
|
|
|
|
|
# Module-level logger handle; stays None until get_technical_requirements_main
# rebinds it (via `global logger`) on each call.
logger = None
|
|
def get_technical_requirements_main(file_path, file_type, unique_id, output_folder):
    """Extract the technical requirements from a tender document.

    The input is normalised to a PDF, a truncated PDF is requested from
    ``truncate_pdf_main``, and the result of ``get_technical_requirements`` is
    post-processed with ``extract_matching_keys`` when it has the expected
    shape.

    Args:
        file_path: Path of the input document.
        file_type: Input format code: 1 = docx, 2 = pdf, 3 = doc.
        unique_id: Name of the logger to use for this call; ``None`` selects
            the default logger.
        output_folder: Folder handed to ``truncate_pdf_main`` for its output.

    Returns:
        ``None`` when ``file_type`` is not supported. Otherwise, if the
        extraction result is a dict whose '采购需求' entry is itself a dict, the
        value of ``extract_matching_keys`` applied to that entry (with its
        '货物列表' item removed and passed separately); in every other case the
        raw extraction result is returned unchanged.
    """
    global logger
    logger = get_global_logger(unique_id)

    if file_type in (1, 3):  # 1 = docx, 3 = doc
        # NOTE(review): .doc input is handed to docx2pdf unchanged, exactly as
        # before; presumably the converter accepts both formats -- confirm.
        pdf_path = docx2pdf(file_path)  # convert to PDF for the later steps
    elif file_type == 2:  # pdf
        pdf_path = file_path
    else:
        logger.error("Unsupported file type provided. Preprocessing halted.")
        return None

    # The third argument (5) is passed through unchanged; presumably it selects
    # which section of the document to cut out -- TODO confirm.
    truncated = truncate_pdf_main(pdf_path, output_folder, 5)
    # Guard the indexing: an empty or None result used to raise before the
    # fallback could run. When no truncated file is available, process the
    # whole document instead.
    truncate_file = (truncated[0] if truncated else None) or pdf_path

    final_res = get_technical_requirements(truncate_file, pdf_path)

    # Only unpack when the '采购需求' entry exists and is a dict; anything else
    # is returned to the caller as-is.
    if not (
        isinstance(final_res, dict)
        and '采购需求' in final_res
        and isinstance(final_res['采购需求'], dict)
    ):
        return final_res

    technical_requirements = final_res['采购需求']
    # Remove the goods list from the requirements; defaults to [] when absent.
    good_list = technical_requirements.pop('货物列表', [])
    logger.info("Collected good_list from the processing function: %s", good_list)
    return extract_matching_keys(technical_requirements, good_list)
|
|
if __name__ == "__main__":
    # Manual run against a sample tender PDF; the paths are machine-specific.
    sample_pdf = r"C:\Users\Administrator\Desktop\fsdownload\db79e9e0-830e-442c-8cb6-1d036215f8ff\ztbfile.pdf"
    work_dir = r"C:\Users\Administrator\Desktop\fsdownload\db79e9e0-830e-442c-8cb6-1d036215f8ff\tmp"
    pdf_type = 2  # pdf input
    extracted = get_technical_requirements_main(sample_pdf, pdf_type, "123", work_dir)
    # Pretty-print the result, keeping non-ASCII characters readable.
    print(json.dumps(extracted, ensure_ascii=False, indent=4))
|