zbparse/flask_app/general/little_zbparse.py
# -*- encoding:utf-8 -*-
import json
import logging
import os
import time
from flask_app.main.format_change import docx2pdf, pdf2docx
from flask_app.main.json_utils import clean_json_string
from flask_app.main.判断是否分包等 import merge_json_to_list, read_questions_from_judge
from flask_app.main.基础信息整合 import judge_consortium_bidding
from flask_app.main.多线程提问 import read_questions_from_file, multi_threading
from flask_app.main.通义千问long import upload_file
from flask_app.货物标.基础信息解析main import aggregate_basic_info_goods
from flask_app.货物标.货物标截取pdf import truncate_pdf_specific_goods
from flask_app.main.截取pdf import truncate_pdf_specific_engineering
from flask_app.main.post_processing import inner_post_processing


def get_global_logger(unique_id):
    if unique_id is None:
        return logging.getLogger()  # fall back to the default root logger
    logger = logging.getLogger(unique_id)
    return logger


logger = None


def merge(merged):
    # The model answers '是' / '否' / '未知' for whether a bid (or negotiation) guarantee must be submitted.
    guarantee_key = '是否递交投标保证金' if '是否递交投标保证金' in merged else '是否递交磋商保证金'
    if merged.get(guarantee_key) == '是':
        # A guarantee is required: tell the caller to ask a follow-up question for the details.
        return 1, ""
    elif merged.get(guarantee_key) == '否':
        # No guarantee required: record that directly and drop the yes/no key.
        guarantee_type = '投标' if '投标' in guarantee_key else '磋商'
        merged[f'{guarantee_type}保证金'] = '不提交'
        merged[f'退还{guarantee_type}保证金'] = '/'
        merged.pop(guarantee_key, None)
        return 0, merged
    # '未知' or any unexpected answer: also ask the follow-up question.
    return 1, ""
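
# Illustrative sketch only (the literal dicts below are made-up examples, not pipeline data):
#   merge({'是否递交投标保证金': '否'})
#   -> (0, {'投标保证金': '不提交', '退还投标保证金': '/'})
#   merge({'是否递交磋商保证金': '是'})
#   -> (1, "")   # the caller then asks a follow-up question about the guarantee requirements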


def get_goods_baseinfo(baseinfo_file_path):
    file_id = upload_file(baseinfo_file_path)
    # Reuse the variable for the prompt file that holds the base-information questions.
    baseinfo_file_path = 'flask_app/static/提示词/基本信息货物标.txt'
    # baseinfo_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\基本信息货物标.txt'
    questions = read_questions_from_file(baseinfo_file_path)
    more_query = "请你根据招标文件信息回答以下问题:是否需要递交投标保证金或磋商保证金?请按json格式给我提供信息,键名分为'是否递交投标保证金'(或'是否递交磋商保证金'),键值仅限于'是','否','未知',若存在矛盾信息,请回答'未知'"
    questions.append(more_query)
    baseinfo_results = multi_threading(questions, "", file_id, 2)  # 1 = Bailian RAG, 2 = qianwen-long
    baseinfo_list = [clean_json_string(res) for _, res in baseinfo_results] if baseinfo_results else []
    need_detail, merged = merge(baseinfo_list.pop())
    if need_detail:
        judge_questions = [
            "根据招标文件第二章投标人须知,该项目投标保证金或磋商保证金的内容或要求是什么?请按json格式给我提供信息,外层键名为'投标保证金'(或'磋商保证金'),若需要以嵌套键值对返回结果,那么嵌套键名为你对相应要求的总结,而对应键值需要完全与原文保持一致。"
        ]
        res2 = multi_threading(judge_questions, "", file_id, 2)  # call qianwen-long
        if not res2:
            print("基础信息整合: multi_threading error!")
        else:
            for question, response in res2:
                baseinfo_list.append(clean_json_string(response))
    else:
        baseinfo_list.append(merged)
    return baseinfo_list
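
# Hedged illustration of the return shape, assuming each model answer parses into a small dict
# (the earlier entries depend on the questions in 基本信息货物标.txt, which are not shown here):
#   [{'...': '...'}, ..., {'投标保证金': {'递交要求': '原文内容...'}}]
# i.e. a list of per-question dicts, typically ending with either the follow-up guarantee answer
# or the merged '不提交' entry produced by merge().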


def get_engineering_baseinfo(baseinfo_file_path):
    # Placeholder: engineering bids currently produce an empty base-info list.
    return []


# Goods bid
def little_parse_goods(output_folder, file_path):
    files = truncate_pdf_specific_goods(file_path, output_folder)
    baseinfo_list = get_goods_baseinfo(files[-1])
    aggregated_baseinfo = aggregate_basic_info_goods(baseinfo_list)
    return {"基础信息": aggregated_baseinfo}


# Engineering bid
def little_parse_engineering(output_folder, file_path):
    files = truncate_pdf_specific_engineering(file_path, output_folder)
    baseinfo_list = get_engineering_baseinfo(files[-1])
    aggregated_baseinfo = aggregate_basic_info_goods(baseinfo_list)
    return {"基础信息": aggregated_baseinfo}


def little_parse_main(output_folder, file_path, file_type, zb_type, unique_id):
    """
    Main parsing entry point: processes the file according to its file type and bid type,
    then saves the result as a JSON file.

    Args:
        output_folder (str): output folder path.
        file_path (str): path of the file to process.
        file_type (int): file type (1: docx, 2: pdf, 3: doc).
        zb_type (int): bid type (2: goods bid, anything else: engineering bid).
        unique_id (str): id used to select the per-request logger.

    Returns:
        str: path of the saved JSON file.
    """
    global logger
    logger = get_global_logger(unique_id)
    logger.info("zb_type:" + str(zb_type))
    # Normalize the input to a PDF path according to the file type.
    if file_type == 1:  # docx
        docx_path = file_path
        pdf_path = docx2pdf(docx_path)  # convert docx to pdf for downstream processing
    elif file_type == 2:  # pdf
        pdf_path = file_path
        # docx_path = pdf2docx(pdf_path)  # convert pdf to docx for uploading to the knowledge base
    elif file_type == 3:  # doc
        pdf_path = docx2pdf(file_path)
        # docx_path = pdf2docx(pdf_path)
    else:
        logger.error("Unsupported file type provided. Preprocessing halted.")
        return None
    # Dispatch to the parser that matches the bid type.
    if zb_type == 2:  # goods bid
        combined_data = little_parse_goods(output_folder, pdf_path)
        logger.info("Parsed goods data: %s", json.dumps(combined_data, ensure_ascii=False, indent=4))
        res = inner_post_processing(combined_data["基础信息"])
    else:
        combined_data = little_parse_engineering(output_folder, pdf_path)
        logger.info("Parsed engineering data: %s", json.dumps(combined_data, ensure_ascii=False, indent=4))
        res = inner_post_processing(combined_data["基础信息"])
    # Save the result as a JSON file and return its path.
    json_path = os.path.join(output_folder, "final_result.json")
    with open(json_path, 'w', encoding='utf-8') as json_file:
        json.dump(res, json_file, ensure_ascii=False, indent=4)
    logger.info(f"Extraction results saved to {json_path}")
    return json_path


if __name__ == "__main__":
    output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\little_parse_output"
    start_time = time.time()
    file_type = 2  # 1: docx, 2: pdf, 3: doc
    zb_type = 2  # 1: engineering bid, 2: goods bid
    input_file = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\ztbfile.pdf"
    final_json_path = little_parse_main(output_folder, input_file, file_type, zb_type, "122334")
    with open(final_json_path, 'r', encoding='utf-8') as f:
        logger.info('final_json_path:' + final_json_path)
        zbparse_data = json.load(f)
    json_str = json.dumps(zbparse_data, ensure_ascii=False, indent=4)
    print(json_str)
    end_time = time.time()
    elapsed_time = end_time - start_time  # elapsed time in seconds
    print(f"Function execution took {elapsed_time} seconds.")