119 lines
5.9 KiB
Python
119 lines
5.9 KiB
Python
|
# -*- encoding:utf-8 -*-
|
|||
|
import json
|
|||
|
import logging
|
|||
|
import os
|
|||
|
import time
|
|||
|
|
|||
|
from flask_app.main.format_change import docx2pdf, pdf2docx
|
|||
|
from flask_app.main.json_utils import clean_json_string
|
|||
|
from flask_app.main.判断是否分包等 import merge_json_to_list, read_questions_from_judge
|
|||
|
from flask_app.main.基础信息整合 import judge_consortium_bidding
|
|||
|
from flask_app.main.多线程提问 import read_questions_from_file, multi_threading
|
|||
|
from flask_app.main.通义千问long import upload_file
|
|||
|
from flask_app.货物标.基础信息解析main import aggregate_basic_info_goods
|
|||
|
from flask_app.货物标.货物标截取pdf import truncate_pdf_specific
|
|||
|
from flask_app.main.post_processing import inner_post_processing
|
|||
|
|
|||
|
def get_global_logger(unique_id):
    """Return the logger associated with ``unique_id``.

    Args:
        unique_id: Name handed to ``logging.getLogger``. ``None`` selects
            the default (root) logger.

    Returns:
        logging.Logger: The root logger when ``unique_id`` is ``None``,
        otherwise the logger registered under ``unique_id``.
    """
    if unique_id is not None:
        return logging.getLogger(unique_id)
    # No id supplied: fall back to the default logger.
    return logging.getLogger()
|
|||
|
|
|||
|
|
|||
|
# Module-level logger placeholder. little_parse_main rebinds it (via
# ``global logger``) to the logger returned by get_global_logger(unique_id).
logger = None
|
|||
|
def get_base_info(baseinfo_file_path):
    """Ask the model for the basic information of an uploaded tender file.

    The file is uploaded once and the returned file id is used for two rounds
    of questions through ``multi_threading`` (mode 2, qianwen-long):

    1. the questions read from the basic-info prompt file plus one extra
       yes/no query appended at the end;
    2. the judge questions selected by ``read_questions_from_judge`` from the
       numbers that ``merge_json_to_list`` derived from that yes/no answer.

    Args:
        baseinfo_file_path (str): Path of the file to upload and query.

    Returns:
        list: The answers of both rounds after ``clean_json_string``; the
        last first-round answer is replaced by the merged value returned by
        ``merge_json_to_list``. An empty list is returned when the first
        round yields no results at all.
    """
    # Use the per-request logger when little_parse_main has installed one,
    # otherwise fall back to the root logger.
    log = logger if logger is not None else logging.getLogger()

    file_id = upload_file(baseinfo_file_path)

    # NOTE(review): the prompt paths below are absolute and machine-specific.
    # They are kept unchanged because the deployment layout is not visible
    # from this file. Relative alternatives:
    #   'flask_app/static/提示词/基本信息货物标.txt'
    #   'flask_app/static/提示词/是否相关问题货物标.txt'
    # A separate local name is used so the function argument is not shadowed.
    prompt_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\基本信息货物标.txt'
    questions = read_questions_from_file(prompt_file_path)
    more_query = "请你根据招标文件信息,回答以下问题:是否组织踏勘现场?是否召开投标预备会(或投标答疑会)?是否退还投标文件?是否允许分包? 是否需要递交投标保证金(或磋商保证金)?是否需要提交履约保证金(或履约担保)?是否有招标代理服务费(或中标、成交服务费)?请按json格式给我提供信息,键名分别为'是否组织踏勘现场','是否召开投标预备会'(或'是否召开投标答疑会'),'是否退还投标文件',是否允许分包','是否递交投标保证金'(或'是否递交磋商保证金'),'是否提交履约保证金','是否有招标代理服务费',键值仅限于'是','否','未知',若存在矛盾信息,请回答'未知'。"
    questions.append(more_query)

    # Mode 1 would use the Bailian RAG backend; mode 2 uses qianwen-long.
    baseinfo_results = multi_threading(questions, "", file_id, 2)
    if not baseinfo_results:
        # Without any answer there is nothing to pop/merge below; bail out
        # instead of failing with IndexError on an empty list.
        log.error("get_base_info: multi_threading returned no results for the basic-info questions")
        return []

    baseinfo_list = [clean_json_string(res) for _, res in baseinfo_results]

    # The last answer is taken to belong to ``more_query`` (appended last) —
    # presumably multi_threading preserves question order; verify.
    chosen_numbers, merged = merge_json_to_list(baseinfo_list.pop())
    baseinfo_list.append(merged)

    judge_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\是否相关问题货物标.txt'
    judge_questions = read_questions_from_judge(judge_file_path, chosen_numbers)

    res2 = multi_threading(judge_questions, "", file_id, 2)  # qianwen-long
    if not res2:
        # Best effort: keep the first-round answers even if this round fails.
        log.error("基础信息整合: multi_threading error!")
    else:
        baseinfo_list.extend(clean_json_string(response) for _, response in res2)
    return baseinfo_list
|
|||
|
|
|||
|
# Goods tender
def little_parse_goods(output_folder,file_path):
    """Extract and aggregate the basic information of a goods tender.

    Args:
        output_folder: Folder handed to ``truncate_pdf_specific``.
        file_path: Path of the PDF to truncate and analyse.

    Returns:
        dict: ``{"基础信息": <result of aggregate_basic_info_goods>}``.
    """
    truncated_files = truncate_pdf_specific(file_path, output_folder)
    # The last truncated file is the one queried for basic information.
    basic_info_source = truncated_files[-1]
    answers = get_base_info(basic_info_source)
    return {"基础信息": aggregate_basic_info_goods(answers)}
|
|||
|
def little_parse_engineering(output_folder,downloaded_filepath):
    """Placeholder for engineering-tender parsing.

    Both arguments are currently unused; the function always returns ``True``.
    """
    placeholder_result = True
    return placeholder_result
|
|||
|
|
|||
|
def little_parse_main(output_folder, file_path, file_type, zb_type, unique_id=None):
    """Parse a tender file and save the post-processed basic info as JSON.

    Args:
        output_folder (str): Folder that receives ``final_result.json``.
        file_path (str): Path of the file to process.
        file_type (int): File type (1: docx, 2: pdf, 3: doc).
        zb_type (int): Tender type (2: goods tender, anything else:
            engineering tender).
        unique_id: Name of the logger to use for this run; ``None`` (the
            default) selects the root logger.

    Returns:
        str | None: Path of the saved JSON file, or ``None`` when the file
        type is unsupported or the parser result carries no "基础信息" entry.
    """
    global logger
    logger = get_global_logger(unique_id)

    # Normalise the input to a PDF path for the downstream parsers.
    if file_type in (1, 3):  # docx / doc: convert to pdf
        pdf_path = docx2pdf(file_path)
    elif file_type == 2:  # already a pdf
        pdf_path = file_path
    else:
        logger.error("Unsupported file type provided. Preprocessing halted.")
        return None

    # Dispatch on the tender type.
    if zb_type == 2:  # goods tender
        combined_data = little_parse_goods(output_folder, pdf_path)
        logger.info("Parsed goods data: %s", json.dumps(combined_data, ensure_ascii=False, indent=4))
    else:  # engineering tender
        combined_data = little_parse_engineering(output_folder, pdf_path)
        logger.info("Parsed engineering data: %s", json.dumps(combined_data, ensure_ascii=False, indent=4))

    # little_parse_engineering is a stub that returns True, so subscripting
    # its result would raise TypeError; check the shape before using it.
    if not isinstance(combined_data, dict) or "基础信息" not in combined_data:
        logger.error("Parser result has no '基础信息' entry; nothing to save.")
        return None
    res = inner_post_processing(combined_data["基础信息"])

    # Make sure the target folder exists before writing the result.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)
    json_path = os.path.join(output_folder, "final_result.json")
    with open(json_path, 'w', encoding='utf-8') as json_file:
        json.dump(res, json_file, ensure_ascii=False, indent=4)
    logger.info("Extraction results saved to %s", json_path)

    return json_path
|
|||
|
|
|||
|
if __name__ == "__main__":
    # Manual smoke run against a local sample file.
    output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\little_parse_output"
    start_time = time.time()
    file_type = 2  # 1: docx, 2: pdf, 3: doc
    zb_type = 2  # 2: goods tender, anything else: engineering tender
    input_file = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\6.2定版视频会议磋商文件.pdf"
    # little_parse_main takes a unique_id as its fifth argument; None makes
    # get_global_logger fall back to the default logger.
    res = little_parse_main(output_folder, input_file, file_type, zb_type, None)
    print(json.dumps(res, ensure_ascii=False, indent=4))
    end_time = time.time()
    elapsed_time = end_time - start_time  # wall-clock duration of the run
    print(f"Function execution took {elapsed_time} seconds.")
|
|||
|
|
|||
|
|
|||
|
|