zbparse/flask_app/货物标/基础信息解析货物标版.py

81 lines
4.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- encoding:utf-8 -*-
import json
import threading
import time
import concurrent.futures
from flask_app.general.json_utils import clean_json_string, add_outer_key
from flask_app.general.通用功能函数 import process_judge_questions, aggregate_basic_info
from flask_app.general.llm.多线程提问 import read_questions_from_file, multi_threading
from flask_app.general.llm.通义千问long import upload_file
from flask_app.old_version.判断是否分包等_old import merge_json_to_list
from flask_app.general.投标人须知正文提取指定内容 import extract_from_notice
from flask_app.货物标.提取采购需求main import fetch_procurement_reqs
def get_base_info(merged_baseinfo_path, clause_path, invalid_path):
    """Build the list of basic-information entries for a goods tender.

    The merged basic-info document is uploaded, every prompt question is
    asked against it, and the answers are cleaned into JSON objects. The
    final answer (the "is there xxx" judgement question) is popped and
    replaced by its merged form. The judge questions and the notice
    extraction then run side by side in a two-worker thread pool.

    Args:
        merged_baseinfo_path: Path of the merged basic-info document; it is
            uploaded and also handed to ``extract_from_notice``.
        clause_path: Forwarded to ``extract_from_notice``.
        invalid_path: Forwarded to ``process_judge_questions``.

    Returns:
        The list of answer objects, ending with an entry whose outer key is
        "重新招标、不再招标和终止招标".
    """
    uploaded_file_id = upload_file(merged_baseinfo_path)

    prompt_path = 'flask_app/static/提示词/基本信息货物标.txt'
    prompt_questions = read_questions_from_file(prompt_path)
    # Mode 2 selects qianwen-long (mode 1 would be the Bailian RAG backend).
    raw_answers = multi_threading(prompt_questions, "", uploaded_file_id, 2)

    if raw_answers:
        baseinfo_list = [clean_json_string(answer) for _, answer in raw_answers]
    else:
        baseinfo_list = []

    # The last answer belongs to the judgement question; take it out, merge
    # it, and put the merged form back at the end of the list.
    # NOTE(review): if no answers came back the list is empty and pop()
    # raises IndexError - confirm what the intended fallback is.
    judgement_answer = baseinfo_list.pop()
    chosen_numbers, merged = merge_json_to_list(judgement_answer)
    baseinfo_list.append(merged)

    judge_prompt_path = 'flask_app/static/提示词/是否相关问题.txt'
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as pool:
        judge_task = pool.submit(
            process_judge_questions,
            judge_prompt_path,
            chosen_numbers,
            invalid_path,
            baseinfo_list,
        )
        # Third argument 3: presumably selects the re-tendering section of
        # the notice - verify against extract_from_notice.
        rebid_task = pool.submit(
            extract_from_notice, merged_baseinfo_path, clause_path, 3
        )
        # Per the original author's note, process_judge_questions updates
        # baseinfo_list in place, so its return value is not used; result()
        # is still called to wait for it and surface any exception.
        judge_task.result()
        rebidding_situation = rebid_task.result()

    baseinfo_list.append(
        add_outer_key(rebidding_situation, "重新招标、不再招标和终止招标")
    )
    return baseinfo_list
def combine_basic_info(merged_baseinfo_path, procurement_path, clause_path, invalid_path):
    """Gather basic info and procurement requirements, then aggregate them.

    Two worker threads run concurrently: one calls ``get_base_info`` and the
    other calls ``fetch_procurement_reqs``. The second thread is started one
    second after the first. Once both have finished, the base-info entries
    followed by the procurement requirements are passed to
    ``aggregate_basic_info`` with the ``'goods'`` type.

    Args:
        merged_baseinfo_path: Forwarded to ``get_base_info``.
        procurement_path: Forwarded to ``fetch_procurement_reqs``.
        clause_path: Forwarded to ``get_base_info``.
        invalid_path: Forwarded to both workers.

    Returns:
        A dict with the single key "基础信息" holding the aggregated result.
    """
    # Shared slots for the worker results. The defaults stay in place if a
    # worker raises, because an exception inside a thread does not propagate
    # to the caller.
    outcome = {"base": [], "procurement": {}}

    def _collect_base_info():
        # Conventional basic-information extraction.
        outcome["base"] = get_base_info(merged_baseinfo_path, clause_path, invalid_path)

    def _collect_procurement_reqs():
        # Procurement-requirement extraction.
        outcome["procurement"] = fetch_procurement_reqs(procurement_path, invalid_path)

    base_worker = threading.Thread(target=_collect_base_info)
    procurement_worker = threading.Thread(target=_collect_procurement_reqs)

    base_worker.start()
    # Stagger the second worker by one second, as the original does.
    time.sleep(1)
    procurement_worker.start()

    base_worker.join()
    procurement_worker.join()

    # Base-info entries first, then the procurement requirements
    # (initialised as a dict above) as the final element.
    combined_entries = [*outcome["base"], outcome["procurement"]]
    # Reorganise the collected entries into the final basic-info structure.
    aggregated_baseinfo = aggregate_basic_info(combined_entries, 'goods')
    return {"基础信息": aggregated_baseinfo}
if __name__ == "__main__":
    # Manual smoke run against files from one locally generated output
    # directory; prints the aggregated result and the wall-clock duration.
    started_at = time.time()

    merged_baseinfo_path = r"D:\flask_project\flask_app\static\output\output1\ecd3374a-a19e-475e-a4dd-e803a3bc1fbf\ztbfile_merged_baseinfo.pdf"
    procurement_path = r"D:\flask_project\flask_app\static\output\output1\ecd3374a-a19e-475e-a4dd-e803a3bc1fbf\ztbfile_procurement.pdf"
    clause_path = r"D:\flask_project\flask_app\static\output\output1\ecd3374a-a19e-475e-a4dd-e803a3bc1fbf\clause1.json"
    invalid_path = r"D:\flask_project\flask_app\static\output\output1\ecd3374a-a19e-475e-a4dd-e803a3bc1fbf\ztbfile.docx"

    res = combine_basic_info(
        merged_baseinfo_path,
        procurement_path,
        clause_path,
        invalid_path,
    )

    print("------------------------------------")
    print(json.dumps(res, ensure_ascii=False, indent=4))

    elapsed_seconds = time.time() - started_at
    # Message text is kept exactly as the original emits it.
    print(f"elasped time:{elapsed_seconds}")