zbparse/flask_app/货物标/基础信息解析main.py

221 lines
12 KiB
Python
Raw Normal View History

2024-10-15 20:57:58 +08:00
# -*- encoding:utf-8 -*-
2024-10-14 17:13:11 +08:00
import json
2024-10-15 20:57:58 +08:00
import threading
import time
2024-10-30 20:41:19 +08:00
import concurrent.futures
2024-10-31 15:03:32 +08:00
from flask_app.general.json_utils import clean_json_string, rename_outer_key
2024-10-30 20:41:19 +08:00
from flask_app.general.通用功能函数 import judge_consortium_bidding, process_judge_questions
2024-10-22 10:06:22 +08:00
from flask_app.general.多线程提问 import read_questions_from_file, multi_threading
from flask_app.general.通义千问long import upload_file
2024-12-06 14:40:22 +08:00
from flask_app.工程标.判断是否分包等 import merge_json_to_list, read_questions_from_judge
2024-10-30 20:41:19 +08:00
from flask_app.货物标.投标人须知正文提取指定内容货物标版 import extract_from_notice
2024-10-15 20:57:58 +08:00
from flask_app.货物标.提取采购需求main import fetch_procurement_reqs
2024-10-14 17:13:11 +08:00
2024-10-17 15:33:58 +08:00
def aggregate_basic_info_goods(baseinfo_list):
    """
    Merge a list of basic-info dicts and nest the result into named groups.

    All dicts in ``baseinfo_list`` are merged (later entries override earlier
    ones on key collisions). The merged keys are then distributed over the
    groups declared in ``key_groups``; ``dynamic_key_handling`` may extend or
    adjust those groups according to the keys that were actually found.
    Every key listed in a group but missing from the data is filled with
    "未提供". Keys that belong to no group end up in the "其他信息" group.

    Args:
        baseinfo_list (list): basic-info dicts; entries that are not dicts
            are skipped.

    Returns:
        dict: mapping of group name -> dict of that group's key/value pairs.
    """
    key_groups = {
        "招标人/代理信息": ["招标人", "招标人联系方式", "招标代理机构", "招标代理机构联系方式", "项目联系方式"],
        "项目信息": ["项目名称", "项目编号", "项目概况", "项目基本情况", "招标控制价", "投标竞争下浮率"],
        "采购要求": ["采购需求", "技术要求", "服务要求", "商务要求", "其他要求"],
        "关键时间/内容": [
            "投标文件递交截止日期",
            "开标时间",
            "开标地点",
            "澄清招标文件的截止时间",
            "投标有效期",
            "信息公示媒介",
        ],
        "保证金相关": ["质量保证金"],
        "其他信息": [
            "重新招标、不再招标和终止招标",
            "投标费用承担",
            "招标代理服务费",
            "是否退还投标文件",
        ],
    }

    # Merge every basic-info dict into one flat mapping.
    combined_data = {}
    for baseinfo in baseinfo_list:
        if not isinstance(baseinfo, dict):
            # A non-dict entry (e.g. None) cannot be merged; skip it instead
            # of failing the whole aggregation.
            continue
        combined_data.update(baseinfo)

    # Adjust the groups to the keys actually present. The dict key view is
    # passed (instead of a set) so keys are visited in first-seen order and
    # the resulting group layout is identical from run to run.
    dynamic_key_handling(key_groups, combined_data.keys())

    # Build the lookup of classified keys once rather than once per item.
    classified_keys = {key for keys in key_groups.values() for key in keys}

    # Nest the data by group, filling in a placeholder for missing keys.
    # A separate result dict is used so the merged data is never modified
    # while it is still being read.
    aggregated = {
        group_name: {key: combined_data.get(key, "未提供") for key in keys}
        for group_name, keys in key_groups.items()
    }

    # Everything that matched no group is kept under "其他信息".
    aggregated["其他信息"].update(
        {key: value for key, value in combined_data.items() if key not in classified_keys}
    )
    return aggregated
def dynamic_key_handling(key_groups, detected_keys):
    """
    Adjust ``key_groups`` in place according to the keys that were detected.

    Two adjustments are made:

    1. In the "采购要求" group the combined key "技术、服务要求" and the
       separate keys "技术要求" / "服务要求" are kept mutually exclusive:
       the combined key is used when it was detected, the separate keys
       otherwise.
    2. Each detected key is routed to a group by the substring it contains
       (first match wins, in this order): "保证金" -> "保证金相关" (placed
       before "质量保证金"); "联合体" / "分包" -> "项目信息"; "踏勘现场" /
       "投标预备会" / "投标答疑会" / "偏离" -> "其他信息"; "递交方式" /
       "递交地点" -> "关键时间/内容" (placed right after
       "投标文件递交截止日期").

    Every insertion is skipped when the key is already in the target group,
    so calling the function repeatedly on the same ``key_groups`` does not
    create duplicate entries.

    Args:
        key_groups (dict): group name -> list of key names; modified in place.
        detected_keys (iterable): key names found in the parsed data.

    Returns:
        None
    """
    detected = list(detected_keys)
    if not detected:
        # Nothing detected: leave the configuration untouched.
        return

    # --- "采购要求": this decision does not depend on the individual key,
    # so it is made once instead of once per detected key.
    combined_key = "技术、服务要求"
    procurement = key_groups["采购要求"]
    if combined_key in detected:
        # The combined key replaces the two separate ones.
        for separate_key in ("技术要求", "服务要求"):
            if separate_key in procurement:
                procurement.remove(separate_key)
        if combined_key not in procurement:
            procurement.insert(1, combined_key)
    else:
        # No combined key: make sure both separate keys are present ...
        if "技术要求" not in procurement:
            procurement.insert(1, "技术要求")
        if "服务要求" not in procurement:
            procurement.insert(2, "服务要求")
        # ... and drop the combined key to keep them mutually exclusive.
        if combined_key in procurement:
            procurement.remove(combined_key)

    # Substring markers whose keys are simply appended to a group.
    # Order matters: the first matching entry wins.
    append_routes = (
        (("联合体", "分包"), "项目信息"),
        (("踏勘现场", "投标预备会", "投标答疑会", "偏离"), "其他信息"),
    )

    for key in detected:
        if "保证金" in key:
            # Deposit-related keys go in front of "质量保证金".
            group = key_groups["保证金相关"]
            if key in group:
                continue
            if "质量保证金" in group:
                group.insert(group.index("质量保证金"), key)
            else:
                group.append(key)
            continue

        for markers, group_name in append_routes:
            if any(marker in key for marker in markers):
                group = key_groups[group_name]
                if key not in group:
                    group.append(key)
                break
        else:
            if "递交方式" in key or "递交地点" in key:
                # Submission method/place follows the submission deadline.
                group = key_groups["关键时间/内容"]
                if key in group:
                    continue
                anchor = "投标文件递交截止日期"
                if anchor in group:
                    group.insert(group.index(anchor) + 1, key)
                else:
                    group.append(key)
2024-10-15 20:57:58 +08:00
2024-11-04 17:13:06 +08:00
def get_base_info(merged_baseinfo_path, clause_path):
    """
    Query the model for the basic information of a goods tender document.

    Steps:
    1. Upload ``merged_baseinfo_path`` and ask the questions read from the
       basic-info prompt file, plus one extra yes/no/unknown question.
    2. Parse every answer with ``clean_json_string``. The last answer is
       treated as the reply to the extra yes/no question and is passed
       through ``merge_json_to_list``, which returns the selected question
       numbers together with the merged dict.
    3. Concurrently run ``process_judge_questions`` (appends its answers to
       the result list in place) and ``extract_from_notice``; the latter's
       result is stored under the key "重新招标、不再招标和终止招标".

    Args:
        merged_baseinfo_path (str): path of the merged basic-info file.
        clause_path (str): path handed to ``extract_from_notice``.

    Returns:
        list: the collected basic-info dicts.
    """
    file_id = upload_file(merged_baseinfo_path)
    baseinfo_file_path = 'flask_app/static/提示词/基本信息货物标.txt'
    questions = read_questions_from_file(baseinfo_file_path)
    # Extra yes/no question. The allowed answer values and the separators
    # between the individual questions / key names are spelled out
    # explicitly so the model gets a well-formed instruction.
    more_query = (
        "请你根据招标文件信息,回答以下问题:是否组织踏勘现场?是否召开投标预备会(或投标答疑会)?是否退还投标文件?是否允许分包? "
        "是否需要递交投标保证金或磋商保证金?是否需要提交履约保证金或履约担保?"
        "是否有招标代理服务费或中标、成交服务费或采购代理服务费?"
        "请按json格式给我提供信息,键名分别为'是否组织踏勘现场','是否召开投标预备会'(或'是否召开投标答疑会'),"
        "'是否退还投标文件','是否允许分包','是否递交投标保证金'(或'是否递交磋商保证金'),"
        "'是否提交履约保证金','是否有招标代理服务费',键值仅限于'是','否','未知',若存在矛盾信息,请回答'未知'"
    )
    questions.append(more_query)
    # Last argument: 1 = Bailian RAG, 2 = qianwen-long.
    baseinfo_results = multi_threading(questions, "", file_id, 2)
    baseinfo_list = [clean_json_string(res) for _, res in baseinfo_results] if baseinfo_results else []

    if baseinfo_list:
        # The answer to ``more_query`` is expected to be the last result.
        chosen_numbers, merged = merge_json_to_list(baseinfo_list.pop())
        baseinfo_list.append(merged)
    else:
        # No answers came back: popping from the empty list would raise
        # IndexError. Continue without any selected follow-up question.
        # NOTE(review): assumes process_judge_questions accepts an empty
        # selection - confirm.
        print("基础信息整合: multi_threading error!")
        chosen_numbers = []

    judge_file_path = 'flask_app/static/提示词/是否相关问题货物标.txt'
    with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
        # Run the follow-up questions and the notice extraction in parallel.
        future1 = executor.submit(process_judge_questions, judge_file_path, chosen_numbers,
                                  merged_baseinfo_path, baseinfo_list)
        # NOTE(review): the meaning of the literal 3 is defined by
        # extract_from_notice and is not visible here.
        future2 = executor.submit(extract_from_notice, merged_baseinfo_path, clause_path, 3)

        # process_judge_questions modifies baseinfo_list directly, so its
        # return value is not needed; result() still waits for completion
        # and re-raises any exception from the worker.
        future1.result()
        rebidding_situation = future2.result()

    update_json = rename_outer_key(rebidding_situation, "重新招标、不再招标和终止招标")
    baseinfo_list.append(update_json)
    return baseinfo_list
2024-11-16 16:14:53 +08:00
def combine_basic_info(merged_baseinfo_path, procurement_path, clause_path, invalid_path):
    """
    Collect basic info and procurement requirements in parallel and group them.

    Two worker threads are used: one calls ``get_base_info`` and the other
    calls ``fetch_procurement_reqs``. Once both have finished, their results
    are put into a single list and grouped by ``aggregate_basic_info_goods``.

    Args:
        merged_baseinfo_path (str): forwarded to ``get_base_info``.
        procurement_path (str): forwarded to ``fetch_procurement_reqs``.
        clause_path (str): forwarded to ``get_base_info``.
        invalid_path (str): forwarded to ``fetch_procurement_reqs``.

    Returns:
        dict: ``{"基础信息": <aggregated basic info>}``.
    """
    # Shared holder for the workers' results. The defaults are what gets
    # aggregated if a worker never stores a value.
    outcome = {"base": [], "procurement": {}}

    def collect_base_info():
        outcome["base"] = get_base_info(merged_baseinfo_path, clause_path)

    def collect_procurement_reqs():
        outcome["procurement"] = fetch_procurement_reqs(procurement_path, invalid_path)

    base_worker = threading.Thread(target=collect_base_info)
    procurement_worker = threading.Thread(target=collect_procurement_reqs)

    base_worker.start()
    # The procurement worker is started one second after the first one.
    # NOTE(review): presumably to stagger the outgoing requests - confirm.
    time.sleep(1)
    procurement_worker.start()

    # Block until both workers are done.
    for worker in (base_worker, procurement_worker):
        worker.join()

    # Basic-info entries (a list) first, then the procurement dict.
    collected = [*outcome["base"], outcome["procurement"]]
    return {"基础信息": aggregate_basic_info_goods(collected)}
2024-10-14 17:13:11 +08:00
if __name__ == "__main__":
    # Manual debugging entry point: runs the whole pipeline on one locally
    # stored sample and prints the result together with the elapsed time.
    start_time = time.time()
    output_dir = "D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2"
    merged_baseinfo_path = output_dir + "\\ztbfile_merged_baseinfo.pdf"
    procurement_file_path = output_dir + "\\ztbfile_procurement.docx"
    clause_path = output_dir + "\\clause1.json"
    # combine_basic_info requires a fourth argument (invalid_path); calling
    # it with only three raises TypeError.
    # NOTE(review): the file name below is an assumption - point it at the
    # actual "invalid" file of this sample.
    invalid_path = output_dir + "\\ztbfile_invalid.pdf"
    res = combine_basic_info(merged_baseinfo_path, procurement_file_path, clause_path, invalid_path)
    print("------------------------------------")
    print(json.dumps(res, ensure_ascii=False, indent=4))
    end_time = time.time()
    print("elapsed time:" + str(end_time - start_time))