10.14
This commit is contained in:
parent
9a2141c3a9
commit
82de3ae202
@ -5,33 +5,48 @@ from flask_app.main.投标人须知正文提取指定内容 import extract_from_
|
||||
from flask_app.main.判断是否分包等 import judge_whether_main, read_questions_from_judge
|
||||
from flask_app.main.多线程提问 import read_questions_from_file, multi_threading
|
||||
from flask_app.main.通义千问long import upload_file
|
||||
def combine_basic_info(baseinfo_list):
|
||||
def aggregate_basic_info(baseinfo_list):
|
||||
"""
|
||||
将基础信息列表中的数据进行合并和分类。
|
||||
|
||||
参数:
|
||||
- baseinfo_list (list): 包含多个基础信息的列表。
|
||||
|
||||
返回:
|
||||
- list: 合并和分类后的基础信息列表。
|
||||
"""
|
||||
combined_baseinfo_list = []
|
||||
key_groups = {
|
||||
"招标人/代理信息": ["招标人","招标人联系方式", "招标代理机构","招标代理机构联系方式"],
|
||||
"项目信息": ["工程名称", "招标编号","工程概况","招标范围","招标控制价","投标竞争下浮率"],
|
||||
"关键时间/内容":["投标文件递交截止日期","递交方式","投标人要求澄清招标文件的截止时间","投标有效期","评标结果公示媒介"],
|
||||
"保证金相关":['质量保证金','退还投标保证金'],
|
||||
"其他信息":["重新招标、不再招标和终止招标","是否退还投标文件","费用承担"]
|
||||
"招标人/代理信息": ["招标人", "招标人联系方式", "招标代理机构", "招标代理机构联系方式"],
|
||||
"项目信息": ["工程名称", "招标编号", "工程概况", "招标范围", "招标控制价", "投标竞争下浮率"],
|
||||
"关键时间/内容": [
|
||||
"投标文件递交截止日期",
|
||||
"递交方式",
|
||||
"投标人要求澄清招标文件的截止时间",
|
||||
"投标有效期",
|
||||
"评标结果公示媒介"
|
||||
],
|
||||
"保证金相关": ["质量保证金", "退还投标保证金"],
|
||||
"其他信息": [
|
||||
"重新招标、不再招标和终止招标",
|
||||
"是否退还投标文件",
|
||||
"费用承担"
|
||||
]
|
||||
}
|
||||
# 将所有基础信息合并到一个字典中
|
||||
|
||||
combined_data = {}
|
||||
relevant_keys_detected = set()
|
||||
|
||||
# 预处理以决定哪些键名将被使用
|
||||
# 合并所有基础信息并收集相关键
|
||||
for baseinfo in baseinfo_list:
|
||||
json_data = clean_json_string(baseinfo)
|
||||
combined_data.update(json_data)
|
||||
relevant_keys_detected.update(json_data.keys())
|
||||
# for key in relevant_keys.keys():
|
||||
# if key in json_data:
|
||||
# relevant_keys[key] = True
|
||||
|
||||
# 根据检测到的键动态调整 key_groups
|
||||
# 动态调整键组
|
||||
dynamic_key_handling(key_groups, relevant_keys_detected)
|
||||
|
||||
|
||||
# 使用合并后的字典创建最终输出
|
||||
# 按键组分类并嵌套
|
||||
for group_name, keys in key_groups.items():
|
||||
group_data = {key: combined_data.get(key, "未提供") for key in keys}
|
||||
combined_json = nest_json_under_key(group_data, group_name)
|
||||
@ -74,69 +89,72 @@ def judge_consortium_bidding(baseinfo_list):
|
||||
# 更新原始列表,如果你想保留修改
|
||||
baseinfo_list[:] = updated_list
|
||||
return accept_bidding
|
||||
def project_basic_info(knowledge_name,truncate0,output_folder,clause_path): #投标人须知前附表
|
||||
# 调用大模型回答项目基础信息
|
||||
def combine_basic_info(knowledge_name, truncate0, output_folder, clause_path):
|
||||
"""
|
||||
综合和处理基础信息,生成最终的基础信息字典。
|
||||
|
||||
参数:
|
||||
- knowledge_name (str): 知识名称。
|
||||
- truncate0 (str): 文件路径。
|
||||
- output_folder (str): 输出文件夹路径。
|
||||
- clause_path (str): 条款路径。
|
||||
|
||||
返回:
|
||||
- dict: 综合后的基础信息。
|
||||
"""
|
||||
baseinfo_list = []
|
||||
baseinfo_file_path='flask_app/static/提示词/前两章提问总结.txt'
|
||||
# baseinfo_file_path='D:\\flask_project\\flask_app\\static\\提示词\\前两章提问总结.txt'
|
||||
baseinfo_file_path = 'flask_app/static/提示词/前两章提问总结.txt'
|
||||
questions = read_questions_from_file(baseinfo_file_path)
|
||||
res1 = multi_threading(questions, knowledge_name)
|
||||
|
||||
for _, response in res1: # _占位,代表ques;response[0]也是ques;response[1]是ans
|
||||
for index, response in res1:
|
||||
try:
|
||||
if response and len(response) > 1: # 检查response存在且有至少两个元素
|
||||
if response and len(response) > 1:
|
||||
baseinfo_list.append(response[1])
|
||||
else:
|
||||
print(f"基础信息整合: Warning: Missing or incomplete response data for query index {_}.")
|
||||
print(f"基础信息整合: Warning: Missing or incomplete response data for query index {index}.")
|
||||
except Exception as e:
|
||||
print(f"基础信息整合: Error processing response for query index {_}: {e}")
|
||||
print(f"基础信息整合: Error processing response for query index {index}: {e}")
|
||||
|
||||
# 判断是否分包、是否需要递交投标保证金等
|
||||
chosen_numbers, merged = judge_whether_main(truncate0,output_folder)
|
||||
chosen_numbers, merged = judge_whether_main(truncate0, output_folder)
|
||||
baseinfo_list.append(merged)
|
||||
judge_file_path ='flask_app/static/提示词/是否相关问题.txt'
|
||||
# judge_file_path='D:\\flask_project\\flask_app\\static\\提示词\\是否相关问题.txt'
|
||||
|
||||
judge_file_path = 'flask_app/static/提示词/是否相关问题.txt'
|
||||
judge_questions = read_questions_from_judge(judge_file_path, chosen_numbers)
|
||||
judge_consortium = judge_consortium_bidding(baseinfo_list) # 通过招标公告判断是否接受联合体投标
|
||||
|
||||
judge_consortium = judge_consortium_bidding(baseinfo_list) #通过招标公告判断是否接受联合体投标
|
||||
if judge_consortium:
|
||||
judge_consortium_question = "该招标文件对于联合体投标的要求是怎样的,请按json格式给我提供信息,外层键名为'联合体投标要求',其中有一个嵌套键值对为:\"是否接受联合体投标\":\"是\""
|
||||
judge_consortium_question = (
|
||||
"该招标文件对于联合体投标的要求是怎样的,请按json格式给我提供信息,"
|
||||
"外层键名为'联合体投标要求',其中有一个嵌套键值对为:\"是否接受联合体投标\":\"是\""
|
||||
)
|
||||
judge_questions.append(judge_consortium_question)
|
||||
|
||||
file_id=upload_file(truncate0)
|
||||
res2 = multi_threading(judge_questions, "",file_id,2) #调用千问-long
|
||||
file_id = upload_file(truncate0)
|
||||
res2 = multi_threading(judge_questions, "", file_id, 2) # 调用千问-long
|
||||
|
||||
if not res2:
|
||||
print("基础信息整合: multi_threading errror!")
|
||||
print("基础信息整合: multi_threading error!")
|
||||
else:
|
||||
# 打印结果
|
||||
for question, response in res2:
|
||||
baseinfo_list.append(response)
|
||||
# for _, response in res2: # _占位,代表ques;response[0]也是ques;response[1]是ans #调用百炼rag
|
||||
# try:
|
||||
# if response and len(response) > 1: # 检查response存在且有至少两个元素
|
||||
# baseinfo_list.append(response[1])
|
||||
# else:
|
||||
# print(f"基础信息整合: Warning: Missing or incomplete response data for query index {_}.")
|
||||
# except Exception as e:
|
||||
# print(f"基础信息整合: Error processing response for query index {_}: {e}")
|
||||
|
||||
rebidding_situation = extract_from_notice(clause_path, 3) #"重新招标, 不再招标和终止招标"需从投标人须知正文提取
|
||||
|
||||
update_json=rename_outer_key(rebidding_situation,"重新招标、不再招标和终止招标")
|
||||
rebidding_situation = extract_from_notice(clause_path, 3) # "重新招标, 不再招标和终止招标"需从投标人须知正文提取
|
||||
update_json = rename_outer_key(rebidding_situation, "重新招标、不再招标和终止招标")
|
||||
baseinfo_list.append(update_json)
|
||||
|
||||
update_baseinfo_list=combine_basic_info(baseinfo_list) #整合基础信息核心代码
|
||||
aggregated_baseinfo = aggregate_basic_info(baseinfo_list) # 整合基础信息核心代码
|
||||
baseinfo_combined_res = combine_json_results(aggregated_baseinfo) # 返回值是字典
|
||||
|
||||
baseinfo_combined_res = combine_json_results(update_baseinfo_list) # 返回值是字典
|
||||
# return nest_json_under_key(baseinfo_combined_res, "基础信息") #返回值是json字符串
|
||||
return {"基础信息":baseinfo_combined_res}
|
||||
return {"基础信息": baseinfo_combined_res}
|
||||
|
||||
if __name__ == "__main__":
|
||||
knowledge_name = "ztb"
|
||||
output_folder="C:\\Users\Administrator\Desktop\\fsdownload\\3424b7cb-1f85-44b4-a432-44539b870405"
|
||||
truncate0="C:\\Users\Administrator\Desktop\\fsdownload\\3424b7cb-1f85-44b4-a432-44539b870405\\ztbfile_tobidders_notice_table.pdf"
|
||||
clause_path="C:\\Users\Administrator\Desktop\\fsdownload\\3424b7cb-1f85-44b4-a432-44539b870405\\clause1.json"
|
||||
res=project_basic_info(knowledge_name,truncate0,output_folder,clause_path)
|
||||
res=combine_basic_info(knowledge_name,truncate0,output_folder,clause_path)
|
||||
print(json.dumps(res,ensure_ascii=False,indent=4))
|
||||
|
||||
|
||||
|
@ -1,10 +1,12 @@
|
||||
# 基于知识库提问的通用模板,
|
||||
# assistant_id
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import queue
|
||||
import concurrent.futures
|
||||
import time
|
||||
|
||||
import requests
|
||||
from dashscope import Assistants, Messages, Runs, Threads
|
||||
from llama_index.indices.managed.dashscope import DashScopeCloudRetriever
|
||||
from flask_app.main.通义千问long import qianwen_long, upload_file
|
||||
@ -100,6 +102,98 @@ def rag_assistant(knowledge_name):
|
||||
)
|
||||
return assistant
|
||||
|
||||
#TODO:http格式,有bug还没修改
|
||||
def create_assistant(knowledge_name, timeout=60):
    """
    Create a DashScope assistant for a knowledge base via a raw HTTP request.

    The pipeline ID of the knowledge base is looked up through
    ``DashScopeCloudRetriever`` and embedded in a ``rag`` tool definition,
    then the assistant is created with a POST to the DashScope assistants
    endpoint.

    Parameters:
        knowledge_name (str): The name of the knowledge base to associate
            with the assistant.
        timeout (float | tuple | None): Timeout in seconds passed to
            ``requests.post``. Defaults to 60 so that a stalled connection
            cannot block the calling worker forever; pass ``None`` to wait
            indefinitely (the previous behaviour).

    Returns:
        dict | None: The decoded JSON response describing the assistant, or
        ``None`` when the pipeline lookup or the HTTP request fails (the
        failure is printed, not raised).

    Raises:
        ValueError: If the DASHSCOPE_API_KEY environment variable is not set.
    """
    # Step 1: resolve the pipeline ID that backs the knowledge base.
    try:
        retriever = DashScopeCloudRetriever(knowledge_name)
        pipeline_id = str(retriever.pipeline_id)
    except Exception as e:
        print(f"Error retrieving pipeline ID for knowledge '{knowledge_name}': {e}")
        return None

    # Step 2: the API key must come from the environment.
    api_key = os.getenv("DASHSCOPE_API_KEY")
    if not api_key:
        raise ValueError("DASHSCOPE_API_KEY environment variable is not set.")

    # Step 3: endpoint and headers.
    url = 'https://dashscope.aliyuncs.com/api/v1/assistants'
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    # Step 4: system instructions; ${documents} is sent literally as a
    # placeholder inside the instruction text.
    instructions = (
        "请记住以下材料,他们对回答问题有帮助,请你简洁准确地给出回答,不要给出无关内容。${documents}"
    )

    # Step 5: tools exposed to the assistant (code interpreter + RAG lookup).
    tools = [
        {
            "type": "code_interpreter"
        },
        {
            "type": "rag",
            "prompt_ra": {
                "pipeline_id": pipeline_id,
                "parameters": {
                    "type": "object",
                    "properties": {
                        "query_word": {
                            "type": "str",
                            "value": "${documents}"
                        }
                    }
                }
            }
        }
    ]

    # Step 6: request payload.
    payload = {
        "model": "qwen-max",
        "name": "智能小助手",  # "Smart Helper" in Chinese
        "description": "智能助手,支持知识库查询和插件调用。",
        "temperature": 0.3,
        "instructions": instructions,
        "tools": tools,
        "file_ids": [],  # Add file IDs if necessary
        "metadata": {}  # Add metadata if necessary
    }

    # Optional: specific file_ids or metadata can be set on the payload, e.g.
    # payload["file_ids"] = ["file_id_1", "file_id_2"]
    # payload["metadata"] = {"key1": "value1", "key2": "value2"}

    # Step 7: send the request. Failures are reported and mapped to None.
    try:
        response = requests.post(
            url,
            headers=headers,
            data=json.dumps(payload),
            timeout=timeout,
        )
        response.raise_for_status()  # Raises HTTPError for 4xx/5xx responses
        assistant = response.json()
        print("Assistant created successfully:")
        print(json.dumps(assistant, indent=4, ensure_ascii=False))
        return assistant
    except requests.exceptions.HTTPError as http_err:
        # `response` is always bound here: HTTPError comes from raise_for_status().
        print(f"HTTP error occurred: {http_err} - Response: {response.text}")
        return None
    except Exception as err:
        print(f"An error occurred: {err}")
        return None
|
||||
|
||||
|
||||
def pure_assistant():
|
||||
assistant = Assistants.create(
|
||||
@ -119,6 +213,7 @@ def llm_call(question, knowledge_name,file_id, result_queue, ans_index, llm_type
|
||||
if llm_type==1:
|
||||
print(f"rag_assistant! question:{question}")
|
||||
assistant = rag_assistant(knowledge_name)
|
||||
# assistant=create_assistant(knowledge_name)
|
||||
elif llm_type==2:
|
||||
print(f"qianwen_long! question:{question}")
|
||||
qianwen_res = qianwen_long(file_id,question)
|
||||
@ -185,24 +280,25 @@ if __name__ == "__main__":
|
||||
# print(f"Question: {question}")
|
||||
# print(f"Response: {response}")
|
||||
|
||||
# file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\output1\\ztb_evaluation_method.pdf"
|
||||
# file_id = upload_file(file_path)
|
||||
# questions=["根据该文档中的评标办法前附表,请你列出该文件的技术标,以json的格式返回结果","根据该文档中的评标办法前附表,请你列出该文件的商务标,以json的格式返回结果","根据该文档中的评标办法前附表,请你列出该文件的投标报价,以json的格式返回结果"]
|
||||
# results=multi_threading(questions,"",file_id,2) #1代表使用百炼rag 2代表使用qianwen-long
|
||||
# if not results:
|
||||
# print("errror!")
|
||||
# else:
|
||||
# # 打印结果
|
||||
# for question, response in results:
|
||||
# print(f"Question: {question}")
|
||||
# print(f"Response: {response}")
|
||||
ques=["关于'资格要求',本采购文件第一章第二款要求的内容是怎样的?请按json格式给我提供信息,键名为'资格要求',而键值需要完全与原文保持一致,不要擅自总结、删减,如果存在未知信息,请在对应键值处填'未知'。"]
|
||||
# ques=["该招标文件的工程名称(项目名称)是?招标编号是?招标人是?招标代理机构是?请按json格式给我提供信息,键名分别是'工程名称','招标编号','招标人','招标代理机构',若存在未知信息,在对应的键值中填'未知'。","该招标文件的工程概况(或项目概况)是?招标范围是?请按json格式给我提供信息,键名分别为'工程概况','招标范围',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,若存在未知信息,在对应的键值中填'未知'。"]
|
||||
results = multi_threading(ques, "6.2视频会议docx")
|
||||
file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\6.2定版视频会议磋商文件(1)\\6.2定版视频会议磋商文件_1-21.pdf"
|
||||
file_id = upload_file(file_path)
|
||||
questions=["该招标文件的项目名称是?项目编号(或招标编号)是?采购人(或招标人)是?采购代理机构(或招标代理机构)是?请按json格式给我提供信息,键名分别是'项目名称','项目编号','采购人','采购代理机构',若存在未知信息,在对应的键值中填'未知'。","该招标文件的项目概况是?项目基本情况是?请按json格式给我提供信息,键名分别为'项目概况','项目基本情况',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,而嵌套键值必须与原文保持一致,若存在未知信息,在对应的键值中填'未知'。"]
|
||||
results=multi_threading(questions,"",file_id,2) #1代表使用百炼rag 2代表使用qianwen-long
|
||||
if not results:
|
||||
print("errror!")
|
||||
else:
|
||||
# 打印结果
|
||||
for question, response in results:
|
||||
print(f"Question: {question}")
|
||||
print(f"Response: {response}")
|
||||
print(f"Response: {response}")
|
||||
|
||||
# ques=["关于'资格要求',本采购文件第一章第二款要求的内容是怎样的?请按json格式给我提供信息,键名为'资格要求',而键值需要完全与原文保持一致,不要擅自总结、删减,如果存在未知信息,请在对应键值处填'未知'。"]
|
||||
# # ques=["该招标文件的工程名称(项目名称)是?招标编号是?招标人是?招标代理机构是?请按json格式给我提供信息,键名分别是'工程名称','招标编号','招标人','招标代理机构',若存在未知信息,在对应的键值中填'未知'。","该招标文件的工程概况(或项目概况)是?招标范围是?请按json格式给我提供信息,键名分别为'工程概况','招标范围',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,若存在未知信息,在对应的键值中填'未知'。"]
|
||||
# results = multi_threading(ques, "6.2视频会议docx")
|
||||
# if not results:
|
||||
# print("errror!")
|
||||
# else:
|
||||
# # 打印结果
|
||||
# for question, response in results:
|
||||
# print(f"Question: {question}")
|
||||
# print(f"Response: {response}")
|
@ -9,11 +9,11 @@ from flask_app.main.截取pdf import truncate_pdf_multiple
|
||||
from flask_app.main.table_content_extraction import extract_tables_main
|
||||
from flask_app.main.知识库操作 import addfileToKnowledge, deleteKnowledge
|
||||
from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json
|
||||
from flask_app.main.json_utils import nest_json_under_key, transform_json_values, combine_json_results
|
||||
from flask_app.main.json_utils import transform_json_values, combine_json_results
|
||||
from flask_app.main.无效标和废标和禁止投标整合 import combine_find_invalid
|
||||
from flask_app.main.投标人须知正文提取指定内容 import extract_from_notice
|
||||
import concurrent.futures
|
||||
from flask_app.main.基础信息整合 import project_basic_info
|
||||
from flask_app.main.基础信息整合 import combine_basic_info
|
||||
from flask_app.main.资格审查模块 import combine_review_standards
|
||||
from flask_app.main.商务标技术标整合 import combine_evaluation_standards
|
||||
from flask_app.main.format_change import pdf2docx, docx2pdf
|
||||
@ -96,7 +96,7 @@ def post_processing(data,includes):
|
||||
# 基本信息
|
||||
def fetch_project_basic_info(knowledge_name, truncate0, output_folder, clause_path): # 投标人须知前附表
|
||||
logger.info("starting基础信息...")
|
||||
basic_res = project_basic_info(knowledge_name, truncate0, output_folder, clause_path)
|
||||
basic_res = combine_basic_info(knowledge_name, truncate0, output_folder, clause_path)
|
||||
logger.info("基础信息done")
|
||||
return basic_res
|
||||
|
||||
@ -149,18 +149,14 @@ def fetch_invalid_requirements(invalid_docpath, output_folder, truncate0_jsonpat
|
||||
def fetch_bidding_documents_requirements(clause_path):
|
||||
logger.info("starting投标文件要求...")
|
||||
fetch_bidding_documents_requirements_json = extract_from_notice(clause_path, 1)
|
||||
qualify_nested_res = nest_json_under_key(fetch_bidding_documents_requirements_json, "投标文件要求")
|
||||
logger.info("投标文件要求done...")
|
||||
# return qualify_nested_res
|
||||
return {"投标文件要求":fetch_bidding_documents_requirements_json}
|
||||
|
||||
# 开评定标流程
|
||||
def fetch_bid_opening(clause_path):
|
||||
logger.info("starting开评定标流程...")
|
||||
fetch_bid_opening_json = extract_from_notice(clause_path, 2)
|
||||
qualify_nested_res = nest_json_under_key(fetch_bid_opening_json, "开评定标流程")
|
||||
logger.info("开评定标流程done...")
|
||||
# return qualify_nested_res
|
||||
return {"开评定标流程":fetch_bid_opening_json}
|
||||
|
||||
# def main_processing(output_folder, downloaded_file_path, file_type, unique_id): # file_type=1->docx file_type=2->pdf
|
||||
|
@ -65,12 +65,41 @@ def read_tables_from_docx(file_path):
|
||||
# 返回符合条件的单元格内容
|
||||
return cell_contents
|
||||
|
||||
|
||||
def read_docx_by_paragraphs(file_path):
    """
    Read a .docx file paragraph by paragraph.

    Parameters:
        file_path (str): Path of the .docx file to open.

    Returns:
        list: The stripped text of every paragraph whose stripped text is
        non-empty, in document order. An empty list is returned when any
        error occurs while opening or reading the document (the error is
        printed).
    """
    try:
        document = Document(file_path)

        # Keep only paragraphs that still contain text after stripping.
        collected = []
        for paragraph in document.paragraphs:
            stripped = paragraph.text.strip()
            if stripped:
                collected.append(stripped)
        return collected
    except Exception as e:
        print(f"读取 .docx 文件时发生错误: {e}")
        return []
|
||||
|
||||
if __name__ == "__main__":
|
||||
file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\2-招标文件.docx'
|
||||
# output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\tmp" # 前附表json文件
|
||||
# read_docx(file_path)
|
||||
read_docx_tables(file_path)
|
||||
list=read_tables_from_docx(file_path)
|
||||
for i in list:
|
||||
print(i)
|
||||
print("--------------")
|
||||
file_path = 'D:\\flask_project\\flask_app\\static\\output\\015d997e-c32c-49d1-a611-a2e817ace6a1\\ztbfile.docx'
|
||||
read_docx(file_path) #按行读取
|
||||
|
||||
# paragraphs = read_docx_by_paragraphs(file_path) #按段落读取
|
||||
#
|
||||
# print(f"共读取到 {len(paragraphs)} 个段落。\n")
|
||||
# for idx, para in enumerate(paragraphs, 1):
|
||||
# print(f"段落 {idx}: {para}\n")
|
||||
|
||||
# read_docx_tables(file_path)
|
||||
# list=read_tables_from_docx(file_path)
|
||||
# for i in list:
|
||||
# print(i)
|
||||
# print("--------------")
|
26
flask_app/static/提示词/基本信息货物标.txt
Normal file
26
flask_app/static/提示词/基本信息货物标.txt
Normal file
@ -0,0 +1,26 @@
|
||||
1.该招标文件的项目名称是?项目编号(或招标编号)是?采购人(或招标人)是?采购代理机构(或招标代理机构)是?请按json格式给我提供信息,键名分别是'项目名称','项目编号','采购人','采购代理机构',若存在未知信息,在对应的键值中填'未知'。
|
||||
|
||||
2.该招标文件的项目概况是?项目基本情况是?请按json格式给我提供信息,键名分别为'项目概况','项目基本情况',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,而嵌套键值必须与原文保持一致,若存在未知信息,在对应的键值中填'未知'。
|
||||
|
||||
3.该招标文件的最高限价(或招标控制价)是?请按json格式给我提供信息,键名为'招标控制价',若存在未知信息,在对应的键值中填'未知'。
|
||||
|
||||
4.投标文件(或响应文件)递交截止时间是?递交地点(或方式)是?请按json格式给我提供信息,键名分别是'投标文件递交截止日期','递交地点'(或'递交方式'),若存在未知信息,在对应的键值中填'未知'。
|
||||
|
||||
5.采购人(招标人)和采购代理机构(或招标代理机构)的联系方式是?请按json格式给我提供信息,键名分别是'采购人联系方式','采购代理机构联系方式',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,若存在未知信息,在对应的键值中填'未知'。
|
||||
|
||||
6.该招标文件的信息公示媒介在哪?请按json格式给我提供信息,键名是'信息公示媒介',若存在未知信息,在对应的键值中填'未知'。
|
||||
|
||||
7.该招标文件的投标竞争下浮率是多少?请按json格式给我提供信息,键名是'投标竞争下浮率',若存在未知信息,在对应的键值中填'未知'。
|
||||
|
||||
8.该项目的投标有效期(或响应文件有效期)是什么?请按json格式给我提供信息,键名是'投标有效期',若存在未知信息,在对应的键值中填'未知'。
|
||||
|
||||
9.该招标文件对投标人准备和参加投标活动发生的费用是如何规定的?请以json的格式给我提供信息,键名是'费用承担',若存在未知信息,在对应的键值中填'未知'。
|
||||
|
||||
10.投标人要求澄清招标文件的截止时间是?请以json的格式给我提供信息,键名是'投标人要求澄清招标文件的截止时间',若存在未知信息,在对应的键值中填'未知'。
|
||||
|
||||
11.该文档要求扣留的质量保证金百分比是多少,请以json格式给我提供信息,键名为'质量保证金',如果没有则以'未知'填充。
|
||||
|
||||
12.该项目是否接受联合体投标?请按json格式给我提供信息,键名为'是否接受联合体投标','是否接受联合体投标'的键值仅限于'是'、'否'、'未知'。
|
||||
|
||||
|
||||
|
@ -1,4 +1,89 @@
|
||||
import json
|
||||
|
||||
from flask_app.main.json_utils import clean_json_string, nest_json_under_key,rename_outer_key, combine_json_results
|
||||
from flask_app.main.多线程提问 import read_questions_from_file, multi_threading
|
||||
from flask_app.main.通义千问long import upload_file
|
||||
|
||||
|
||||
def dynamic_key_handling(key_groups, detected_keys):
    """
    Route dynamically detected keys into their key group, in place.

    Each detected key is tested against an ordered list of substring
    markers; the first rule that matches decides the group. Keys that match
    no rule are ignored.

    Parameters:
        key_groups (dict): Maps a group name to the list of keys belonging
            to that group. The lists are mutated in place; a target group
            that is missing is created.
        detected_keys (iterable): Key names found in the parsed responses.

    Returns:
        None
    """
    # Ordered rules: (substring markers, target group). The first matching
    # rule wins, in the same precedence as the original if/elif chain.
    routing_rules = (
        (("投标保证金", "履约保证金"), "保证金相关"),
        (("是否接受联合体", "联合体投标要求", "分包"), "项目信息"),
        (("踏勘现场", "投标预备会", "偏离"), "其他信息"),
    )

    for key in detected_keys:
        for markers, group_name in routing_rules:
            if any(marker in key for marker in markers):
                group = key_groups.setdefault(group_name, [])
                # Skip keys the group already lists (e.g. the default
                # "退还投标保证金" also contains "投标保证金"), so no entry
                # is duplicated.
                if key not in group:
                    group.append(key)
                break
|
||||
def aggregate_basic_info(baseinfo_list):
    """
    Merge the collected basic-info answers and bucket them by category.

    Every item is passed through ``clean_json_string`` and merged into one
    dictionary (later items overwrite earlier ones on key clashes). The key
    groups are then extended through ``dynamic_key_handling`` with the keys
    that were actually seen, and each group is wrapped with
    ``nest_json_under_key``.

    Parameters:
        baseinfo_list (list): The individual basic-info results to merge.
            NOTE(review): presumably JSON strings that clean_json_string
            turns into dicts; confirm against that helper.

    Returns:
        list: One entry per key group, in group order, each being the
        return value of ``nest_json_under_key`` for that group. Keys absent
        from the merged data are filled with "未提供".
    """
    # Built fresh on every call because dynamic_key_handling mutates the lists.
    key_groups = {
        "招标人/代理信息": ["招标人", "招标人联系方式", "招标代理机构", "招标代理机构联系方式"],
        "项目信息": ["工程名称", "招标编号", "工程概况", "招标范围", "招标控制价", "投标竞争下浮率"],
        "关键时间/内容": [
            "投标文件递交截止日期",
            "递交方式",
            "投标人要求澄清招标文件的截止时间",
            "投标有效期",
            "评标结果公示媒介",
        ],
        "保证金相关": ["质量保证金", "退还投标保证金"],
        "其他信息": [
            "重新招标、不再招标和终止招标",
            "是否退还投标文件",
            "费用承担",
        ],
    }

    merged = {}
    seen_keys = set()

    # Fold every parsed item into one dict and remember which keys appeared.
    for raw_item in baseinfo_list:
        parsed = clean_json_string(raw_item)
        merged.update(parsed)
        seen_keys.update(parsed.keys())

    # Let the detected keys extend the static groups.
    dynamic_key_handling(key_groups, seen_keys)

    # Build one nested entry per group.
    return [
        nest_json_under_key(
            {field: merged.get(field, "未提供") for field in fields},
            group_name,
        )
        for group_name, fields in key_groups.items()
    ]
|
||||
|
||||
def combine_basic_info(knowledge_name,output_folder,clause_path):
|
||||
# file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\6.2定版视频会议磋商文件(1)\\6.2定版视频会议磋商文件_1-21.pdf"
|
||||
# file_id = upload_file(file_path)
|
||||
# baseinfo_file_path='flask_app/static/提示词/前两章提问总结.txt'
|
||||
# questions=read_questions_from_file(baseinfo_file_path)
|
||||
# results = multi_threading(questions, "", file_id, 2) # 1代表使用百炼rag 2代表使用qianwen-long
|
||||
# if not results:
|
||||
# print("errror!")
|
||||
# else:
|
||||
# # 打印结果
|
||||
# for question, response in results:
|
||||
# print(f"Question: {question}")
|
||||
# print(f"Response: {response}")
|
||||
baseinfo_combined_res={
|
||||
"招标人/代理信息": {
|
||||
"招标人": "黄石临空建设管理有限公司",
|
||||
@ -27,4 +112,12 @@ def combine_basic_info(knowledge_name,output_folder,clause_path):
|
||||
}
|
||||
}
|
||||
}
|
||||
return {"基础信息":baseinfo_combined_res}
|
||||
return {"基础信息":baseinfo_combined_res}
|
||||
|
||||
if __name__ == "__main__":
|
||||
knowledge_name = "ztb"
|
||||
output_folder="C:\\Users\Administrator\Desktop\\fsdownload\\3424b7cb-1f85-44b4-a432-44539b870405"
|
||||
truncate0="C:\\Users\Administrator\Desktop\\fsdownload\\3424b7cb-1f85-44b4-a432-44539b870405\\ztbfile_tobidders_notice_table.pdf"
|
||||
clause_path="C:\\Users\Administrator\Desktop\\fsdownload\\3424b7cb-1f85-44b4-a432-44539b870405\\clause1.json"
|
||||
res=combine_basic_info(knowledge_name,output_folder,clause_path)
|
||||
print(json.dumps(res,ensure_ascii=False,indent=4))
|
@ -57,12 +57,12 @@ def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
|
||||
extracted_paragraphs[active_key] = [text]
|
||||
if match_keywords(text, follow_up_keywords):
|
||||
continue_collecting = True
|
||||
section_number = re.match(r'(\d+(\s*\.\s*\d+)*)', text)
|
||||
section_number = re.match(r'^(\d+(\s*\.\s*\d+)*)\s*、', text) # 修改后的正则,支持 '数字 、' 格式
|
||||
if section_number:
|
||||
current_section_number = section_number.group(1)
|
||||
level_count = current_section_number.count('.')
|
||||
|
||||
# Pattern to match current level, e.g., 3.4.5
|
||||
# Pattern to match current level, e.g., 3.4.5 或者 3
|
||||
pattern = r'^' + (r'\d+\s*\.\s*') * level_count + r'\d+'
|
||||
|
||||
# Generate patterns for next section at same level and parent level
|
||||
@ -71,20 +71,23 @@ def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
|
||||
|
||||
# Next section at same level
|
||||
parts[-1] = str(int(parts[-1]) + 1)
|
||||
next_pattern = r'^' + r'\s*\.\s*'.join(parts)
|
||||
next_pattern = r'^' + r'\.\s*'.join(parts)
|
||||
matched_patterns.append(next_pattern)
|
||||
|
||||
# Parent section (if applicable)
|
||||
if len(parts) > 1:
|
||||
parent_section_parts = parts[:-1]
|
||||
parent_section_parts[-1] = str(int(parent_section_parts[-1]) + 1)
|
||||
parent_pattern = r'^' + r'\s*\.\s*'.join(parent_section_parts)
|
||||
parent_pattern = r'^' + r'\.\s*'.join(parent_section_parts)
|
||||
matched_patterns.append(parent_pattern)
|
||||
|
||||
# 添加对 '数字 、' 格式的支持
|
||||
digit_comma_pattern = r'^\d+\s*、'
|
||||
matched_patterns.append(digit_comma_pattern)
|
||||
|
||||
# Combine the patterns
|
||||
combined_pattern = r'(' + r')|('.join(matched_patterns) + r')'
|
||||
current_section_pattern = re.compile(combined_pattern)
|
||||
|
||||
else:
|
||||
found_next_number = False
|
||||
current_section_pattern = None
|
||||
@ -93,7 +96,8 @@ def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
|
||||
current_index += 1
|
||||
next_text = doc.paragraphs[current_index].text.strip()
|
||||
if not found_next_number:
|
||||
next_section_number = re.match(r'^([A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)|(\(\d+\))', next_text)
|
||||
# 修改后的正则,支持 '数字 、' 格式
|
||||
next_section_number = re.match(r'^([A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)|(\(\d+\))|(\d+\s*、)', next_text)
|
||||
if next_section_number:
|
||||
found_next_number = True
|
||||
if next_section_number.group(1):
|
||||
@ -101,12 +105,14 @@ def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
|
||||
dynamic_pattern = r'^' + r'\.'.join([r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b'
|
||||
elif next_section_number.group(2):
|
||||
dynamic_pattern = r'^[\(\(]\d+[\)\)]'
|
||||
elif next_section_number.group(3):
|
||||
dynamic_pattern = r'^\d+\s*、'
|
||||
current_section_pattern = re.compile(dynamic_pattern)
|
||||
if current_section_pattern and re.match(current_section_pattern, next_text):
|
||||
extracted_paragraphs[active_key].append(next_text)
|
||||
else:
|
||||
continue_collecting = False
|
||||
active_key=None
|
||||
active_key = None
|
||||
break
|
||||
|
||||
return current_index
|
||||
@ -118,6 +124,8 @@ def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
|
||||
|
||||
return extracted_paragraphs
|
||||
|
||||
|
||||
|
||||
"""
|
||||
eg:
|
||||
text_list = ["这是第一句。 1. 接下来是第二句! (3) 最后一句。"]
|
||||
@ -143,6 +151,7 @@ def clean_dict_datas(extracted_contents, keywords,excludes): #让正则表达
|
||||
for key, text_list in extracted_contents.items():
|
||||
if len(text_list) == 1:
|
||||
for data in text_list:
|
||||
# print(data)
|
||||
# 检查是否包含任何需要排除的字符串
|
||||
if any(exclude in data for exclude in excludes):
|
||||
continue # 如果包含任何排除字符串,跳过这个数据
|
||||
|
@ -464,9 +464,10 @@ def truncate_pdf_multiple(input_path, output_folder):
|
||||
|
||||
# TODO:交通智能系统和招标(1)(1)文件有问题 sele=4的时候excludsion有问题
|
||||
if __name__ == "__main__":
|
||||
input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
|
||||
output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\outputtest"
|
||||
# files=truncate_pdf_multiple(input_path,output_folder)
|
||||
# print(files)
|
||||
selection = 1 # 例如:1 - 商务技术服务要求, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2
|
||||
generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
||||
input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b4601ea1-f087-4fa2-88ae-336ad4d8e1e9\\ztbfile.pdf"
|
||||
output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b4601ea1-f087-4fa2-88ae-336ad4d8e1e9"
|
||||
files=truncate_pdf_multiple(input_path,output_folder)
|
||||
print(files)
|
||||
|
||||
# selection = 1 # 例如:1 - 商务技术服务要求, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2
|
||||
# generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
||||
|
@ -36,6 +36,9 @@ def preprocess_files(output_folder, file_path, file_type, unique_id):
|
||||
elif file_type == 2: # pdf
|
||||
pdf_path = file_path
|
||||
docx_path = pdf2docx(pdf_path) # 将pdf转换为docx以供上传到知识库
|
||||
elif file_type ==3: #doc
|
||||
pdf_path=docx2pdf(file_path)
|
||||
docx_path=pdf2docx(pdf_path)
|
||||
else:
|
||||
logger.error("Unsupported file type provided. Preprocessing halted.")
|
||||
return None
|
||||
|
Loading…
x
Reference in New Issue
Block a user