zbparse/flask_app/main/多线程提问.py
2024-10-16 20:18:55 +08:00

323 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# 基于知识库提问的通用模板,
# assistant_id
import json
import os
import re
import queue
import concurrent.futures
import time
import requests
from dashscope import Assistants, Messages, Runs, Threads
from llama_index.indices.managed.dashscope import DashScopeCloudRetriever
from flask_app.main.通义千问long import qianwen_long, upload_file
prompt = """
# 角色
你是一个文档处理专家,专门负责理解和操作基于特定内容的文档任务,这包括解析、总结、搜索或生成与给定文档相关的各类信息。
## 技能
### 技能 1文档解析与摘要
- 深入理解并分析${documents}的内容,提取关键信息。
- 根据需求生成简洁明了的摘要,保持原文核心意义不变。
### 技能 2信息检索与关联
- 在${documents}中高效检索特定信息或关键词。
- 能够识别并链接到文档内部或外部的相关内容,增强信息的连贯性和深度。
## 限制
- 所有操作均需基于${documents}的内容,不可超出此范围创造信息。
- 在处理敏感或机密信息时,需遵守严格的隐私和安全规定。
- 确保所有生成或改编的内容逻辑连贯,无误导性信息。
请注意,上述技能执行时将直接利用并参考${documents}的具体内容,以确保所有产出紧密相关且高质量。
"""
prom = '请记住以下材料,他们对回答问题有帮助,请你简洁准确地给出回答,不要给出无关内容。${documents}'
def read_questions_from_file(file_path):
questions = []
current_question = ""
current_number = 0
with open(file_path, 'r', encoding='utf-8') as file:
for line in file:
line = line.strip()
if not line: # 跳过空行
continue
# 检查是否是新的问题编号
match = re.match(r'^(\d+)\.', line)
if match:
# 如果有之前的问题,保存它
if current_question:
questions.append(current_question.strip())
# 开始新的问题
current_number = int(match.group(1))
current_question = line.split('.', 1)[1].strip() + "\n"
else:
# 继续添加到当前问题
current_question += line + "\n"
# 添加最后一个问题
if current_question:
questions.append(current_question.strip())
return questions
#正文和文档名之间的内容
def send_message(assistant, message='百炼是什么?'):
ans = []
print(f"Query: {message}")
# create thread.
thread = Threads.create()
print(thread)
# create a message.
message = Messages.create(thread.id, content=message)
# create run
run = Runs.create(thread.id, assistant_id=assistant.id)
# print(run)
# wait for run completed or requires_action
run_status = Runs.wait(run.id, thread_id=thread.id)
# print(run_status)
# get the thread messages.
msgs = Messages.list(thread.id)
for message in msgs['data'][::-1]:
ans.append(message['content'][0]['text']['value'])
return ans
def rag_assistant(knowledge_name):
retriever = DashScopeCloudRetriever(knowledge_name)
pipeline_id = str(retriever.pipeline_id)
assistant = Assistants.create(
model='qwen-max',
name='smart helper',
description='智能助手,支持知识库查询和插件调用。',
temperature='0.3',
instructions="请记住以下材料,他们对回答问题有帮助,请你简洁准确地给出回答,不要给出无关内容。${documents}",
tools=[
{
"type": "code_interpreter"
},
{
"type": "rag",
"prompt_ra": {
"pipeline_id": pipeline_id,
"parameters": {
"type": "object",
"properties": {
"query_word": {
"type": "str",
"value": "${documents}"
}
}
}
}
}]
)
return assistant
#TODO:http格式有bug还没修改
def create_assistant(knowledge_name):
"""
Create an assistant using DashScope API via HTTP request based on the provided knowledge name.
Parameters:
knowledge_name (str): The name of the knowledge base to associate with the assistant.
Returns:
dict: Response from the API containing assistant details.
Raises:
ValueError: If the DASHSCOPE_API_KEY environment variable is not set.
Exception: If any error occurs during the HTTP request.
"""
# Step 1: Initialize the Retriever and get the Pipeline ID
try:
retriever = DashScopeCloudRetriever(knowledge_name)
pipeline_id = str(retriever.pipeline_id)
except Exception as e:
print(f"Error retrieving pipeline ID for knowledge '{knowledge_name}': {e}")
return None
# Step 2: Fetch the API Key from Environment Variables
api_key = os.getenv("DASHSCOPE_API_KEY")
if not api_key:
raise ValueError("DASHSCOPE_API_KEY environment variable is not set.")
# Step 3: Define the API Endpoint and Headers
url = 'https://dashscope.aliyuncs.com/api/v1/assistants'
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {api_key}"
}
# Step 4: Construct the Instructions
instructions = (
"请记住以下材料,他们对回答问题有帮助,请你简洁准确地给出回答,不要给出无关内容。${documents}"
)
# Step 5: Define the Tools
tools = [
{
"type": "code_interpreter"
},
{
"type": "rag",
"prompt_ra": {
"pipeline_id": pipeline_id,
"parameters": {
"type": "object",
"properties": {
"query_word": {
"type": "str",
"value": "${documents}"
}
}
}
}
}
]
# Step 6: Construct the Payload
payload = {
"model": "qwen-max",
"name": "智能小助手", # "Smart Helper" in Chinese
"description": "智能助手,支持知识库查询和插件调用。",
"temperature": 0.3,
"instructions": instructions,
"tools": tools,
"file_ids": [], # Add file IDs if necessary
"metadata": {} # Add metadata if necessary
}
# Optional: If you have specific file_ids or metadata, you can modify the payload accordingly
# For example:
# payload["file_ids"] = ["file_id_1", "file_id_2"]
# payload["metadata"] = {"key1": "value1", "key2": "value2"}
# Step 7: Make the HTTP POST Request
try:
response = requests.post(url, headers=headers, data=json.dumps(payload))
response.raise_for_status() # Raises HTTPError for bad responses (4xx or 5xx)
assistant = response.json()
print("Assistant created successfully:")
print(json.dumps(assistant, indent=4, ensure_ascii=False))
return assistant
except requests.exceptions.HTTPError as http_err:
print(f"HTTP error occurred: {http_err} - Response: {response.text}")
except Exception as err:
print(f"An error occurred: {err}")
def pure_assistant():
assistant = Assistants.create(
model='qwen-max',
name='smart helper',
description='智能助手,能基于用户的要求精准简洁地回答用户的提问',
instructions='智能助手,能基于用户的要求精准简洁地回答用户的提问',
tools=[
{
"type": "code_interpreter"
},
]
)
return assistant
def llm_call(question, knowledge_name,file_id, result_queue, ans_index, llm_type):
if llm_type==1:
print(f"rag_assistant! question:{question}")
assistant = rag_assistant(knowledge_name)
# assistant=create_assistant(knowledge_name)
elif llm_type==2:
print(f"qianwen_long! question:{question}")
qianwen_res = qianwen_long(file_id,question)
result_queue.put((ans_index,(question,qianwen_res)))
return
else :
assistant = pure_assistant()
ans = send_message(assistant, message=question)
result_queue.put((ans_index, (question, ans))) # 在队列中添加索引 (question, ans)
def multi_threading(queries, knowledge_name="", file_id="", llm_type=1):
if not queries:
return []
print("多线程提问starting multi_threading...")
result_queue = queue.Queue()
# 使用 ThreadPoolExecutor 管理线程
with concurrent.futures.ThreadPoolExecutor(max_workers=15) as executor:
# 逐个提交任务每提交一个任务后休眠1秒
future_to_query = {}
for index, query in enumerate(queries):
future = executor.submit(llm_call, query, knowledge_name, file_id, result_queue, index, llm_type)
future_to_query[future] = index
time.sleep(1) # 每提交一个任务后等待1秒
# 收集每个线程的结果
for future in concurrent.futures.as_completed(future_to_query):
index = future_to_query[future]
try:
future.result() # 捕获异常或确认任务完成
except Exception as exc:
print(f"Query {index} generated an exception: {exc}")
# 确保在异常情况下也向 result_queue 添加占位符
result_queue.put((index, None))
# 从队列中获取所有结果并按索引排序
results = [None] * len(queries)
while not result_queue.empty():
index, result = result_queue.get()
results[index] = result
# 检查是否所有结果都是 None
if all(result is None for result in results):
return []
# 过滤掉None值
results = [r for r in results if r is not None]
# 返回一个保证是列表的结构
return results
if __name__ == "__main__":
start_time=time.time()
# # 读取问题列表
baseinfo_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\前两章提问总结.txt'
questions =read_questions_from_file(baseinfo_file_path)
knowledge_name = "招标解析5word"
llm_type=1
results = multi_threading(questions, knowledge_name)
end_time = time.time()
if not results:
print("errror!")
else:
print("elapsed time:"+str(end_time-start_time))
# 打印结果
for question, response in results:
print(f"Response: {response}")
# file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\6.2定版视频会议磋商文件(1)\\6.2定版视频会议磋商文件_1-21.pdf"
# file_id = upload_file(file_path)
# questions=["该招标文件的项目名称是项目编号或招标编号采购人或招标人采购代理机构或招标代理机构请按json格式给我提供信息键名分别是'项目名称','项目编号','采购人','采购代理机构',若存在未知信息,在对应的键值中填'未知'。","该招标文件的项目概况是项目基本情况是请按json格式给我提供信息键名分别为'项目概况','项目基本情况',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,而嵌套键值必须与原文保持一致,若存在未知信息,在对应的键值中填'未知'。"]
# results=multi_threading(questions,"",file_id,2) #1代表使用百炼rag 2代表使用qianwen-long
# if not results:
# print("errror!")
# else:
# # 打印结果
# for question, response in results:
# print(f"Question: {question}")
# print(f"Response: {response}")
# ques=["关于'资格要求',本采购文件第一章第二款要求的内容是怎样的请按json格式给我提供信息键名为'资格要求',而键值需要完全与原文保持一致,不要擅自总结、删减,如果存在未知信息,请在对应键值处填'未知'。"]
# # ques=["该招标文件的工程名称项目名称招标编号是招标人是招标代理机构是请按json格式给我提供信息键名分别是'工程名称','招标编号','招标人','招标代理机构',若存在未知信息,在对应的键值中填'未知'。","该招标文件的工程概况或项目概况招标范围是请按json格式给我提供信息键名分别为'工程概况','招标范围',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,若存在未知信息,在对应的键值中填'未知'。"]
# results = multi_threading(ques, "6.2视频会议docx")
# if not results:
# print("errror!")
# else:
# # 打印结果
# for question, response in results:
# print(f"Question: {question}")
# print(f"Response: {response}")