217 lines
8.5 KiB
Python
217 lines
8.5 KiB
Python
#基于多线程提问,现已废弃
|
||
# assistant_id
|
||
import queue
|
||
import concurrent.futures
|
||
from dashscope import Assistants, Messages, Runs, Threads
|
||
from llama_index.indices.managed.dashscope import DashScopeCloudRetriever
|
||
from json_utils import extract_content_from_json
|
||
# Long-form system prompt describing a document-processing expert persona.
# The text `${document1}` is stored literally in the string; presumably the
# DashScope service substitutes retrieved document content for it -- TODO confirm.
# No function visible in this file references `prompt`.
prompt = """
# 角色
你是一个文档处理专家,专门负责理解和操作基于特定内容的文档任务,这包括解析、总结、搜索或生成与给定文档相关的各类信息。

## 技能
### 技能 1:文档解析与摘要
- 深入理解并分析${document1}的内容,提取关键信息。
- 根据需求生成简洁明了的摘要,保持原文核心意义不变。

### 技能 2:信息检索与关联
- 在${document1}中高效检索特定信息或关键词。
- 能够识别并链接到文档内部或外部的相关内容,增强信息的连贯性和深度。

## 限制
- 所有操作均需基于${document1}的内容,不可超出此范围创造信息。
- 在处理敏感或机密信息时,需遵守严格的隐私和安全规定。
- 确保所有生成或改编的内容逻辑连贯,无误导性信息。

请注意,上述技能执行时将直接利用并参考${document1}的具体内容,以确保所有产出紧密相关且高质量。
"""

# Short instruction template; this is the value rag_assistant passes as
# `instructions` when it creates the assistant.
prom = '请记住以下材料,他们对回答问题有帮助,请你简洁准确地给出回答,不要给出无关内容。${document1}'
|
||
|
||
|
||
|
||
|
||
#正文和文档名之间的内容
|
||
def extract_content_between_tags(text):
    """Extract the snippet that follows each '【正文】' marker in ``text``.

    The text is split on '【正文】' and everything before the first marker is
    discarded. Each remaining segment is cut at the first '【文档名】' marker;
    if there is none, at the first ``'file_ids'`` token (single quotes
    included); otherwise it is kept whole. The final segment is special:
    whenever it contains ``'file_ids'`` it is cut there, even if '【文档名】'
    also occurs in it. Surrounding whitespace is stripped from every snippet.

    Args:
        text: The string to scan.

    Returns:
        A list with one stripped snippet per '【正文】' marker, in order of
        appearance. The list is empty when ``text`` contains no marker
        (previously this case raised IndexError).
    """
    body_marker = '【正文】'
    name_marker = '【文档名】'
    file_ids_marker = "'file_ids'"

    # Drop the chunk before the first body marker: it holds no body text.
    segments = text.split(body_marker)[1:]
    last_position = len(segments) - 1

    snippets = []
    for position, segment in enumerate(segments):
        name_at = segment.find(name_marker)
        ids_at = segment.find(file_ids_marker)

        if position == last_position and ids_at != -1:
            # The last segment always stops at 'file_ids' when it is present.
            end = ids_at
        elif name_at != -1:
            end = name_at
        elif ids_at != -1:
            end = ids_at
        else:
            end = len(segment)

        snippets.append(segment[:end].strip())

    return snippets
|
||
|
||
def find_references_in_extracted(formatted_ans, extracted_references):
    """Locate, for every leaf of a nested answer dict, the reference containing it.

    Nested dicts are walked recursively and the keys along the way are joined
    with '.' to form the result key.

    Args:
        formatted_ans: A (possibly nested) dict whose keys are strings.
        extracted_references: A sequence of reference strings to search.

    Returns:
        A dict mapping each dotted key path to:
        * ``-1`` when the leaf value equals '未知';
        * the index of the first reference that contains ``str(value)``;
        * ``None`` when no reference contains it.
    """
    results = {}

    def walk(node, prefix=()):
        # An immutable tuple default replaces the original mutable `path=[]`.
        for key, value in node.items():
            path = prefix + (key,)
            if isinstance(value, dict):
                walk(value, path)
                continue

            dotted = '.'.join(path)
            if value == '未知':
                # "Unknown" answers are flagged with -1 rather than searched.
                results[dotted] = -1
                continue

            needle = str(value)  # leaves may be non-string (e.g. numbers)
            results[dotted] = next(
                (
                    position
                    for position, reference in enumerate(extracted_references)
                    if needle in reference
                ),
                None,
            )

    walk(formatted_ans)
    return results
|
||
|
||
def send_message(assistant, message='百炼是什么?'):
    """Ask ``assistant`` one question on a fresh thread and collect the outcome.

    Args:
        assistant: Object whose ``id`` is passed to ``Runs.create``.
        message: The question text posted to the new thread.

    Returns:
        A tuple ``(ans, extracted_references)``:
        * ``ans`` -- the text value of the first content item of every message
          returned by ``Messages.list``, in the reverse of the listed order;
        * ``extracted_references`` -- the snippets that
          ``extract_content_between_tags`` pulls out of the string form of
          the finished run.
    """
    print(f"Query: {message}")

    # Every question gets its own thread.
    conversation = Threads.create()
    print(conversation)

    # Post the question, then start a run for the given assistant.
    Messages.create(conversation.id, content=message)
    started_run = Runs.create(conversation.id, assistant_id=assistant.id)
    print(started_run)

    # Block until the run is completed (or requires action).
    finished_run = Runs.wait(started_run.id, thread_id=conversation.id)
    print(finished_run)

    # The cited source passages are parsed out of the run's string form.
    extracted_references = extract_content_between_tags(str(finished_run))

    listing = Messages.list(conversation.id)
    ans = [
        entry['content'][0]['text']['value']
        for entry in listing['data'][::-1]
    ]
    return ans, extracted_references
|
||
|
||
def rag_assistant(knowledge_name):
    """Create a 'qwen-max' assistant with a code-interpreter tool and a RAG tool.

    Args:
        knowledge_name: Passed to ``DashScopeCloudRetriever``; the retriever's
            ``pipeline_id`` (as a string) is wired into the RAG tool.

    Returns:
        The object returned by ``Assistants.create``.
    """
    pipeline_id = str(DashScopeCloudRetriever(knowledge_name).pipeline_id)

    code_tool = {"type": "code_interpreter"}
    rag_tool = {
        "type": "rag",
        "prompt_ra": {
            "pipeline_id": pipeline_id,
            "parameters": {
                "type": "object",
                "properties": {
                    "query_word": {
                        "type": "str",
                        "value": "${document1}",
                    },
                },
            },
        },
    }

    # NOTE(review): temperature is passed as the string '0.3', exactly as
    # before -- confirm the API accepts a string here.
    return Assistants.create(
        model='qwen-max',
        name='smart helper',
        description='智能助手,支持知识库查询和插件调用。',
        temperature='0.3',
        instructions=prom,
        tools=[code_tool, rag_tool],
    )
|
||
|
||
|
||
def pure_assistant():
    """Create a 'qwen-max' assistant that has only the code-interpreter tool.

    The same sentence is used for both ``description`` and ``instructions``.

    Returns:
        The object returned by ``Assistants.create``.
    """
    role_text = '智能助手,能基于用户的要求精准简洁地回答用户的提问'
    return Assistants.create(
        model='qwen-max',
        name='smart helper',
        description=role_text,
        instructions=role_text,
        tools=[{"type": "code_interpreter"}],
    )
|
||
|
||
def llm_call(question, knowledge_name, result_queue, ans_index, use_rag=True):
    """Answer one question and push the outcome onto ``result_queue``.

    Args:
        question: The question text sent to the assistant.
        knowledge_name: Knowledge-base name used when ``use_rag`` is true.
        result_queue: Queue that receives ``(ans_index, (question, ans))``.
        ans_index: Index stored with the result so the caller can restore order.
        use_rag: Use ``rag_assistant`` when true, ``pure_assistant`` otherwise.

    Returns:
        None. The result is delivered through ``result_queue``.
    """
    assistant = rag_assistant(knowledge_name) if use_rag else pure_assistant()
    ans, extracted_references = send_message(assistant, message=question)

    for position, reference in enumerate(extracted_references):
        print(f"{position}. {reference}")

    # ans[1] is the second collected message -- presumably the assistant's
    # reply following the user's question; verify against send_message.
    formatted_ans = extract_content_from_json(ans[1])
    print(formatted_ans)

    matches = find_references_in_extracted(formatted_ans, extracted_references)
    for key, found_at in matches.items():
        print(f"{key}: Found at index {found_at}")

    # Tag the payload with its index so results can be re-ordered later.
    result_queue.put((ans_index, (question, ans)))
|
||
|
||
def multi_threading(queries, knowledge_name, use_rag=True):
    """Run ``llm_call`` for every query on a pool of three worker threads.

    Args:
        queries: Sequence of question strings.
        knowledge_name: Forwarded to ``llm_call``.
        use_rag: Forwarded to ``llm_call``.

    Returns:
        A list as long as ``queries``. Slot ``i`` holds the payload that the
        worker for query ``i`` put on the queue, or ``None`` if that worker
        put nothing (for example because it raised).
    """
    result_queue = queue.Queue()

    with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
        # Remember which query index each future belongs to.
        future_to_query = {}
        for position, query in enumerate(queries):
            task = executor.submit(
                llm_call, query, knowledge_name, result_queue, position, use_rag
            )
            future_to_query[task] = position

        # llm_call delivers its result through the queue; result() is called
        # only to wait for completion and to surface exceptions.
        for finished in concurrent.futures.as_completed(future_to_query):
            position = future_to_query[finished]
            try:
                finished.result()
            except Exception as exc:
                print(f"Query {position} generated an exception: {exc}")

    # Drain the queue and place every payload at its original index.
    results = [None] * len(queries)
    while not result_queue.empty():
        slot, payload = result_queue.get()
        results[slot] = payload
    return results
|
||
|
||
if __name__ == "__main__":
    # Question list (a single hard-coded question).
    questions = ["该招标文件的工程概况(或项目概况)是?招标范围是?请按json格式给我提供信息,键名分别为'工程概况','招标范围',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,若存在未知信息,在对应的键值中填'未知'。"]
    knowledge_name = "招标解析5word"
    results = multi_threading(questions, knowledge_name, use_rag=True)

    # Print the results. multi_threading leaves None in the slot of any query
    # whose worker raised, so guard before reading the payload instead of
    # unpacking blindly (which raised TypeError on a failed query).
    for question, outcome in zip(questions, results):
        print(f"Question: {question}")
        if outcome is None:
            print("Response: <failed, no answer collected>")
        else:
            # outcome is the (question, ans) pair queued by llm_call.
            print(f"Response: {outcome[1]}")
|