12.20 豆包大模型bug解决
This commit is contained in:
parent
9c3c6d889e
commit
5dcbaa5eb5
@ -1,5 +1,8 @@
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
|
||||
import fitz
|
||||
import PyPDF2
|
||||
import tempfile
|
||||
@ -184,6 +187,24 @@ def read_txt_to_string(file_path):
|
||||
except Exception as e:
|
||||
return f"错误:读取文件时发生错误。详细信息:{e}"
|
||||
|
||||
def count_tokens(text):
    """
    Count the tokens in ``text`` using a regex-based splitting rule.

    Tokenisation rules (whitespace is never counted):
      1. A run of ASCII letters and/or digits is one token (e.g. ``DN90``).
      2. Such a run may absorb a following decimal part and a trailing
         percent sign, so ``0.25%`` is a single token.
      3. Each Chinese character in the range U+4E00..U+9FFF is one token.
      4. Any other single non-whitespace character (symbols, punctuation)
         is one token.

    Args:
        text: The string to measure. ``None`` or an empty string is treated
            as containing no tokens instead of raising ``TypeError``.

    Returns:
        int: The number of tokens matched. Only the count is returned, not
        the matched tokens themselves.
    """
    # Guard against None / empty input so callers that forward an absent
    # query get 0 rather than a TypeError from re.findall.
    if not text:
        return 0

    # Alternatives are tried left to right:
    #   - letters/digits with optional ".digits" and optional "%"  (DN90, 0.25%)
    #   - a single Chinese character
    #   - any other single non-whitespace character
    token_pattern = r'[a-zA-Z0-9]+(?:\.\d+)?%?|[\u4e00-\u9fff]|[^\s]'
    return len(re.findall(token_pattern, text))
|
||||
|
||||
@sleep_and_retry
|
||||
@limits(calls=10, period=1) # 每秒最多调用10次
|
||||
def doubao_model(full_user_query):
|
||||
@ -199,7 +220,8 @@ def doubao_model(full_user_query):
|
||||
}
|
||||
|
||||
# 判断用户查询字符串的长度
|
||||
if len(full_user_query) > 32000:
|
||||
token_count = count_tokens(full_user_query)
|
||||
if token_count > 35000:
|
||||
selected_model = models["pro_128k"] # 如果长度超过32k,直接使用128k模型
|
||||
else:
|
||||
selected_model = models["pro_32k"] # 默认使用32k模型
|
||||
@ -210,10 +232,12 @@ def doubao_model(full_user_query):
|
||||
"Authorization": "Bearer " + doubao_api_key
|
||||
}
|
||||
|
||||
max_retries = 1 # 最大重试次数
|
||||
max_retries_429 = 2 # 针对 429 错误的最大重试次数
|
||||
max_retries_other = 1 # 针对其他错误的最大重试次数
|
||||
attempt = 0
|
||||
response = None # 确保 response 被定义
|
||||
|
||||
while attempt <= max_retries:
|
||||
while True:
|
||||
# 请求数据
|
||||
data = {
|
||||
"model": selected_model,
|
||||
@ -231,13 +255,35 @@ def doubao_model(full_user_query):
|
||||
# 返回模型的回复内容
|
||||
return response.json()["choices"][0]["message"]["content"]
|
||||
except requests.exceptions.RequestException as e:
|
||||
attempt += 1
|
||||
if attempt > max_retries:
|
||||
print(f"请求失败,尝试了 {attempt} 次。错误信息:{e}")
|
||||
return None # 或者根据需要返回其他默认值
|
||||
else:
|
||||
print(f"请求出错:{e}。正在尝试第 {attempt} 次重试,继续使用模型 {selected_model}...")
|
||||
# 获取状态码并处理不同的重试逻辑
|
||||
status_code = response.status_code if response is not None else None
|
||||
print(f"请求失败,状态码: {status_code}")
|
||||
print("请求失败,完整的响应内容如下:")
|
||||
print(response.text) # 打印原始的响应内容,可能是 JSON 格式,也可能是其他格式
|
||||
|
||||
# 如果是 429 错误
|
||||
if status_code == 429:
|
||||
if attempt < max_retries_429:
|
||||
wait_time = 2 if attempt == 0 else 4
|
||||
print(f"状态码为 429,等待 {wait_time} 秒后重试...")
|
||||
time.sleep(wait_time)
|
||||
else:
|
||||
print(f"状态码为 429,已达到最大重试次数 {max_retries_429} 次。")
|
||||
break # 超过最大重试次数,退出循环
|
||||
else:
|
||||
# 针对其他错误
|
||||
if attempt < max_retries_other:
|
||||
print("非 429 错误,等待 1 秒后重试...")
|
||||
time.sleep(1)
|
||||
else:
|
||||
print(f"非 429 错误,已达到最大重试次数 {max_retries_other} 次。")
|
||||
break # 超过最大重试次数,退出循环
|
||||
|
||||
attempt += 1 # 增加重试计数
|
||||
|
||||
# 如果到这里,说明所有尝试都失败了
|
||||
print(f"请求失败,已达到最大重试次数。")
|
||||
return None
|
||||
|
||||
def generate_full_user_query(file_path, prompt_template):
|
||||
"""
|
||||
|
@ -108,7 +108,7 @@ def extract_content_from_json(string, length_threshold=5000):
|
||||
2. 如果失败,并且字符串长度超过阈值,返回原始字符串。
|
||||
3. 如果失败且字符串长度不超过阈值,返回空字典。
|
||||
"""
|
||||
if not string.strip():
|
||||
if not string or not string.strip():
|
||||
return {}
|
||||
|
||||
# 提取第一个匹配的 JSON 对象
|
||||
|
@ -24,8 +24,7 @@ def generate_continue_query(original_query, original_answer):
|
||||
已有的被截断的回答:
|
||||
{original_answer}
|
||||
|
||||
请接着该内容输出,无需重复该内容,因为我需要拼接该内容和你的回答,使得它们总体为json格式即可。
|
||||
"""
|
||||
请接着已有回答继续输出后续内容。你的回答无需是完整的 JSON 格式,我会将你的回答与已有内容拼接为完整的 JSON。请专注于生成后续内容即可。"""
|
||||
return continue_query
|
||||
|
||||
|
||||
@ -44,8 +43,8 @@ def continue_answer(original_query, original_answer, model_type=1, file_id=None)
|
||||
"""
|
||||
# 生成继续查询的 Prompt
|
||||
continue_query = generate_continue_query(original_query, original_answer)
|
||||
print("继续问答")
|
||||
print(continue_query)
|
||||
# print("继续问答")
|
||||
# print(continue_query)
|
||||
|
||||
# 根据模型类型选择调用的模型
|
||||
if model_type == 1: # 使用 doubao_model
|
||||
@ -70,14 +69,23 @@ def continue_answer(original_query, original_answer, model_type=1, file_id=None)
|
||||
|
||||
# 拼接字符串
|
||||
full_answer = clean_original + clean_model
|
||||
|
||||
# 打印拼接后的字符串
|
||||
print("拼接后的完整回答字符串:")
|
||||
print(full_answer)
|
||||
# print("原来的回答:")
|
||||
# print(clean_original)
|
||||
# print("---------------------")
|
||||
# print("第二次回答")
|
||||
# print(clean_model)
|
||||
# print("---------------------")
|
||||
# # 打印拼接后的字符串
|
||||
# print("拼接后的完整回答字符串:")
|
||||
# print(full_answer)
|
||||
|
||||
# 尝试解析为 JSON
|
||||
try:
|
||||
json_data = clean_json_string(full_answer)
|
||||
if not isinstance(json_data, dict):
|
||||
print(f"警告: clean_json_string 返回的类型为 {type(json_data)},预期为 dict。")
|
||||
print(json_data)
|
||||
return {}
|
||||
print("JSON 拼接成功且有效!")
|
||||
return json_data
|
||||
except json.JSONDecodeError as e:
|
||||
@ -118,9 +126,13 @@ def process_continue_answers(questions_to_continue, model_type, file_id):
|
||||
try:
|
||||
continued_result = future.result()
|
||||
if continued_result: # 确保结果不为空
|
||||
if isinstance(continued_result, dict):
|
||||
continued_results.update(continued_result)
|
||||
else:
|
||||
print(f"警告: continued_result 不是字典类型,无法更新。")
|
||||
except Exception as e:
|
||||
print(f"在处理问题 '{original_query}' 时发生错误: {e}") #TODO:排查一下
|
||||
return {}
|
||||
|
||||
return continued_results
|
||||
|
||||
|
@ -279,8 +279,10 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
|
||||
#TODO:1.先截取合同、投标文件格式之前的页码 即 invalid 如果页码小于50页,那么剩下的不切了直接扔。
|
||||
# 2.废标项这边,考虑大模型+正则并用
|
||||
# 3.限制评分项的因素。
|
||||
#商务标这里改为列表最里层
|
||||
#good_list 金额 截取上下文
|
||||
|
||||
#TODO:评分、开评定标这边也加上超长逻辑
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# 配置日志器
|
||||
unique_id = "uuidzyzy11"
|
||||
|
4078
flask_app/test_case/test_doubao.py
Normal file
4078
flask_app/test_case/test_doubao.py
Normal file
File diff suppressed because it is too large
Load Diff
@ -352,6 +352,6 @@ if __name__ == "__main__":
|
||||
# truncate_file=r"C:\Users\Administrator\Desktop\new招标文件\output5\HBDL-2024-0519-001-招标文件_procurement.pdf"
|
||||
# file_id = upload_file(truncate_file)
|
||||
# processed_filepath = pdf2txt(procurement_path)
|
||||
processed_filepath=r'D:\flask_project\flask_app\static\output\output1\83ae3e35-9136-4402-a74f-01d7adfcbb73\extract1.txt'
|
||||
processed_filepath=r'C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\be901ea0-adc9-47b8-9ada-5c3bc0dd9434\extract1.txt'
|
||||
final_res= get_business_requirements(procurement_path,processed_filepath,1)
|
||||
print(json.dumps(final_res, ensure_ascii=False, indent=4))
|
||||
|
Loading…
x
Reference in New Issue
Block a user