12.20 豆包大模型bug解决

2024-12-20 12:32:06 +08:00 · 2024-12-20 12:32:06 +08:00 · 5dcbaa5eb5
commit 5dcbaa5eb5
parent 9c3c6d889e
6 changed files with 4160 additions and 22 deletions
--- a/flask_app/general/doubao.py
+++ b/flask_app/general/doubao.py
@ -1,5 +1,8 @@
 import json
 import os
+import re
+import time
+
 import fitz
 import PyPDF2
 import tempfile
@ -184,6 +187,24 @@ def read_txt_to_string(file_path):
    except Exception as e:
        return f"错误：读取文件时发生错误。详细信息：{e}"

+def count_tokens(text):
+    """
+    Count the tokens in ``text`` using a regex heuristic.
+
+    At each position the alternatives are tried in this order:
+    1. A run of ASCII letters/digits, optionally followed by a decimal
+       part (``.`` plus digits) and/or a percent sign, is one token
+       (e.g. DN90, 0.25%).
+    2. A single Chinese character in U+4E00..U+9FFF is one token.
+    3. Any other single non-whitespace character (symbol, punctuation)
+       is one token.
+    Whitespace (spaces, blank lines, ...) is never counted.
+
+    Returns only the number of tokens as an int, not the tokens themselves.
+    """
+    token_regex = r'[a-zA-Z0-9]+(?:\.\d+)?%?|[\u4e00-\u9fff]|[^\s]'
+    # The pattern has no capturing groups, so each match is one whole token.
+    total = 0
+    for _ in re.finditer(token_regex, text):
+        total += 1
+    return total
+
@sleep_and_retry
@limits(calls=10, period=1)  # 每秒最多调用10次
 def doubao_model(full_user_query):
@ -199,7 +220,8 @@ def doubao_model(full_user_query):
    }

    # 判断用户查询字符串的长度
-    if len(full_user_query) > 32000:
+    token_count = count_tokens(full_user_query)
+    if token_count > 35000:
        selected_model = models["pro_128k"]  # 如果长度超过32k，直接使用128k模型
    else:
        selected_model = models["pro_32k"]  # 默认使用32k模型
@ -210,10 +232,12 @@ def doubao_model(full_user_query):
        "Authorization": "Bearer " + doubao_api_key
    }

-    max_retries = 1  # 最大重试次数
+    max_retries_429 = 2  # 针对 429 错误的最大重试次数
+    max_retries_other = 1  # 针对其他错误的最大重试次数
    attempt = 0
+    response = None  # 确保 response 被定义

-    while attempt <= max_retries:
+    while True:
        # 请求数据
        data = {
            "model": selected_model,
@ -231,13 +255,35 @@ def doubao_model(full_user_query):
            # 返回模型的回复内容
            return response.json()["choices"][0]["message"]["content"]
        except requests.exceptions.RequestException as e:
-            attempt += 1
-            if attempt > max_retries:
-                print(f"请求失败，尝试了 {attempt} 次。错误信息：{e}")
-                return None  # 或者根据需要返回其他默认值
-            else:
-                print(f"请求出错：{e}。正在尝试第 {attempt} 次重试，继续使用模型 {selected_model}...")
+            # 获取状态码并处理不同的重试逻辑
+            status_code = response.status_code if response is not None else None
+            print(f"请求失败，状态码: {status_code}")
+            print("请求失败，完整的响应内容如下：")
+            print(response.text)  # 打印原始的响应内容，可能是 JSON 格式，也可能是其他格式

+            # 如果是 429 错误
+            if status_code == 429:
+                if attempt < max_retries_429:
+                    wait_time = 2 if attempt == 0 else 4
+                    print(f"状态码为 429，等待 {wait_time} 秒后重试...")
+                    time.sleep(wait_time)
+                else:
+                    print(f"状态码为 429，已达到最大重试次数 {max_retries_429} 次。")
+                    break  # 超过最大重试次数，退出循环
+            else:
+                # 针对其他错误
+                if attempt < max_retries_other:
+                    print("非 429 错误，等待 1 秒后重试...")
+                    time.sleep(1)
+                else:
+                    print(f"非 429 错误，已达到最大重试次数 {max_retries_other} 次。")
+                    break  # 超过最大重试次数，退出循环
+
+            attempt += 1  # 增加重试计数
+
+    # 如果到这里，说明所有尝试都失败了
+    print(f"请求失败，已达到最大重试次数。")
+    return None

 def generate_full_user_query(file_path, prompt_template):
    """
--- a/flask_app/general/json_utils.py
+++ b/flask_app/general/json_utils.py
@ -108,7 +108,7 @@ def extract_content_from_json(string, length_threshold=5000):
    2. 如果失败，并且字符串长度超过阈值，返回原始字符串。
    3. 如果失败且字符串长度不超过阈值，返回空字典。
    """
-    if not string.strip():
+    if not string or not string.strip():
        return {}

    # 提取第一个匹配的 JSON 对象
--- a/flask_app/general/model_continue_query.py
+++ b/flask_app/general/model_continue_query.py
@ -24,8 +24,7 @@ def generate_continue_query(original_query, original_answer):
 已有的被截断的回答：
 {original_answer}

-请接着该内容输出，无需重复该内容，因为我需要拼接该内容和你的回答，使得它们总体为json格式即可。
-"""
+请接着已有回答继续输出后续内容。你的回答无需是完整的 JSON 格式，我会将你的回答与已有内容拼接为完整的 JSON。请专注于生成后续内容即可。"""
    return continue_query


@ -44,8 +43,8 @@ def continue_answer(original_query, original_answer, model_type=1, file_id=None)
    """
    # 生成继续查询的 Prompt
    continue_query = generate_continue_query(original_query, original_answer)
-    print("继续问答")
-    print(continue_query)
+    # print("继续问答")
+    # print(continue_query)

    # 根据模型类型选择调用的模型
    if model_type == 1:  # 使用 doubao_model
@ -70,14 +69,23 @@ def continue_answer(original_query, original_answer, model_type=1, file_id=None)

    # 拼接字符串
    full_answer = clean_original + clean_model
-
-    # 打印拼接后的字符串
-    print("拼接后的完整回答字符串：")
-    print(full_answer)
+    # print("原来的回答：")
+    # print(clean_original)
+    # print("---------------------")
+    # print("第二次回答")
+    # print(clean_model)
+    # print("---------------------")
+    # # 打印拼接后的字符串
+    # print("拼接后的完整回答字符串：")
+    # print(full_answer)

    # 尝试解析为 JSON
    try:
        json_data = clean_json_string(full_answer)
+        if not isinstance(json_data, dict):
+            print(f"警告: clean_json_string 返回的类型为 {type(json_data)}，预期为 dict。")
+            print(json_data)
+            return {}
        print("JSON 拼接成功且有效！")
        return json_data
    except json.JSONDecodeError as e:
@ -118,9 +126,13 @@ def process_continue_answers(questions_to_continue, model_type, file_id):
            try:
                continued_result = future.result()
                if continued_result:  # 确保结果不为空
+                    if isinstance(continued_result, dict):
                        continued_results.update(continued_result)
+                    else:
+                        print(f"警告: continued_result 不是字典类型，无法更新。")
            except Exception as e:
                print(f"在处理问题 '{original_query}' 时发生错误: {e}")    #TODO:排查一下
+                return {}

    return continued_results

--- a/flask_app/routes/货物标解析main.py
+++ b/flask_app/routes/货物标解析main.py
@ -279,8 +279,10 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
 #TODO:1.先截取合同、投标文件格式之前的页码 即 invalid 如果页码小于50页，那么剩下的不切了直接仍。
 #   2.废标项这边，考虑大模型+正则并用
 #   3.限制评分项的因素。
-#商务标这里改为列表最里层
-#good_list 金额  截取上下文
+
+#TODO:评分、开评定标这边也加上超长逻辑
+
+
 if __name__ == "__main__":
    # 配置日志器
    unique_id = "uuidzyzy11"
--- a/flask_app/test_case/test_doubao.py
+++ b/flask_app/test_case/test_doubao.py
--- a/flask_app/货物标/商务服务其他要求提取.py
+++ b/flask_app/货物标/商务服务其他要求提取.py
@ -352,6 +352,6 @@ if __name__ == "__main__":
    # truncate_file=r"C:\Users\Administrator\Desktop\new招标文件\output5\HBDL-2024-0519-001-招标文件_procurement.pdf"
    # file_id = upload_file(truncate_file)
    # processed_filepath = pdf2txt(procurement_path)
-    processed_filepath=r'D:\flask_project\flask_app\static\output\output1\83ae3e35-9136-4402-a74f-01d7adfcbb73\extract1.txt'
+    processed_filepath=r'C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\be901ea0-adc9-47b8-9ada-5c3bc0dd9434\extract1.txt'
    final_res= get_business_requirements(procurement_path,processed_filepath,1)
    print(json.dumps(final_res, ensure_ascii=False, indent=4))