11.2 技术参数提取解决究极bug，增加了多线程提问中的超时重问处理

2024-11-02 14:29:28 +08:00 · 2024-11-02 14:29:28 +08:00 · 6e05393961
commit 6e05393961
parent 56f44018fe
6 changed files with 60 additions and 33 deletions
--- a/flask_app/general/json_utils.py
+++ b/flask_app/general/json_utils.py
@ -19,12 +19,14 @@ def extract_content_from_json(string):
            return json.loads(json_data)
        except json.JSONDecodeError as original_error:
            print(f"原始 JSON 解析失败: {original_error}")
            print(string)
            try:
                # 尝试修复缺失的逗号
                fixed_json = insert_missing_commas(json_data)
                return json.loads(fixed_json)
            except json.JSONDecodeError as fixed_error:
                print(f"修复后的 JSON 解析失败: {fixed_error}")
                print(string)
                # 可选：返回空字典或其他默认值
                return {}
    else:
--- a/flask_app/general/little_zbparse.py
+++ b/flask_app/general/little_zbparse.py
@ -36,12 +36,14 @@ def little_parse_goods(output_folder, file_path):
        dict: 包含 '基础信息' 的字典。
    """
    # 截取特定的货物 PDF 文件
-    selections = [4, 1]  # 仅处理 selection 1和4  #公告+投标人须知
+    selections = [1,4]  # 仅处理 selection 1和4  #公告+投标人须知
    files = truncate_pdf_specific_goods(file_path, output_folder,selections)
    if not files:
        raise ValueError("未找到截取后的文件。")
    # 假设最后一个文件是需要处理的基础信息文件
    baseinfo_file_path = files[-1]
    if not baseinfo_file_path:
        baseinfo_file_path=file_path     #截取失败就传整份文件
    # 上传文件并获取文件 ID
    file_id = upload_file(baseinfo_file_path)
    # 注意：以下路径被硬编码，确保该路径存在并且正确
@ -76,6 +78,8 @@ def little_parse_engineering(output_folder, file_path):
        raise ValueError("未找到截取后的文件。")
    # 假设最后一个文件是需要处理的基础信息文件
    baseinfo_file_path = files[-1]
    if not baseinfo_file_path:
        baseinfo_file_path=file_path     #截取失败就传整份文件
    # 上传文件并获取文件 ID
    file_id = upload_file(baseinfo_file_path)
    # 注意：以下路径被硬编码，确保该路径存在并且正确
--- a/flask_app/general/多线程提问.py
+++ b/flask_app/general/多线程提问.py
@ -255,25 +255,35 @@ def multi_threading(queries, knowledge_name="", file_id="", llm_type=1):
    print("多线程提问：starting multi_threading...")
    result_queue = queue.Queue()
-
+    max_retries = 2  # 设置最大重试次数
    # 使用 ThreadPoolExecutor 管理线程
-    with concurrent.futures.ThreadPoolExecutor(max_workers=15) as executor:
+    with concurrent.futures.ThreadPoolExecutor(max_workers=60) as executor:
        # 逐个提交任务，每提交一个任务后休眠0.5秒
        future_to_query = {}
        for index, query in enumerate(queries):
            future = executor.submit(llm_call, query, knowledge_name, file_id, result_queue, index, llm_type)
            future_to_query[future] = index
-            time.sleep(1)  # 每提交一个任务后等待1秒
+            time.sleep(0.5)  # 每提交一个任务后等待0.5秒
        # 收集每个线程的结果
        for future in concurrent.futures.as_completed(future_to_query):
            index = future_to_query[future]
            retries = 0
            try:
                future.result()  # 捕获异常或确认任务完成
            except Exception as exc:
                print(f"Query {index} generated an exception: {exc}")
                retries += 1  # 增加重试计数
                # 确保在异常情况下也向 result_queue 添加占位符
                result_queue.put((index, None))
                if retries < max_retries:
                    print(f"Retrying query {index} (attempt {retries + 1})...")
                    # 重新提交任务
                    future = executor.submit(llm_call, queries[index], knowledge_name, file_id, result_queue, index, llm_type)      #可能遇到阿里服务器挂壁的情况，重试一下
                    future_to_query[future] = index
                else:
                    print(f"Query {index} failed after {max_retries} attempts.")
                    break  # 超过最大重试次数，退出循环
    # 从队列中获取所有结果并按索引排序
    results = [None] * len(queries)
--- a/flask_app/general/纯技术参数要求提取.py
+++ b/flask_app/general/纯技术参数要求提取.py
@ -28,12 +28,13 @@ def extract_matching_keys(data_dict, good_list):
            for item in current_dict:
                recurse(item)
        # 如果current_dict不是dict或list，则无需进一步处理
    recurse(data_dict)
    return result
 def get_technical_requirements_main(file_path,output_folder):
    truncate_file=truncate_pdf_main(file_path,output_folder,5)[0]
    if not truncate_file:
        truncate_file=file_path    #直接传整份文件
    truncate_file_docx=pdf2docx(truncate_file)
    file_id=upload_file(truncate_file_docx)
    # file_id=upload_file(truncate_file)
--- a/flask_app/货物标/截取pdf货物标版.py
+++ b/flask_app/货物标/截取pdf货物标版.py
@ -731,16 +731,16 @@ def truncate_pdf_specific_goods(pdf_path, output_folder, selections,unique_id="1
 # TODO:交通智能系统和招标(1)(1)文件有问题  包头 绍兴    资格审查文件可能不需要默认与"evaluation"同一章  无效投标可能也要考虑 “more”的情况，类似工程标   唐山投标只有正文，没有附表
 if __name__ == "__main__":
-    input_path = "C:\\Users\\Administrator\\Desktop\\new招标文件\\货物标\\HBDL-2024-0519-001-招标文件.pdf"
+    # input_path = "C:\\Users\\Administrator\\Desktop\\new招标文件\\货物标\\HBDL-2024-0519-001-招标文件.pdf"
    # input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b151fcd0-4cd8-49b4-8de3-964057a9e653\\ztbfile.pdf"
-    # input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
+    input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\zbtest4_evaluation_method.pdf"
    # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
    # input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\ztbfile.pdf"
    # output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\tmp"
    output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\新建文件夹"
-    files = truncate_pdf_multiple(input_path, output_folder)
+    # files = truncate_pdf_multiple(input_path, output_folder)
-    # selections = [1,4]
+    selections = [1,4]
-    # files=truncate_pdf_specific_goods(input_path,output_folder,selections)
+    files=truncate_pdf_specific_goods(input_path,output_folder,selections)
    print(files)
    # selection = 4# 例如：1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2（与评标办法一致）  4.投标人须知前附表part1 投标人须知正文part2   5-采购需求
    # generated_files = truncate_pdf_main(input_path, output_folder, selection)
--- a/flask_app/货物标/技术参数要求提取.py
+++ b/flask_app/货物标/技术参数要求提取.py
@ -36,29 +36,29 @@ def generate_key_paths(data, parent_key=''):
                no_keys_added = no_keys_added and sub_no_keys_added
            else:
                # 空字典视为叶子节点
-                key_paths.append(current_key)
+                key_paths.append(current_key.replace(" ",""))
-                good_list.append(key)
+                good_list.append(key.replace(" ", ""))  # 去掉空格后添加
                no_keys_added = False
        elif isinstance(value, list):
            if value:
                # 非空列表视为叶子节点
-                key_paths.append(current_key)
+                key_paths.append(current_key.replace(" ",""))
-                good_list.append(key)
+                good_list.append(key.replace(" ", ""))  # 去掉空格后添加
                no_keys_added = False
            else:
                # 空列表也视为叶子节点（根据需求可以调整）
-                key_paths.append(current_key)
+                key_paths.append(current_key.replace(" ",""))
-                good_list.append(key)
+                good_list.append(key.replace(" ", ""))  # 去掉空格后添加
                no_keys_added = False
        elif value in {"未知", "", "/"}:
            # 特定值视为叶子节点
-            key_paths.append(current_key)
+            key_paths.append(current_key.replace(" ",""))
-            good_list.append(key)
+            good_list.append(key.replace(" ", ""))  # 去掉空格后添加
            no_keys_added = False
        else:
            # 其他情况不视为叶子节点
-            key_paths.append(current_key)
+            key_paths.append(current_key.replace(" ",""))
-            good_list.append(key)
+            good_list.append(key.replace(" ", ""))  # 去掉空格后添加
            no_keys_added = False
    return key_paths, good_list, no_keys_added
@ -91,19 +91,18 @@ def postprocess(data):
 def get_technical_requirements(file_id):
    user_query1 = """
-    这是一份货物标中采购要求部分的内容，请告诉我需要采购的货物，如果有采购清单，请直接根据清单上的货物名称给出结果，若没有采购清单，你要从表格或文中摘取需要采购的系统（或货物），采购需求中可能包含层次关系，例如采购的某大系统中可能包含几种货物，那么你需要用嵌套键值对表示这种关系，且不要遗漏该系统中包含的货物，你的输出请以json格式返回，最外层键名为'采购需求'，嵌套键名为对应的系统名称或货物名称，需与原文保持一致，无需给出采购数量和单位，如有未知内容，在对应键值处填'未知'。以下为示例输出：
+这是一份货物标中采购要求部分的内容，请告诉我需要采购的货物，如果有采购清单，请直接根据清单上的货物（或系统）名称给出结果，若没有采购清单，你要从表格中或文中摘取需要采购的系统（或货物），采购需求中可能包含层次关系，例如采购的某大系统中可能包含几种货物，那么你需要用嵌套键值对表示这种关系，且不要遗漏该系统中包含的货物，你的输出请以json格式返回，最外层键名为'采购需求'，嵌套键名为对应的系统名称或货物名称，需与原文保持一致，无需给出采购数量和单位，如有未知内容，在对应键值处填'未知'。以下为示例输出：
-    {
+{
-  "采购需求": {
+    "采购需求": {
-    "交通信号灯": {},
+        "门禁管理系统": {},
-    "交通监控视频子系统": {
+        "交通监控视频子系统": {
-        "高清视频抓拍像机":{},
+            "高清视频抓拍像机":{},
-        "补光灯":{}
+            "补光灯":{}
-    },
+        },
    "LED全彩显示屏": {},
    "数字会议发言主机": {},
-  }
+    }
 }
    """
    res = qianwen_long(file_id, user_query1)
    print(res)
@ -115,13 +114,22 @@ def get_technical_requirements(file_id):
    if no_keys_added:
        final_res = postprocess(cleaned_res)
    else:
-        user_query_template = "这是一份货物标中采购要求部分的内容，请你给出\"{}\"的技术参数（或采购要求），请以json格式返回结果，外层键名为\"{}\", 键值对中的键是你对该要求的总结，而值需要完全与原文保持一致，不可擅自总结删减。"
+        # user_query_template = "请你根据该货物标中采购要求部分的内容，请你给出\"{}\"的技术参数（或采购要求），请以json格式返回结果，外层键名为\"{}\", 键值对中的键是你对该要求的总结，而值需要完全与原文保持一致，不可擅自总结删减。"
        user_query_template = """
 请你根据该货物标中采购要求部分的内容，请你给出\"{}\"的技术参数（或采购要求），请以json格式返回结果，键名为\"{}\", 键值为一个列表，列表中包含若干描述\"{}\"的技术参数（或采购要求）的字符串。示例格式如下：
 {{
    "摄像机控制键盘": [
        "支持串行 RS232/RS422 和 IP 混合控制，允许在一个控制器上使用 RS232/RS422/IP 控制单个系统中的摄像机；",
        "支持 2 组 RS422 串口 VISCA 协议菊花链控制 2x7 台摄像机。"
    ]
 }}
            """
        queries = []
        for key in keys_list:
            # 将键中的 '.' 替换为 '下的'
            modified_key = key.replace('.', '下的')
            # 使用修改后的键填充第一个和第三个占位符，原始键填充第二个占位符
-            new_query = user_query_template.format(modified_key, key)
+            new_query = user_query_template.format(modified_key, key,modified_key)
            queries.append(new_query)
        results = multi_threading(queries, "", file_id, 2)
        technical_requirements = []
@ -164,7 +172,9 @@ def test_all_files_in_folder(input_folder, output_folder):
                print(f"处理文件 {file_path} 时出错: {e}")
 if __name__ == "__main__":
-    truncate_file="D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\ztbfile_procurement.docx"
+    # truncate_file="D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\ztbfile_procurement.docx"
    # truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\招标文件(107国道).docx"
    truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\招标文件(107国道)_procurement.docx"
    output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\tmp"
    file_id = upload_file(truncate_file)
    res=get_technical_requirements(file_id)