From 6e053939614294c25f46d2b3bdb61e5ace646b97 Mon Sep 17 00:00:00 2001
From: zy123 <646228430@qq.com>
Date: Sat, 2 Nov 2024 14:29:28 +0800
Subject: [PATCH] =?UTF-8?q?11.2=20=E6=8A=80=E6=9C=AF=E5=8F=82=E6=95=B0?=
 =?UTF-8?q?=E6=8F=90=E5=8F=96=E8=A7=A3=E5=86=B3=E7=A9=B6=E6=9E=81bug?=
 =?UTF-8?q?=EF=BC=8C=E5=A2=9E=E5=8A=A0=E4=BA=86=E5=A4=9A=E7=BA=BF=E7=A8=8B?=
 =?UTF-8?q?=E6=8F=90=E9=97=AE=E4=B8=AD=E7=9A=84=E8=B6=85=E6=97=B6=E9=87=8D?=
 =?UTF-8?q?=E9=97=AE=E5=A4=84=E7=90=86?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 flask_app/general/json_utils.py         |  2 +
 flask_app/general/little_zbparse.py     |  6 ++-
 flask_app/general/多线程提问.py         | 16 +++++--
 flask_app/general/纯技术参数要求提取.py |  3 +-
 flask_app/货物标/截取pdf货物标版.py     | 10 ++---
 flask_app/货物标/技术参数要求提取.py    | 56 +++++++++++++++----------
 6 files changed, 60 insertions(+), 33 deletions(-)

diff --git a/flask_app/general/json_utils.py b/flask_app/general/json_utils.py
index d7b1d7f..b331fd2 100644
--- a/flask_app/general/json_utils.py
+++ b/flask_app/general/json_utils.py
@@ -19,12 +19,14 @@ def extract_content_from_json(string):
             return json.loads(json_data)
         except json.JSONDecodeError as original_error:
             print(f"原始 JSON 解析失败: {original_error}")
+            print(string)
             try:
                 # 尝试修复缺失的逗号
                 fixed_json = insert_missing_commas(json_data)
                 return json.loads(fixed_json)
             except json.JSONDecodeError as fixed_error:
                 print(f"修复后的 JSON 解析失败: {fixed_error}")
+                print(string)
                 # 可选：返回空字典或其他默认值
                 return {}
     else:
diff --git a/flask_app/general/little_zbparse.py b/flask_app/general/little_zbparse.py
index 8ca7e7c..64f766f 100644
--- a/flask_app/general/little_zbparse.py
+++ b/flask_app/general/little_zbparse.py
@@ -36,12 +36,14 @@ def little_parse_goods(output_folder, file_path):
         dict: 包含 '基础信息' 的字典。
     """
     # 截取特定的货物 PDF 文件
-    selections = [4, 1]  # 仅处理 selection 1和4  #公告+投标人须知
+    selections = [1,4]  # 仅处理 selection 1和4  #公告+投标人须知
     files = truncate_pdf_specific_goods(file_path, output_folder,selections)
     if not files:
         raise ValueError("未找到截取后的文件。")
     # 假设最后一个文件是需要处理的基础信息文件
     baseinfo_file_path = files[-1]
+    if not baseinfo_file_path:
+        baseinfo_file_path=file_path     #截取失败就传整份文件
     # 上传文件并获取文件 ID
     file_id = upload_file(baseinfo_file_path)
     # 注意：以下路径被硬编码，确保该路径存在并且正确
@@ -76,6 +78,8 @@ def little_parse_engineering(output_folder, file_path):
         raise ValueError("未找到截取后的文件。")
     # 假设最后一个文件是需要处理的基础信息文件
     baseinfo_file_path = files[-1]
+    if not baseinfo_file_path:
+        baseinfo_file_path=file_path     #截取失败就传整份文件
     # 上传文件并获取文件 ID
     file_id = upload_file(baseinfo_file_path)
     # 注意：以下路径被硬编码，确保该路径存在并且正确
diff --git a/flask_app/general/多线程提问.py b/flask_app/general/多线程提问.py
index 098d47b..8c14688 100644
--- a/flask_app/general/多线程提问.py
+++ b/flask_app/general/多线程提问.py
@@ -255,25 +255,35 @@ def multi_threading(queries, knowledge_name="", file_id="", llm_type=1):
 
     print("多线程提问：starting multi_threading...")
     result_queue = queue.Queue()
-
+    max_retries = 2  # 设置最大重试次数
     # 使用 ThreadPoolExecutor 管理线程
-    with concurrent.futures.ThreadPoolExecutor(max_workers=15) as executor:
+    with concurrent.futures.ThreadPoolExecutor(max_workers=60) as executor:
         # 逐个提交任务，每提交一个任务后休眠1秒
         future_to_query = {}
         for index, query in enumerate(queries):
             future = executor.submit(llm_call, query, knowledge_name, file_id, result_queue, index, llm_type)
             future_to_query[future] = index
-            time.sleep(1)  # 每提交一个任务后等待1秒
+            time.sleep(0.5)  # wait 0.5 s after submitting each task
 
         # 收集每个线程的结果
         for future in concurrent.futures.as_completed(future_to_query):
             index = future_to_query[future]
+            retries = 0
             try:
                 future.result()  # 捕获异常或确认任务完成
             except Exception as exc:
                 print(f"Query {index} generated an exception: {exc}")
+                retries += 1  # 增加重试计数
                 # 确保在异常情况下也向 result_queue 添加占位符
                 result_queue.put((index, None))
+                if retries < max_retries:
+                    print(f"Retrying query {index} (attempt {retries + 1})...")
+                    # 重新提交任务
+                    future = executor.submit(llm_call, queries[index], knowledge_name, file_id, result_queue, index, llm_type)      #可能遇到阿里服务器挂壁的情况，重试一下
+                    future_to_query[future] = index
+                else:
+                    print(f"Query {index} failed after {max_retries} attempts.")
+                    break  # 超过最大重试次数，退出循环
 
     # 从队列中获取所有结果并按索引排序
     results = [None] * len(queries)
diff --git a/flask_app/general/纯技术参数要求提取.py b/flask_app/general/纯技术参数要求提取.py
index d6d54d0..1856a09 100644
--- a/flask_app/general/纯技术参数要求提取.py
+++ b/flask_app/general/纯技术参数要求提取.py
@@ -28,12 +28,13 @@ def extract_matching_keys(data_dict, good_list):
             for item in current_dict:
                 recurse(item)
         # 如果current_dict不是dict或list，则无需进一步处理
-
     recurse(data_dict)
     return result
 
 def get_technical_requirements_main(file_path,output_folder):
     truncate_file=truncate_pdf_main(file_path,output_folder,5)[0]
+    if not truncate_file:
+        truncate_file=file_path    #直接传整份文件
     truncate_file_docx=pdf2docx(truncate_file)
     file_id=upload_file(truncate_file_docx)
     # file_id=upload_file(truncate_file)
diff --git a/flask_app/货物标/截取pdf货物标版.py b/flask_app/货物标/截取pdf货物标版.py
index 7c28cf1..4f54f66 100644
--- a/flask_app/货物标/截取pdf货物标版.py
+++ b/flask_app/货物标/截取pdf货物标版.py
@@ -731,16 +731,16 @@ def truncate_pdf_specific_goods(pdf_path, output_folder, selections,unique_id="1
 
 # TODO:交通智能系统和招标(1)(1)文件有问题  包头 绍兴    资格审查文件可能不需要默认与"evaluation"同一章  无效投标可能也要考虑 “more”的情况，类似工程标   唐山投标只有正文，没有附表
 if __name__ == "__main__":
-    input_path = "C:\\Users\\Administrator\\Desktop\\new招标文件\\货物标\\HBDL-2024-0519-001-招标文件.pdf"
+    # input_path = "C:\\Users\\Administrator\\Desktop\\new招标文件\\货物标\\HBDL-2024-0519-001-招标文件.pdf"
     # input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b151fcd0-4cd8-49b4-8de3-964057a9e653\\ztbfile.pdf"
-    # input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
+    input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\zbtest4_evaluation_method.pdf"
     # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
     # input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\ztbfile.pdf"
     # output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\tmp"
     output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\新建文件夹"
-    files = truncate_pdf_multiple(input_path, output_folder)
-    # selections = [1,4]
-    # files=truncate_pdf_specific_goods(input_path,output_folder,selections)
+    # files = truncate_pdf_multiple(input_path, output_folder)
+    selections = [1,4]
+    files=truncate_pdf_specific_goods(input_path,output_folder,selections)
     print(files)
     # selection = 4# 例如：1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2（与评标办法一致）  4.投标人须知前附表part1 投标人须知正文part2   5-采购需求
     # generated_files = truncate_pdf_main(input_path, output_folder, selection)
diff --git a/flask_app/货物标/技术参数要求提取.py b/flask_app/货物标/技术参数要求提取.py
index d9e906d..03d47a4 100644
--- a/flask_app/货物标/技术参数要求提取.py
+++ b/flask_app/货物标/技术参数要求提取.py
@@ -36,29 +36,29 @@ def generate_key_paths(data, parent_key=''):
                 no_keys_added = no_keys_added and sub_no_keys_added
             else:
                 # 空字典视为叶子节点
-                key_paths.append(current_key)
-                good_list.append(key)
+                key_paths.append(current_key.replace(" ",""))
+                good_list.append(key.replace(" ", ""))  # 去掉空格后添加
                 no_keys_added = False
         elif isinstance(value, list):
             if value:
                 # 非空列表视为叶子节点
-                key_paths.append(current_key)
-                good_list.append(key)
+                key_paths.append(current_key.replace(" ",""))
+                good_list.append(key.replace(" ", ""))  # 去掉空格后添加
                 no_keys_added = False
             else:
                 # 空列表也视为叶子节点（根据需求可以调整）
-                key_paths.append(current_key)
-                good_list.append(key)
+                key_paths.append(current_key.replace(" ",""))
+                good_list.append(key.replace(" ", ""))  # 去掉空格后添加
                 no_keys_added = False
         elif value in {"未知", "", "/"}:
             # 特定值视为叶子节点
-            key_paths.append(current_key)
-            good_list.append(key)
+            key_paths.append(current_key.replace(" ",""))
+            good_list.append(key.replace(" ", ""))  # 去掉空格后添加
             no_keys_added = False
         else:
             # 其他情况不视为叶子节点
-            key_paths.append(current_key)
-            good_list.append(key)
+            key_paths.append(current_key.replace(" ",""))
+            good_list.append(key.replace(" ", ""))  # 去掉空格后添加
             no_keys_added = False
 
     return key_paths, good_list, no_keys_added
@@ -91,19 +91,18 @@ def postprocess(data):
 
 def get_technical_requirements(file_id):
     user_query1 = """
-    这是一份货物标中采购要求部分的内容，请告诉我需要采购的货物，如果有采购清单，请直接根据清单上的货物名称给出结果，若没有采购清单，你要从表格或文中摘取需要采购的系统（或货物），采购需求中可能包含层次关系，例如采购的某大系统中可能包含几种货物，那么你需要用嵌套键值对表示这种关系，且不要遗漏该系统中包含的货物，你的输出请以json格式返回，最外层键名为'采购需求'，嵌套键名为对应的系统名称或货物名称，需与原文保持一致，无需给出采购数量和单位，如有未知内容，在对应键值处填'未知'。以下为示例输出：
-    {
-  "采购需求": {
-    "交通信号灯": {},
-    "交通监控视频子系统": {
-        "高清视频抓拍像机":{},
-        "补光灯":{}
-    },
+这是一份货物标中采购要求部分的内容，请告诉我需要采购的货物，如果有采购清单，请直接根据清单上的货物（或系统）名称给出结果，若没有采购清单，你要从表格中或文中摘取需要采购的系统（或货物），采购需求中可能包含层次关系，例如采购的某大系统中可能包含几种货物，那么你需要用嵌套键值对表示这种关系，且不要遗漏该系统中包含的货物，你的输出请以json格式返回，最外层键名为'采购需求'，嵌套键名为对应的系统名称或货物名称，需与原文保持一致，无需给出采购数量和单位，如有未知内容，在对应键值处填'未知'。以下为示例输出：
+{
+    "采购需求": {
+        "门禁管理系统": {},
+        "交通监控视频子系统": {
+            "高清视频抓拍像机":{},
+            "补光灯":{}
+        },
     "LED全彩显示屏": {},
     "数字会议发言主机": {},
-  }
+    }
 }
-    
     """
     res = qianwen_long(file_id, user_query1)
     print(res)
@@ -115,13 +114,22 @@ def get_technical_requirements(file_id):
     if no_keys_added:
         final_res = postprocess(cleaned_res)
     else:
-        user_query_template = "这是一份货物标中采购要求部分的内容，请你给出\"{}\"的技术参数（或采购要求），请以json格式返回结果，外层键名为\"{}\", 键值对中的键是你对该要求的总结，而值需要完全与原文保持一致，不可擅自总结删减。"
+        # user_query_template = "请你根据该货物标中采购要求部分的内容，请你给出\"{}\"的技术参数（或采购要求），请以json格式返回结果，外层键名为\"{}\", 键值对中的键是你对该要求的总结，而值需要完全与原文保持一致，不可擅自总结删减。"
+        user_query_template = """
+请你根据该货物标中采购要求部分的内容，请你给出\"{}\"的技术参数（或采购要求），请以json格式返回结果，键名为\"{}\", 键值为一个列表，列表中包含若干描述\"{}\"的技术参数（或采购要求）的字符串。示例格式如下：
+{{
+    "摄像机控制键盘": [
+        "支持串行 RS232/RS422 和 IP 混合控制，允许在一个控制器上使用 RS232/RS422/IP 控制单个系统中的摄像机；",
+        "支持 2 组 RS422 串口 VISCA 协议菊花链控制 2x7 台摄像机。"
+    ]
+}}
+            """
         queries = []
         for key in keys_list:
             # 将键中的 '.' 替换为 '下的'
             modified_key = key.replace('.', '下的')
             # 使用修改后的键填充第一个占位符，原始键填充第二个占位符
-            new_query = user_query_template.format(modified_key, key)
+            new_query = user_query_template.format(modified_key, key,modified_key)
             queries.append(new_query)
         results = multi_threading(queries, "", file_id, 2)
         technical_requirements = []
@@ -164,7 +172,9 @@ def test_all_files_in_folder(input_folder, output_folder):
                 print(f"处理文件 {file_path} 时出错: {e}")
 
 if __name__ == "__main__":
-    truncate_file="D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\ztbfile_procurement.docx"
+    # truncate_file="D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\ztbfile_procurement.docx"
+    # truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\招标文件(107国道).docx"
+    truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\招标文件(107国道)_procurement.docx"
     output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\tmp"
     file_id = upload_file(truncate_file)
     res=get_technical_requirements(file_id)